| author | polwex <polwex@sortug.com> | 2025-07-23 02:37:15 +0700 |
|---|---|---|
| committer | polwex <polwex@sortug.com> | 2025-07-23 02:37:15 +0700 |
| commit | 42dd99bfac9777a4ecc6700b87edf26a5c984de6 | |
| tree | 031e45d187f45def4b58ad7590d39dec3924600d /src | |
| parent | 4c6913644b362b28f15b125c2fbe48165f1e048c | |
checkpoint
Diffstat (limited to 'src')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | src/claude.ts | 22 |
| -rw-r--r-- | src/gemini.ts | 207 |
| -rw-r--r-- | src/gemini2.ts | 149 |
| -rw-r--r-- | src/generic.ts (renamed from src/model.ts) | 26 |
| -rw-r--r-- | src/nlp/index.ts | 7 |
| -rw-r--r-- | src/nlp/iso.ts | 10 |
| -rw-r--r-- | src/nlp/nlp.ts | 208 |
| -rw-r--r-- | src/nlp/ocr.ts | 18 |
| -rw-r--r-- | src/nlp/spacy.ts | 79 |
| -rw-r--r-- | src/nlp/stanza.ts | 210 |
| -rw-r--r-- | src/nlp/types.ts | 50 |
| -rw-r--r-- | src/openai.ts | 18 |
| -rw-r--r-- | src/types/index.ts | 18 |
13 files changed, 885 insertions, 137 deletions
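A note before the per-file diffs: the common thread below is replacing the locally defined `AResult` type with `AsyncRes` from the `sortug` package, plus a new `AIModelAPI` interface. sortug's actual definition is not part of this repository; the `AResult<T>` alias deleted from `src/types/index.ts` at the bottom of this diff suggests a stand-in like:

```ts
// Stand-in for sortug's result types, inferred from the AResult<T> alias
// this commit deletes from src/types/index.ts; the package's real
// exports may differ.
export type Result<T> = { ok: T } | { error: string };
export type AsyncRes<T> = Promise<Result<T>>;
```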
```diff
diff --git a/src/claude.ts b/src/claude.ts
index 377316e..2a56bc1 100644
--- a/src/claude.ts
+++ b/src/claude.ts
@@ -1,20 +1,30 @@
 import Claude from "@anthropic-ai/sdk";
 import { RESPONSE_LENGTH } from "./logic/constants";
-import type { AResult, ChatMessage, OChoice, OChunk, OMessage } from "./types";
+import type {
+  AIModelAPI,
+  ChatMessage,
+  OChoice,
+  OChunk,
+  OMessage,
+} from "./types";
 import { BOOKWORM_SYS } from "./prompts";
+import type { AsyncRes } from "sortug";
 
 type Message = Claude.Messages.MessageParam;
 
-export default class Conversation {
-  private tokenizer: (text: string) => number;
-  private maxTokens: number;
-  model: string = "claude-3-5-sonnet-20241022";
+export default class ClaudeAPI implements AIModelAPI {
+  private model: string = "claude-3-7-sonnet-20250219";
+  tokenizer: (text: string) => number;
+  maxTokens: number;
+  // model: string = "claude-3-5-sonnet-20241022";
   constructor(
     maxTokens = 200_000,
     tokenizer: (text: string) => number = (text) => text.length / 3,
+    model?: string,
   ) {
     this.maxTokens = maxTokens;
     this.tokenizer = tokenizer;
+    if (model) this.model = model;
   }
   public setModel(model: string) {
     this.model = model;
@@ -101,7 +111,7 @@ export default class Conversation {
     system: string,
     messages: Message[],
     isR1: boolean = false,
-  ): Promise<AResult<string[]>> {
+  ): Promise<AsyncRes<string[]>> {
     try {
       const claud = new Claude();
       // const list = await claud.models.list();
```

```diff
diff --git a/src/gemini.ts b/src/gemini.ts
index 2f685a2..3e636c2 100644
--- a/src/gemini.ts
+++ b/src/gemini.ts
@@ -1,137 +1,132 @@
 import {
-  GenerativeModel,
-  GoogleGenerativeAI,
+  Chat,
+  GoogleGenAI,
   type Content,
-  type GenerateContentResult,
-} from "@google/generative-ai";
+  type GeneratedImage,
+  type GeneratedVideo,
+} from "@google/genai";
 import { RESPONSE_LENGTH } from "./logic/constants";
-import type { AResult, ChatMessage, OChoice, OChunk, OMessage } from "./types";
+import type {
+  AIModelAPI,
+  ChatMessage,
+  OChoice,
+  OChunk,
+  OMessage,
+} from "./types";
+import type { AsyncRes } from "sortug";
 
-export default class Conversation {
-  private tokenizer: (text: string) => number;
-  private maxTokens: number;
-  private model: GenerativeModel;
+export default class GeminiAPI {
+  tokenizer: (text: string) => number;
+  maxTokens: number;
+  private model: string;
+  api: GoogleGenAI;
+  chats: Map<string, Chat> = new Map<string, Chat>();
 
   constructor(
     maxTokens = 200_000,
     tokenizer: (text: string) => number = (text) => text.length / 3,
+    model?: string,
   ) {
     this.maxTokens = maxTokens;
     this.tokenizer = tokenizer;
-    const gem = new GoogleGenerativeAI(Bun.env["GEMINI_API_KEY"]!);
-    this.model = gem.getGenerativeModel({
-      model: "gemini-2.0-flash-exp",
-      generationConfig: { maxOutputTokens: RESPONSE_LENGTH },
-    });
+    const gem = new GoogleGenAI({ apiKey: Bun.env["GEMINI_API_KEY"]! });
+    this.api = gem;
+    this.model = model || "gemini-2.5-pro-preview-05-06 ";
   }
 
-  public setModel(model: string) {
-    const gem = new GoogleGenerativeAI(Bun.env["GEMINI_API_KEY"]!);
-    this.model = gem.getGenerativeModel({
-      model,
-      generationConfig: { maxOutputTokens: RESPONSE_LENGTH },
-    });
+  createChat({ name, history }: { name?: string; history?: Content[] }) {
+    const chat = this.api.chats.create({ model: this.model, history });
+    this.chats.set(name ? name : Date.now().toString(), chat);
   }
 
-  private mapMessages(input: ChatMessage[]): Content[] {
-    return input.map((m) => ({
-      role: m.author === "gemini" ? "model" : "user",
-      parts: [{ text: m.text }],
-    }));
+  async followChat(name: string, message: string): AsyncRes<string> {
+    const chat = this.chats.get(name);
+    if (!chat) return { error: "no chat with that name" };
+    else {
+      const response = await chat.sendMessage({ message });
+      const text = response.text;
+      return { ok: text || "" };
+    }
   }
-
-  private mapMessagesR1(input: ChatMessage[]): Content[] {
-    return input.reduce((acc: Content[], m, i) => {
-      const prev = acc[i - 1];
-      const role = m.author === "gemini" ? "model" : "user";
-      const msg = { role, parts: [{ text: m.text }] };
-      if (prev?.role === role) acc[i - 1] = msg;
-      else acc = [...acc, msg];
-      return acc;
-    }, []);
+  async followChatStream(
+    name: string,
+    message: string,
+    handler: (data: string) => void,
+  ) {
+    const chat = this.chats.get(name);
+    if (!chat) throw new Error("no chat!");
+    else {
+      const response = await chat.sendMessageStream({ message });
+      for await (const chunk of response) {
+        const text = chunk.text;
+        handler(text || "");
+      }
+    }
   }
 
-  private async apiCall(
-    messages: Content[],
-    isR1: boolean = false,
-  ): Promise<AResult<string[]>> {
+  async send(message: string, systemPrompt?: string): AsyncRes<string> {
     try {
-      const chat = this.model.startChat({ history: messages });
-      const res = await chat.sendMessage("");
-      return { ok: [res.response.text()] };
+      const opts = {
+        model: this.model,
+        contents: message,
+      };
+      const fopts = systemPrompt
+        ? { ...opts, config: { systemInstruction: systemPrompt } }
+        : opts;
+      const response = await this.api.models.generateContent(fopts);
+      return { ok: response.text || "" };
     } catch (e) {
-      console.log(e, "error in gemini api");
       return { error: `${e}` };
     }
   }
 
+  async sendStream(
+    handler: (s: string) => void,
+    message: string,
+    systemPrompt?: string,
+  ) {
+    const opts = {
+      model: this.model,
+      contents: message,
+    };
+    const fopts = systemPrompt
+      ? { ...opts, config: { systemInstruction: systemPrompt } }
+      : opts;
+    const response = await this.api.models.generateContentStream(fopts);
+    for await (const chunk of response) {
+      handler(chunk.text || "");
+    }
+  }
 
-  private async apiCallStream(
-    messages: Content[],
-    handle: (c: any) => void,
-    isR1: boolean = false,
-  ): Promise<void> {
+  async makeImage(prompt: string): AsyncRes<GeneratedImage[]> {
     try {
-      const chat = this.model.startChat({ history: messages });
-      const res = await chat.sendMessage("");
-      // for await (const chunk of res.stream()) {
-      //   handle(chunk.text());
-      // }
+      const response = await this.api.models.generateImages({
+        model: this.model,
+        prompt,
+      });
+      // TODO if empty or undefined return error
+      return { ok: response.generatedImages || [] };
     } catch (e) {
-      console.log(e, "error in gemini api");
-      handle(`Error streaming Gemini, ${e}`);
+      return { error: `${e}` };
     }
   }
-
-  public async send(sys: string, input: ChatMessage[]) {
-    const messages = this.mapMessages(input);
-    const truncated = this.truncateHistory(messages);
-    const res = await this.apiCall(truncated);
-    return res;
-  }
-
-  public async sendR1(input: ChatMessage[]) {
-    const messages = this.mapMessagesR1(input);
-    const truncated = this.truncateHistory(messages);
-    const res = await this.apiCall(truncated, true);
-    return res;
-  }
-
-  public async stream(
-    sys: string,
-    input: ChatMessage[],
-    handle: (c: any) => void,
-  ) {
-    const messages = this.mapMessages(input);
-    const truncated = this.truncateHistory(messages);
-    await this.apiCallStream(truncated, handle);
-  }
-
-  public async streamR1(input: ChatMessage[], handle: (c: any) => void) {
-    const messages = this.mapMessagesR1(input);
-    const truncated = this.truncateHistory(messages);
-    await this.apiCallStream(truncated, handle, true);
-  }
-
-  public async sendDoc(data: ArrayBuffer, mimeType: string, prompt: string) {
-    const res = await this.model.generateContent([
-      {
-        inlineData: {
-          data: Buffer.from(data).toString("base64"),
-          mimeType,
-        },
-      },
-      prompt,
-    ]);
-    return res;
-  }
-
-  private truncateHistory(messages: Content[]): Content[] {
-    const totalTokens = messages.reduce((total, message) => {
-      return total + this.tokenizer(message.parts[0].text || "");
-    }, 0);
-    while (totalTokens > this.maxTokens && messages.length > 1) {
-      messages.splice(0, 1);
+  async makeVideo({
+    prompt,
+    image,
+  }: {
+    prompt?: string;
+    image?: string;
+  }): AsyncRes<GeneratedVideo[]> {
+    try {
+      const response = await this.api.models.generateVideos({
+        model: this.model,
+        prompt,
+      });
+      // TODO if empty or undefined return error
+      return { ok: response.response?.generatedVideos || [] };
+    } catch (e) {
+      return { error: `${e}` };
     }
-    return messages;
   }
 }
+// TODO how to use caches
+// https://ai.google.dev/api/caching
```
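The rewritten `src/gemini.ts` drops the history-truncation machinery and keys chat sessions by name instead. A sketch of how the new surface is presumably meant to be driven (the session name and prompts are invented; the class reads `GEMINI_API_KEY` from `Bun.env` at construction time):

```ts
import GeminiAPI from "./src/gemini";

// Create a named session, then follow it; "notes" is an arbitrary key
// into the class's internal chats map.
const gemini = new GeminiAPI();
gemini.createChat({ name: "notes" });

const res = await gemini.followChat("notes", "Summarize the Odyssey in one sentence.");
if ("error" in res) console.error(res.error);
else console.log(res.ok);

// The streaming variant pushes chunks to a handler as they arrive.
await gemini.followChatStream("notes", "Now do it in French.", (chunk) =>
  process.stdout.write(chunk),
);
```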
```diff
diff --git a/src/gemini2.ts b/src/gemini2.ts
new file mode 100644
index 0000000..291553f
--- /dev/null
+++ b/src/gemini2.ts
@@ -0,0 +1,149 @@
+import {
+  GenerativeModel,
+  GoogleGenerativeAI,
+  type Content,
+  type GenerateContentResult,
+} from "@google/generative-ai";
+import { RESPONSE_LENGTH } from "./logic/constants";
+import type {
+  AIModelAPI,
+  ChatMessage,
+  OChoice,
+  OChunk,
+  OMessage,
+} from "./types";
+import type { AsyncRes } from "sortug";
+
+export default class GeminiAPI implements AIModelAPI {
+  tokenizer: (text: string) => number;
+  maxTokens: number;
+  private model: GenerativeModel;
+
+  constructor(
+    maxTokens = 200_000,
+    tokenizer: (text: string) => number = (text) => text.length / 3,
+    model?: string,
+  ) {
+    this.maxTokens = maxTokens;
+    this.tokenizer = tokenizer;
+
+    const gem = new GoogleGenerativeAI(Bun.env["GEMINI_API_KEY"]!);
+    this.model = gem.getGenerativeModel({
+      // model: model || "gemini-2.0-flash-exp",
+      model: model || "gemini-2.5-pro-preview-05-06 ",
+      generationConfig: { maxOutputTokens: RESPONSE_LENGTH },
+    });
+  }
+
+  public setModel(model: string) {
+    const gem = new GoogleGenerativeAI(Bun.env["GEMINI_API_KEY"]!);
+    this.model = gem.getGenerativeModel({
+      model,
+      generationConfig: { maxOutputTokens: RESPONSE_LENGTH },
+    });
+  }
+  private mapMessages(input: ChatMessage[]): Content[] {
+    return input.map((m) => ({
+      role: m.author === "gemini" ? "model" : "user",
+      parts: [{ text: m.text }],
+    }));
+  }
+
+  private mapMessagesR1(input: ChatMessage[]): Content[] {
+    return input.reduce((acc: Content[], m, i) => {
+      const prev = acc[i - 1];
+      const role = m.author === "gemini" ? "model" : "user";
+      const msg = { role, parts: [{ text: m.text }] };
+      if (prev?.role === role) acc[i - 1] = msg;
+      else acc = [...acc, msg];
+      return acc;
+    }, []);
+  }
+
+  private async apiCall(
+    messages: Content[],
+    isR1: boolean = false,
+  ): Promise<AsyncRes<string[]>> {
+    try {
+      const chat = this.model.startChat({ history: messages });
+      const res = await chat.sendMessage("");
+      return { ok: [res.response.text()] };
+    } catch (e) {
+      console.log(e, "error in gemini api");
+      return { error: `${e}` };
+    }
+  }
+
+  private async apiCallStream(
+    messages: Content[],
+    handle: (c: any) => void,
+    isR1: boolean = false,
+  ): Promise<void> {
+    try {
+      const chat = this.model.startChat({ history: messages });
+      const res = await chat.sendMessage("");
+      // for await (const chunk of res.stream()) {
+      //   handle(chunk.text());
+      // }
+    } catch (e) {
+      console.log(e, "error in gemini api");
+      handle(`Error streaming Gemini, ${e}`);
+    }
+  }
+
+  public async send(sys: string, input: ChatMessage[]) {
+    console.log({ sys, input });
+    this.model.systemInstruction = { role: "system", parts: [{ text: sys }] };
+    const messages = this.mapMessages(input);
+    const truncated = this.truncateHistory(messages);
+    const res = await this.apiCall(truncated);
+    return res;
+  }
+
+  public async sendR1(input: ChatMessage[]) {
+    const messages = this.mapMessagesR1(input);
+    const truncated = this.truncateHistory(messages);
+    const res = await this.apiCall(truncated, true);
+    return res;
+  }
+
+  public async stream(
+    sys: string,
+    input: ChatMessage[],
+    handle: (c: any) => void,
+  ) {
+    this.model.systemInstruction = { role: "system", parts: [{ text: sys }] };
+    const messages = this.mapMessages(input);
+    const truncated = this.truncateHistory(messages);
+    await this.apiCallStream(truncated, handle);
+  }
+
+  public async streamR1(input: ChatMessage[], handle: (c: any) => void) {
+    const messages = this.mapMessagesR1(input);
+    const truncated = this.truncateHistory(messages);
+    await this.apiCallStream(truncated, handle, true);
+  }
+
+  public async sendDoc(data: ArrayBuffer, mimeType: string, prompt: string) {
+    const res = await this.model.generateContent([
+      {
+        inlineData: {
+          data: Buffer.from(data).toString("base64"),
+          mimeType,
+        },
+      },
+      prompt,
+    ]);
+    return res;
+  }
+
+  private truncateHistory(messages: Content[]): Content[] {
+    const totalTokens = messages.reduce((total, message) => {
+      return total + this.tokenizer(message.parts[0].text || "");
+    }, 0);
+    while (totalTokens > this.maxTokens && messages.length > 1) {
+      messages.splice(0, 1);
+    }
+    return messages;
+  }
+}
```
```diff
diff --git a/src/model.ts b/src/generic.ts
index 39b42dc..50c4435 100644
--- a/src/model.ts
+++ b/src/generic.ts
@@ -1,29 +1,30 @@
 import OpenAI from "openai";
 import { MAX_TOKENS, RESPONSE_LENGTH } from "./logic/constants";
-import type { AResult, ChatMessage, OChoice } from "./types";
+import type { AIModelAPI, ChatMessage, OChoice } from "./types";
+import type { AsyncRes } from "sortug";
 
 type Message = OpenAI.Chat.Completions.ChatCompletionMessageParam;
 type Props = {
   baseURL: string;
   apiKey: string;
-  model: string;
+  model?: string;
   maxTokens?: number;
   tokenizer?: (text: string) => number;
 };
-export default class Conversation {
+export default class OpenAIAPI implements AIModelAPI {
   private apiKey;
   private baseURL;
-  private maxTokens: number = MAX_TOKENS;
-  private tokenizer: (text: string) => number = (text) => text.length / 3;
   private api;
-  private model;
+  maxTokens: number = MAX_TOKENS;
+  tokenizer: (text: string) => number = (text) => text.length / 3;
+  model;
 
   constructor(props: Props) {
     this.apiKey = props.apiKey;
     this.baseURL = props.baseURL;
     this.api = new OpenAI({ baseURL: this.baseURL, apiKey: this.apiKey });
-    this.model = props.model;
+    this.model = props.model || "";
     if (props.maxTokens) this.maxTokens = props.maxTokens;
     if (props.tokenizer) this.tokenizer = props.tokenizer;
   }
@@ -36,7 +37,7 @@ export default class Conversation {
     });
   }
 
-  public async send(sys: string, input: ChatMessage[]): AResult<string[]> {
+  public async send(sys: string, input: ChatMessage[]): AsyncRes<string[]> {
     const messages = this.mapMessages(input);
     const sysMsg: Message = { role: "system", content: sys };
     const allMessages = [sysMsg, ...messages];
@@ -44,12 +45,15 @@
     const truncated = this.truncateHistory(allMessages);
     const res = await this.apiCall(truncated);
     if ("error" in res) return res;
-    else
+    else {
       try {
-        return { ok: res.ok.map((c) => c.message.content!) };
+        // TODO type this properly
+        const choices: OChoice[] = res.ok;
+        return { ok: choices.map((c) => c.message.content!) };
       } catch (e) {
         return { error: `${e}` };
       }
+    }
   }
 
   public async stream(
@@ -77,7 +81,7 @@
   }
 
   // TODO custom temperature?
-  private async apiCall(messages: Message[]): AResult<OChoice[]> {
+  private async apiCall(messages: Message[]): AsyncRes<OChoice[]> {
     console.log({ messages }, "at the very end");
     try {
       const completion = await this.api.chat.completions.create({
```

```diff
diff --git a/src/nlp/index.ts b/src/nlp/index.ts
new file mode 100644
index 0000000..ebed586
--- /dev/null
+++ b/src/nlp/index.ts
@@ -0,0 +1,7 @@
+import * as Spacy from "./spacy";
+import * as Stanza from "./stanza";
+import * as ISO from "./iso";
+import { ocr } from "./ocr";
+import type * as Types from "./types";
+export * from "./nlp";
+export { ISO, ocr, Stanza, Spacy, type Types };
```

```diff
diff --git a/src/nlp/iso.ts b/src/nlp/iso.ts
new file mode 100644
index 0000000..3e60850
--- /dev/null
+++ b/src/nlp/iso.ts
@@ -0,0 +1,10 @@
+import { franc, francAll } from "franc-all";
+import { iso6393To1 } from "iso-639-3";
+export { iso6393, iso6393To1, iso6393To2B, iso6393To2T } from "iso-639-3";
+export * as BCP47 from "bcp-47";
+
+export function detectLang(text: string) {
+  const iso3 = franc(text);
+  const iso1 = iso6393To1[iso3];
+  return iso1 ? iso1 : iso3;
+}
```
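`detectLang` chains franc's ISO 639-3 guess through the 639-3 to 639-1 mapping, falling back to the raw 639-3 code. Roughly (franc is statistical, so the outputs in the comments are approximate, and short inputs degrade to `und`):

```ts
import { detectLang } from "./src/nlp/iso";

// franc is trigram-based: long, distinctive text is reliable,
// strings below its minimum length come back as "und" (undetermined).
console.log(detectLang("The quick brown fox jumps over the lazy dog")); // "en"
console.log(detectLang("El veloz murciélago hindú comía feliz cardillo y kiwi")); // "es"
console.log(detectLang("hi")); // "und"
```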
```diff
diff --git a/src/nlp/nlp.ts b/src/nlp/nlp.ts
new file mode 100644
index 0000000..3b1e3a7
--- /dev/null
+++ b/src/nlp/nlp.ts
@@ -0,0 +1,208 @@
+export const isPunctuation = (text: string): boolean => {
+  // Common punctuation characters
+  const punctuationRegex = /^[.,;:!?()[\]{}'"«»""''…-]+$/;
+  return punctuationRegex.test(text);
+};
+
+// Get color for different syntactic categories
+export function getColorForType(type: string): string {
+  const colors: Record<string, string> = {
+    // Phrasal categories
+    S: "#6495ED", // Sentence - cornflower blue
+    NP: "#FF7F50", // Noun Phrase - coral
+    VP: "#32CD32", // Verb Phrase - lime green
+    PP: "#9370DB", // Prepositional Phrase - medium purple
+    ADJP: "#FFD700", // Adjective Phrase - gold
+    ADVP: "#FF69B4", // Adverb Phrase - hot pink
+
+    // Part-of-speech tags
+    NN: "#FFA07A", // Noun - light salmon
+    NNS: "#FFA07A", // Plural Noun - light salmon
+    NNP: "#FFA07A", // Proper Noun - light salmon
+    VB: "#90EE90", // Verb - light green
+    VBP: "#90EE90", // Present tense verb - light green
+    VBG: "#90EE90", // Gerund verb - light green
+    VBZ: "#90EE90", // 3rd person singular present verb - light green
+    VBD: "#90EE90", // Past tense verb - light green
+    VBN: "#90EE90", // Past participle verb - light green
+    JJ: "#F0E68C", // Adjective - khaki
+    RB: "#DDA0DD", // Adverb - plum
+    IN: "#87CEFA", // Preposition - light sky blue
+    DT: "#D3D3D3", // Determiner - light gray
+    PRP: "#D8BFD8", // Personal pronoun - thistle
+    CC: "#A9A9A9", // Coordinating conjunction - dark gray
+
+    // Default
+    ROOT: "#000000", // Root - black
+    LEAF: "#666666", // Leaf nodes - dark gray
+  };
+
+  return colors[type] || "#666666";
+}
+
+// Get a description for node types
+export function getDescription(type: string): string {
+  const descriptions: Record<string, string> = {
+    S: "Sentence",
+    SBAR: "Subordinating conjunction clause",
+    SBARQ: "Direct question",
+    SINV: "Declarative sentence with subject-aux inversion",
+    SQ: "Subconstituent of SBARQ excluding wh-word",
+    WHADVP: "wh-adverb phrase",
+    WHNP: "wh-nounphrase",
+    WHPP: "wh-prepositional phrase",
+    WDT: "wh-determiner",
+    WP: "wh-pronoun",
+    WRB: "wh-adverb",
+    WP$: "possesive wh-pronoun",
+    MD: "modal",
+    X: "Unknown",
+    NP: "Noun Phrase",
+    VP: "Verb Phrase",
+    PP: "Prepositional Phrase",
+    ADJP: "Adjective Phrase",
+    ADVP: "Adverb Phrase",
+    LS: "List item market",
+    SYM: "Symbol",
+    NN: "Noun",
+    NNS: "Plural Noun",
+    NNP: "Proper Noun",
+    NNPS: "Proper Noun, Plural",
+    VB: "Verb (base form)",
+    VBP: "Verb (present tense)",
+    VBG: "Verb (gerund/present participle)",
+    VBZ: "Verb (3rd person singular present)",
+    VBD: "Verb (past tense)",
+    VBN: "Verb (past participle)",
+    JJ: "Adjective",
+    JJR: "Adjective, comparative",
+    JJS: "Adjective, superlative",
+    EX: "Existential there",
+    RB: "Adverb",
+    RBR: "Adverb, comparative",
+    RBS: "Adverb, superlative",
+    RP: "Particle",
+    IN: "Preposition",
+    TO: "to",
+    DT: "Determiner",
+    PDT: "Predeterminer",
+    PRP: "Personal Pronoun",
+    PP$: "Possesive Pronoun",
+    PRP$: "Possesive Pronoun",
+    POS: "Possesive ending",
+    FW: "Foreign Word",
+    CC: "Coordinating Conjunction",
+    CD: "Cardinal number",
+    UH: "interjection",
+    ROOT: "Root Node",
+    CLR: "figurative motion",
+    FRAG: "fragment",
+    ":": "Colon/Semicolon",
+    ",": "Comma",
+    ".": "Period",
+  };
+
+  return descriptions[type] || type;
+}
+
+// https://universaldependencies.org/u/dep/xcomp.htmlexport
+
+export function unpackDeprel(type: string): string {
+  const descriptions: Record<string, string> = {
+    nsubj: "nominal subject",
+    obj: "object",
+    iobj: "indirect object",
+    csubj: "clausal subject",
+    ccomp: "clausal complement",
+    xcomp: "open clausal complement",
+    obl: "oblique nominal",
+    vocative: "vocative",
+    expl: "expletive",
+    dislocated: "dislocated",
+    nmod: "nominal modifier",
+    appos: "appositional modifier",
+    nummod: "numeric modifier",
+    advcl: "adverbial clause modifier",
+    acl: "admonimal clause",
+    advmod: "adverbial modifier",
+    discourse: "dicourse element",
+    aux: "auxiliary",
+    cop: "copula",
+    mark: "marker",
+    amod: "adjectival modifier",
+    det: "determiner",
+    clf: "classifier",
+    case: "case marker",
+    conj: "conjunction",
+    cc: "coordinating conjunction",
+    fixed: "fixed multiword expression",
+    flat: "flat expression",
+    list: "list",
+    parataxis: "parataxis",
+    compound: "compound",
+    orphan: "orphan",
+    goeswith: "goes with",
+    reparandum: "overriden disfluency",
+    punct: "punctuation",
+    root: "root",
+    dep: "unspecified dependency",
+  };
+  const res = descriptions[type];
+  if (!res) console.log("tag not found!!", type);
+
+  return res || type;
+}
+
+export function deprelColors(type: string): string {
+  const colors: Record<string, string> = {
+    // Phrasal categories
+    s: "#6495ED", // Sentence - cornflower blue
+    nsubj: "#6495ED", // Sentence - cornflower blue
+    root: "#FFD700", // Adjective Phrase - gold
+    p: "#FFD700", // Adjective Phrase - gold
+    NP: "#FF7F50", // Noun Phrase - coral
+    VP: "#32CD32", // Verb Phrase - lime green
+    PP: "#9370DB", // Prepositional Phrase - medium purple
+    ADVP: "#FF69B4", // Adverb Phrase - hot pink
+
+    // Part-of-speech tags
+    NN: "#FFA07A", // Noun - light salmon
+    NNS: "#FFA07A", // Plural Noun - light salmon
+    NNP: "#FFA07A", // Proper Noun - light salmon
+    VB: "#90EE90", // Verb - light green
+    VBP: "#90EE90", // Present tense verb - light green
+    VBG: "#90EE90", // Gerund verb - light green
+    VBZ: "#90EE90", // 3rd person singular present verb - light green
+    VBD: "#90EE90", // Past tense verb - light green
+    VBN: "#90EE90", // Past participle verb - light green
+    JJ: "#F0E68C", // Adjective - khaki
+    RB: "#DDA0DD", // Adverb - plum
+    IN: "#87CEFA", // Preposition - light sky blue
+    DT: "#D3D3D3", // Determiner - light gray
+    PRP: "#D8BFD8", // Personal pronoun - thistle
+    CC: "#A9A9A9", // Coordinating conjunction - dark gray
+
+    // Default
+    ROOT: "#000000", // Root - black
+    LEAF: "#666666", // Leaf nodes - dark gray
+  };
+
+  return colors[type] || "#666666";
+}
+export function unpackPos(pos: string): string {
+  const map: Record<string, string> = {
+    adj: "adjective",
+    adv: "adverb",
+    adv_phrase: "adverbial phrase",
+    combining_form: "combining form",
+    conj: "conjunction",
+    det: "determinant",
+    intj: "interjection",
+    num: "number",
+    prep: "preposition",
+    prep_phrase: "prepositional phrase",
+    pron: "pronoun",
+    punct: "punctuation",
+  };
+  return map[pos] || pos;
+}
```

```diff
diff --git a/src/nlp/ocr.ts b/src/nlp/ocr.ts
new file mode 100644
index 0000000..1c40355
--- /dev/null
+++ b/src/nlp/ocr.ts
@@ -0,0 +1,18 @@
+import type { AsyncRes } from "sortug";
+
+export async function ocr(formData: FormData): AsyncRes<string[]> {
+  const endpoint = "http://localhost:8102/ocr";
+
+  const opts = {
+    method: "POST",
+    body: formData,
+    headers: { "X-API-KEY": Bun.env.SORTUG_NLP_API_KEY! },
+  };
+  try {
+    const res = await fetch(endpoint, opts);
+    const j = await res.json();
+    return { ok: j };
+  } catch (e) {
+    return { error: `${e}` };
+  }
+}
```
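Calling the OCR wrapper from Bun might look like the following; the multipart field name (`file`) and the response being one string per recognized block are assumptions, since the server side isn't in this commit:

```ts
import { ocr } from "./src/nlp/ocr";

// Assumes the local service expects the image under a "file" field and
// that SORTUG_NLP_API_KEY is set; both are guesses about the server.
const form = new FormData();
form.append("file", Bun.file("page-scan.png"));

const res = await ocr(form);
if ("error" in res) console.error(res.error);
else console.log(res.ok.join("\n"));
```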
```diff
diff --git a/src/nlp/spacy.ts b/src/nlp/spacy.ts
new file mode 100644
index 0000000..d79de55
--- /dev/null
+++ b/src/nlp/spacy.ts
@@ -0,0 +1,79 @@
+import type { AsyncRes, Result } from "sortug";
+import { detectLang } from "./iso";
+const ENDPOINT = "http://localhost:8102";
+
+export async function run(text: string, langg?: string): AsyncRes<SpacyRes> {
+  try {
+    const lang = langg ? langg : detectLang(text);
+    const body = JSON.stringify({ string: text, lang });
+    const opts = {
+      headers: {
+        "Content-type": "application/json",
+        "X-API-KEY": Bun.env.SORTUG_NLP_API_KEY!,
+      },
+      method: "POST",
+      body,
+    };
+    const res = await fetch(ENDPOINT + "/spacy", opts);
+    const j = await res.json();
+    console.log("spacy", j);
+    return { ok: j };
+  } catch (e) {
+    return { error: `${e}` };
+  }
+}
+
+export type SpacyResBig = {
+  doc: {
+    text: string;
+    ents: any[];
+    sents: Array<{ start: number; end: number }>;
+    tokens: Token[];
+  };
+  segs: Sentence[];
+};
+export type SpacyRes = {
+  input: string;
+  segments: Sentence[];
+};
+export type Sentence = {
+  text: string;
+  start: number;
+  end: number;
+  root: Token;
+  subj: Token;
+  arcs: Arc[];
+  words: Word[];
+};
+export type Arc = {
+  start: number;
+  end: number;
+  label: string; // deprel label
+  dir: string;
+};
+export type Token = {
+  id: number;
+  head: number;
+  start: number;
+  end: number;
+  dep: string;
+  lemma: string;
+  morph: string;
+  pos: string;
+  tag: string;
+  text: string;
+};
+
+export interface Word extends Token {
+  ancestors: number[];
+  children: [];
+  n_lefts: number;
+  n_rights: number;
+  left_edge: number;
+  right_edge: number;
+  morph_map: Record<string, string>;
+}
+
+export function isChild(w: Word, topId: number): boolean {
+  return w.id === topId || w.ancestors.includes(topId);
+}
```
```diff
diff --git a/src/nlp/stanza.ts b/src/nlp/stanza.ts
new file mode 100644
index 0000000..5836b91
--- /dev/null
+++ b/src/nlp/stanza.ts
@@ -0,0 +1,210 @@
+import type { AsyncRes, Result } from "sortug";
+import { detectLang } from "./iso";
+
+const ENDPOINT = "http://localhost:8102";
+export async function segmenter(
+  text: string,
+  langg?: string,
+): AsyncRes<StanzaRes> {
+  try {
+    const lang = langg ? langg : detectLang(text);
+    const body = JSON.stringify({ lang, string: text });
+    const opts = {
+      headers: {
+        "Content-type": "application/json",
+        "X-API-KEY": Bun.env.SORTUG_NLP_API_KEY!,
+      },
+      method: "POST",
+      body,
+    };
+    const res = await fetch(ENDPOINT + "/stanza", opts);
+    const j = await res.json();
+    return { ok: j };
+  } catch (e) {
+    return { error: `${e}` };
+  }
+}
+export async function idLang(text: string) {
+  try {
+    const body = JSON.stringify({ string: text });
+    const opts = {
+      headers: {
+        "Content-type": "application/json",
+        "X-API-KEY": Bun.env.SORTUG_NLP_API_KEY!,
+      },
+      method: "POST",
+      body,
+    };
+    const res = await fetch(ENDPOINT + "/detect-lang", opts);
+    const j = await res.json();
+    return { ok: j };
+  } catch (e) {
+    return { error: `${e}` };
+  }
+}
+export type StanzaRes = { input: string; segments: Sentence[] };
+export type Sentence = {
+  text: string;
+  sentiment: number;
+  constituency: TreeNode;
+  constring: string;
+  dependencies: Dependency[];
+  entities: Entity[];
+  tokens: Token[];
+  words: Word[];
+};
+export type TreeNode = {
+  label: string;
+  children: TreeNode[];
+};
+export type Dependency = Array<[Word, string, Word]>;
+export type Word = {
+  id: number;
+  text: string;
+  lemma: string;
+  upos: string;
+  xpos: string;
+  feats: string;
+  head: number;
+  deprel: string;
+  start_char: number;
+  end_char: number;
+};
+export type Token = {
+  id: [number, number];
+  text: string;
+  misc: string;
+  words: Word[];
+  start_char: number;
+  end_char: number;
+  ner: string;
+};
+export type Entity = {
+  text: string;
+  misc: string;
+  start_char: number;
+  end_char: number;
+  type: string;
+};
+
+// mine
+export type Clause = {
+  words: Word[];
+  dependency: Dependency;
+  text: string;
+};
+// "amod",
+// {
+//   "id": 1,
+//   "text": "Stony",
+//   "lemma": "Stony",
+//   "upos": "ADJ",
+//   "xpos": "NNP",
+//   "feats": "Degree=Pos",
+//   "head": 3,
+//   "deprel": "amod",
+//   "start_char": 0,
+//   "end_char": 5
+// }
+//
+//
+
+export interface ParsedGrammar {
+  predicateCore: number;
+  subjectCore: number | null;
+  tree: Record<number, number[]>;
+  wordMap: WordMap;
+  words: BigWord[];
+}
+export interface BigWord extends Word {
+  ancestry: number[];
+  component: "s" | "p" | "u";
+}
+export type ComputedDependency = {
+  word: BigWord;
+  children: ComputedDependency[];
+};
+export type WordMap = Record<number, Word>;
+
+export function buildTreeFromWords(words: Word[]): Result<ParsedGrammar> {
+  const roots = words.filter((w) => w.deprel === "root");
+  if (roots.length > 1) {
+    console.log("roots", roots);
+    return { error: "too many roots" };
+  } else if (roots.length === 0) {
+    return { error: "no roots" };
+  } else {
+    const root = roots[0];
+    const wordmap = words.reduce((acc: WordMap, item) => {
+      acc[item.id] = item;
+      return acc;
+    }, {});
+    return { ok: parseFurther(words, wordmap, root) };
+  }
+}
+function parseFurther(
+  words: Word[],
+  wordMap: WordMap,
+  root: Word,
+): ParsedGrammar {
+  const predicateCore = root.id;
+  let subjectCore: number | null = null;
+  const tree: Record<number, number[]> = {};
+  const bigwords: BigWord[] = [];
+  const getAncestry = (parent: Word): number[] => {
+    const kids = tree[parent.head] || [];
+    tree[parent.head] = [...kids, parent.id];
+    if (parent.deprel === "nsubj") subjectCore = parent.id;
+
+    console.log("getting ancestry " + parent.id, parent.text);
+    const grandpa = wordMap[parent.head];
+    if (!grandpa) return [parent.id];
+    else return [parent.id, ...getAncestry(grandpa)];
+  };
+  let idx = 0;
+  for (const w of words) {
+    if (w.deprel === "punct") {
+      const prev = words[idx - 1];
+      if (!prev) continue;
+      prev.text += w.text;
+      continue;
+    }
+    const parent = wordMap[w.head];
+    if (!parent) tree[w.id] = [];
+    const ancestry = !parent ? [] : getAncestry(parent);
+    const component =
+      subjectCore && (w.id === subjectCore || ancestry.includes(subjectCore))
+        ? "s"
+        : w.id === predicateCore || ancestry.includes(root.id)
+          ? "p"
+          : "u";
+    const bw: BigWord = { ...w, component, ancestry };
+    wordMap[w.id] = bw;
+    bigwords.push(bw);
+    idx++;
+  }
+  const pg: ParsedGrammar = {
+    predicateCore,
+    subjectCore,
+    wordMap,
+    tree,
+    words: bigwords,
+  };
+  return pg;
+}
+
+export function oneDescendant(node: TreeNode): boolean {
+  if (node.children.length !== 1) return false;
+  else {
+    const child = node.children[0];
+    return child.children.length === 0;
+  }
+}
+
+// function findChildren(wordmap: WordMap, word: Word): ComputedDependency {
+//   const children = words.filter((w) => w.head === head.id);
+//   return {
+//     word: head,
+//     children: children.map((c) => findChildren(words, c)),
+//   };
+// }
```

```diff
diff --git a/src/nlp/types.ts b/src/nlp/types.ts
new file mode 100644
index 0000000..605a637
--- /dev/null
+++ b/src/nlp/types.ts
@@ -0,0 +1,50 @@
+export type ViewLevel =
+  | "text"
+  | "paragraph"
+  | "sentence"
+  | "clause"
+  | "word"
+  | "syllable"
+  | "phoneme";
+export interface ViewState {
+  level: ViewLevel;
+  pIndex: number | null;
+  sIndex: number | null;
+  cIndex: number | null;
+  wIndex: number | null;
+  yIndex: number | null;
+  fIndex: number | null;
+}
+
+export interface ViewProps {
+  idx: number;
+  rawText: string;
+  context: Context;
+}
+export type Context = {
+  parentText: string;
+  segmented: string[];
+  idx: number;
+};
+
+export type WordData = {
+  confidence: number;
+  frequency: number | null;
+  id: number;
+  ipa: Array<{ ipa: string; tags: string[] }>;
+  spelling: string;
+  type: ExpressionType;
+  syllables: number;
+  lang: string;
+  prosody: any;
+  senses: Sense[];
+};
+export type ExpressionType = "word" | "expression" | "syllable";
+export type Sense = {
+  etymology: string;
+  pos: string;
+  forms: Array<{ form: string; tags: string[] }>;
+  related: any;
+  senses: Array<{ glosses: string[]; links: Array<[string, string]> }>;
+};
+export type LoadingStatus = "pending" | "loading" | "success" | "error";
```
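The pure-logic piece in `stanza.ts` is `buildTreeFromWords`, which walks `head` pointers upward to tag each word as subject (`s`), predicate (`p`), or unassigned (`u`). A hand-built trace (the `Word` records are fabricated stand-ins for real Stanza output):

```ts
import { buildTreeFromWords, type Word } from "./src/nlp/stanza";

// Stand-in parse of "The birds sing": "sing" is the root,
// "birds" its nominal subject, "The" a determiner under "birds".
const words: Word[] = [
  { id: 1, text: "The", lemma: "the", upos: "DET", xpos: "DT", feats: "",
    head: 2, deprel: "det", start_char: 0, end_char: 3 },
  { id: 2, text: "birds", lemma: "bird", upos: "NOUN", xpos: "NNS", feats: "",
    head: 3, deprel: "nsubj", start_char: 4, end_char: 9 },
  { id: 3, text: "sing", lemma: "sing", upos: "VERB", xpos: "VBP", feats: "",
    head: 0, deprel: "root", start_char: 10, end_char: 14 },
];

const parsed = buildTreeFromWords(words);
if ("error" in parsed) {
  console.error(parsed.error); // "no roots" or "too many roots"
} else {
  const g = parsed.ok;
  console.log(g.predicateCore); // 3 — the root word's id
  console.log(g.subjectCore); // 2 — discovered while walking ancestors
  console.log(g.words.map((w) => `${w.text}:${w.component}`));
  // [ "The:s", "birds:s", "sing:p" ]
}
```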
```diff
diff --git a/src/openai.ts b/src/openai.ts
index 2e15dcf..12939bc 100644
--- a/src/openai.ts
+++ b/src/openai.ts
@@ -1,14 +1,8 @@
 import fs from "fs";
 import OpenAI from "openai";
 import { RESPONSE_LENGTH } from "./logic/constants";
-import type {
-  AResult,
-  ChatMessage,
-  OChoice,
-  OChunk,
-  OMessage,
-  Result,
-} from "./types";
+import type { ChatMessage, OChoice, OChunk, OMessage } from "./types";
+import type { AsyncRes, Result } from "sortug";
 import OpenAIToolUse from "./openai_tools";
 import type { FileObject } from "openai/src/resources/files.js";
 
@@ -26,7 +20,7 @@ export default class Conversation {
   private baseURL: string = "https://api.openai.com/v1";
   private tokenizer: (text: string) => number = (text) => text.length / 3;
   openai;
-  private model: string = "chatgpt-4o-latest";
+  private model: string = "gpt-4.1";
 
   constructor(props: Props) {
     if (props.apiKey) this.apiKey = props.apiKey;
@@ -56,7 +50,7 @@ export default class Conversation {
     }, []);
   }
 
-  public async send(sys: string, input: ChatMessage[]): AResult<OChoice[]> {
+  public async send(sys: string, input: ChatMessage[]): AsyncRes<OChoice[]> {
     const messages = this.mapMessages(input);
     const sysMsg: Message = { role: "system", content: sys };
     const allMessages = [sysMsg, ...messages];
@@ -65,7 +59,7 @@
     return res;
   }
 
-  public async sendR1(input: ChatMessage[]): AResult<OChoice[]> {
+  public async sendR1(input: ChatMessage[]): AsyncRes<OChoice[]> {
     const messages = this.mapMessagesR1(input);
     const truncated = this.truncateHistory(messages);
     const res = await this.apiCall(truncated);
@@ -102,7 +96,7 @@
     return messages;
   }
 
-  private async apiCall(messages: Message[]): AResult<OChoice[]> {
+  private async apiCall(messages: Message[]): AsyncRes<OChoice[]> {
     try {
       const completion = await this.openai.chat.completions.create({
         temperature: 1.3,
```

```diff
diff --git a/src/types/index.ts b/src/types/index.ts
index 97be443..b276457 100644
--- a/src/types/index.ts
+++ b/src/types/index.ts
@@ -1,15 +1,29 @@
 import type OpenAI from "openai";
+import type { AsyncRes } from "sortug";
 export type ChatMessage = {
   author: string;
   text: string;
   sent: number;
   reasoning?: string;
 };
-export type Result<T> = { ok: T } | { error: string };
-export type AResult<T> = Promise<{ ok: T } | { error: string }>;
 
 // openai
 export type OChoice = OpenAI.Chat.Completions.ChatCompletion.Choice;
 export type OChunk = OpenAI.Chat.Completions.ChatCompletionChunk.Choice;
 export type OMessage = OpenAI.Chat.Completions.ChatCompletionMessageParam;
 export type ContentType = { text: string } | { audio: Response };
+export type AIModelChoice =
+  | { name: "deepseek" | "chatgpt" | "claude" | "gemini" | "grok" }
+  | { other: { baseURL: string; apiKey: string } };
+export interface AIModelAPI {
+  setModel: (model: string) => void;
+  tokenizer: (text: string) => number;
+  maxTokens: number;
+
+  send: (systemPrompt: string, input: ChatMessage[]) => AsyncRes<string[]>;
+  stream: (
+    systemPrompt: string,
+    input: ChatMessage[],
+    handler: (data: any) => void,
+  ) => void;
+}
```
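The new `AIModelAPI` interface is what ties the commit together: `ClaudeAPI`, the new `GeminiAPI`, and the generic `OpenAIAPI` can now sit behind one type. A sketch of the intended polymorphism (the DeepSeek base URL is illustrative, not configured anywhere in this commit):

```ts
import type { AIModelAPI, ChatMessage } from "./src/types";
import ClaudeAPI from "./src/claude";
import OpenAIAPI from "./src/generic";

// Pick a backend at runtime; everything after the branch only sees
// the AIModelAPI interface.
function pickModel(choice: "claude" | "other"): AIModelAPI {
  if (choice === "claude") return new ClaudeAPI();
  return new OpenAIAPI({
    baseURL: "https://api.deepseek.com/v1", // hypothetical example
    apiKey: Bun.env.DEEPSEEK_API_KEY!,
  });
}

const api = pickModel("claude");
const history: ChatMessage[] = [
  { author: "user", text: "Name three rivers.", sent: Date.now() },
];
const res = await api.send("You are terse.", history);
if ("error" in res) console.error(res.error);
else console.log(res.ok.join("\n"));
```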
