| author | polwex <polwex@sortug.com> | 2025-07-23 02:37:15 +0700 |
|---|---|---|
| committer | polwex <polwex@sortug.com> | 2025-07-23 02:37:15 +0700 |
| commit | 42dd99bfac9777a4ecc6700b87edf26a5c984de6 | |
| tree | 031e45d187f45def4b58ad7590d39dec3924600d /src | |
| parent | 4c6913644b362b28f15b125c2fbe48165f1e048c | |
checkpoint
Diffstat (limited to 'src')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | src/claude.ts | 22 |
| -rw-r--r-- | src/gemini.ts | 207 |
| -rw-r--r-- | src/gemini2.ts | 149 |
| -rw-r--r-- | src/generic.ts (renamed from src/model.ts) | 26 |
| -rw-r--r-- | src/nlp/index.ts | 7 |
| -rw-r--r-- | src/nlp/iso.ts | 10 |
| -rw-r--r-- | src/nlp/nlp.ts | 208 |
| -rw-r--r-- | src/nlp/ocr.ts | 18 |
| -rw-r--r-- | src/nlp/spacy.ts | 79 |
| -rw-r--r-- | src/nlp/stanza.ts | 210 |
| -rw-r--r-- | src/nlp/types.ts | 50 |
| -rw-r--r-- | src/openai.ts | 18 |
| -rw-r--r-- | src/types/index.ts | 18 |
13 files changed, 885 insertions, 137 deletions
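A note before the per-file diffs: the common thread below is replacing the locally defined `AResult` type with `AsyncRes` from the `sortug` package, plus a new `AIModelAPI` interface. sortug's actual definition is not part of this repository; the `AResult<T>` alias deleted from `src/types/index.ts` at the bottom of this diff suggests a stand-in like:

```ts
// Stand-in for sortug's result types, inferred from the AResult<T> alias
// this commit deletes from src/types/index.ts; the package's real
// exports may differ.
export type Result<T> = { ok: T } | { error: string };
export type AsyncRes<T> = Promise<Result<T>>;
```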
```diff
diff --git a/src/claude.ts b/src/claude.ts
index 377316e..2a56bc1 100644
--- a/src/claude.ts
+++ b/src/claude.ts
@@ -1,20 +1,30 @@
 import Claude from "@anthropic-ai/sdk";
 import { RESPONSE_LENGTH } from "./logic/constants";
-import type { AResult, ChatMessage, OChoice, OChunk, OMessage } from "./types";
+import type {
+  AIModelAPI,
+  ChatMessage,
+  OChoice,
+  OChunk,
+  OMessage,
+} from "./types";
 import { BOOKWORM_SYS } from "./prompts";
+import type { AsyncRes } from "sortug";
 
 type Message = Claude.Messages.MessageParam;
 
-export default class Conversation {
-  private tokenizer: (text: string) => number;
-  private maxTokens: number;
-  model: string = "claude-3-5-sonnet-20241022";
+export default class ClaudeAPI implements AIModelAPI {
+  private model: string = "claude-3-7-sonnet-20250219";
+  tokenizer: (text: string) => number;
+  maxTokens: number;
+  // model: string = "claude-3-5-sonnet-20241022";
   constructor(
     maxTokens = 200_000,
     tokenizer: (text: string) => number = (text) => text.length / 3,
+    model?: string,
   ) {
     this.maxTokens = maxTokens;
     this.tokenizer = tokenizer;
+    if (model) this.model = model;
   }
   public setModel(model: string) {
     this.model = model;
@@ -101,7 +111,7 @@ export default class Conversation {
     system: string,
     messages: Message[],
     isR1: boolean = false,
-  ): Promise<AResult<string[]>> {
+  ): Promise<AsyncRes<string[]>> {
     try {
       const claud = new Claude();
       // const list = await claud.models.list();
```

```diff
diff --git a/src/gemini.ts b/src/gemini.ts
index 2f685a2..3e636c2 100644
--- a/src/gemini.ts
+++ b/src/gemini.ts
@@ -1,137 +1,132 @@
 import {
-  GenerativeModel,
-  GoogleGenerativeAI,
+  Chat,
+  GoogleGenAI,
   type Content,
-  type GenerateContentResult,
-} from "@google/generative-ai";
+  type GeneratedImage,
+  type GeneratedVideo,
+} from "@google/genai";
 import { RESPONSE_LENGTH } from "./logic/constants";
-import type { AResult, ChatMessage, OChoice, OChunk, OMessage } from "./types";
+import type {
+  AIModelAPI,
+  ChatMessage,
+  OChoice,
+  OChunk,
+  OMessage,
+} from "./types";
+import type { AsyncRes } from "sortug";
 
-export default class Conversation {
-  private tokenizer: (text: string) => number;
-  private maxTokens: number;
-  private model: GenerativeModel;
+export default class GeminiAPI {
+  tokenizer: (text: string) => number;
+  maxTokens: number;
+  private model: string;
+  api: GoogleGenAI;
+  chats: Map<string, Chat> = new Map<string, Chat>();
 
   constructor(
     maxTokens = 200_000,
     tokenizer: (text: string) => number = (text) => text.length / 3,
+    model?: string,
   ) {
     this.maxTokens = maxTokens;
     this.tokenizer = tokenizer;
-    const gem = new GoogleGenerativeAI(Bun.env["GEMINI_API_KEY"]!);
-    this.model = gem.getGenerativeModel({
-      model: "gemini-2.0-flash-exp",
-      generationConfig: { maxOutputTokens: RESPONSE_LENGTH },
-    });
+    const gem = new GoogleGenAI({ apiKey: Bun.env["GEMINI_API_KEY"]! });
+    this.api = gem;
+    this.model = model || "gemini-2.5-pro-preview-05-06 ";
   }
 
-  public setModel(model: string) {
-    const gem = new GoogleGenerativeAI(Bun.env["GEMINI_API_KEY"]!);
-    this.model = gem.getGenerativeModel({
-      model,
-      generationConfig: { maxOutputTokens: RESPONSE_LENGTH },
-    });
+  createChat({ name, history }: { name?: string; history?: Content[] }) {
+    const chat = this.api.chats.create({ model: this.model, history });
+    this.chats.set(name ? name : Date.now().toString(), chat);
   }
 
-  private mapMessages(input: ChatMessage[]): Content[] {
-    return input.map((m) => ({
-      role: m.author === "gemini" ? "model" : "user",
-      parts: [{ text: m.text }],
-    }));
+  async followChat(name: string, message: string): AsyncRes<string> {
+    const chat = this.chats.get(name);
+    if (!chat) return { error: "no chat with that name" };
+    else {
+      const response = await chat.sendMessage({ message });
+      const text = response.text;
+      return { ok: text || "" };
+    }
   }
-
-  private mapMessagesR1(input: ChatMessage[]): Content[] {
-    return input.reduce((acc: Content[], m, i) => {
-      const prev = acc[i - 1];
-      const role = m.author === "gemini" ? "model" : "user";
-      const msg = { role, parts: [{ text: m.text }] };
-      if (prev?.role === role) acc[i - 1] = msg;
-      else acc = [...acc, msg];
-      return acc;
-    }, []);
+  async followChatStream(
+    name: string,
+    message: string,
+    handler: (data: string) => void,
+  ) {
+    const chat = this.chats.get(name);
+    if (!chat) throw new Error("no chat!");
+    else {
+      const response = await chat.sendMessageStream({ message });
+      for await (const chunk of response) {
+        const text = chunk.text;
+        handler(text || "");
+      }
+    }
   }
 
-  private async apiCall(
-    messages: Content[],
-    isR1: boolean = false,
-  ): Promise<AResult<string[]>> {
+  async send(message: string, systemPrompt?: string): AsyncRes<string> {
     try {
-      const chat = this.model.startChat({ history: messages });
-      const res = await chat.sendMessage("");
-      return { ok: [res.response.text()] };
+      const opts = {
+        model: this.model,
+        contents: message,
+      };
+      const fopts = systemPrompt
+        ? { ...opts, config: { systemInstruction: systemPrompt } }
+        : opts;
+      const response = await this.api.models.generateContent(fopts);
+      return { ok: response.text || "" };
     } catch (e) {
-      console.log(e, "error in gemini api");
       return { error: `${e}` };
     }
   }
 
+  async sendStream(
+    handler: (s: string) => void,
+    message: string,
+    systemPrompt?: string,
+  ) {
+    const opts = {
+      model: this.model,
+      contents: message,
+    };
+    const fopts = systemPrompt
+      ? { ...opts, config: { systemInstruction: systemPrompt } }
+      : opts;
+    const response = await this.api.models.generateContentStream(fopts);
+    for await (const chunk of response) {
+      handler(chunk.text || "");
+    }
+  }
 
-  private async apiCallStream(
-    messages: Content[],
-    handle: (c: any) => void,
-    isR1: boolean = false,
-  ): Promise<void> {
+  async makeImage(prompt: string): AsyncRes<GeneratedImage[]> {
     try {
-      const chat = this.model.startChat({ history: messages });
-      const res = await chat.sendMessage("");
-      // for await (const chunk of res.stream()) {
-      //   handle(chunk.text());
-      // }
+      const response = await this.api.models.generateImages({
+        model: this.model,
+        prompt,
+      });
+      // TODO if empty or undefined return error
+      return { ok: response.generatedImages || [] };
     } catch (e) {
-      console.log(e, "error in gemini api");
-      handle(`Error streaming Gemini, ${e}`);
+      return { error: `${e}` };
     }
   }
-
-  public async send(sys: string, input: ChatMessage[]) {
-    const messages = this.mapMessages(input);
-    const truncated = this.truncateHistory(messages);
-    const res = await this.apiCall(truncated);
-    return res;
-  }
-
-  public async sendR1(input: ChatMessage[]) {
-    const messages = this.mapMessagesR1(input);
-    const truncated = this.truncateHistory(messages);
-    const res = await this.apiCall(truncated, true);
-    return res;
-  }
-
-  public async stream(
-    sys: string,
-    input: ChatMessage[],
-    handle: (c: any) => void,
-  ) {
-    const messages = this.mapMessages(input);
-    const truncated = this.truncateHistory(messages);
-    await this.apiCallStream(truncated, handle);
-  }
-
-  public async streamR1(input: ChatMessage[], handle: (c: any) => void) {
-    const messages = this.mapMessagesR1(input);
-    const truncated = this.truncateHistory(messages);
-    await this.apiCallStream(truncated, handle, true);
-  }
-
-  public async sendDoc(data: ArrayBuffer, mimeType: string, prompt: string) {
-    const res = await this.model.generateContent([
-      {
-        inlineData: {
-          data: Buffer.from(data).toString("base64"),
-          mimeType,
-        },
-      },
-      prompt,
-    ]);
-    return res;
-  }
-
-  private truncateHistory(messages: Content[]): Content[] {
-    const totalTokens = messages.reduce((total, message) => {
-      return total + this.tokenizer(message.parts[0].text || "");
-    }, 0);
-    while (totalTokens > this.maxTokens && messages.length > 1) {
-      messages.splice(0, 1);
+  async makeVideo({
+    prompt,
+    image,
+  }: {
+    prompt?: string;
+    image?: string;
+  }): AsyncRes<GeneratedVideo[]> {
+    try {
+      const response = await this.api.models.generateVideos({
+        model: this.model,
+        prompt,
+      });
+      // TODO if empty or undefined return error
+      return { ok: response.response?.generatedVideos || [] };
+    } catch (e) {
+      return { error: `${e}` };
     }
-    return messages;
   }
 }
+// TODO how to use caches
+// https://ai.google.dev/api/caching
```
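The rewritten `src/gemini.ts` drops the history-truncation machinery and keys chat sessions by name instead. A sketch of how the new surface is presumably meant to be driven (the session name and prompts are invented; the class reads `GEMINI_API_KEY` from `Bun.env` at construction time):

```ts
import GeminiAPI from "./src/gemini";

// Create a named session, then follow it; "notes" is an arbitrary key
// into the class's internal chats map.
const gemini = new GeminiAPI();
gemini.createChat({ name: "notes" });

const res = await gemini.followChat("notes", "Summarize the Odyssey in one sentence.");
if ("error" in res) console.error(res.error);
else console.log(res.ok);

// The streaming variant pushes chunks to a handler as they arrive.
await gemini.followChatStream("notes", "Now do it in French.", (chunk) =>
  process.stdout.write(chunk),
);
```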
```diff
diff --git a/src/gemini2.ts b/src/gemini2.ts
new file mode 100644
index 0000000..291553f
--- /dev/null
+++ b/src/gemini2.ts
@@ -0,0 +1,149 @@
+import {
+  GenerativeModel,
+  GoogleGenerativeAI,
+  type Content,
+  type GenerateContentResult,
+} from "@google/generative-ai";
+import { RESPONSE_LENGTH } from "./logic/constants";
+import type {
+  AIModelAPI,
+  ChatMessage,
+  OChoice,
+  OChunk,
+  OMessage,
+} from "./types";
+import type { AsyncRes } from "sortug";
+
+export default class GeminiAPI implements AIModelAPI {
+  tokenizer: (text: string) => number;
+  maxTokens: number;
+  private model: GenerativeModel;
+
+  constructor(
+    maxTokens = 200_000,
+    tokenizer: (text: string) => number = (text) => text.length / 3,
+    model?: string,
+  ) {
+    this.maxTokens = maxTokens;
+    this.tokenizer = tokenizer;
+
+    const gem = new GoogleGenerativeAI(Bun.env["GEMINI_API_KEY"]!);
+    this.model = gem.getGenerativeModel({
+      // model: model || "gemini-2.0-flash-exp",
+      model: model || "gemini-2.5-pro-preview-05-06 ",
+      generationConfig: { maxOutputTokens: RESPONSE_LENGTH },
+    });
+  }
+
+  public setModel(model: string) {
+    const gem = new GoogleGenerativeAI(Bun.env["GEMINI_API_KEY"]!);
+    this.model = gem.getGenerativeModel({
+      model,
+      generationConfig: { maxOutputTokens: RESPONSE_LENGTH },
+    });
+  }
+  private mapMessages(input: ChatMessage[]): Content[] {
+    return input.map((m) => ({
+      role: m.author === "gemini" ? "model" : "user",
+      parts: [{ text: m.text }],
+    }));
+  }
+
+  private mapMessagesR1(input: ChatMessage[]): Content[] {
+    return input.reduce((acc: Content[], m, i) => {
+      const prev = acc[i - 1];
+      const role = m.author === "gemini" ? "model" : "user";
+      const msg = { role, parts: [{ text: m.text }] };
+      if (prev?.role === role) acc[i - 1] = msg;
+      else acc = [...acc, msg];
+      return acc;
+    }, []);
+  }
+
+  private async apiCall(
+    messages: Content[],
+    isR1: boolean = false,
+  ): Promise<AsyncRes<string[]>> {
+    try {
+      const chat = this.model.startChat({ history: messages });
+      const res = await chat.sendMessage("");
+      return { ok: [res.response.text()] };
+    } catch (e) {
+      console.log(e, "error in gemini api");
+      return { error: `${e}` };
+    }
+  }
+
+  private async apiCallStream(
+    messages: Content[],
+    handle: (c: any) => void,
+    isR1: boolean = false,
+  ): Promise<void> {
+    try {
+      const chat = this.model.startChat({ history: messages });
+      const res = await chat.sendMessage("");
+      // for await (const chunk of res.stream()) {
+      //   handle(chunk.text());
+      // }
+    } catch (e) {
+      console.log(e, "error in gemini api");
+      handle(`Error streaming Gemini, ${e}`);
+    }
+  }
+
+  public async send(sys: string, input: ChatMessage[]) {
+    console.log({ sys, input });
+    this.model.systemInstruction = { role: "system", parts: [{ text: sys }] };
+    const messages = this.mapMessages(input);
+    const truncated = this.truncateHistory(messages);
+    const res = await this.apiCall(truncated);
+    return res;
+  }
+
+  public async sendR1(input: ChatMessage[]) {
+    const messages = this.mapMessagesR1(input);
+    const truncated = this.truncateHistory(messages);
+    const res = await this.apiCall(truncated, true);
+    return res;
+  }
+
+  public async stream(
+    sys: string,
+    input: ChatMessage[],
+    handle: (c: any) => void,
+  ) {
+    this.model.systemInstruction = { role: "system", parts: [{ text: sys }] };
+    const messages = this.mapMessages(input);
+    const truncated = this.truncateHistory(messages);
+    await this.apiCallStream(truncated, handle);
+  }
+
+  public async streamR1(input: ChatMessage[], handle: (c: any) => void) {
+    const messages = this.mapMessagesR1(input);
+    const truncated = this.truncateHistory(messages);
+    await this.apiCallStream(truncated, handle, true);
+  }
+
+  public async sendDoc(data: ArrayBuffer, mimeType: string, prompt: string) {
+    const res = await this.model.generateContent([
+      {
+        inlineData: {
+          data: Buffer.from(data).toString("base64"),
+          mimeType,
+        },
+      },
+      prompt,
+    ]);
+    return res;
+  }
+
+  private truncateHistory(messages: Content[]): Content[] {
+    const totalTokens = messages.reduce((total, message) => {
+      return total + this.tokenizer(message.parts[0].text || "");
+    }, 0);
+    while (totalTokens > this.maxTokens && messages.length > 1) {
+      messages.splice(0, 1);
+    }
+    return messages;
+  }
+}
```
```diff
diff --git a/src/model.ts b/src/generic.ts
index 39b42dc..50c4435 100644
--- a/src/model.ts
+++ b/src/generic.ts
@@ -1,29 +1,30 @@
 import OpenAI from "openai";
 import { MAX_TOKENS, RESPONSE_LENGTH } from "./logic/constants";
-import type { AResult, ChatMessage, OChoice } from "./types";
+import type { AIModelAPI, ChatMessage, OChoice } from "./types";
+import type { AsyncRes } from "sortug";
 
 type Message = OpenAI.Chat.Completions.ChatCompletionMessageParam;
 type Props = {
   baseURL: string;
   apiKey: string;
-  model: string;
+  model?: string;
   maxTokens?: number;
   tokenizer?: (text: string) => number;
 };
-export default class Conversation {
+export default class OpenAIAPI implements AIModelAPI {
   private apiKey;
   private baseURL;
-  private maxTokens: number = MAX_TOKENS;
-  private tokenizer: (text: string) => number = (text) => text.length / 3;
   private api;
-  private model;
+  maxTokens: number = MAX_TOKENS;
+  tokenizer: (text: string) => number = (text) => text.length / 3;
+  model;
 
   constructor(props: Props) {
     this.apiKey = props.apiKey;
     this.baseURL = props.baseURL;
     this.api = new OpenAI({ baseURL: this.baseURL, apiKey: this.apiKey });
-    this.model = props.model;
+    this.model = props.model || "";
     if (props.maxTokens) this.maxTokens = props.maxTokens;
     if (props.tokenizer) this.tokenizer = props.tokenizer;
   }
@@ -36,7 +37,7 @@ export default class Conversation {
     });
   }
 
-  public async send(sys: string, input: ChatMessage[]): AResult<string[]> {
+  public async send(sys: string, input: ChatMessage[]): AsyncRes<string[]> {
     const messages = this.mapMessages(input);
     const sysMsg: Message = { role: "system", content: sys };
     const allMessages = [sysMsg, ...messages];
@@ -44,12 +45,15 @@
     const truncated = this.truncateHistory(allMessages);
     const res = await this.apiCall(truncated);
     if ("error" in res) return res;
-    else
+    else {
       try {
-        return { ok: res.ok.map((c) => c.message.content!) };
+        // TODO type this properly
+        const choices: OChoice[] = res.ok;
+        return { ok: choices.map((c) => c.message.content!) };
       } catch (e) {
         return { error: `${e}` };
       }
+    }
   }
 
   public async stream(
@@ -77,7 +81,7 @@
   }
 
   // TODO custom temperature?
-  private async apiCall(messages: Message[]): AResult<OChoice[]> {
+  private async apiCall(messages: Message[]): AsyncRes<OChoice[]> {
     console.log({ messages }, "at the very end");
     try {
       const completion = await this.api.chat.completions.create({
```

```diff
diff --git a/src/nlp/index.ts b/src/nlp/index.ts
new file mode 100644
index 0000000..ebed586
--- /dev/null
+++ b/src/nlp/index.ts
@@ -0,0 +1,7 @@
+import * as Spacy from "./spacy";
+import * as Stanza from "./stanza";
+import * as ISO from "./iso";
+import { ocr } from "./ocr";
+import type * as Types from "./types";
+export * from "./nlp";
+export { ISO, ocr, Stanza, Spacy, type Types };
```

```diff
diff --git a/src/nlp/iso.ts b/src/nlp/iso.ts
new file mode 100644
index 0000000..3e60850
--- /dev/null
+++ b/src/nlp/iso.ts
@@ -0,0 +1,10 @@
+import { franc, francAll } from "franc-all";
+import { iso6393To1 } from "iso-639-3";
+export { iso6393, iso6393To1, iso6393To2B, iso6393To2T } from "iso-639-3";
+export * as BCP47 from "bcp-47";
+
+export function detectLang(text: string) {
+  const iso3 = franc(text);
+  const iso1 = iso6393To1[iso3];
+  return iso1 ? iso1 : iso3;
+}
```
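`detectLang` chains franc's ISO 639-3 guess through the 639-3 to 639-1 mapping, falling back to the raw 639-3 code. Roughly (franc is statistical, so the outputs in the comments are approximate, and short inputs degrade to `und`):

```ts
import { detectLang } from "./src/nlp/iso";

// franc is trigram-based: long, distinctive text is reliable,
// strings below its minimum length come back as "und" (undetermined).
console.log(detectLang("The quick brown fox jumps over the lazy dog")); // "en"
console.log(detectLang("El veloz murciélago hindú comía feliz cardillo y kiwi")); // "es"
console.log(detectLang("hi")); // "und"
```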
```diff
diff --git a/src/nlp/nlp.ts b/src/nlp/nlp.ts
new file mode 100644
index 0000000..3b1e3a7
--- /dev/null
+++ b/src/nlp/nlp.ts
@@ -0,0 +1,208 @@
+export const isPunctuation = (text: string): boolean => {
+  // Common punctuation characters
+  const punctuationRegex = /^[.,;:!?()[\]{}'"«»""''…-]+$/;
+  return punctuationRegex.test(text);
+};
+
+// Get color for different syntactic categories
+export function getColorForType(type: string): string {
+  const colors: Record<string, string> = {
+    // Phrasal categories
+    S: "#6495ED", // Sentence - cornflower blue
+    NP: "#FF7F50", // Noun Phrase - coral
+    VP: "#32CD32", // Verb Phrase - lime green
+    PP: "#9370DB", // Prepositional Phrase - medium purple
+    ADJP: "#FFD700", // Adjective Phrase - gold
+    ADVP: "#FF69B4", // Adverb Phrase - hot pink
+
+    // Part-of-speech tags
+    NN: "#FFA07A", // Noun - light salmon
+    NNS: "#FFA07A", // Plural Noun - light salmon
+    NNP: "#FFA07A", // Proper Noun - light salmon
+    VB: "#90EE90", // Verb - light green
+    VBP: "#90EE90", // Present tense verb - light green
+    VBG: "#90EE90", // Gerund verb - light green
+    VBZ: "#90EE90", // 3rd person singular present verb - light green
+    VBD: "#90EE90", // Past tense verb - light green
+    VBN: "#90EE90", // Past participle verb - light green
+    JJ: "#F0E68C", // Adjective - khaki
+    RB: "#DDA0DD", // Adverb - plum
+    IN: "#87CEFA", // Preposition - light sky blue
+    DT: "#D3D3D3", // Determiner - light gray
+    PRP: "#D8BFD8", // Personal pronoun - thistle
+    CC: "#A9A9A9", // Coordinating conjunction - dark gray
+
+    // Default
+    ROOT: "#000000", // Root - black
+    LEAF: "#666666", // Leaf nodes - dark gray
+  };
+
+  return colors[type] || "#666666";
+}
+
+// Get a description for node types
+export function getDescription(type: string): string {
+  const descriptions: Record<string, string> = {
+    S: "Sentence",
+    SBAR: "Subordinating conjunction clause",
+    SBARQ: "Direct question",
+    SINV: "Declarative sentence with subject-aux inversion",
+    SQ: "Subconstituent of SBARQ excluding wh-word",
+    WHADVP: "wh-adverb phrase",
+    WHNP: "wh-nounphrase",
+    WHPP: "wh-prepositional phrase",
+    WDT: "wh-determiner",
+    WP: "wh-pronoun",
+    WRB: "wh-adverb",
+    WP$: "possesive wh-pronoun",
+    MD: "modal",
+    X: "Unknown",
+    NP: "Noun Phrase",
+    VP: "Verb Phrase",
+    PP: "Prepositional Phrase",
+    ADJP: "Adjective Phrase",
+    ADVP: "Adverb Phrase",
+    LS: "List item market",
+    SYM: "Symbol",
+    NN: "Noun",
+    NNS: "Plural Noun",
+    NNP: "Proper Noun",
+    NNPS: "Proper Noun, Plural",
+    VB: "Verb (base form)",
+    VBP: "Verb (present tense)",
+    VBG: "Verb (gerund/present participle)",
+    VBZ: "Verb (3rd person singular present)",
+    VBD: "Verb (past tense)",
+    VBN: "Verb (past participle)",
+    JJ: "Adjective",
+    JJR: "Adjective, comparative",
+    JJS: "Adjective, superlative",
+    EX: "Existential there",
+    RB: "Adverb",
+    RBR: "Adverb, comparative",
+    RBS: "Adverb, superlative",
+    RP: "Particle",
+    IN: "Preposition",
+    TO: "to",
+    DT: "Determiner",
+    PDT: "Predeterminer",
+    PRP: "Personal Pronoun",
+    PP$: "Possesive Pronoun",
+    PRP$: "Possesive Pronoun",
+    POS: "Possesive ending",
+    FW: "Foreign Word",
+    CC: "Coordinating Conjunction",
+    CD: "Cardinal number",
+    UH: "interjection",
+    ROOT: "Root Node",
+    CLR: "figurative motion",
+    FRAG: "fragment",
+    ":": "Colon/Semicolon",
+    ",": "Comma",
+    ".": "Period",
+  };
+
+  return descriptions[type] || type;
+}
+
+// https://universaldependencies.org/u/dep/xcomp.htmlexport
+
+export function unpackDeprel(type: string): string {
+  const descriptions: Record<string, string> = {
+    nsubj: "nominal subject",
+    obj: "object",
+    iobj: "indirect object",
+    csubj: "clausal subject",
+    ccomp: "clausal complement",
+    xcomp: "open clausal complement",
+    obl: "oblique nominal",
+    vocative: "vocative",
+    expl: "expletive",
+    dislocated: "dislocated",
+    nmod: "nominal modifier",
+    appos: "appositional modifier",
+    nummod: "numeric modifier",
+    advcl: "adverbial clause modifier",
+    acl: "admonimal clause",
+    advmod: "adverbial modifier",
+    discourse: "dicourse element",
+    aux: "auxiliary",
+    cop: "copula",
+    mark: "marker",
+    amod: "adjectival modifier",
+    det: "determiner",
+    clf: "classifier",
+    case: "case marker",
+    conj: "conjunction",
+    cc: "coordinating conjunction",
+    fixed: "fixed multiword expression",
+    flat: "flat expression",
+    list: "list",
+    parataxis: "parataxis",
+    compound: "compound",
+    orphan: "orphan",
+    goeswith: "goes with",
+    reparandum: "overriden disfluency",
+    punct: "punctuation",
+    root: "root",
+    dep: "unspecified dependency",
+  };
+  const res = descriptions[type];
+  if (!res) console.log("tag not found!!", type);
+
+  return res || type;
+}
+
+export function deprelColors(type: string): string {
+  const colors: Record<string, string> = {
+    // Phrasal categories
+    s: "#6495ED", // Sentence - cornflower blue
+    nsubj: "#6495ED", // Sentence - cornflower blue
+    root: "#FFD700", // Adjective Phrase - gold
+    p: "#FFD700", // Adjective Phrase - gold
+    NP: "#FF7F50", // Noun Phrase - coral
+    VP: "#32CD32", // Verb Phrase - lime green
+    PP: "#9370DB", // Prepositional Phrase - medium purple
+    ADVP: "#FF69B4", // Adverb Phrase - hot pink
+
+    // Part-of-speech tags
+    NN: "#FFA07A", // Noun - light salmon
+    NNS: "#FFA07A", // Plural Noun - light salmon
+    NNP: "#FFA07A", // Proper Noun - light salmon
+    VB: "#90EE90", // Verb - light green
+    VBP: "#90EE90", // Present tense verb - light green
+    VBG: "#90EE90", // Gerund verb - light green
+    VBZ: "#90EE90", // 3rd person singular present verb - light green
+    VBD: "#90EE90", // Past tense verb - light green
+    VBN: "#90EE90", // Past participle verb - light green
+    JJ: "#F0E68C", // Adjective - khaki
+    RB: "#DDA0DD", // Adverb - plum
+    IN: "#87CEFA", // Preposition - light sky blue
+    DT: "#D3D3D3", // Determiner - light gray
+    PRP: "#D8BFD8", // Personal pronoun - thistle
+    CC: "#A9A9A9", // Coordinating conjunction - dark gray
+
+    // Default
+    ROOT: "#000000", // Root - black
+    LEAF: "#666666", // Leaf nodes - dark gray
+  };
+
+  return colors[type] || "#666666";
+}
+export function unpackPos(pos: string): string {
+  const map: Record<string, string> = {
+    adj: "adjective",
+    adv: "adverb",
+    adv_phrase: "adverbial phrase",
+    combining_form: "combining form",
+    conj: "conjunction",
+    det: "determinant",
+    intj: "interjection",
+    num: "number",
+    prep: "preposition",
+    prep_phrase: "prepositional phrase",
+    pron: "pronoun",
+    punct: "punctuation",
+  };
+  return map[pos] || pos;
+}
```

```diff
diff --git a/src/nlp/ocr.ts b/src/nlp/ocr.ts
new file mode 100644
index 0000000..1c40355
--- /dev/null
+++ b/src/nlp/ocr.ts
@@ -0,0 +1,18 @@
+import type { AsyncRes } from "sortug";
+
+export async function ocr(formData: FormData): AsyncRes<string[]> {
+  const endpoint = "http://localhost:8102/ocr";
+
+  const opts = {
+    method: "POST",
+    body: formData,
+    headers: { "X-API-KEY": Bun.env.SORTUG_NLP_API_KEY! },
+  };
+  try {
+    const res = await fetch(endpoint, opts);
+    const j = await res.json();
+    return { ok: j };
+  } catch (e) {
+    return { error: `${e}` };
+  }
+}
```
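Calling the OCR wrapper from Bun might look like the following; the multipart field name (`file`) and the response being one string per recognized block are assumptions, since the server side isn't in this commit:

```ts
import { ocr } from "./src/nlp/ocr";

// Assumes the local service expects the image under a "file" field and
// that SORTUG_NLP_API_KEY is set; both are guesses about the server.
const form = new FormData();
form.append("file", Bun.file("page-scan.png"));

const res = await ocr(form);
if ("error" in res) console.error(res.error);
else console.log(res.ok.join("\n"));
```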
```diff
diff --git a/src/nlp/spacy.ts b/src/nlp/spacy.ts
new file mode 100644
index 0000000..d79de55
--- /dev/null
+++ b/src/nlp/spacy.ts
@@ -0,0 +1,79 @@
+import type { AsyncRes, Result } from "sortug";
+import { detectLang } from "./iso";
+const ENDPOINT = "http://localhost:8102";
+
+export async function run(text: string, langg?: string): AsyncRes<SpacyRes> {
+  try {
+    const lang = langg ? langg : detectLang(text);
+    const body = JSON.stringify({ string: text, lang });
+    const opts = {
+      headers: {
+        "Content-type": "application/json",
+        "X-API-KEY": Bun.env.SORTUG_NLP_API_KEY!,
+      },
+      method: "POST",
+      body,
+    };
+    const res = await fetch(ENDPOINT + "/spacy", opts);
+    const j = await res.json();
+    console.log("spacy", j);
+    return { ok: j };
+  } catch (e) {
+    return { error: `${e}` };
+  }
+}
+
+export type SpacyResBig = {
+  doc: {
+    text: string;
+    ents: any[];
+    sents: Array<{ start: number; end: number }>;
+    tokens: Token[];
+  };
+  segs: Sentence[];
+};
+export type SpacyRes = {
+  input: string;
+  segments: Sentence[];
+};
+export type Sentence = {
+  text: string;
+  start: number;
+  end: number;
+  root: Token;
+  subj: Token;
+  arcs: Arc[];
+  words: Word[];
+};
+export type Arc = {
+  start: number;
+  end: number;
+  label: string; // deprel label
+  dir: string;
+};
+export type Token = {
+  id: number;
+  head: number;
+  start: number;
+  end: number;
+  dep: string;
+  lemma: string;
+  morph: string;
+  pos: string;
+  tag: string;
+  text: string;
+};
+
+export interface Word extends Token {
+  ancestors: number[];
+  children: [];
+  n_lefts: number;
+  n_rights: number;
+  left_edge: number;
+  right_edge: number;
+  morph_map: Record<string, string>;
+}
+
+export function isChild(w: Word, topId: number): boolean {
+  return w.id === topId || w.ancestors.includes(topId);
+}
```
```diff
diff --git a/src/nlp/stanza.ts b/src/nlp/stanza.ts
new file mode 100644
index 0000000..5836b91
--- /dev/null
+++ b/src/nlp/stanza.ts
@@ -0,0 +1,210 @@
+import type { AsyncRes, Result } from "sortug";
+import { detectLang } from "./iso";
+
+const ENDPOINT = "http://localhost:8102";
+export async function segmenter(
+  text: string,
+  langg?: string,
+): AsyncRes<StanzaRes> {
+  try {
+    const lang = langg ? langg : detectLang(text);
+    const body = JSON.stringify({ lang, string: text });
+    const opts = {
+      headers: {
+        "Content-type": "application/json",
+        "X-API-KEY": Bun.env.SORTUG_NLP_API_KEY!,
+      },
+      method: "POST",
+      body,
+    };
+    const res = await fetch(ENDPOINT + "/stanza", opts);
+    const j = await res.json();
+    return { ok: j };
+  } catch (e) {
+    return { error: `${e}` };
+  }
+}
+export async function idLang(text: string) {
+  try {
+    const body = JSON.stringify({ string: text });
+    const opts = {
+      headers: {
+        "Content-type": "application/json",
+        "X-API-KEY": Bun.env.SORTUG_NLP_API_KEY!,
+      },
+      method: "POST",
+      body,
+    };
+    const res = await fetch(ENDPOINT + "/detect-lang", opts);
+    const j = await res.json();
+    return { ok: j };
+  } catch (e) {
+    return { error: `${e}` };
+  }
+}
+export type StanzaRes = { input: string; segments: Sentence[] };
+export type Sentence = {
+  text: string;
+  sentiment: number;
+  constituency: TreeNode;
+  constring: string;
+  dependencies: Dependency[];
+  entities: Entity[];
+  tokens: Token[];
+  words: Word[];
+};
+export type TreeNode = {
+  label: string;
+  children: TreeNode[];
+};
+export type Dependency = Array<[Word, string, Word]>;
+export type Word = {
+  id: number;
+  text: string;
+  lemma: string;
+  upos: string;
+  xpos: string;
+  feats: string;
+  head: number;
+  deprel: string;
+  start_char: number;
+  end_char: number;
+};
+export type Token = {
+  id: [number, number];
+  text: string;
+  misc: string;
+  words: Word[];
+  start_char: number;
+  end_char: number;
+  ner: string;
+};
+export type Entity = {
+  text: string;
+  misc: string;
+  start_char: number;
+  end_char: number;
+  type: string;
+};
+
+// mine
+export type Clause = {
+  words: Word[];
+  dependency: Dependency;
+  text: string;
+};
+// "amod",
+// {
+//   "id": 1,
+//   "text": "Stony",
+//   "lemma": "Stony",
+//   "upos": "ADJ",
+//   "xpos": "NNP",
+//   "feats": "Degree=Pos",
+//   "head": 3,
+//   "deprel": "amod",
+//   "start_char": 0,
+//   "end_char": 5
+// }
+//
+//
+
+export interface ParsedGrammar {
+  predicateCore: number;
+  subjectCore: number | null;
+  tree: Record<number, number[]>;
+  wordMap: WordMap;
+  words: BigWord[];
+}
+export interface BigWord extends Word {
+  ancestry: number[];
+  component: "s" | "p" | "u";
+}
+export type ComputedDependency = {
+  word: BigWord;
+  children: ComputedDependency[];
+};
+export type WordMap = Record<number, Word>;
+
+export function buildTreeFromWords(words: Word[]): Result<ParsedGrammar> {
+  const roots = words.filter((w) => w.deprel === "root");
+  if (roots.length > 1) {
+    console.log("roots", roots);
+    return { error: "too many roots" };
+  } else if (roots.length === 0) {
+    return { error: "no roots" };
+  } else {
+    const root = roots[0];
+    const wordmap = words.reduce((acc: WordMap, item) => {
+      acc[item.id] = item;
+      return acc;
+    }, {});
+    return { ok: parseFurther(words, wordmap, root) };
+  }
+}
+function parseFurther(
+  words: Word[],
+  wordMap: WordMap,
+  root: Word,
+): ParsedGrammar {
+  const predicateCore = root.id;
+  let subjectCore: number | null = null;
+  const tree: Record<number, number[]> = {};
+  const bigwords: BigWord[] = [];
+  const getAncestry = (parent: Word): number[] => {
+    const kids = tree[parent.head] || [];
+    tree[parent.head] = [...kids, parent.id];
+    if (parent.deprel === "nsubj") subjectCore = parent.id;
+
+    console.log("getting ancestry " + parent.id, parent.text);
+    const grandpa = wordMap[parent.head];
+    if (!grandpa) return [parent.id];
+    else return [parent.id, ...getAncestry(grandpa)];
+  };
+  let idx = 0;
+  for (const w of words) {
+    if (w.deprel === "punct") {
+      const prev = words[idx - 1];
+      if (!prev) continue;
+      prev.text += w.text;
+      continue;
+    }
+    const parent = wordMap[w.head];
+    if (!parent) tree[w.id] = [];
+    const ancestry = !parent ? [] : getAncestry(parent);
+    const component =
+      subjectCore && (w.id === subjectCore || ancestry.includes(subjectCore))
+        ? "s"
+        : w.id === predicateCore || ancestry.includes(root.id)
+          ? "p"
+          : "u";
+    const bw: BigWord = { ...w, component, ancestry };
+    wordMap[w.id] = bw;
+    bigwords.push(bw);
+    idx++;
+  }
+  const pg: ParsedGrammar = {
+    predicateCore,
+    subjectCore,
+    wordMap,
+    tree,
+    words: bigwords,
+  };
+  return pg;
+}
+
+export function oneDescendant(node: TreeNode): boolean {
+  if (node.children.length !== 1) return false;
+  else {
+    const child = node.children[0];
+    return child.children.length === 0;
+  }
+}
+
+// function findChildren(wordmap: WordMap, word: Word): ComputedDependency {
+//   const children = words.filter((w) => w.head === head.id);
+//   return {
+//     word: head,
+//     children: children.map((c) => findChildren(words, c)),
+//   };
+// }
```

```diff
diff --git a/src/nlp/types.ts b/src/nlp/types.ts
new file mode 100644
index 0000000..605a637
--- /dev/null
+++ b/src/nlp/types.ts
@@ -0,0 +1,50 @@
+export type ViewLevel =
+  | "text"
+  | "paragraph"
+  | "sentence"
+  | "clause"
+  | "word"
+  | "syllable"
+  | "phoneme";
+export interface ViewState {
+  level: ViewLevel;
+  pIndex: number | null;
+  sIndex: number | null;
+  cIndex: number | null;
+  wIndex: number | null;
+  yIndex: number | null;
+  fIndex: number | null;
+}
+
+export interface ViewProps {
+  idx: number;
+  rawText: string;
+  context: Context;
+}
+export type Context = {
+  parentText: string;
+  segmented: string[];
+  idx: number;
+};
+
+export type WordData = {
+  confidence: number;
+  frequency: number | null;
+  id: number;
+  ipa: Array<{ ipa: string; tags: string[] }>;
+  spelling: string;
+  type: ExpressionType;
+  syllables: number;
+  lang: string;
+  prosody: any;
+  senses: Sense[];
+};
+export type ExpressionType = "word" | "expression" | "syllable";
+export type Sense = {
+  etymology: string;
+  pos: string;
+  forms: Array<{ form: string; tags: string[] }>;
+  related: any;
+  senses: Array<{ glosses: string[]; links: Array<[string, string]> }>;
+};
+export type LoadingStatus = "pending" | "loading" | "success" | "error";
```
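The pure-logic piece in `stanza.ts` is `buildTreeFromWords`, which walks `head` pointers upward to tag each word as subject (`s`), predicate (`p`), or unassigned (`u`). A hand-built trace (the `Word` records are fabricated stand-ins for real Stanza output):

```ts
import { buildTreeFromWords, type Word } from "./src/nlp/stanza";

// Stand-in parse of "The birds sing": "sing" is the root,
// "birds" its nominal subject, "The" a determiner under "birds".
const words: Word[] = [
  { id: 1, text: "The", lemma: "the", upos: "DET", xpos: "DT", feats: "",
    head: 2, deprel: "det", start_char: 0, end_char: 3 },
  { id: 2, text: "birds", lemma: "bird", upos: "NOUN", xpos: "NNS", feats: "",
    head: 3, deprel: "nsubj", start_char: 4, end_char: 9 },
  { id: 3, text: "sing", lemma: "sing", upos: "VERB", xpos: "VBP", feats: "",
    head: 0, deprel: "root", start_char: 10, end_char: 14 },
];

const parsed = buildTreeFromWords(words);
if ("error" in parsed) {
  console.error(parsed.error); // "no roots" or "too many roots"
} else {
  const g = parsed.ok;
  console.log(g.predicateCore); // 3 — the root word's id
  console.log(g.subjectCore); // 2 — discovered while walking ancestors
  console.log(g.words.map((w) => `${w.text}:${w.component}`));
  // [ "The:s", "birds:s", "sing:p" ]
}
```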
```diff
diff --git a/src/openai.ts b/src/openai.ts
index 2e15dcf..12939bc 100644
--- a/src/openai.ts
+++ b/src/openai.ts
@@ -1,14 +1,8 @@
 import fs from "fs";
 import OpenAI from "openai";
 import { RESPONSE_LENGTH } from "./logic/constants";
-import type {
-  AResult,
-  ChatMessage,
-  OChoice,
-  OChunk,
-  OMessage,
-  Result,
-} from "./types";
+import type { ChatMessage, OChoice, OChunk, OMessage } from "./types";
+import type { AsyncRes, Result } from "sortug";
 import OpenAIToolUse from "./openai_tools";
 import type { FileObject } from "openai/src/resources/files.js";
 
@@ -26,7 +20,7 @@ export default class Conversation {
   private baseURL: string = "https://api.openai.com/v1";
   private tokenizer: (text: string) => number = (text) => text.length / 3;
   openai;
-  private model: string = "chatgpt-4o-latest";
+  private model: string = "gpt-4.1";
 
   constructor(props: Props) {
     if (props.apiKey) this.apiKey = props.apiKey;
@@ -56,7 +50,7 @@ export default class Conversation {
     }, []);
   }
 
-  public async send(sys: string, input: ChatMessage[]): AResult<OChoice[]> {
+  public async send(sys: string, input: ChatMessage[]): AsyncRes<OChoice[]> {
     const messages = this.mapMessages(input);
     const sysMsg: Message = { role: "system", content: sys };
     const allMessages = [sysMsg, ...messages];
@@ -65,7 +59,7 @@
     return res;
   }
 
-  public async sendR1(input: ChatMessage[]): AResult<OChoice[]> {
+  public async sendR1(input: ChatMessage[]): AsyncRes<OChoice[]> {
     const messages = this.mapMessagesR1(input);
     const truncated = this.truncateHistory(messages);
     const res = await this.apiCall(truncated);
@@ -102,7 +96,7 @@
     return messages;
   }
 
-  private async apiCall(messages: Message[]): AResult<OChoice[]> {
+  private async apiCall(messages: Message[]): AsyncRes<OChoice[]> {
     try {
       const completion = await this.openai.chat.completions.create({
         temperature: 1.3,
```

```diff
diff --git a/src/types/index.ts b/src/types/index.ts
index 97be443..b276457 100644
--- a/src/types/index.ts
+++ b/src/types/index.ts
@@ -1,15 +1,29 @@
 import type OpenAI from "openai";
+import type { AsyncRes } from "sortug";
 export type ChatMessage = {
   author: string;
   text: string;
   sent: number;
   reasoning?: string;
 };
-export type Result<T> = { ok: T } | { error: string };
-export type AResult<T> = Promise<{ ok: T } | { error: string }>;
 
 // openai
 export type OChoice = OpenAI.Chat.Completions.ChatCompletion.Choice;
 export type OChunk = OpenAI.Chat.Completions.ChatCompletionChunk.Choice;
 export type OMessage = OpenAI.Chat.Completions.ChatCompletionMessageParam;
 export type ContentType = { text: string } | { audio: Response };
+export type AIModelChoice =
+  | { name: "deepseek" | "chatgpt" | "claude" | "gemini" | "grok" }
+  | { other: { baseURL: string; apiKey: string } };
+export interface AIModelAPI {
+  setModel: (model: string) => void;
+  tokenizer: (text: string) => number;
+  maxTokens: number;
+
+  send: (systemPrompt: string, input: ChatMessage[]) => AsyncRes<string[]>;
+  stream: (
+    systemPrompt: string,
+    input: ChatMessage[],
+    handler: (data: any) => void,
+  ) => void;
+}
```
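The new `AIModelAPI` interface is what ties the commit together: `ClaudeAPI`, the new `GeminiAPI`, and the generic `OpenAIAPI` can now sit behind one type. A sketch of the intended polymorphism (the DeepSeek base URL is illustrative, not configured anywhere in this commit):

```ts
import type { AIModelAPI, ChatMessage } from "./src/types";
import ClaudeAPI from "./src/claude";
import OpenAIAPI from "./src/generic";

// Pick a backend at runtime; everything after the branch only sees
// the AIModelAPI interface.
function pickModel(choice: "claude" | "other"): AIModelAPI {
  if (choice === "claude") return new ClaudeAPI();
  return new OpenAIAPI({
    baseURL: "https://api.deepseek.com/v1", // hypothetical example
    apiKey: Bun.env.DEEPSEEK_API_KEY!,
  });
}

const api = pickModel("claude");
const history: ChatMessage[] = [
  { author: "user", text: "Name three rivers.", sent: Date.now() },
];
const res = await api.send("You are terse.", history);
if ("error" in res) console.error(res.error);
else console.log(res.ok.join("\n"));
```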
