diff options
Diffstat (limited to 'src/nlp')
| -rw-r--r-- | src/nlp/index.ts | 7 | ||||
| -rw-r--r-- | src/nlp/iso.ts | 10 | ||||
| -rw-r--r-- | src/nlp/nlp.ts | 208 | ||||
| -rw-r--r-- | src/nlp/ocr.ts | 18 | ||||
| -rw-r--r-- | src/nlp/spacy.ts | 79 | ||||
| -rw-r--r-- | src/nlp/stanza.ts | 210 | ||||
| -rw-r--r-- | src/nlp/types.ts | 50 |
7 files changed, 582 insertions, 0 deletions
diff --git a/src/nlp/index.ts b/src/nlp/index.ts
new file mode 100644
index 0000000..ebed586
--- /dev/null
+++ b/src/nlp/index.ts
@@ -0,0 +1,7 @@
+import * as Spacy from "./spacy";
+import * as Stanza from "./stanza";
+import * as ISO from "./iso";
+import { ocr } from "./ocr";
+import type * as Types from "./types";
+export * from "./nlp";
+export { ISO, ocr, Stanza, Spacy, type Types };
diff --git a/src/nlp/iso.ts b/src/nlp/iso.ts
new file mode 100644
index 0000000..3e60850
--- /dev/null
+++ b/src/nlp/iso.ts
@@ -0,0 +1,17 @@
+import { franc, francAll } from "franc-all";
+import { iso6393To1 } from "iso-639-3";
+export { iso6393, iso6393To1, iso6393To2B, iso6393To2T } from "iso-639-3";
+export * as BCP47 from "bcp-47";
+
+/**
+ * Guess the language of `text`.
+ *
+ * @param text - Text to classify.
+ * @returns The ISO 639-1 code when `iso6393To1` maps the code reported by
+ *   `franc`; otherwise the code `franc` reported, unchanged.
+ */
+export function detectLang(text: string): string {
+  const iso3 = franc(text);
+  const iso1 = iso6393To1[iso3];
+  return iso1 ? iso1 : iso3;
+}
diff --git a/src/nlp/nlp.ts b/src/nlp/nlp.ts
new file mode 100644
index 0000000..3b1e3a7
--- /dev/null
+++ b/src/nlp/nlp.ts
@@ -0,0 +1,286 @@
+/** Matches strings made up solely of common punctuation characters. */
+const PUNCTUATION_REGEX =
+  /^[.,;:!?()[\]{}'"«»\u201C\u201D\u2018\u2019…-]+$/;
+
+/**
+ * Report whether `text` consists only of common punctuation characters
+ * (straight and typographic quotes, brackets, ellipsis, hyphen, ...).
+ *
+ * @param text - Candidate token.
+ * @returns `true` if every character is punctuation; `false` otherwise,
+ *   including for the empty string.
+ */
+export const isPunctuation = (text: string): boolean =>
+  PUNCTUATION_REGEX.test(text);
+
+/**
+ * Read `key` from `table`, ignoring properties inherited from
+ * `Object.prototype`, so a tag such as "constructor" is not a false hit.
+ */
+function lookup(
+  table: Record<string, string>,
+  key: string,
+): string | undefined {
+  return Object.prototype.hasOwnProperty.call(table, key)
+    ? table[key]
+    : undefined;
+}
+
+/** Fallback colour for tags missing from a colour table. */
+const DEFAULT_COLOR = "#666666";
+
+/** Display colours keyed by constituency label or part-of-speech tag. */
+const TYPE_COLORS: Record<string, string> = {
+  // Phrasal categories
+  S: "#6495ED", // Sentence - cornflower blue
+  NP: "#FF7F50", // Noun Phrase - coral
+  VP: "#32CD32", // Verb Phrase - lime green
+  PP: "#9370DB", // Prepositional Phrase - medium purple
+  ADJP: "#FFD700", // Adjective Phrase - gold
+  ADVP: "#FF69B4", // Adverb Phrase - hot pink
+
+  // Part-of-speech tags
+  NN: "#FFA07A", // Noun - light salmon
+  NNS: "#FFA07A", // Plural Noun - light salmon
+  NNP: "#FFA07A", // Proper Noun - light salmon
+  VB: "#90EE90", // Verb - light green
+  VBP: "#90EE90", // Present tense verb - light green
+  VBG: "#90EE90", // Gerund verb - light green
+  VBZ: "#90EE90", // 3rd person singular present verb - light green
+  VBD: "#90EE90", // Past tense verb - light green
+  VBN: "#90EE90", // Past participle verb - light green
+  JJ: "#F0E68C", // Adjective - khaki
+  RB: "#DDA0DD", // Adverb - plum
+  IN: "#87CEFA", // Preposition - light sky blue
+  DT: "#D3D3D3", // Determiner - light gray
+  PRP: "#D8BFD8", // Personal pronoun - thistle
+  CC: "#A9A9A9", // Coordinating conjunction - dark gray
+
+  // Default
+  ROOT: "#000000", // Root - black
+  LEAF: "#666666", // Leaf nodes - dark gray
+};
+
+/**
+ * Get the display colour for a syntactic category.
+ *
+ * @param type - Constituency label or part-of-speech tag, e.g. "NP".
+ * @returns A hex colour string; "#666666" when `type` is not in the table.
+ */
+export function getColorForType(type: string): string {
+  return lookup(TYPE_COLORS, type) ?? DEFAULT_COLOR;
+}
+
+/** Human-readable names for constituency labels and part-of-speech tags. */
+const TYPE_DESCRIPTIONS: Record<string, string> = {
+  S: "Sentence",
+  SBAR: "Subordinating conjunction clause",
+  SBARQ: "Direct question",
+  SINV: "Declarative sentence with subject-aux inversion",
+  SQ: "Subconstituent of SBARQ excluding wh-word",
+  WHADVP: "wh-adverb phrase",
+  WHNP: "wh-noun phrase",
+  WHPP: "wh-prepositional phrase",
+  WDT: "wh-determiner",
+  WP: "wh-pronoun",
+  WRB: "wh-adverb",
+  WP$: "possessive wh-pronoun",
+  MD: "modal",
+  X: "Unknown",
+  NP: "Noun Phrase",
+  VP: "Verb Phrase",
+  PP: "Prepositional Phrase",
+  ADJP: "Adjective Phrase",
+  ADVP: "Adverb Phrase",
+  LS: "List item marker",
+  SYM: "Symbol",
+  NN: "Noun",
+  NNS: "Plural Noun",
+  NNP: "Proper Noun",
+  NNPS: "Proper Noun, Plural",
+  VB: "Verb (base form)",
+  VBP: "Verb (present tense)",
+  VBG: "Verb (gerund/present participle)",
+  VBZ: "Verb (3rd person singular present)",
+  VBD: "Verb (past tense)",
+  VBN: "Verb (past participle)",
+  JJ: "Adjective",
+  JJR: "Adjective, comparative",
+  JJS: "Adjective, superlative",
+  EX: "Existential there",
+  RB: "Adverb",
+  RBR: "Adverb, comparative",
+  RBS: "Adverb, superlative",
+  RP: "Particle",
+  IN: "Preposition",
+  TO: "to",
+  DT: "Determiner",
+  PDT: "Predeterminer",
+  PRP: "Personal Pronoun",
+  PP$: "Possessive Pronoun",
+  PRP$: "Possessive Pronoun",
+  POS: "Possessive ending",
+  FW: "Foreign Word",
+  CC: "Coordinating Conjunction",
+  CD: "Cardinal number",
+  UH: "interjection",
+  ROOT: "Root Node",
+  CLR: "figurative motion",
+  FRAG: "fragment",
+  ":": "Colon/Semicolon",
+  ",": "Comma",
+  ".": "Period",
+};
+
+/**
+ * Describe a constituency label or part-of-speech tag.
+ *
+ * @param type - Tag to describe, e.g. "NP" or "VBZ".
+ * @returns The description, or `type` itself when the tag is unknown.
+ */
+export function getDescription(type: string): string {
+  return lookup(TYPE_DESCRIPTIONS, type) ?? type;
+}
+
+/**
+ * Names of the Universal Dependencies relations.
+ * See https://universaldependencies.org/u/dep/
+ */
+const DEPREL_DESCRIPTIONS: Record<string, string> = {
+  nsubj: "nominal subject",
+  obj: "object",
+  iobj: "indirect object",
+  csubj: "clausal subject",
+  ccomp: "clausal complement",
+  xcomp: "open clausal complement",
+  obl: "oblique nominal",
+  vocative: "vocative",
+  expl: "expletive",
+  dislocated: "dislocated",
+  nmod: "nominal modifier",
+  appos: "appositional modifier",
+  nummod: "numeric modifier",
+  advcl: "adverbial clause modifier",
+  acl: "adnominal clause",
+  advmod: "adverbial modifier",
+  discourse: "discourse element",
+  aux: "auxiliary",
+  cop: "copula",
+  mark: "marker",
+  amod: "adjectival modifier",
+  det: "determiner",
+  clf: "classifier",
+  case: "case marker",
+  conj: "conjunction",
+  cc: "coordinating conjunction",
+  fixed: "fixed multiword expression",
+  flat: "flat expression",
+  list: "list",
+  parataxis: "parataxis",
+  compound: "compound",
+  orphan: "orphan",
+  goeswith: "goes with",
+  reparandum: "overridden disfluency",
+  punct: "punctuation",
+  root: "root",
+  dep: "unspecified dependency",
+};
+
+/**
+ * Describe a dependency relation label.
+ *
+ * A subtyped label ("nsubj:pass") that has no entry of its own is described
+ * through its universal part, with the subtype appended in parentheses
+ * ("nominal subject (pass)").
+ *
+ * @param type - Relation label, e.g. "nsubj" or "obl:tmod".
+ * @returns The description, or `type` itself when the relation is unknown;
+ *   unknown relations are also reported with `console.log`.
+ */
+export function unpackDeprel(type: string): string {
+  const exact = lookup(DEPREL_DESCRIPTIONS, type);
+  if (exact) return exact;
+
+  // Language-specific subtypes are written "universal:subtype".
+  const colon = type.indexOf(":");
+  if (colon > 0) {
+    const base = lookup(DEPREL_DESCRIPTIONS, type.slice(0, colon));
+    if (base) return `${base} (${type.slice(colon + 1)})`;
+  }
+
+  console.log("tag not found!!", type);
+  return type;
+}
+
+/** Display colours used by `deprelColors`. */
+const DEPREL_COLORS: Record<string, string> = {
+  // Dependency relations and single-letter markers
+  // NOTE(review): "s" / "p" are presumably subject / predicate markers - confirm.
+  s: "#6495ED", // cornflower blue
+  nsubj: "#6495ED", // cornflower blue
+  root: "#FFD700", // gold
+  p: "#FFD700", // gold
+
+  // Phrasal categories
+  NP: "#FF7F50", // Noun Phrase - coral
+  VP: "#32CD32", // Verb Phrase - lime green
+  PP: "#9370DB", // Prepositional Phrase - medium purple
+  ADVP: "#FF69B4", // Adverb Phrase - hot pink
+
+  // Part-of-speech tags
+  NN: "#FFA07A", // Noun - light salmon
+  NNS: "#FFA07A", // Plural Noun - light salmon
+  NNP: "#FFA07A", // Proper Noun - light salmon
+  VB: "#90EE90", // Verb - light green
+  VBP: "#90EE90", // Present tense verb - light green
+  VBG: "#90EE90", // Gerund verb - light green
+  VBZ: "#90EE90", // 3rd person singular present verb - light green
+  VBD: "#90EE90", // Past tense verb - light green
+  VBN: "#90EE90", // Past participle verb - light green
+  JJ: "#F0E68C", // Adjective - khaki
+  RB: "#DDA0DD", // Adverb - plum
+  IN: "#87CEFA", // Preposition - light sky blue
+  DT: "#D3D3D3", // Determiner - light gray
+  PRP: "#D8BFD8", // Personal pronoun - thistle
+  CC: "#A9A9A9", // Coordinating conjunction - dark gray
+
+  // Default
+  ROOT: "#000000", // Root - black
+  LEAF: "#666666", // Leaf nodes - dark gray
+};
+
+/**
+ * Get the display colour for a dependency relation or tag.
+ *
+ * @param type - Key to look up, e.g. "nsubj" or "root".
+ * @returns A hex colour string; "#666666" when `type` is not in the table.
+ */
+export function deprelColors(type: string): string {
+  return lookup(DEPREL_COLORS, type) ?? DEFAULT_COLOR;
+}
+
+/** Expansions of abbreviated part-of-speech names. */
+const POS_NAMES: Record<string, string> = {
+  adj: "adjective",
+  adv: "adverb",
+  adv_phrase: "adverbial phrase",
+  combining_form: "combining form",
+  conj: "conjunction",
+  det: "determiner",
+  intj: "interjection",
+  num: "number",
+  prep: "preposition",
+  prep_phrase: "prepositional phrase",
+  pron: "pronoun",
+  punct: "punctuation",
+};
+
+/**
+ * Expand an abbreviated part-of-speech name.
+ *
+ * @param pos - Abbreviation, e.g. "adj" or "prep_phrase".
+ * @returns The expanded name, or `pos` itself when it is not in the table.
+ */
+export function unpackPos(pos: string): string {
+  return lookup(POS_NAMES, pos) ?? pos;
+}
diff --git a/src/nlp/ocr.ts b/src/nlp/ocr.ts
new file mode 100644
index 0000000..1c40355
--- /dev/null
+++ b/src/nlp/ocr.ts
@@ -0,0 +1,27 @@
+import type { AsyncRes } from "sortug";
+
+/**
+ * Send `formData` to the local OCR endpoint and parse its JSON reply.
+ *
+ * @param formData - Request body, forwarded as-is.
+ * @returns `{ ok }` holding the parsed (unvalidated) JSON body, or `{ error }`
+ *   when the request throws or the endpoint answers with a non-2xx status.
+ */
+export async function ocr(formData: FormData): AsyncRes<string[]> {
+  const endpoint = "http://localhost:8102/ocr";
+
+  const opts = {
+    method: "POST",
+    body: formData,
+    headers: { "X-API-KEY": Bun.env.SORTUG_NLP_API_KEY! },
+  };
+  try {
+    const res = await fetch(endpoint, opts);
+    // A non-2xx reply (rejected API key, server fault) is not OCR output.
+    if (!res.ok) return { error: `ocr request failed: HTTP ${res.status}` };
+    const j = await res.json();
+    return { ok: j };
+  } catch (e) {
+    return { error: `${e}` };
+  }
+}
diff --git a/src/nlp/spacy.ts b/src/nlp/spacy.ts
new file mode 100644
index 0000000..d79de55
--- /dev/null
+++ b/src/nlp/spacy.ts
@@ -0,0 +1,98 @@
+import type { AsyncRes, Result } from "sortug";
+import { detectLang } from "./iso";
+const ENDPOINT = "http://localhost:8102";
+
+/**
+ * Analyse `text` with the spaCy endpoint of the NLP service.
+ *
+ * @param text - Text to analyse.
+ * @param langg - Language code to send; detected with `detectLang` when omitted
+ *   or empty.
+ * @returns `{ ok }` holding the parsed (unvalidated) JSON body, or `{ error }`
+ *   when the request throws or the service answers with a non-2xx status.
+ */
+export async function run(text: string, langg?: string): AsyncRes<SpacyRes> {
+  try {
+    const lang = langg ? langg : detectLang(text);
+    const body = JSON.stringify({ string: text, lang });
+    const opts = {
+      headers: {
+        "Content-type": "application/json",
+        "X-API-KEY": Bun.env.SORTUG_NLP_API_KEY!,
+      },
+      method: "POST",
+      body,
+    };
+    const res = await fetch(ENDPOINT + "/spacy", opts);
+    // A non-2xx reply (rejected API key, server fault) is not a parse result.
+    if (!res.ok) return { error: `spacy request failed: HTTP ${res.status}` };
+    const j = await res.json();
+    return { ok: j };
+  } catch (e) {
+    return { error: `${e}` };
+  }
+}
+
+export type SpacyResBig = {
+  doc: {
+    text: string;
+    ents: any[];
+    sents: Array<{ start: number; end: number }>;
+    tokens: Token[];
+  };
+  segs: Sentence[];
+};
+/** Response body expected from the "/spacy" endpoint. */
+export type SpacyRes = {
+  input: string;
+  segments: Sentence[];
+};
+export type Sentence = {
+  text: string;
+  start: number;
+  end: number;
+  root: Token;
+  subj: Token;
+  arcs: Arc[];
+  words: Word[];
+};
+export type Arc = {
+  start: number;
+  end: number;
+  label: string; // deprel label
+  dir: string;
+};
+export type Token = {
+  id: number;
+  head: number;
+  start: number;
+  end: number;
+  dep: string;
+  lemma: string;
+  morph: string;
+  pos: string;
+  tag: string;
+  text: string;
+};
+
+export interface Word extends Token {
+  ancestors: number[];
+  // NOTE(review): `[]` is the empty-tuple type; presumably `number[]` was meant - confirm.
+  children: [];
+  n_lefts: number;
+  n_rights: number;
+  left_edge: number;
+  right_edge: number;
+  morph_map: Record<string, string>;
+}
+
+/**
+ * Report whether `w` is the word `topId` or one of its descendants.
+ *
+ * @param w - Word to test.
+ * @param topId - Id of the candidate ancestor.
+ * @returns `true` when `w.id` equals `topId` or `w.ancestors` contains it.
+ */
+export function isChild(w: Word, topId: number): boolean {
+  return w.id === topId || w.ancestors.includes(topId);
+}
diff --git a/src/nlp/stanza.ts b/src/nlp/stanza.ts
new file mode 100644
index 0000000..5836b91
--- /dev/null
+++ b/src/nlp/stanza.ts
@@ -0,0 +1,269 @@
+import type { AsyncRes, Result } from "sortug";
+import { detectLang } from "./iso";
+
+const ENDPOINT = "http://localhost:8102";
+
+/**
+ * Analyse `text` with the Stanza endpoint of the NLP service.
+ *
+ * @param text - Text to analyse.
+ * @param langg - Language code to send; detected with `detectLang` when omitted
+ *   or empty.
+ * @returns `{ ok }` holding the parsed (unvalidated) JSON body, or `{ error }`
+ *   when the request throws or the service answers with a non-2xx status.
+ */
+export async function segmenter(
+  text: string,
+  langg?: string,
+): AsyncRes<StanzaRes> {
+  try {
+    const lang = langg ? langg : detectLang(text);
+    const body = JSON.stringify({ lang, string: text });
+    const opts = {
+      headers: {
+        "Content-type": "application/json",
+        "X-API-KEY": Bun.env.SORTUG_NLP_API_KEY!,
+      },
+      method: "POST",
+      body,
+    };
+    const res = await fetch(ENDPOINT + "/stanza", opts);
+    // A non-2xx reply (rejected API key, server fault) is not a parse result.
+    if (!res.ok) return { error: `stanza request failed: HTTP ${res.status}` };
+    const j = await res.json();
+    return { ok: j };
+  } catch (e) {
+    return { error: `${e}` };
+  }
+}
+
+/**
+ * Ask the NLP service to identify the language of `text`.
+ *
+ * @param text - Text to classify.
+ * @returns `{ ok }` holding the parsed (unvalidated) JSON body, or `{ error }`
+ *   when the request throws or the service answers with a non-2xx status.
+ */
+export async function idLang(text: string) {
+  try {
+    const body = JSON.stringify({ string: text });
+    const opts = {
+      headers: {
+        "Content-type": "application/json",
+        "X-API-KEY": Bun.env.SORTUG_NLP_API_KEY!,
+      },
+      method: "POST",
+      body,
+    };
+    const res = await fetch(ENDPOINT + "/detect-lang", opts);
+    if (!res.ok) {
+      return { error: `detect-lang request failed: HTTP ${res.status}` };
+    }
+    const j = await res.json();
+    return { ok: j };
+  } catch (e) {
+    return { error: `${e}` };
+  }
+}
+
+/** Response body expected from the "/stanza" endpoint. */
+export type StanzaRes = { input: string; segments: Sentence[] };
+export type Sentence = {
+  text: string;
+  sentiment: number;
+  constituency: TreeNode;
+  constring: string;
+  dependencies: Dependency[];
+  entities: Entity[];
+  tokens: Token[];
+  words: Word[];
+};
+export type TreeNode = {
+  label: string;
+  children: TreeNode[];
+};
+// NOTE(review): `Sentence.dependencies` is `Dependency[]`, i.e. a list of lists
+// of triples; presumably one level of nesting is unintended - confirm.
+export type Dependency = Array<[Word, string, Word]>;
+export type Word = {
+  id: number;
+  text: string;
+  lemma: string;
+  upos: string;
+  xpos: string;
+  feats: string;
+  head: number;
+  deprel: string;
+  start_char: number;
+  end_char: number;
+};
+export type Token = {
+  id: [number, number];
+  text: string;
+  misc: string;
+  words: Word[];
+  start_char: number;
+  end_char: number;
+  ner: string;
+};
+export type Entity = {
+  text: string;
+  misc: string;
+  start_char: number;
+  end_char: number;
+  type: string;
+};
+
+// mine
+export type Clause = {
+  words: Word[];
+  dependency: Dependency;
+  text: string;
+};
+// "amod",
+// {
+//   "id": 1,
+//   "text": "Stony",
+//   "lemma": "Stony",
+//   "upos": "ADJ",
+//   "xpos": "NNP",
+//   "feats": "Degree=Pos",
+//   "head": 3,
+//   "deprel": "amod",
+//   "start_char": 0,
+//   "end_char": 5
+// }
+//
+//
+
+/** Result of {@link buildTreeFromWords}. */
+export interface ParsedGrammar {
+  /** Id of the root word. */
+  predicateCore: number;
+  /** Id of the chosen `nsubj` word, or `null` when there is none. */
+  subjectCore: number | null;
+  /** Child ids keyed by head id; the root is listed under its own `head`. */
+  tree: Record<number, number[]>;
+  wordMap: WordMap;
+  /** Non-punctuation words in input order. */
+  words: BigWord[];
+}
+export interface BigWord extends Word {
+  /** Ids of the heads above this word, nearest first. */
+  ancestry: number[];
+  /** "s": subject subtree, "p": rest of the root's subtree, "u": neither. */
+  component: "s" | "p" | "u";
+}
+export type ComputedDependency = {
+  word: BigWord;
+  children: ComputedDependency[];
+};
+export type WordMap = Record<number, Word>;
+
+/**
+ * Build the dependency tree of one sentence.
+ *
+ * @param words - Words of a single sentence; exactly one must have
+ *   `deprel === "root"`. The array and its elements are not modified.
+ * @returns `{ ok }` with the parsed grammar, or `{ error }` ("no roots" /
+ *   "too many roots") when the root is missing or ambiguous.
+ */
+export function buildTreeFromWords(words: Word[]): Result<ParsedGrammar> {
+  const roots = words.filter((w) => w.deprel === "root");
+  if (roots.length > 1) {
+    console.log("roots", roots);
+    return { error: "too many roots" };
+  }
+  const root = roots[0];
+  if (!root) return { error: "no roots" };
+
+  const wordMap: WordMap = {};
+  for (const w of words) wordMap[w.id] = w;
+  return { ok: parseFurther(words, wordMap, root) };
+}
+
+/**
+ * Classify every non-punctuation word and record the head -> children tree.
+ *
+ * @param words - Words of the sentence, in order.
+ * @param wordMap - Words keyed by id; entries of emitted words are replaced by
+ *   their `BigWord` copies.
+ * @param root - The word whose `deprel` is "root".
+ */
+function parseFurther(
+  words: Word[],
+  wordMap: WordMap,
+  root: Word,
+): ParsedGrammar {
+  const predicateCore = root.id;
+  // Resolve the subject before the loop so that classification does not depend
+  // on word order; prefer the `nsubj` attached directly to the root.
+  const subject =
+    words.find((w) => w.deprel === "nsubj" && w.head === root.id) ??
+    words.find((w) => w.deprel === "nsubj");
+  const subjectCore = subject ? subject.id : null;
+
+  const tree: Record<number, number[]> = {};
+  const bigwords: BigWord[] = [];
+
+  // Ids of the heads above `start`, nearest first. `seen` keeps a malformed
+  // head cycle from looping forever.
+  const ancestryOf = (start: Word): number[] => {
+    const chain: number[] = [];
+    const seen = new Set<number>([start.id]);
+    let cur = wordMap[start.head];
+    while (cur && !seen.has(cur.id)) {
+      chain.push(cur.id);
+      seen.add(cur.id);
+      cur = wordMap[cur.head];
+    }
+    return chain;
+  };
+
+  for (const w of words) {
+    if (w.deprel === "punct") {
+      // Glue punctuation onto the last emitted word. That word is a copy, so
+      // the caller's input stays intact; leading punctuation is dropped.
+      const prev = bigwords[bigwords.length - 1];
+      if (prev) prev.text += w.text;
+      continue;
+    }
+    // Register `w` exactly once under its head, and give it its own entry so
+    // that leaves are present in the tree as well.
+    if (!tree[w.id]) tree[w.id] = [];
+    const siblings = tree[w.head] ?? [];
+    siblings.push(w.id);
+    tree[w.head] = siblings;
+
+    const ancestry = ancestryOf(w);
+    const inSubject =
+      subjectCore !== null &&
+      (w.id === subjectCore || ancestry.includes(subjectCore));
+    const inPredicate =
+      w.id === predicateCore || ancestry.includes(predicateCore);
+    const component = inSubject ? "s" : inPredicate ? "p" : "u";
+    const bw: BigWord = { ...w, component, ancestry };
+    wordMap[w.id] = bw;
+    bigwords.push(bw);
+  }
+  return { predicateCore, subjectCore, wordMap, tree, words: bigwords };
+}
+
+/**
+ * Report whether `node` has exactly one child and that child is a leaf.
+ *
+ * @param node - Constituency tree node to inspect.
+ */
+export function oneDescendant(node: TreeNode): boolean {
+  if (node.children.length !== 1) return false;
+  const [child] = node.children;
+  return child !== undefined && child.children.length === 0;
+}
+
+// function findChildren(wordmap: WordMap, word: Word): ComputedDependency {
+//   const children = words.filter((w) => w.head === head.id);
+//   return {
+//     word: head,
+//     children: children.map((c) => findChildren(words, c)),
+//   };
+// }
diff --git a/src/nlp/types.ts b/src/nlp/types.ts
new file mode 100644
index 0000000..605a637
--- /dev/null
+++ b/src/nlp/types.ts
@@ -0,0 +1,50 @@
+export type ViewLevel =
+  | "text"
+  | "paragraph"
+  | "sentence"
+  | "clause"
+  | "word"
+  | "syllable"
+  | "phoneme";
+export interface ViewState {
+  level: ViewLevel;
+  pIndex: number | null;
+  sIndex: number | null;
+  cIndex: number | null;
+  wIndex: number | null;
+  yIndex: number | null;
+  fIndex: number | null;
+}
+
+export interface ViewProps {
+  idx: number;
+  rawText: string;
+  context: Context;
+}
+export type Context = {
+  parentText: string;
+  segmented: string[];
+  idx: number;
+};
+
+export type WordData = {
+  confidence: number;
+  frequency: number | null;
+  id: number;
+  ipa: Array<{ ipa: string; tags: string[] }>;
+  spelling: string;
+  type: ExpressionType;
+  syllables: number;
+  lang: string;
+  prosody: any;
+  senses: Sense[];
+};
+export type ExpressionType = "word" | "expression" | "syllable";
+export type Sense = {
+  etymology: string;
+  pos: string;
+  forms: Array<{ form: string; tags: string[] }>;
+  related: any;
+  senses: Array<{ glosses: string[]; links: Array<[string, string]> }>;
+};
+export type LoadingStatus = "pending" | "loading" | "success" | "error";
