summaryrefslogtreecommitdiff
path: root/src/nlp
diff options
context:
space:
mode:
Diffstat (limited to 'src/nlp')
-rw-r--r--src/nlp/index.ts7
-rw-r--r--src/nlp/iso.ts10
-rw-r--r--src/nlp/nlp.ts208
-rw-r--r--src/nlp/ocr.ts18
-rw-r--r--src/nlp/spacy.ts79
-rw-r--r--src/nlp/stanza.ts210
-rw-r--r--src/nlp/types.ts50
7 files changed, 582 insertions, 0 deletions
diff --git a/src/nlp/index.ts b/src/nlp/index.ts
new file mode 100644
index 0000000..ebed586
--- /dev/null
+++ b/src/nlp/index.ts
@@ -0,0 +1,7 @@
+import * as Spacy from "./spacy";
+import * as Stanza from "./stanza";
+import * as ISO from "./iso";
+import { ocr } from "./ocr";
+import type * as Types from "./types";
+export * from "./nlp";
+export { ISO, ocr, Stanza, Spacy, type Types };
diff --git a/src/nlp/iso.ts b/src/nlp/iso.ts
new file mode 100644
index 0000000..3e60850
--- /dev/null
+++ b/src/nlp/iso.ts
@@ -0,0 +1,10 @@
+import { franc, francAll } from "franc-all";
+import { iso6393To1 } from "iso-639-3";
+export { iso6393, iso6393To1, iso6393To2B, iso6393To2T } from "iso-639-3";
+export * as BCP47 from "bcp-47";
+
/**
 * Guess the language of `text`.
 *
 * The text is handed to `franc`; its result is then looked up in
 * `iso6393To1`.
 *
 * @param text - Text whose language should be guessed.
 * @returns The `iso6393To1` entry for franc's result when there is one,
 *   otherwise franc's result unchanged.
 */
export function detectLang(text: string) {
  const detected = franc(text);
  const shortCode = iso6393To1[detected];
  if (shortCode) return shortCode;
  return detected;
}
diff --git a/src/nlp/nlp.ts b/src/nlp/nlp.ts
new file mode 100644
index 0000000..3b1e3a7
--- /dev/null
+++ b/src/nlp/nlp.ts
@@ -0,0 +1,208 @@
/**
 * Matches strings made up only of punctuation characters:
 * `. , ; : ! ? ( ) [ ] { } ' " -`, guillemets (U+00AB/U+00BB),
 * typographic single/double quotes (U+2018/2019/201C/201D),
 * the ellipsis (U+2026) and en/em dashes (U+2013/2014).
 *
 * Hoisted to module scope so the pattern is compiled once; it has no `g`
 * flag, so `.test()` keeps no state between calls.
 */
const PUNCTUATION_ONLY =
  /^[.,;:!?()\[\]{}'"\u00AB\u00BB\u2018\u2019\u201C\u201D\u2026\u2013\u2014-]+$/;

/**
 * Tell whether `text` consists solely of punctuation characters.
 *
 * @param text - Token text to classify.
 * @returns `true` when every character is punctuation; `false` for the
 *   empty string or when any other character is present.
 */
export const isPunctuation = (text: string): boolean =>
  PUNCTUATION_ONLY.test(text);
+
/** Colour returned for any tag that has no entry of its own. */
const DEFAULT_SYNTAX_COLOR = "#666666";

/**
 * Tag -> hex colour for syntactic categories.
 *
 * A `Map` is used instead of an object literal so that tag names which
 * happen to equal `Object.prototype` members ("constructor", "toString", ...)
 * are treated as unknown instead of resolving to inherited functions.
 */
const SYNTAX_COLORS: ReadonlyMap<string, string> = new Map<string, string>([
  // Phrasal categories
  ["S", "#6495ED"], // Sentence - cornflower blue
  ["NP", "#FF7F50"], // Noun Phrase - coral
  ["VP", "#32CD32"], // Verb Phrase - lime green
  ["PP", "#9370DB"], // Prepositional Phrase - medium purple
  ["ADJP", "#FFD700"], // Adjective Phrase - gold
  ["ADVP", "#FF69B4"], // Adverb Phrase - hot pink

  // Part-of-speech tags
  ["NN", "#FFA07A"], // Noun - light salmon
  ["NNS", "#FFA07A"], // Plural Noun - light salmon
  ["NNP", "#FFA07A"], // Proper Noun - light salmon
  ["VB", "#90EE90"], // Verb - light green
  ["VBP", "#90EE90"], // Present tense verb - light green
  ["VBG", "#90EE90"], // Gerund verb - light green
  ["VBZ", "#90EE90"], // 3rd person singular present verb - light green
  ["VBD", "#90EE90"], // Past tense verb - light green
  ["VBN", "#90EE90"], // Past participle verb - light green
  ["JJ", "#F0E68C"], // Adjective - khaki
  ["RB", "#DDA0DD"], // Adverb - plum
  ["IN", "#87CEFA"], // Preposition - light sky blue
  ["DT", "#D3D3D3"], // Determiner - light gray
  ["PRP", "#D8BFD8"], // Personal pronoun - thistle
  ["CC", "#A9A9A9"], // Coordinating conjunction - dark gray

  // Structural nodes
  ["ROOT", "#000000"], // Root - black
  ["LEAF", "#666666"], // Leaf nodes - dark gray
]);

/**
 * Get the display colour for a syntactic category or part-of-speech tag.
 *
 * @param type - Tag to look up (case-sensitive), e.g. `"NP"` or `"VBZ"`.
 * @returns The hex colour registered for `type`, or `"#666666"` when the
 *   tag is not in the table.
 */
export function getColorForType(type: string): string {
  return SYNTAX_COLORS.get(type) ?? DEFAULT_SYNTAX_COLOR;
}
+
/**
 * Tag -> human-readable description for constituency labels and
 * part-of-speech tags.
 *
 * Stored in a `Map` so that inputs equal to `Object.prototype` member names
 * ("toString", "constructor", ...) fall through to the "unknown tag" path
 * instead of returning an inherited function.
 */
const TAG_DESCRIPTIONS: ReadonlyMap<string, string> = new Map<string, string>([
  ["S", "Sentence"],
  ["SBAR", "Subordinating conjunction clause"],
  ["SBARQ", "Direct question"],
  ["SINV", "Declarative sentence with subject-aux inversion"],
  ["SQ", "Subconstituent of SBARQ excluding wh-word"],
  ["WHADVP", "wh-adverb phrase"],
  ["WHNP", "wh-nounphrase"],
  ["WHPP", "wh-prepositional phrase"],
  ["WDT", "wh-determiner"],
  ["WP", "wh-pronoun"],
  ["WRB", "wh-adverb"],
  ["WP$", "possessive wh-pronoun"],
  ["MD", "modal"],
  ["X", "Unknown"],
  ["NP", "Noun Phrase"],
  ["VP", "Verb Phrase"],
  ["PP", "Prepositional Phrase"],
  ["ADJP", "Adjective Phrase"],
  ["ADVP", "Adverb Phrase"],
  ["LS", "List item marker"],
  ["SYM", "Symbol"],
  ["NN", "Noun"],
  ["NNS", "Plural Noun"],
  ["NNP", "Proper Noun"],
  ["NNPS", "Proper Noun, Plural"],
  ["VB", "Verb (base form)"],
  ["VBP", "Verb (present tense)"],
  ["VBG", "Verb (gerund/present participle)"],
  ["VBZ", "Verb (3rd person singular present)"],
  ["VBD", "Verb (past tense)"],
  ["VBN", "Verb (past participle)"],
  ["JJ", "Adjective"],
  ["JJR", "Adjective, comparative"],
  ["JJS", "Adjective, superlative"],
  ["EX", "Existential there"],
  ["RB", "Adverb"],
  ["RBR", "Adverb, comparative"],
  ["RBS", "Adverb, superlative"],
  ["RP", "Particle"],
  ["IN", "Preposition"],
  ["TO", "to"],
  ["DT", "Determiner"],
  ["PDT", "Predeterminer"],
  ["PRP", "Personal Pronoun"],
  ["PP$", "Possessive Pronoun"],
  ["PRP$", "Possessive Pronoun"],
  ["POS", "Possessive ending"],
  ["FW", "Foreign Word"],
  ["CC", "Coordinating Conjunction"],
  ["CD", "Cardinal number"],
  ["UH", "interjection"],
  ["ROOT", "Root Node"],
  ["CLR", "figurative motion"],
  ["FRAG", "fragment"],
  [":", "Colon/Semicolon"],
  [",", "Comma"],
  [".", "Period"],
]);

/**
 * Get a human-readable description for a node type.
 *
 * @param type - Constituency label or part-of-speech tag (case-sensitive).
 * @returns The registered description, or `type` itself when the tag is
 *   not in the table.
 */
export function getDescription(type: string): string {
  return TAG_DESCRIPTIONS.get(type) ?? type;
}
+
// Relation names follow Universal Dependencies, e.g.
// https://universaldependencies.org/u/dep/xcomp.html

/**
 * Dependency relation label -> human-readable name.
 *
 * Stored in a `Map` so that inputs equal to `Object.prototype` member names
 * ("hasOwnProperty", "constructor", ...) are reported as unknown instead of
 * resolving to inherited functions.
 */
const DEPREL_DESCRIPTIONS: ReadonlyMap<string, string> = new Map<
  string,
  string
>([
  ["nsubj", "nominal subject"],
  ["obj", "object"],
  ["iobj", "indirect object"],
  ["csubj", "clausal subject"],
  ["ccomp", "clausal complement"],
  ["xcomp", "open clausal complement"],
  ["obl", "oblique nominal"],
  ["vocative", "vocative"],
  ["expl", "expletive"],
  ["dislocated", "dislocated"],
  ["nmod", "nominal modifier"],
  ["appos", "appositional modifier"],
  ["nummod", "numeric modifier"],
  ["advcl", "adverbial clause modifier"],
  ["acl", "adnominal clause"],
  ["advmod", "adverbial modifier"],
  ["discourse", "discourse element"],
  ["aux", "auxiliary"],
  ["cop", "copula"],
  ["mark", "marker"],
  ["amod", "adjectival modifier"],
  ["det", "determiner"],
  ["clf", "classifier"],
  ["case", "case marker"],
  ["conj", "conjunction"],
  ["cc", "coordinating conjunction"],
  ["fixed", "fixed multiword expression"],
  ["flat", "flat expression"],
  ["list", "list"],
  ["parataxis", "parataxis"],
  ["compound", "compound"],
  ["orphan", "orphan"],
  ["goeswith", "goes with"],
  ["reparandum", "overridden disfluency"],
  ["punct", "punctuation"],
  ["root", "root"],
  ["dep", "unspecified dependency"],
]);

/**
 * Expand a dependency relation label into a readable name.
 *
 * Lookup is exact: a label that is not a key of the table (including
 * subtyped labels such as `"nsubj:pass"`) is logged to the console and
 * returned unchanged.
 *
 * @param type - Dependency relation label, e.g. `"nsubj"`.
 * @returns The readable name, or `type` itself when it is not in the table.
 */
export function unpackDeprel(type: string): string {
  const res = DEPREL_DESCRIPTIONS.get(type);
  if (!res) console.log("tag not found!!", type);

  return res || type;
}
+
/** Colour returned for any label that has no entry of its own. */
const DEFAULT_DEPREL_COLOR = "#666666";

/**
 * Label -> hex colour used when colouring dependency output.
 *
 * A `Map` keeps labels that coincide with `Object.prototype` member names
 * ("constructor", "toString", ...) from resolving to inherited functions.
 */
const DEPREL_COLORS: ReadonlyMap<string, string> = new Map<string, string>([
  // Sentence components and core relations.
  // NOTE(review): "s" / "p" presumably correspond to the subject / predicate
  // component markers produced elsewhere in this package - confirm.
  ["s", "#6495ED"], // cornflower blue
  ["nsubj", "#6495ED"], // cornflower blue (same as "s")
  ["root", "#FFD700"], // gold
  ["p", "#FFD700"], // gold (same as "root")

  // Phrasal categories
  ["NP", "#FF7F50"], // Noun Phrase - coral
  ["VP", "#32CD32"], // Verb Phrase - lime green
  ["PP", "#9370DB"], // Prepositional Phrase - medium purple
  ["ADVP", "#FF69B4"], // Adverb Phrase - hot pink

  // Part-of-speech tags
  ["NN", "#FFA07A"], // Noun - light salmon
  ["NNS", "#FFA07A"], // Plural Noun - light salmon
  ["NNP", "#FFA07A"], // Proper Noun - light salmon
  ["VB", "#90EE90"], // Verb - light green
  ["VBP", "#90EE90"], // Present tense verb - light green
  ["VBG", "#90EE90"], // Gerund verb - light green
  ["VBZ", "#90EE90"], // 3rd person singular present verb - light green
  ["VBD", "#90EE90"], // Past tense verb - light green
  ["VBN", "#90EE90"], // Past participle verb - light green
  ["JJ", "#F0E68C"], // Adjective - khaki
  ["RB", "#DDA0DD"], // Adverb - plum
  ["IN", "#87CEFA"], // Preposition - light sky blue
  ["DT", "#D3D3D3"], // Determiner - light gray
  ["PRP", "#D8BFD8"], // Personal pronoun - thistle
  ["CC", "#A9A9A9"], // Coordinating conjunction - dark gray

  // Structural nodes
  ["ROOT", "#000000"], // Root - black
  ["LEAF", "#666666"], // Leaf nodes - dark gray
]);

/**
 * Get the display colour for a dependency/component label or tag.
 *
 * @param type - Label to look up (case-sensitive), e.g. `"nsubj"` or `"s"`.
 * @returns The hex colour registered for `type`, or `"#666666"` when the
 *   label is not in the table.
 */
export function deprelColors(type: string): string {
  return DEPREL_COLORS.get(type) ?? DEFAULT_DEPREL_COLOR;
}
/**
 * Abbreviated part-of-speech key -> readable name.
 *
 * Stored in a `Map` so that inputs equal to `Object.prototype` member names
 * fall through to the "unknown" path instead of returning an inherited
 * function.
 */
const POS_NAMES: ReadonlyMap<string, string> = new Map<string, string>([
  ["adj", "adjective"],
  ["adv", "adverb"],
  ["adv_phrase", "adverbial phrase"],
  ["combining_form", "combining form"],
  ["conj", "conjunction"],
  ["det", "determiner"],
  ["intj", "interjection"],
  ["num", "number"],
  ["prep", "preposition"],
  ["prep_phrase", "prepositional phrase"],
  ["pron", "pronoun"],
  ["punct", "punctuation"],
]);

/**
 * Expand an abbreviated part-of-speech key into a readable name.
 *
 * @param pos - Abbreviated key, e.g. `"adj"` or `"prep_phrase"`.
 * @returns The readable name, or `pos` itself when it has no entry
 *   (so already-readable values such as `"noun"` pass through unchanged).
 */
export function unpackPos(pos: string): string {
  return POS_NAMES.get(pos) ?? pos;
}
diff --git a/src/nlp/ocr.ts b/src/nlp/ocr.ts
new file mode 100644
index 0000000..1c40355
--- /dev/null
+++ b/src/nlp/ocr.ts
@@ -0,0 +1,18 @@
+import type { AsyncRes } from "sortug";
+
/**
 * Send form data to the local OCR endpoint and return its JSON reply.
 *
 * The request is a `POST` to `http://localhost:8102/ocr` carrying the
 * `X-API-KEY` header read from `Bun.env.SORTUG_NLP_API_KEY` (non-null
 * asserted, so an unset variable is not detected here).
 *
 * @param formData - Body to upload, sent as-is.
 * @returns `{ ok }` with the parsed JSON body on a 2xx response, or
 *   `{ error }` when the server answers with a non-2xx status, the request
 *   fails, or the body is not valid JSON. The JSON is not validated against
 *   `string[]`.
 */
export async function ocr(formData: FormData): AsyncRes<string[]> {
  const endpoint = "http://localhost:8102/ocr";

  try {
    const res = await fetch(endpoint, {
      method: "POST",
      body: formData,
      headers: { "X-API-KEY": Bun.env.SORTUG_NLP_API_KEY! },
    });
    // fetch() only rejects on network failure; an HTTP error status must be
    // checked explicitly or its error body would be returned as a success.
    if (!res.ok) {
      return { error: `ocr request failed: ${res.status} ${res.statusText}` };
    }
    const j = await res.json();
    return { ok: j };
  } catch (e) {
    return { error: `${e}` };
  }
}
diff --git a/src/nlp/spacy.ts b/src/nlp/spacy.ts
new file mode 100644
index 0000000..d79de55
--- /dev/null
+++ b/src/nlp/spacy.ts
@@ -0,0 +1,79 @@
+import type { AsyncRes, Result } from "sortug";
+import { detectLang } from "./iso";
+const ENDPOINT = "http://localhost:8102";
+
/**
 * Send `text` to the `/spacy` endpoint and return its JSON reply.
 *
 * @param text - Text to analyse; sent as the `string` field of the body.
 * @param langg - Language code to send. When omitted (or empty) the
 *   language is guessed from `text` with `detectLang`.
 * @returns `{ ok }` with the parsed JSON body on a 2xx response, or
 *   `{ error }` when the server answers with a non-2xx status, the request
 *   fails, or the body is not valid JSON. The JSON is not validated against
 *   `SpacyRes`.
 */
export async function run(text: string, langg?: string): AsyncRes<SpacyRes> {
  try {
    const lang = langg ? langg : detectLang(text);
    const res = await fetch(ENDPOINT + "/spacy", {
      headers: {
        "Content-type": "application/json",
        "X-API-KEY": Bun.env.SORTUG_NLP_API_KEY!,
      },
      method: "POST",
      body: JSON.stringify({ string: text, lang }),
    });
    // fetch() only rejects on network failure; an HTTP error status must be
    // checked explicitly or its error body would be returned as a success.
    if (!res.ok) {
      return { error: `spacy request failed: ${res.status} ${res.statusText}` };
    }
    const j = await res.json();
    console.log("spacy", j);
    return { ok: j };
  } catch (e) {
    return { error: `${e}` };
  }
}
+
/**
 * Document-level response shape.
 * NOTE(review): not referenced by `run` in this file; presumably an
 * alternative/larger server payload - confirm before relying on it.
 */
export type SpacyResBig = {
  doc: {
    text: string;
    ents: any[];
    sents: Array<{ start: number; end: number }>;
    tokens: Token[];
  };
  segs: Sentence[];
};

/** Response shape declared for `run`: the input plus its sentences. */
export type SpacyRes = {
  input: string;
  segments: Sentence[];
};

/** One sentence of the analysed input. */
export type Sentence = {
  text: string;
  start: number;
  end: number;
  root: Token;
  subj: Token;
  arcs: Arc[];
  words: Word[];
};

/** A labelled dependency arc between two positions. */
export type Arc = {
  start: number;
  end: number;
  label: string; // deprel label
  dir: string;
};

/** Per-token annotations. */
export type Token = {
  id: number;
  head: number;
  start: number;
  end: number;
  dep: string;
  lemma: string;
  morph: string;
  pos: string;
  tag: string;
  text: string;
};

/** A `Token` enriched with tree-navigation data. */
export interface Word extends Token {
  /** Ids of the words above this one; compared against word ids by `isChild`. */
  ancestors: number[];
  /**
   * Previously typed as `[]` (the empty tuple), which made it impossible to
   * represent a word that has children.
   * NOTE(review): presumably word ids like `ancestors` - confirm against the
   * service payload.
   */
  children: number[];
  n_lefts: number;
  n_rights: number;
  left_edge: number;
  right_edge: number;
  morph_map: Record<string, string>;
}
+
/**
 * Tell whether `w` belongs to the subtree headed by `topId`.
 *
 * @param w - Word to test.
 * @param topId - Id of the candidate subtree head.
 * @returns `true` when `w` is the word `topId` itself or lists `topId`
 *   among its `ancestors`.
 */
export function isChild(w: Word, topId: number): boolean {
  if (w.id === topId) return true;
  return w.ancestors.includes(topId);
}
diff --git a/src/nlp/stanza.ts b/src/nlp/stanza.ts
new file mode 100644
index 0000000..5836b91
--- /dev/null
+++ b/src/nlp/stanza.ts
@@ -0,0 +1,210 @@
+import type { AsyncRes, Result } from "sortug";
+import { detectLang } from "./iso";
+
+const ENDPOINT = "http://localhost:8102";
/**
 * Send `text` to the `/stanza` endpoint and return its JSON reply.
 *
 * @param text - Text to analyse; sent as the `string` field of the body.
 * @param langg - Language code to send. When omitted (or empty) the
 *   language is guessed from `text` with `detectLang`.
 * @returns `{ ok }` with the parsed JSON body on a 2xx response, or
 *   `{ error }` when the server answers with a non-2xx status, the request
 *   fails, or the body is not valid JSON. The JSON is not validated against
 *   `StanzaRes`.
 */
export async function segmenter(
  text: string,
  langg?: string,
): AsyncRes<StanzaRes> {
  try {
    const lang = langg ? langg : detectLang(text);
    const res = await fetch(ENDPOINT + "/stanza", {
      headers: {
        "Content-type": "application/json",
        "X-API-KEY": Bun.env.SORTUG_NLP_API_KEY!,
      },
      method: "POST",
      body: JSON.stringify({ lang, string: text }),
    });
    // fetch() only rejects on network failure; an HTTP error status must be
    // checked explicitly or its error body would be returned as a success.
    if (!res.ok) {
      return {
        error: `stanza request failed: ${res.status} ${res.statusText}`,
      };
    }
    const j = await res.json();
    return { ok: j };
  } catch (e) {
    return { error: `${e}` };
  }
}
/**
 * Ask the `/detect-lang` endpoint to identify the language of `text`.
 *
 * @param text - Text to identify; sent as the `string` field of the body.
 * @returns `{ ok }` with the parsed JSON body on a 2xx response (its shape
 *   is not declared or validated here), or `{ error }` when the server
 *   answers with a non-2xx status, the request fails, or the body is not
 *   valid JSON.
 */
export async function idLang(text: string) {
  try {
    const res = await fetch(ENDPOINT + "/detect-lang", {
      headers: {
        "Content-type": "application/json",
        "X-API-KEY": Bun.env.SORTUG_NLP_API_KEY!,
      },
      method: "POST",
      body: JSON.stringify({ string: text }),
    });
    // fetch() only rejects on network failure; an HTTP error status must be
    // checked explicitly or its error body would be returned as a success.
    if (!res.ok) {
      return {
        error: `detect-lang request failed: ${res.status} ${res.statusText}`,
      };
    }
    const j = await res.json();
    return { ok: j };
  } catch (e) {
    return { error: `${e}` };
  }
}
/** Response shape declared for `segmenter`: the input plus its sentences. */
export type StanzaRes = {
  input: string;
  segments: Sentence[];
};

/** One analysed sentence. */
export type Sentence = {
  text: string;
  sentiment: number;
  /** Constituency parse as a tree. */
  constituency: TreeNode;
  /** NOTE(review): presumably the constituency parse in string form - confirm. */
  constring: string;
  dependencies: Dependency[];
  entities: Entity[];
  tokens: Token[];
  words: Word[];
};

/** Node of a constituency tree; a node without children is a leaf. */
export type TreeNode = {
  label: string;
  children: TreeNode[];
};

/**
 * A list of `[word, relation, word]` triples.
 * NOTE(review): `Sentence.dependencies` is declared as `Dependency[]`, i.e. a
 * list of lists of triples - verify against the service payload whether a
 * single level was intended.
 */
export type Dependency = [Word, string, Word][];

/** Per-word annotations; see the sample object in the comment further down. */
export type Word = {
  id: number;
  text: string;
  lemma: string;
  upos: string;
  xpos: string;
  feats: string;
  /** Id of the governing word; looked up in the word map by `parseFurther`. */
  head: number;
  deprel: string;
  start_char: number;
  end_char: number;
};

/** A surface token, which may cover several `Word`s. */
export type Token = {
  id: [number, number];
  text: string;
  misc: string;
  words: Word[];
  start_char: number;
  end_char: number;
  ner: string;
};

/** A named-entity span. */
export type Entity = {
  text: string;
  misc: string;
  start_char: number;
  end_char: number;
  type: string;
};

// Local addition (not part of the service response types above).
export type Clause = {
  words: Word[];
  dependency: Dependency;
  text: string;
};
+// "amod",
+// {
+// "id": 1,
+// "text": "Stony",
+// "lemma": "Stony",
+// "upos": "ADJ",
+// "xpos": "NNP",
+// "feats": "Degree=Pos",
+// "head": 3,
+// "deprel": "amod",
+// "start_char": 0,
+// "end_char": 5
+// }
+//
+//
+
/** Result of `buildTreeFromWords` for one sentence. */
export interface ParsedGrammar {
  /** Id of the word whose `deprel` is `"root"`. */
  predicateCore: number;
  /** Id of an `nsubj` word, or `null` when none was recorded. */
  subjectCore: number | null;
  /** Word id -> ids recorded beneath it. */
  tree: Record<number, number[]>;
  /** Word id -> word. */
  wordMap: WordMap;
  /** The emitted words (those with `deprel` `"punct"` are not emitted). */
  words: BigWord[];
}

/** A `Word` annotated with its ancestor chain and sentence component. */
export interface BigWord extends Word {
  /** Ids from the word's head upwards. */
  ancestry: number[];
  /**
   * `"s"`: the subject core or a word below it; `"p"`: the root or a word
   * below it; `"u"`: neither.
   */
  component: "s" | "p" | "u";
}

/** A word together with its recursively computed dependents. */
export type ComputedDependency = {
  word: BigWord;
  children: ComputedDependency[];
};

/** Lookup table from word id to word. */
export type WordMap = Record<number, Word>;
+
/**
 * Derive the dependency structure of one sentence from its flat word list.
 *
 * @param words - Words of a single sentence. Exactly one of them must have
 *   `deprel === "root"`. The objects are not modified.
 * @returns `{ ok }` with the parsed grammar, or `{ error: "no roots" }` /
 *   `{ error: "too many roots" }` when the root count is not exactly one.
 */
export function buildTreeFromWords(words: Word[]): Result<ParsedGrammar> {
  const roots = words.filter((w) => w.deprel === "root");
  if (roots.length > 1) {
    console.log("roots", roots);
    return { error: "too many roots" };
  }
  const root = roots[0];
  if (!root) return { error: "no roots" };

  const wordMap: WordMap = {};
  for (const w of words) wordMap[w.id] = w;
  return { ok: parseFurther(words, wordMap, root) };
}

/**
 * Pick the sentence's subject core: the `nsubj` word attached directly to
 * the root when there is one, otherwise the first `nsubj` word, otherwise
 * `null`. Resolved before any word is classified so that a subject without
 * dependents, or one that appears late, is still recognised.
 */
function findSubjectCore(words: Word[], root: Word): number | null {
  const subjects = words.filter((w) => w.deprel === "nsubj");
  const main = subjects.find((w) => w.head === root.id) ?? subjects[0];
  return main ? main.id : null;
}

/**
 * List the ids above `word`, nearest first, by following `head` links
 * through `wordMap` until a head is missing. Stops if a link would revisit
 * a word, so malformed (cyclic) input cannot loop forever.
 */
function collectAncestry(word: Word, wordMap: WordMap): number[] {
  const ancestry: number[] = [];
  const seen = new Set<number>([word.id]);
  let current: Word | undefined = wordMap[word.head];
  while (current && !seen.has(current.id)) {
    ancestry.push(current.id);
    seen.add(current.id);
    current = wordMap[current.head];
  }
  return ancestry;
}

/**
 * Build the `ParsedGrammar` for a sentence whose single root is known.
 *
 * - Words with `deprel === "punct"` are not emitted; their text is appended
 *   to the previously emitted word (leading punctuation is dropped).
 * - `tree` maps each head id to the ids of the emitted words attached to
 *   it, each listed once, in sentence order. The root is listed under its
 *   own `head` value and always has an entry of its own.
 * - Each emitted word is a copy carrying `ancestry` and `component`, and
 *   replaces the plain word in `wordMap`.
 *
 * @param words - All words of the sentence, in order.
 * @param wordMap - Id lookup for `words`; updated in place.
 * @param root - The word with `deprel === "root"`.
 */
function parseFurther(
  words: Word[],
  wordMap: WordMap,
  root: Word,
): ParsedGrammar {
  const predicateCore = root.id;
  const subjectCore = findSubjectCore(words, root);
  const tree: Record<number, number[]> = {};
  const bigwords: BigWord[] = [];

  for (const w of words) {
    if (w.deprel === "punct") {
      // Attach to the last emitted copy, never to the caller's object.
      const prev = bigwords[bigwords.length - 1];
      if (prev) prev.text += w.text;
      continue;
    }

    const siblings = tree[w.head];
    if (siblings) siblings.push(w.id);
    else tree[w.head] = [w.id];
    // A word without a resolvable head (the root) still gets its own entry,
    // without discarding children registered by earlier words.
    if (!wordMap[w.head] && !tree[w.id]) tree[w.id] = [];

    const ancestry = collectAncestry(w, wordMap);
    const inSubject =
      subjectCore !== null &&
      (w.id === subjectCore || ancestry.includes(subjectCore));
    const inPredicate =
      w.id === predicateCore || ancestry.includes(predicateCore);
    const component = inSubject ? "s" : inPredicate ? "p" : "u";

    const bw: BigWord = { ...w, component, ancestry };
    wordMap[w.id] = bw;
    bigwords.push(bw);
  }

  return {
    predicateCore,
    subjectCore,
    wordMap,
    tree,
    words: bigwords,
  };
}
+
/**
 * Tell whether `node` has exactly one child and that child is a leaf.
 *
 * @param node - Tree node to inspect.
 * @returns `true` only when `node.children` has length 1 and that single
 *   child has no children of its own.
 */
export function oneDescendant(node: TreeNode): boolean {
  const { children } = node;
  return children.length === 1 && children[0].children.length === 0;
}
+
+// function findChildren(wordmap: WordMap, word: Word): ComputedDependency {
+// const children = words.filter((w) => w.head === head.id);
+// return {
+// word: head,
+// children: children.map((c) => findChildren(words, c)),
+// };
+// }
diff --git a/src/nlp/types.ts b/src/nlp/types.ts
new file mode 100644
index 0000000..605a637
--- /dev/null
+++ b/src/nlp/types.ts
@@ -0,0 +1,50 @@
/** Granularity levels a view can be set to, from coarsest to finest. */
export type ViewLevel =
  | "text"
  | "paragraph"
  | "sentence"
  | "clause"
  | "word"
  | "syllable"
  | "phoneme";

/**
 * Current view position: a level plus one nullable index per level.
 * NOTE(review): the prefixes presumably stand for paragraph, sentence,
 * clause, word, syllable (y) and phoneme (f) - confirm with the consumer.
 */
export interface ViewState {
  level: ViewLevel;
  pIndex: number | null;
  sIndex: number | null;
  cIndex: number | null;
  wIndex: number | null;
  yIndex: number | null;
  fIndex: number | null;
}

/** Props handed to a view: its index, its raw text and its parent context. */
export interface ViewProps {
  idx: number;
  rawText: string;
  context: Context;
}

/** Parent text, its segmentation, and an index. */
export type Context = {
  parentText: string;
  segmented: string[];
  idx: number;
};

/** Kinds of expression a `WordData` entry can describe. */
export type ExpressionType = "word" | "expression" | "syllable";

/** Lexical data for one expression. */
export type WordData = {
  confidence: number;
  frequency: number | null;
  id: number;
  ipa: { ipa: string; tags: string[] }[];
  spelling: string;
  type: ExpressionType;
  syllables: number;
  lang: string;
  prosody: any;
  senses: Sense[];
};

/** One entry of `WordData.senses`. */
export type Sense = {
  etymology: string;
  pos: string;
  forms: { form: string; tags: string[] }[];
  related: any;
  senses: { glosses: string[]; links: [string, string][] }[];
};

/** Lifecycle states of an asynchronous load. */
export type LoadingStatus = "pending" | "loading" | "success" | "error";