7 files changed, 582 insertions, 0 deletions
diff --git a/src/nlp/index.ts b/src/nlp/index.ts
new file mode 100644
index 0000000..ebed586
--- /dev/null
+++ b/src/nlp/index.ts
@@ -0,0 +1,7 @@
+import * as Spacy from "./spacy";
+import * as Stanza from "./stanza";
+import * as ISO from "./iso";
+import { ocr } from "./ocr";
+import type * as Types from "./types";
+export * from "./nlp";
+export { ISO, ocr, Stanza, Spacy, type Types };
diff --git a/src/nlp/iso.ts b/src/nlp/iso.ts
new file mode 100644
index 0000000..3e60850
--- /dev/null
+++ b/src/nlp/iso.ts
@@ -0,0 +1,10 @@
+import { franc, francAll } from "franc-all";
+import { iso6393To1 } from "iso-639-3";
+export { iso6393, iso6393To1, iso6393To2B, iso6393To2T } from "iso-639-3";
+export * as BCP47 from "bcp-47";
+
+// Language of `text` per franc, as its `iso6393To1` entry when one exists, else franc's own code.
+export function detectLang(text: string) {
+  const francCode = franc(text);
+  return iso6393To1[francCode] || francCode;
+}
diff --git a/src/nlp/nlp.ts b/src/nlp/nlp.ts
new file mode 100644
index 0000000..3b1e3a7
--- /dev/null
+++ b/src/nlp/nlp.ts
@@ -0,0 +1,208 @@
+/**
+ * True when `text` is non-empty and consists only of the punctuation characters
+ * in the class below (brackets, quotes, guillemets, ellipsis, hyphen, ...).
+ */
+export const isPunctuation = (text: string): boolean => /^[.,;:!?()[\]{}'"«»""''…-]+$/.test(text);
+
+// Display colour (hex string) for a syntactic category or POS tag.
+// Tags with no entry of their own — including inherited object keys such as
+// "constructor", which a bare lookup would resolve — get the grey "#666666".
+export function getColorForType(type: string): string {
+  const colors: Record<string, string> = {
+    // Phrasal categories
+    S: "#6495ED", // Sentence - cornflower blue
+    NP: "#FF7F50", // Noun Phrase - coral
+    VP: "#32CD32", // Verb Phrase - lime green
+    PP: "#9370DB", // Prepositional Phrase - medium purple
+    ADJP: "#FFD700", // Adjective Phrase - gold
+    ADVP: "#FF69B4", // Adverb Phrase - hot pink
+    // Part-of-speech tags
+    NN: "#FFA07A", // Noun - light salmon
+    NNS: "#FFA07A", // Plural Noun - light salmon
+    NNP: "#FFA07A", // Proper Noun - light salmon
+    VB: "#90EE90", // Verb - light green
+    VBP: "#90EE90", // Present tense verb - light green
+    VBG: "#90EE90", // Gerund verb - light green
+    VBZ: "#90EE90", // 3rd person singular present verb - light green
+    VBD: "#90EE90", // Past tense verb - light green
+    VBN: "#90EE90", // Past participle verb - light green
+    JJ: "#F0E68C", // Adjective - khaki
+    RB: "#DDA0DD", // Adverb - plum
+    IN: "#87CEFA", // Preposition - light sky blue
+    DT: "#D3D3D3", // Determiner - light gray
+    PRP: "#D8BFD8", // Personal pronoun - thistle
+    CC: "#A9A9A9", // Coordinating conjunction - dark gray
+    // Default
+    ROOT: "#000000", // Root - black
+    LEAF: "#666666", // Leaf nodes - dark gray
+  };
+  const own = Object.prototype.hasOwnProperty.call(colors, type);
+  return (own && colors[type]) || "#666666";
+}
+
+// Readable name for a constituency/POS label; labels lacking an own entry are returned unchanged.
+export function getDescription(type: string): string {
+  const descriptions: Record<string, string> = {
+    S: "Sentence",
+    SBAR: "Subordinating conjunction clause",
+    SBARQ: "Direct question",
+    SINV: "Declarative sentence with subject-aux inversion",
+    SQ: "Subconstituent of SBARQ excluding wh-word",
+    WHADVP: "wh-adverb phrase",
+    WHNP: "wh-nounphrase",
+    WHPP: "wh-prepositional phrase",
+    WDT: "wh-determiner",
+    WP: "wh-pronoun",
+    WRB: "wh-adverb",
+    WP$: "possessive wh-pronoun",
+    MD: "modal",
+    X: "Unknown",
+    NP: "Noun Phrase",
+    VP: "Verb Phrase",
+    PP: "Prepositional Phrase",
+    ADJP: "Adjective Phrase",
+    ADVP: "Adverb Phrase",
+    LS: "List item marker",
+    SYM: "Symbol",
+    NN: "Noun",
+    NNS: "Plural Noun",
+    NNP: "Proper Noun",
+    NNPS: "Proper Noun, Plural",
+    VB: "Verb (base form)",
+    VBP: "Verb (present tense)",
+    VBG: "Verb (gerund/present participle)",
+    VBZ: "Verb (3rd person singular present)",
+    VBD: "Verb (past tense)",
+    VBN: "Verb (past participle)",
+    JJ: "Adjective",
+    JJR: "Adjective, comparative",
+    JJS: "Adjective, superlative",
+    EX: "Existential there",
+    RB: "Adverb",
+    RBR: "Adverb, comparative",
+    RBS: "Adverb, superlative",
+    RP: "Particle",
+    IN: "Preposition",
+    TO: "to",
+    DT: "Determiner",
+    PDT: "Predeterminer",
+    PRP: "Personal Pronoun",
+    PP$: "Possessive Pronoun",
+    PRP$: "Possessive Pronoun",
+    POS: "Possessive ending",
+    FW: "Foreign Word",
+    CC: "Coordinating Conjunction",
+    CD: "Cardinal number",
+    UH: "interjection",
+    ROOT: "Root Node",
+    CLR: "figurative motion",
+    FRAG: "fragment",
+    ":": "Colon/Semicolon",
+    ",": "Comma",
+    ".": "Period",
+  };
+  const own = Object.prototype.hasOwnProperty.call(descriptions, type);
+  return (own && descriptions[type]) || type;
+}
+
+// Spells out a Universal Dependencies relation (https://universaldependencies.org/u/dep/index.html).
+// Labels lacking an own entry are logged with "tag not found!!" and returned unchanged.
+export function unpackDeprel(type: string): string {
+  const descriptions: Record<string, string> = {
+    nsubj: "nominal subject",
+    obj: "object",
+    iobj: "indirect object",
+    csubj: "clausal subject",
+    ccomp: "clausal complement",
+    xcomp: "open clausal complement",
+    obl: "oblique nominal",
+    vocative: "vocative",
+    expl: "expletive",
+    dislocated: "dislocated",
+    nmod: "nominal modifier",
+    appos: "appositional modifier",
+    nummod: "numeric modifier",
+    advcl: "adverbial clause modifier",
+    acl: "adnominal clause",
+    advmod: "adverbial modifier",
+    discourse: "discourse element",
+    aux: "auxiliary",
+    cop: "copula",
+    mark: "marker",
+    amod: "adjectival modifier",
+    det: "determiner",
+    clf: "classifier",
+    case: "case marker",
+    conj: "conjunction",
+    cc: "coordinating conjunction",
+    fixed: "fixed multiword expression",
+    flat: "flat expression",
+    list: "list",
+    parataxis: "parataxis",
+    compound: "compound",
+    orphan: "orphan",
+    goeswith: "goes with",
+    reparandum: "overridden disfluency",
+    punct: "punctuation",
+    root: "root",
+    dep: "unspecified dependency",
+  };
+  const own = Object.prototype.hasOwnProperty.call(descriptions, type);
+  const res = own ? descriptions[type] : undefined;
+  if (!res) console.log("tag not found!!", type);
+  return res || type;
+}
+
+// Display colour (hex string) for a dependency relation, component letter or tag.
+// Keys lacking an own entry (inherited object keys included) get grey "#666666".
+export function deprelColors(type: string): string {
+  const colors: Record<string, string> = {
+    // Relations and sentence components
+    s: "#6495ED", // "s" (presumably the subject component) - cornflower blue
+    nsubj: "#6495ED", // Nominal subject - cornflower blue
+    root: "#FFD700", // Root relation - gold
+    p: "#FFD700", // "p" (presumably the predicate component) - gold
+    NP: "#FF7F50", // Noun Phrase - coral
+    VP: "#32CD32", // Verb Phrase - lime green
+    PP: "#9370DB", // Prepositional Phrase - medium purple
+    ADVP: "#FF69B4", // Adverb Phrase - hot pink
+    // Part-of-speech tags
+    NN: "#FFA07A", // Noun - light salmon
+    NNS: "#FFA07A", // Plural Noun - light salmon
+    NNP: "#FFA07A", // Proper Noun - light salmon
+    VB: "#90EE90", // Verb - light green
+    VBP: "#90EE90", // Present tense verb - light green
+    VBG: "#90EE90", // Gerund verb - light green
+    VBZ: "#90EE90", // 3rd person singular present verb - light green
+    VBD: "#90EE90", // Past tense verb - light green
+    VBN: "#90EE90", // Past participle verb - light green
+    JJ: "#F0E68C", // Adjective - khaki
+    RB: "#DDA0DD", // Adverb - plum
+    IN: "#87CEFA", // Preposition - light sky blue
+    DT: "#D3D3D3", // Determiner - light gray
+    PRP: "#D8BFD8", // Personal pronoun - thistle
+    CC: "#A9A9A9", // Coordinating conjunction - dark gray
+    // Default
+    ROOT: "#000000", // Root - black
+    LEAF: "#666666", // Leaf nodes - dark gray
+  };
+  const own = Object.prototype.hasOwnProperty.call(colors, type);
+  return (own && colors[type]) || "#666666";
+}
+// Expands an abbreviated part-of-speech key; keys lacking an own entry pass through unchanged.
+export function unpackPos(pos: string): string {
+  const map: Record<string, string> = {
+    adj: "adjective",
+    adv: "adverb", adv_phrase: "adverbial phrase",
+    combining_form: "combining form",
+    conj: "conjunction",
+    det: "determiner",
+    intj: "interjection",
+    num: "number",
+    prep: "preposition",
+    prep_phrase: "prepositional phrase",
+    pron: "pronoun",
+    punct: "punctuation",
+  };
+  return (Object.prototype.hasOwnProperty.call(map, pos) && map[pos]) || pos;
+}
diff --git a/src/nlp/ocr.ts b/src/nlp/ocr.ts
new file mode 100644
index 0000000..1c40355
--- /dev/null
+++ b/src/nlp/ocr.ts
@@ -0,0 +1,18 @@
+import type { AsyncRes } from "sortug";
+
+// POSTs `formData` to the local OCR endpoint; resolves to `{ ok }` with the parsed JSON reply.
+// Thrown errors and non-2xx replies resolve to `{ error }` instead of rejecting.
+export async function ocr(formData: FormData): AsyncRes<string[]> {
+  const headers = { "X-API-KEY": Bun.env.SORTUG_NLP_API_KEY! };
+  try {
+    const res = await fetch("http://localhost:8102/ocr", {
+      method: "POST",
+      body: formData,
+      headers,
+    });
+    if (!res.ok) return { error: `ocr request failed: ${res.status} ${res.statusText}` };
+    return { ok: await res.json() };
+  } catch (e) {
+    return { error: `${e}` };
+  }
+}
diff --git a/src/nlp/spacy.ts b/src/nlp/spacy.ts
new file mode 100644
index 0000000..d79de55
--- /dev/null
+++ b/src/nlp/spacy.ts
@@ -0,0 +1,79 @@
+import type { AsyncRes, Result } from "sortug";
+import { detectLang } from "./iso";
+const ENDPOINT = "http://localhost:8102";
+
+// POSTs `text` as JSON to ENDPOINT + "/spacy" and resolves to `{ ok }` with the
+// parsed reply. `langg`, when given, is sent as the language; otherwise
+// detectLang(text) is. Thrown errors and non-2xx replies resolve to `{ error }`.
+export async function run(text: string, langg?: string): AsyncRes<SpacyRes> {
+  try {
+    const lang = langg || detectLang(text);
+    const res = await fetch(ENDPOINT + "/spacy", {
+      method: "POST",
+      headers: {
+        "Content-type": "application/json",
+        "X-API-KEY": Bun.env.SORTUG_NLP_API_KEY!,
+      },
+      body: JSON.stringify({ string: text, lang }),
+    });
+    if (!res.ok) return { error: `spacy request failed: ${res.status} ${res.statusText}` };
+    return { ok: await res.json() };
+  } catch (e) {
+    return { error: `${e}` };
+  }
+}
+
+export type SpacyResBig = { // NOTE(review): not referenced by run(); presumably a fuller payload variant — confirm
+  doc: {
+    text: string;
+    ents: any[]; // NOTE(review): untyped; the entity shape is not visible in this file
+    sents: Array<{ start: number; end: number }>;
+    tokens: Token[];
+  };
+  segs: Sentence[];
+};
+export type SpacyRes = { // declared `ok` payload of run()
+  input: string;
+  segments: Sentence[];
+};
+export type Sentence = {
+  text: string;
+  start: number;
+  end: number;
+  root: Token;
+  subj: Token;
+  arcs: Arc[];
+  words: Word[];
+};
+export type Arc = {
+  start: number;
+  end: number;
+  label: string; // deprel label
+  dir: string;
+};
+export type Token = {
+  id: number;
+  head: number; // presumably the id of the governing token — TODO confirm
+  start: number;
+  end: number;
+  dep: string;
+  lemma: string;
+  morph: string;
+  pos: string;
+  tag: string;
+  text: string;
+};
+// Token extended with tree-position fields; isChild() reads `id` and `ancestors`.
+export interface Word extends Token {
+  ancestors: number[]; // values are compared against a Word id by isChild()
+  children: []; // NOTE(review): `[]` is the empty-tuple type; presumably an id list (number[]) was meant — confirm
+  n_lefts: number;
+  n_rights: number;
+  left_edge: number;
+  right_edge: number;
+  morph_map: Record<string, string>;
+}
+// True when `w` is the word with id `topId` or lists `topId` among its ancestors.
+export function isChild(w: Word, topId: number): boolean {
+  return topId === w.id || w.ancestors.includes(topId);
+}
diff --git a/src/nlp/stanza.ts b/src/nlp/stanza.ts
new file mode 100644
index 0000000..5836b91
--- /dev/null
+++ b/src/nlp/stanza.ts
@@ -0,0 +1,210 @@
+import type { AsyncRes, Result } from "sortug";
+import { detectLang } from "./iso";
+
+const ENDPOINT = "http://localhost:8102";
+// POSTs `text` as JSON to ENDPOINT + "/stanza"; `langg` overrides detectLang(text).
+// Thrown errors and non-2xx replies resolve to `{ error }` instead of rejecting.
+export async function segmenter(
+  text: string,
+  langg?: string,
+): AsyncRes<StanzaRes> {
+  try {
+    const lang = langg || detectLang(text);
+    const res = await fetch(ENDPOINT + "/stanza", {
+      method: "POST",
+      headers: {
+        "Content-type": "application/json",
+        "X-API-KEY": Bun.env.SORTUG_NLP_API_KEY!,
+      },
+      body: JSON.stringify({ lang, string: text }),
+    });
+    if (!res.ok) return { error: `stanza request failed: ${res.status} ${res.statusText}` };
+    return { ok: await res.json() };
+  } catch (e) {
+    return { error: `${e}` };
+  }
+}
+// POSTs `text` as JSON to ENDPOINT + "/detect-lang"; resolves to `{ ok }` with the parsed reply.
+// Thrown errors and non-2xx replies resolve to `{ error }` instead of rejecting.
+export async function idLang(text: string) {
+  try {
+    const res = await fetch(ENDPOINT + "/detect-lang", {
+      method: "POST",
+      headers: {
+        "Content-type": "application/json",
+        "X-API-KEY": Bun.env.SORTUG_NLP_API_KEY!,
+      },
+      body: JSON.stringify({ string: text }),
+    });
+    if (!res.ok) return { error: `detect-lang request failed: ${res.status} ${res.statusText}` };
+    return { ok: await res.json() };
+  } catch (e) {
+    return { error: `${e}` };
+  }
+}
+export type StanzaRes = { input: string; segments: Sentence[] }; // declared `ok` payload of segmenter()
+export type Sentence = {
+  text: string;
+  sentiment: number;
+  constituency: TreeNode;
+  constring: string; // presumably the constituency tree in string form — TODO confirm
+  dependencies: Dependency[]; // NOTE(review): Dependency is itself an array of triples, so this is an array of arrays — confirm against the payload
+  entities: Entity[];
+  tokens: Token[];
+  words: Word[];
+};
+export type TreeNode = {
+  label: string;
+  children: TreeNode[]; // oneDescendant() treats a node with no children as a leaf
+};
+export type Dependency = Array<[Word, string, Word]>; // triples of [Word, label, Word]; which Word is the head is not shown here — TODO confirm
+export type Word = {
+  id: number; // key used for the WordMap built in buildTreeFromWords()
+  text: string;
+  lemma: string;
+  upos: string;
+  xpos: string;
+  feats: string;
+  head: number; // looked up in the id-keyed WordMap to find the governing word
+  deprel: string; // this file tests for the values "root", "nsubj" and "punct"
+  start_char: number;
+  end_char: number;
+};
+export type Token = {
+  id: [number, number];
+  text: string;
+  misc: string;
+  words: Word[];
+  start_char: number;
+  end_char: number;
+  ner: string;
+};
+export type Entity = {
+  text: string;
+  misc: string;
+  start_char: number;
+  end_char: number;
+  type: string;
+};
+
+// mine (NOTE(review): presumably a client-side type rather than part of the service payload — confirm)
+export type Clause = {
+  words: Word[];
+  dependency: Dependency;
+  text: string;
+};
+// "amod",
+// {
+//   "id": 1,
+//   "text": "Stony",
+//   "lemma": "Stony",
+//   "upos": "ADJ",
+//   "xpos": "NNP",
+//   "feats": "Degree=Pos",
+//   "head": 3,
+//   "deprel": "amod",
+//   "start_char": 0,
+//   "end_char": 5
+// }
+//
+//
+
+export interface ParsedGrammar { // result of buildTreeFromWords() / parseFurther()
+  predicateCore: number; // id of the word whose deprel is "root"
+  subjectCore: number | null; // id of an "nsubj" word picked by parseFurther(), or null
+  tree: Record<number, number[]>; // keyed by head id; values are word ids
+  wordMap: WordMap; // the id-keyed map handed to parseFurther(), with processed entries replaced by BigWords
+  words: BigWord[]; // the non-punctuation words, in input order
+}
+export interface BigWord extends Word {
+  ancestry: number[]; // ids of the heads above this word, nearest first
+  component: "s" | "p" | "u"; // set by parseFurther(); presumably subject / predicate / unassigned
+}
+export type ComputedDependency = { // NOTE(review): only referenced by the commented-out findChildren below
+  word: BigWord;
+  children: ComputedDependency[];
+};
+export type WordMap = Record<number, Word>; // keyed by Word.id
+
+// Entry point for turning one sentence's words into a ParsedGrammar.
+// Requires exactly one word whose deprel is "root": none or several yield
+// `{ error }` (several are also logged). Otherwise the words, an id -> word
+// lookup and the root are passed to parseFurther, whose result is `{ ok }`.
+export function buildTreeFromWords(words: Word[]): Result<ParsedGrammar> {
+  const roots = words.filter((w) => w.deprel === "root");
+  if (roots.length === 0) return { error: "no roots" };
+  if (roots.length > 1) {
+    console.log("roots", roots);
+    return { error: "too many roots" };
+  }
+  // Later words overwrite earlier ones that share an id.
+  const byId: WordMap = {};
+  for (const word of words) byId[word.id] = word;
+  return { ok: parseFurther(words, byId, roots[0]) };
+}
+// Builds the ParsedGrammar for a sentence whose single root is already known.
+// - subjectCore: id of the first "nsubj" word attached directly to the root,
+//   found up front so the labels below do not depend on word order.
+// - tree: head id -> ids of its non-punctuation dependents, in input order.
+// - component: "s" at/under the subject core, else "p" at/under the root, else "u".
+// Punctuation words get no entry; their text is appended to the BigWord emitted
+// just before them. Input Word objects are not modified, but `wordMap` entries
+// are replaced by the BigWord copies.
+function parseFurther(
+  words: Word[],
+  wordMap: WordMap,
+  root: Word,
+): ParsedGrammar {
+  const predicateCore = root.id;
+  const subj = words.find((w) => w.deprel === "nsubj" && w.head === root.id);
+  const subjectCore: number | null = subj ? subj.id : null;
+  const tree: Record<number, number[]> = {};
+  const bigwords: BigWord[] = [];
+  // Ids of the heads above `w`, nearest first; `seen` stops malformed cycles.
+  const getAncestry = (w: Word): number[] => {
+    const chain: number[] = [];
+    const seen = new Set<number>([w.id]);
+    let cur: Word | undefined = wordMap[w.head];
+    while (cur && !seen.has(cur.id)) {
+      chain.push(cur.id);
+      seen.add(cur.id);
+      cur = wordMap[cur.head];
+    }
+    return chain;
+  };
+  for (const w of words) {
+    if (w.deprel === "punct") {
+      const prev = bigwords[bigwords.length - 1];
+      if (prev) prev.text += w.text;
+      continue;
+    }
+    // Register w under its head; a word with no known head also gets its own entry.
+    (tree[w.head] ||= []).push(w.id);
+    if (!wordMap[w.head]) tree[w.id] ||= [];
+    const ancestry = getAncestry(w);
+    const inSubject =
+      subjectCore !== null &&
+      (w.id === subjectCore || ancestry.includes(subjectCore));
+    const inPredicate = w.id === predicateCore || ancestry.includes(root.id);
+    const component = inSubject ? "s" : inPredicate ? "p" : "u";
+    const bw: BigWord = { ...w, component, ancestry };
+    wordMap[w.id] = bw;
+    bigwords.push(bw);
+  }
+  return { predicateCore, subjectCore, wordMap, tree, words: bigwords };
+}
+
+// Reports whether `node` has exactly one child and that child is a leaf
+// (its own `children` array is empty).
+export function oneDescendant(node: TreeNode): boolean {
+  const kids = node.children;
+  if (kids.length === 1) return kids[0].children.length === 0;
+  return false;
+}
+
+// function findChildren(wordmap: WordMap, word: Word): ComputedDependency {
+//   const children = words.filter((w) => w.head === head.id);
+//   return {
+//     word: head,
+//     children: children.map((c) => findChildren(words, c)),
+//   };
+// }
diff --git a/src/nlp/types.ts b/src/nlp/types.ts
new file mode 100644
index 0000000..605a637
--- /dev/null
+++ b/src/nlp/types.ts
@@ -0,0 +1,50 @@
+export type ViewLevel = // granularity names, listed from whole text down to phoneme
+  | "text"
+  | "paragraph"
+  | "sentence"
+  | "clause"
+  | "word"
+  | "syllable"
+  | "phoneme";
+export interface ViewState {
+  level: ViewLevel;
+  pIndex: number | null; // NOTE(review): p/s/c/w/y/f presumably index paragraph/sentence/clause/word/syllable/phoneme — confirm
+  sIndex: number | null;
+  cIndex: number | null;
+  wIndex: number | null;
+  yIndex: number | null;
+  fIndex: number | null;
+}
+
+export interface ViewProps {
+  idx: number;
+  rawText: string;
+  context: Context;
+}
+export type Context = {
+  parentText: string;
+  segmented: string[]; // presumably parentText split into pieces, with `idx` selecting one — TODO confirm
+  idx: number;
+};
+
+export type WordData = {
+  confidence: number;
+  frequency: number | null;
+  id: number;
+  ipa: Array<{ ipa: string; tags: string[] }>;
+  spelling: string;
+  type: ExpressionType;
+  syllables: number;
+  lang: string;
+  prosody: any; // NOTE(review): untyped; shape is not visible in this file
+  senses: Sense[];
+};
+export type ExpressionType = "word" | "expression" | "syllable";
+export type Sense = {
+  etymology: string;
+  pos: string; // presumably one of the abbreviated keys unpackPos() expands — TODO confirm
+  forms: Array<{ form: string; tags: string[] }>;
+  related: any; // NOTE(review): untyped; shape is not visible in this file
+  senses: Array<{ glosses: string[]; links: Array<[string, string]> }>;
+};
+export type LoadingStatus = "pending" | "loading" | "success" | "error";