Diffstat (limited to 'src/nlp/stanza.ts')
-rw-r--r--	src/nlp/stanza.ts	210
1 file changed, 210 insertions, 0 deletions
diff --git a/src/nlp/stanza.ts b/src/nlp/stanza.ts
new file mode 100644
index 0000000..5836b91
--- /dev/null
+++ b/src/nlp/stanza.ts
@@ -0,0 +1,210 @@
+import type { AsyncRes, Result } from "sortug";
+import { detectLang } from "./iso";
+
+const ENDPOINT = "http://localhost:8102";
+
+// POST the text to the local Stanza sidecar; if no language is given,
+// detect it first with detectLang.
+export async function segmenter(
+  text: string,
+  langg?: string,
+): AsyncRes<StanzaRes> {
+  try {
+    const lang = langg ? langg : detectLang(text);
+    const body = JSON.stringify({ lang, string: text });
+    const opts = {
+      headers: {
+        "Content-Type": "application/json",
+        "X-API-KEY": Bun.env.SORTUG_NLP_API_KEY!,
+      },
+      method: "POST",
+      body,
+    };
+    const res = await fetch(ENDPOINT + "/stanza", opts);
+    const j = await res.json();
+    return { ok: j };
+  } catch (e) {
+    return { error: `${e}` };
+  }
+}
+
+// Ask the sidecar to identify the language of the given text.
+export async function idLang(text: string) {
+  try {
+    const body = JSON.stringify({ string: text });
+    const opts = {
+      headers: {
+        "Content-Type": "application/json",
+        "X-API-KEY": Bun.env.SORTUG_NLP_API_KEY!,
+      },
+      method: "POST",
+      body,
+    };
+    const res = await fetch(ENDPOINT + "/detect-lang", opts);
+    const j = await res.json();
+    return { ok: j };
+  } catch (e) {
+    return { error: `${e}` };
+  }
+}
+
+export type StanzaRes = { input: string; segments: Sentence[] };
+export type Sentence = {
+  text: string;
+  sentiment: number;
+  constituency: TreeNode;
+  constring: string;
+  dependencies: Dependency[];
+  entities: Entity[];
+  tokens: Token[];
+  words: Word[];
+};
+export type TreeNode = {
+  label: string;
+  children: TreeNode[];
+};
+export type Dependency = Array<[Word, string, Word]>;
+export type Word = {
+  id: number;
+  text: string;
+  lemma: string;
+  upos: string;
+  xpos: string;
+  feats: string;
+  head: number;
+  deprel: string;
+  start_char: number;
+  end_char: number;
+};
+export type Token = {
+  id: [number, number];
+  text: string;
+  misc: string;
+  words: Word[];
+  start_char: number;
+  end_char: number;
+  ner: string;
+};
+export type Entity = {
+  text: string;
+  misc: string;
+  start_char: number;
+  end_char: number;
+  type: string;
+};
+
+// mine: custom types built on top of the Stanza response
+export type Clause = {
+  words: Word[];
+  dependency: Dependency;
+  text: string;
+};
+// Sample of the shape Stanza returns for a word, e.g. "Stony" in an
+// "amod" relation:
+// {
+//   "id": 1,
+//   "text": "Stony",
+//   "lemma": "Stony",
+//   "upos": "ADJ",
+//   "xpos": "NNP",
+//   "feats": "Degree=Pos",
+//   "head": 3,
+//   "deprel": "amod",
+//   "start_char": 0,
+//   "end_char": 5
+// }
+
+export interface ParsedGrammar {
+  predicateCore: number;
+  subjectCore: number | null;
+  tree: Record<number, number[]>;
+  wordMap: WordMap;
+  words: BigWord[];
+}
+export interface BigWord extends Word {
+  ancestry: number[];
+  component: "s" | "p" | "u";
+}
+export type ComputedDependency = {
+  word: BigWord;
+  children: ComputedDependency[];
+};
+export type WordMap = Record<number, Word>;
+
+export function buildTreeFromWords(words: Word[]): Result<ParsedGrammar> {
+  const roots = words.filter((w) => w.deprel === "root");
+  if (roots.length > 1) {
+    console.log("roots", roots);
+    return { error: "too many roots" };
+  } else if (roots.length === 0) {
+    return { error: "no roots" };
+  } else {
+    const root = roots[0];
+    const wordmap = words.reduce((acc: WordMap, item) => {
+      acc[item.id] = item;
+      return acc;
+    }, {});
+    return { ok: parseFurther(words, wordmap, root) };
+  }
+}
+
+function parseFurther(
+  words: Word[],
+  wordMap: WordMap,
+  root: Word,
+): ParsedGrammar {
+  const predicateCore = root.id;
+  // Find the subject head up front, so words that appear before the
+  // nsubj word in the sentence are still classified correctly.
+  let subjectCore: number | null = null;
+  for (const w of words) {
+    if (w.deprel === "nsubj") {
+      subjectCore = w.id;
+      break;
+    }
+  }
+  const tree: Record<number, number[]> = {};
+  const bigwords: BigWord[] = [];
+  // Walk from a word's parent up to the root, registering each link in
+  // the child table on the way. The same ancestor chain is walked once
+  // per descendant, so duplicate child entries are skipped.
+  const getAncestry = (parent: Word): number[] => {
+    const kids = tree[parent.head] || [];
+    if (!kids.includes(parent.id)) tree[parent.head] = [...kids, parent.id];
+    const grandpa = wordMap[parent.head];
+    if (!grandpa) return [parent.id];
+    else return [parent.id, ...getAncestry(grandpa)];
+  };
+  for (let i = 0; i < words.length; i++) {
+    const w = words[i];
+    // Fold punctuation into the preceding word rather than treating it
+    // as a node of its own.
+    if (w.deprel === "punct") {
+      const prev = words[i - 1];
+      if (!prev) continue;
+      prev.text += w.text;
+      continue;
+    }
+    const parent = wordMap[w.head];
+    if (!parent) tree[w.id] = [];
+    const ancestry = !parent ? [] : getAncestry(parent);
+    const component =
+      subjectCore && (w.id === subjectCore || ancestry.includes(subjectCore))
+        ? "s"
+        : w.id === predicateCore || ancestry.includes(root.id)
+          ? "p"
+          : "u";
+    const bw: BigWord = { ...w, component, ancestry };
+    wordMap[w.id] = bw;
+    bigwords.push(bw);
+  }
+  const pg: ParsedGrammar = {
+    predicateCore,
+    subjectCore,
+    wordMap,
+    tree,
+    words: bigwords,
+  };
+  return pg;
+}
+
+// True if the node has exactly one child and that child is a leaf.
+export function oneDescendant(node: TreeNode): boolean {
+  if (node.children.length !== 1) return false;
+  else {
+    const child = node.children[0];
+    return child.children.length === 0;
+  }
+}
+
+// function findChildren(words: Word[], head: Word): ComputedDependency {
+//   const children = words.filter((w) => w.head === head.id);
+//   return {
+//     word: head,
+//     children: children.map((c) => findChildren(words, c)),
+//   };
+// }
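A minimal usage sketch, not part of the commit: it shows how segmenter and buildTreeFromWords fit together, assuming the sortug Result types are the { ok: T } | { error: string } unions this file returns, that the Stanza sidecar is listening on localhost:8102, and that SORTUG_NLP_API_KEY is set; the sample sentence is invented.

import { segmenter, buildTreeFromWords } from "./src/nlp/stanza";

// Segment a sentence, then label each word as part of the subject
// subtree ("s"), the predicate subtree ("p"), or unattached ("u").
const res = await segmenter("Stony Brook is a university.");
if ("error" in res) {
  console.error(res.error);
} else {
  const sentence = res.ok.segments[0];
  const parsed = buildTreeFromWords(sentence.words);
  if ("ok" in parsed) {
    for (const w of parsed.ok.words) {
      console.log(`${w.component}\t${w.text}`);
    }
  }
}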
