Diffstat (limited to 'src/nlp/stanza.ts')
-rw-r--r--  src/nlp/stanza.ts  210
1 file changed, 210 insertions, 0 deletions
diff --git a/src/nlp/stanza.ts b/src/nlp/stanza.ts
new file mode 100644
index 0000000..5836b91
--- /dev/null
+++ b/src/nlp/stanza.ts
@@ -0,0 +1,210 @@
+import type { AsyncRes, Result } from "sortug";
+import { detectLang } from "./iso";
+
+// Local sortug NLP service; /stanza wraps a Stanza pipeline.
+const ENDPOINT = "http://localhost:8102";
+export async function segmenter(
+  text: string,
+  lang?: string,
+): AsyncRes<StanzaRes> {
+  try {
+    const body = JSON.stringify({
+      lang: lang ?? detectLang(text),
+      string: text,
+    });
+    const opts = {
+      headers: {
+        "Content-Type": "application/json",
+        "X-API-KEY": Bun.env.SORTUG_NLP_API_KEY!,
+      },
+      method: "POST",
+      body,
+    };
+    const res = await fetch(ENDPOINT + "/stanza", opts);
+    if (!res.ok) return { error: `stanza: ${res.status} ${res.statusText}` };
+    return { ok: (await res.json()) as StanzaRes };
+  } catch (e) {
+    return { error: `${e}` };
+  }
+}
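+// Usage sketch (assumes sortug's Result shape is `{ ok: T } | { error: string }`,
+// matching the returns above, and that the server is up on ENDPOINT):
+//   const res = await segmenter("Stony Brook is in New York.", "en");
+//   if ("ok" in res) {
+//     for (const s of res.ok.segments) console.log(s.text, s.sentiment);
+//   }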
+// The /detect-lang response shape isn't pinned down here, hence `unknown`.
+export async function idLang(text: string): AsyncRes<unknown> {
+  try {
+    const body = JSON.stringify({ string: text });
+    const opts = {
+      headers: {
+        "Content-Type": "application/json",
+        "X-API-KEY": Bun.env.SORTUG_NLP_API_KEY!,
+      },
+      method: "POST",
+      body,
+    };
+    const res = await fetch(ENDPOINT + "/detect-lang", opts);
+    if (!res.ok)
+      return { error: `detect-lang: ${res.status} ${res.statusText}` };
+    return { ok: await res.json() };
+  } catch (e) {
+    return { error: `${e}` };
+  }
+}
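+// Usage sketch:
+//   const detected = await idLang("Das ist ein Satz.");
+//   if ("ok" in detected) console.log(detected.ok);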
+// Shapes returned by the Stanza service.
+export type StanzaRes = { input: string; segments: Sentence[] };
+export type Sentence = {
+  text: string;
+  sentiment: number; // 0 negative, 1 neutral, 2 positive in Stanza's English model
+  constituency: TreeNode;
+  constring: string; // the constituency tree rendered as a bracketed string
+  dependencies: Dependency[];
+  entities: Entity[];
+  tokens: Token[];
+  words: Word[];
+};
+export type TreeNode = {
+ label: string;
+ children: TreeNode[];
+};
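+// e.g. a fragment for "the dog", with leaves carrying the tokens themselves:
+//   { label: "NP", children: [
+//     { label: "DT", children: [{ label: "the", children: [] }] },
+//     { label: "NN", children: [{ label: "dog", children: [] }] } ] }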
+// One (head word, relation, dependent word) triple.
+export type Dependency = [Word, string, Word];
+export type Word = {
+  id: number; // 1-based index within the sentence
+  text: string;
+  lemma: string;
+  upos: string; // universal POS tag
+  xpos: string; // treebank-specific POS tag
+  feats: string; // morphological features, e.g. "Degree=Pos"
+  head: number; // id of the governing word; 0 means this is the root
+  deprel: string; // universal dependency relation, e.g. "nsubj"
+  start_char: number;
+  end_char: number;
+};
+export type Token = {
+  id: [number, number]; // ids of the first and last word the token covers
+  text: string;
+  misc: string;
+  words: Word[]; // usually one word; several for multi-word tokens
+  start_char: number;
+  end_char: number;
+  ner: string; // BIOES-style tag such as "S-PERSON", or "O"
+};
+export type Entity = {
+ text: string;
+ misc: string;
+ start_char: number;
+ end_char: number;
+ type: string;
+};
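+// A hypothetical entity for the text "Stony Brook" (the label set depends on
+// the model; English models use OntoNotes types such as GPE or ORG):
+//   { text: "Stony Brook", misc: "", start_char: 0, end_char: 11, type: "GPE" }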
+
+// Types from here down are this module's own, not part of the Stanza response.
+export type Clause = {
+  words: Word[];
+  dependency: Dependency[];
+  text: string;
+};
+// "amod",
+// {
+// "id": 1,
+// "text": "Stony",
+// "lemma": "Stony",
+// "upos": "ADJ",
+// "xpos": "NNP",
+// "feats": "Degree=Pos",
+// "head": 3,
+// "deprel": "amod",
+// "start_char": 0,
+// "end_char": 5
+// }
+//
+//
+
+export interface ParsedGrammar {
+  predicateCore: number; // id of the root (predicate head) word
+  subjectCore: number | null; // id of the subject (nsubj) word, if one was found
+  tree: Record<number, number[]>; // head id -> child ids; the root sits under key 0
+  wordMap: WordMap;
+  words: BigWord[];
+}
+export interface BigWord extends Word {
+  ancestry: number[]; // ids from this word's head up to the sentence root
+  component: "s" | "p" | "u"; // subject, predicate, or unattached
+}
+export type ComputedDependency = {
+ word: BigWord;
+ children: ComputedDependency[];
+};
+export type WordMap = Record<number, Word>;
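+// Worked example of what buildTreeFromWords (below) produces, assuming a
+// standard UD parse of "Dogs bark." where "Dogs" is id 1 (nsubj, head 2)
+// and "bark" is id 2 (root, head 0):
+//   predicateCore = 2, subjectCore = 1, tree = { 0: [2], 2: [1] },
+//   components: "Dogs" -> "s", "bark." -> "p" (the period folds into "bark").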
+
+// Build a ParsedGrammar from the words of a single sentence; exactly one
+// word with deprel "root" is expected.
+export function buildTreeFromWords(words: Word[]): Result<ParsedGrammar> {
+  const roots = words.filter((w) => w.deprel === "root");
+  if (roots.length > 1) return { error: "too many roots" };
+  if (roots.length === 0) return { error: "no roots" };
+  const root = roots[0];
+  const wordmap = words.reduce((acc: WordMap, item) => {
+    acc[item.id] = item;
+    return acc;
+  }, {});
+  return { ok: parseFurther(words, wordmap, root) };
+}
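+// Usage sketch, parsing the first sentence of a segmenter response:
+//   const seg = await segmenter("The old dog barked.", "en");
+//   if ("ok" in seg) {
+//     const parsed = buildTreeFromWords(seg.ok.segments[0].words);
+//     if ("ok" in parsed) console.log(parsed.ok.tree, parsed.ok.subjectCore);
+//   }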
+function parseFurther(
+  words: Word[],
+  wordMap: WordMap,
+  root: Word,
+): ParsedGrammar {
+  const predicateCore = root.id;
+  let subjectCore: number | null = null;
+  const tree: Record<number, number[]> = {};
+  // head id -> child ids; each edge is recorded exactly once.
+  const addEdge = (head: number, child: number) => {
+    const kids = tree[head] || [];
+    if (!kids.includes(child)) tree[head] = [...kids, child];
+  };
+  // Ids from a word's head up to the root of the sentence.
+  const getAncestry = (parent: Word): number[] => {
+    const grandpa = wordMap[parent.head];
+    return grandpa ? [parent.id, ...getAncestry(grandpa)] : [parent.id];
+  };
+  // First pass: fold punctuation into the preceding word, find the subject
+  // head, and collect each word's ancestry.
+  const staged: Array<{ word: Word; ancestry: number[] }> = [];
+  let prev: Word | null = null;
+  for (const w of words) {
+    if (w.deprel === "punct") {
+      if (prev) prev.text += w.text;
+      continue;
+    }
+    // Take the first nsubj in word order as the subject head.
+    if (w.deprel === "nsubj" && subjectCore === null) subjectCore = w.id;
+    addEdge(w.head, w.id);
+    const parent = wordMap[w.head];
+    staged.push({ word: w, ancestry: parent ? getAncestry(parent) : [] });
+    prev = w;
+  }
+  // Second pass: with subjectCore known, tag each word as part of the
+  // subject ("s"), the predicate ("p"), or neither ("u").
+  const bigwords: BigWord[] = [];
+  for (const { word: w, ancestry } of staged) {
+    const component =
+      subjectCore !== null &&
+      (w.id === subjectCore || ancestry.includes(subjectCore))
+        ? "s"
+        : w.id === predicateCore || ancestry.includes(root.id)
+          ? "p"
+          : "u";
+    const bw: BigWord = { ...w, component, ancestry };
+    wordMap[w.id] = bw;
+    bigwords.push(bw);
+  }
+  return { predicateCore, subjectCore, wordMap, tree, words: bigwords };
+}
+
+// True when `node` has exactly one child and that child is a leaf,
+// i.e. the node is a preterminal such as (NN dog).
+export function oneDescendant(node: TreeNode): boolean {
+  if (node.children.length !== 1) return false;
+  const child = node.children[0];
+  return child.children.length === 0;
+}
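+// e.g. a preterminal has one leaf child, so:
+//   oneDescendant({ label: "NN", children: [{ label: "dog", children: [] }] }) === true
+//   oneDescendant({ label: "dog", children: [] }) === false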
+
+// Sketch (unused): materialize the dependency tree top-down from the word map.
+// function findChildren(wordMap: WordMap, word: BigWord): ComputedDependency {
+//   const children = Object.values(wordMap).filter((w) => w.head === word.id);
+//   return {
+//     word,
+//     children: children.map((c) => findChildren(wordMap, c as BigWord)),
+//   };
+// }