Diffstat (limited to 'src/nlp/stanza.ts')
-rw-r--r--	src/nlp/stanza.ts	210
1 file changed, 210 insertions, 0 deletions
diff --git a/src/nlp/stanza.ts b/src/nlp/stanza.ts
new file mode 100644
index 0000000..5836b91
--- /dev/null
+++ b/src/nlp/stanza.ts
@@ -0,0 +1,210 @@
+import type { AsyncRes, Result } from "sortug";
+import { detectLang } from "./iso";
+
+const ENDPOINT = "http://localhost:8102";
+
+// POST the text to the local Stanza sidecar; if no language is given,
+// detect it first with detectLang.
+export async function segmenter(
+  text: string,
+  langg?: string,
+): AsyncRes<StanzaRes> {
+  try {
+    const lang = langg ? langg : detectLang(text);
+    const body = JSON.stringify({ lang, string: text });
+    const opts = {
+      headers: {
+        "Content-Type": "application/json",
+        "X-API-KEY": Bun.env.SORTUG_NLP_API_KEY!,
+      },
+      method: "POST",
+      body,
+    };
+    const res = await fetch(ENDPOINT + "/stanza", opts);
+    const j = await res.json();
+    return { ok: j };
+  } catch (e) {
+    return { error: `${e}` };
+  }
+}
+
+// Ask the sidecar to identify the language of the given text.
+export async function idLang(text: string) {
+  try {
+    const body = JSON.stringify({ string: text });
+    const opts = {
+      headers: {
+        "Content-Type": "application/json",
+        "X-API-KEY": Bun.env.SORTUG_NLP_API_KEY!,
+      },
+      method: "POST",
+      body,
+    };
+    const res = await fetch(ENDPOINT + "/detect-lang", opts);
+    const j = await res.json();
+    return { ok: j };
+  } catch (e) {
+    return { error: `${e}` };
+  }
+}
+
+export type StanzaRes = { input: string; segments: Sentence[] };
+export type Sentence = {
+  text: string;
+  sentiment: number;
+  constituency: TreeNode;
+  constring: string;
+  dependencies: Dependency[];
+  entities: Entity[];
+  tokens: Token[];
+  words: Word[];
+};
+export type TreeNode = {
+  label: string;
+  children: TreeNode[];
+};
+export type Dependency = Array<[Word, string, Word]>;
+export type Word = {
+  id: number;
+  text: string;
+  lemma: string;
+  upos: string;
+  xpos: string;
+  feats: string;
+  head: number;
+  deprel: string;
+  start_char: number;
+  end_char: number;
+};
+export type Token = {
+  id: [number, number];
+  text: string;
+  misc: string;
+  words: Word[];
+  start_char: number;
+  end_char: number;
+  ner: string;
+};
+export type Entity = {
+  text: string;
+  misc: string;
+  start_char: number;
+  end_char: number;
+  type: string;
+};
+
+// mine: custom types built on top of the Stanza response
+export type Clause = {
+  words: Word[];
+  dependency: Dependency;
+  text: string;
+};
+// Sample of the shape Stanza returns for a word, e.g. "Stony" in an
+// "amod" relation:
+// {
+//   "id": 1,
+//   "text": "Stony",
+//   "lemma": "Stony",
+//   "upos": "ADJ",
+//   "xpos": "NNP",
+//   "feats": "Degree=Pos",
+//   "head": 3,
+//   "deprel": "amod",
+//   "start_char": 0,
+//   "end_char": 5
+// }
+
+export interface ParsedGrammar {
+  predicateCore: number;
+  subjectCore: number | null;
+  tree: Record<number, number[]>;
+  wordMap: WordMap;
+  words: BigWord[];
+}
+export interface BigWord extends Word {
+  ancestry: number[];
+  component: "s" | "p" | "u";
+}
+export type ComputedDependency = {
+  word: BigWord;
+  children: ComputedDependency[];
+};
+export type WordMap = Record<number, Word>;
+
+export function buildTreeFromWords(words: Word[]): Result<ParsedGrammar> {
+  const roots = words.filter((w) => w.deprel === "root");
+  if (roots.length > 1) {
+    console.log("roots", roots);
+    return { error: "too many roots" };
+  } else if (roots.length === 0) {
+    return { error: "no roots" };
+  } else {
+    const root = roots[0];
+    const wordmap = words.reduce((acc: WordMap, item) => {
+      acc[item.id] = item;
+      return acc;
+    }, {});
+    return { ok: parseFurther(words, wordmap, root) };
+  }
+}
+
+function parseFurther(
+  words: Word[],
+  wordMap: WordMap,
+  root: Word,
+): ParsedGrammar {
+  const predicateCore = root.id;
+  // Find the subject head up front, so words that appear before the
+  // nsubj word in the sentence are still classified correctly.
+  let subjectCore: number | null = null;
+  for (const w of words) {
+    if (w.deprel === "nsubj") {
+      subjectCore = w.id;
+      break;
+    }
+  }
+  const tree: Record<number, number[]> = {};
+  const bigwords: BigWord[] = [];
+  // Walk from a word's parent up to the root, registering each link in
+  // the child table on the way. The same ancestor chain is walked once
+  // per descendant, so duplicate child entries are skipped.
+  const getAncestry = (parent: Word): number[] => {
+    const kids = tree[parent.head] || [];
+    if (!kids.includes(parent.id)) tree[parent.head] = [...kids, parent.id];
+    const grandpa = wordMap[parent.head];
+    if (!grandpa) return [parent.id];
+    else return [parent.id, ...getAncestry(grandpa)];
+  };
+  for (let i = 0; i < words.length; i++) {
+    const w = words[i];
+    // Fold punctuation into the preceding word rather than treating it
+    // as a node of its own.
+    if (w.deprel === "punct") {
+      const prev = words[i - 1];
+      if (!prev) continue;
+      prev.text += w.text;
+      continue;
+    }
+    const parent = wordMap[w.head];
+    if (!parent) tree[w.id] = [];
+    const ancestry = !parent ? [] : getAncestry(parent);
+    const component =
+      subjectCore && (w.id === subjectCore || ancestry.includes(subjectCore))
+        ? "s"
+        : w.id === predicateCore || ancestry.includes(root.id)
+          ? "p"
+          : "u";
+    const bw: BigWord = { ...w, component, ancestry };
+    wordMap[w.id] = bw;
+    bigwords.push(bw);
+  }
+  const pg: ParsedGrammar = {
+    predicateCore,
+    subjectCore,
+    wordMap,
+    tree,
+    words: bigwords,
+  };
+  return pg;
+}
+
+// True if the node has exactly one child and that child is a leaf.
+export function oneDescendant(node: TreeNode): boolean {
+  if (node.children.length !== 1) return false;
+  else {
+    const child = node.children[0];
+    return child.children.length === 0;
+  }
+}
+
+// function findChildren(words: Word[], head: Word): ComputedDependency {
+//   const children = words.filter((w) => w.head === head.id);
+//   return {
+//     word: head,
+//     children: children.map((c) => findChildren(words, c)),
+//   };
+// }
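A minimal usage sketch, not part of the commit: it shows how segmenter and buildTreeFromWords fit together, assuming the sortug Result types are the { ok: T } | { error: string } unions this file returns, that the Stanza sidecar is listening on localhost:8102, and that SORTUG_NLP_API_KEY is set; the sample sentence is invented.

import { segmenter, buildTreeFromWords } from "./src/nlp/stanza";

// Segment a sentence, then label each word as part of the subject
// subtree ("s"), the predicate subtree ("p"), or unattached ("u").
const res = await segmenter("Stony Brook is a university.");
if ("error" in res) {
  console.error(res.error);
} else {
  const sentence = res.ok.segments[0];
  const parsed = buildTreeFromWords(sentence.words);
  if ("ok" in parsed) {
    for (const w of parsed.ok.words) {
      console.log(`${w.component}\t${w.text}`);
    }
  }
}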
