m

author: polwex <polwex@sortug.com> 2025-05-29 12:46:01 +0700
committer: polwex <polwex@sortug.com> 2025-05-29 12:46:01 +0700
commit: 06e8d0a0d636f539f20ece3d9d767190d0a71b3b (patch)
tree: 4564a30b20e6bafc89ecdf0e818b23e4734f3ec1
parent: a3f24ea79b14394b24c4b60a010651eb29eeb872 (diff)
2 files changed, 89 insertions, 54 deletions
diff --git a/src/lib/calls/nlp.ts b/src/lib/calls/nlp.ts
index 28562d0..24e7cf3 100644
--- a/src/lib/calls/nlp.ts
+++ b/src/lib/calls/nlp.ts
@@ -52,3 +52,18 @@ export async function deconstructSyllable(ipa: string): Promise<SyllableRes> {
   const jj = await r2.json();
   return jj;
 }
+
+/** POSTs `{ string: word, lang }` to the local `/spacy` endpoint and resolves with its parsed JSON body. */
+export async function findLemma(word: string, lang: string) {
+  const res = await fetch("http://localhost:8102/spacy", {
+    method: "POST",
+    headers: {
+      "Content-type": "application/json",
+      "X-API-KEY": Bun.env.SORTUG_NLP_API_KEY!,
+    },
+    body: JSON.stringify({ string: word, lang }),
+  });
+  // Reject on HTTP errors so an error payload is never mistaken for an analysis result.
+  if (!res.ok) throw new Error(`findLemma: /spacy responded ${res.status}`);
+  return res.json();
+}
diff --git a/src/lib/db/seed.ts b/src/lib/db/seed.ts
index 7f4352f..0e291c3 100644
--- a/src/lib/db/seed.ts
+++ b/src/lib/db/seed.ts
@@ -4,6 +4,7 @@ import useful from "@/lib/useful_thai.json";
 import db from ".";
 import pdb from "./prosodydb";
 import * as Sorsyl from "sorsyl";
+import { findLemma } from "../calls/nlp";
 
 const SYMBOL_REGEX = new RegExp(/[\W\d]/);
 
@@ -516,6 +517,7 @@ async function redump() {
   //   "tags", "ipa", "audio", "ogg_url", "mp3_url", "enpr", "rhymes", "homophone", "note", "zh-pron", "other",
   //   "text", "hangeul", "topics", "form", "audio-ipa"
   // ]
+  const langs = ["en", "th", "zh", "es", "ja", "vn"];
   for await (const line of readWiktionaryDump()) {
     try {
       count++;
@@ -524,6 +526,7 @@ async function redump() {
       console.log(Object.keys(j), j.word);
       // add language to db
       pdb.addLanguage(j.lang_code, j.lang);
+      if (!langs.includes(j.lang_code)) continue;
       // handleEtim(j);
       // handleDerived(j);
       // handleSenses(j.pos, j.senses);
@@ -551,64 +554,81 @@ type SorSyl = {
   tone: string;
 };
 async function handleWord(j: any) {
-  const wordId = pdb.addWord(j.word, j.lang_code);
   let ts = Date.now();
+  const analyzed = await findLemma(j.word, j.lang_code);
+  if (analyzed.segments.length !== 1)
+    return console.error("wtf bruh", analyzed);
+  const seg = analyzed.segments[0];
+  const isLemma = analyzed.input === seg.lemma;
+  if (!isLemma)
+    return console.error("not lemma", {
+      ...seg,
+      word: j.word,
+      input: analyzed.input,
+    });
+  const wordId = pdb.addWord(j.word, j.lang_code);
 
-  const hwikiRhyme = j.sounds.find((s) => "rhymes" in s);
+  const sounds = j.sounds || [];
+  const hwikiRhyme = sounds.find((s: any) => "rhymes" in s);
   const wikiRhyme = hwikiRhyme ? hwikiRhyme.rhymes : null;
-  for (let snd of j.sounds || []) {
-    if ("ipa" in snd) {
-      const tags = JSON.stringify(snd.tags) || null;
-      const ipa = snd.ipa;
-      try {
-        const hres = await fetch("http://localhost:8104/syls", {
-          method: "POST",
-          headers: { "content-type": "application/json" },
-          body: JSON.stringify({ string: j.word, lang: j.lang_code, ipa }),
-        });
-        const hjon = await hres.json();
-        console.log(Date.now() - ts, "elapsed in http");
-        ts = Date.now();
-        pdb.addPronunciation(
-          "word",
-          wordId,
-          hjon.clean_ipa,
-          hjon.syls.length,
-          tags,
-          null,
-        );
-        const wordRhyme = hjon.syls.reduce((acc: string, item: SorSyl) => {
-          if (!item.stressed && !acc) return acc;
-          if (item.stressed && !acc) return `${acc}${item.rhyme}`;
-          else return `${acc}${item.ipa}`;
-        }, "");
-        if (wordRhyme)
-          pdb.addWordRhyme(wordId, wordRhyme, j.lang_code, wikiRhyme);
-        else console.log("no rhyme?", hjon);
-        for (const syl of hjon.syls) {
-          // TODO ideally syllables would have spelling not IPA... harsh tho
-          pdb.addSyllable(
-            wordId,
-            syl.ipa,
-            j.lang_code,
-            syl.long,
-            syl.onset || null,
-            syl.medial || null,
-            syl.nucleus,
-            syl.coda || null,
-            syl.rhyme,
-            syl.tone || null,
-            null,
-          );
-        }
-        console.log(Date.now() - ts, "elapsed in db");
-        ts = Date.now();
-      } catch (e) {
-        console.error(e);
-        console.error(j);
-        // break;
-      }
+  for (let snd of sounds) {
+    if ("ipa" in snd) handleIpa(wordId, j, snd, wikiRhyme);
+  }
+}
+// Posts one IPA transcription of `j.word` to the local `/syls` endpoint and hands the
+// result to pdb.addPronunciation, pdb.addWordRhyme and pdb.addSyllable for `wordId`.
+// Anything thrown inside the try block is logged together with `snd`, never rethrown.
+async function handleIpa(
+  wordId: number | bigint,
+  j: any,
+  snd: any,
+  wikiRhyme: string | null,
+) {
+  const tags = JSON.stringify(snd.tags) || null;
+  const ipa = snd.ipa;
+  try {
+    const hres = await fetch("http://localhost:8104/syls", {
+      method: "POST",
+      headers: { "content-type": "application/json" },
+      body: JSON.stringify({ string: j.word, lang: j.lang_code, ipa }),
+    });
+    if (!hres.ok) throw new Error(`handleIpa: /syls responded ${hres.status}`);
+    const hjon = await hres.json();
+    pdb.addPronunciation(
+      "word",
+      wordId,
+      hjon.clean_ipa,
+      hjon.syls.length,
+      tags,
+      null,
+    );
+    // Unstressed syllables are dropped until a stressed one seeds acc with its rhyme; later ones append ipa.
+    const wordRhyme = hjon.syls.reduce((acc: string, item: SorSyl) => {
+      if (!item.stressed && !acc) return acc;
+      if (item.stressed && !acc) return `${acc}${item.rhyme}`;
+      else return `${acc}${item.ipa}`;
+    }, "");
+    if (wordRhyme) pdb.addWordRhyme(wordId, wordRhyme, j.lang_code, wikiRhyme);
+    else console.log("no rhyme?", hjon);
+    for (const syl of hjon.syls) {
+      // TODO ideally syllables would have spelling not IPA... harsh tho
+      pdb.addSyllable(
+        wordId,
+        syl.ipa,
+        j.lang_code,
+        syl.long,
+        syl.onset || null,
+        syl.medial || null,
+        syl.nucleus,
+        syl.coda || null,
+        syl.rhyme,
+        syl.tone || null,
+        null,
+      );
     }
+  } catch (e) {
+    console.error(e);
+    console.error({ snd });
   }
 }
 async function handleIdiom(j: any) {
author	polwex <polwex@sortug.com>	2025-05-29 12:46:01 +0700
committer	polwex <polwex@sortug.com>	2025-05-29 12:46:01 +0700
commit	06e8d0a0d636f539f20ece3d9d767190d0a71b3b (patch)
tree	4564a30b20e6bafc89ecdf0e818b23e4734f3ec1
parent	a3f24ea79b14394b24c4b60a010651eb29eeb872 (diff)