m

author: polwex <polwex@sortug.com> 2025-06-03 19:40:34 +0700
committer: polwex <polwex@sortug.com> 2025-06-03 19:40:34 +0700
commit: b91b758041cbc7b8bf7e2a4aee8d6228a75d8105 (patch)
tree: 4fa343ed394034b16841ecfcb6411b1574d24b25
parent: 175ddca375cef765cec8ca5bbc527a205c40bf25 (diff)
3 files changed, 137 insertions, 82 deletions
diff --git a/src/lib/calls/nlp.ts b/src/lib/calls/nlp.ts
index 1e84e93..2810744 100644
--- a/src/lib/calls/nlp.ts
+++ b/src/lib/calls/nlp.ts
@@ -176,3 +176,17 @@ export async function findLemma(word: string, lang: string) {
   const jj = await r2.json();
   return jj;
 }
+export async function charsiuG2P(word: string, lang: string) {
+  const opts = {
+    method: "POST",
+    headers: {
+      "Content-type": "application/json",
+      "X-API-KEY": Bun.env.SORTUG_NLP_API_KEY!,
+    },
+    body: JSON.stringify({ string: word, lang }),
+  };
+  // const r1 = await fetch(`http://localhost:8000/segmentate`, opts);
+  const r2 = await fetch("http://localhost:8105" + `/ipa`, opts);
+  const jj = await r2.json();
+  return jj;
+}
diff --git a/src/lib/db/enseed.ts b/src/lib/db/enseed.ts
index 39dec44..9ef61ed 100644
--- a/src/lib/db/enseed.ts
+++ b/src/lib/db/enseed.ts
@@ -1,24 +1,15 @@
 import Database from "bun:sqlite";
-import {
-  analyzeTHWord,
-  deconstructSyllable,
-  segmentateThai,
-  type SorSyl,
-  type ThaiNLPRes,
-  sorSyl,
-  getThaiFreq,
-  SorBSyl,
-} from "../calls/nlp";
+import { sorSyl, SorBSyl, charsiuG2P, SorSylRes } from "../calls/nlp";
 import pdb from "./prosodydb";
 import { cleanIpa } from "../utils";
 import { handleFile } from "./utils";
-import { Tone } from "../types/phonetics";
+import { Phoneme, Tone } from "../types/phonetics";
 import { AsyncRes } from "../types";
 
 const errors: string[] = [];
 async function readDump(lang: string) {
   await pdb.init();
-  pdb.addLanguage("th", "thai");
+  pdb.addLanguage("en", "english");
   let count = 0;
   const langdb = new Database(
     `/home/y/code/prosody/resources/wiktionary/${lang}.db`,
@@ -37,8 +28,8 @@ async function readDump(lang: string) {
     const split = word.split(" ");
     const res =
       split.length > 1
-        ? await handleIdiom(lang, word)
-        : await handleWord(lang, word, j, freqMap);
+        ? await handleIdiom(word, lang)
+        : await handleWord(word, lang, j, freqMap);
     if ("error" in res) {
       console.error(res.error);
       break;
@@ -48,50 +39,69 @@ async function readDump(lang: string) {
 }
 
 async function handleWord(
-  lang: string,
   word: string,
+  lang: string,
   j: any,
   freqMap: Map<string, number>,
 ): AsyncRes<string> {
-  // TODO add categories but add a tag to see what classifying scheme we're using
-  //
+  const frequency = freqMap.get(word) || null;
+  const promises = await getIpa(word, lang, j);
+  const phonetics = await Promise.all(promises);
+
+  // pdb.superAdd({ word, lang, frequency, wordNotes: null, phonetics });
+  return { ok: "" };
+}
+
+type IPAData = {
+  ipa: string;
+  syllable_count: number;
+  syllable_sequence: string;
+  tone_sequence: string;
+  ipa_sequence: string;
+  tags: string | null;
+  notes: string | null;
+  wordRhyme: string | null;
+  syllables: SylData[];
+};
+async function getIpa(
+  word: string,
+  lang: string,
+  j: any,
+): Promise<Promise<IPAData>[]> {
   const sounds = j.sounds || [];
   const hasIpa = sounds.find((s: any) => "ipa" in s);
-  const hwikiRhyme = sounds.find((s: any) => "rhymes" in s);
-  const wikiRhyme = hwikiRhyme ? hwikiRhyme.rhymes : null;
   if (!hasIpa) {
-    // console.error("no ipa!!", word);
+    console.log("no ipa", word);
     // console.dir(j, { depth: null });
-    return { error: "meh no ipa" };
+    console.dir(sounds, { depth: null });
+    // TODO fetch from idk charsiu
+    // const ipa = await charsiuG2P(word, lang);
+    // console.log("charsiu", ipa);
   }
-  const freq = freqMap.get(word) || null;
-  // const wordId = pdb.addWord(word, lang, freq, null);
-  // WIPE
-  const wordId = 0;
-  // console.log(analyzed);
-  for (let snd of sounds)
-    if ("ipa" in snd) {
-      const res = await handleIpa(wordId, word, lang, j, snd, wikiRhyme);
-      if ("error" in res) return res;
-    }
-  return { ok: "" };
+  const hwikiRhyme = sounds.find((s: any) => "rhymes" in s);
+  const wikiRhyme = hwikiRhyme ? hwikiRhyme.rhymes : null;
+  const ipaData: Promise<IPAData>[] = sounds.reduce(
+    (acc: Promise<IPAData>[], snd: any) => {
+      if ("ipa" in snd) {
+        const data = getIpaData(word, lang, snd, wikiRhyme);
+        return [...acc, data];
+      } else return acc;
+    },
+    [],
+  );
+  return ipaData;
 }
-async function handleIpa(
-  wordId: number | bigint,
+
+async function getIpaData(
   word: string,
   lang: string,
-  j: any,
   snd: any,
   wikiRhyme: string | null,
-) {
+): Promise<IPAData> {
+  console.log("geting ipa...");
   const tags = JSON.stringify(snd.tags) || null;
-  const ipa = snd.ipa;
+  const ipa = cleanIpa(snd.ipa);
   const syls = await sorSyl(word, lang, ipa);
-  // console.log(syls, "sorsyl");
-
-  console.log(word);
-  console.log(ipa);
-  pdb.addPronunciation(wordId, ipa, syls.syls.length, tags, null);
   // set word rhyme
   const wordRhyme = syls.syls.reduce((acc: string, itemm: SorBSyl) => {
     const item = itemm.ipa;
@@ -99,47 +109,76 @@ async function handleIpa(
     if (item.stressed && !acc) return `${acc}${item.rhyme}`;
     else return `${acc}${item.all}`;
   }, "");
-  if (wordRhyme) pdb.addWordRhyme(wordId, wordRhyme, j.lang_code, wikiRhyme);
+  console.log({ word, wikiRhyme, wordRhyme });
 
-  for (let i = 0; i < syls.syls.length; i++) {
-    const syl = syls.syls[i]!;
-    const res = await handleSyllable(syl, wordId, i);
-    if ("error" in res) return res;
-  }
-  return { ok: "" };
+  const tone_sequence = "";
+  const seqs = syls.syls.reduce(
+    (acc, item, idx) => {
+      const startString = idx === 0 ? "" : ",";
+      const { ipa, spelling } = item;
+      acc.ipa += `${startString}${ipa.all}`;
+      acc.syls += `${startString}${spelling.all}`;
+      return acc;
+    },
+    { syls: "", ipa: "" },
+  );
+  const syllable_sequence = seqs.syls;
+  const ipa_sequence = seqs.ipa;
+  const syllables = getSyllables(syls);
+  return {
+    ipa,
+    syllable_count: syls.syls.length,
+    syllable_sequence,
+    tone_sequence,
+    ipa_sequence,
+    tags,
+    notes: null,
+    wordRhyme: null,
+    syllables,
+  };
 }
-async function handleSyllable(
-  syl: SorBSyl,
-  wordId: number | bigint,
-  idx: number,
-): AsyncRes<string> {
-  try {
-    pdb.addSyllable(
-      wordId,
-      idx + 1,
-      syl.ipa.stressed,
-      "th",
-      syl.ipa.all,
-      syl.ipa.long,
-      syl.spelling.all,
-      { spelling: syl.spelling.onset, ipa: syl.ipa.onset },
-      { spelling: syl.spelling.medial, ipa: syl.ipa.medial },
-      { spelling: syl.spelling.nucleus, ipa: syl.ipa.nucleus },
-      { spelling: syl.spelling.coda, ipa: syl.ipa.coda },
-      { spelling: syl.spelling.rhyme, ipa: syl.ipa.rhyme },
-      { letters: "", numbers: 0, name: "" },
-      null,
-    );
-    return { ok: "" };
-  } catch (e) {
-    // console.log("well fuck", syl);
-    // console.error(e);
-    return { error: `${e}` };
+type SylData = {
+  idx: number;
+  stressed: boolean | null;
+  spelling: string;
+  ipa: string;
+  long: boolean;
+  onset: Phoneme;
+  medial: Phoneme;
+  nucleus: Phoneme;
+  coda: Phoneme;
+  rhyme: Phoneme;
+  tone: Tone;
+  notes: string | null;
+};
+function getSyllables(syl: SorSylRes): SylData[] {
+  let syls: SylData[] = [];
+  for (let i = 0; i < syl.syls.length; i++) {
+    const syllable = syl.syls[i]!;
+    const res = getSyllable(syllable, i);
+    syls.push(res);
   }
+  return syls;
+}
+function getSyllable(syl: SorBSyl, idx: number): SylData {
+  return {
+    idx: idx + 1,
+    stressed: null,
+    spelling: syl.spelling.all,
+    ipa: syl.ipa.all,
+    long: syl.ipa.long,
+    onset: { spelling: syl.spelling.onset, ipa: syl.ipa.onset },
+    medial: { spelling: syl.spelling.medial, ipa: syl.ipa.medial },
+    nucleus: { spelling: syl.spelling.nucleus, ipa: syl.ipa.nucleus },
+    coda: { spelling: syl.spelling.coda, ipa: syl.ipa.coda },
+    rhyme: { spelling: syl.spelling.rhyme, ipa: syl.ipa.rhyme },
+    tone: { name: "", letters: "", numbers: 0 },
+    notes: null,
+  };
 }
-async function handleIdiom(lang: string, idiom: string): AsyncRes<string> {
+async function handleIdiom(idiom: string, lang: string): AsyncRes<string> {
   try {
-    pdb.addIdiom(idiom, lang);
+    // pdb.addIdiom(idiom, lang);
     // TODO later set idiom_words once all words are populated
     // console.log();
     return { ok: "" };
diff --git a/src/lib/db/prosodydb.ts b/src/lib/db/prosodydb.ts
index 7c067d2..26687a2 100644
--- a/src/lib/db/prosodydb.ts
+++ b/src/lib/db/prosodydb.ts
@@ -8,10 +8,13 @@ class DatabaseHandler {
   db: Database;
   constructor() {
     // const dbPath = "/home/y/code/bun/ssr/waku/bulkdata/phon.db";
-    const dbPath = "/home/y/code/bun/ssr/waku/bulkdata/thaiphon.db";
+    const dbPath = "/home/y/code/bun/ssr/waku/bulkdata/enphon.db";
     const db = new Database(dbPath, { create: true });
     db.exec("PRAGMA journal_mode = WAL"); // Enable Write-Ahead Logging for better performance
     db.exec("PRAGMA foreign_keys = ON");
+    db.exec("PRAGMA cache_size = -8000"); // Increase cache size to 8MB
+    db.exec("PRAGMA temp_store = MEMORY"); // Store temp tables in memory
+    db.exec("PRAGMA synchronous = NORMAL"); // Slightly less safe but faster
     this.db = db;
   }
   async init() {
@@ -62,9 +65,8 @@ class DatabaseHandler {
        FROM words w
        JOIN word_phonetics wp ON wp.word_id = w.id
        JOIN syllables_words sw ON sw.word_id = w.id
-       WHERE w.frequency IS NOT NULL
-       AND w.lang = ?
-       ORDER BY w.frequency ASC
+       WHERE w.lang = ?
+       ORDER BY w.frequency ASC NULLS LAST
        LIMIT 300
        `,
     );
author	polwex <polwex@sortug.com>	2025-06-03 19:40:34 +0700
committer	polwex <polwex@sortug.com>	2025-06-03 19:40:34 +0700
commit	b91b758041cbc7b8bf7e2a4aee8d6228a75d8105 (patch)
tree	4fa343ed394034b16841ecfcb6411b1574d24b25
parent	175ddca375cef765cec8ca5bbc527a205c40bf25 (diff)