Got Thai working, but this is a bit too specific, I think.

author: polwex <polwex@sortug.com> 2025-06-03 01:36:36 +0700
committer: polwex <polwex@sortug.com> 2025-06-03 01:36:36 +0700
commit: 2b80f7950df34f2a160135d7e20220a9b2ec3352 (patch)
tree: 0e2aec09b9aba887419e46c4d2fcaf861391eedc
parent: 249230c8e0e1bdb8ae4f433262997b84ee274904 (diff)
6 files changed, 330 insertions, 55 deletions
diff --git a/src/lib/calls/nlp.ts b/src/lib/calls/nlp.ts
index 3cff415..f3364ac 100644
--- a/src/lib/calls/nlp.ts
+++ b/src/lib/calls/nlp.ts
@@ -3,6 +3,7 @@ import { SyllableRes } from "../types/cards";
 export type ThaiNLPRes = {
   word: string;
   normalized: string;
+  realSyls: string[];
   syllables: string[];
   syllablesIpa: string[];
   ipa: string;
diff --git a/src/lib/calls/thainlp.ts b/src/lib/calls/thainlp.ts
new file mode 100644
index 0000000..662e984
--- /dev/null
+++ b/src/lib/calls/thainlp.ts
@@ -0,0 +1,106 @@
+import { SyllableRes } from "../types/cards";
+
+/**
+ * One analysis record as returned by the Thai NLP HTTP service.
+ *
+ * NOTE(review): a type with the same name is also exported from `./nlp`;
+ * the two declarations have to be kept in sync by hand.
+ */
+export type ThaiNLPRes = {
+  // presumably the word exactly as it was submitted — TODO confirm
+  word: string;
+  normalized: string;
+  // Syllables as pronounced (the Thai seeder reads this as `pronouncedSyls`).
+  realSyls: string[];
+  // Syllables as written (the Thai seeder reads this as `writtenSyls`).
+  syllables: string[];
+  // presumably one IPA string per syllable — TODO confirm
+  syllablesIpa: string[];
+  ipa: string;
+  // presumably a part-of-speech tag — TODO confirm
+  pos: string;
+};
+
+/**
+ * Requests the whole-word analysis and the segmentation of `word` at the same
+ * time and merges them into one list.
+ *
+ * @param word text sent to both service endpoints
+ * @returns the whole-word analysis first, then the segmentation results in
+ *   the order the service returned them
+ */
+export async function thaiData(word: string): Promise<ThaiNLPRes[]> {
+  // Both requests are started before either is awaited.
+  const wholeWord = analyzeTHWord(word);
+  const segmented = segmentateThai(word);
+  const [whole, segments] = await Promise.all([wholeWord, segmented]);
+  return [whole, ...segments];
+}
+
+/** Base URL of the locally running Thai NLP service. */
+const THAI_NLP_URL = "http://localhost:8001";
+
+/**
+ * POSTs `payload` as JSON to `path` on the Thai NLP service and returns the
+ * decoded JSON body.
+ *
+ * The body is not validated: `T` is the caller's assertion about what the
+ * endpoint returns.
+ *
+ * @param path endpoint path, starting with "/"
+ * @param payload object serialised as the request body
+ * @throws Error when the service answers with a non-2xx status, so an error
+ *   payload is never handed on as if it were data
+ */
+async function postThaiNLP<T>(
+  path: string,
+  payload: Record<string, unknown>,
+): Promise<T> {
+  const res = await fetch(THAI_NLP_URL + path, {
+    method: "POST",
+    headers: { "Content-type": "application/json" },
+    body: JSON.stringify(payload),
+  });
+  if (!res.ok) {
+    throw new Error(
+      `Thai NLP request ${path} failed: ${res.status} ${res.statusText}`,
+    );
+  }
+  return (await res.json()) as T;
+}
+
+/**
+ * Sends `word` to the `/analyze` endpoint.
+ *
+ * @param word word to analyse
+ * @returns the analysis record for the word
+ */
+export async function analyzeTHWord(word: string): Promise<ThaiNLPRes> {
+  return postThaiNLP<ThaiNLPRes>("/analyze", { word });
+}
+
+/**
+ * Sends `sentence` to the `/segmentate` endpoint.
+ *
+ * @param sentence text to segment; posted under the `word` key, which is what
+ *   the endpoint is called with
+ * @returns one analysis record per segment
+ */
+export async function segmentateThai(sentence: string): Promise<ThaiNLPRes[]> {
+  return postThaiNLP<ThaiNLPRes[]>("/segmentate", { word: sentence });
+}
+
+/**
+ * Looks up `word` on the `/freq` endpoint.
+ *
+ * @param word word to look up
+ * @returns the number reported by the service
+ */
+export async function getThaiFreq(word: string): Promise<number> {
+  return postThaiNLP<number>("/freq", { word });
+}
+
+/**
+ * Queries the `/next` endpoint for `word`.
+ *
+ * @param word context word
+ * @returns the strings reported by the service — presumably likely following
+ *   words; TODO confirm against the service
+ */
+export async function getThaiNext(word: string): Promise<string[]> {
+  return postThaiNLP<string[]>("/next", { word });
+}
+
+/**
+ * Queries the `/prev` endpoint for `word`.
+ *
+ * @param word context word
+ * @returns the strings reported by the service — presumably likely preceding
+ *   words; TODO confirm against the service
+ */
+export async function getThaiPrev(word: string): Promise<string[]> {
+  return postThaiNLP<string[]>("/prev", { word });
+}
+
+/**
+ * Queries the `/next_bi` endpoint with a two-word context.
+ *
+ * @param word1 first context word
+ * @param word2 second context word
+ * @returns the strings reported by the service
+ */
+export async function getThaiNext_bi(
+  word1: string,
+  word2: string,
+): Promise<string[]> {
+  return postThaiNLP<string[]>("/next_bi", { word1, word2 });
+}
+
+/**
+ * Queries the `/prev_bi` endpoint with a two-word context.
+ *
+ * @param word1 first context word
+ * @param word2 second context word
+ * @returns the strings reported by the service
+ */
+export async function getThaiPrev_bi(
+  word1: string,
+  word2: string,
+): Promise<string[]> {
+  return postThaiNLP<string[]>("/prev_bi", { word1, word2 });
+}
diff --git a/src/lib/db/enseed.ts b/src/lib/db/enseed.ts
new file mode 100644
index 0000000..58f5876
--- /dev/null
+++ b/src/lib/db/enseed.ts
@@ -0,0 +1,151 @@
+import Database from "bun:sqlite";
+import {
+  analyzeTHWord,
+  deconstructSyllable,
+  segmentateThai,
+  type SorSyl,
+  type ThaiNLPRes,
+  sorSyl,
+  getThaiFreq,
+} from "../calls/nlp";
+import pdb from "./prosodydb";
+import { cleanIpa } from "../utils";
+import { handleFile } from "./utils";
+import { Tone } from "../types/phonetics";
+
+/**
+ * Walks the rows of the `<lang>` Wiktionary dump and hands each entry to the
+ * idiom handler or the single-word handler.
+ *
+ * NOTE(review): the language registered with `pdb` is hard-coded to
+ * "th"/"thai" regardless of `lang`, and only the first 30 rows are handled.
+ * Both look like leftovers — confirm before running a full seed.
+ *
+ * @param lang dump name; selects `<lang>.db` and is passed on to the handlers
+ */
+async function readDump(lang: string) {
+  await pdb.init();
+  pdb.addLanguage("th", "thai");
+  const dump = new Database(
+    `/home/y/code/prosody/resources/wiktionary/${lang}.db`,
+  );
+  const rows: any = dump.query("SELECT data FROM langs");
+  const ranks = await getFrequency();
+  let seen = 0;
+  for (const row of rows) {
+    seen += 1;
+    console.log(seen);
+    // Hard cap on the number of rows handled per run.
+    if (seen > 30) break;
+    const entry = JSON.parse(row.data);
+    const word = entry.word.trim();
+    if (!word) continue;
+    // Anything containing a space is stored as an idiom instead of a word.
+    if (word.includes(" ")) {
+      await handleIdiom(lang, word);
+    } else {
+      await handleWord(lang, word, entry, ranks);
+    }
+  }
+}
+
+/**
+ * Processes one single-word dump entry: every pronunciation that carries an
+ * IPA transcription is handed to `handleIpa`, one after the other.
+ *
+ * Entries without any IPA are reported on stderr and skipped.
+ *
+ * @param lang language code forwarded to `handleIpa`
+ * @param word trimmed headword
+ * @param j parsed dump entry; `j.sounds` is read here
+ * @param freqMap word → frequency-rank lookup; the looked-up value is unused
+ *   for as long as the word insert below stays disabled
+ */
+async function handleWord(
+  lang: string,
+  word: string,
+  j: any,
+  freqMap: Map<string, number>,
+) {
+  // TODO add categories but add a tag to see what classifying scheme we're using
+  const sounds = j.sounds || [];
+  const hasIpa = sounds.some((s: any) => "ipa" in s);
+  const hwikiRhyme = sounds.find((s: any) => "rhymes" in s);
+  const wikiRhyme = hwikiRhyme ? hwikiRhyme.rhymes : null;
+  if (!hasIpa) {
+    console.error("no ipa!!", word);
+    console.dir(j, { depth: null });
+    return;
+  }
+  const freq = freqMap.get(word) ?? null;
+  // const wordId = pdb.addWord(word, lang, freq, null);
+  // WIPE: placeholder id while the word insert above is disabled.
+  const wordId = 0;
+  for (const snd of sounds) {
+    if (!("ipa" in snd)) continue;
+    // Awaited so the inserts run in order and a failure rejects this call
+    // instead of surfacing later as an unhandled rejection.
+    await handleIpa(wordId, word, lang, j, snd, wikiRhyme);
+  }
+}
+/**
+ * Handles one IPA pronunciation of `word`: syllabifies it with `sorSyl`,
+ * stores the word-level rhyme and then stores every syllable.
+ *
+ * The word rhyme starts at the first stressed syllable (its `rhyme` part) and
+ * continues with the full `ipa` of every syllable after it. Nothing is stored
+ * when that yields an empty string.
+ *
+ * @param wordId row id of the owning word
+ * @param word headword spelling
+ * @param lang language code passed to `sorSyl`
+ * @param j parsed dump entry; only `j.lang_code` is read here
+ * @param snd the pronunciation object; `snd.ipa` and `snd.tags` are read
+ * @param wikiRhyme rhyme taken from the dump entry, or null
+ */
+async function handleIpa(
+  wordId: number | bigint,
+  word: string,
+  lang: string,
+  j: any,
+  snd: any,
+  wikiRhyme: string | null,
+) {
+  // Only needed by the pronunciation insert, which is currently disabled.
+  const tags = JSON.stringify(snd.tags) || null;
+  const ipa = snd.ipa;
+  const syls = await sorSyl(word, lang, ipa);
+
+  console.log(word);
+  console.log(ipa);
+  // pdb.addPronunciation(wordId, ipa, syls.syls.length, tags, null);
+
+  let wordRhyme = "";
+  for (const part of syls.syls) {
+    if (wordRhyme) {
+      // Already past the stressed syllable: append the whole syllable.
+      wordRhyme = `${wordRhyme}${part.ipa}`;
+    } else if (part.stressed) {
+      wordRhyme = `${wordRhyme}${part.rhyme}`;
+    }
+  }
+  if (wordRhyme) pdb.addWordRhyme(wordId, wordRhyme, j.lang_code, wikiRhyme);
+
+  for (const [position, part] of syls.syls.entries()) {
+    await handleSyllable(word, part.ipa, wordId, position);
+  }
+}
+/**
+ * Decomposes a single syllable with `sorSyl` and stores it for `wordId`.
+ *
+ * A failing insert is logged and otherwise ignored, so one bad syllable does
+ * not stop the caller.
+ *
+ * @param spelling text passed to `sorSyl` and stored as the syllable's text
+ * @param ipa IPA of this one syllable
+ * @param wordId row id of the owning word
+ * @param idx zero-based position of the syllable; passed on as `idx + 1`
+ * @param lang language code used for both the analysis and the insert;
+ *   defaults to "th"
+ * @throws Error if `sorSyl` does not return exactly one syllable
+ */
+async function handleSyllable(
+  spelling: string,
+  ipa: string,
+  wordId: number | bigint,
+  idx: number,
+  lang: string = "th",
+) {
+  const sorsyl = await sorSyl(spelling, lang, ipa);
+  if (sorsyl.syls.length !== 1) throw new Error("wtf sorsyl!");
+  const syl = sorsyl.syls[0]!;
+  try {
+    pdb.addSyllable(
+      wordId,
+      idx + 1,
+      lang,
+      syl.ipa,
+      syl.long,
+      spelling,
+      // Each part is stored with its IPA doubling as the spelling.
+      { spelling: syl.onset, ipa: syl.onset },
+      { spelling: syl.medial, ipa: syl.medial },
+      { spelling: syl.nucleus, ipa: syl.nucleus },
+      { spelling: syl.coda, ipa: syl.coda },
+      { spelling: syl.rhyme, ipa: syl.rhyme },
+      // Empty placeholder tone.
+      { letters: "", numbers: 0, name: "" },
+      null,
+    );
+  } catch (e: unknown) {
+    // Best effort, but say which syllable failed instead of printing a blank line.
+    console.error("addSyllable failed", { spelling, ipa, wordId, idx });
+    console.error(e);
+  }
+}
+/**
+ * Stores a multi-word entry as an idiom.
+ *
+ * TODO: set idiom_words later, once all words are populated.
+ *
+ * @param lang language code of the idiom
+ * @param idiom the full multi-word text
+ */
+async function handleIdiom(lang: string, idiom: string) {
+  pdb.addIdiom(idiom, lang);
+}
+// ช้า ๆ
+// งก ๆ
+// หงก ๆ
+
+/**
+ * Builds a word → frequency-rank lookup from the unigram frequency CSV.
+ *
+ * Each line is read as `spelling,count`. Lines without a numeric count (for
+ * example a header row or a blank line) are skipped. Words are ordered by
+ * count numerically and every word gets its own rank, so words sharing a
+ * count no longer overwrite each other; ties keep their file order.
+ *
+ * NOTE(review): rank 1 goes to the LOWEST count, as the ascending sort did
+ * before — confirm whether rank 1 should be the most frequent word instead.
+ *
+ * @returns map from spelling to its 1-based rank
+ */
+async function getFrequency(): Promise<Map<string, number>> {
+  const counts: Array<[spelling: string, count: number]> = [];
+  await handleFile(
+    "/home/y/code/prosody/hanchu/datasets/unigram_freq.csv",
+    (line) => {
+      const [spelling, frequency] = line.split(",");
+      if (!spelling || frequency === undefined || frequency.trim() === "") return;
+      const count = Number(frequency);
+      if (!Number.isFinite(count)) return;
+      counts.push([spelling, count]);
+    },
+  );
+  // Explicit comparator: the default sort compares numbers as strings.
+  counts.sort((a, b) => a[1] - b[1]);
+  const orderedMap = new Map<string, number>();
+  counts.forEach(([spelling], i) => orderedMap.set(spelling, i + 1));
+  return orderedMap;
+}
+
+// Script entry point: starts seeding from the "en" dump when the module runs.
+// NOTE(review): the returned promise is neither awaited nor caught here.
+readDump("en");
diff --git a/src/lib/db/prosodydb.ts b/src/lib/db/prosodydb.ts
index 1cfb8f0..9e76b8d 100644
--- a/src/lib/db/prosodydb.ts
+++ b/src/lib/db/prosodydb.ts
@@ -130,7 +130,7 @@ class DatabaseHandler {
            RETURNING rowid
           `,
         )
-        .get(onset.ipa, lang, onset.spelling) as number;
+        .get(onset.ipa, lang, onset.spelling) as { id: number };
       const medialId = this.db
         .query(
           `INSERT INTO medials(ipa, lang, text) VALUES(?, ?, ?)
@@ -139,7 +139,7 @@ class DatabaseHandler {
            RETURNING rowid
           `,
         )
-        .get(medial.ipa, lang, medial.spelling) as number;
+        .get(medial.ipa, lang, medial.spelling) as { id: number };
       const nucleusId = this.db
         .query(
           `INSERT INTO nucleus(ipa, lang, text) VALUES(?, ?, ?)
@@ -148,7 +148,7 @@ class DatabaseHandler {
          RETURNING rowid
         `,
         )
-        .get(nucleus.ipa, lang, nucleus.spelling) as number;
+        .get(nucleus.ipa, lang, nucleus.spelling) as { id: number };
       const codaId = this.db
         .query(
           `INSERT INTO codas(ipa, lang, text) VALUES(?, ?, ?)
@@ -157,7 +157,7 @@ class DatabaseHandler {
            RETURNING rowid
           `,
         )
-        .get(coda.ipa, lang, coda.spelling) as number;
+        .get(coda.ipa, lang, coda.spelling) as { id: number };
       const rhymeId = this.db
         .query(
           `INSERT INTO rhymes(ipa, lang, text) VALUES(?, ?, ?)
@@ -166,7 +166,7 @@ class DatabaseHandler {
          RETURNING rowid
         `,
         )
-        .get(rhyme.ipa, lang, rhyme.spelling) as number;
+        .get(rhyme.ipa, lang, rhyme.spelling) as { id: number };
       const toneId = this.db
         .query(
           `INSERT INTO tones(ipa, lang, name, nums) VALUES(?, ?, ?, ?)
@@ -175,39 +175,25 @@ class DatabaseHandler {
            RETURNING rowid
           `,
         )
-        .get(tone.letters, lang, tone.name, tone.numbers) as number;
+        .get(tone.letters, lang, tone.name, tone.numbers) as { id: number };
 
       const query = this.db.query(
-        `INSERT INTO syllables(lang, ipa, long, text, onset, medial, nucleus, coda, rhyme, tone, notes) VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
+        `INSERT INTO syllables(lang, ipa, long, text, onset, medial, nucleus, coda, rhyme, tone, notes) VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
       );
-      // TODO need a dual structure here for IPA and orto
       const res = query.run(
         lang,
         ipa,
         long,
         text,
-        onsetId,
-        medialId,
-        nucleusId,
-        codaId,
-        rhymeId,
-        toneId,
+        onsetId.id,
+        medialId.id,
+        nucleusId.id,
+        codaId.id,
+        rhymeId.id,
+        toneId.id,
         notes,
       );
       const sylId = res.lastInsertRowid;
-      const ipaq = this.db.query(`
-        INSERT INTO syl_ipa(syl_id, ipa, onset, medial, nucleus, coda, rhyme, notes)
-        VALUES(?, ?, ?, ?, ?, ?, ?, ?)`);
-      ipaq.run(
-        sylId,
-        ipa,
-        onset.ipa,
-        medial.ipa,
-        nucleus.ipa,
-        coda.ipa,
-        rhyme.ipa,
-        null,
-      );
       //
       const res1 = this.db
         .query(
diff --git a/src/lib/db/prosodyschema.sql b/src/lib/db/prosodyschema.sql
index 26818f3..c962d83 100644
--- a/src/lib/db/prosodyschema.sql
+++ b/src/lib/db/prosodyschema.sql
@@ -144,18 +144,6 @@ CREATE TABLE IF NOT EXISTS words_idioms(
 
 
 -- 
-CREATE TABLE IF NOT EXISTS syl_ipa(
-    id INTEGER PRIMARY KEY AUTOINCREMENT,
-    syl_id INTEGER NOT NULL,
-    ipa TEXT NOT NULL,
-    onset TEXT NOT NULL,
-    medial TEXT NOT NULL,
-    nucleus TEXT NOT NULL,
-    rhyme TEXT NOT NULL,
-    coda TEXT NOT NULL,
-    notes TEXT,
-    CONSTRAINT syl_ipa_unique UNIQUE (ipa, syl_id)
-);
 
 CREATE TABLE IF NOT EXISTS word_phonetics(
     id INTEGER PRIMARY KEY AUTOINCREMENT,
diff --git a/src/lib/db/thaiseed.ts b/src/lib/db/thaiseed.ts
index 687f0f3..5c75345 100644
--- a/src/lib/db/thaiseed.ts
+++ b/src/lib/db/thaiseed.ts
@@ -24,9 +24,9 @@ async function readDump(lang: string) {
   // langrows = langrows.slice(10);
   for (const langrow of langrows) {
     count++;
-    console.log(count);
+    // console.log(count);
     // if (count <= 10000) continue;
-    // if (count > 30) break;
+    // if (count > 100) break;
     const j = JSON.parse(langrow.data);
     const word = j.word.trim();
     if (!word) continue;
@@ -48,7 +48,6 @@ async function handleWord(word: string, j: any) {
   const freq = await getThaiFreq(word);
   const wordId = pdb.addWord(word, "th", freq, null);
   const analyzed = await analyzeTHWord(word);
-  // console.log(analyzed);
   for (let snd of sounds) if ("ipa" in snd) handleIpa(wordId, j, snd, analyzed);
 }
 async function handleIpa(
@@ -66,27 +65,39 @@ async function handleIpa(
   const wikiIpaSplit = wikiIpa.split(".");
   const nlpIpaSplit = nlpIpa.split(".");
   if (wikiIpaSplit.length !== nlpIpaSplit.length) {
-    console.log("ipa mismatch");
-    console.log(wikiIpa);
-    console.log(nlpIpa);
-    // return;
+    // console.log("ipa mismatch");
+    // console.log(wikiIpa);
+    // console.log(nlpIpa);
   }
-  if (analyzed.syllables.length !== wikiIpaSplit.length) {
-    console.log("syllable analysis mismatch", j.word);
-    console.log({ syls: analyzed.syllables, ipa: wikiIpaSplit });
+  if (analyzed.realSyls.length !== wikiIpaSplit.length) {
+    // console.log("syllable analysis mismatch", j.word);
+    // console.log({ syls: analyzed.syllables, ipa: wikiIpaSplit });
     // console.dir(j, { depth: null });
     return;
   }
   pdb.addPronunciation(wordId, ipa, analyzed.syllables.length, tags, null);
+  const writtenSyls = analyzed.syllables;
+  const pronouncedSyls = analyzed.realSyls;
+  let badSyls = false;
+  if (writtenSyls.length !== pronouncedSyls.length) badSyls = true;
 
-  for (let i = 0; i < analyzed.syllables.length; i++) {
-    const spelling = analyzed.syllables[i]!;
+  for (let i = 0; i < pronouncedSyls.length; i++) {
+    const pronounced = pronouncedSyls[i]!.replace(/\u{E3A}/u, "");
+    const written = writtenSyls[i] || "";
+    const syllable = badSyls ? pronounced : written;
     const ipa = wikiIpaSplit[i]!;
+    // TODO insert both??
+    const notes = pronounced === written ? null : `Pronounced ${pronounced}`;
+    if (pronounced !== syllable) {
+      console.log("diff");
+      console.log(pronounced);
+      console.log(written);
+    }
     try {
-      await handleSyllable(spelling, ipa, wordId, i);
+      await handleSyllable(syllable, ipa, wordId, i, notes);
     } catch (e) {
       console.error("syl error", j.word, j.sounds);
-      console.error({ spelling, ipa, wikiIpaSplit });
+      console.error({ analyzed, ipa, wikiIpaSplit });
       console.error(e);
     }
   }
@@ -115,16 +126,48 @@ function parseTone(ipa: string, spelling: string): Tone {
     throw new Error("");
   }
 }
+
 async function handleSyllable(
   spelling: string,
   ipa: string,
   wordId: number | bigint,
   idx: number,
+  notes: string | null,
 ) {
   const sorsyl = await sorSyl(spelling, "th", ipa);
+  const weird = [
+    // "a̯n",
+    // "a̯",
+    // "a̯p",
+    // "a̯w",
+    // "a̯j",
+    // "a̯ŋ",
+    // "a̯k",
+    // "a̯t",
+    // "a̯m",
+    // "a̯ʔ",
+    // "ʔ",
+    "s",
+    "l",
+    "f",
+    "a̯s",
+    "js",
+    "t͡ɕʰ",
+    "ks",
+    "ns",
+    "a̯l",
+    "a̯f",
+    "mk",
+  ];
+  // const weirder = sorsyl.syls.find((s) => weird.includes(s.coda));
+  // if (weirder) {
+  //   console.log("syllable", spelling);
+  //   // console.dir(sorsyl, { depth: null });
+  //   // console.dir(j, { depth: null });
+  // }
   if (sorsyl.syls.length !== 1) throw new Error("wtf sorsyl!");
   const syl = sorsyl.syls[0]!;
-  const tone = syl.tone ? parseTone(syl.tone, spelling) : null;
+  const tone = parseTone(syl.tone, spelling);
   try {
     pdb.addSyllable(
       wordId,
@@ -139,7 +182,7 @@ async function handleSyllable(
       { spelling: syl.coda, ipa: syl.coda },
       { spelling: syl.rhyme, ipa: syl.rhyme },
       tone,
-      null,
+      notes,
     );
   } catch (e) {
     // console.log("well fuck", syl);
author	polwex <polwex@sortug.com>	2025-06-03 01:36:36 +0700
committer	polwex <polwex@sortug.com>	2025-06-03 01:36:36 +0700
commit	2b80f7950df34f2a160135d7e20220a9b2ec3352 (patch)
tree	0e2aec09b9aba887419e46c4d2fcaf861391eedc
parent	249230c8e0e1bdb8ae4f433262997b84ee274904 (diff)