4 files changed, 235 insertions, 58 deletions
diff --git a/src/lib/db/enseed.ts b/src/lib/db/enseed.ts
index 58f5876..39dec44 100644
--- a/src/lib/db/enseed.ts
+++ b/src/lib/db/enseed.ts
@@ -7,12 +7,15 @@ import {
   type ThaiNLPRes,
   sorSyl,
   getThaiFreq,
+  SorBSyl,
 } from "../calls/nlp";
 import pdb from "./prosodydb";
 import { cleanIpa } from "../utils";
 import { handleFile } from "./utils";
 import { Tone } from "../types/phonetics";
+import { AsyncRes } from "../types";
 
+const errors: string[] = [];
 async function readDump(lang: string) {
   await pdb.init();
   pdb.addLanguage("th", "thai");
@@ -27,14 +30,21 @@ async function readDump(lang: string) {
     count++;
     console.log(count);
     // if (count <= 10000) continue;
-    if (count > 30) break;
+    if (count > 300) break; // only the first 300 dump rows are processed
     const j = JSON.parse(langrow.data);
     const word = j.word.trim();
     if (!word) continue;
     const split = word.split(" ");
-    if (split.length > 1) await handleIdiom(lang, word);
-    else await handleWord(lang, word, j, freqMap);
+    const res =
+      split.length > 1
+        ? await handleIdiom(lang, word)
+        : await handleWord(lang, word, j, freqMap);
+    if ("error" in res) {
+      errors.push(`${word}: ${res.error}`); // collected here, dumped by console.dir(errors) after the loop
+      if (!res.error.startsWith("meh")) break; // "meh ..." marks a skippable entry (e.g. "meh no ipa"); anything else stops the run
+    }
   }
+  console.dir(errors);
 }
 
 async function handleWord(
@@ -42,7 +52,7 @@ async function handleWord(
   word: string,
   j: any,
   freqMap: Map<string, number>,
-) {
+): AsyncRes<string> { // resolves to { ok } or { error } instead of undefined
   // TODO add categories but add a tag to see what classifying scheme we're using
   //
   const sounds = j.sounds || [];
@@ -50,9 +60,9 @@ async function handleWord(
   const hwikiRhyme = sounds.find((s: any) => "rhymes" in s);
   const wikiRhyme = hwikiRhyme ? hwikiRhyme.rhymes : null;
   if (!hasIpa) {
-    console.error("no ipa!!", word);
-    console.dir(j, { depth: null });
-    return;
+    // console.error("no ipa!!", word);
+    // console.dir(j, { depth: null });
+    return { error: "meh no ipa" }; // no entry in j.sounds carries an "ipa" key
   }
   const freq = freqMap.get(word) || null;
   // const wordId = pdb.addWord(word, lang, freq, null);
@@ -60,7 +70,11 @@ async function handleWord(
   const wordId = 0;
   // console.log(analyzed);
   for (let snd of sounds)
-    if ("ipa" in snd) handleIpa(wordId, word, lang, j, snd, wikiRhyme);
+    if ("ipa" in snd) {
+      const res = await handleIpa(wordId, word, lang, j, snd, wikiRhyme); // awaited: pronunciations are handled one at a time
+      if ("error" in res) return res; // the first failing pronunciation ends the word
+    }
+  return { ok: "" }; // every pronunciation with IPA was handled without error
 }
 async function handleIpa(
   wordId: number | bigint,
@@ -73,58 +87,65 @@ async function handleIpa(
   const tags = JSON.stringify(snd.tags) || null;
   const ipa = snd.ipa;
   const syls = await sorSyl(word, lang, ipa);
+  // console.log(syls, "sorsyl");
 
   console.log(word);
   console.log(ipa);
-  // pdb.addPronunciation(wordId, ipa, syls.syls.length, tags, null);
+  pdb.addPronunciation(wordId, ipa, syls.syls.length, tags, null); // NOTE(review): handleWord passes the placeholder wordId 0 — confirm this insert is intended
   // set word rhyme
-  const wordRhyme = syls.syls.reduce((acc: string, item: SorSyl) => {
+  const wordRhyme = syls.syls.reduce((acc: string, itemm: SorBSyl) => { // itemm is one syllable
+    const item = itemm.ipa; // only the IPA side of the syllable feeds the rhyme
     if (!item.stressed && !acc) return acc;
     if (item.stressed && !acc) return `${acc}${item.rhyme}`;
-    else return `${acc}${item.ipa}`;
+    else return `${acc}${item.all}`; // once the rhyme has started, later syllables are appended whole
   }, "");
   if (wordRhyme) pdb.addWordRhyme(wordId, wordRhyme, j.lang_code, wikiRhyme);
-  //
+
   for (let i = 0; i < syls.syls.length; i++) {
     const syl = syls.syls[i]!;
-    await handleSyllable(word, syl.ipa, wordId, i);
+    const res = await handleSyllable(syl, wordId, i); // the already-parsed syllable is passed on as is
+    if ("error" in res) return res; // stop at the first syllable that could not be stored
   }
+  return { ok: "" }; // all syllables were stored
 }
 async function handleSyllable(
-  spelling: string,
-  ipa: string,
+  syl: SorBSyl, // one syllable with parallel .ipa and .spelling breakdowns
   wordId: number | bigint,
   idx: number,
-) {
-  const sorsyl = await sorSyl(spelling, "th", ipa);
-  if (sorsyl.syls.length !== 1) throw new Error("wtf sorsyl!");
-  const syl = sorsyl.syls[0]!;
+): AsyncRes<string> { // resolves to { ok } on insert, { error } when pdb.addSyllable throws
   try {
     pdb.addSyllable(
       wordId,
       idx + 1,
+      syl.ipa.stressed, // new `stressed` argument of pdb.addSyllable
       "th",
-      syl.ipa,
-      syl.long,
-      spelling,
-      { spelling: syl.onset, ipa: syl.onset },
-      { spelling: syl.medial, ipa: syl.medial },
-      { spelling: syl.nucleus, ipa: syl.nucleus },
-      { spelling: syl.coda, ipa: syl.coda },
-      { spelling: syl.rhyme, ipa: syl.rhyme },
+      syl.ipa.all, // whole-syllable IPA
+      syl.ipa.long,
+      syl.spelling.all, // whole-syllable spelling
+      { spelling: syl.spelling.onset, ipa: syl.ipa.onset }, // each part now pairs its spelling with its IPA
+      { spelling: syl.spelling.medial, ipa: syl.ipa.medial },
+      { spelling: syl.spelling.nucleus, ipa: syl.ipa.nucleus },
+      { spelling: syl.spelling.coda, ipa: syl.ipa.coda },
+      { spelling: syl.spelling.rhyme, ipa: syl.ipa.rhyme },
       { letters: "", numbers: 0, name: "" },
       null,
     );
+    return { ok: "" }; // insert went through
   } catch (e) {
     // console.log("well fuck", syl);
     // console.error(e);
-    console.log();
+    return { error: `${e}` }; // the thrown value is stringified and handed back instead of rethrown
   }
 }
-async function handleIdiom(lang: string, idiom: string) {
-  pdb.addIdiom(idiom, lang);
-  // TODO later set idiom_words once all words are populated
-  // console.log();
+async function handleIdiom(lang: string, idiom: string): AsyncRes<string> {
+  // Stores a multi-word entry; a throwing insert is reported, not rethrown.
+  // TODO later set idiom_words once all words are populated
+  try {
+    pdb.addIdiom(idiom, lang);
+  } catch (failure) {
+    return { error: `${failure}` };
+  }
+  return { ok: "" };
 }
 // ช้า ๆ
 // งก ๆ
diff --git a/src/lib/db/prosodydb.ts b/src/lib/db/prosodydb.ts
index 9e76b8d..d6da389 100644
--- a/src/lib/db/prosodydb.ts
+++ b/src/lib/db/prosodydb.ts
@@ -1,12 +1,14 @@
 import Database from "bun:sqlite";
 import { Phoneme, Tone } from "../types/phonetics";
+import { ProsodyWord, ProsodyWordDB } from "../types/cards";
 type Str = string | null;
 type ItemType = "word" | "syllable" | "idiom";
 
 class DatabaseHandler {
   db: Database;
   constructor() {
-    const dbPath = "/home/y/code/bun/ssr/waku/bulkdata/phon.db";
+    // const dbPath = "/home/y/code/bun/ssr/waku/bulkdata/phon.db";
+    const dbPath = "/home/y/code/bun/ssr/waku/bulkdata/thaiphon.db";
     const db = new Database(dbPath, { create: true });
     db.exec("PRAGMA journal_mode = WAL"); // Enable Write-Ahead Logging for better performance
     db.exec("PRAGMA foreign_keys = ON");
@@ -18,12 +20,123 @@ class DatabaseHandler {
     this.db.exec(sql);
   }
   // selects
+  /**
+   * Fetches up to 300 words of `lang` that have a frequency, lowest
+   * `frequency` value first, one row per word pronunciation. The sub-select
+   * gathers every syllable of the word (ordered by `syllables_words.idx`).
+   * NOTE(review): `wp.syllables` and the sub-select are both named
+   * `syllables` in the result — confirm which one callers expect.
+   */
+  fetchFrequent(lang: string) {
+    const query = this.db.query(
+      `SELECT
+         w.id, w.spelling, w.lang, w.frequency,
+         wp.ipa, wp.syllables, wp.tag, w.notes,
+         (SELECT
+             json_group_array(json_object(
+                'ipa', s.ipa,
+                'spelling', s.text,
+                'long', s.long,
+                'notes', s.notes,
+                'onseto', os.text,
+                'onset', os.ipa,
+                'nucleuso', ns.text,
+                'nucleus', ns.ipa,
+                'codao', co.text,
+                'coda', co.ipa,
+                'rhymeo', rh.text,
+                'rhyme', rh.ipa,
+                'tonen', tns.name,
+                'tonenm', tns.nums,
+                'tone', tns.ipa
+                ) ORDER BY sw.idx
+             )
+             FROM syllables_words sw
+             JOIN syllables s ON s.id = sw.syl_id
+             JOIN onsets os ON os.id = s.onset
+             JOIN nucleus ns ON ns.id = s.nucleus
+             JOIN codas co ON co.id = s.coda
+             JOIN rhymes rh ON rh.id = s.rhyme
+             JOIN tones tns ON tns.id = s.tone
+             WHERE sw.word_id = w.id
+         ) as syllables
+       FROM words w
+       JOIN word_phonetics wp ON wp.word_id = w.id
+       WHERE w.frequency IS NOT NULL
+       AND w.lang = ?
+       ORDER BY w.frequency ASC
+       LIMIT 300
+       `,
+    );
+    return query.all(lang) as ProsodyWordDB[];
+  }
   fetchWords(words: string[]) {
     const query = this.db.query(
       `SELECT id FROM words where spelling IN (${words.map((w) => `'${w}'`).join(", ")})`,
     );
     return query.all() as Array<{ id: number }>;
   }
+  // NOTE(review): despite its name this looks up `words` rows by spelling. Spellings are bound, not interpolated.
+  fetchSyllables(words: string[]) {
+    const placeholders = words.map(() => "?").join(", "); // one "?" per spelling
+    const sql = `SELECT id FROM words where spelling IN (${placeholders})`;
+    return this.db.query(sql).all(...words) as Array<{ id: number }>;
+  }
+  /**
+   * Lists words that contain a syllable whose onset IPA equals `onset`.
+   * NOTE(review): matches on `onsets.ipa`; confirm spelling is not intended.
+   */
+  fetchOnsets(onset: string) {
+    const query = this.db.query(
+      `SELECT w.id, w.spelling, w.frequency, wp.ipa
+      FROM words w
+      JOIN word_phonetics wp ON wp.word_id = w.id
+      JOIN syllables_words sw ON sw.word_id = w.id
+      JOIN syllables s ON s.id = sw.syl_id
+      JOIN onsets os ON os.id = s.onset
+      WHERE os.ipa = ?`,
+    );
+    return query.all(onset) as any[];
+  }
+  // tones
+  /**
+   * Finds words whose per-syllable tone names match `tones` position by
+   * position; a null (or empty) entry accepts any tone at that position.
+   */
+  fetchWordsByToneAndSyls(tones: Array<string | null>) {
+    const tonePattern = tones.map((tone) => tone || "%").join(","); // LIKE pattern, e.g. "mid,%,low"
+    const query = this.db.query(
+      `
+      WITH word_tone_sequences AS (
+          SELECT
+              w.id as word_id,
+              w.spelling,
+              wp.ipa,
+              w.frequency,
+              GROUP_CONCAT(t.name ORDER BY sw.idx) as tone_sequence,
+              COUNT(sw.syl_id) as syllable_count
+          FROM words w
+          JOIN word_phonetics wp ON w.id = wp.word_id
+          JOIN syllables_words sw ON w.id = sw.word_id
+          JOIN syllables s ON sw.syl_id = s.id
+          JOIN tones t ON s.tone = t.id
+          GROUP BY w.id, w.spelling, w.lang, w.frequency
+      )
+        SELECT
+            word_id,
+            spelling,
+            ipa,
+            frequency,
+            tone_sequence,
+            syllable_count
+        FROM word_tone_sequences
+        WHERE tone_sequence LIKE ?
+        AND syllable_count = ?
+        ORDER BY frequency DESC NULLS LAST;
+      `,
+    );
+    return query.all(tonePattern, tones.length) as any[];
+  }
   // inserts
 
   addLanguage(code: string, name: string) {
@@ -109,6 +222,7 @@ class DatabaseHandler {
   addSyllable(
     wordId: number | bigint,
     sylIdx: number,
+    stressed: boolean | null,
     lang: string,
     ipa: string,
     long: boolean,
@@ -197,9 +311,9 @@ class DatabaseHandler {
       //
       const res1 = this.db
         .query(
-          `INSERT INTO syllables_words(syl_id, word_id, idx) VALUES(?, ?, ?)`,
+          `INSERT INTO syllables_words(syl_id, word_id, idx, stressed) VALUES(?, ?, ?, ?)`,
         )
-        .run(sylId, wordId, sylIdx);
+        .run(sylId, wordId, sylIdx, stressed);
       //
       return sylId;
     });
diff --git a/src/lib/db/prosodyschema.sql b/src/lib/db/prosodyschema.sql
index c962d83..c6a04fa 100644
--- a/src/lib/db/prosodyschema.sql
+++ b/src/lib/db/prosodyschema.sql
@@ -130,6 +130,7 @@ CREATE TABLE IF NOT EXISTS syllables_words(
     syl_id INTEGER NOT NULL,
     word_id INTEGER NOT NULL,
     idx INTEGER NOT NULL,
+    stressed INTEGER,
     FOREIGN KEY (syl_id) REFERENCES syllables(id),
     FOREIGN KEY (word_id) REFERENCES words(id)
 );
diff --git a/src/lib/db/thaiseed.ts b/src/lib/db/thaiseed.ts
index 5c75345..6c69d9c 100644
--- a/src/lib/db/thaiseed.ts
+++ b/src/lib/db/thaiseed.ts
@@ -12,6 +12,7 @@ import pdb from "./prosodydb";
 import { cleanIpa } from "../utils";
 import { handleFile } from "./utils";
 import { Tone } from "../types/phonetics";
+import { AsyncRes } from "../types";
 
 async function readDump(lang: string) {
   await pdb.init();
@@ -30,38 +31,77 @@ async function readDump(lang: string) {
     const j = JSON.parse(langrow.data);
     const word = j.word.trim();
     if (!word) continue;
-    if (word.includes("ๆ")) await handleWord(word, j);
-    else {
+    // Entries containing "ๆ" always go to handleWord, even when they contain a space.
+    if (word.includes("ๆ")) {
+      const res = await handleWord(word, j);
+      if ("error" in res) {
+        if (res.error.includes("meh")) continue; // "meh" errors are skipped silently
+        if (res.error.includes("wtf")) {
+          console.error(res.error);
+          console.error(j.sounds);
+        }
+        break; // any other error stops the whole run
+      }
+    } else {
       const split = word.split(" ");
-      if (split.length > 1) await handleIdiom(word);
-      else await handleWord(word, j);
+      if (split.length > 1) { // entries with a space are stored as idioms
+        const res = await handleIdiom(word);
+        if ("error" in res) {
+          console.error(res.error);
+          break;
+        }
+      } else {
+        const res = await handleWord(word, j);
+        if ("error" in res) {
+          if (res.error.includes("meh")) continue;
+          if (res.error.includes("wtf")) {
+            console.error(res.error);
+            console.error(j.sounds);
+          }
+          // break; — disabled, so in this branch non-"meh" errors do not stop the run
+        }
+      }
     }
   }
 }
 
-async function handleWord(word: string, j: any) {
+/**
+ * Stores `word` and each of its IPA pronunciations; resolves to `{ ok }`,
+ * or to the first `{ error }` ("meh no ipa" when no sound carries IPA).
+ */
+async function handleWord(word: string, j: any): AsyncRes<string> {
   // TODO add categories but add a tag to see what classifying scheme we're using
   //
   const sounds = j.sounds || [];
   const hasIpa = sounds.find((s: any) => "ipa" in s);
-  if (!hasIpa) return;
+  if (!hasIpa) return { error: "meh no ipa" };
   const freq = await getThaiFreq(word);
   const wordId = pdb.addWord(word, "th", freq, null);
   const analyzed = await analyzeTHWord(word);
-  for (let snd of sounds) if ("ipa" in snd) handleIpa(wordId, j, snd, analyzed);
+  // Pronunciations are written one at a time so the first failure is returned.
+  for (const snd of sounds) {
+    if (!("ipa" in snd)) continue;
+    const res = await handleIpa(wordId, j, snd, analyzed);
+    if ("error" in res) return res;
+  }
+  return { ok: "" };
 }
 async function handleIpa(
   wordId: number | bigint,
   j: any,
   snd: any,
   analyzed: ThaiNLPRes,
-) {
+): AsyncRes<string> {
   const tags = JSON.stringify(snd.tags) || null;
   // console.log("handleipa", analyzed.syllables.length);
   // console.log(analyzed);
   const wikiIpa = cleanIpa(snd.ipa);
   const nlpIpa = cleanIpa(analyzed.ipa);
   const ipa = wikiIpa || nlpIpa;
+  if (j.word === "และ") {
+    console.log("wtf!!");
+    return { error: "wtf is this" };
+  }
   const wikiIpaSplit = wikiIpa.split(".");
   const nlpIpaSplit = nlpIpa.split(".");
   if (wikiIpaSplit.length !== nlpIpaSplit.length) {
@@ -73,14 +113,15 @@ async function handleIpa(
     // console.log("syllable analysis mismatch", j.word);
     // console.log({ syls: analyzed.syllables, ipa: wikiIpaSplit });
     // console.dir(j, { depth: null });
-    return;
+    return { error: "meh syllable analysis mismatch" };
   }
-  pdb.addPronunciation(wordId, ipa, analyzed.syllables.length, tags, null);
   const writtenSyls = analyzed.syllables;
   const pronouncedSyls = analyzed.realSyls;
   let badSyls = false;
   if (writtenSyls.length !== pronouncedSyls.length) badSyls = true;
 
+  pdb.addPronunciation(wordId, ipa, pronouncedSyls.length, tags, null);
+
   for (let i = 0; i < pronouncedSyls.length; i++) {
     const pronounced = pronouncedSyls[i]!.replace(/\u{E3A}/u, "");
     const written = writtenSyls[i] || "";
@@ -93,14 +134,10 @@ async function handleIpa(
       console.log(pronounced);
       console.log(written);
     }
-    try {
-      await handleSyllable(syllable, ipa, wordId, i, notes);
-    } catch (e) {
-      console.error("syl error", j.word, j.sounds);
-      console.error({ analyzed, ipa, wikiIpaSplit });
-      console.error(e);
-    }
+    const res = await handleSyllable(syllable, ipa, wordId, i, notes);
+    if ("error" in res) return res;
   }
+  return { ok: "" };
 }
 const thaiTones: Record<string, string> = {
   "˧": "mid",
@@ -122,7 +159,7 @@ function parseTone(ipa: string, spelling: string): Tone {
     const numbers = thaiToneNums[ipa]!;
     return { letters: ipa, name, numbers };
   } catch (e) {
-    console.error("wrong tones!!", { s: spelling, ipa });
+    console.error("meh wrong tones!!", { s: spelling, ipa });
     throw new Error("");
   }
 }
@@ -133,7 +170,7 @@ async function handleSyllable(
   wordId: number | bigint,
   idx: number,
   notes: string | null,
-) {
+): AsyncRes<string> {
   const sorsyl = await sorSyl(spelling, "th", ipa);
   const weird = [
     // "a̯n",
@@ -166,14 +203,16 @@ async function handleSyllable(
   //   // console.dir(j, { depth: null });
   // }
   if (sorsyl.syls.length !== 1) throw new Error("wtf sorsyl!");
-  const syl = sorsyl.syls[0]!;
+  const syl = sorsyl.syls[0]!.ipa;
   const tone = parseTone(syl.tone, spelling);
+  // TODO add actual ortographic data here not just ipa
   try {
     pdb.addSyllable(
       wordId,
       idx + 1,
+      null,
       "th",
-      syl.ipa,
+      syl.all,
       syl.long,
       spelling,
       { spelling: syl.onset, ipa: syl.onset },
@@ -184,16 +223,18 @@ async function handleSyllable(
       tone,
       notes,
     );
+    return { ok: "" };
   } catch (e) {
     // console.log("well fuck", syl);
     // console.error(e);
-    console.log();
+    return { error: `meh ${e}` };
   }
 }
-async function handleIdiom(idiom: string) {
+async function handleIdiom(idiom: string): AsyncRes<string> { // stores a multi-word entry under language "th"
   pdb.addIdiom(idiom, "th");
   // TODO later set idiom_words once all words are populated
   // console.log();
+  return { ok: "" }; // NOTE(review): addIdiom is not wrapped in try/catch, so a throw rejects instead of yielding { error }
 }
 // ช้า ๆ
 // งก ๆ