preeeeettty much done FUCK YES

author: polwex <polwex@sortug.com> 2025-06-03 15:41:31 +0700
committer: polwex <polwex@sortug.com> 2025-06-03 15:41:31 +0700
commit: 175ddca375cef765cec8ca5bbc527a205c40bf25 (patch)
tree: f2e47a5d85e4d5e0297613e5a17cebce7d09b09b /src/lib/db
parent: 2401217a4019938d1c1cc61b6e33ccb233eb6e74 (diff)
4 files changed, 722 insertions, 145 deletions
diff --git a/src/lib/db/prosodydb.ts b/src/lib/db/prosodydb.ts
index d6da389..7c067d2 100644
--- a/src/lib/db/prosodydb.ts
+++ b/src/lib/db/prosodydb.ts
@@ -1,5 +1,5 @@
 import Database from "bun:sqlite";
-import { Phoneme, Tone } from "../types/phonetics";
+import { MutationOrder, Phoneme, Tone } from "../types/phonetics";
 import { ProsodyWord, ProsodyWordDB } from "../types/cards";
 type Str = string | null;
 type ItemType = "word" | "syllable" | "idiom";
@@ -113,6 +113,7 @@ class DatabaseHandler {
               w.spelling,
               wp.ipa,
               w.frequency,
+              GROUP_CONCAT(s.text ORDER BY sw.idx) as syl_seq,
               GROUP_CONCAT(t.name ORDER BY sw.idx) as tone_sequence,
               COUNT(sw.syl_id) as syllable_count
           FROM words w
@@ -127,17 +128,166 @@ class DatabaseHandler {
             spelling,
             ipa,
             frequency,
+            syl_seq,
             tone_sequence,
             syllable_count
         FROM word_tone_sequences
         WHERE tone_sequence LIKE ?
         AND syllable_count = ?
-        ORDER BY frequency DESC NULLS LAST;
+        ORDER BY frequency ASC NULLS LAST;
       `,
     );
     return query.all(toneString.slice(1), tones.length) as any[];
   }
+  // fetchWordsByToneAndSyls(tones: Array<string | null>) {
+  //   const toneString = tones.reduce((acc: string, item) => {
+  //     if (!item) return `${acc},%`;
+  //     else return `${acc},${item}`;
+  //   }, "");
+  //   console.log({ toneString });
+  //   const query = this.db.query(
+  //     `
+  //     WITH word_tone_sequences AS (
+  //         SELECT
+  //             w.id as word_id,
+  //             w.spelling,
+  //             wp.ipa,
+  //             w.frequency,
+  //             GROUP_CONCAT(s.text ORDER BY sw.idx) as syl_seq,
+  //             GROUP_CONCAT(t.name ORDER BY sw.idx) as tone_sequence,
+  //             COUNT(sw.syl_id) as syllable_count
+  //         FROM words w
+  //         JOIN word_phonetics wp ON w.id = wp.word_id
+  //         JOIN syllables_words sw ON w.id = sw.word_id
+  //         JOIN syllables s ON sw.syl_id = s.id
+  //         JOIN tones t ON s.tone = t.id
+  //         GROUP BY w.id, w.spelling, w.lang, w.frequency
+  //     )
+  //       SELECT
+  //           word_id,
+  //           spelling,
+  //           ipa,
+  //           frequency,
+  //           syl_seq,
+  //           tone_sequence,
+  //           syllable_count
+  //       FROM word_tone_sequences
+  //       WHERE tone_sequence LIKE ?
+  //       AND syllable_count = ?
+  //       ORDER BY frequency DESC NULLS LAST;
+  //     `,
+  //   );
+  //   return query.all(toneString.slice(1), tones.length) as any[];
+  // }
+  /**
+   * Looks up words from a per-syllable pattern. Each entry of `order` stands
+   * for one syllable position: an entry with a `change` key pins the tone name
+   * at that position and leaves the syllable text open (`%`); any other entry
+   * pins the syllable text (`keep`) and leaves the tone open. The two
+   * comma-joined patterns are bound to the LIKE filters of the query, together
+   * with `order.length` as the required syllable count.
+   * NOTE(review): `%` or `_` inside a kept syllable also acts as a wildcard.
+   * @returns the matching rows, lowest `frequency` first, NULLs last.
+   */
+  fetchWordsByToneSylsWords(order: MutationOrder) {
+    console.log({ order });
+    // One slot per syllable; "%" is the LIKE wildcard used for an open slot.
+    const tonePattern = order
+      .map((item) => ("change" in item ? `${item.change}` : "%"))
+      .join(",");
+    const sylPattern = order
+      .map((item) => ("change" in item ? "%" : `${item.keep}`))
+      .join(",");
+    const query = this.db.query(`
+    SELECT 
+        w.id as word_id,
+        w.spelling,
+        w.lang,
+        w.frequency,
+        wp.ipa,
+        wp.syllable_sequence,
+        wp.tone_sequence,
+        wp.ipa_sequence,
+        GROUP_CONCAT(s.text ORDER BY sw.idx) as syllable_pattern,
+        GROUP_CONCAT(t.name ORDER BY sw.idx) as tone_pattern
+    FROM words w
+    JOIN syllables_words sw ON w.id = sw.word_id
+    JOIN syllables s ON sw.syl_id = s.id
+    JOIN tones t ON s.tone = t.id
+    JOIN word_phonetics wp ON wp.word_id= w.id
+    WHERE wp.syllable_sequence LIKE ?1
+    AND tone_sequence LIKE ?2
+    AND syllable_count = ?3
+    GROUP BY w.id, w.spelling, w.lang, w.frequency
+    ORDER BY w.frequency ASC NULLS LAST;      `);
+    return query.all(sylPattern, tonePattern, order.length) as any[];
+  }
   // inserts
+  /**
+   * Stores a word together with all of its pronunciations and syllables.
+   * Everything runs inside a single `this.db.transaction` callback: `addWord`
+   * is called first, and the id it returns is handed to `addPronunciation`
+   * once per entry of `phonetics` and to `addSyllable` once per syllable of
+   * that entry (with the word's `lang`).
+   *
+   * `wordRhyme` is part of the accepted shape but is not read by this method.
+   * @param p the word, its language and frequency, and its pronunciations.
+   */
+  superAdd(p: {
+    word: string;
+    lang: string;
+    frequency: number | null;
+    wordNotes: Str;
+    phonetics: Array<{
+      ipa: string;
+      syllable_count: number;
+      syllable_sequence: string;
+      tone_sequence: string;
+      ipa_sequence: string;
+      tags: Str;
+      notes: Str;
+      wordRhyme: Str;
+      syllables: Array<{
+        idx: number;
+        stressed: boolean | null;
+        spelling: string;
+        ipa: string;
+        long: boolean;
+        onset: Phoneme;
+        medial: Phoneme;
+        nucleus: Phoneme;
+        coda: Phoneme;
+        rhyme: Phoneme;
+        tone: Tone;
+        notes: Str;
+      }>;
+    }>;
+  }) {
+    const { word, lang, frequency, wordNotes, phonetics } = p;
+    const insertAll = this.db.transaction(() => {
+      const wordId = this.addWord(word, lang, frequency, wordNotes);
+      phonetics.forEach((ph) => {
+        this.addPronunciation(
+          wordId,
+          ph.ipa,
+          ph.syllable_count,
+          ph.syllable_sequence,
+          ph.tone_sequence,
+          ph.ipa_sequence,
+          ph.tags,
+          ph.notes,
+        );
+        ph.syllables.forEach((s) => {
+          this.addSyllable(
+            wordId, s.idx, s.stressed, lang, s.ipa, s.long, s.spelling,
+            s.onset, s.medial, s.nucleus, s.coda, s.rhyme, s.tone, s.notes,
+          );
+        });
+      });
+    });
+    insertAll();
+  }
 
   addLanguage(code: string, name: string) {
     const query = this.db
@@ -147,15 +297,44 @@ class DatabaseHandler {
   addPronunciation(
     wordId: number | bigint,
     ipa: string,
-    syllables: number,
+    syllable_count: number,
+    syllable_sequence: string,
+    tone_sequence: string,
+    ipa_sequence: string,
     tags: Str,
     notes: Str,
   ) {
+    console.log({
+      wordId,
+      ipa,
+      syllable_count,
+      syllable_sequence,
+      tone_sequence,
+      ipa_sequence,
+    });
     const query = this.db
       .query(
-        `INSERT OR IGNORE INTO word_phonetics(word_id,ipa, syllables, tag, notes) VALUES(?, ?, ?, ?, ?)`,
+        `INSERT OR IGNORE INTO word_phonetics(
+            word_id,
+            ipa,
+            syllable_count,
+            syllable_sequence,
+            tone_sequence,
+            ipa_sequence,
+            tag,
+            notes)
+        VALUES(?, ?, ?, ?, ?, ?, ?, ?)`,
       )
-      .run(wordId, ipa, syllables, tags, notes);
+      .run(
+        wordId,
+        ipa,
+        syllable_count,
+        syllable_sequence,
+        tone_sequence,
+        ipa_sequence,
+        tags,
+        notes,
+      );
   }
   addWordRhyme(wordId: number | bigint, ipa: string, lang: string, notes: Str) {
     const query = this.db
@@ -212,12 +391,14 @@ class DatabaseHandler {
     notes: Str,
   ) {
     const query = this.db.query(
-      `INSERT OR IGNORE INTO words(spelling, lang, frequency, notes) VALUES(?, ?, ?, ?)`,
-      // `INSERT INTO words(spelling, lang) VALUES(?, ?)`,
+      `INSERT INTO words(spelling, lang, frequency, notes) VALUES(?, ?, ?, ?)
+       ON CONFLICT(spelling, lang) DO UPDATE SET
+       lang = excluded.lang
+       RETURNING rowid
+     `,
     );
-    const res = query.run(spelling, lang, frequency, notes);
-    const wordId = res.lastInsertRowid;
-    return wordId;
+    const res = query.get(spelling, lang, frequency, notes) as { id: number };
+    return res.id;
   }
   addSyllable(
     wordId: number | bigint,
@@ -292,9 +473,15 @@ class DatabaseHandler {
         .get(tone.letters, lang, tone.name, tone.numbers) as { id: number };
 
       const query = this.db.query(
-        `INSERT INTO syllables(lang, ipa, long, text, onset, medial, nucleus, coda, rhyme, tone, notes) VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
+        `INSERT INTO syllables(
+        lang, ipa, long, text, onset, medial, nucleus, coda, rhyme, tone, notes)
+        VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+        ON CONFLICT(text, ipa, lang) DO UPDATE SET
+        lang = excluded.lang
+        RETURNING rowid
+        `,
       );
-      const res = query.run(
+      const res = query.get(
         lang,
         ipa,
         long,
@@ -306,8 +493,8 @@ class DatabaseHandler {
         rhymeId.id,
         toneId.id,
         notes,
-      );
-      const sylId = res.lastInsertRowid;
+      ) as { id: number };
+      const sylId = res.id;
       //
       const res1 = this.db
         .query(
diff --git a/src/lib/db/prosodyschema.sql b/src/lib/db/prosodyschema.sql
index c6a04fa..5554a02 100644
--- a/src/lib/db/prosodyschema.sql
+++ b/src/lib/db/prosodyschema.sql
@@ -150,9 +150,103 @@ CREATE TABLE IF NOT EXISTS word_phonetics(
     id INTEGER PRIMARY KEY AUTOINCREMENT,
     word_id INTEGER NOT NULL,
     ipa TEXT NOT NULL,
-    syllables INTEGER NOT NULL,
+    syllable_count INTEGER NOT NULL,
+    syllable_sequence TEXT NOT NULL,  -- "家,鄉"
+    tone_sequence TEXT NOT NULL,      -- "rising,rising" 
+    ipa_sequence TEXT NOT NULL,       -- IPA representation
     tag TEXT,
     notes TEXT,
-    CONSTRAINT ipa_unique UNIQUE (ipa, word_id)
+    FOREIGN KEY (word_id) REFERENCES words(id)
 );
 CREATE INDEX IF NOT EXISTS idx_words_ipa ON word_phonetics(ipa, word_id);
+
+-- -- Query 2: Even simpler with pattern table
+-- -- Pattern [{ change: "rising" }, { change: "falling" }] - any 2-syllable word with rising,falling tones
+-- SELECT 
+--     w.spelling,
+--     w.frequency,
+--     wp.syllable_sequence,
+--     wp.tone_sequence
+-- FROM words w
+-- JOIN word_patterns wp ON w.id = wp.word_id
+-- WHERE wp.syllable_count = 2
+--   AND wp.tone_sequence = 'rising,falling'
+-- ORDER BY w.frequency DESC NULLS LAST;
+
+-- -- Query 3: Mixed pattern [{ keep: "家" }, { change: "falling" }, { keep: "人" }]
+-- SELECT DISTINCT
+--     w.spelling,
+--     w.frequency,
+--     wp.syllable_sequence,
+--     wp.tone_sequence
+-- FROM words w
+-- JOIN word_patterns wp ON w.id = wp.word_id
+-- WHERE wp.syllable_count = 3
+--   AND wp.syllable_sequence LIKE '家,%,人'  -- Simple pattern matching
+--   AND EXISTS (
+--       SELECT 1 FROM word_syllable_positions wsp
+--       WHERE wsp.word_id = w.id 
+--         AND wsp.position = 1 
+--         AND wsp.tone_name = 'falling'
+--   )
+-- ORDER BY w.frequency DESC NULLS LAST;
+
+-- -- Query 4: Super fast rhyme finding
+-- -- Find all words that end with same syllable as "家鄉" (end with "鄉")
+-- SELECT 
+--     w.spelling,
+--     w.frequency,
+--     wp.syllable_sequence
+-- FROM words w
+-- JOIN word_patterns wp ON w.id = wp.word_id
+-- WHERE wp.syllable_sequence LIKE '%,鄉'  -- Ends with 鄉
+--   AND wp.syllable_count >= 2
+-- ORDER BY w.frequency DESC NULLS LAST;
+
+
+
+
+-- SELECT
+--     w.id as word_id,
+--     w.spelling,
+--     w.lang,
+--     w.frequency
+-- FROM words w
+-- JOIN word_phonetics wp ON wp.word_id= w.id
+-- WHERE wp.syllable_sequence LIKE '%,ใจ'
+-- AND wp.tone_sequence LIKE 'rising,%'
+-- AND wp.syllable_count = 2
+-- GROUP BY w.id, w.spelling, w.lang, w.frequency
+-- ORDER BY w.frequency DESC NULLS LAST;
+-- 
+-- Indexes for fast pattern matching
+CREATE INDEX IF NOT EXISTS idx_word_patterns_syllables ON word_phonetics(syllable_sequence);
+CREATE INDEX IF NOT EXISTS idx_word_patterns_tones ON word_phonetics(tone_sequence);
+CREATE INDEX IF NOT EXISTS idx_word_patterns_count ON word_phonetics(syllable_count);
+CREATE INDEX IF NOT EXISTS idx_word_patterns_mixed ON word_phonetics(syllable_count, syllable_sequence, tone_sequence);
+
+
+CREATE INDEX IF NOT EXISTS idx_syllables_words_word_idx ON syllables_words(word_id, idx);
+CREATE INDEX IF NOT EXISTS idx_syllables_words_idx_word ON syllables_words(idx, word_id);
+CREATE INDEX IF NOT EXISTS idx_syllables_words_syl ON syllables_words(syl_id);
+
+-- 2. Syllables table indexes for text and language lookups
+CREATE INDEX IF NOT EXISTS idx_syllables_text_lang ON syllables(text, lang);
+CREATE INDEX IF NOT EXISTS idx_syllables_lang_text ON syllables(lang, text);
+CREATE INDEX IF NOT EXISTS idx_syllables_tone ON syllables(tone);
+CREATE INDEX IF NOT EXISTS idx_syllables_text_tone ON syllables(text, tone);
+
+-- 3. Tones table indexes
+CREATE INDEX IF NOT EXISTS idx_tones_name_lang ON tones(name, lang);
+CREATE INDEX IF NOT EXISTS idx_tones_nums_lang ON tones(nums, lang);
+CREATE INDEX IF NOT EXISTS idx_tones_lang_name ON tones(lang, name);
+
+-- 4. Words table indexes
+CREATE INDEX IF NOT EXISTS idx_words_lang_freq ON words(lang, frequency DESC);
+CREATE INDEX IF NOT EXISTS idx_words_id_lang ON words(id, lang);
+
+-- 5. Composite indexes for common query patterns
+CREATE INDEX IF NOT EXISTS idx_syllables_compound ON syllables(lang, text, tone);
+CREATE INDEX IF NOT EXISTS idx_syllables_words_compound ON syllables_words(word_id, idx, syl_id);
+
+
diff --git a/src/lib/db/thaiseed.ts b/src/lib/db/thaiseed.ts
index 6c69d9c..32434da 100644
--- a/src/lib/db/thaiseed.ts
+++ b/src/lib/db/thaiseed.ts
@@ -11,7 +11,7 @@ import {
 import pdb from "./prosodydb";
 import { cleanIpa } from "../utils";
 import { handleFile } from "./utils";
-import { Tone } from "../types/phonetics";
+import { Phoneme, Tone } from "../types/phonetics";
 import { AsyncRes } from "../types";
 
 async function readDump(lang: string) {
@@ -25,7 +25,7 @@ async function readDump(lang: string) {
   // langrows = langrows.slice(10);
   for (const langrow of langrows) {
     count++;
-    // console.log(count);
+    console.log(count);
     // if (count <= 10000) continue;
     // if (count > 100) break;
     const j = JSON.parse(langrow.data);
@@ -68,65 +68,101 @@ async function readDump(lang: string) {
 async function handleWord(word: string, j: any): AsyncRes<string> {
   // TODO add categories but add a tag to see what classifying scheme we're using
   //
-  const sounds = j.sounds || [];
-  const hasIpa = sounds.find((s: any) => "ipa" in s);
-  if (!hasIpa) return { error: "meh no ipa" };
-  const freq = await getThaiFreq(word);
-  const wordId = pdb.addWord(word, "th", freq, null);
-  if (wordId == 478 || word === "และ") {
-    console.log("wtf man");
-    console.dir(j, { depth: null });
-    return { error: "i said wtf" };
-  }
+  const frequency = await getThaiFreq(word);
   const analyzed = await analyzeTHWord(word);
-  for (let snd of sounds)
-    if ("ipa" in snd) {
-      const res = await handleIpa(wordId, j, snd, analyzed);
-      if ("error" in res) return res;
-    }
+  const phonetics = await Promise.all(getIpa(j, analyzed));
+
+  pdb.superAdd({ word, lang: "th", frequency, wordNotes: null, phonetics });
   return { ok: "" };
 }
-async function handleIpa(
-  wordId: number | bigint,
-  j: any,
-  snd: any,
-  analyzed: ThaiNLPRes,
-): AsyncRes<string> {
+/**
+ * Starts one `getIpaData` analysis per entry of `j.sounds` that has an `ipa`
+ * key and returns the pending promises (empty when no entry has one).
+ *
+ * This must stay synchronous: with an `async` reducer the accumulator turns
+ * into a Promise after the first step, so the result is no longer an array
+ * that the caller can hand to `Promise.all`.
+ */
+function getIpa(j: any, analyzed: ThaiNLPRes): Promise<IPAData>[] {
+  const sounds: any[] = j.sounds || [];
+  // Entries without an "ipa" key are skipped.
+  return sounds
+    .filter((snd) => "ipa" in snd)
+    .map((snd) => getIpaData(snd, analyzed));
+}
+type IPAData = {
+  ipa: string;
+  syllable_count: number;
+  syllable_sequence: string;
+  tone_sequence: string;
+  ipa_sequence: string;
+  tags: string | null;
+  notes: string | null;
+  wordRhyme: string | null;
+  syllables: SylData[];
+};
+async function getIpaData(snd: any, analyzed: ThaiNLPRes): Promise<IPAData> {
   const tags = JSON.stringify(snd.tags) || null;
   // console.log("handleipa", analyzed.syllables.length);
   // console.log(analyzed);
   const wikiIpa = cleanIpa(snd.ipa);
   const nlpIpa = cleanIpa(analyzed.ipa);
   const ipa = wikiIpa || nlpIpa;
-  if (j.word === "และ") {
-    console.log("wtf!!");
-    return { error: "wtf is this" };
-  }
+  // if (j.word === "และ") {
+  //   console.log("wtf!!");
+  // return { error: "wtf is this" };
+  // }
   const wikiIpaSplit = wikiIpa.split(".");
   const nlpIpaSplit = nlpIpa.split(".");
   if (wikiIpaSplit.length !== nlpIpaSplit.length) {
-    // console.log("ipa mismatch");
-    // console.log(wikiIpa);
-    // console.log(nlpIpa);
+    console.log("ipa mismatch");
+    console.log(wikiIpa);
+    console.log(nlpIpa);
   }
   if (analyzed.realSyls.length !== wikiIpaSplit.length) {
-    // console.log("syllable analysis mismatch", j.word);
-    // console.log({ syls: analyzed.syllables, ipa: wikiIpaSplit });
-    // console.dir(j, { depth: null });
-    return { error: "meh syllable analysis mismatch" };
+    console.log("syllable analysis mismatch", analyzed.word);
+    console.log({ syls: analyzed.syllables, ipa: wikiIpaSplit });
+    throw new Error("syllable mismatch");
   }
   const writtenSyls = analyzed.syllables;
-  const pronouncedSyls = analyzed.realSyls;
+  const pronouncedSyls = analyzed.realSyls.map((s) =>
+    s.replace(/\u{E3A}/u, ""),
+  );
+
+  const tone_sequence = wikiIpaSplit
+    .map((s) => parseTone(s, analyzed.word))
+    .map((t) => t.name)
+    .join(",");
+  const syllable_sequence = pronouncedSyls.join(",");
+  const ipa_sequence = wikiIpaSplit.join(",");
+  const syllables = await Promise.all(
+    getSyllables(writtenSyls, pronouncedSyls, wikiIpaSplit),
+  );
+  return {
+    ipa,
+    syllable_count: pronouncedSyls.length,
+    syllable_sequence,
+    tone_sequence,
+    ipa_sequence,
+    tags,
+    notes: null,
+    wordRhyme: null,
+    syllables,
+  };
+}
+function getSyllables(
+  writtenSyls: string[],
+  pronouncedSyls: string[],
+  ipaSyls: string[],
+) {
   let badSyls = false;
   if (writtenSyls.length !== pronouncedSyls.length) badSyls = true;
-
-  pdb.addPronunciation(wordId, ipa, pronouncedSyls.length, tags, null);
-
+  let syls: Promise<SylData>[] = [];
   for (let i = 0; i < pronouncedSyls.length; i++) {
-    const pronounced = pronouncedSyls[i]!.replace(/\u{E3A}/u, "");
+    const pronounced = pronouncedSyls[i]!;
     const written = writtenSyls[i] || "";
     const syllable = badSyls ? pronounced : written;
-    const ipa = wikiIpaSplit[i]!;
+    const ipa = ipaSyls[i]!;
     // TODO insert both??
     const notes = pronounced === written ? null : `Pronounced ${pronounced}`;
     if (pronounced !== syllable) {
@@ -134,10 +170,10 @@ async function handleIpa(
       console.log(pronounced);
       console.log(written);
     }
-    const res = await handleSyllable(syllable, ipa, wordId, i, notes);
-    if ("error" in res) return res;
+    const res = getSyllable(syllable, ipa, i, notes);
+    syls.push(res);
   }
-  return { ok: "" };
+  return syls;
 }
 const thaiTones: Record<string, string> = {
   "˧": "mid",
@@ -153,8 +189,22 @@ const thaiToneNums: Record<string, number> = {
   "˦˥": 45,
   "˩˩˦": 214,
 };
+const toneRegex = new RegExp(Object.keys(thaiToneNums).join("|"));
+
 function parseTone(ipa: string, spelling: string): Tone {
   try {
+    const match = ipa.match(toneRegex)!;
+    const m = match[0]!;
+    const name = thaiTones[m]!;
+    const numbers = thaiToneNums[m]!;
+    return { letters: ipa, name, numbers };
+  } catch (e) {
+    console.error("meh wrong tones!!", { s: spelling, ipa });
+    throw new Error("");
+  }
+}
+function parseToneS(ipa: string, spelling: string): Tone {
+  try {
     const name = thaiTones[ipa]!;
     const numbers = thaiToneNums[ipa]!;
     return { letters: ipa, name, numbers };
@@ -164,71 +214,44 @@ function parseTone(ipa: string, spelling: string): Tone {
   }
 }
 
-async function handleSyllable(
+type SylData = {
+  idx: number;
+  stressed: boolean | null;
+  spelling: string;
+  ipa: string;
+  long: boolean;
+  onset: Phoneme;
+  medial: Phoneme;
+  nucleus: Phoneme;
+  coda: Phoneme;
+  rhyme: Phoneme;
+  tone: Tone;
+  notes: string | null;
+};
+async function getSyllable(
   spelling: string,
   ipa: string,
-  wordId: number | bigint,
   idx: number,
   notes: string | null,
-): AsyncRes<string> {
+): Promise<SylData> {
   const sorsyl = await sorSyl(spelling, "th", ipa);
-  const weird = [
-    // "a̯n",
-    // "a̯",
-    // "a̯p",
-    // "a̯w",
-    // "a̯j",
-    // "a̯ŋ",
-    // "a̯k",
-    // "a̯t",
-    // "a̯m",
-    // "a̯ʔ",
-    // "ʔ",
-    "s",
-    "l",
-    "f",
-    "a̯s",
-    "js",
-    "t͡ɕʰ",
-    "ks",
-    "ns",
-    "a̯l",
-    "a̯f",
-    "mk",
-  ];
-  // const weirder = sorsyl.syls.find((s) => weird.includes(s.coda));
-  // if (weirder) {
-  //   console.log("syllable", spelling);
-  //   // console.dir(sorsyl, { depth: null });
-  //   // console.dir(j, { depth: null });
-  // }
   if (sorsyl.syls.length !== 1) throw new Error("wtf sorsyl!");
   const syl = sorsyl.syls[0]!.ipa;
-  const tone = parseTone(syl.tone, spelling);
-  // TODO add actual ortographic data here not just ipa
-  try {
-    pdb.addSyllable(
-      wordId,
-      idx + 1,
-      null,
-      "th",
-      syl.all,
-      syl.long,
-      spelling,
-      { spelling: syl.onset, ipa: syl.onset },
-      { spelling: syl.medial, ipa: syl.medial },
-      { spelling: syl.nucleus, ipa: syl.nucleus },
-      { spelling: syl.coda, ipa: syl.coda },
-      { spelling: syl.rhyme, ipa: syl.rhyme },
-      tone,
-      notes,
-    );
-    return { ok: "" };
-  } catch (e) {
-    // console.log("well fuck", syl);
-    // console.error(e);
-    return { error: `meh ${e}` };
-  }
+  const tone = parseToneS(syl.tone, spelling);
+  return {
+    idx: idx + 1,
+    stressed: null,
+    spelling,
+    ipa: syl.all,
+    long: syl.long,
+    onset: { spelling: syl.onset, ipa: syl.onset },
+    medial: { spelling: syl.medial, ipa: syl.medial },
+    nucleus: { spelling: syl.nucleus, ipa: syl.nucleus },
+    coda: { spelling: syl.coda, ipa: syl.coda },
+    rhyme: { spelling: syl.rhyme, ipa: syl.rhyme },
+    tone,
+    notes,
+  };
 }
 async function handleIdiom(idiom: string): AsyncRes<string> {
   pdb.addIdiom(idiom, "th");
@@ -236,33 +259,5 @@ async function handleIdiom(idiom: string): AsyncRes<string> {
   // console.log();
   return { ok: "" };
 }
-// ช้า ๆ
-// งก ๆ
-// หงก ๆ
-
-async function getFrequency() {
-  const files = [
-    "/home/y/code/prosody/resources/langdata/thai/data/1yin_freq.csv",
-    "/home/y/code/prosody/resources/langdata/thai/data/2yin_freq.csv",
-    "/home/y/code/prosody/resources/langdata/thai/data/3yin_freq.csv",
-    "/home/y/code/prosody/resources/langdata/thai/data/4yin_freq.csv",
-    "/home/y/code/prosody/resources/langdata/thai/data/5yin_freq.csv",
-    "/home/y/code/prosody/resources/langdata/thai/data/6yin_freq.csv",
-  ];
-  const freqMap = new Map<number, string>();
-  for (const file of files) {
-    await handleFile(file, (line, idx) => {
-      const [spelling, IPA, tone, length, frequency, ...rest] = line.split(",");
-      freqMap.set(Number(frequency!), spelling!);
-    });
-  }
-  const orderedMap = new Map<string, number>();
-  const keys = Array.from(freqMap.keys()).sort();
-  for (let i = 0; i < keys.length; i++) {
-    const val = freqMap.get(keys[i]!)!;
-    orderedMap.set(val, i + 1);
-  }
-  return orderedMap;
-}
 
 readDump("th");
diff --git a/src/lib/db/thaiseedold.ts b/src/lib/db/thaiseedold.ts
new file mode 100644
index 0000000..b9522dd
--- /dev/null
+++ b/src/lib/db/thaiseedold.ts
@@ -0,0 +1,301 @@
+import Database from "bun:sqlite";
+import {
+  analyzeTHWord,
+  deconstructSyllable,
+  segmentateThai,
+  type SorSyl,
+  type ThaiNLPRes,
+  sorSyl,
+  getThaiFreq,
+} from "../calls/nlp";
+import pdb from "./prosodydb";
+import { cleanIpa } from "../utils";
+import { handleFile } from "./utils";
+import { Tone } from "../types/phonetics";
+import { AsyncRes } from "../types";
+
+async function readDump(lang: string) {
+  await pdb.init();
+  pdb.addLanguage("th", "thai");
+  let count = 0;
+  const langdb = new Database(
+    `/home/y/code/prosody/resources/wiktionary/${lang}.db`,
+  );
+  let langrows: any = langdb.query("SELECT data FROM langs");
+  // langrows = langrows.slice(10);
+  for (const langrow of langrows) {
+    count++;
+    console.log(count);
+    // if (count <= 10000) continue;
+    // if (count > 100) break;
+    const j = JSON.parse(langrow.data);
+    const word = j.word.trim();
+    if (!word) continue;
+
+    if (word.includes("ๆ")) {
+      const res = await handleWord(word, j);
+      if ("error" in res) {
+        if (res.error.includes("meh")) continue;
+        if (res.error.includes("wtf")) {
+          console.error(res.error);
+          console.error(j.sounds);
+        }
+        break;
+      }
+    } else {
+      const split = word.split(" ");
+      if (split.length > 1) {
+        const res = await handleIdiom(word);
+        if ("error" in res) {
+          console.error(res.error);
+          break;
+        }
+      } else {
+        const res = await handleWord(word, j);
+        if ("error" in res) {
+          if (res.error.includes("meh")) continue;
+          if (res.error.includes("wtf")) {
+            console.error(res.error);
+            console.error(j.sounds);
+          }
+          // break;
+        }
+      }
+    }
+  }
+}
+
+// if (wordId == 478 || word === "และ") {
+//   // console.log("wtf man");
+//   // console.dir(j, { depth: null });
+//   // return { error: "i said wtf" };
+// }
+async function handleWord(word: string, j: any): AsyncRes<string> {
+  // TODO add categories but add a tag to see what classifying scheme we're using
+  //
+  const sounds = j.sounds || [];
+  const hasIpa = sounds.find((s: any) => "ipa" in s);
+  if (!hasIpa) return { error: "meh no ipa" };
+  const freq = await getThaiFreq(word);
+  const wordId = pdb.addWord(word, "th", freq, null);
+  const analyzed = await analyzeTHWord(word);
+  for (let snd of sounds)
+    if ("ipa" in snd) {
+      const res = await handleIpa(wordId, j, snd, analyzed);
+      if ("error" in res) return res;
+    }
+  return { ok: "" };
+}
+async function handleIpa(
+  wordId: number | bigint,
+  j: any,
+  snd: any,
+  analyzed: ThaiNLPRes,
+): AsyncRes<string> {
+  console.log();
+  const tags = JSON.stringify(snd.tags) || null;
+  // console.log("handleipa", analyzed.syllables.length);
+  // console.log(analyzed);
+  const wikiIpa = cleanIpa(snd.ipa);
+  const nlpIpa = cleanIpa(analyzed.ipa);
+  const ipa = wikiIpa || nlpIpa;
+  // if (j.word === "และ") {
+  //   console.log("wtf!!");
+  //   return { error: "wtf is this" };
+  // }
+  const wikiIpaSplit = wikiIpa.split(".");
+  const nlpIpaSplit = nlpIpa.split(".");
+  if (wikiIpaSplit.length !== nlpIpaSplit.length) {
+    // console.log("ipa mismatch");
+    // console.log(wikiIpa);
+    // console.log(nlpIpa);
+  }
+  if (analyzed.realSyls.length !== wikiIpaSplit.length) {
+    // console.log("syllable analysis mismatch", j.word);
+    // console.log({ syls: analyzed.syllables, ipa: wikiIpaSplit });
+    // console.dir(j, { depth: null });
+    return { error: "meh syllable analysis mismatch" };
+  }
+  const writtenSyls = analyzed.syllables;
+  const pronouncedSyls = analyzed.realSyls.map((s) =>
+    s.replace(/\u{E3A}/u, ""),
+  );
+  let badSyls = false;
+  if (writtenSyls.length !== pronouncedSyls.length) badSyls = true;
+
+  const tone_sequence = wikiIpaSplit
+    .map((s) => parseTone(s, j.word))
+    .map((t) => t.name)
+    .join(",");
+  const syl_sequence = pronouncedSyls.join(",");
+  const ipa_sequence = wikiIpaSplit.join(",");
+  pdb.addPronunciation(
+    wordId,
+    ipa,
+    pronouncedSyls.length,
+    syl_sequence,
+    tone_sequence,
+    ipa_sequence,
+    tags,
+    null,
+  );
+
+  for (let i = 0; i < pronouncedSyls.length; i++) {
+    const pronounced = pronouncedSyls[i]!;
+    const written = writtenSyls[i] || "";
+    const syllable = badSyls ? pronounced : written;
+    const ipa = wikiIpaSplit[i]!;
+    // TODO insert both??
+    const notes = pronounced === written ? null : `Pronounced ${pronounced}`;
+    if (pronounced !== syllable) {
+      console.log("diff");
+      console.log(pronounced);
+      console.log(written);
+    }
+    const res = await handleSyllable(syllable, ipa, wordId, i, notes);
+    if ("error" in res) return res;
+  }
+  return { ok: "" };
+}
+const thaiTones: Record<string, string> = {
+  "˧": "mid",
+  "˨˩": "low",
+  "˥˩": "falling",
+  "˦˥": "high",
+  "˩˩˦": "rising",
+};
+const thaiToneNums: Record<string, number> = {
+  "˧": 33,
+  "˨˩": 21,
+  "˥˩": 41,
+  "˦˥": 45,
+  "˩˩˦": 214,
+};
+const toneRegex = new RegExp(Object.keys(thaiToneNums).join("|"));
+
+function parseTone(ipa: string, spelling: string): Tone {
+  try {
+    const match = ipa.match(toneRegex)!;
+    const m = match[0]!;
+    const name = thaiTones[m]!;
+    const numbers = thaiToneNums[m]!;
+    return { letters: ipa, name, numbers };
+  } catch (e) {
+    console.error("meh wrong tones!!", { s: spelling, ipa });
+    throw new Error("");
+  }
+}
+// Tone for a bare tone-letter string. The lookups return undefined for an
+// unknown key rather than throwing, so the failure is detected explicitly.
+function parseToneS(ipa: string, spelling: string): Tone {
+  const [name, numbers] = [thaiTones[ipa], thaiToneNums[ipa]];
+  if (name === undefined || numbers === undefined) {
+    console.error("meh wrong tones!!", { s: spelling, ipa });
+    throw new Error(`unknown tone "${ipa}" in "${spelling}"`);
+  }
+  return { letters: ipa, name, numbers };
+}
+
+async function handleSyllable(
+  spelling: string,
+  ipa: string,
+  wordId: number | bigint,
+  idx: number,
+  notes: string | null,
+): AsyncRes<string> {
+  const sorsyl = await sorSyl(spelling, "th", ipa);
+  // console.log("ssyl", sorsyl.syls);
+  const weird = [
+    // "a̯n",
+    // "a̯",
+    // "a̯p",
+    // "a̯w",
+    // "a̯j",
+    // "a̯ŋ",
+    // "a̯k",
+    // "a̯t",
+    // "a̯m",
+    // "a̯ʔ",
+    // "ʔ",
+    "s",
+    "l",
+    "f",
+    "a̯s",
+    "js",
+    "t͡ɕʰ",
+    "ks",
+    "ns",
+    "a̯l",
+    "a̯f",
+    "mk",
+  ];
+  // const weirder = sorsyl.syls.find((s) => weird.includes(s.coda));
+  // if (weirder) {
+  //   console.log("syllable", spelling);
+  //   // console.dir(sorsyl, { depth: null });
+  //   // console.dir(j, { depth: null });
+  // }
+  if (sorsyl.syls.length !== 1) throw new Error("wtf sorsyl!");
+  const syl = sorsyl.syls[0]!.ipa;
+  const tone = parseToneS(syl.tone, spelling);
+  // TODO add actual ortographic data here not just ipa
+  try {
+    pdb.addSyllable(
+      wordId,
+      idx + 1,
+      null,
+      "th",
+      syl.all,
+      syl.long,
+      spelling,
+      { spelling: syl.onset, ipa: syl.onset },
+      { spelling: syl.medial, ipa: syl.medial },
+      { spelling: syl.nucleus, ipa: syl.nucleus },
+      { spelling: syl.coda, ipa: syl.coda },
+      { spelling: syl.rhyme, ipa: syl.rhyme },
+      tone,
+      notes,
+    );
+    return { ok: "" };
+  } catch (e) {
+    // console.log("well fuck", syl);
+    // console.error(e);
+    return { error: `meh ${e}` };
+  }
+}
+async function handleIdiom(idiom: string): AsyncRes<string> {
+  pdb.addIdiom(idiom, "th");
+  // TODO later set idiom_words once all words are populated
+  // console.log();
+  return { ok: "" };
+}
+// ช้า ๆ
+// งก ๆ
+// หงก ๆ
+
+/**
+ * Builds a spelling -> rank map from the six `Nyin_freq.csv` files; rank 1
+ * goes to the smallest frequency value read. The intermediate map is keyed by
+ * frequency, so of several lines with the same value only the last one read
+ * keeps a rank.
+ */
+async function getFrequency() {
+  const dataDir = "/home/y/code/prosody/resources/langdata/thai/data";
+  const files = [1, 2, 3, 4, 5, 6].map((n) => `${dataDir}/${n}yin_freq.csv`);
+  const freqMap = new Map<number, string>();
+  for (const file of files) {
+    await handleFile(file, (line) => {
+      const [spelling, , , , frequency] = line.split(",");
+      const freq = Number(frequency);
+      if (!Number.isFinite(freq)) return; // a NaN key cannot be ordered
+      freqMap.set(freq, spelling!);
+    });
+  }
+  // sort() without a comparator orders numbers as strings (10 before 9).
+  const keys = Array.from(freqMap.keys()).sort((a, b) => a - b);
+  const orderedMap = new Map<string, number>();
+  keys.forEach((key, i) => orderedMap.set(freqMap.get(key)!, i + 1));
+  return orderedMap;
+}
+
+readDump("th");
author	polwex <polwex@sortug.com>	2025-06-03 15:41:31 +0700
committer	polwex <polwex@sortug.com>	2025-06-03 15:41:31 +0700
commit	175ddca375cef765cec8ca5bbc527a205c40bf25 (patch)
tree	f2e47a5d85e4d5e0297613e5a17cebce7d09b09b /src/lib/db
parent	2401217a4019938d1c1cc61b6e33ccb233eb6e74 (diff)