Diffstat (limited to 'src/lib/db/thaiseedold.ts')
-rw-r--r--  src/lib/db/thaiseedold.ts  301
1 file changed, 301 insertions, 0 deletions
diff --git a/src/lib/db/thaiseedold.ts b/src/lib/db/thaiseedold.ts
new file mode 100644
index 0000000..b9522dd
--- /dev/null
+++ b/src/lib/db/thaiseedold.ts
@@ -0,0 +1,301 @@
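+// One-off seed script: reads the Thai Wiktionary dump out of SQLite and
+// populates prosodydb with words, idioms, pronunciations and syllables.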
+import Database from "bun:sqlite";
+import {
+ analyzeTHWord,
+ deconstructSyllable,
+ segmentateThai,
+ type SorSyl,
+ type ThaiNLPRes,
+ sorSyl,
+ getThaiFreq,
+} from "../calls/nlp";
+import pdb from "./prosodydb";
+import { cleanIpa } from "../utils";
+import { handleFile } from "./utils";
+import { Tone } from "../types/phonetics";
+import { AsyncRes } from "../types";
+
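+// Iterates over every entry in the Wiktionary dump for `lang`. Multi-word
+// entries become idioms (except ๆ reduplications); everything else is
+// inserted as a word with its pronunciations. Errors tagged "meh" just skip
+// the entry.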
+async function readDump(lang: string) {
+ await pdb.init();
+ pdb.addLanguage("th", "thai");
+ let count = 0;
+ const langdb = new Database(
+ `/home/y/code/prosody/resources/wiktionary/${lang}.db`,
+ );
+ let langrows = langdb.query("SELECT data FROM langs").all() as { data: string }[];
+ // langrows = langrows.slice(10);
+ for (const langrow of langrows) {
+ count++;
+ console.log(count);
+ // if (count <= 10000) continue;
+ // if (count > 100) break;
+ const j = JSON.parse(langrow.data);
+ const word = (j.word || "").trim();
+ if (!word) continue;
+
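+ // ๆ (mai yamok) marks reduplication: these entries may contain a space but are single words, not idioms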
+ if (word.includes("ๆ")) {
+ const res = await handleWord(word, j);
+ if ("error" in res) {
+ if (res.error.includes("meh")) continue;
+ if (res.error.includes("wtf")) {
+ console.error(res.error);
+ console.error(j.sounds);
+ }
+ break;
+ }
+ } else {
+ const split = word.split(" ");
+ if (split.length > 1) {
+ const res = await handleIdiom(word);
+ if ("error" in res) {
+ console.error(res.error);
+ break;
+ }
+ } else {
+ const res = await handleWord(word, j);
+ if ("error" in res) {
+ if (res.error.includes("meh")) continue;
+ if (res.error.includes("wtf")) {
+ console.error(res.error);
+ console.error(j.sounds);
+ }
+ // break;
+ }
+ }
+ }
+ }
+}
+
+// if (wordId == 478 || word === "และ") {
+// // console.log("wtf man");
+// // console.dir(j, { depth: null });
+// // return { error: "i said wtf" };
+// }
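+// Inserts one word: skips entries without IPA, looks up corpus frequency,
+// adds the word row, then stores a pronunciation for every IPA variant in
+// the Wiktionary entry.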
+async function handleWord(word: string, j: any): AsyncRes<string> {
+ // TODO add categories but add a tag to see what classifying scheme we're using
+ //
+ const sounds = j.sounds || [];
+ const hasIpa = sounds.find((s: any) => "ipa" in s);
+ if (!hasIpa) return { error: "meh no ipa" };
+ const freq = await getThaiFreq(word);
+ const wordId = pdb.addWord(word, "th", freq, null);
+ const analyzed = await analyzeTHWord(word);
+ for (const snd of sounds) {
+ if (!("ipa" in snd)) continue;
+ const res = await handleIpa(wordId, j, snd, analyzed);
+ if ("error" in res) return res;
+ }
+ return { ok: "" };
+}
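+// Stores one pronunciation: compares the Wiktionary IPA against the NLP
+// analysis, derives the tone sequence per syllable, then inserts each
+// syllable. Bails out with "meh" when the syllable counts disagree.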
+async function handleIpa(
+ wordId: number | bigint,
+ j: any,
+ snd: any,
+ analyzed: ThaiNLPRes,
+): AsyncRes<string> {
+ const tags = snd.tags ? JSON.stringify(snd.tags) : null;
+ // console.log("handleipa", analyzed.syllables.length);
+ // console.log(analyzed);
+ const wikiIpa = cleanIpa(snd.ipa);
+ const nlpIpa = cleanIpa(analyzed.ipa);
+ const ipa = wikiIpa || nlpIpa;
+ // if (j.word === "และ") {
+ // console.log("wtf!!");
+ // return { error: "wtf is this" };
+ // }
+ const wikiIpaSplit = wikiIpa.split(".");
+ const nlpIpaSplit = nlpIpa.split(".");
+ if (wikiIpaSplit.length !== nlpIpaSplit.length) {
+ // console.log("ipa mismatch");
+ // console.log(wikiIpa);
+ // console.log(nlpIpa);
+ }
+ if (analyzed.realSyls.length !== wikiIpaSplit.length) {
+ // console.log("syllable analysis mismatch", j.word);
+ // console.log({ syls: analyzed.syllables, ipa: wikiIpaSplit });
+ // console.dir(j, { depth: null });
+ return { error: "meh syllable analysis mismatch" };
+ }
+ const writtenSyls = analyzed.syllables;
+ const pronouncedSyls = analyzed.realSyls.map((s) =>
+ s.replace(/\u{E3A}/u, ""),
+ );
+ const badSyls = writtenSyls.length !== pronouncedSyls.length;
+
+ const tone_sequence = wikiIpaSplit
+ .map((s) => parseTone(s, j.word))
+ .map((t) => t.name)
+ .join(",");
+ const syl_sequence = pronouncedSyls.join(",");
+ const ipa_sequence = wikiIpaSplit.join(",");
+ pdb.addPronunciation(
+ wordId,
+ ipa,
+ pronouncedSyls.length,
+ syl_sequence,
+ tone_sequence,
+ ipa_sequence,
+ tags,
+ null,
+ );
+
+ for (let i = 0; i < pronouncedSyls.length; i++) {
+ const pronounced = pronouncedSyls[i]!;
+ const written = writtenSyls[i] || "";
+ const syllable = badSyls ? pronounced : written;
+ const sylIpa = wikiIpaSplit[i]!;
+ // TODO insert both??
+ const notes = pronounced === written ? null : `Pronounced ${pronounced}`;
+ if (pronounced !== syllable) {
+ console.log("written/pronounced syllable mismatch");
+ console.log(pronounced);
+ console.log(written);
+ }
+ const res = await handleSyllable(syllable, sylIpa, wordId, i, notes);
+ if ("error" in res) return res;
+ }
+ return { ok: "" };
+}
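+// Chao tone letters as they appear in the IPA, mapped to the five Thai tone
+// names and to the numeric contour values stored with each syllable.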
+const thaiTones: Record<string, string> = {
+ "˧": "mid",
+ "˨˩": "low",
+ "˥˩": "falling",
+ "˦˥": "high",
+ "˩˩˦": "rising",
+};
+const thaiToneNums: Record<string, number> = {
+ "˧": 33,
+ "˨˩": 21,
+ "˥˩": 41,
+ "˦˥": 45,
+ "˩˩˦": 214,
+};
+const toneRegex = new RegExp(Object.keys(thaiToneNums).join("|"));
+
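+// parseTone pulls the tone letters out of a full syllable IPA string;
+// parseToneS expects the bare tone letters (e.g. syl.tone from sorSyl).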
+function parseTone(ipa: string, spelling: string): Tone {
+ const match = ipa.match(toneRegex);
+ if (!match) {
+ console.error("meh wrong tones!!", { s: spelling, ipa });
+ throw new Error(`no tone letters found in ${ipa}`);
+ }
+ const m = match[0]!;
+ return { letters: ipa, name: thaiTones[m]!, numbers: thaiToneNums[m]! };
+}
+function parseToneS(ipa: string, spelling: string): Tone {
+ const name = thaiTones[ipa];
+ const numbers = thaiToneNums[ipa];
+ if (name === undefined || numbers === undefined) {
+ console.error("meh wrong tones!!", { s: spelling, ipa });
+ throw new Error(`unknown tone letters: ${ipa}`);
+ }
+ return { letters: ipa, name, numbers };
+}
+
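+// Runs a single syllable through sorSyl and stores its onset, medial,
+// nucleus, coda, rhyme and tone. For now the IPA parts double as the
+// orthographic ones (see the TODO below).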
+async function handleSyllable(
+ spelling: string,
+ ipa: string,
+ wordId: number | bigint,
+ idx: number,
+ notes: string | null,
+): AsyncRes<string> {
+ const sorsyl = await sorSyl(spelling, "th", ipa);
+ // console.log("ssyl", sorsyl.syls);
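+ // coda values considered weird; only referenced by the disabled check below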
+ const weird = [
+ // "a̯n",
+ // "a̯",
+ // "a̯p",
+ // "a̯w",
+ // "a̯j",
+ // "a̯ŋ",
+ // "a̯k",
+ // "a̯t",
+ // "a̯m",
+ // "a̯ʔ",
+ // "ʔ",
+ "s",
+ "l",
+ "f",
+ "a̯s",
+ "js",
+ "t͡ɕʰ",
+ "ks",
+ "ns",
+ "a̯l",
+ "a̯f",
+ "mk",
+ ];
+ // const weirder = sorsyl.syls.find((s) => weird.includes(s.coda));
+ // if (weirder) {
+ // console.log("syllable", spelling);
+ // // console.dir(sorsyl, { depth: null });
+ // // console.dir(j, { depth: null });
+ // }
+ if (sorsyl.syls.length !== 1) throw new Error("wtf sorsyl!");
+ const syl = sorsyl.syls[0]!.ipa;
+ const tone = parseToneS(syl.tone, spelling);
+ // TODO add actual orthographic data here, not just IPA
+ try {
+ pdb.addSyllable(
+ wordId,
+ idx + 1,
+ null,
+ "th",
+ syl.all,
+ syl.long,
+ spelling,
+ { spelling: syl.onset, ipa: syl.onset },
+ { spelling: syl.medial, ipa: syl.medial },
+ { spelling: syl.nucleus, ipa: syl.nucleus },
+ { spelling: syl.coda, ipa: syl.coda },
+ { spelling: syl.rhyme, ipa: syl.rhyme },
+ tone,
+ notes,
+ );
+ return { ok: "" };
+ } catch (e) {
+ // console.log("well fuck", syl);
+ // console.error(e);
+ return { error: `meh ${e}` };
+ }
+}
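+// Multi-word entries are stored as bare idioms; linking them to their
+// component words is deferred until all words are in (see TODO).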
+async function handleIdiom(idiom: string): AsyncRes<string> {
+ pdb.addIdiom(idiom, "th");
+ // TODO later set idiom_words once all words are populated
+ // console.log();
+ return { ok: "" };
+}
+// ช้า ๆ
+// งก ๆ
+// หงก ๆ
+
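+// Builds a spelling -> frequency-rank map from the yin_freq CSVs. Not called
+// in this seeder; word frequencies come from getThaiFreq instead.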
+async function getFrequency() {
+ const files = [
+ "/home/y/code/prosody/resources/langdata/thai/data/1yin_freq.csv",
+ "/home/y/code/prosody/resources/langdata/thai/data/2yin_freq.csv",
+ "/home/y/code/prosody/resources/langdata/thai/data/3yin_freq.csv",
+ "/home/y/code/prosody/resources/langdata/thai/data/4yin_freq.csv",
+ "/home/y/code/prosody/resources/langdata/thai/data/5yin_freq.csv",
+ "/home/y/code/prosody/resources/langdata/thai/data/6yin_freq.csv",
+ ];
+ const freqMap = new Map<number, string>();
+ for (const file of files) {
+ await handleFile(file, (line, idx) => {
+ const [spelling, IPA, tone, length, frequency, ...rest] = line.split(",");
+ freqMap.set(Number(frequency!), spelling!);
+ });
+ }
+ const orderedMap = new Map<string, number>();
+ const keys = Array.from(freqMap.keys()).sort((a, b) => a - b);
+ for (let i = 0; i < keys.length; i++) {
+ const val = freqMap.get(keys[i]!)!;
+ orderedMap.set(val, i + 1);
+ }
+ return orderedMap;
+}
+
+await readDump("th");