1 files changed, 124 insertions, 129 deletions
diff --git a/src/lib/db/thaiseed.ts b/src/lib/db/thaiseed.ts
index 6c69d9c..32434da 100644
--- a/src/lib/db/thaiseed.ts
+++ b/src/lib/db/thaiseed.ts
@@ -11,7 +11,7 @@ import {
 import pdb from "./prosodydb";
 import { cleanIpa } from "../utils";
 import { handleFile } from "./utils";
-import { Tone } from "../types/phonetics";
+import { Phoneme, Tone } from "../types/phonetics";
 import { AsyncRes } from "../types";
 
 async function readDump(lang: string) {
@@ -25,7 +25,7 @@ async function readDump(lang: string) {
   // langrows = langrows.slice(10);
   for (const langrow of langrows) {
     count++;
-    // console.log(count);
+    console.log(count);
     // if (count <= 10000) continue;
     // if (count > 100) break;
     const j = JSON.parse(langrow.data);
@@ -68,65 +68,101 @@ async function readDump(lang: string) {
 async function handleWord(word: string, j: any): AsyncRes<string> {
   // TODO add categories but add a tag to see what classifying scheme we're using
   //
-  const sounds = j.sounds || [];
-  const hasIpa = sounds.find((s: any) => "ipa" in s);
-  if (!hasIpa) return { error: "meh no ipa" };
-  const freq = await getThaiFreq(word);
-  const wordId = pdb.addWord(word, "th", freq, null);
-  if (wordId == 478 || word === "และ") {
-    console.log("wtf man");
-    console.dir(j, { depth: null });
-    return { error: "i said wtf" };
-  }
+  const frequency = await getThaiFreq(word);
   const analyzed = await analyzeTHWord(word);
-  for (let snd of sounds)
-    if ("ipa" in snd) {
-      const res = await handleIpa(wordId, j, snd, analyzed);
-      if ("error" in res) return res;
-    }
+  const phonetics = await Promise.all(getIpa(j, analyzed)); // rejects if any entry fails
+  // NOTE(review): superAdd also runs when phonetics is empty (no IPA entry) — confirm.
+  pdb.superAdd({ word, lang: "th", frequency, wordNotes: null, phonetics });
   return { ok: "" };
 }
-async function handleIpa(
-  wordId: number | bigint,
-  j: any,
-  snd: any,
-  analyzed: ThaiNLPRes,
-): AsyncRes<string> {
+/**
+ * Kicks off IPA extraction for every entry of `j.sounds` that has an `ipa` key.
+ * @returns one pending `getIpaData` promise per such entry, in source order;
+ *   an empty array when `j.sounds` is missing or holds no IPA entry.
+ *
+ * Uses filter/map rather than an async `reduce`: an async reducer returns a
+ * Promise that becomes the next accumulator, so the call yields a Promise
+ * instead of an array and `Promise.all` rejects it as non-iterable.
+ */
+function getIpa(j: any, analyzed: ThaiNLPRes): Promise<IPAData>[] {
+  const sounds: any[] = j.sounds || [];
+  return sounds
+    .filter((snd) => "ipa" in snd)
+    .map((snd) => getIpaData(snd, analyzed));
+}
+type IPAData = { // one pronunciation of a word, as assembled by getIpaData
+  ipa: string; // cleaned `snd.ipa`, or the cleaned analyzer IPA when that is empty
+  syllable_count: number; // number of pronounced syllables
+  syllable_sequence: string; // pronounced syllables joined with ","
+  tone_sequence: string; // tone name of each IPA syllable, joined with ","
+  ipa_sequence: string; // "."-split IPA syllables re-joined with ","
+  tags: string | null; // JSON.stringify(snd.tags), or null when that is falsy
+  notes: string | null; // getIpaData always sets null
+  wordRhyme: string | null; // getIpaData always sets null
+  syllables: SylData[]; // resolved output of getSyllables
+};
+async function getIpaData(snd: any, analyzed: ThaiNLPRes): Promise<IPAData> {
   const tags = JSON.stringify(snd.tags) || null;
   // console.log("handleipa", analyzed.syllables.length);
   // console.log(analyzed);
   const wikiIpa = cleanIpa(snd.ipa);
   const nlpIpa = cleanIpa(analyzed.ipa);
   const ipa = wikiIpa || nlpIpa;
-  if (j.word === "และ") {
-    console.log("wtf!!");
-    return { error: "wtf is this" };
-  }
+  // Compare syllable counts two ways: entry IPA vs analyzer IPA (a mismatch
+  // is only logged) and analyzer realSyls vs entry IPA (a mismatch throws
+  // "syllable mismatch", rejecting the promise this function returns).
+  // The entry IPA split on "." drives the tone and IPA sequences below.
   const wikiIpaSplit = wikiIpa.split(".");
   const nlpIpaSplit = nlpIpa.split(".");
   if (wikiIpaSplit.length !== nlpIpaSplit.length) {
-    // console.log("ipa mismatch");
-    // console.log(wikiIpa);
-    // console.log(nlpIpa);
+    console.log("ipa mismatch");
+    console.log(wikiIpa);
+    console.log(nlpIpa);
   }
   if (analyzed.realSyls.length !== wikiIpaSplit.length) {
-    // console.log("syllable analysis mismatch", j.word);
-    // console.log({ syls: analyzed.syllables, ipa: wikiIpaSplit });
-    // console.dir(j, { depth: null });
-    return { error: "meh syllable analysis mismatch" };
+    console.log("syllable analysis mismatch", analyzed.word);
+    console.log({ syls: analyzed.syllables, ipa: wikiIpaSplit });
+    throw new Error("syllable mismatch");
   }
   const writtenSyls = analyzed.syllables;
-  const pronouncedSyls = analyzed.realSyls;
+  const pronouncedSyls = analyzed.realSyls.map((s) =>
+    s.replace(/\u{E3A}/u, ""), // no g flag: strips only the first U+0E3A (Thai phinthu)
+  );
+
+  const tone_sequence = wikiIpaSplit
+    .map((s) => parseTone(s, analyzed.word))
+    .map((t) => t.name)
+    .join(",");
+  const syllable_sequence = pronouncedSyls.join(",");
+  const ipa_sequence = wikiIpaSplit.join(",");
+  const syllables = await Promise.all(
+    getSyllables(writtenSyls, pronouncedSyls, wikiIpaSplit),
+  );
+  return {
+    ipa,
+    syllable_count: pronouncedSyls.length,
+    syllable_sequence,
+    tone_sequence,
+    ipa_sequence,
+    tags,
+    notes: null,
+    wordRhyme: null,
+    syllables,
+  };
+}
+function getSyllables(
+  writtenSyls: string[],
+  pronouncedSyls: string[],
+  ipaSyls: string[],
+) {
   let badSyls = false;
   if (writtenSyls.length !== pronouncedSyls.length) badSyls = true;
-
-  pdb.addPronunciation(wordId, ipa, pronouncedSyls.length, tags, null);
-
+  let syls: Promise<SylData>[] = [];
   for (let i = 0; i < pronouncedSyls.length; i++) {
-    const pronounced = pronouncedSyls[i]!.replace(/\u{E3A}/u, "");
+    const pronounced = pronouncedSyls[i]!;
     const written = writtenSyls[i] || "";
     const syllable = badSyls ? pronounced : written;
-    const ipa = wikiIpaSplit[i]!;
+    const ipa = ipaSyls[i]!;
     // TODO insert both??
     const notes = pronounced === written ? null : `Pronounced ${pronounced}`;
     if (pronounced !== syllable) {
@@ -134,10 +170,10 @@ async function handleIpa(
       console.log(pronounced);
       console.log(written);
     }
-    const res = await handleSyllable(syllable, ipa, wordId, i, notes);
-    if ("error" in res) return res;
+    const res = getSyllable(syllable, ipa, i, notes);
+    syls.push(res);
   }
-  return { ok: "" };
+  return syls;
 }
 const thaiTones: Record<string, string> = {
   "˧": "mid",
@@ -153,8 +189,22 @@ const thaiToneNums: Record<string, number> = {
   "˦˥": 45,
   "˩˩˦": 214,
 };
+const toneRegex = new RegExp(Object.keys(thaiToneNums).join("|"));
+/** Describes the first known tone-letter run found in `ipa`; logs and throws when there is none. */
 function parseTone(ipa: string, spelling: string): Tone {
   try {
+    const m = (ipa.match(toneRegex) || [""])[0]!;
+    const name = thaiTones[m];
+    const numbers = thaiToneNums[m];
+    if (name === undefined || numbers === undefined) throw new Error("no tone");
+    return { letters: m, name, numbers }; // only the matched tone letters
+  } catch (e) {
+    console.error("meh wrong tones!!", { s: spelling, ipa });
+    throw new Error(`no known tone in "${ipa}" (spelling: ${spelling})`);
+  }
+}
+function parseToneS(ipa: string, spelling: string): Tone {
+  try {
     const name = thaiTones[ipa]!;
     const numbers = thaiToneNums[ipa]!;
     return { letters: ipa, name, numbers };
@@ -164,71 +214,44 @@ function parseTone(ipa: string, spelling: string): Tone {
   }
 }
 
-async function handleSyllable(
+type SylData = { // one syllable of a pronunciation, as assembled by getSyllable
+  idx: number; // getSyllable stores its `idx` argument plus one
+  stressed: boolean | null; // getSyllable always sets null
+  spelling: string; // the spelling passed to getSyllable, unchanged
+  ipa: string; // `all` field of the syllable returned by sorSyl
+  long: boolean; // `long` field of the syllable returned by sorSyl
+  onset: Phoneme;
+  medial: Phoneme;
+  nucleus: Phoneme;
+  coda: Phoneme;
+  rhyme: Phoneme;
+  tone: Tone; // parsed from the syllable's tone string by parseToneS
+  notes: string | null; // passed through from the caller
+};
+async function getSyllable(
   spelling: string,
   ipa: string,
-  wordId: number | bigint,
   idx: number,
   notes: string | null,
-): AsyncRes<string> {
+): Promise<SylData> { // rejects unless sorSyl yields exactly one syllable
   const sorsyl = await sorSyl(spelling, "th", ipa);
-  const weird = [
-    // "a̯n",
-    // "a̯",
-    // "a̯p",
-    // "a̯w",
-    // "a̯j",
-    // "a̯ŋ",
-    // "a̯k",
-    // "a̯t",
-    // "a̯m",
-    // "a̯ʔ",
-    // "ʔ",
-    "s",
-    "l",
-    "f",
-    "a̯s",
-    "js",
-    "t͡ɕʰ",
-    "ks",
-    "ns",
-    "a̯l",
-    "a̯f",
-    "mk",
-  ];
-  // const weirder = sorsyl.syls.find((s) => weird.includes(s.coda));
-  // if (weirder) {
-  //   console.log("syllable", spelling);
-  //   // console.dir(sorsyl, { depth: null });
-  //   // console.dir(j, { depth: null });
-  // }
   if (sorsyl.syls.length !== 1) throw new Error("wtf sorsyl!");
   const syl = sorsyl.syls[0]!.ipa;
-  const tone = parseTone(syl.tone, spelling);
-  // TODO add actual ortographic data here not just ipa
-  try {
-    pdb.addSyllable(
-      wordId,
-      idx + 1,
-      null,
-      "th",
-      syl.all,
-      syl.long,
-      spelling,
-      { spelling: syl.onset, ipa: syl.onset },
-      { spelling: syl.medial, ipa: syl.medial },
-      { spelling: syl.nucleus, ipa: syl.nucleus },
-      { spelling: syl.coda, ipa: syl.coda },
-      { spelling: syl.rhyme, ipa: syl.rhyme },
-      tone,
-      notes,
-    );
-    return { ok: "" };
-  } catch (e) {
-    // console.log("well fuck", syl);
-    // console.error(e);
-    return { error: `meh ${e}` };
-  }
+  const tone = parseToneS(syl.tone, spelling); // syl.tone is looked up as an exact key
+  return {
+    idx: idx + 1, // stored one higher than the index passed in
+    stressed: null,
+    spelling,
+    ipa: syl.all,
+    long: syl.long,
+    onset: { spelling: syl.onset, ipa: syl.onset }, // TODO real orthography: spelling is a copy of the IPA
+    medial: { spelling: syl.medial, ipa: syl.medial },
+    nucleus: { spelling: syl.nucleus, ipa: syl.nucleus },
+    coda: { spelling: syl.coda, ipa: syl.coda },
+    rhyme: { spelling: syl.rhyme, ipa: syl.rhyme },
+    tone,
+    notes,
+  };
 }
 async function handleIdiom(idiom: string): AsyncRes<string> {
   pdb.addIdiom(idiom, "th");
@@ -236,33 +259,5 @@ async function handleIdiom(idiom: string): AsyncRes<string> {
   // console.log();
   return { ok: "" };
 }
-// ช้า ๆ
-// งก ๆ
-// หงก ๆ
-
-async function getFrequency() {
-  const files = [
-    "/home/y/code/prosody/resources/langdata/thai/data/1yin_freq.csv",
-    "/home/y/code/prosody/resources/langdata/thai/data/2yin_freq.csv",
-    "/home/y/code/prosody/resources/langdata/thai/data/3yin_freq.csv",
-    "/home/y/code/prosody/resources/langdata/thai/data/4yin_freq.csv",
-    "/home/y/code/prosody/resources/langdata/thai/data/5yin_freq.csv",
-    "/home/y/code/prosody/resources/langdata/thai/data/6yin_freq.csv",
-  ];
-  const freqMap = new Map<number, string>();
-  for (const file of files) {
-    await handleFile(file, (line, idx) => {
-      const [spelling, IPA, tone, length, frequency, ...rest] = line.split(",");
-      freqMap.set(Number(frequency!), spelling!);
-    });
-  }
-  const orderedMap = new Map<string, number>();
-  const keys = Array.from(freqMap.keys()).sort();
-  for (let i = 0; i < keys.length; i++) {
-    const val = freqMap.get(keys[i]!)!;
-    orderedMap.set(val, i + 1);
-  }
-  return orderedMap;
-}
 
 readDump("th");