Diffstat (limited to 'src/lib/db/thaiseedold.ts')
-rw-r--r--  src/lib/db/thaiseedold.ts  301
1 file changed, 301 insertions, 0 deletions
diff --git a/src/lib/db/thaiseedold.ts b/src/lib/db/thaiseedold.ts
new file mode 100644
index 0000000..b9522dd
--- /dev/null
+++ b/src/lib/db/thaiseedold.ts
@@ -0,0 +1,301 @@
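+// One-off seed script: reads the Thai Wiktionary dump out of SQLite and
+// populates prosodydb with words, idioms, pronunciations and syllables.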
+import Database from "bun:sqlite";
+import {
+ analyzeTHWord,
+ deconstructSyllable,
+ segmentateThai,
+ type SorSyl,
+ type ThaiNLPRes,
+ sorSyl,
+ getThaiFreq,
+} from "../calls/nlp";
+import pdb from "./prosodydb";
+import { cleanIpa } from "../utils";
+import { handleFile } from "./utils";
+import { Tone } from "../types/phonetics";
+import { AsyncRes } from "../types";
+
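+// Iterates over every entry in the Wiktionary dump for `lang`. Multi-word
+// entries become idioms (except ๆ reduplications); everything else is
+// inserted as a word with its pronunciations. Errors tagged "meh" just skip
+// the entry.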
+async function readDump(lang: string) {
+ await pdb.init();
+ pdb.addLanguage("th", "thai");
+ let count = 0;
+ const langdb = new Database(
+ `/home/y/code/prosody/resources/wiktionary/${lang}.db`,
+ );
+ let langrows = langdb.query("SELECT data FROM langs").all() as { data: string }[];
+ // langrows = langrows.slice(10);
+ for (const langrow of langrows) {
+ count++;
+ console.log(count);
+ // if (count <= 10000) continue;
+ // if (count > 100) break;
+ const j = JSON.parse(langrow.data);
+ const word = (j.word || "").trim();
+ if (!word) continue;
+
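+ // ๆ (mai yamok) marks reduplication: these entries may contain a space but are single words, not idioms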
+ if (word.includes("ๆ")) {
+ const res = await handleWord(word, j);
+ if ("error" in res) {
+ if (res.error.includes("meh")) continue;
+ if (res.error.includes("wtf")) {
+ console.error(res.error);
+ console.error(j.sounds);
+ }
+ break;
+ }
+ } else {
+ const split = word.split(" ");
+ if (split.length > 1) {
+ const res = await handleIdiom(word);
+ if ("error" in res) {
+ console.error(res.error);
+ break;
+ }
+ } else {
+ const res = await handleWord(word, j);
+ if ("error" in res) {
+ if (res.error.includes("meh")) continue;
+ if (res.error.includes("wtf")) {
+ console.error(res.error);
+ console.error(j.sounds);
+ }
+ // break;
+ }
+ }
+ }
+ }
+}
+
+// if (wordId == 478 || word === "และ") {
+// // console.log("wtf man");
+// // console.dir(j, { depth: null });
+// // return { error: "i said wtf" };
+// }
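+// Inserts one word: skips entries without IPA, looks up corpus frequency,
+// adds the word row, then stores a pronunciation for every IPA variant in
+// the Wiktionary entry.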
+async function handleWord(word: string, j: any): AsyncRes<string> {
+ // TODO add categories but add a tag to see what classifying scheme we're using
+ //
+ const sounds = j.sounds || [];
+ const hasIpa = sounds.find((s: any) => "ipa" in s);
+ if (!hasIpa) return { error: "meh no ipa" };
+ const freq = await getThaiFreq(word);
+ const wordId = pdb.addWord(word, "th", freq, null);
+ const analyzed = await analyzeTHWord(word);
+ for (const snd of sounds) {
+ if (!("ipa" in snd)) continue;
+ const res = await handleIpa(wordId, j, snd, analyzed);
+ if ("error" in res) return res;
+ }
+ return { ok: "" };
+}
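+// Stores one pronunciation: compares the Wiktionary IPA against the NLP
+// analysis, derives the tone sequence per syllable, then inserts each
+// syllable. Bails out with "meh" when the syllable counts disagree.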
+async function handleIpa(
+ wordId: number | bigint,
+ j: any,
+ snd: any,
+ analyzed: ThaiNLPRes,
+): AsyncRes<string> {
+ const tags = snd.tags ? JSON.stringify(snd.tags) : null;
+ // console.log("handleipa", analyzed.syllables.length);
+ // console.log(analyzed);
+ const wikiIpa = cleanIpa(snd.ipa);
+ const nlpIpa = cleanIpa(analyzed.ipa);
+ const ipa = wikiIpa || nlpIpa;
+ // if (j.word === "และ") {
+ // console.log("wtf!!");
+ // return { error: "wtf is this" };
+ // }
+ const wikiIpaSplit = wikiIpa.split(".");
+ const nlpIpaSplit = nlpIpa.split(".");
+ if (wikiIpaSplit.length !== nlpIpaSplit.length) {
+ // console.log("ipa mismatch");
+ // console.log(wikiIpa);
+ // console.log(nlpIpa);
+ }
+ if (analyzed.realSyls.length !== wikiIpaSplit.length) {
+ // console.log("syllable analysis mismatch", j.word);
+ // console.log({ syls: analyzed.syllables, ipa: wikiIpaSplit });
+ // console.dir(j, { depth: null });
+ return { error: "meh syllable analysis mismatch" };
+ }
+ const writtenSyls = analyzed.syllables;
+ const pronouncedSyls = analyzed.realSyls.map((s) =>
+ s.replace(/\u{E3A}/u, ""),
+ );
+ const badSyls = writtenSyls.length !== pronouncedSyls.length;
+
+ const tone_sequence = wikiIpaSplit
+ .map((s) => parseTone(s, j.word))
+ .map((t) => t.name)
+ .join(",");
+ const syl_sequence = pronouncedSyls.join(",");
+ const ipa_sequence = wikiIpaSplit.join(",");
+ pdb.addPronunciation(
+ wordId,
+ ipa,
+ pronouncedSyls.length,
+ syl_sequence,
+ tone_sequence,
+ ipa_sequence,
+ tags,
+ null,
+ );
+
+ for (let i = 0; i < pronouncedSyls.length; i++) {
+ const pronounced = pronouncedSyls[i]!;
+ const written = writtenSyls[i] || "";
+ const syllable = badSyls ? pronounced : written;
+ const sylIpa = wikiIpaSplit[i]!;
+ // TODO insert both??
+ const notes = pronounced === written ? null : `Pronounced ${pronounced}`;
+ if (pronounced !== syllable) {
+ console.log("written/pronounced syllable mismatch");
+ console.log(pronounced);
+ console.log(written);
+ }
+ const res = await handleSyllable(syllable, sylIpa, wordId, i, notes);
+ if ("error" in res) return res;
+ }
+ return { ok: "" };
+}
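+// Chao tone letters as they appear in the IPA, mapped to the five Thai tone
+// names and to the numeric contour values stored with each syllable.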
+const thaiTones: Record<string, string> = {
+ "˧": "mid",
+ "˨˩": "low",
+ "˥˩": "falling",
+ "˦˥": "high",
+ "˩˩˦": "rising",
+};
+const thaiToneNums: Record<string, number> = {
+ "˧": 33,
+ "˨˩": 21,
+ "˥˩": 41,
+ "˦˥": 45,
+ "˩˩˦": 214,
+};
+const toneRegex = new RegExp(Object.keys(thaiToneNums).join("|"));
+
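+// parseTone pulls the tone letters out of a full syllable IPA string;
+// parseToneS expects the bare tone letters (e.g. syl.tone from sorSyl).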
+function parseTone(ipa: string, spelling: string): Tone {
+ const match = ipa.match(toneRegex);
+ if (!match) {
+ console.error("meh wrong tones!!", { s: spelling, ipa });
+ throw new Error(`no tone letters found in ${ipa}`);
+ }
+ const m = match[0]!;
+ return { letters: ipa, name: thaiTones[m]!, numbers: thaiToneNums[m]! };
+}
+function parseToneS(ipa: string, spelling: string): Tone {
+ const name = thaiTones[ipa];
+ const numbers = thaiToneNums[ipa];
+ if (name === undefined || numbers === undefined) {
+ console.error("meh wrong tones!!", { s: spelling, ipa });
+ throw new Error(`unknown tone letters: ${ipa}`);
+ }
+ return { letters: ipa, name, numbers };
+}
+
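+// Runs a single syllable through sorSyl and stores its onset, medial,
+// nucleus, coda, rhyme and tone. For now the IPA parts double as the
+// orthographic ones (see the TODO below).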
+async function handleSyllable(
+ spelling: string,
+ ipa: string,
+ wordId: number | bigint,
+ idx: number,
+ notes: string | null,
+): AsyncRes<string> {
+ const sorsyl = await sorSyl(spelling, "th", ipa);
+ // console.log("ssyl", sorsyl.syls);
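+ // coda values considered weird; only referenced by the disabled check below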
+ const weird = [
+ // "a̯n",
+ // "a̯",
+ // "a̯p",
+ // "a̯w",
+ // "a̯j",
+ // "a̯ŋ",
+ // "a̯k",
+ // "a̯t",
+ // "a̯m",
+ // "a̯ʔ",
+ // "ʔ",
+ "s",
+ "l",
+ "f",
+ "a̯s",
+ "js",
+ "t͡ɕʰ",
+ "ks",
+ "ns",
+ "a̯l",
+ "a̯f",
+ "mk",
+ ];
+ // const weirder = sorsyl.syls.find((s) => weird.includes(s.coda));
+ // if (weirder) {
+ // console.log("syllable", spelling);
+ // // console.dir(sorsyl, { depth: null });
+ // // console.dir(j, { depth: null });
+ // }
+ if (sorsyl.syls.length !== 1) throw new Error("wtf sorsyl!");
+ const syl = sorsyl.syls[0]!.ipa;
+ const tone = parseToneS(syl.tone, spelling);
+ // TODO add actual orthographic data here, not just IPA
+ try {
+ pdb.addSyllable(
+ wordId,
+ idx + 1,
+ null,
+ "th",
+ syl.all,
+ syl.long,
+ spelling,
+ { spelling: syl.onset, ipa: syl.onset },
+ { spelling: syl.medial, ipa: syl.medial },
+ { spelling: syl.nucleus, ipa: syl.nucleus },
+ { spelling: syl.coda, ipa: syl.coda },
+ { spelling: syl.rhyme, ipa: syl.rhyme },
+ tone,
+ notes,
+ );
+ return { ok: "" };
+ } catch (e) {
+ // console.log("well fuck", syl);
+ // console.error(e);
+ return { error: `meh ${e}` };
+ }
+}
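+// Multi-word entries are stored as bare idioms; linking them to their
+// component words is deferred until all words are in (see TODO).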
+async function handleIdiom(idiom: string): AsyncRes<string> {
+ pdb.addIdiom(idiom, "th");
+ // TODO later set idiom_words once all words are populated
+ // console.log();
+ return { ok: "" };
+}
+// ช้า ๆ
+// งก ๆ
+// หงก ๆ
+
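+// Builds a spelling -> frequency-rank map from the yin_freq CSVs. Not called
+// in this seeder; word frequencies come from getThaiFreq instead.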
+async function getFrequency() {
+ const files = [
+ "/home/y/code/prosody/resources/langdata/thai/data/1yin_freq.csv",
+ "/home/y/code/prosody/resources/langdata/thai/data/2yin_freq.csv",
+ "/home/y/code/prosody/resources/langdata/thai/data/3yin_freq.csv",
+ "/home/y/code/prosody/resources/langdata/thai/data/4yin_freq.csv",
+ "/home/y/code/prosody/resources/langdata/thai/data/5yin_freq.csv",
+ "/home/y/code/prosody/resources/langdata/thai/data/6yin_freq.csv",
+ ];
+ const freqMap = new Map<number, string>();
+ for (const file of files) {
+ await handleFile(file, (line, idx) => {
+ const [spelling, IPA, tone, length, frequency, ...rest] = line.split(",");
+ freqMap.set(Number(frequency!), spelling!);
+ });
+ }
+ const orderedMap = new Map<string, number>();
+ const keys = Array.from(freqMap.keys()).sort((a, b) => a - b);
+ for (let i = 0; i < keys.length; i++) {
+ const val = freqMap.get(keys[i]!)!;
+ orderedMap.set(val, i + 1);
+ }
+ return orderedMap;
+}
+
+await readDump("th");