import Database from "bun:sqlite";
import {
  analyzeTHWord,
  deconstructSyllable,
  segmentateThai,
  type SorSyl,
  type ThaiNLPRes,
  sorSyl,
  getThaiFreq,
} from "../calls/nlp";
import pdb from "./prosodydb";
import { cleanIpa } from "../utils";
import { handleFile } from "./utils";
import { Tone } from "../types/phonetics";
import { AsyncRes } from "../types";

/**
 * Reads every row of the `langs` table of the Wiktionary dump for `lang` and
 * stores the entries through `pdb`.
 *
 * Each row's `data` column is parsed as JSON. Entries whose word contains a
 * space (and no "ๆ") are stored as idioms; everything else goes through
 * {@link handleWord}. Errors containing "meh" are skipped silently, errors
 * containing "wtf" are logged together with the entry's `sounds`.
 *
 * @param lang Name of the dump file (without extension) to open.
 */
async function readDump(lang: string) {
  await pdb.init();
  // NOTE(review): the language registered here is always Thai, regardless of `lang`.
  pdb.addLanguage("th", "thai");
  let count = 0;
  const langdb = new Database(
    `/home/y/code/prosody/resources/wiktionary/${lang}.db`,
  );
  try {
    const langrows: any = langdb.query("SELECT data FROM langs");
    for (const langrow of langrows) {
      count++;
      console.log(count);
      const j = JSON.parse(langrow.data);
      const word = j.word.trim();
      if (!word) continue;

      // Words with the repetition mark "ๆ" are always treated as single words,
      // even when they contain a space (e.g. "ช้า ๆ").
      const hasRepetitionMark = word.includes("ๆ");
      if (!hasRepetitionMark && word.split(" ").length > 1) {
        const res = await handleIdiom(word);
        if ("error" in res) {
          console.error(res.error);
          break;
        }
        continue;
      }

      const res = await handleWord(word, j);
      if ("error" in res) {
        if (res.error.includes("meh")) continue;
        if (res.error.includes("wtf")) {
          console.error(res.error);
          console.error(j.sounds);
        }
        // NOTE(review): only repetition-mark words abort the import on a
        // non-"meh" error; plain words keep going. Confirm this asymmetry is
        // intended.
        if (hasRepetitionMark) break;
      }
    }
  } finally {
    // Release the SQLite handle even when the loop breaks or throws.
    langdb.close();
  }
}

/**
 * Stores a single word and every pronunciation of it that carries an `ipa`
 * field.
 *
 * @param word Trimmed spelling of the word.
 * @param j Parsed dump entry; only `j.sounds` (and `j.word`, downstream) is read.
 * @returns `{ error: "meh no ipa" }` when no sound has an `ipa` field, the
 *   first error returned by {@link handleIpa}, or `{ ok: "" }`.
 */
async function handleWord(word: string, j: any): AsyncRes {
  // TODO add categories but add a tag to see what classifying scheme we're using
  const sounds = j.sounds || [];
  const hasIpa = sounds.some((s: any) => "ipa" in s);
  if (!hasIpa) return { error: "meh no ipa" };

  const freq = await getThaiFreq(word);
  const wordId = pdb.addWord(word, "th", freq, null);
  const analyzed = await analyzeTHWord(word);
  for (const snd of sounds) {
    if (!("ipa" in snd)) continue;
    const res = await handleIpa(wordId, j, snd, analyzed);
    if ("error" in res) return res;
  }
  return { ok: "" };
}

/**
 * Stores one pronunciation of a word plus one syllable row per IPA syllable.
 *
 * The IPA from the dump (falling back to the analyzer's IPA when the cleaned
 * dump IPA is empty) is split on "." into syllables. When the analyzer's
 * `realSyls` count differs from that, the pronunciation is rejected.
 *
 * @param wordId Id returned by `pdb.addWord`.
 * @param j Parsed dump entry; `j.word` is used for error context.
 * @param snd One element of the entry's `sounds`, with `ipa` and optional `tags`.
 * @param analyzed Analyzer result for the word.
 * @returns `{ error: "meh syllable analysis mismatch" }` on a count mismatch,
 *   the first error from {@link handleSyllable}, or `{ ok: "" }`.
 * @throws Error when a syllable's IPA contains no known tone (see {@link parseTone}).
 */
async function handleIpa(
  wordId: number | bigint,
  j: any,
  snd: any,
  analyzed: ThaiNLPRes,
): AsyncRes {
  console.log();
  const tags = JSON.stringify(snd.tags) || null;
  const wikiIpa = cleanIpa(snd.ipa);
  const nlpIpa = cleanIpa(analyzed.ipa);
  const ipa = wikiIpa || nlpIpa;

  const wikiIpaSplit = wikiIpa.split(".");
  if (analyzed.realSyls.length !== wikiIpaSplit.length) {
    return { error: "meh syllable analysis mismatch" };
  }

  const writtenSyls = analyzed.syllables;
  // Drop the first U+0E3A (phinthu) from each pronounced syllable.
  const pronouncedSyls = analyzed.realSyls.map((s) =>
    s.replace(/\u{E3A}/u, ""),
  );
  // When written and pronounced syllables do not line up one-to-one, the
  // pronounced form is stored as the syllable spelling instead.
  const badSyls = writtenSyls.length !== pronouncedSyls.length;

  const tone_sequence = wikiIpaSplit
    .map((s) => parseTone(s, j.word).name)
    .join(",");
  const syl_sequence = pronouncedSyls.join(",");
  const ipa_sequence = wikiIpaSplit.join(",");
  pdb.addPronunciation(
    wordId,
    ipa,
    pronouncedSyls.length,
    syl_sequence,
    tone_sequence,
    ipa_sequence,
    tags,
    null,
  );

  for (let i = 0; i < pronouncedSyls.length; i++) {
    const pronounced = pronouncedSyls[i]!;
    const written = writtenSyls[i] || "";
    const syllable = badSyls ? pronounced : written;
    // Same length as pronouncedSyls, guaranteed by the mismatch check above.
    const sylIpa = wikiIpaSplit[i]!;
    // TODO insert both??
    const notes = pronounced === written ? null : `Pronounced ${pronounced}`;
    if (pronounced !== syllable) {
      console.log("diff");
      console.log(pronounced);
      console.log(written);
    }
    const res = await handleSyllable(syllable, sylIpa, wordId, i, notes);
    if ("error" in res) return res;
  }
  return { ok: "" };
}

/** Tone name for each Thai tone written in IPA tone letters. */
const thaiTones: Record<string, string> = {
  "˧": "mid",
  "˨˩": "low",
  "˥˩": "falling",
  "˦˥": "high",
  "˩˩˦": "rising",
};

/** Numeric tone contour for each Thai tone written in IPA tone letters. */
const thaiToneNums: Record<string, number> = {
  "˧": 33,
  "˨˩": 21,
  "˥˩": 41,
  "˦˥": 45,
  "˩˩˦": 214,
};

/** Matches the first known tone-letter sequence inside an IPA string. */
const toneRegex = new RegExp(Object.keys(thaiToneNums).join("|"));

/**
 * Extracts the tone from the IPA of a whole syllable.
 *
 * @param ipa IPA of one syllable, tone letters included.
 * @param spelling Spelling used only for error context.
 * @returns The tone whose letters were found in `ipa`.
 * @throws Error when `ipa` contains none of the known tone-letter sequences.
 */
function parseTone(ipa: string, spelling: string): Tone {
  const letters = ipa.match(toneRegex)?.[0];
  const name = letters === undefined ? undefined : thaiTones[letters];
  const numbers = letters === undefined ? undefined : thaiToneNums[letters];
  if (letters === undefined || name === undefined || numbers === undefined) {
    console.error("meh wrong tones!!", { s: spelling, ipa });
    throw new Error(`no known tone in "${ipa}" (${spelling})`);
  }
  // Report the matched tone letters, not the whole syllable, so the result
  // has the same shape as parseToneS's.
  return { letters, name, numbers };
}

/**
 * Looks up a tone given exactly its tone letters.
 *
 * @param ipa Tone letters only (e.g. "˨˩").
 * @param spelling Spelling used only for error context.
 * @returns The matching tone.
 * @throws Error when `ipa` is not one of the known tone-letter sequences.
 */
function parseToneS(ipa: string, spelling: string): Tone {
  const name = thaiTones[ipa];
  const numbers = thaiToneNums[ipa];
  // A missing key yields undefined rather than an exception, so check explicitly.
  if (name === undefined || numbers === undefined) {
    console.error("meh wrong tones!!", { s: spelling, ipa });
    throw new Error(`unknown tone "${ipa}" (${spelling})`);
  }
  return { letters: ipa, name, numbers };
}

/**
 * Analyzes one syllable with `sorSyl` and stores it through `pdb.addSyllable`.
 *
 * @param spelling Spelling of the syllable.
 * @param ipa IPA of the syllable.
 * @param wordId Id of the word the syllable belongs to.
 * @param idx Zero-based position of the syllable; stored as `idx + 1`.
 * @param notes Free-form note stored with the syllable, or null.
 * @returns `{ ok: "" }`, or `{ error: "meh …" }` when the tone is unknown or
 *   the insert throws.
 * @throws Error when `sorSyl` does not return exactly one syllable.
 */
async function handleSyllable(
  spelling: string,
  ipa: string,
  wordId: number | bigint,
  idx: number,
  notes: string | null,
): AsyncRes {
  const sorsyl = await sorSyl(spelling, "th", ipa);
  // Codas previously flagged as suspicious while debugging:
  // "s", "l", "f", "a̯s", "js", "t͡ɕʰ", "ks", "ns", "a̯l", "a̯f", "mk"
  if (sorsyl.syls.length !== 1) throw new Error("wtf sorsyl!");
  const syl = sorsyl.syls[0]!.ipa;
  // TODO add actual ortographic data here not just ipa
  try {
    // Inside the try so an unknown tone is reported as a skippable "meh"
    // error instead of storing a syllable without tone data.
    const tone = parseToneS(syl.tone, spelling);
    pdb.addSyllable(
      wordId,
      idx + 1,
      null,
      "th",
      syl.all,
      syl.long,
      spelling,
      { spelling: syl.onset, ipa: syl.onset },
      { spelling: syl.medial, ipa: syl.medial },
      { spelling: syl.nucleus, ipa: syl.nucleus },
      { spelling: syl.coda, ipa: syl.coda },
      { spelling: syl.rhyme, ipa: syl.rhyme },
      tone,
      notes,
    );
    return { ok: "" };
  } catch (e) {
    return { error: `meh ${e}` };
  }
}

/**
 * Stores a multi-word entry as an idiom.
 *
 * @param idiom Full text of the idiom.
 * @returns Always `{ ok: "" }`.
 */
async function handleIdiom(idiom: string): AsyncRes {
  pdb.addIdiom(idiom, "th");
  // TODO later set idiom_words once all words are populated
  return { ok: "" };
}

// ช้า ๆ
// งก ๆ
// หงก ๆ

/**
 * Builds a spelling → rank map from the Thai frequency CSV files.
 *
 * Each line is split on ","; the first field is read as the spelling and the
 * fifth as the frequency. Ranks start at 1 and follow ascending numeric
 * frequency.
 *
 * NOTE(review): entries are keyed by frequency, so of several spellings with
 * the same frequency only the last one read keeps a rank — confirm whether
 * that is intended.
 *
 * @returns Map from spelling to its 1-based rank.
 */
async function getFrequency() {
  const files = [
    "/home/y/code/prosody/resources/langdata/thai/data/1yin_freq.csv",
    "/home/y/code/prosody/resources/langdata/thai/data/2yin_freq.csv",
    "/home/y/code/prosody/resources/langdata/thai/data/3yin_freq.csv",
    "/home/y/code/prosody/resources/langdata/thai/data/4yin_freq.csv",
    "/home/y/code/prosody/resources/langdata/thai/data/5yin_freq.csv",
    "/home/y/code/prosody/resources/langdata/thai/data/6yin_freq.csv",
  ];
  const freqMap = new Map<number, string>();
  for (const file of files) {
    await handleFile(file, (line, idx) => {
      const fields = line.split(",");
      const spelling = fields[0];
      const frequency = Number(fields[4]);
      // Skip rows without a usable frequency (e.g. a header line); NaN keys
      // would make the numeric sort below inconsistent.
      if (!spelling || !Number.isFinite(frequency)) return;
      freqMap.set(frequency, spelling);
    });
  }
  const orderedMap = new Map<string, number>();
  // Explicit comparator: the default sort compares numbers as strings.
  const keys = Array.from(freqMap.keys()).sort((a, b) => a - b);
  keys.forEach((key, i) => {
    orderedMap.set(freqMap.get(key)!, i + 1);
  });
  return orderedMap;
}

readDump("th");