import Database from "bun:sqlite";
import {
  analyzeTHWord,
  deconstructSyllable,
  segmentateThai,
  type SorSyl,
  type ThaiNLPRes,
  sorSyl,
  getThaiFreq,
} from "../calls/nlp";
import pdb from "./prosodydb";
import { cleanIpa } from "../utils";
import { handleFile } from "./utils";
import { Phoneme, Tone } from "../types/phonetics";
import { AsyncRes } from "../types";

/**
 * Raised for problems with a single dictionary entry (syllable count mismatch,
 * unknown tone, unexpected syllable analysis). `handleWord` converts these into
 * an `{ error }` result so `readDump` can decide whether to skip or stop; any
 * other exception keeps propagating.
 */
class WordDataError extends Error {
  constructor(message: string) {
    super(message);
    this.name = "WordDataError";
  }
}

/**
 * Reads every row of the `langs` table in the wiktionary dump for `lang` and
 * stores each entry in the prosody database.
 *
 * Entries containing a space (and no "ๆ") are stored as idioms; everything else
 * goes through `handleWord`. Errors whose message contains "meh" are skipped,
 * errors containing "wtf" are logged together with the entry's sounds. An error
 * on a word containing "ๆ", or on an idiom, stops the import; errors on other
 * single words do not.
 *
 * @param lang Language code used to build the dump file name.
 */
async function readDump(lang: string): Promise<void> {
  await pdb.init();
  pdb.addLanguage("th", "thai");
  let count = 0;
  const langdb = new Database(
    `/home/y/code/prosody/resources/wiktionary/${lang}.db`,
  );
  try {
    // The statement is iterated directly, one row at a time.
    const langrows: any = langdb.query("SELECT data FROM langs");
    for (const langrow of langrows) {
      count++;
      console.log(count);
      const j = JSON.parse(langrow.data);
      // Entries without a usable headword are skipped instead of crashing on `.trim()`.
      const word = typeof j.word === "string" ? j.word.trim() : "";
      if (!word) continue;

      const hasRepeatMark = word.includes("ๆ");
      if (!hasRepeatMark && word.split(" ").length > 1) {
        const res = await handleIdiom(word);
        if ("error" in res) {
          console.error(res.error);
          break;
        }
        continue;
      }

      const res = await handleWord(word, j);
      if (!("error" in res)) continue;
      if (res.error.includes("meh")) continue;
      if (res.error.includes("wtf")) {
        console.error(res.error);
        console.error(j.sounds);
      }
      // Only words with the repetition mark abort the run; other single words keep going.
      if (hasRepeatMark) break;
    }
  } finally {
    // Release the SQLite handle even when the loop exits early or throws.
    langdb.close();
  }
}

/**
 * Analyzes one word and stores it with its frequency and phonetics.
 *
 * @param word Trimmed headword.
 * @param j Parsed wiktionary entry (unvalidated JSON); only `j.sounds` is read here.
 * @returns `{ ok: "" }` on success, or `{ error }` carrying the message of a
 *   `WordDataError` raised while building the phonetics.
 * @throws Any exception that is not a `WordDataError`.
 */
async function handleWord(word: string, j: any): AsyncRes {
  // TODO add categories but add a tag to see what classifying scheme we're using
  try {
    const frequency = await getThaiFreq(word);
    const analyzed = await analyzeTHWord(word);
    const phonetics = await Promise.all(getIpa(j, analyzed));
    pdb.superAdd({ word, lang: "th", frequency, wordNotes: null, phonetics });
    return { ok: "" };
  } catch (e: unknown) {
    if (e instanceof WordDataError) return { error: e.message };
    throw e;
  }
}

/**
 * Starts one `getIpaData` computation per sound of the entry that has an `ipa` key.
 *
 * @param j Parsed wiktionary entry; `j.sounds` may be missing.
 * @param analyzed NLP analysis of the entry's word.
 * @returns One pending promise per IPA-bearing sound (empty when there is none).
 */
function getIpa(j: any, analyzed: ThaiNLPRes): Promise<IPAData>[] {
  const sounds: any[] = j.sounds || [];
  // A plain filter/map: an async reducer would hand back a Promise, not an array.
  return sounds
    .filter((snd) => snd != null && typeof snd === "object" && "ipa" in snd)
    .map((snd) => getIpaData(snd, analyzed));
}

type IPAData = {
  ipa: string;
  syllable_count: number;
  syllable_sequence: string;
  tone_sequence: string;
  ipa_sequence: string;
  tags: string | null;
  notes: string | null;
  wordRhyme: string | null;
  syllables: SylData[];
};

// U+0E3A, stripped (first occurrence only) from each pronounced syllable.
const phinthuRegex = /\u{E3A}/u;

/**
 * Builds the phonetic record for one wiktionary sound.
 *
 * The wiktionary IPA is split on "." and must have as many parts as
 * `analyzed.realSyls`; a differing part count between the wiktionary IPA and
 * the NLP IPA is only logged.
 *
 * @param snd Wiktionary sound object; `snd.ipa` and `snd.tags` are read.
 * @param analyzed NLP analysis of the word.
 * @throws WordDataError "syllable mismatch" when the syllable counts differ, or
 *   whatever `parseTone` / `getSyllable` raise.
 */
async function getIpaData(snd: any, analyzed: ThaiNLPRes): Promise<IPAData> {
  const tags = JSON.stringify(snd.tags) || null;
  const wikiIpa = cleanIpa(snd.ipa);
  const nlpIpa = cleanIpa(analyzed.ipa);
  const ipa = wikiIpa || nlpIpa;
  const wikiIpaSplit = wikiIpa.split(".");
  const nlpIpaSplit = nlpIpa.split(".");
  if (wikiIpaSplit.length !== nlpIpaSplit.length) {
    console.log("ipa mismatch");
    console.log(wikiIpa);
    console.log(nlpIpa);
  }
  if (analyzed.realSyls.length !== wikiIpaSplit.length) {
    console.log("syllable analysis mismatch", analyzed.word);
    console.log({ syls: analyzed.syllables, ipa: wikiIpaSplit });
    throw new WordDataError("syllable mismatch");
  }
  const writtenSyls = analyzed.syllables;
  const pronouncedSyls = analyzed.realSyls.map((s) =>
    s.replace(phinthuRegex, ""),
  );
  const tone_sequence = wikiIpaSplit
    .map((s) => parseTone(s, analyzed.word).name)
    .join(",");
  const syllable_sequence = pronouncedSyls.join(",");
  const ipa_sequence = wikiIpaSplit.join(",");
  const syllables = await Promise.all(
    getSyllables(writtenSyls, pronouncedSyls, wikiIpaSplit),
  );
  return {
    ipa,
    syllable_count: pronouncedSyls.length,
    syllable_sequence,
    tone_sequence,
    ipa_sequence,
    tags,
    notes: null,
    wordRhyme: null,
    syllables,
  };
}

/**
 * Starts one `getSyllable` computation per pronounced syllable.
 *
 * When the written and pronounced lists differ in length they cannot be paired
 * by index, so the pronounced form is used as the spelling of every syllable.
 *
 * @param writtenSyls Syllables as written.
 * @param pronouncedSyls Syllables as pronounced; drives the iteration.
 * @param ipaSyls IPA per syllable; callers pass a list as long as `pronouncedSyls`.
 * @returns One pending promise per pronounced syllable, in order.
 */
function getSyllables(
  writtenSyls: string[],
  pronouncedSyls: string[],
  ipaSyls: string[],
): Promise<SylData>[] {
  const badSyls = writtenSyls.length !== pronouncedSyls.length;
  return pronouncedSyls.map((pronounced, i) => {
    const written = writtenSyls[i] || "";
    const syllable = badSyls ? pronounced : written;
    const ipa = ipaSyls[i]!;
    // TODO insert both??
    const notes = pronounced === written ? null : `Pronounced ${pronounced}`;
    if (pronounced !== syllable) {
      console.log("diff");
      console.log(pronounced);
      console.log(written);
    }
    return getSyllable(syllable, ipa, i, notes);
  });
}

const thaiTones: Record<string, string> = {
  "˧": "mid",
  "˨˩": "low",
  "˥˩": "falling",
  "˦˥": "high",
  "˩˩˦": "rising",
};
const thaiToneNums: Record<string, number> = {
  "˧": 33,
  "˨˩": 21,
  "˥˩": 41,
  "˦˥": 45,
  "˩˩˦": 214,
};
const toneRegex = new RegExp(Object.keys(thaiToneNums).join("|"));

/**
 * Looks up the tone for a tone-letter string in both tables.
 * Logs and throws when the letters are missing or unknown.
 */
function lookupTone(
  letters: string | undefined,
  ipa: string,
  spelling: string,
): Tone {
  const name = letters === undefined ? undefined : thaiTones[letters];
  const numbers = letters === undefined ? undefined : thaiToneNums[letters];
  if (
    letters === undefined ||
    typeof name !== "string" ||
    typeof numbers !== "number"
  ) {
    console.error("meh wrong tones!!", { s: spelling, ipa });
    // The message carries the "meh" marker that readDump uses to skip the entry.
    throw new WordDataError("meh wrong tones!!");
  }
  return { letters, name, numbers };
}

/**
 * Finds the first known tone-letter sequence inside a syllable's IPA.
 *
 * @param ipa IPA of one syllable, tone letters included.
 * @param spelling Spelling reported in the error log.
 * @returns The tone; `letters` holds the matched tone letters.
 * @throws WordDataError when no known tone letters occur in `ipa`.
 */
function parseTone(ipa: string, spelling: string): Tone {
  return lookupTone(ipa.match(toneRegex)?.[0], ipa, spelling);
}

/**
 * Resolves a string that consists of tone letters only.
 *
 * @param ipa The tone letters.
 * @param spelling Spelling reported in the error log.
 * @throws WordDataError when `ipa` is not a key of the tone tables.
 */
function parseToneS(ipa: string, spelling: string): Tone {
  return lookupTone(ipa, ipa, spelling);
}

type SylData = {
  idx: number;
  stressed: boolean | null;
  spelling: string;
  ipa: string;
  long: boolean;
  onset: Phoneme;
  medial: Phoneme;
  nucleus: Phoneme;
  coda: Phoneme;
  rhyme: Phoneme;
  tone: Tone;
  notes: string | null;
};

/**
 * Builds the record for one syllable from the `sorSyl` analysis of its
 * spelling and IPA.
 *
 * @param spelling Syllable spelling to store.
 * @param ipa IPA of the syllable.
 * @param idx Zero-based position in the word; stored one-based.
 * @param notes Optional note stored with the syllable.
 * @throws WordDataError "wtf sorsyl!" unless `sorSyl` yields exactly one
 *   syllable, or a tone error from `parseToneS`.
 */
async function getSyllable(
  spelling: string,
  ipa: string,
  idx: number,
  notes: string | null,
): Promise<SylData> {
  const sorsyl = await sorSyl(spelling, "th", ipa);
  if (sorsyl.syls.length !== 1) throw new WordDataError("wtf sorsyl!");
  const syl = sorsyl.syls[0]!.ipa;
  const tone = parseToneS(syl.tone, spelling);
  // Each phoneme slot reuses the IPA segment as its spelling.
  const phoneme = (segment: string): Phoneme => ({
    spelling: segment,
    ipa: segment,
  });
  return {
    idx: idx + 1,
    stressed: null,
    spelling,
    ipa: syl.all,
    long: syl.long,
    onset: phoneme(syl.onset),
    medial: phoneme(syl.medial),
    nucleus: phoneme(syl.nucleus),
    coda: phoneme(syl.coda),
    rhyme: phoneme(syl.rhyme),
    tone,
    notes,
  };
}

/**
 * Stores a multi-word entry as a Thai idiom.
 *
 * @param idiom The full idiom text.
 * @returns Always `{ ok: "" }`.
 */
async function handleIdiom(idiom: string): AsyncRes {
  pdb.addIdiom(idiom, "th");
  // TODO later set idiom_words once all words are populated
  return { ok: "" };
}

// Script entry point: report a failed import instead of leaving the promise floating.
readDump("th").catch((err: unknown) => {
  console.error(err);
  process.exitCode = 1;
});