import Database from "bun:sqlite";
import {
  analyzeTHWord,
  deconstructSyllable,
  segmentateThai,
  type SorSyl,
  type ThaiNLPRes,
  sorSyl,
  getThaiFreq,
  SorBSyl,
} from "../calls/nlp";
import pdb from "./prosodydb";
import { cleanIpa } from "../utils";
import { handleFile } from "./utils";
import { Tone } from "../types/phonetics";
import { AsyncRes } from "../types";

/**
 * Dumped with `console.dir` at the end of `readDump`.
 * NOTE(review): nothing in this file appends to it yet.
 */
const errors: string[] = [];

/**
 * Reads the Wiktionary SQLite dump for `lang` and feeds each entry into the
 * prosody database: entries whose headword contains a space go through
 * `handleIdiom`, everything else through `handleWord`.
 *
 * Processing stops after 300 rows, or at the first entry whose handler
 * returns an `error` result (the error is logged before stopping).
 *
 * @param lang Basename of the `.db` file to open; also forwarded to the handlers.
 */
async function readDump(lang: string): Promise<void> {
  await pdb.init();
  // NOTE(review): the registered language is hard-coded to Thai regardless of
  // `lang` — confirm this is intended.
  pdb.addLanguage("th", "thai");
  let count = 0;
  const langdb = new Database(
    `/home/y/code/prosody/resources/wiktionary/${lang}.db`,
  );
  try {
    // Kept as `any`: the row shape ({ data: string }) is only implied by usage below.
    const langrows: any = langdb.query("SELECT data FROM langs");
    // langrows = langrows.slice(10);
    const freqMap = await getFrequency();
    for (const langrow of langrows) {
      count++;
      console.log(count);
      // if (count <= 10000) continue;
      if (count > 300) break;
      const j = JSON.parse(langrow.data);
      // Rows without a string headword are skipped instead of crashing on `.trim()`.
      const word: string = typeof j?.word === "string" ? j.word.trim() : "";
      if (!word) continue;
      const res = word.includes(" ")
        ? await handleIdiom(lang, word)
        : await handleWord(lang, word, j, freqMap);
      if ("error" in res) {
        // NOTE(review): this also stops the import at the first word without
        // IPA ("meh no ipa") — confirm whether it should collect into `errors`
        // and continue instead.
        console.error(res.error);
        break;
      }
    }
  } finally {
    // Release the SQLite handle even when a row throws.
    langdb.close();
  }
  console.dir(errors);
}

/**
 * Processes a single-word entry: every `sounds` item that carries an `ipa`
 * key is passed to `handleIpa`.
 *
 * @param lang Language code forwarded to `handleIpa`.
 * @param word Trimmed headword.
 * @param j Parsed JSON entry; `j.sounds` is read here.
 * @param freqMap Spelling → frequency rank, as produced by `getFrequency`.
 * @returns `{ error: "meh no ipa" }` when no sound has an `ipa` key, the first
 *   error returned by `handleIpa`, or `{ ok: "" }`.
 */
async function handleWord(
  lang: string,
  word: string,
  j: any,
  freqMap: Map<string, number>,
): AsyncRes {
  // TODO add categories but add a tag to see what classifying scheme we're using
  const sounds: any[] = Array.isArray(j.sounds) ? j.sounds : [];
  const hasIpa = sounds.some((s: any) => "ipa" in s);
  const hwikiRhyme = sounds.find((s: any) => "rhymes" in s);
  const wikiRhyme = hwikiRhyme ? hwikiRhyme.rhymes : null;
  if (!hasIpa) {
    // console.error("no ipa!!", word);
    // console.dir(j, { depth: null });
    return { error: "meh no ipa" };
  }
  // Only consumed by the disabled `pdb.addWord` call below.
  const freq = freqMap.get(word) ?? null;
  // const wordId = pdb.addWord(word, lang, freq, null);
  // WIPE
  // NOTE(review): placeholder id while `pdb.addWord` is commented out.
  const wordId = 0;
  // console.log(analyzed);
  for (const snd of sounds) {
    if (!("ipa" in snd)) continue;
    const res = await handleIpa(wordId, word, lang, j, snd, wikiRhyme);
    if ("error" in res) return res;
  }
  return { ok: "" };
}

/**
 * Stores one pronunciation of a word: syllabifies it with `sorSyl`, records
 * the pronunciation, derives and records the word rhyme, then stores every
 * syllable through `handleSyllable`.
 *
 * @param wordId Id of the word the pronunciation belongs to.
 * @param word Headword spelling, passed to `sorSyl`.
 * @param lang Language code, passed to `sorSyl`.
 * @param j Parsed JSON entry; only `j.lang_code` is read here.
 * @param snd The `sounds` item; `snd.ipa` and `snd.tags` are read.
 * @param wikiRhyme Rhyme taken from the entry's `sounds`, or null.
 * @returns `{ ok: "" }`, the first error from `handleSyllable`, or
 *   `{ error }` carrying the stringified exception if anything here throws.
 */
async function handleIpa(
  wordId: number | bigint,
  word: string,
  lang: string,
  j: any,
  snd: any,
  wikiRhyme: string | null,
): AsyncRes {
  try {
    // JSON.stringify yields undefined when `snd.tags` is undefined.
    const tags = JSON.stringify(snd.tags) ?? null;
    const ipa = snd.ipa;
    const syls = await sorSyl(word, lang, ipa);
    // console.log(syls, "sorsyl");
    console.log(word);
    console.log(ipa);
    pdb.addPronunciation(wordId, ipa, syls.syls.length, tags, null);

    // Word rhyme: syllables are skipped until the first stressed one, which
    // contributes its `rhyme`; every syllable after that contributes `all`.
    const wordRhyme = syls.syls.reduce((acc: string, entry: SorBSyl) => {
      const sylIpa = entry.ipa;
      if (acc) return `${acc}${sylIpa.all}`;
      return sylIpa.stressed ? `${acc}${sylIpa.rhyme}` : acc;
    }, "");
    if (wordRhyme) pdb.addWordRhyme(wordId, wordRhyme, j.lang_code, wikiRhyme);

    for (const [i, syl] of syls.syls.entries()) {
      const res = await handleSyllable(syl, wordId, i);
      if ("error" in res) return res;
    }
    return { ok: "" };
  } catch (e) {
    // Same error-reporting convention as handleSyllable / handleIdiom.
    return { error: `${e}` };
  }
}

/**
 * Stores one syllable of a pronunciation via `pdb.addSyllable`.
 *
 * @param syl Syllable with parallel `spelling` and `ipa` breakdowns.
 * @param wordId Id of the owning word.
 * @param idx Zero-based syllable index; stored one-based.
 * @returns `{ ok: "" }`, or `{ error }` with the stringified exception.
 */
async function handleSyllable(
  syl: SorBSyl,
  wordId: number | bigint,
  idx: number,
): AsyncRes {
  try {
    pdb.addSyllable(
      wordId,
      idx + 1,
      syl.ipa.stressed,
      // NOTE(review): language is hard-coded to "th" — confirm.
      "th",
      syl.ipa.all,
      syl.ipa.long,
      syl.spelling.all,
      { spelling: syl.spelling.onset, ipa: syl.ipa.onset },
      { spelling: syl.spelling.medial, ipa: syl.ipa.medial },
      { spelling: syl.spelling.nucleus, ipa: syl.ipa.nucleus },
      { spelling: syl.spelling.coda, ipa: syl.ipa.coda },
      { spelling: syl.spelling.rhyme, ipa: syl.ipa.rhyme },
      // Empty placeholder — presumably the tone slot; verify against pdb.addSyllable.
      { letters: "", numbers: 0, name: "" },
      null,
    );
    return { ok: "" };
  } catch (e) {
    // console.error(e);
    return { error: `${e}` };
  }
}

/**
 * Stores a multi-word entry via `pdb.addIdiom`.
 *
 * @returns `{ ok: "" }`, or `{ error }` with the stringified exception.
 */
async function handleIdiom(lang: string, idiom: string): AsyncRes {
  try {
    pdb.addIdiom(idiom, lang);
    // TODO later set idiom_words once all words are populated
    return { ok: "" };
  } catch (e) {
    return { error: `${e}` };
  }
}

// Thai entries that contain a space:
// ช้า ๆ
// งก ๆ
// หงก ๆ
// NOTE(review): presumably a reminder that these would be routed to
// handleIdiom by the space check in readDump — confirm.

/**
 * Builds a spelling → frequency-rank map from the unigram CSV.
 *
 * Each line is split on ","; the first field is the spelling and the second
 * its frequency. Ranks are 1-based over the distinct frequencies in ascending
 * numeric order (rank 1 = lowest frequency in the file); spellings sharing a
 * frequency share a rank. Lines whose frequency field is missing, empty, or
 * not a finite number (e.g. a header row) are ignored.
 */
async function getFrequency() {
  const countBySpelling = new Map<string, number>();
  await handleFile(
    "/home/y/code/prosody/hanchu/datasets/unigram_freq.csv",
    (line) => {
      const [spelling, frequency] = line.split(",");
      if (!spelling || frequency === undefined || frequency.trim() === "")
        return;
      const count = Number(frequency);
      if (!Number.isFinite(count)) return;
      countBySpelling.set(spelling, count);
    },
  );

  // Explicit numeric comparator: the default sort compares as strings.
  const distinctCounts = [...new Set(countBySpelling.values())].sort(
    (a, b) => a - b,
  );
  const rankByCount = new Map<number, number>();
  distinctCounts.forEach((count, i) => rankByCount.set(count, i + 1));

  const orderedMap = new Map<string, number>();
  for (const [spelling, count] of countBySpelling) {
    orderedMap.set(spelling, rankByCount.get(count)!);
  }
  return orderedMap;
}

readDump("en");