import Database from "bun:sqlite";
import { sorSyl, SorBSyl, charsiuG2P, SorSylRes } from "../calls/nlp";
import pdb from "./prosodydb";
import { cleanIpa } from "../utils";
import { handleFile } from "./utils";
import { Phoneme, Tone } from "../types/phonetics";
import { AsyncRes } from "../types";

/**
 * Error messages dumped at the end of `readDump`.
 * Nothing in this file appends to it yet.
 */
const errors: string[] = [];

/**
 * Reads the Wiktionary dump for `lang` from its SQLite file and processes
 * each row as either a single word or a multi-word idiom.
 *
 * Processing stops after 300 rows (debug limit) or at the first row whose
 * handler returns an `{ error }` result. The database handle is always closed.
 *
 * @param lang Language code; selects `<lang>.db` in the wiktionary resources folder.
 */
async function readDump(lang: string): Promise<void> {
  await pdb.init();
  // NOTE(review): the registered language is hard-coded and ignores `lang`.
  pdb.addLanguage("en", "english");

  const langdb = new Database(
    `/home/y/code/prosody/resources/wiktionary/${lang}.db`,
  );
  try {
    const langrows = langdb.query("SELECT data FROM langs");
    // langrows = langrows.slice(10);
    const freqMap = await getFrequency();

    let count = 0;
    // iterate() streams rows instead of materialising the whole table.
    for (const langrow of langrows.iterate()) {
      count++;
      console.log(count);
      // if (count <= 10000) continue;
      if (count > 300) break;

      const { data } = langrow as { data: string };
      const j = JSON.parse(data);
      // Entries without a usable headword are skipped rather than crashing the run.
      const word = typeof j?.word === "string" ? j.word.trim() : "";
      if (!word) continue;

      // Anything containing a space is treated as an idiom, not a single word.
      const res = word.includes(" ")
        ? await handleIdiom(word, lang)
        : await handleWord(word, lang, j, freqMap);
      if ("error" in res) {
        console.error(res.error);
        break;
      }
    }
  } finally {
    langdb.close();
  }
  console.dir(errors);
}

/**
 * Gathers frequency rank and phonetic data for a single word.
 * Persisting the result is currently disabled (see the commented `superAdd` call).
 *
 * @param word Trimmed headword.
 * @param lang Language code.
 * @param j Parsed dump entry; only `j.sounds` is read (by `getIpa`).
 * @param freqMap Word -> frequency rank, as built by `getFrequency`.
 * @returns `{ ok: "" }` on success, `{ error }` if gathering phonetics failed.
 */
async function handleWord(
  word: string,
  lang: string,
  j: any,
  freqMap: Map<string, number>,
): AsyncRes<string> {
  try {
    const frequency = freqMap.get(word) ?? null;
    const promises = await getIpa(word, lang, j);
    const phonetics = await Promise.all(promises);
    // pdb.superAdd({ word, lang, frequency, wordNotes: null, phonetics });
    return { ok: "" };
  } catch (e) {
    // Same error style as handleIdiom, so the caller's `"error" in res` check sees failures.
    return { error: `${e}` };
  }
}

/** Phonetic record for one pronunciation of a word. */
type IPAData = {
  ipa: string;
  syllable_count: number;
  /** Comma-separated syllable spellings. */
  syllable_sequence: string;
  tone_sequence: string;
  /** Comma-separated syllable IPA strings. */
  ipa_sequence: string;
  /** JSON-encoded `tags` of the source sound entry, if any. */
  tags: string | null;
  notes: string | null;
  wordRhyme: string | null;
  syllables: SylData[];
};

/**
 * Starts one `getIpaData` computation per entry of `j.sounds` that has an
 * `ipa` key.
 *
 * @returns The pending computations, in source order; the caller awaits them.
 */
async function getIpa(
  word: string,
  lang: string,
  j: any,
): Promise<Promise<IPAData>[]> {
  const sounds: any[] = Array.isArray(j?.sounds) ? j.sounds : [];
  // Guard against non-object entries: the `in` operator throws on primitives.
  const hasKey = (s: any, key: string): boolean =>
    typeof s === "object" && s !== null && key in s;

  const withIpa = sounds.filter((s) => hasKey(s, "ipa"));
  if (withIpa.length === 0) {
    console.log("no ipa", word);
    // console.dir(j, { depth: null });
    console.dir(sounds, { depth: null });
    // TODO fetch from idk charsiu
    // const ipa = await charsiuG2P(word, lang);
    // console.log("charsiu", ipa);
  }

  const rhymeEntry = sounds.find((s) => hasKey(s, "rhymes"));
  const wikiRhyme = rhymeEntry ? rhymeEntry.rhymes : null;

  return withIpa.map((snd) => getIpaData(word, lang, snd, wikiRhyme));
}

/**
 * Builds the phonetic record for one sound entry: cleans its IPA,
 * syllabifies it via `sorSyl`, and derives the sequences and word rhyme.
 *
 * @param snd Sound entry from the dump; `snd.ipa` and `snd.tags` are read.
 * @param wikiRhyme Rhyme listed in the dump; only logged for comparison.
 */
async function getIpaData(
  word: string,
  lang: string,
  snd: any,
  wikiRhyme: string | null,
): Promise<IPAData> {
  console.log("geting ipa...");
  const tags = JSON.stringify(snd.tags) || null;
  const ipa = cleanIpa(snd.ipa);
  const syls = await sorSyl(word, lang, ipa);

  // Word rhyme: the rhyme part of the first stressed syllable (one that
  // yields a non-empty rhyme), followed by every later syllable in full.
  let wordRhyme = "";
  for (const { ipa: sylIpa } of syls.syls) {
    if (wordRhyme) wordRhyme += sylIpa.all;
    else if (sylIpa.stressed) wordRhyme += sylIpa.rhyme;
  }
  console.log({ word, wikiRhyme, wordRhyme });

  const tone_sequence = "";
  const syllable_sequence = syls.syls.map((s) => s.spelling.all).join(",");
  const ipa_sequence = syls.syls.map((s) => s.ipa.all).join(",");
  const syllables = getSyllables(syls);

  return {
    ipa,
    syllable_count: syls.syls.length,
    syllable_sequence,
    tone_sequence,
    ipa_sequence,
    tags,
    notes: null,
    // Previously always null even though the rhyme was computed above.
    wordRhyme: wordRhyme || null,
    syllables,
  };
}

/** One syllable of a pronunciation, split into its parts. */
type SylData = {
  /** 1-based position within the word. */
  idx: number;
  stressed: boolean | null;
  spelling: string;
  ipa: string;
  long: boolean;
  onset: Phoneme;
  medial: Phoneme;
  nucleus: Phoneme;
  coda: Phoneme;
  rhyme: Phoneme;
  tone: Tone;
  notes: string | null;
};

/** Converts every syllable of a `sorSyl` result into a `SylData`, in order. */
function getSyllables(syl: SorSylRes): SylData[] {
  return syl.syls.map((syllable, i) => getSyllable(syllable, i));
}

/**
 * Maps one syllabifier syllable to a `SylData`.
 *
 * @param idx Zero-based position; stored 1-based.
 */
function getSyllable(syl: SorBSyl, idx: number): SylData {
  return {
    idx: idx + 1,
    // NOTE(review): `syl.ipa.stressed` is read in getIpaData but not stored here — confirm intent.
    stressed: null,
    spelling: syl.spelling.all,
    ipa: syl.ipa.all,
    long: syl.ipa.long,
    onset: { spelling: syl.spelling.onset, ipa: syl.ipa.onset },
    medial: { spelling: syl.spelling.medial, ipa: syl.ipa.medial },
    nucleus: { spelling: syl.spelling.nucleus, ipa: syl.ipa.nucleus },
    coda: { spelling: syl.spelling.coda, ipa: syl.ipa.coda },
    rhyme: { spelling: syl.spelling.rhyme, ipa: syl.ipa.rhyme },
    // Placeholder tone; no tone data is derived in this file.
    tone: { name: "", letters: "", numbers: 0 },
    notes: null,
  };
}

/**
 * Placeholder for multi-word entries: storage is not implemented yet, so this
 * currently always succeeds.
 */
async function handleIdiom(idiom: string, lang: string): AsyncRes<string> {
  try {
    // pdb.addIdiom(idiom, lang);
    // TODO later set idiom_words once all words are populated
    // console.log();
    return { ok: "" };
  } catch (e) {
    return { error: `${e}` };
  }
}

// Sample headwords noted by the original author (purpose not stated):
// ช้า ๆ
// งก ๆ
// หงก ๆ

/**
 * Loads the unigram frequency CSV (`spelling,count` per line) and converts raw
 * counts into dense ranks.
 *
 * Counts are ordered numerically ascending, so rank 1 is the smallest count and
 * a larger rank means a more frequent word. Words sharing a count share a rank.
 * Lines without a spelling or a finite numeric count (a header or blank line)
 * are skipped.
 *
 * @returns Map from spelling to rank.
 */
async function getFrequency(): Promise<Map<string, number>> {
  // Group by count so words with equal counts are all kept.
  const wordsByCount = new Map<number, string[]>();
  await handleFile(
    "/home/y/code/prosody/hanchu/datasets/unigram_freq.csv",
    (line) => {
      const [spelling, rawCount] = line.split(",");
      if (!spelling || rawCount === undefined || rawCount.trim() === "") return;
      const count = Number(rawCount);
      if (!Number.isFinite(count)) return;
      const bucket = wordsByCount.get(count);
      if (bucket) bucket.push(spelling);
      else wordsByCount.set(count, [spelling]);
    },
  );

  // An explicit comparator is required: the default sort compares numbers as strings.
  const counts = Array.from(wordsByCount.keys()).sort((a, b) => a - b);
  const orderedMap = new Map<string, number>();
  counts.forEach((count, i) => {
    for (const spelling of wordsByCount.get(count) ?? []) {
      orderedMap.set(spelling, i + 1);
    }
  });
  return orderedMap;
}

readDump("en");