diff options
Diffstat (limited to 'src/lib/db/enseed.ts')
-rw-r--r-- | src/lib/db/enseed.ts | 151 |
1 files changed, 151 insertions, 0 deletions
// src/lib/db/enseed.ts (new file in this diff: mode 100644, index 0000000..58f5876)
import Database from "bun:sqlite";
import {
  analyzeTHWord,
  deconstructSyllable,
  segmentateThai,
  type SorSyl,
  type ThaiNLPRes,
  sorSyl,
  getThaiFreq,
} from "../calls/nlp";
import pdb from "./prosodydb";
import { cleanIpa } from "../utils";
import { handleFile } from "./utils";
import { Tone } from "../types/phonetics";

/**
 * Reads the Wiktionary dump for `lang` from its SQLite file and feeds each
 * entry to {@link handleWord} (single token) or {@link handleIdiom}
 * (entry containing spaces).
 *
 * Only the first 30 rows are processed (development cap).
 *
 * @param lang Language code; selects `<lang>.db` in the wiktionary resources directory.
 */
async function readDump(lang: string) {
  await pdb.init();
  // NOTE(review): registers Thai even though `lang` is passed in — looks like a
  // leftover from the Thai seeder; confirm before seeding other languages.
  pdb.addLanguage("th", "thai");
  let count = 0;
  const langdb = new Database(
    `/home/y/code/prosody/resources/wiktionary/${lang}.db`,
  );
  try {
    // eslint-disable-next-line @typescript-eslint/no-explicit-any -- rows are untyped JSON blobs
    const langrows: any = langdb.query("SELECT data FROM langs");
    const freqMap = await getFrequency();
    for (const langrow of langrows) {
      count++;
      console.log(count);
      // if (count <= 10000) continue;
      if (count > 30) break;
      const j = JSON.parse(langrow.data);
      const word = j.word.trim();
      if (!word) continue;
      const split = word.split(" ");
      if (split.length > 1) await handleIdiom(lang, word);
      else await handleWord(lang, word, j, freqMap);
    }
  } finally {
    // Release the SQLite handle even when an entry fails or the loop breaks early.
    langdb.close();
  }
}

/**
 * Processes one single-token dictionary entry: for every pronunciation in
 * `j.sounds` that carries an `ipa` field, delegates to {@link handleIpa}.
 * Entries without any IPA are logged and skipped.
 *
 * @param lang Language code of the dump being seeded.
 * @param word Trimmed headword.
 * @param j Parsed dump entry (reads `sounds`; passed through to `handleIpa`).
 * @param freqMap Map from spelling to frequency rank, as built by `getFrequency`.
 */
async function handleWord(
  lang: string,
  word: string,
  j: any,
  freqMap: Map<string, number>,
) {
  // TODO add categories but add a tag to see what classifying scheme we're using
  const sounds = j.sounds || [];
  const hasIpa = sounds.some((s: any) => "ipa" in s);
  const hwikiRhyme = sounds.find((s: any) => "rhymes" in s);
  const wikiRhyme = hwikiRhyme ? hwikiRhyme.rhymes : null;
  if (!hasIpa) {
    console.error("no ipa!!", word);
    console.dir(j, { depth: null });
    return;
  }
  // Currently unused: only consumed by the disabled pdb.addWord call below.
  const freq = freqMap.get(word) ?? null;
  // const wordId = pdb.addWord(word, lang, freq, null);
  // WIPE
  const wordId = 0;
  for (const snd of sounds) {
    // Await each pronunciation so failures surface here instead of becoming
    // unhandled rejections, and so writes finish before the caller moves on.
    if ("ipa" in snd) await handleIpa(wordId, word, lang, j, snd, wikiRhyme);
  }
}

/**
 * Handles one IPA pronunciation of a word: syllabifies it via `sorSyl`,
 * stores the word-level rhyme (if any) and then each syllable.
 *
 * @param wordId Row id of the word the pronunciation belongs to.
 * @param word Headword spelling.
 * @param lang Language code passed to `sorSyl`.
 * @param j Parsed dump entry (reads `lang_code`).
 * @param snd One element of `j.sounds` that has an `ipa` field.
 * @param wikiRhyme Rhyme string taken from the dump entry, or `null`.
 */
async function handleIpa(
  wordId: number | bigint,
  word: string,
  lang: string,
  j: any,
  snd: any,
  wikiRhyme: string | null,
) {
  // Currently unused: only consumed by the disabled pdb.addPronunciation call below.
  const tags = JSON.stringify(snd.tags) || null;
  const ipa = snd.ipa;
  const syls = await sorSyl(word, lang, ipa);

  console.log(word);
  console.log(ipa);
  // pdb.addPronunciation(wordId, ipa, syls.syls.length, tags, null);

  // Word rhyme: nothing is emitted until the first stressed syllable, which
  // contributes its `rhyme`; every later syllable contributes its full `ipa`.
  const wordRhyme = syls.syls.reduce((acc: string, item: SorSyl) => {
    if (acc) return `${acc}${item.ipa}`;
    if (!item.stressed) return acc;
    return `${acc}${item.rhyme}`;
  }, "");
  if (wordRhyme) pdb.addWordRhyme(wordId, wordRhyme, j.lang_code, wikiRhyme);

  for (let i = 0; i < syls.syls.length; i++) {
    const syl = syls.syls[i]!;
    await handleSyllable(word, syl.ipa, wordId, i);
  }
}

/**
 * Re-analyses a single syllable with `sorSyl` and stores it via
 * `pdb.addSyllable` at 1-based position `idx + 1`.
 *
 * Insertion is best-effort: a failing `addSyllable` is logged and does not
 * abort the run.
 *
 * @param spelling Spelling of the whole word the syllable came from.
 * @param ipa IPA of the syllable.
 * @param wordId Row id of the owning word.
 * @param idx Zero-based index of the syllable within the word.
 * @throws Error if `sorSyl` does not return exactly one syllable for `ipa`.
 */
async function handleSyllable(
  spelling: string,
  ipa: string,
  wordId: number | bigint,
  idx: number,
) {
  // NOTE(review): language is hard-coded to "th" here and below although this
  // file is run with "en" — confirm whether that is intended.
  const sorsyl = await sorSyl(spelling, "th", ipa);
  if (sorsyl.syls.length !== 1) throw new Error("wtf sorsyl!");
  const syl = sorsyl.syls[0]!;
  try {
    pdb.addSyllable(
      wordId,
      idx + 1,
      "th",
      syl.ipa,
      syl.long,
      spelling,
      { spelling: syl.onset, ipa: syl.onset },
      { spelling: syl.medial, ipa: syl.medial },
      { spelling: syl.nucleus, ipa: syl.nucleus },
      { spelling: syl.coda, ipa: syl.coda },
      { spelling: syl.rhyme, ipa: syl.rhyme },
      { letters: "", numbers: 0, name: "" },
      null,
    );
  } catch (e: unknown) {
    // Keep going on failure, but say what failed instead of printing a blank line.
    console.error("failed to add syllable", { spelling, ipa: syl.ipa, idx }, e);
  }
}

/**
 * Stores a multi-token entry as an idiom.
 *
 * @param lang Language code of the idiom.
 * @param idiom The full entry text (contains at least one space).
 */
async function handleIdiom(lang: string, idiom: string) {
  pdb.addIdiom(idiom, lang);
  // TODO later set idiom_words once all words are populated
}
// Sample entries containing a space:
// ช้า ๆ
// งก ๆ
// หงก ๆ

/**
 * Builds a map from spelling to frequency rank from `unigram_freq.csv`
 * (rows of `spelling,frequency`).
 *
 * Rows whose frequency is not a finite number (e.g. a header line) or whose
 * spelling is empty are skipped. Rank 1 is assigned to the lowest frequency;
 * words with equal frequency keep file order and each get their own rank.
 *
 * NOTE(review): ranks ascend with frequency (rank 1 = rarest), preserving the
 * original ordering direction — confirm this is what consumers expect.
 *
 * @returns Map from spelling to 1-based rank.
 */
async function getFrequency() {
  const entries: Array<[string, number]> = [];
  await handleFile(
    "/home/y/code/prosody/hanchu/datasets/unigram_freq.csv",
    (line) => {
      const [spelling, frequency] = line.split(",");
      const freq = Number(frequency);
      if (!spelling || !Number.isFinite(freq)) return;
      entries.push([spelling, freq]);
    },
  );
  // Numeric comparator: the default sort compares as strings ("10" < "9").
  entries.sort((a, b) => a[1] - b[1]);
  const orderedMap = new Map<string, number>();
  entries.forEach(([spelling], i) => {
    orderedMap.set(spelling, i + 1);
  });
  return orderedMap;
}

readDump("en").catch((e: unknown) => {
  // Surface seeding failures instead of leaving a floating rejected promise.
  console.error(e);
  process.exitCode = 1;
});