diff options
Diffstat (limited to 'src/lib/db/thaiseedold.ts')
-rw-r--r-- | src/lib/db/thaiseedold.ts | 301 |
1 files changed, 301 insertions, 0 deletions
import Database from "bun:sqlite";
import {
  analyzeTHWord,
  deconstructSyllable,
  segmentateThai,
  type SorSyl,
  type ThaiNLPRes,
  sorSyl,
  getThaiFreq,
} from "../calls/nlp";
import pdb from "./prosodydb";
import { cleanIpa } from "../utils";
import { handleFile } from "./utils";
import { Tone } from "../types/phonetics";
import { AsyncRes } from "../types";

/**
 * Logs an error returned by a handler when it is tagged "wtf".
 *
 * Error strings in this script are classified by substring: "meh" errors are
 * expected and skipped by the caller, "wtf" errors are printed together with
 * the entry's `sounds` payload so they can be inspected.
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any -- raw Wiktionary JSON
function logUnexpected(error: string, sounds: any): void {
  if (!error.includes("wtf")) return;
  console.error(error);
  console.error(sounds);
}

/**
 * Reads every row of the `langs` table of the Wiktionary dump for `lang` and
 * feeds it into the prosody database.
 *
 * Routing of each entry:
 * - words containing the repetition mark "ๆ" (e.g. "ช้า ๆ", "งก ๆ", "หงก ๆ")
 *   are handled as single words even though they contain a space; any
 *   non-"meh" error stops the run;
 * - other entries containing a space are stored as idioms;
 * - everything else is handled as a single word; errors never stop the run.
 *
 * @param lang Name of the dump file (without extension) to read.
 */
async function readDump(lang: string) {
  await pdb.init();
  pdb.addLanguage("th", "thai");
  const langdb = new Database(
    `/home/y/code/prosody/resources/wiktionary/${lang}.db`,
  );
  try {
    // eslint-disable-next-line @typescript-eslint/no-explicit-any -- rows are untyped JSON blobs
    const langrows: any = langdb.query("SELECT data FROM langs");
    let count = 0;
    for (const langrow of langrows) {
      count++;
      console.log(count); // progress indicator
      const j = JSON.parse(langrow.data);
      const word = j.word.trim();
      if (!word) continue;

      if (word.includes("ๆ")) {
        const res = await handleWord(word, j);
        if ("error" in res) {
          if (res.error.includes("meh")) continue;
          logUnexpected(res.error, j.sounds);
          break;
        }
      } else if (word.split(" ").length > 1) {
        const res = await handleIdiom(word);
        if ("error" in res) {
          console.error(res.error);
          break;
        }
      } else {
        const res = await handleWord(word, j);
        // Unlike the "ๆ" branch, processing continues after a failure here.
        if ("error" in res) logUnexpected(res.error, j.sounds);
      }
    }
  } finally {
    // Release the SQLite handle even when a handler throws or the loop breaks.
    langdb.close();
  }
}

/**
 * Stores one word and all of its IPA-bearing pronunciations.
 *
 * @param word Trimmed headword.
 * @param j    Parsed Wiktionary entry; only `j.sounds` and `j.word` are read.
 * @returns `{ error: "meh no ipa" }` when no sound carries an `ipa` key, the
 *          first error reported by `handleIpa`, or `{ ok: "" }`.
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any -- raw Wiktionary JSON
async function handleWord(word: string, j: any): AsyncRes<string> {
  // TODO add categories but add a tag to see what classifying scheme we're using
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  const sounds: any[] = j.sounds || [];
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  const hasIpa = sounds.some((s: any) => "ipa" in s);
  if (!hasIpa) return { error: "meh no ipa" };

  const freq = await getThaiFreq(word);
  const wordId = pdb.addWord(word, "th", freq, null);
  const analyzed = await analyzeTHWord(word);
  for (const snd of sounds) {
    if (!("ipa" in snd)) continue;
    const res = await handleIpa(wordId, j, snd, analyzed);
    if ("error" in res) return res;
  }
  return { ok: "" };
}

/**
 * Stores one pronunciation of a word plus one row per syllable.
 *
 * The Wiktionary IPA is split on "." and must have as many parts as the
 * analyser's `realSyls`; otherwise the pronunciation is rejected with a "meh"
 * error. The IPA stored for the pronunciation is the Wiktionary one, falling
 * back to the analyser's IPA when the former is empty.
 *
 * @param wordId   Id returned by `pdb.addWord`.
 * @param j        Parsed Wiktionary entry (only `j.word` is read, for logging).
 * @param snd      One element of `j.sounds` that has an `ipa` key.
 * @param analyzed NLP analysis of the word.
 * @throws Error when a syllable's IPA contains no known tone letters.
 */
async function handleIpa(
  wordId: number | bigint,
  // eslint-disable-next-line @typescript-eslint/no-explicit-any -- raw Wiktionary JSON
  j: any,
  // eslint-disable-next-line @typescript-eslint/no-explicit-any -- raw Wiktionary JSON
  snd: any,
  analyzed: ThaiNLPRes,
): AsyncRes<string> {
  console.log(); // blank separator line in the log output
  // JSON.stringify(undefined) is undefined, which is stored as null.
  const tags = JSON.stringify(snd.tags) || null;
  const wikiIpa = cleanIpa(snd.ipa);
  const nlpIpa = cleanIpa(analyzed.ipa);
  const ipa = wikiIpa || nlpIpa;

  const wikiIpaSplit = wikiIpa.split(".");
  if (analyzed.realSyls.length !== wikiIpaSplit.length) {
    return { error: "meh syllable analysis mismatch" };
  }

  const writtenSyls = analyzed.syllables;
  // Strip every phinthu (U+0E3A) from the pronunciation respelling.
  const pronouncedSyls = analyzed.realSyls.map((s) =>
    s.replace(/\u{E3A}/gu, ""),
  );
  // When written and pronounced syllable counts disagree, the written
  // syllables cannot be aligned by index, so the pronounced ones are stored.
  const badSyls = writtenSyls.length !== pronouncedSyls.length;

  const tone_sequence = wikiIpaSplit
    .map((s) => parseTone(s, j.word).name)
    .join(",");
  const syl_sequence = pronouncedSyls.join(",");
  const ipa_sequence = wikiIpaSplit.join(",");
  pdb.addPronunciation(
    wordId,
    ipa,
    pronouncedSyls.length,
    syl_sequence,
    tone_sequence,
    ipa_sequence,
    tags,
    null,
  );

  for (let i = 0; i < pronouncedSyls.length; i++) {
    const pronounced = pronouncedSyls[i]!;
    const written = writtenSyls[i] || "";
    const syllable = badSyls ? pronounced : written;
    // Same length as pronouncedSyls, guaranteed by the mismatch check above.
    const sylIpa = wikiIpaSplit[i]!;
    // TODO insert both??
    const notes = pronounced === written ? null : `Pronounced ${pronounced}`;
    if (pronounced !== syllable) {
      console.log("diff");
      console.log(pronounced);
      console.log(written);
    }
    const res = await handleSyllable(syllable, sylIpa, wordId, i, notes);
    if ("error" in res) return res;
  }
  return { ok: "" };
}

/** Tone letters mapped to the tone name stored in the database. */
const thaiTones: Record<string, string> = {
  "˧": "mid",
  "˨˩": "low",
  "˥˩": "falling",
  "˦˥": "high",
  "˩˩˦": "rising",
};
/** Tone letters mapped to the numeric tone contour stored in the database. */
const thaiToneNums: Record<string, number> = {
  "˧": 33,
  "˨˩": 21,
  "˥˩": 41,
  "˦˥": 45,
  "˩˩˦": 214,
};
/** Matches the first occurrence of any known tone-letter sequence. */
const toneRegex = new RegExp(Object.keys(thaiToneNums).join("|"));

/**
 * Builds a `Tone` for an exact tone-letter string, or `undefined` when the
 * string is not a key of both tone tables.
 */
function lookupTone(letters: string): Tone | undefined {
  const name = thaiTones[letters];
  const numbers = thaiToneNums[letters];
  if (name === undefined || numbers === undefined) return undefined;
  return { letters, name, numbers };
}

/**
 * Extracts the tone from a full syllable IPA string.
 *
 * @param ipa      IPA of one syllable, expected to contain tone letters.
 * @param spelling Word spelling, used only in the diagnostic output.
 * @returns The tone whose `letters` are the matched tone letters.
 * @throws Error when no known tone-letter sequence occurs in `ipa`.
 */
function parseTone(ipa: string, spelling: string): Tone {
  const letters = ipa.match(toneRegex)?.[0];
  const tone = letters === undefined ? undefined : lookupTone(letters);
  if (tone === undefined) {
    console.error("meh wrong tones!!", { s: spelling, ipa });
    throw new Error(`meh wrong tones: no tone letters in "${ipa}"`);
  }
  return tone;
}

/**
 * Looks up a tone from a string that consists of tone letters only.
 *
 * @param ipa      Tone letters, e.g. "˥˩".
 * @param spelling Syllable spelling, used only in the diagnostic output.
 * @throws Error when `ipa` is not one of the known tone-letter sequences.
 *         (A plain table lookup never throws, so the miss is checked
 *         explicitly instead of relying on try/catch.)
 */
function parseToneS(ipa: string, spelling: string): Tone {
  const tone = lookupTone(ipa);
  if (tone === undefined) {
    console.error("meh wrong tones!!", { s: spelling, ipa });
    throw new Error(`meh wrong tones: unknown tone "${ipa}"`);
  }
  return tone;
}

/**
 * Analyses one syllable with `sorSyl` and stores it via `pdb.addSyllable`.
 *
 * @param spelling Syllable spelling.
 * @param ipa      IPA of the syllable.
 * @param wordId   Id of the owning word.
 * @param idx      Zero-based syllable position; stored one-based.
 * @param notes    Optional note stored with the syllable.
 * @returns `{ ok: "" }`, or a "meh"-tagged error when the insert throws.
 * @throws Error when `sorSyl` does not return exactly one syllable, or when
 *         its tone is unknown.
 */
async function handleSyllable(
  spelling: string,
  ipa: string,
  wordId: number | bigint,
  idx: number,
  notes: string | null,
): AsyncRes<string> {
  const sorsyl = await sorSyl(spelling, "th", ipa);
  // Codas flagged as unusual during earlier debugging sessions:
  // s, l, f, a̯s, js, t͡ɕʰ, ks, ns, a̯l, a̯f, mk
  if (sorsyl.syls.length !== 1) throw new Error("wtf sorsyl!");
  const syl = sorsyl.syls[0]!.ipa;
  const tone = parseToneS(syl.tone, spelling);
  // TODO add actual orthographic data here not just ipa
  try {
    pdb.addSyllable(
      wordId,
      idx + 1,
      null,
      "th",
      syl.all,
      syl.long,
      spelling,
      { spelling: syl.onset, ipa: syl.onset },
      { spelling: syl.medial, ipa: syl.medial },
      { spelling: syl.nucleus, ipa: syl.nucleus },
      { spelling: syl.coda, ipa: syl.coda },
      { spelling: syl.rhyme, ipa: syl.rhyme },
      tone,
      notes,
    );
    return { ok: "" };
  } catch (e) {
    return { error: `meh ${e}` };
  }
}

/**
 * Stores a multi-word entry as an idiom.
 *
 * @param idiom Full idiom text, including spaces.
 */
async function handleIdiom(idiom: string): AsyncRes<string> {
  pdb.addIdiom(idiom, "th");
  // TODO later set idiom_words once all words are populated
  return { ok: "" };
}

/**
 * Builds a spelling -> rank map from the n-syllable frequency CSV files.
 *
 * Column 0 of each line is read as the spelling and column 4 as the
 * frequency. Ranks start at 1 and follow ascending numeric frequency.
 *
 * NOTE(review): entries are keyed by frequency, so spellings that share a
 * frequency overwrite each other (last one wins). Rank 1 goes to the lowest
 * frequency — confirm that this is the intended direction.
 */
async function getFrequency() {
  const files = [
    "/home/y/code/prosody/resources/langdata/thai/data/1yin_freq.csv",
    "/home/y/code/prosody/resources/langdata/thai/data/2yin_freq.csv",
    "/home/y/code/prosody/resources/langdata/thai/data/3yin_freq.csv",
    "/home/y/code/prosody/resources/langdata/thai/data/4yin_freq.csv",
    "/home/y/code/prosody/resources/langdata/thai/data/5yin_freq.csv",
    "/home/y/code/prosody/resources/langdata/thai/data/6yin_freq.csv",
  ];
  const freqMap = new Map<number, string>();
  for (const file of files) {
    await handleFile(file, (line) => {
      const [spelling, , , , frequency] = line.split(",");
      if (!spelling || frequency === undefined || frequency.trim() === "") {
        return;
      }
      const freq = Number(frequency);
      // Skip header rows and malformed lines instead of storing a NaN key.
      if (!Number.isFinite(freq)) return;
      freqMap.set(freq, spelling);
    });
  }
  const orderedMap = new Map<string, number>();
  // Numeric comparator: the default sort compares as strings ("10" < "9").
  const keys = Array.from(freqMap.keys()).sort((a, b) => a - b);
  keys.forEach((key, i) => {
    orderedMap.set(freqMap.get(key)!, i + 1);
  });
  return orderedMap;
}

// Fire-and-forget entry point; a rejection surfaces as an unhandled rejection.
void readDump("th");