diff options
Diffstat (limited to 'src/lib/db/thaiseed.ts')
-rw-r--r-- | src/lib/db/thaiseed.ts | 253 |
1 files changed, 124 insertions, 129 deletions
diff --git a/src/lib/db/thaiseed.ts b/src/lib/db/thaiseed.ts index 6c69d9c..32434da 100644 --- a/src/lib/db/thaiseed.ts +++ b/src/lib/db/thaiseed.ts @@ -11,7 +11,7 @@ import { import pdb from "./prosodydb"; import { cleanIpa } from "../utils"; import { handleFile } from "./utils"; -import { Tone } from "../types/phonetics"; +import { Phoneme, Tone } from "../types/phonetics"; import { AsyncRes } from "../types"; async function readDump(lang: string) { @@ -25,7 +25,7 @@ async function readDump(lang: string) { // langrows = langrows.slice(10); for (const langrow of langrows) { count++; - // console.log(count); + console.log(count); // if (count <= 10000) continue; // if (count > 100) break; const j = JSON.parse(langrow.data); @@ -68,65 +68,101 @@ async function readDump(lang: string) { async function handleWord(word: string, j: any): AsyncRes<string> { // TODO add categories but add a tag to see what classifying scheme we're using // - const sounds = j.sounds || []; - const hasIpa = sounds.find((s: any) => "ipa" in s); - if (!hasIpa) return { error: "meh no ipa" }; - const freq = await getThaiFreq(word); - const wordId = pdb.addWord(word, "th", freq, null); - if (wordId == 478 || word === "และ") { - console.log("wtf man"); - console.dir(j, { depth: null }); - return { error: "i said wtf" }; - } + const frequency = await getThaiFreq(word); const analyzed = await analyzeTHWord(word); - for (let snd of sounds) - if ("ipa" in snd) { - const res = await handleIpa(wordId, j, snd, analyzed); - if ("error" in res) return res; - } + const phonetics = await Promise.all(getIpa(j, analyzed)); + + pdb.superAdd({ word, lang: "th", frequency, wordNotes: null, phonetics }); return { ok: "" }; } -async function handleIpa( - wordId: number | bigint, - j: any, - snd: any, - analyzed: ThaiNLPRes, -): AsyncRes<string> { +function getIpa(j: any, analyzed: ThaiNLPRes) { + const sounds = j.sounds || []; + const hasIpa = sounds.find((s: any) => "ipa" in s); + if (!hasIpa) return []; + const ipaData: Promise<IPAData>[] = sounds.reduce( + async (acc: Promise<IPAData>[], snd: any) => { + if ("ipa" in snd) { + const data = getIpaData(snd, analyzed); + return [...acc, data]; + } else return acc; + }, + [], + ); + return ipaData; +} +type IPAData = { + ipa: string; + syllable_count: number; + syllable_sequence: string; + tone_sequence: string; + ipa_sequence: string; + tags: string | null; + notes: string | null; + wordRhyme: string | null; + syllables: SylData[]; +}; +async function getIpaData(snd: any, analyzed: ThaiNLPRes): Promise<IPAData> { const tags = JSON.stringify(snd.tags) || null; // console.log("handleipa", analyzed.syllables.length); // console.log(analyzed); const wikiIpa = cleanIpa(snd.ipa); const nlpIpa = cleanIpa(analyzed.ipa); const ipa = wikiIpa || nlpIpa; - if (j.word === "และ") { - console.log("wtf!!"); - return { error: "wtf is this" }; - } + // if (j.word === "และ") { + // console.log("wtf!!"); + // return { error: "wtf is this" }; + // } const wikiIpaSplit = wikiIpa.split("."); const nlpIpaSplit = nlpIpa.split("."); if (wikiIpaSplit.length !== nlpIpaSplit.length) { - // console.log("ipa mismatch"); - // console.log(wikiIpa); - // console.log(nlpIpa); + console.log("ipa mismatch"); + console.log(wikiIpa); + console.log(nlpIpa); } if (analyzed.realSyls.length !== wikiIpaSplit.length) { - // console.log("syllable analysis mismatch", j.word); - // console.log({ syls: analyzed.syllables, ipa: wikiIpaSplit }); - // console.dir(j, { depth: null }); - return { error: "meh syllable analysis mismatch" }; + console.log("syllable analysis mismatch", analyzed.word); + console.log({ syls: analyzed.syllables, ipa: wikiIpaSplit }); + throw new Error("syllable mismatch"); } const writtenSyls = analyzed.syllables; - const pronouncedSyls = analyzed.realSyls; + const pronouncedSyls = analyzed.realSyls.map((s) => + s.replace(/\u{E3A}/u, ""), + ); + + const tone_sequence = wikiIpaSplit + .map((s) => parseTone(s, analyzed.word)) + .map((t) => t.name) + .join(","); + const syllable_sequence = pronouncedSyls.join(","); + const ipa_sequence = wikiIpaSplit.join(","); + const syllables = await Promise.all( + getSyllables(writtenSyls, pronouncedSyls, wikiIpaSplit), + ); + return { + ipa, + syllable_count: pronouncedSyls.length, + syllable_sequence, + tone_sequence, + ipa_sequence, + tags, + notes: null, + wordRhyme: null, + syllables, + }; +} +function getSyllables( + writtenSyls: string[], + pronouncedSyls: string[], + ipaSyls: string[], +) { let badSyls = false; if (writtenSyls.length !== pronouncedSyls.length) badSyls = true; - - pdb.addPronunciation(wordId, ipa, pronouncedSyls.length, tags, null); - + let syls: Promise<SylData>[] = []; for (let i = 0; i < pronouncedSyls.length; i++) { - const pronounced = pronouncedSyls[i]!.replace(/\u{E3A}/u, ""); + const pronounced = pronouncedSyls[i]!; const written = writtenSyls[i] || ""; const syllable = badSyls ? pronounced : written; - const ipa = wikiIpaSplit[i]!; + const ipa = ipaSyls[i]!; // TODO insert both?? const notes = pronounced === written ? null : `Pronounced ${pronounced}`; if (pronounced !== syllable) { @@ -134,10 +170,10 @@ async function handleIpa( console.log(pronounced); console.log(written); } - const res = await handleSyllable(syllable, ipa, wordId, i, notes); - if ("error" in res) return res; + const res = getSyllable(syllable, ipa, i, notes); + syls.push(res); } - return { ok: "" }; + return syls; } const thaiTones: Record<string, string> = { "˧": "mid", @@ -153,8 +189,22 @@ const thaiToneNums: Record<string, number> = { "˦˥": 45, "˩˩˦": 214, }; +const toneRegex = new RegExp(Object.keys(thaiToneNums).join("|")); + function parseTone(ipa: string, spelling: string): Tone { try { + const match = ipa.match(toneRegex)!; + const m = match[0]!; + const name = thaiTones[m]!; + const numbers = thaiToneNums[m]!; + return { letters: ipa, name, numbers }; + } catch (e) { + console.error("meh wrong tones!!", { s: spelling, ipa }); + throw new Error(""); + } +} +function parseToneS(ipa: string, spelling: string): Tone { + try { const name = thaiTones[ipa]!; const numbers = thaiToneNums[ipa]!; return { letters: ipa, name, numbers }; @@ -164,71 +214,44 @@ function parseTone(ipa: string, spelling: string): Tone { } } -async function handleSyllable( +type SylData = { + idx: number; + stressed: boolean | null; + spelling: string; + ipa: string; + long: boolean; + onset: Phoneme; + medial: Phoneme; + nucleus: Phoneme; + coda: Phoneme; + rhyme: Phoneme; + tone: Tone; + notes: string | null; +}; +async function getSyllable( spelling: string, ipa: string, - wordId: number | bigint, idx: number, notes: string | null, -): AsyncRes<string> { +): Promise<SylData> { const sorsyl = await sorSyl(spelling, "th", ipa); - const weird = [ - // "a̯n", - // "a̯", - // "a̯p", - // "a̯w", - // "a̯j", - // "a̯ŋ", - // "a̯k", - // "a̯t", - // "a̯m", - // "a̯ʔ", - // "ʔ", - "s", - "l", - "f", - "a̯s", - "js", - "t͡ɕʰ", - "ks", - "ns", - "a̯l", - "a̯f", - "mk", - ]; - // const weirder = sorsyl.syls.find((s) => weird.includes(s.coda)); - // if (weirder) { - // console.log("syllable", spelling); - // // console.dir(sorsyl, { depth: null }); - // // console.dir(j, { depth: null }); - // } if (sorsyl.syls.length !== 1) throw new Error("wtf sorsyl!"); const syl = sorsyl.syls[0]!.ipa; - const tone = parseTone(syl.tone, spelling); - // TODO add actual ortographic data here not just ipa - try { - pdb.addSyllable( - wordId, - idx + 1, - null, - "th", - syl.all, - syl.long, - spelling, - { spelling: syl.onset, ipa: syl.onset }, - { spelling: syl.medial, ipa: syl.medial }, - { spelling: syl.nucleus, ipa: syl.nucleus }, - { spelling: syl.coda, ipa: syl.coda }, - { spelling: syl.rhyme, ipa: syl.rhyme }, - tone, - notes, - ); - return { ok: "" }; - } catch (e) { - // console.log("well fuck", syl); - // console.error(e); - return { error: `meh ${e}` }; - } + const tone = parseToneS(syl.tone, spelling); + return { + idx: idx + 1, + stressed: null, + spelling, + ipa: syl.all, + long: syl.long, + onset: { spelling: syl.onset, ipa: syl.onset }, + medial: { spelling: syl.medial, ipa: syl.medial }, + nucleus: { spelling: syl.nucleus, ipa: syl.nucleus }, + coda: { spelling: syl.coda, ipa: syl.coda }, + rhyme: { spelling: syl.rhyme, ipa: syl.rhyme }, + tone, + notes, + }; } async function handleIdiom(idiom: string): AsyncRes<string> { pdb.addIdiom(idiom, "th"); @@ -236,33 +259,5 @@ async function handleIdiom(idiom: string): AsyncRes<string> { // console.log(); return { ok: "" }; } -// ช้า ๆ -// งก ๆ -// หงก ๆ - -async function getFrequency() { - const files = [ - "/home/y/code/prosody/resources/langdata/thai/data/1yin_freq.csv", - "/home/y/code/prosody/resources/langdata/thai/data/2yin_freq.csv", - "/home/y/code/prosody/resources/langdata/thai/data/3yin_freq.csv", - "/home/y/code/prosody/resources/langdata/thai/data/4yin_freq.csv", - "/home/y/code/prosody/resources/langdata/thai/data/5yin_freq.csv", - "/home/y/code/prosody/resources/langdata/thai/data/6yin_freq.csv", - ]; - const freqMap = new Map<number, string>(); - for (const file of files) { - await handleFile(file, (line, idx) => { - const [spelling, IPA, tone, length, frequency, ...rest] = line.split(","); - freqMap.set(Number(frequency!), spelling!); - }); - } - const orderedMap = new Map<string, number>(); - const keys = Array.from(freqMap.keys()).sort(); - for (let i = 0; i < keys.length; i++) { - const val = freqMap.get(keys[i]!)!; - orderedMap.set(val, i + 1); - } - return orderedMap; -} readDump("th"); |