diff options
Diffstat (limited to 'src/lib/db/thaiseed.ts')
-rw-r--r-- | src/lib/db/thaiseed.ts | 75 |
1 files changed, 59 insertions, 16 deletions
diff --git a/src/lib/db/thaiseed.ts b/src/lib/db/thaiseed.ts index 687f0f3..5c75345 100644 --- a/src/lib/db/thaiseed.ts +++ b/src/lib/db/thaiseed.ts @@ -24,9 +24,9 @@ async function readDump(lang: string) { // langrows = langrows.slice(10); for (const langrow of langrows) { count++; - console.log(count); + // console.log(count); // if (count <= 10000) continue; - // if (count > 30) break; + // if (count > 100) break; const j = JSON.parse(langrow.data); const word = j.word.trim(); if (!word) continue; @@ -48,7 +48,6 @@ async function handleWord(word: string, j: any) { const freq = await getThaiFreq(word); const wordId = pdb.addWord(word, "th", freq, null); const analyzed = await analyzeTHWord(word); - // console.log(analyzed); for (let snd of sounds) if ("ipa" in snd) handleIpa(wordId, j, snd, analyzed); } async function handleIpa( @@ -66,27 +65,39 @@ async function handleIpa( const wikiIpaSplit = wikiIpa.split("."); const nlpIpaSplit = nlpIpa.split("."); if (wikiIpaSplit.length !== nlpIpaSplit.length) { - console.log("ipa mismatch"); - console.log(wikiIpa); - console.log(nlpIpa); - // return; + // console.log("ipa mismatch"); + // console.log(wikiIpa); + // console.log(nlpIpa); } - if (analyzed.syllables.length !== wikiIpaSplit.length) { - console.log("syllable analysis mismatch", j.word); - console.log({ syls: analyzed.syllables, ipa: wikiIpaSplit }); + if (analyzed.realSyls.length !== wikiIpaSplit.length) { + // console.log("syllable analysis mismatch", j.word); + // console.log({ syls: analyzed.syllables, ipa: wikiIpaSplit }); // console.dir(j, { depth: null }); return; } pdb.addPronunciation(wordId, ipa, analyzed.syllables.length, tags, null); + const writtenSyls = analyzed.syllables; + const pronouncedSyls = analyzed.realSyls; + let badSyls = false; + if (writtenSyls.length !== pronouncedSyls.length) badSyls = true; - for (let i = 0; i < analyzed.syllables.length; i++) { - const spelling = analyzed.syllables[i]!; + for (let i = 0; i < pronouncedSyls.length; i++) { + const pronounced = pronouncedSyls[i]!.replace(/\u{E3A}/u, ""); + const written = writtenSyls[i] || ""; + const syllable = badSyls ? pronounced : written; const ipa = wikiIpaSplit[i]!; + // TODO insert both?? + const notes = pronounced === written ? null : `Pronounced ${pronounced}`; + if (pronounced !== syllable) { + console.log("diff"); + console.log(pronounced); + console.log(written); + } try { - await handleSyllable(spelling, ipa, wordId, i); + await handleSyllable(syllable, ipa, wordId, i, notes); } catch (e) { console.error("syl error", j.word, j.sounds); - console.error({ spelling, ipa, wikiIpaSplit }); + console.error({ analyzed, ipa, wikiIpaSplit }); console.error(e); } } @@ -115,16 +126,48 @@ function parseTone(ipa: string, spelling: string): Tone { throw new Error(""); } } + async function handleSyllable( spelling: string, ipa: string, wordId: number | bigint, idx: number, + notes: string | null, ) { const sorsyl = await sorSyl(spelling, "th", ipa); + const weird = [ + // "a̯n", + // "a̯", + // "a̯p", + // "a̯w", + // "a̯j", + // "a̯ŋ", + // "a̯k", + // "a̯t", + // "a̯m", + // "a̯ʔ", + // "ʔ", + "s", + "l", + "f", + "a̯s", + "js", + "t͡ɕʰ", + "ks", + "ns", + "a̯l", + "a̯f", + "mk", + ]; + // const weirder = sorsyl.syls.find((s) => weird.includes(s.coda)); + // if (weirder) { + // console.log("syllable", spelling); + // // console.dir(sorsyl, { depth: null }); + // // console.dir(j, { depth: null }); + // } if (sorsyl.syls.length !== 1) throw new Error("wtf sorsyl!"); const syl = sorsyl.syls[0]!; - const tone = syl.tone ? parseTone(syl.tone, spelling) : null; + const tone = parseTone(syl.tone, spelling); try { pdb.addSyllable( wordId, @@ -139,7 +182,7 @@ async function handleSyllable( { spelling: syl.coda, ipa: syl.coda }, { spelling: syl.rhyme, ipa: syl.rhyme }, tone, - null, + notes, ); } catch (e) { // console.log("well fuck", syl); |