summary | refs | log | tree | commit | diff
path: root/src/lib/db/thaiseed.ts
diff options
context:
space:
mode:
Diffstat (limited to 'src/lib/db/thaiseed.ts')
-rw-r--r-- src/lib/db/thaiseed.ts 75
1 files changed, 59 insertions, 16 deletions
diff --git a/src/lib/db/thaiseed.ts b/src/lib/db/thaiseed.ts
index 687f0f3..5c75345 100644
--- a/src/lib/db/thaiseed.ts
+++ b/src/lib/db/thaiseed.ts
@@ -24,9 +24,9 @@ async function readDump(lang: string) {
// langrows = langrows.slice(10);
for (const langrow of langrows) {
count++;
- console.log(count);
+ // console.log(count);
// if (count <= 10000) continue;
- // if (count > 30) break;
+ // if (count > 100) break;
const j = JSON.parse(langrow.data);
const word = j.word.trim();
if (!word) continue;
@@ -48,7 +48,6 @@ async function handleWord(word: string, j: any) {
const freq = await getThaiFreq(word);
const wordId = pdb.addWord(word, "th", freq, null);
const analyzed = await analyzeTHWord(word);
- // console.log(analyzed);
for (let snd of sounds) if ("ipa" in snd) handleIpa(wordId, j, snd, analyzed);
}
async function handleIpa(
@@ -66,27 +65,39 @@ async function handleIpa(
const wikiIpaSplit = wikiIpa.split(".");
const nlpIpaSplit = nlpIpa.split(".");
if (wikiIpaSplit.length !== nlpIpaSplit.length) {
- console.log("ipa mismatch");
- console.log(wikiIpa);
- console.log(nlpIpa);
- // return;
+ // console.log("ipa mismatch");
+ // console.log(wikiIpa);
+ // console.log(nlpIpa);
}
- if (analyzed.syllables.length !== wikiIpaSplit.length) {
- console.log("syllable analysis mismatch", j.word);
- console.log({ syls: analyzed.syllables, ipa: wikiIpaSplit });
+ if (analyzed.realSyls.length !== wikiIpaSplit.length) {
+ // console.log("syllable analysis mismatch", j.word);
+ // console.log({ syls: analyzed.syllables, ipa: wikiIpaSplit });
// console.dir(j, { depth: null });
return;
}
pdb.addPronunciation(wordId, ipa, analyzed.syllables.length, tags, null);
+ const writtenSyls = analyzed.syllables;
+ const pronouncedSyls = analyzed.realSyls;
+ let badSyls = false;
+ if (writtenSyls.length !== pronouncedSyls.length) badSyls = true;
- for (let i = 0; i < analyzed.syllables.length; i++) {
- const spelling = analyzed.syllables[i]!;
+ for (let i = 0; i < pronouncedSyls.length; i++) {
+ const pronounced = pronouncedSyls[i]!.replace(/\u{E3A}/u, "");
+ const written = writtenSyls[i] || "";
+ const syllable = badSyls ? pronounced : written;
const ipa = wikiIpaSplit[i]!;
+ // TODO insert both??
+ const notes = pronounced === written ? null : `Pronounced ${pronounced}`;
+ if (pronounced !== syllable) {
+ console.log("diff");
+ console.log(pronounced);
+ console.log(written);
+ }
try {
- await handleSyllable(spelling, ipa, wordId, i);
+ await handleSyllable(syllable, ipa, wordId, i, notes);
} catch (e) {
console.error("syl error", j.word, j.sounds);
- console.error({ spelling, ipa, wikiIpaSplit });
+ console.error({ analyzed, ipa, wikiIpaSplit });
console.error(e);
}
}
@@ -115,16 +126,48 @@ function parseTone(ipa: string, spelling: string): Tone {
throw new Error("");
}
}
+
async function handleSyllable(
spelling: string,
ipa: string,
wordId: number | bigint,
idx: number,
+ notes: string | null,
) {
const sorsyl = await sorSyl(spelling, "th", ipa);
+ const weird = [
+ // "a̯n",
+ // "a̯",
+ // "a̯p",
+ // "a̯w",
+ // "a̯j",
+ // "a̯ŋ",
+ // "a̯k",
+ // "a̯t",
+ // "a̯m",
+ // "a̯ʔ",
+ // "ʔ",
+ "s",
+ "l",
+ "f",
+ "a̯s",
+ "js",
+ "t͡ɕʰ",
+ "ks",
+ "ns",
+ "a̯l",
+ "a̯f",
+ "mk",
+ ];
+ // const weirder = sorsyl.syls.find((s) => weird.includes(s.coda));
+ // if (weirder) {
+ // console.log("syllable", spelling);
+ // // console.dir(sorsyl, { depth: null });
+ // // console.dir(j, { depth: null });
+ // }
if (sorsyl.syls.length !== 1) throw new Error("wtf sorsyl!");
const syl = sorsyl.syls[0]!;
- const tone = syl.tone ? parseTone(syl.tone, spelling) : null;
+ const tone = parseTone(syl.tone, spelling);
try {
pdb.addSyllable(
wordId,
@@ -139,7 +182,7 @@ async function handleSyllable(
{ spelling: syl.coda, ipa: syl.coda },
{ spelling: syl.rhyme, ipa: syl.rhyme },
tone,
- null,
+ notes,
);
} catch (e) {
// console.log("well fuck", syl);