summaryrefslogtreecommitdiff
path: root/src/lib/db/thaiseed.ts
diff options
context:
space:
mode:
Diffstat (limited to 'src/lib/db/thaiseed.ts')
-rw-r--r--src/lib/db/thaiseed.ts253
1 files changed, 124 insertions, 129 deletions
diff --git a/src/lib/db/thaiseed.ts b/src/lib/db/thaiseed.ts
index 6c69d9c..32434da 100644
--- a/src/lib/db/thaiseed.ts
+++ b/src/lib/db/thaiseed.ts
@@ -11,7 +11,7 @@ import {
import pdb from "./prosodydb";
import { cleanIpa } from "../utils";
import { handleFile } from "./utils";
-import { Tone } from "../types/phonetics";
+import { Phoneme, Tone } from "../types/phonetics";
import { AsyncRes } from "../types";
async function readDump(lang: string) {
@@ -25,7 +25,7 @@ async function readDump(lang: string) {
// langrows = langrows.slice(10);
for (const langrow of langrows) {
count++;
- // console.log(count);
+ console.log(count);
// if (count <= 10000) continue;
// if (count > 100) break;
const j = JSON.parse(langrow.data);
@@ -68,65 +68,101 @@ async function readDump(lang: string) {
async function handleWord(word: string, j: any): AsyncRes<string> {
// TODO add categories but add a tag to see what classifying scheme we're using
//
- const sounds = j.sounds || [];
- const hasIpa = sounds.find((s: any) => "ipa" in s);
- if (!hasIpa) return { error: "meh no ipa" };
- const freq = await getThaiFreq(word);
- const wordId = pdb.addWord(word, "th", freq, null);
- if (wordId == 478 || word === "และ") {
- console.log("wtf man");
- console.dir(j, { depth: null });
- return { error: "i said wtf" };
- }
+ const frequency = await getThaiFreq(word);
const analyzed = await analyzeTHWord(word);
- for (let snd of sounds)
- if ("ipa" in snd) {
- const res = await handleIpa(wordId, j, snd, analyzed);
- if ("error" in res) return res;
- }
+ const phonetics = await Promise.all(getIpa(j, analyzed));
+
+ pdb.superAdd({ word, lang: "th", frequency, wordNotes: null, phonetics });
return { ok: "" };
}
-async function handleIpa(
- wordId: number | bigint,
- j: any,
- snd: any,
- analyzed: ThaiNLPRes,
-): AsyncRes<string> {
+function getIpa(j: any, analyzed: ThaiNLPRes) {
+ const sounds = j.sounds || [];
+ const hasIpa = sounds.find((s: any) => "ipa" in s);
+ if (!hasIpa) return [];
+ const ipaData: Promise<IPAData>[] = sounds.reduce(
+ async (acc: Promise<IPAData>[], snd: any) => {
+ if ("ipa" in snd) {
+ const data = getIpaData(snd, analyzed);
+ return [...acc, data];
+ } else return acc;
+ },
+ [],
+ );
+ return ipaData;
+}
+type IPAData = {
+ ipa: string;
+ syllable_count: number;
+ syllable_sequence: string;
+ tone_sequence: string;
+ ipa_sequence: string;
+ tags: string | null;
+ notes: string | null;
+ wordRhyme: string | null;
+ syllables: SylData[];
+};
+async function getIpaData(snd: any, analyzed: ThaiNLPRes): Promise<IPAData> {
const tags = JSON.stringify(snd.tags) || null;
// console.log("handleipa", analyzed.syllables.length);
// console.log(analyzed);
const wikiIpa = cleanIpa(snd.ipa);
const nlpIpa = cleanIpa(analyzed.ipa);
const ipa = wikiIpa || nlpIpa;
- if (j.word === "และ") {
- console.log("wtf!!");
- return { error: "wtf is this" };
- }
+ // if (j.word === "และ") {
+ // console.log("wtf!!");
+ // return { error: "wtf is this" };
+ // }
const wikiIpaSplit = wikiIpa.split(".");
const nlpIpaSplit = nlpIpa.split(".");
if (wikiIpaSplit.length !== nlpIpaSplit.length) {
- // console.log("ipa mismatch");
- // console.log(wikiIpa);
- // console.log(nlpIpa);
+ console.log("ipa mismatch");
+ console.log(wikiIpa);
+ console.log(nlpIpa);
}
if (analyzed.realSyls.length !== wikiIpaSplit.length) {
- // console.log("syllable analysis mismatch", j.word);
- // console.log({ syls: analyzed.syllables, ipa: wikiIpaSplit });
- // console.dir(j, { depth: null });
- return { error: "meh syllable analysis mismatch" };
+ console.log("syllable analysis mismatch", analyzed.word);
+ console.log({ syls: analyzed.syllables, ipa: wikiIpaSplit });
+ throw new Error("syllable mismatch");
}
const writtenSyls = analyzed.syllables;
- const pronouncedSyls = analyzed.realSyls;
+ const pronouncedSyls = analyzed.realSyls.map((s) =>
+ s.replace(/\u{E3A}/u, ""),
+ );
+
+ const tone_sequence = wikiIpaSplit
+ .map((s) => parseTone(s, analyzed.word))
+ .map((t) => t.name)
+ .join(",");
+ const syllable_sequence = pronouncedSyls.join(",");
+ const ipa_sequence = wikiIpaSplit.join(",");
+ const syllables = await Promise.all(
+ getSyllables(writtenSyls, pronouncedSyls, wikiIpaSplit),
+ );
+ return {
+ ipa,
+ syllable_count: pronouncedSyls.length,
+ syllable_sequence,
+ tone_sequence,
+ ipa_sequence,
+ tags,
+ notes: null,
+ wordRhyme: null,
+ syllables,
+ };
+}
+function getSyllables(
+ writtenSyls: string[],
+ pronouncedSyls: string[],
+ ipaSyls: string[],
+) {
let badSyls = false;
if (writtenSyls.length !== pronouncedSyls.length) badSyls = true;
-
- pdb.addPronunciation(wordId, ipa, pronouncedSyls.length, tags, null);
-
+ let syls: Promise<SylData>[] = [];
for (let i = 0; i < pronouncedSyls.length; i++) {
- const pronounced = pronouncedSyls[i]!.replace(/\u{E3A}/u, "");
+ const pronounced = pronouncedSyls[i]!;
const written = writtenSyls[i] || "";
const syllable = badSyls ? pronounced : written;
- const ipa = wikiIpaSplit[i]!;
+ const ipa = ipaSyls[i]!;
// TODO insert both??
const notes = pronounced === written ? null : `Pronounced ${pronounced}`;
if (pronounced !== syllable) {
@@ -134,10 +170,10 @@ async function handleIpa(
console.log(pronounced);
console.log(written);
}
- const res = await handleSyllable(syllable, ipa, wordId, i, notes);
- if ("error" in res) return res;
+ const res = getSyllable(syllable, ipa, i, notes);
+ syls.push(res);
}
- return { ok: "" };
+ return syls;
}
const thaiTones: Record<string, string> = {
"˧": "mid",
@@ -153,8 +189,22 @@ const thaiToneNums: Record<string, number> = {
"˦˥": 45,
"˩˩˦": 214,
};
+const toneRegex = new RegExp(Object.keys(thaiToneNums).join("|"));
+
function parseTone(ipa: string, spelling: string): Tone {
try {
+ const match = ipa.match(toneRegex)!;
+ const m = match[0]!;
+ const name = thaiTones[m]!;
+ const numbers = thaiToneNums[m]!;
+ return { letters: ipa, name, numbers };
+ } catch (e) {
+ console.error("meh wrong tones!!", { s: spelling, ipa });
+ throw new Error("");
+ }
+}
+function parseToneS(ipa: string, spelling: string): Tone {
+ try {
const name = thaiTones[ipa]!;
const numbers = thaiToneNums[ipa]!;
return { letters: ipa, name, numbers };
@@ -164,71 +214,44 @@ function parseTone(ipa: string, spelling: string): Tone {
}
}
-async function handleSyllable(
+type SylData = {
+ idx: number;
+ stressed: boolean | null;
+ spelling: string;
+ ipa: string;
+ long: boolean;
+ onset: Phoneme;
+ medial: Phoneme;
+ nucleus: Phoneme;
+ coda: Phoneme;
+ rhyme: Phoneme;
+ tone: Tone;
+ notes: string | null;
+};
+async function getSyllable(
spelling: string,
ipa: string,
- wordId: number | bigint,
idx: number,
notes: string | null,
-): AsyncRes<string> {
+): Promise<SylData> {
const sorsyl = await sorSyl(spelling, "th", ipa);
- const weird = [
- // "a̯n",
- // "a̯",
- // "a̯p",
- // "a̯w",
- // "a̯j",
- // "a̯ŋ",
- // "a̯k",
- // "a̯t",
- // "a̯m",
- // "a̯ʔ",
- // "ʔ",
- "s",
- "l",
- "f",
- "a̯s",
- "js",
- "t͡ɕʰ",
- "ks",
- "ns",
- "a̯l",
- "a̯f",
- "mk",
- ];
- // const weirder = sorsyl.syls.find((s) => weird.includes(s.coda));
- // if (weirder) {
- // console.log("syllable", spelling);
- // // console.dir(sorsyl, { depth: null });
- // // console.dir(j, { depth: null });
- // }
if (sorsyl.syls.length !== 1) throw new Error("wtf sorsyl!");
const syl = sorsyl.syls[0]!.ipa;
- const tone = parseTone(syl.tone, spelling);
- // TODO add actual ortographic data here not just ipa
- try {
- pdb.addSyllable(
- wordId,
- idx + 1,
- null,
- "th",
- syl.all,
- syl.long,
- spelling,
- { spelling: syl.onset, ipa: syl.onset },
- { spelling: syl.medial, ipa: syl.medial },
- { spelling: syl.nucleus, ipa: syl.nucleus },
- { spelling: syl.coda, ipa: syl.coda },
- { spelling: syl.rhyme, ipa: syl.rhyme },
- tone,
- notes,
- );
- return { ok: "" };
- } catch (e) {
- // console.log("well fuck", syl);
- // console.error(e);
- return { error: `meh ${e}` };
- }
+ const tone = parseToneS(syl.tone, spelling);
+ return {
+ idx: idx + 1,
+ stressed: null,
+ spelling,
+ ipa: syl.all,
+ long: syl.long,
+ onset: { spelling: syl.onset, ipa: syl.onset },
+ medial: { spelling: syl.medial, ipa: syl.medial },
+ nucleus: { spelling: syl.nucleus, ipa: syl.nucleus },
+ coda: { spelling: syl.coda, ipa: syl.coda },
+ rhyme: { spelling: syl.rhyme, ipa: syl.rhyme },
+ tone,
+ notes,
+ };
}
async function handleIdiom(idiom: string): AsyncRes<string> {
pdb.addIdiom(idiom, "th");
@@ -236,33 +259,5 @@ async function handleIdiom(idiom: string): AsyncRes<string> {
// console.log();
return { ok: "" };
}
-// ช้า ๆ
-// งก ๆ
-// หงก ๆ
-
-async function getFrequency() {
- const files = [
- "/home/y/code/prosody/resources/langdata/thai/data/1yin_freq.csv",
- "/home/y/code/prosody/resources/langdata/thai/data/2yin_freq.csv",
- "/home/y/code/prosody/resources/langdata/thai/data/3yin_freq.csv",
- "/home/y/code/prosody/resources/langdata/thai/data/4yin_freq.csv",
- "/home/y/code/prosody/resources/langdata/thai/data/5yin_freq.csv",
- "/home/y/code/prosody/resources/langdata/thai/data/6yin_freq.csv",
- ];
- const freqMap = new Map<number, string>();
- for (const file of files) {
- await handleFile(file, (line, idx) => {
- const [spelling, IPA, tone, length, frequency, ...rest] = line.split(",");
- freqMap.set(Number(frequency!), spelling!);
- });
- }
- const orderedMap = new Map<string, number>();
- const keys = Array.from(freqMap.keys()).sort();
- for (let i = 0; i < keys.length; i++) {
- const val = freqMap.get(keys[i]!)!;
- orderedMap.set(val, i + 1);
- }
- return orderedMap;
-}
readDump("th");