summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author polwex <polwex@sortug.com> 2025-06-03 01:36:36 +0700
committer polwex <polwex@sortug.com> 2025-06-03 01:36:36 +0700
commit 2b80f7950df34f2a160135d7e20220a9b2ec3352 (patch)
tree 0e2aec09b9aba887419e46c4d2fcaf861391eedc
parent 249230c8e0e1bdb8ae4f433262997b84ee274904 (diff)
got thai working but this is a bit too specific i think
-rw-r--r-- src/lib/calls/nlp.ts 1
-rw-r--r-- src/lib/calls/thainlp.ts 106
-rw-r--r-- src/lib/db/enseed.ts 151
-rw-r--r-- src/lib/db/prosodydb.ts 40
-rw-r--r-- src/lib/db/prosodyschema.sql 12
-rw-r--r-- src/lib/db/thaiseed.ts 75
6 files changed, 330 insertions, 55 deletions
diff --git a/src/lib/calls/nlp.ts b/src/lib/calls/nlp.ts
index 3cff415..f3364ac 100644
--- a/src/lib/calls/nlp.ts
+++ b/src/lib/calls/nlp.ts
@@ -3,6 +3,7 @@ import { SyllableRes } from "../types/cards";
export type ThaiNLPRes = {
word: string;
normalized: string;
+ realSyls: string[];
syllables: string[];
syllablesIpa: string[];
ipa: string;
diff --git a/src/lib/calls/thainlp.ts b/src/lib/calls/thainlp.ts
new file mode 100644
index 0000000..662e984
--- /dev/null
+++ b/src/lib/calls/thainlp.ts
@@ -0,0 +1,106 @@
+import { SyllableRes } from "../types/cards";
+
+/**
+ * Shape the helpers in this file declare for a reply from the Thai NLP
+ * service: `analyzeTHWord` resolves to one of these and `segmentateThai`
+ * to an array of them. Nothing here validates the reply at runtime.
+ */
+export type ThaiNLPRes = {
+ word: string;
+ normalized: string;
+ // The Thai seeding script reads this as the pronounced syllables.
+ realSyls: string[];
+ // The Thai seeding script reads this as the written syllables.
+ syllables: string[];
+ // NOTE(review): presumably one IPA string per syllable — confirm against the service.
+ syllablesIpa: string[];
+ // NOTE(review): presumably the whole-word IPA with "." between syllables — confirm.
+ ipa: string;
+ pos: string;
+};
+
+/**
+ * Queries the Thai NLP service for `word` in two ways at once: analysed as a
+ * single word and split by the segmentation endpoint.
+ *
+ * @param word Text sent to both endpoints.
+ * @returns The whole-word analysis first, followed by every segmentation result.
+ */
+export async function thaiData(word: string): Promise<ThaiNLPRes[]> {
+  // Both requests are created before awaiting so they run concurrently.
+  const results = await Promise.all([
+    analyzeTHWord(word),
+    segmentateThai(word),
+  ]);
+  return [results[0], ...results[1]];
+}
+
+/** Base URL of the Thai NLP HTTP service every helper below posts to. */
+const THAI_NLP_URL = "http://localhost:8001";
+
+/**
+ * POSTs `payload` as JSON to `path` on the Thai NLP service and returns the
+ * parsed JSON reply.
+ *
+ * The reply is not validated; `T` is only the shape the caller expects.
+ *
+ * @param path Endpoint path including the leading slash, e.g. "/analyze".
+ * @param payload Object serialised as the JSON request body.
+ * @returns The parsed JSON body of the response.
+ * @throws Error when the service answers with a non-2xx status, so an error
+ *   page is never handed back to callers as if it were data.
+ */
+async function postThaiNlp<T>(
+  path: string,
+  payload: Record<string, string>,
+): Promise<T> {
+  const res = await fetch(THAI_NLP_URL + path, {
+    method: "POST",
+    headers: { "Content-type": "application/json" },
+    body: JSON.stringify(payload),
+  });
+  if (!res.ok) {
+    throw new Error(
+      `Thai NLP request ${path} failed: ${res.status} ${res.statusText}`,
+    );
+  }
+  return (await res.json()) as T;
+}
+
+/**
+ * Sends one word to the `/analyze` endpoint.
+ *
+ * @param word Word to analyse; sent as `{ word }`.
+ * @returns The service's reply, typed as a single `ThaiNLPRes`.
+ */
+export async function analyzeTHWord(word: string): Promise<ThaiNLPRes> {
+  return postThaiNlp<ThaiNLPRes>("/analyze", { word });
+}
+
+/**
+ * Sends text to the `/segmentate` endpoint.
+ *
+ * @param sentence Text to segment; sent under the `word` key, as the endpoint expects.
+ * @returns The service's reply, typed as a list of `ThaiNLPRes`.
+ */
+export async function segmentateThai(sentence: string): Promise<ThaiNLPRes[]> {
+  return postThaiNlp<ThaiNLPRes[]>("/segmentate", { word: sentence });
+}
+
+/**
+ * Asks the `/freq` endpoint about one word.
+ *
+ * @param word Word to look up; sent as `{ word }`.
+ * @returns The number the service replies with.
+ */
+export async function getThaiFreq(word: string): Promise<number> {
+  return postThaiNlp<number>("/freq", { word });
+}
+
+/**
+ * Queries the `/next` endpoint for one word.
+ *
+ * @param word Word to look up; sent as `{ word }`.
+ * @returns The list of strings the service replies with.
+ */
+export async function getThaiNext(word: string): Promise<string[]> {
+  return postThaiNlp<string[]>("/next", { word });
+}
+
+/**
+ * Queries the `/prev` endpoint for one word.
+ *
+ * @param word Word to look up; sent as `{ word }`.
+ * @returns The list of strings the service replies with.
+ */
+export async function getThaiPrev(word: string): Promise<string[]> {
+  return postThaiNlp<string[]>("/prev", { word });
+}
+
+/**
+ * Queries the `/next_bi` endpoint for a pair of words.
+ *
+ * @param word1 First word of the pair; sent as `word1`.
+ * @param word2 Second word of the pair; sent as `word2`.
+ * @returns The list of strings the service replies with.
+ */
+export async function getThaiNext_bi(
+  word1: string,
+  word2: string,
+): Promise<string[]> {
+  return postThaiNlp<string[]>("/next_bi", { word1, word2 });
+}
+
+/**
+ * Queries the `/prev_bi` endpoint for a pair of words.
+ *
+ * @param word1 First word of the pair; sent as `word1`.
+ * @param word2 Second word of the pair; sent as `word2`.
+ * @returns The list of strings the service replies with.
+ */
+export async function getThaiPrev_bi(
+  word1: string,
+  word2: string,
+): Promise<string[]> {
+  return postThaiNlp<string[]>("/prev_bi", { word1, word2 });
+}
diff --git a/src/lib/db/enseed.ts b/src/lib/db/enseed.ts
new file mode 100644
index 0000000..58f5876
--- /dev/null
+++ b/src/lib/db/enseed.ts
@@ -0,0 +1,151 @@
+import Database from "bun:sqlite";
+import {
+ analyzeTHWord,
+ deconstructSyllable,
+ segmentateThai,
+ type SorSyl,
+ type ThaiNLPRes,
+ sorSyl,
+ getThaiFreq,
+} from "../calls/nlp";
+import pdb from "./prosodydb";
+import { cleanIpa } from "../utils";
+import { handleFile } from "./utils";
+import { Tone } from "../types/phonetics";
+
+/**
+ * Walks the wiktionary dump for `lang` and seeds the prosody database from it.
+ *
+ * Each row's `data` column is parsed as JSON. Entries whose word contains a
+ * space go to `handleIdiom`; all others go to `handleWord`. Only the first
+ * 30 rows are processed: the 31st iteration logs its count and stops.
+ *
+ * @param lang Language code; selects the `<lang>.db` dump file and is passed
+ *   on to the handlers.
+ */
+async function readDump(lang: string) {
+  await pdb.init();
+  // NOTE(review): this registers "th"/"thai" whatever `lang` is — confirm
+  // whether the language being seeded should be registered here instead.
+  pdb.addLanguage("th", "thai");
+  const dumpPath = `/home/y/code/prosody/resources/wiktionary/${lang}.db`;
+  const dump = new Database(dumpPath);
+  const rows: any = dump.query("SELECT data FROM langs");
+  const ranks = await getFrequency();
+  let seen = 0;
+  for (const row of rows) {
+    seen += 1;
+    console.log(seen);
+    if (seen > 30) break;
+    const entry = JSON.parse(row.data);
+    const word = entry.word.trim();
+    if (!word) continue;
+    const isMultiWord = word.split(" ").length > 1;
+    if (isMultiWord) {
+      await handleIdiom(lang, word);
+    } else {
+      await handleWord(lang, word, entry, ranks);
+    }
+  }
+}
+
+/**
+ * Processes one single-word wiktionary entry: every pronunciation that
+ * carries an `ipa` field is passed to `handleIpa`.
+ *
+ * Entries with no IPA at all are logged and skipped. Each `handleIpa` call is
+ * awaited, so an entry's pronunciations are fully handled (and any failure
+ * surfaces to the caller) before this function returns.
+ *
+ * @param lang Language code passed through to `handleIpa`.
+ * @param word The entry's headword.
+ * @param j The parsed wiktionary entry; only `j.sounds` is read here.
+ * @param freqMap Spelling → rank lookup.
+ */
+async function handleWord(
+  lang: string,
+  word: string,
+  j: any,
+  freqMap: Map<string, number>,
+) {
+  // TODO add categories but add a tag to see what classifying scheme we're using
+  const sounds = j.sounds || [];
+  const hasIpa = sounds.some((s: any) => "ipa" in s);
+  const hwikiRhyme = sounds.find((s: any) => "rhymes" in s);
+  const wikiRhyme = hwikiRhyme ? hwikiRhyme.rhymes : null;
+  if (!hasIpa) {
+    console.error("no ipa!!", word);
+    console.dir(j, { depth: null });
+    return;
+  }
+  // Word insertion is disabled for now, so `freq` is not yet consumed and a
+  // placeholder id of 0 is used.
+  const freq = freqMap.get(word) || null;
+  // const wordId = pdb.addWord(word, lang, freq, null);
+  const wordId = 0;
+  for (const snd of sounds) {
+    if ("ipa" in snd) await handleIpa(wordId, word, lang, j, snd, wikiRhyme);
+  }
+}
+/**
+ * Handles one pronunciation of a word: syllabifies its IPA with `sorSyl`,
+ * stores a word-level rhyme when one can be built, then stores each syllable.
+ *
+ * The word rhyme starts at the first stressed syllable's `rhyme` and then
+ * appends the full `ipa` of every syllable after it; syllables before the
+ * first stressed one contribute nothing.
+ *
+ * @param wordId Id the rhyme and syllables are attached to.
+ * @param word The headword; also passed as the spelling of every syllable.
+ * @param lang Language code passed to `sorSyl`.
+ * @param j The parsed wiktionary entry; `j.lang_code` is stored with the rhyme.
+ * @param snd The pronunciation object; `snd.ipa` and `snd.tags` are read.
+ * @param wikiRhyme Rhyme taken from the entry, or null.
+ */
+async function handleIpa(
+  wordId: number | bigint,
+  word: string,
+  lang: string,
+  j: any,
+  snd: any,
+  wikiRhyme: string | null,
+) {
+  const tags = JSON.stringify(snd.tags) || null;
+  const ipa = snd.ipa;
+  const syls = await sorSyl(word, lang, ipa);
+
+  console.log(word);
+  console.log(ipa);
+  // pdb.addPronunciation(wordId, ipa, syls.syls.length, tags, null);
+  const parts: SorSyl[] = syls.syls;
+  let wordRhyme = "";
+  for (const part of parts) {
+    if (wordRhyme) {
+      // Past the rhyme's start: every later syllable is appended whole.
+      wordRhyme = `${wordRhyme}${part.ipa}`;
+    } else if (part.stressed) {
+      // The first stressed syllable contributes only its rhyme.
+      wordRhyme = `${wordRhyme}${part.rhyme}`;
+    }
+  }
+  if (wordRhyme) pdb.addWordRhyme(wordId, wordRhyme, j.lang_code, wikiRhyme);
+
+  for (const [position, part] of parts.entries()) {
+    await handleSyllable(word, part.ipa, wordId, position);
+  }
+}
+/**
+ * Re-analyses a single syllable's IPA with `sorSyl` and stores it through
+ * `pdb.addSyllable` at 1-based position `idx + 1`.
+ *
+ * Each component (onset, medial, nucleus, coda, rhyme) is stored with the
+ * same string as both spelling and IPA, and the tone is an empty placeholder.
+ * A failed insert is logged with its context and does not abort the run.
+ *
+ * NOTE(review): the language is hard-coded to "th" for both `sorSyl` and
+ * `addSyllable` — confirm this is intended in this seeding script.
+ *
+ * @param spelling Text stored as the syllable's spelling.
+ * @param ipa IPA of the single syllable.
+ * @param wordId Id of the word the syllable belongs to.
+ * @param idx Zero-based index of the syllable within the word.
+ * @throws Error when `sorSyl` does not return exactly one syllable.
+ */
+async function handleSyllable(
+  spelling: string,
+  ipa: string,
+  wordId: number | bigint,
+  idx: number,
+) {
+  const sorsyl = await sorSyl(spelling, "th", ipa);
+  if (sorsyl.syls.length !== 1) throw new Error("wtf sorsyl!");
+  const syl = sorsyl.syls[0]!;
+  try {
+    pdb.addSyllable(
+      wordId,
+      idx + 1,
+      "th",
+      syl.ipa,
+      syl.long,
+      spelling,
+      { spelling: syl.onset, ipa: syl.onset },
+      { spelling: syl.medial, ipa: syl.medial },
+      { spelling: syl.nucleus, ipa: syl.nucleus },
+      { spelling: syl.coda, ipa: syl.coda },
+      { spelling: syl.rhyme, ipa: syl.rhyme },
+      { letters: "", numbers: 0, name: "" },
+      null,
+    );
+  } catch (e) {
+    // Still best-effort, but the failure is reported instead of printing a
+    // blank line, so bad rows can be traced.
+    console.error("syllable insert failed", { spelling, ipa, wordId, idx });
+    console.error(e);
+  }
+}
+/**
+ * Stores a multi-word entry as an idiom.
+ *
+ * @param lang Language code stored with the idiom.
+ * @param idiom The full multi-word text.
+ */
+async function handleIdiom(lang: string, idiom: string): Promise<void> {
+  // TODO later set idiom_words once all words are populated
+  pdb.addIdiom(idiom, lang);
+}
+// ช้า ๆ
+// งก ๆ
+// หงก ๆ
+
+/**
+ * Reads the unigram frequency CSV (`spelling,frequency` per line) and turns
+ * it into a spelling → rank lookup.
+ *
+ * Rows are ordered by numeric frequency, ascending, and ranked from 1. Rows
+ * with equal frequency keep their file order and get consecutive ranks.
+ * Lines with no spelling or a non-numeric frequency (for example a header
+ * row) are skipped.
+ *
+ * NOTE(review): ascending order gives rank 1 to the lowest frequency, which
+ * matches the direction of the previous sort — confirm that is the intent.
+ *
+ * @returns Map from spelling to its 1-based rank.
+ */
+async function getFrequency() {
+  // Collect pairs rather than keying a map by frequency, so words that share
+  // a frequency do not overwrite one another.
+  const entries: Array<[string, number]> = [];
+  await handleFile(
+    "/home/y/code/prosody/hanchu/datasets/unigram_freq.csv",
+    (line) => {
+      const [spelling, frequency] = line.split(",");
+      const count = Number(frequency);
+      if (!spelling || !Number.isFinite(count)) return;
+      entries.push([spelling, count]);
+    },
+  );
+  // An explicit comparator is required: the default sort compares numbers as
+  // strings, which would place 10 before 9.
+  entries.sort((a, b) => a[1] - b[1]);
+  const orderedMap = new Map<string, number>();
+  entries.forEach(([spelling], i) => {
+    orderedMap.set(spelling, i + 1);
+  });
+  return orderedMap;
+}
+
+// Script entry point: seed from the English dump. The promise is deliberately
+// not awaited; `void` marks that as intentional.
+void readDump("en");
diff --git a/src/lib/db/prosodydb.ts b/src/lib/db/prosodydb.ts
index 1cfb8f0..9e76b8d 100644
--- a/src/lib/db/prosodydb.ts
+++ b/src/lib/db/prosodydb.ts
@@ -130,7 +130,7 @@ class DatabaseHandler {
RETURNING rowid
`,
)
- .get(onset.ipa, lang, onset.spelling) as number;
+ .get(onset.ipa, lang, onset.spelling) as { id: number };
const medialId = this.db
.query(
`INSERT INTO medials(ipa, lang, text) VALUES(?, ?, ?)
@@ -139,7 +139,7 @@ class DatabaseHandler {
RETURNING rowid
`,
)
- .get(medial.ipa, lang, medial.spelling) as number;
+ .get(medial.ipa, lang, medial.spelling) as { id: number };
const nucleusId = this.db
.query(
`INSERT INTO nucleus(ipa, lang, text) VALUES(?, ?, ?)
@@ -148,7 +148,7 @@ class DatabaseHandler {
RETURNING rowid
`,
)
- .get(nucleus.ipa, lang, nucleus.spelling) as number;
+ .get(nucleus.ipa, lang, nucleus.spelling) as { id: number };
const codaId = this.db
.query(
`INSERT INTO codas(ipa, lang, text) VALUES(?, ?, ?)
@@ -157,7 +157,7 @@ class DatabaseHandler {
RETURNING rowid
`,
)
- .get(coda.ipa, lang, coda.spelling) as number;
+ .get(coda.ipa, lang, coda.spelling) as { id: number };
const rhymeId = this.db
.query(
`INSERT INTO rhymes(ipa, lang, text) VALUES(?, ?, ?)
@@ -166,7 +166,7 @@ class DatabaseHandler {
RETURNING rowid
`,
)
- .get(rhyme.ipa, lang, rhyme.spelling) as number;
+ .get(rhyme.ipa, lang, rhyme.spelling) as { id: number };
const toneId = this.db
.query(
`INSERT INTO tones(ipa, lang, name, nums) VALUES(?, ?, ?, ?)
@@ -175,39 +175,25 @@ class DatabaseHandler {
RETURNING rowid
`,
)
- .get(tone.letters, lang, tone.name, tone.numbers) as number;
+ .get(tone.letters, lang, tone.name, tone.numbers) as { id: number };
const query = this.db.query(
- `INSERT INTO syllables(lang, ipa, long, text, onset, medial, nucleus, coda, rhyme, tone, notes) VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
+ `INSERT INTO syllables(lang, ipa, long, text, onset, medial, nucleus, coda, rhyme, tone, notes) VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
);
- // TODO need a dual structure here for IPA and orto
const res = query.run(
lang,
ipa,
long,
text,
- onsetId,
- medialId,
- nucleusId,
- codaId,
- rhymeId,
- toneId,
+ onsetId.id,
+ medialId.id,
+ nucleusId.id,
+ codaId.id,
+ rhymeId.id,
+ toneId.id,
notes,
);
const sylId = res.lastInsertRowid;
- const ipaq = this.db.query(`
- INSERT INTO syl_ipa(syl_id, ipa, onset, medial, nucleus, coda, rhyme, notes)
- VALUES(?, ?, ?, ?, ?, ?, ?, ?)`);
- ipaq.run(
- sylId,
- ipa,
- onset.ipa,
- medial.ipa,
- nucleus.ipa,
- coda.ipa,
- rhyme.ipa,
- null,
- );
//
const res1 = this.db
.query(
diff --git a/src/lib/db/prosodyschema.sql b/src/lib/db/prosodyschema.sql
index 26818f3..c962d83 100644
--- a/src/lib/db/prosodyschema.sql
+++ b/src/lib/db/prosodyschema.sql
@@ -144,18 +144,6 @@ CREATE TABLE IF NOT EXISTS words_idioms(
--
-CREATE TABLE IF NOT EXISTS syl_ipa(
- id INTEGER PRIMARY KEY AUTOINCREMENT,
- syl_id INTEGER NOT NULL,
- ipa TEXT NOT NULL,
- onset TEXT NOT NULL,
- medial TEXT NOT NULL,
- nucleus TEXT NOT NULL,
- rhyme TEXT NOT NULL,
- coda TEXT NOT NULL,
- notes TEXT,
- CONSTRAINT syl_ipa_unique UNIQUE (ipa, syl_id)
-);
CREATE TABLE IF NOT EXISTS word_phonetics(
id INTEGER PRIMARY KEY AUTOINCREMENT,
diff --git a/src/lib/db/thaiseed.ts b/src/lib/db/thaiseed.ts
index 687f0f3..5c75345 100644
--- a/src/lib/db/thaiseed.ts
+++ b/src/lib/db/thaiseed.ts
@@ -24,9 +24,9 @@ async function readDump(lang: string) {
// langrows = langrows.slice(10);
for (const langrow of langrows) {
count++;
- console.log(count);
+ // console.log(count);
// if (count <= 10000) continue;
- // if (count > 30) break;
+ // if (count > 100) break;
const j = JSON.parse(langrow.data);
const word = j.word.trim();
if (!word) continue;
@@ -48,7 +48,6 @@ async function handleWord(word: string, j: any) {
const freq = await getThaiFreq(word);
const wordId = pdb.addWord(word, "th", freq, null);
const analyzed = await analyzeTHWord(word);
- // console.log(analyzed);
for (let snd of sounds) if ("ipa" in snd) handleIpa(wordId, j, snd, analyzed);
}
async function handleIpa(
@@ -66,27 +65,39 @@ async function handleIpa(
const wikiIpaSplit = wikiIpa.split(".");
const nlpIpaSplit = nlpIpa.split(".");
if (wikiIpaSplit.length !== nlpIpaSplit.length) {
- console.log("ipa mismatch");
- console.log(wikiIpa);
- console.log(nlpIpa);
- // return;
+ // console.log("ipa mismatch");
+ // console.log(wikiIpa);
+ // console.log(nlpIpa);
}
- if (analyzed.syllables.length !== wikiIpaSplit.length) {
- console.log("syllable analysis mismatch", j.word);
- console.log({ syls: analyzed.syllables, ipa: wikiIpaSplit });
+ if (analyzed.realSyls.length !== wikiIpaSplit.length) {
+ // console.log("syllable analysis mismatch", j.word);
+ // console.log({ syls: analyzed.syllables, ipa: wikiIpaSplit });
// console.dir(j, { depth: null });
return;
}
pdb.addPronunciation(wordId, ipa, analyzed.syllables.length, tags, null);
+ const writtenSyls = analyzed.syllables;
+ const pronouncedSyls = analyzed.realSyls;
+ let badSyls = false;
+ if (writtenSyls.length !== pronouncedSyls.length) badSyls = true;
- for (let i = 0; i < analyzed.syllables.length; i++) {
- const spelling = analyzed.syllables[i]!;
+ for (let i = 0; i < pronouncedSyls.length; i++) {
+ const pronounced = pronouncedSyls[i]!.replace(/\u{E3A}/u, "");
+ const written = writtenSyls[i] || "";
+ const syllable = badSyls ? pronounced : written;
const ipa = wikiIpaSplit[i]!;
+ // TODO insert both??
+ const notes = pronounced === written ? null : `Pronounced ${pronounced}`;
+ if (pronounced !== syllable) {
+ console.log("diff");
+ console.log(pronounced);
+ console.log(written);
+ }
try {
- await handleSyllable(spelling, ipa, wordId, i);
+ await handleSyllable(syllable, ipa, wordId, i, notes);
} catch (e) {
console.error("syl error", j.word, j.sounds);
- console.error({ spelling, ipa, wikiIpaSplit });
+ console.error({ analyzed, ipa, wikiIpaSplit });
console.error(e);
}
}
@@ -115,16 +126,48 @@ function parseTone(ipa: string, spelling: string): Tone {
throw new Error("");
}
}
+
async function handleSyllable(
spelling: string,
ipa: string,
wordId: number | bigint,
idx: number,
+ notes: string | null,
) {
const sorsyl = await sorSyl(spelling, "th", ipa);
+ const weird = [
+ // "a̯n",
+ // "a̯",
+ // "a̯p",
+ // "a̯w",
+ // "a̯j",
+ // "a̯ŋ",
+ // "a̯k",
+ // "a̯t",
+ // "a̯m",
+ // "a̯ʔ",
+ // "ʔ",
+ "s",
+ "l",
+ "f",
+ "a̯s",
+ "js",
+ "t͡ɕʰ",
+ "ks",
+ "ns",
+ "a̯l",
+ "a̯f",
+ "mk",
+ ];
+ // const weirder = sorsyl.syls.find((s) => weird.includes(s.coda));
+ // if (weirder) {
+ // console.log("syllable", spelling);
+ // // console.dir(sorsyl, { depth: null });
+ // // console.dir(j, { depth: null });
+ // }
if (sorsyl.syls.length !== 1) throw new Error("wtf sorsyl!");
const syl = sorsyl.syls[0]!;
- const tone = syl.tone ? parseTone(syl.tone, spelling) : null;
+ const tone = parseTone(syl.tone, spelling);
try {
pdb.addSyllable(
wordId,
@@ -139,7 +182,7 @@ async function handleSyllable(
{ spelling: syl.coda, ipa: syl.coda },
{ spelling: syl.rhyme, ipa: syl.rhyme },
tone,
- null,
+ notes,
);
} catch (e) {
// console.log("well fuck", syl);