diff options
author | polwex <polwex@sortug.com> | 2025-06-03 01:36:36 +0700 |
---|---|---|
committer | polwex <polwex@sortug.com> | 2025-06-03 01:36:36 +0700 |
commit | 2b80f7950df34f2a160135d7e20220a9b2ec3352 (patch) | |
tree | 0e2aec09b9aba887419e46c4d2fcaf861391eedc | |
parent | 249230c8e0e1bdb8ae4f433262997b84ee274904 (diff) |
got thai working but this is a bit too specific i think
-rw-r--r-- | src/lib/calls/nlp.ts | 1 | ||||
-rw-r--r-- | src/lib/calls/thainlp.ts | 106 | ||||
-rw-r--r-- | src/lib/db/enseed.ts | 151 | ||||
-rw-r--r-- | src/lib/db/prosodydb.ts | 40 | ||||
-rw-r--r-- | src/lib/db/prosodyschema.sql | 12 | ||||
-rw-r--r-- | src/lib/db/thaiseed.ts | 75 |
6 files changed, 330 insertions, 55 deletions
diff --git a/src/lib/calls/nlp.ts b/src/lib/calls/nlp.ts index 3cff415..f3364ac 100644 --- a/src/lib/calls/nlp.ts +++ b/src/lib/calls/nlp.ts @@ -3,6 +3,7 @@ import { SyllableRes } from "../types/cards"; export type ThaiNLPRes = { word: string; normalized: string; + realSyls: string[]; syllables: string[]; syllablesIpa: string[]; ipa: string; diff --git a/src/lib/calls/thainlp.ts b/src/lib/calls/thainlp.ts new file mode 100644 index 0000000..662e984 --- /dev/null +++ b/src/lib/calls/thainlp.ts @@ -0,0 +1,106 @@ +import { SyllableRes } from "../types/cards"; + +export type ThaiNLPRes = { + word: string; + normalized: string; + realSyls: string[]; + syllables: string[]; + syllablesIpa: string[]; + ipa: string; + pos: string; +}; + +export async function thaiData(word: string): Promise<ThaiNLPRes[]> { + const [head, tail] = await Promise.all([ + analyzeTHWord(word), + segmentateThai(word), + ]); + return [head, ...tail]; +} + +export async function analyzeTHWord(word: string): Promise<ThaiNLPRes> { + const opts = { + method: "POST", + headers: { "Content-type": "application/json" }, + body: JSON.stringify({ word }), + }; + const r1 = await fetch("http://localhost:8001" + "/analyze", opts); + // const r2 = await fetch(`http://192.168.1.110:8000/analyze`, opts); + const jj = await r1.json(); + return jj; +} +export async function segmentateThai(sentence: string): Promise<ThaiNLPRes[]> { + const opts = { + method: "POST", + headers: { "Content-type": "application/json" }, + body: JSON.stringify({ word: sentence }), + }; + // const r1 = await fetch(`http://localhost:8000/segmentate`, opts); + const r2 = await fetch("http://localhost:8001" + `/segmentate`, opts); + const jj = await r2.json(); + return jj; +} +export async function getThaiFreq(word: string): Promise<number> { + const opts = { + method: "POST", + headers: { "Content-type": "application/json" }, + body: JSON.stringify({ word }), + }; + // const r1 = await fetch(`http://localhost:8000/segmentate`, opts); + const r2 = await fetch("http://localhost:8001" + `/freq`, opts); + const jj = await r2.json(); + return jj; +} +export async function getThaiNext(word: string): Promise<string[]> { + const opts = { + method: "POST", + headers: { "Content-type": "application/json" }, + body: JSON.stringify({ word }), + }; + // const r1 = await fetch(`http://localhost:8000/segmentate`, opts); + const r2 = await fetch("http://localhost:8001" + `/next`, opts); + const jj = await r2.json(); + return jj; +} + +export async function getThaiPrev(word: string): Promise<string[]> { + const opts = { + method: "POST", + headers: { "Content-type": "application/json" }, + body: JSON.stringify({ word }), + }; + // const r1 = await fetch(`http://localhost:8000/segmentate`, opts); + const r2 = await fetch("http://localhost:8001" + `/prev`, opts); + const jj = await r2.json(); + return jj; +} + +export async function getThaiNext_bi( + word1: string, + word2: string, +): Promise<string[]> { + const opts = { + method: "POST", + headers: { "Content-type": "application/json" }, + body: JSON.stringify({ word1, word2 }), + }; + // const r1 = await fetch(`http://localhost:8000/segmentate`, opts); + const r2 = await fetch("http://localhost:8001" + `/next_bi`, opts); + const jj = await r2.json(); + return jj; +} + +export async function getThaiPrev_bi( + word1: string, + word2: string, +): Promise<string[]> { + const opts = { + method: "POST", + headers: { "Content-type": "application/json" }, + body: JSON.stringify({ word1, word2 }), + }; + // const r1 = await fetch(`http://localhost:8000/segmentate`, opts); + const r2 = await fetch("http://localhost:8001" + `/prev_bi`, opts); + const jj = await r2.json(); + return jj; +} diff --git a/src/lib/db/enseed.ts b/src/lib/db/enseed.ts new file mode 100644 index 0000000..58f5876 --- /dev/null +++ b/src/lib/db/enseed.ts @@ -0,0 +1,151 @@ +import Database from "bun:sqlite"; +import { + analyzeTHWord, + deconstructSyllable, + segmentateThai, + type SorSyl, + type ThaiNLPRes, + sorSyl, + getThaiFreq, +} from "../calls/nlp"; +import pdb from "./prosodydb"; +import { cleanIpa } from "../utils"; +import { handleFile } from "./utils"; +import { Tone } from "../types/phonetics"; + +async function readDump(lang: string) { + await pdb.init(); + pdb.addLanguage("th", "thai"); + let count = 0; + const langdb = new Database( + `/home/y/code/prosody/resources/wiktionary/${lang}.db`, + ); + let langrows: any = langdb.query("SELECT data FROM langs"); + // langrows = langrows.slice(10); + const freqMap = await getFrequency(); + for (const langrow of langrows) { + count++; + console.log(count); + // if (count <= 10000) continue; + if (count > 30) break; + const j = JSON.parse(langrow.data); + const word = j.word.trim(); + if (!word) continue; + const split = word.split(" "); + if (split.length > 1) await handleIdiom(lang, word); + else await handleWord(lang, word, j, freqMap); + } +} + +async function handleWord( + lang: string, + word: string, + j: any, + freqMap: Map<string, number>, +) { + // TODO add categories but add a tag to see what classifying scheme we're using + // + const sounds = j.sounds || []; + const hasIpa = sounds.find((s: any) => "ipa" in s); + const hwikiRhyme = sounds.find((s: any) => "rhymes" in s); + const wikiRhyme = hwikiRhyme ? hwikiRhyme.rhymes : null; + if (!hasIpa) { + console.error("no ipa!!", word); + console.dir(j, { depth: null }); + return; + } + const freq = freqMap.get(word) || null; + // const wordId = pdb.addWord(word, lang, freq, null); + // WIPE + const wordId = 0; + // console.log(analyzed); + for (let snd of sounds) + if ("ipa" in snd) handleIpa(wordId, word, lang, j, snd, wikiRhyme); +} +async function handleIpa( + wordId: number | bigint, + word: string, + lang: string, + j: any, + snd: any, + wikiRhyme: string | null, +) { + const tags = JSON.stringify(snd.tags) || null; + const ipa = snd.ipa; + const syls = await sorSyl(word, lang, ipa); + + console.log(word); + console.log(ipa); + // pdb.addPronunciation(wordId, ipa, syls.syls.length, tags, null); + // set word rhyme + const wordRhyme = syls.syls.reduce((acc: string, item: SorSyl) => { + if (!item.stressed && !acc) return acc; + if (item.stressed && !acc) return `${acc}${item.rhyme}`; + else return `${acc}${item.ipa}`; + }, ""); + if (wordRhyme) pdb.addWordRhyme(wordId, wordRhyme, j.lang_code, wikiRhyme); + // + for (let i = 0; i < syls.syls.length; i++) { + const syl = syls.syls[i]!; + await handleSyllable(word, syl.ipa, wordId, i); + } +} +async function handleSyllable( + spelling: string, + ipa: string, + wordId: number | bigint, + idx: number, +) { + const sorsyl = await sorSyl(spelling, "th", ipa); + if (sorsyl.syls.length !== 1) throw new Error("wtf sorsyl!"); + const syl = sorsyl.syls[0]!; + try { + pdb.addSyllable( + wordId, + idx + 1, + "th", + syl.ipa, + syl.long, + spelling, + { spelling: syl.onset, ipa: syl.onset }, + { spelling: syl.medial, ipa: syl.medial }, + { spelling: syl.nucleus, ipa: syl.nucleus }, + { spelling: syl.coda, ipa: syl.coda }, + { spelling: syl.rhyme, ipa: syl.rhyme }, + { letters: "", numbers: 0, name: "" }, + null, + ); + } catch (e) { + // console.log("well fuck", syl); + // console.error(e); + console.log(); + } +} +async function handleIdiom(lang: string, idiom: string) { + pdb.addIdiom(idiom, lang); + // TODO later set idiom_words once all words are populated + // console.log(); +} +// ช้า ๆ +// งก ๆ +// หงก ๆ + +async function getFrequency() { + const freqMap = new Map<number, string>(); + await handleFile( + "/home/y/code/prosody/hanchu/datasets/unigram_freq.csv", + (line, idx) => { + const [spelling, frequency] = line.split(","); + freqMap.set(Number(frequency!), spelling!); + }, + ); + const orderedMap = new Map<string, number>(); + const keys = Array.from(freqMap.keys()).sort(); + for (let i = 0; i < keys.length; i++) { + const val = freqMap.get(keys[i]!)!; + orderedMap.set(val, i + 1); + } + return orderedMap; +} + +readDump("en"); diff --git a/src/lib/db/prosodydb.ts b/src/lib/db/prosodydb.ts index 1cfb8f0..9e76b8d 100644 --- a/src/lib/db/prosodydb.ts +++ b/src/lib/db/prosodydb.ts @@ -130,7 +130,7 @@ class DatabaseHandler { RETURNING rowid `, ) - .get(onset.ipa, lang, onset.spelling) as number; + .get(onset.ipa, lang, onset.spelling) as { id: number }; const medialId = this.db .query( `INSERT INTO medials(ipa, lang, text) VALUES(?, ?, ?) @@ -139,7 +139,7 @@ class DatabaseHandler { RETURNING rowid `, ) - .get(medial.ipa, lang, medial.spelling) as number; + .get(medial.ipa, lang, medial.spelling) as { id: number }; const nucleusId = this.db .query( `INSERT INTO nucleus(ipa, lang, text) VALUES(?, ?, ?) @@ -148,7 +148,7 @@ class DatabaseHandler { RETURNING rowid `, ) - .get(nucleus.ipa, lang, nucleus.spelling) as number; + .get(nucleus.ipa, lang, nucleus.spelling) as { id: number }; const codaId = this.db .query( `INSERT INTO codas(ipa, lang, text) VALUES(?, ?, ?) @@ -157,7 +157,7 @@ class DatabaseHandler { RETURNING rowid `, ) - .get(coda.ipa, lang, coda.spelling) as number; + .get(coda.ipa, lang, coda.spelling) as { id: number }; const rhymeId = this.db .query( `INSERT INTO rhymes(ipa, lang, text) VALUES(?, ?, ?) @@ -166,7 +166,7 @@ class DatabaseHandler { RETURNING rowid `, ) - .get(rhyme.ipa, lang, rhyme.spelling) as number; + .get(rhyme.ipa, lang, rhyme.spelling) as { id: number }; const toneId = this.db .query( `INSERT INTO tones(ipa, lang, name, nums) VALUES(?, ?, ?, ?) @@ -175,39 +175,25 @@ class DatabaseHandler { RETURNING rowid `, ) - .get(tone.letters, lang, tone.name, tone.numbers) as number; + .get(tone.letters, lang, tone.name, tone.numbers) as { id: number }; const query = this.db.query( - `INSERT INTO syllables(lang, ipa, long, text, onset, medial, nucleus, coda, rhyme, tone, notes) VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + `INSERT INTO syllables(lang, ipa, long, text, onset, medial, nucleus, coda, rhyme, tone, notes) VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, ); - // TODO need a dual structure here for IPA and orto const res = query.run( lang, ipa, long, text, - onsetId, - medialId, - nucleusId, - codaId, - rhymeId, - toneId, + onsetId.id, + medialId.id, + nucleusId.id, + codaId.id, + rhymeId.id, + toneId.id, notes, ); const sylId = res.lastInsertRowid; - const ipaq = this.db.query(` - INSERT INTO syl_ipa(syl_id, ipa, onset, medial, nucleus, coda, rhyme, notes) - VALUES(?, ?, ?, ?, ?, ?, ?, ?)`); - ipaq.run( - sylId, - ipa, - onset.ipa, - medial.ipa, - nucleus.ipa, - coda.ipa, - rhyme.ipa, - null, - ); // const res1 = this.db .query( diff --git a/src/lib/db/prosodyschema.sql b/src/lib/db/prosodyschema.sql index 26818f3..c962d83 100644 --- a/src/lib/db/prosodyschema.sql +++ b/src/lib/db/prosodyschema.sql @@ -144,18 +144,6 @@ CREATE TABLE IF NOT EXISTS words_idioms( -- -CREATE TABLE IF NOT EXISTS syl_ipa( - id INTEGER PRIMARY KEY AUTOINCREMENT, - syl_id INTEGER NOT NULL, - ipa TEXT NOT NULL, - onset TEXT NOT NULL, - medial TEXT NOT NULL, - nucleus TEXT NOT NULL, - rhyme TEXT NOT NULL, - coda TEXT NOT NULL, - notes TEXT, - CONSTRAINT syl_ipa_unique UNIQUE (ipa, syl_id) -); CREATE TABLE IF NOT EXISTS word_phonetics( id INTEGER PRIMARY KEY AUTOINCREMENT, diff --git a/src/lib/db/thaiseed.ts b/src/lib/db/thaiseed.ts index 687f0f3..5c75345 100644 --- a/src/lib/db/thaiseed.ts +++ b/src/lib/db/thaiseed.ts @@ -24,9 +24,9 @@ async function readDump(lang: string) { // langrows = langrows.slice(10); for (const langrow of langrows) { count++; - console.log(count); + // console.log(count); // if (count <= 10000) continue; - // if (count > 30) break; + // if (count > 100) break; const j = JSON.parse(langrow.data); const word = j.word.trim(); if (!word) continue; @@ -48,7 +48,6 @@ async function handleWord(word: string, j: any) { const freq = await getThaiFreq(word); const wordId = pdb.addWord(word, "th", freq, null); const analyzed = await analyzeTHWord(word); - // console.log(analyzed); for (let snd of sounds) if ("ipa" in snd) handleIpa(wordId, j, snd, analyzed); } async function handleIpa( @@ -66,27 +65,39 @@ async function handleIpa( const wikiIpaSplit = wikiIpa.split("."); const nlpIpaSplit = nlpIpa.split("."); if (wikiIpaSplit.length !== nlpIpaSplit.length) { - console.log("ipa mismatch"); - console.log(wikiIpa); - console.log(nlpIpa); - // return; + // console.log("ipa mismatch"); + // console.log(wikiIpa); + // console.log(nlpIpa); } - if (analyzed.syllables.length !== wikiIpaSplit.length) { - console.log("syllable analysis mismatch", j.word); - console.log({ syls: analyzed.syllables, ipa: wikiIpaSplit }); + if (analyzed.realSyls.length !== wikiIpaSplit.length) { + // console.log("syllable analysis mismatch", j.word); + // console.log({ syls: analyzed.syllables, ipa: wikiIpaSplit }); // console.dir(j, { depth: null }); return; } pdb.addPronunciation(wordId, ipa, analyzed.syllables.length, tags, null); + const writtenSyls = analyzed.syllables; + const pronouncedSyls = analyzed.realSyls; + let badSyls = false; + if (writtenSyls.length !== pronouncedSyls.length) badSyls = true; - for (let i = 0; i < analyzed.syllables.length; i++) { - const spelling = analyzed.syllables[i]!; + for (let i = 0; i < pronouncedSyls.length; i++) { + const pronounced = pronouncedSyls[i]!.replace(/\u{E3A}/u, ""); + const written = writtenSyls[i] || ""; + const syllable = badSyls ? pronounced : written; const ipa = wikiIpaSplit[i]!; + // TODO insert both?? + const notes = pronounced === written ? null : `Pronounced ${pronounced}`; + if (pronounced !== syllable) { + console.log("diff"); + console.log(pronounced); + console.log(written); + } try { - await handleSyllable(spelling, ipa, wordId, i); + await handleSyllable(syllable, ipa, wordId, i, notes); } catch (e) { console.error("syl error", j.word, j.sounds); - console.error({ spelling, ipa, wikiIpaSplit }); + console.error({ analyzed, ipa, wikiIpaSplit }); console.error(e); } } @@ -115,16 +126,48 @@ function parseTone(ipa: string, spelling: string): Tone { throw new Error(""); } } + async function handleSyllable( spelling: string, ipa: string, wordId: number | bigint, idx: number, + notes: string | null, ) { const sorsyl = await sorSyl(spelling, "th", ipa); + const weird = [ + // "a̯n", + // "a̯", + // "a̯p", + // "a̯w", + // "a̯j", + // "a̯ŋ", + // "a̯k", + // "a̯t", + // "a̯m", + // "a̯ʔ", + // "ʔ", + "s", + "l", + "f", + "a̯s", + "js", + "t͡ɕʰ", + "ks", + "ns", + "a̯l", + "a̯f", + "mk", + ]; + // const weirder = sorsyl.syls.find((s) => weird.includes(s.coda)); + // if (weirder) { + // console.log("syllable", spelling); + // // console.dir(sorsyl, { depth: null }); + // // console.dir(j, { depth: null }); + // } if (sorsyl.syls.length !== 1) throw new Error("wtf sorsyl!"); const syl = sorsyl.syls[0]!; - const tone = syl.tone ? parseTone(syl.tone, spelling) : null; + const tone = parseTone(syl.tone, spelling); try { pdb.addSyllable( wordId, @@ -139,7 +182,7 @@ async function handleSyllable( { spelling: syl.coda, ipa: syl.coda }, { spelling: syl.rhyme, ipa: syl.rhyme }, tone, - null, + notes, ); } catch (e) { // console.log("well fuck", syl); |