diff options
author | polwex <polwex@sortug.com> | 2025-06-03 19:40:34 +0700 |
---|---|---|
committer | polwex <polwex@sortug.com> | 2025-06-03 19:40:34 +0700 |
commit | b91b758041cbc7b8bf7e2a4aee8d6228a75d8105 (patch) | |
tree | 4fa343ed394034b16841ecfcb6411b1574d24b25 | |
parent | 175ddca375cef765cec8ca5bbc527a205c40bf25 (diff) |
m
-rw-r--r-- | src/lib/calls/nlp.ts | 14 | ||||
-rw-r--r-- | src/lib/db/enseed.ts | 195 | ||||
-rw-r--r-- | src/lib/db/prosodydb.ts | 10 |
3 files changed, 137 insertions, 82 deletions
diff --git a/src/lib/calls/nlp.ts b/src/lib/calls/nlp.ts index 1e84e93..2810744 100644 --- a/src/lib/calls/nlp.ts +++ b/src/lib/calls/nlp.ts @@ -176,3 +176,17 @@ export async function findLemma(word: string, lang: string) { const jj = await r2.json(); return jj; } +export async function charsiuG2P(word: string, lang: string) { + const opts = { + method: "POST", + headers: { + "Content-type": "application/json", + "X-API-KEY": Bun.env.SORTUG_NLP_API_KEY!, + }, + body: JSON.stringify({ string: word, lang }), + }; + // const r1 = await fetch(`http://localhost:8000/segmentate`, opts); + const r2 = await fetch("http://localhost:8105" + `/ipa`, opts); + const jj = await r2.json(); + return jj; +} diff --git a/src/lib/db/enseed.ts b/src/lib/db/enseed.ts index 39dec44..9ef61ed 100644 --- a/src/lib/db/enseed.ts +++ b/src/lib/db/enseed.ts @@ -1,24 +1,15 @@ import Database from "bun:sqlite"; -import { - analyzeTHWord, - deconstructSyllable, - segmentateThai, - type SorSyl, - type ThaiNLPRes, - sorSyl, - getThaiFreq, - SorBSyl, -} from "../calls/nlp"; +import { sorSyl, SorBSyl, charsiuG2P, SorSylRes } from "../calls/nlp"; import pdb from "./prosodydb"; import { cleanIpa } from "../utils"; import { handleFile } from "./utils"; -import { Tone } from "../types/phonetics"; +import { Phoneme, Tone } from "../types/phonetics"; import { AsyncRes } from "../types"; const errors: string[] = []; async function readDump(lang: string) { await pdb.init(); - pdb.addLanguage("th", "thai"); + pdb.addLanguage("en", "english"); let count = 0; const langdb = new Database( `/home/y/code/prosody/resources/wiktionary/${lang}.db`, @@ -37,8 +28,8 @@ async function readDump(lang: string) { const split = word.split(" "); const res = split.length > 1 - ? await handleIdiom(lang, word) - : await handleWord(lang, word, j, freqMap); + ? await handleIdiom(word, lang) + : await handleWord(word, lang, j, freqMap); if ("error" in res) { console.error(res.error); break; @@ -48,50 +39,69 @@ async function readDump(lang: string) { } async function handleWord( - lang: string, word: string, + lang: string, j: any, freqMap: Map<string, number>, ): AsyncRes<string> { - // TODO add categories but add a tag to see what classifying scheme we're using - // + const frequency = freqMap.get(word) || null; + const promises = await getIpa(word, lang, j); + const phonetics = await Promise.all(promises); + + // pdb.superAdd({ word, lang, frequency, wordNotes: null, phonetics }); + return { ok: "" }; +} + +type IPAData = { + ipa: string; + syllable_count: number; + syllable_sequence: string; + tone_sequence: string; + ipa_sequence: string; + tags: string | null; + notes: string | null; + wordRhyme: string | null; + syllables: SylData[]; +}; +async function getIpa( + word: string, + lang: string, + j: any, +): Promise<Promise<IPAData>[]> { const sounds = j.sounds || []; const hasIpa = sounds.find((s: any) => "ipa" in s); - const hwikiRhyme = sounds.find((s: any) => "rhymes" in s); - const wikiRhyme = hwikiRhyme ? hwikiRhyme.rhymes : null; if (!hasIpa) { - // console.error("no ipa!!", word); + console.log("no ipa", word); // console.dir(j, { depth: null }); - return { error: "meh no ipa" }; + console.dir(sounds, { depth: null }); + // TODO fetch from idk charsiu + // const ipa = await charsiuG2P(word, lang); + // console.log("charsiu", ipa); } - const freq = freqMap.get(word) || null; - // const wordId = pdb.addWord(word, lang, freq, null); - // WIPE - const wordId = 0; - // console.log(analyzed); - for (let snd of sounds) - if ("ipa" in snd) { - const res = await handleIpa(wordId, word, lang, j, snd, wikiRhyme); - if ("error" in res) return res; - } - return { ok: "" }; + const hwikiRhyme = sounds.find((s: any) => "rhymes" in s); + const wikiRhyme = hwikiRhyme ? hwikiRhyme.rhymes : null; + const ipaData: Promise<IPAData>[] = sounds.reduce( + (acc: Promise<IPAData>[], snd: any) => { + if ("ipa" in snd) { + const data = getIpaData(word, lang, snd, wikiRhyme); + return [...acc, data]; + } else return acc; + }, + [], + ); + return ipaData; } -async function handleIpa( - wordId: number | bigint, + +async function getIpaData( word: string, lang: string, - j: any, snd: any, wikiRhyme: string | null, -) { +): Promise<IPAData> { + console.log("geting ipa..."); const tags = JSON.stringify(snd.tags) || null; - const ipa = snd.ipa; + const ipa = cleanIpa(snd.ipa); const syls = await sorSyl(word, lang, ipa); - // console.log(syls, "sorsyl"); - - console.log(word); - console.log(ipa); - pdb.addPronunciation(wordId, ipa, syls.syls.length, tags, null); // set word rhyme const wordRhyme = syls.syls.reduce((acc: string, itemm: SorBSyl) => { const item = itemm.ipa; @@ -99,47 +109,76 @@ async function handleIpa( if (item.stressed && !acc) return `${acc}${item.rhyme}`; else return `${acc}${item.all}`; }, ""); - if (wordRhyme) pdb.addWordRhyme(wordId, wordRhyme, j.lang_code, wikiRhyme); + console.log({ word, wikiRhyme, wordRhyme }); - for (let i = 0; i < syls.syls.length; i++) { - const syl = syls.syls[i]!; - const res = await handleSyllable(syl, wordId, i); - if ("error" in res) return res; - } - return { ok: "" }; + const tone_sequence = ""; + const seqs = syls.syls.reduce( + (acc, item, idx) => { + const startString = idx === 0 ? "" : ","; + const { ipa, spelling } = item; + acc.ipa += `${startString}${ipa.all}`; + acc.syls += `${startString}${spelling.all}`; + return acc; + }, + { syls: "", ipa: "" }, + ); + const syllable_sequence = seqs.syls; + const ipa_sequence = seqs.ipa; + const syllables = getSyllables(syls); + return { + ipa, + syllable_count: syls.syls.length, + syllable_sequence, + tone_sequence, + ipa_sequence, + tags, + notes: null, + wordRhyme: null, + syllables, + }; } -async function handleSyllable( - syl: SorBSyl, - wordId: number | bigint, - idx: number, -): AsyncRes<string> { - try { - pdb.addSyllable( - wordId, - idx + 1, - syl.ipa.stressed, - "th", - syl.ipa.all, - syl.ipa.long, - syl.spelling.all, - { spelling: syl.spelling.onset, ipa: syl.ipa.onset }, - { spelling: syl.spelling.medial, ipa: syl.ipa.medial }, - { spelling: syl.spelling.nucleus, ipa: syl.ipa.nucleus }, - { spelling: syl.spelling.coda, ipa: syl.ipa.coda }, - { spelling: syl.spelling.rhyme, ipa: syl.ipa.rhyme }, - { letters: "", numbers: 0, name: "" }, - null, - ); - return { ok: "" }; - } catch (e) { - // console.log("well fuck", syl); - // console.error(e); - return { error: `${e}` }; +type SylData = { + idx: number; + stressed: boolean | null; + spelling: string; + ipa: string; + long: boolean; + onset: Phoneme; + medial: Phoneme; + nucleus: Phoneme; + coda: Phoneme; + rhyme: Phoneme; + tone: Tone; + notes: string | null; +}; +function getSyllables(syl: SorSylRes): SylData[] { + let syls: SylData[] = []; + for (let i = 0; i < syl.syls.length; i++) { + const syllable = syl.syls[i]!; + const res = getSyllable(syllable, i); + syls.push(res); } + return syls; +} +function getSyllable(syl: SorBSyl, idx: number): SylData { + return { + idx: idx + 1, + stressed: null, + spelling: syl.spelling.all, + ipa: syl.ipa.all, + long: syl.ipa.long, + onset: { spelling: syl.spelling.onset, ipa: syl.ipa.onset }, + medial: { spelling: syl.spelling.medial, ipa: syl.ipa.medial }, + nucleus: { spelling: syl.spelling.nucleus, ipa: syl.ipa.nucleus }, + coda: { spelling: syl.spelling.coda, ipa: syl.ipa.coda }, + rhyme: { spelling: syl.spelling.rhyme, ipa: syl.ipa.rhyme }, + tone: { name: "", letters: "", numbers: 0 }, + notes: null, + }; } -async function handleIdiom(lang: string, idiom: string): AsyncRes<string> { +async function handleIdiom(idiom: string, lang: string): AsyncRes<string> { try { - pdb.addIdiom(idiom, lang); + // pdb.addIdiom(idiom, lang); // TODO later set idiom_words once all words are populated // console.log(); return { ok: "" }; diff --git a/src/lib/db/prosodydb.ts b/src/lib/db/prosodydb.ts index 7c067d2..26687a2 100644 --- a/src/lib/db/prosodydb.ts +++ b/src/lib/db/prosodydb.ts @@ -8,10 +8,13 @@ class DatabaseHandler { db: Database; constructor() { // const dbPath = "/home/y/code/bun/ssr/waku/bulkdata/phon.db"; - const dbPath = "/home/y/code/bun/ssr/waku/bulkdata/thaiphon.db"; + const dbPath = "/home/y/code/bun/ssr/waku/bulkdata/enphon.db"; const db = new Database(dbPath, { create: true }); db.exec("PRAGMA journal_mode = WAL"); // Enable Write-Ahead Logging for better performance db.exec("PRAGMA foreign_keys = ON"); + db.exec("PRAGMA cache_size = -8000"); // Increase cache size to 8MB + db.exec("PRAGMA temp_store = MEMORY"); // Store temp tables in memory + db.exec("PRAGMA synchronous = NORMAL"); // Slightly less safe but faster this.db = db; } async init() { @@ -62,9 +65,8 @@ class DatabaseHandler { FROM words w JOIN word_phonetics wp ON wp.word_id = w.id JOIN syllables_words sw ON sw.word_id = w.id - WHERE w.frequency IS NOT NULL - AND w.lang = ? - ORDER BY w.frequency ASC + WHERE w.lang = ? + ORDER BY w.frequency ASC NULLS LAST LIMIT 300 `, ); |