diff options
Diffstat (limited to 'src/lib/db')
-rw-r--r-- | src/lib/db/enseed.ts | 85 | ||||
-rw-r--r-- | src/lib/db/prosodydb.ts | 120 | ||||
-rw-r--r-- | src/lib/db/prosodyschema.sql | 1 | ||||
-rw-r--r-- | src/lib/db/thaiseed.ts | 87 |
4 files changed, 235 insertions, 58 deletions
diff --git a/src/lib/db/enseed.ts b/src/lib/db/enseed.ts index 58f5876..39dec44 100644 --- a/src/lib/db/enseed.ts +++ b/src/lib/db/enseed.ts @@ -7,12 +7,15 @@ import { type ThaiNLPRes, sorSyl, getThaiFreq, + SorBSyl, } from "../calls/nlp"; import pdb from "./prosodydb"; import { cleanIpa } from "../utils"; import { handleFile } from "./utils"; import { Tone } from "../types/phonetics"; +import { AsyncRes } from "../types"; +const errors: string[] = []; async function readDump(lang: string) { await pdb.init(); pdb.addLanguage("th", "thai"); @@ -27,14 +30,21 @@ async function readDump(lang: string) { count++; console.log(count); // if (count <= 10000) continue; - if (count > 30) break; + if (count > 300) break; const j = JSON.parse(langrow.data); const word = j.word.trim(); if (!word) continue; const split = word.split(" "); - if (split.length > 1) await handleIdiom(lang, word); - else await handleWord(lang, word, j, freqMap); + const res = + split.length > 1 + ? await handleIdiom(lang, word) + : await handleWord(lang, word, j, freqMap); + if ("error" in res) { + console.error(res.error); + break; + } } + console.dir(errors); } async function handleWord( @@ -42,7 +52,7 @@ async function handleWord( word: string, j: any, freqMap: Map<string, number>, -) { +): AsyncRes<string> { // TODO add categories but add a tag to see what classifying scheme we're using // const sounds = j.sounds || []; @@ -50,9 +60,9 @@ async function handleWord( const hwikiRhyme = sounds.find((s: any) => "rhymes" in s); const wikiRhyme = hwikiRhyme ? hwikiRhyme.rhymes : null; if (!hasIpa) { - console.error("no ipa!!", word); - console.dir(j, { depth: null }); - return; + // console.error("no ipa!!", word); + // console.dir(j, { depth: null }); + return { error: "meh no ipa" }; } const freq = freqMap.get(word) || null; // const wordId = pdb.addWord(word, lang, freq, null); @@ -60,7 +70,11 @@ async function handleWord( const wordId = 0; // console.log(analyzed); for (let snd of sounds) - if ("ipa" in snd) handleIpa(wordId, word, lang, j, snd, wikiRhyme); + if ("ipa" in snd) { + const res = await handleIpa(wordId, word, lang, j, snd, wikiRhyme); + if ("error" in res) return res; + } + return { ok: "" }; } async function handleIpa( wordId: number | bigint, @@ -73,58 +87,65 @@ async function handleIpa( const tags = JSON.stringify(snd.tags) || null; const ipa = snd.ipa; const syls = await sorSyl(word, lang, ipa); + // console.log(syls, "sorsyl"); console.log(word); console.log(ipa); - // pdb.addPronunciation(wordId, ipa, syls.syls.length, tags, null); + pdb.addPronunciation(wordId, ipa, syls.syls.length, tags, null); // set word rhyme - const wordRhyme = syls.syls.reduce((acc: string, item: SorSyl) => { + const wordRhyme = syls.syls.reduce((acc: string, itemm: SorBSyl) => { + const item = itemm.ipa; if (!item.stressed && !acc) return acc; if (item.stressed && !acc) return `${acc}${item.rhyme}`; - else return `${acc}${item.ipa}`; + else return `${acc}${item.all}`; }, ""); if (wordRhyme) pdb.addWordRhyme(wordId, wordRhyme, j.lang_code, wikiRhyme); - // + for (let i = 0; i < syls.syls.length; i++) { const syl = syls.syls[i]!; - await handleSyllable(word, syl.ipa, wordId, i); + const res = await handleSyllable(syl, wordId, i); + if ("error" in res) return res; } + return { ok: "" }; } async function handleSyllable( - spelling: string, - ipa: string, + syl: SorBSyl, wordId: number | bigint, idx: number, -) { - const sorsyl = await sorSyl(spelling, "th", ipa); - if (sorsyl.syls.length !== 1) throw new Error("wtf sorsyl!"); - const syl = sorsyl.syls[0]!; +): AsyncRes<string> { try { pdb.addSyllable( wordId, idx + 1, + syl.ipa.stressed, "th", - syl.ipa, - syl.long, - spelling, - { spelling: syl.onset, ipa: syl.onset }, - { spelling: syl.medial, ipa: syl.medial }, - { spelling: syl.nucleus, ipa: syl.nucleus }, - { spelling: syl.coda, ipa: syl.coda }, - { spelling: syl.rhyme, ipa: syl.rhyme }, + syl.ipa.all, + syl.ipa.long, + syl.spelling.all, + { spelling: syl.spelling.onset, ipa: syl.ipa.onset }, + { spelling: syl.spelling.medial, ipa: syl.ipa.medial }, + { spelling: syl.spelling.nucleus, ipa: syl.ipa.nucleus }, + { spelling: syl.spelling.coda, ipa: syl.ipa.coda }, + { spelling: syl.spelling.rhyme, ipa: syl.ipa.rhyme }, { letters: "", numbers: 0, name: "" }, null, ); + return { ok: "" }; } catch (e) { // console.log("well fuck", syl); // console.error(e); - console.log(); + return { error: `${e}` }; } } -async function handleIdiom(lang: string, idiom: string) { - pdb.addIdiom(idiom, lang); - // TODO later set idiom_words once all words are populated - // console.log(); +async function handleIdiom(lang: string, idiom: string): AsyncRes<string> { + try { + pdb.addIdiom(idiom, lang); + // TODO later set idiom_words once all words are populated + // console.log(); + return { ok: "" }; + } catch (e) { + return { error: `${e}` }; + } } // ช้า ๆ // งก ๆ diff --git a/src/lib/db/prosodydb.ts b/src/lib/db/prosodydb.ts index 9e76b8d..d6da389 100644 --- a/src/lib/db/prosodydb.ts +++ b/src/lib/db/prosodydb.ts @@ -1,12 +1,14 @@ import Database from "bun:sqlite"; import { Phoneme, Tone } from "../types/phonetics"; +import { ProsodyWord, ProsodyWordDB } from "../types/cards"; type Str = string | null; type ItemType = "word" | "syllable" | "idiom"; class DatabaseHandler { db: Database; constructor() { - const dbPath = "/home/y/code/bun/ssr/waku/bulkdata/phon.db"; + // const dbPath = "/home/y/code/bun/ssr/waku/bulkdata/phon.db"; + const dbPath = "/home/y/code/bun/ssr/waku/bulkdata/thaiphon.db"; const db = new Database(dbPath, { create: true }); db.exec("PRAGMA journal_mode = WAL"); // Enable Write-Ahead Logging for better performance db.exec("PRAGMA foreign_keys = ON"); @@ -18,12 +20,123 @@ class DatabaseHandler { this.db.exec(sql); } // selects + fetchFrequent(lang: string) { + const query = this.db.query( + `SELECT + w.id, + w.spelling, + w.lang, + w.frequency, + w.lang, + wp.ipa, + wp.syllables, + wp.tag, + w.notes, + (SELECT + json_group_array(json_object( + 'ipa', s.ipa, + 'spelling', s.text, + 'long', s.long, + 'notes', s.notes, + 'onseto', os.text, + 'onset', os.ipa, + 'nucleuso', ns.text, + 'nucleus', ns.ipa, + 'codao', co.text, + 'coda', co.ipa, + 'rhymeo', rh.text, + 'rhyme', rh.ipa, + 'tonen', tns.name, + 'tonenm', tns.nums, + 'tone', tns.ipa + ) + ) + FROM syllables s + JOIN onsets os ON os.id = s.onset + JOIN nucleus ns ON ns.id = s.nucleus + JOIN codas co ON co.id = s.coda + JOIN rhymes rh ON rh.id = s.rhyme + JOIN tones tns ON tns.id = s.tone + WHERE s.id= sw.syl_id + ) as syllables + FROM words w + JOIN word_phonetics wp ON wp.word_id = w.id + JOIN syllables_words sw ON sw.word_id = w.id + WHERE w.frequency IS NOT NULL + AND w.lang = ? + ORDER BY w.frequency ASC + LIMIT 300 + `, + ); + return query.all(lang) as ProsodyWordDB[]; + } fetchWords(words: string[]) { const query = this.db.query( `SELECT id FROM words where spelling IN (${words.map((w) => `'${w}'`).join(", ")})`, ); return query.all() as Array<{ id: number }>; } + fetchSyllables(words: string[]) { + const query = this.db.query( + `SELECT id FROM words where spelling IN (${words.map((w) => `'${w}'`).join(", ")})`, + ); + return query.all() as Array<{ id: number }>; + } + fetchOnsets(onset: string) { + const query = this.db.query( + `SELECT + w.id, + w.spelling, + w.frequency, + wp.ipa + FROM words w + JOIN word_phonetics wp ON wp.word_id = w.id + JOIN syllables_words sw ON sw.word_id = w.id + JOIN syllables s ON s.id = sw.syl_id + JOIN onsets os ON os.id = syl.onset + `, + ); + return query.all(onset) as any[]; + } + // tones + fetchWordsByToneAndSyls(tones: Array<string | null>) { + const toneString = tones.reduce((acc: string, item) => { + if (!item) return `${acc},%`; + else return `${acc},${item}`; + }, ""); + console.log({ toneString }); + const query = this.db.query( + ` + WITH word_tone_sequences AS ( + SELECT + w.id as word_id, + w.spelling, + wp.ipa, + w.frequency, + GROUP_CONCAT(t.name ORDER BY sw.idx) as tone_sequence, + COUNT(sw.syl_id) as syllable_count + FROM words w + JOIN word_phonetics wp ON w.id = wp.word_id + JOIN syllables_words sw ON w.id = sw.word_id + JOIN syllables s ON sw.syl_id = s.id + JOIN tones t ON s.tone = t.id + GROUP BY w.id, w.spelling, w.lang, w.frequency + ) + SELECT + word_id, + spelling, + ipa, + frequency, + tone_sequence, + syllable_count + FROM word_tone_sequences + WHERE tone_sequence LIKE ? + AND syllable_count = ? + ORDER BY frequency DESC NULLS LAST; + `, + ); + return query.all(toneString.slice(1), tones.length) as any[]; + } // inserts addLanguage(code: string, name: string) { @@ -109,6 +222,7 @@ class DatabaseHandler { addSyllable( wordId: number | bigint, sylIdx: number, + stressed: boolean | null, lang: string, ipa: string, long: boolean, @@ -197,9 +311,9 @@ class DatabaseHandler { // const res1 = this.db .query( - `INSERT INTO syllables_words(syl_id, word_id, idx) VALUES(?, ?, ?)`, + `INSERT INTO syllables_words(syl_id, word_id, idx, stressed) VALUES(?, ?, ?, ?)`, ) - .run(sylId, wordId, sylIdx); + .run(sylId, wordId, sylIdx, stressed); // return sylId; }); diff --git a/src/lib/db/prosodyschema.sql b/src/lib/db/prosodyschema.sql index c962d83..c6a04fa 100644 --- a/src/lib/db/prosodyschema.sql +++ b/src/lib/db/prosodyschema.sql @@ -130,6 +130,7 @@ CREATE TABLE IF NOT EXISTS syllables_words( syl_id INTEGER NOT NULL, word_id INTEGER NOT NULL, idx INTEGER NOT NULL, + stressed INTEGER, FOREIGN KEY (syl_id) REFERENCES syllables(id), FOREIGN KEY (word_id) REFERENCES words(id) ); diff --git a/src/lib/db/thaiseed.ts b/src/lib/db/thaiseed.ts index 5c75345..6c69d9c 100644 --- a/src/lib/db/thaiseed.ts +++ b/src/lib/db/thaiseed.ts @@ -12,6 +12,7 @@ import pdb from "./prosodydb"; import { cleanIpa } from "../utils"; import { handleFile } from "./utils"; import { Tone } from "../types/phonetics"; +import { AsyncRes } from "../types"; async function readDump(lang: string) { await pdb.init(); @@ -30,38 +31,77 @@ async function readDump(lang: string) { const j = JSON.parse(langrow.data); const word = j.word.trim(); if (!word) continue; - if (word.includes("ๆ")) await handleWord(word, j); - else { + + if (word.includes("ๆ")) { + const res = await handleWord(word, j); + if ("error" in res) { + if (res.error.includes("meh")) continue; + if (res.error.includes("wtf")) { + console.error(res.error); + console.error(j.sounds); + } + break; + } + } else { const split = word.split(" "); - if (split.length > 1) await handleIdiom(word); - else await handleWord(word, j); + if (split.length > 1) { + const res = await handleIdiom(word); + if ("error" in res) { + console.error(res.error); + break; + } + } else { + const res = await handleWord(word, j); + if ("error" in res) { + if (res.error.includes("meh")) continue; + if (res.error.includes("wtf")) { + console.error(res.error); + console.error(j.sounds); + } + // break; + } + } } } } -async function handleWord(word: string, j: any) { +async function handleWord(word: string, j: any): AsyncRes<string> { // TODO add categories but add a tag to see what classifying scheme we're using // const sounds = j.sounds || []; const hasIpa = sounds.find((s: any) => "ipa" in s); - if (!hasIpa) return; + if (!hasIpa) return { error: "meh no ipa" }; const freq = await getThaiFreq(word); const wordId = pdb.addWord(word, "th", freq, null); + if (wordId == 478 || word === "และ") { + console.log("wtf man"); + console.dir(j, { depth: null }); + return { error: "i said wtf" }; + } const analyzed = await analyzeTHWord(word); - for (let snd of sounds) if ("ipa" in snd) handleIpa(wordId, j, snd, analyzed); + for (let snd of sounds) + if ("ipa" in snd) { + const res = await handleIpa(wordId, j, snd, analyzed); + if ("error" in res) return res; + } + return { ok: "" }; } async function handleIpa( wordId: number | bigint, j: any, snd: any, analyzed: ThaiNLPRes, -) { +): AsyncRes<string> { const tags = JSON.stringify(snd.tags) || null; // console.log("handleipa", analyzed.syllables.length); // console.log(analyzed); const wikiIpa = cleanIpa(snd.ipa); const nlpIpa = cleanIpa(analyzed.ipa); const ipa = wikiIpa || nlpIpa; + if (j.word === "และ") { + console.log("wtf!!"); + return { error: "wtf is this" }; + } const wikiIpaSplit = wikiIpa.split("."); const nlpIpaSplit = nlpIpa.split("."); if (wikiIpaSplit.length !== nlpIpaSplit.length) { @@ -73,14 +113,15 @@ async function handleIpa( // console.log("syllable analysis mismatch", j.word); // console.log({ syls: analyzed.syllables, ipa: wikiIpaSplit }); // console.dir(j, { depth: null }); - return; + return { error: "meh syllable analysis mismatch" }; } - pdb.addPronunciation(wordId, ipa, analyzed.syllables.length, tags, null); const writtenSyls = analyzed.syllables; const pronouncedSyls = analyzed.realSyls; let badSyls = false; if (writtenSyls.length !== pronouncedSyls.length) badSyls = true; + pdb.addPronunciation(wordId, ipa, pronouncedSyls.length, tags, null); + for (let i = 0; i < pronouncedSyls.length; i++) { const pronounced = pronouncedSyls[i]!.replace(/\u{E3A}/u, ""); const written = writtenSyls[i] || ""; @@ -93,14 +134,10 @@ async function handleIpa( console.log(pronounced); console.log(written); } - try { - await handleSyllable(syllable, ipa, wordId, i, notes); - } catch (e) { - console.error("syl error", j.word, j.sounds); - console.error({ analyzed, ipa, wikiIpaSplit }); - console.error(e); - } + const res = await handleSyllable(syllable, ipa, wordId, i, notes); + if ("error" in res) return res; } + return { ok: "" }; } const thaiTones: Record<string, string> = { "˧": "mid", @@ -122,7 +159,7 @@ function parseTone(ipa: string, spelling: string): Tone { const numbers = thaiToneNums[ipa]!; return { letters: ipa, name, numbers }; } catch (e) { - console.error("wrong tones!!", { s: spelling, ipa }); + console.error("meh wrong tones!!", { s: spelling, ipa }); throw new Error(""); } } @@ -133,7 +170,7 @@ async function handleSyllable( wordId: number | bigint, idx: number, notes: string | null, -) { +): AsyncRes<string> { const sorsyl = await sorSyl(spelling, "th", ipa); const weird = [ // "a̯n", @@ -166,14 +203,16 @@ async function handleSyllable( // // console.dir(j, { depth: null }); // } if (sorsyl.syls.length !== 1) throw new Error("wtf sorsyl!"); - const syl = sorsyl.syls[0]!; + const syl = sorsyl.syls[0]!.ipa; const tone = parseTone(syl.tone, spelling); + // TODO add actual ortographic data here not just ipa try { pdb.addSyllable( wordId, idx + 1, + null, "th", - syl.ipa, + syl.all, syl.long, spelling, { spelling: syl.onset, ipa: syl.onset }, @@ -184,16 +223,18 @@ async function handleSyllable( tone, notes, ); + return { ok: "" }; } catch (e) { // console.log("well fuck", syl); // console.error(e); - console.log(); + return { error: `meh ${e}` }; } } -async function handleIdiom(idiom: string) { +async function handleIdiom(idiom: string): AsyncRes<string> { pdb.addIdiom(idiom, "th"); // TODO later set idiom_words once all words are populated // console.log(); + return { ok: "" }; } // ช้า ๆ // งก ๆ |