summary | refs | log | tree | commit | diff
path: root/src/lib/db
diff options
context:
space:
mode:
author: polwex <polwex@sortug.com> 2025-06-03 09:34:29 +0700
committer: polwex <polwex@sortug.com> 2025-06-03 09:34:29 +0700
commit: 2401217a4019938d1c1cc61b6e33ccb233eb6e74 (patch)
tree: 06118284965be5cfd6b417dca86d46db5758217b /src/lib/db
parent: 2b80f7950df34f2a160135d7e20220a9b2ec3352 (diff)
this is golden thanks claude
Diffstat (limited to 'src/lib/db')
-rw-r--r-- src/lib/db/enseed.ts 85
-rw-r--r-- src/lib/db/prosodydb.ts 120
-rw-r--r-- src/lib/db/prosodyschema.sql 1
-rw-r--r-- src/lib/db/thaiseed.ts 87
4 files changed, 235 insertions, 58 deletions
diff --git a/src/lib/db/enseed.ts b/src/lib/db/enseed.ts
index 58f5876..39dec44 100644
--- a/src/lib/db/enseed.ts
+++ b/src/lib/db/enseed.ts
@@ -7,12 +7,15 @@ import {
type ThaiNLPRes,
sorSyl,
getThaiFreq,
+ SorBSyl,
} from "../calls/nlp";
import pdb from "./prosodydb";
import { cleanIpa } from "../utils";
import { handleFile } from "./utils";
import { Tone } from "../types/phonetics";
+import { AsyncRes } from "../types";
+const errors: string[] = [];
async function readDump(lang: string) {
await pdb.init();
pdb.addLanguage("th", "thai");
@@ -27,14 +30,21 @@ async function readDump(lang: string) {
count++;
console.log(count);
// if (count <= 10000) continue;
- if (count > 30) break;
+ if (count > 300) break;
const j = JSON.parse(langrow.data);
const word = j.word.trim();
if (!word) continue;
const split = word.split(" ");
- if (split.length > 1) await handleIdiom(lang, word);
- else await handleWord(lang, word, j, freqMap);
+ const res =
+ split.length > 1
+ ? await handleIdiom(lang, word)
+ : await handleWord(lang, word, j, freqMap);
+ if ("error" in res) {
+ console.error(res.error);
+ break;
+ }
}
+ console.dir(errors);
}
async function handleWord(
@@ -42,7 +52,7 @@ async function handleWord(
word: string,
j: any,
freqMap: Map<string, number>,
-) {
+): AsyncRes<string> {
// TODO add categories but add a tag to see what classifying scheme we're using
//
const sounds = j.sounds || [];
@@ -50,9 +60,9 @@ async function handleWord(
const hwikiRhyme = sounds.find((s: any) => "rhymes" in s);
const wikiRhyme = hwikiRhyme ? hwikiRhyme.rhymes : null;
if (!hasIpa) {
- console.error("no ipa!!", word);
- console.dir(j, { depth: null });
- return;
+ // console.error("no ipa!!", word);
+ // console.dir(j, { depth: null });
+ return { error: "meh no ipa" };
}
const freq = freqMap.get(word) || null;
// const wordId = pdb.addWord(word, lang, freq, null);
@@ -60,7 +70,11 @@ async function handleWord(
const wordId = 0;
// console.log(analyzed);
for (let snd of sounds)
- if ("ipa" in snd) handleIpa(wordId, word, lang, j, snd, wikiRhyme);
+ if ("ipa" in snd) {
+ const res = await handleIpa(wordId, word, lang, j, snd, wikiRhyme);
+ if ("error" in res) return res;
+ }
+ return { ok: "" };
}
async function handleIpa(
wordId: number | bigint,
@@ -73,58 +87,65 @@ async function handleIpa(
const tags = JSON.stringify(snd.tags) || null;
const ipa = snd.ipa;
const syls = await sorSyl(word, lang, ipa);
+ // console.log(syls, "sorsyl");
console.log(word);
console.log(ipa);
- // pdb.addPronunciation(wordId, ipa, syls.syls.length, tags, null);
+ pdb.addPronunciation(wordId, ipa, syls.syls.length, tags, null);
// set word rhyme
- const wordRhyme = syls.syls.reduce((acc: string, item: SorSyl) => {
+ const wordRhyme = syls.syls.reduce((acc: string, itemm: SorBSyl) => {
+ const item = itemm.ipa;
if (!item.stressed && !acc) return acc;
if (item.stressed && !acc) return `${acc}${item.rhyme}`;
- else return `${acc}${item.ipa}`;
+ else return `${acc}${item.all}`;
}, "");
if (wordRhyme) pdb.addWordRhyme(wordId, wordRhyme, j.lang_code, wikiRhyme);
- //
+
for (let i = 0; i < syls.syls.length; i++) {
const syl = syls.syls[i]!;
- await handleSyllable(word, syl.ipa, wordId, i);
+ const res = await handleSyllable(syl, wordId, i);
+ if ("error" in res) return res;
}
+ return { ok: "" };
}
async function handleSyllable(
- spelling: string,
- ipa: string,
+ syl: SorBSyl,
wordId: number | bigint,
idx: number,
-) {
- const sorsyl = await sorSyl(spelling, "th", ipa);
- if (sorsyl.syls.length !== 1) throw new Error("wtf sorsyl!");
- const syl = sorsyl.syls[0]!;
+): AsyncRes<string> {
try {
pdb.addSyllable(
wordId,
idx + 1,
+ syl.ipa.stressed,
"th",
- syl.ipa,
- syl.long,
- spelling,
- { spelling: syl.onset, ipa: syl.onset },
- { spelling: syl.medial, ipa: syl.medial },
- { spelling: syl.nucleus, ipa: syl.nucleus },
- { spelling: syl.coda, ipa: syl.coda },
- { spelling: syl.rhyme, ipa: syl.rhyme },
+ syl.ipa.all,
+ syl.ipa.long,
+ syl.spelling.all,
+ { spelling: syl.spelling.onset, ipa: syl.ipa.onset },
+ { spelling: syl.spelling.medial, ipa: syl.ipa.medial },
+ { spelling: syl.spelling.nucleus, ipa: syl.ipa.nucleus },
+ { spelling: syl.spelling.coda, ipa: syl.ipa.coda },
+ { spelling: syl.spelling.rhyme, ipa: syl.ipa.rhyme },
{ letters: "", numbers: 0, name: "" },
null,
);
+ return { ok: "" };
} catch (e) {
// console.log("well fuck", syl);
// console.error(e);
- console.log();
+ return { error: `${e}` };
}
}
-async function handleIdiom(lang: string, idiom: string) {
- pdb.addIdiom(idiom, lang);
- // TODO later set idiom_words once all words are populated
- // console.log();
+async function handleIdiom(lang: string, idiom: string): AsyncRes<string> {
+ try {
+ pdb.addIdiom(idiom, lang);
+ // TODO later set idiom_words once all words are populated
+ // console.log();
+ return { ok: "" };
+ } catch (e) {
+ return { error: `${e}` };
+ }
}
// ช้า ๆ
// งก ๆ
diff --git a/src/lib/db/prosodydb.ts b/src/lib/db/prosodydb.ts
index 9e76b8d..d6da389 100644
--- a/src/lib/db/prosodydb.ts
+++ b/src/lib/db/prosodydb.ts
@@ -1,12 +1,14 @@
import Database from "bun:sqlite";
import { Phoneme, Tone } from "../types/phonetics";
+import { ProsodyWord, ProsodyWordDB } from "../types/cards";
type Str = string | null;
type ItemType = "word" | "syllable" | "idiom";
class DatabaseHandler {
db: Database;
constructor() {
- const dbPath = "/home/y/code/bun/ssr/waku/bulkdata/phon.db";
+ // const dbPath = "/home/y/code/bun/ssr/waku/bulkdata/phon.db";
+ const dbPath = "/home/y/code/bun/ssr/waku/bulkdata/thaiphon.db";
const db = new Database(dbPath, { create: true });
db.exec("PRAGMA journal_mode = WAL"); // Enable Write-Ahead Logging for better performance
db.exec("PRAGMA foreign_keys = ON");
@@ -18,12 +20,123 @@ class DatabaseHandler {
this.db.exec(sql);
}
// selects
+ fetchFrequent(lang: string) {
+ const query = this.db.query(
+ `SELECT
+ w.id,
+ w.spelling,
+ w.lang,
+ w.frequency,
+ w.lang,
+ wp.ipa,
+ wp.syllables,
+ wp.tag,
+ w.notes,
+ (SELECT
+ json_group_array(json_object(
+ 'ipa', s.ipa,
+ 'spelling', s.text,
+ 'long', s.long,
+ 'notes', s.notes,
+ 'onseto', os.text,
+ 'onset', os.ipa,
+ 'nucleuso', ns.text,
+ 'nucleus', ns.ipa,
+ 'codao', co.text,
+ 'coda', co.ipa,
+ 'rhymeo', rh.text,
+ 'rhyme', rh.ipa,
+ 'tonen', tns.name,
+ 'tonenm', tns.nums,
+ 'tone', tns.ipa
+ )
+ )
+ FROM syllables s
+ JOIN onsets os ON os.id = s.onset
+ JOIN nucleus ns ON ns.id = s.nucleus
+ JOIN codas co ON co.id = s.coda
+ JOIN rhymes rh ON rh.id = s.rhyme
+ JOIN tones tns ON tns.id = s.tone
+ WHERE s.id= sw.syl_id
+ ) as syllables
+ FROM words w
+ JOIN word_phonetics wp ON wp.word_id = w.id
+ JOIN syllables_words sw ON sw.word_id = w.id
+ WHERE w.frequency IS NOT NULL
+ AND w.lang = ?
+ ORDER BY w.frequency ASC
+ LIMIT 300
+ `,
+ );
+ return query.all(lang) as ProsodyWordDB[];
+ }
fetchWords(words: string[]) {
const query = this.db.query(
`SELECT id FROM words where spelling IN (${words.map((w) => `'${w}'`).join(", ")})`,
);
return query.all() as Array<{ id: number }>;
}
+ fetchSyllables(words: string[]) {
+ const query = this.db.query(
+ `SELECT id FROM words where spelling IN (${words.map((w) => `'${w}'`).join(", ")})`,
+ );
+ return query.all() as Array<{ id: number }>;
+ }
+ fetchOnsets(onset: string) {
+ const query = this.db.query(
+ `SELECT
+ w.id,
+ w.spelling,
+ w.frequency,
+ wp.ipa
+ FROM words w
+ JOIN word_phonetics wp ON wp.word_id = w.id
+ JOIN syllables_words sw ON sw.word_id = w.id
+ JOIN syllables s ON s.id = sw.syl_id
+ JOIN onsets os ON os.id = syl.onset
+ `,
+ );
+ return query.all(onset) as any[];
+ }
+ // tones
+ fetchWordsByToneAndSyls(tones: Array<string | null>) {
+ const toneString = tones.reduce((acc: string, item) => {
+ if (!item) return `${acc},%`;
+ else return `${acc},${item}`;
+ }, "");
+ console.log({ toneString });
+ const query = this.db.query(
+ `
+ WITH word_tone_sequences AS (
+ SELECT
+ w.id as word_id,
+ w.spelling,
+ wp.ipa,
+ w.frequency,
+ GROUP_CONCAT(t.name ORDER BY sw.idx) as tone_sequence,
+ COUNT(sw.syl_id) as syllable_count
+ FROM words w
+ JOIN word_phonetics wp ON w.id = wp.word_id
+ JOIN syllables_words sw ON w.id = sw.word_id
+ JOIN syllables s ON sw.syl_id = s.id
+ JOIN tones t ON s.tone = t.id
+ GROUP BY w.id, w.spelling, w.lang, w.frequency
+ )
+ SELECT
+ word_id,
+ spelling,
+ ipa,
+ frequency,
+ tone_sequence,
+ syllable_count
+ FROM word_tone_sequences
+ WHERE tone_sequence LIKE ?
+ AND syllable_count = ?
+ ORDER BY frequency DESC NULLS LAST;
+ `,
+ );
+ return query.all(toneString.slice(1), tones.length) as any[];
+ }
// inserts
addLanguage(code: string, name: string) {
@@ -109,6 +222,7 @@ class DatabaseHandler {
addSyllable(
wordId: number | bigint,
sylIdx: number,
+ stressed: boolean | null,
lang: string,
ipa: string,
long: boolean,
@@ -197,9 +311,9 @@ class DatabaseHandler {
//
const res1 = this.db
.query(
- `INSERT INTO syllables_words(syl_id, word_id, idx) VALUES(?, ?, ?)`,
+ `INSERT INTO syllables_words(syl_id, word_id, idx, stressed) VALUES(?, ?, ?, ?)`,
)
- .run(sylId, wordId, sylIdx);
+ .run(sylId, wordId, sylIdx, stressed);
//
return sylId;
});
diff --git a/src/lib/db/prosodyschema.sql b/src/lib/db/prosodyschema.sql
index c962d83..c6a04fa 100644
--- a/src/lib/db/prosodyschema.sql
+++ b/src/lib/db/prosodyschema.sql
@@ -130,6 +130,7 @@ CREATE TABLE IF NOT EXISTS syllables_words(
syl_id INTEGER NOT NULL,
word_id INTEGER NOT NULL,
idx INTEGER NOT NULL,
+ stressed INTEGER,
FOREIGN KEY (syl_id) REFERENCES syllables(id),
FOREIGN KEY (word_id) REFERENCES words(id)
);
diff --git a/src/lib/db/thaiseed.ts b/src/lib/db/thaiseed.ts
index 5c75345..6c69d9c 100644
--- a/src/lib/db/thaiseed.ts
+++ b/src/lib/db/thaiseed.ts
@@ -12,6 +12,7 @@ import pdb from "./prosodydb";
import { cleanIpa } from "../utils";
import { handleFile } from "./utils";
import { Tone } from "../types/phonetics";
+import { AsyncRes } from "../types";
async function readDump(lang: string) {
await pdb.init();
@@ -30,38 +31,77 @@ async function readDump(lang: string) {
const j = JSON.parse(langrow.data);
const word = j.word.trim();
if (!word) continue;
- if (word.includes("ๆ")) await handleWord(word, j);
- else {
+
+ if (word.includes("ๆ")) {
+ const res = await handleWord(word, j);
+ if ("error" in res) {
+ if (res.error.includes("meh")) continue;
+ if (res.error.includes("wtf")) {
+ console.error(res.error);
+ console.error(j.sounds);
+ }
+ break;
+ }
+ } else {
const split = word.split(" ");
- if (split.length > 1) await handleIdiom(word);
- else await handleWord(word, j);
+ if (split.length > 1) {
+ const res = await handleIdiom(word);
+ if ("error" in res) {
+ console.error(res.error);
+ break;
+ }
+ } else {
+ const res = await handleWord(word, j);
+ if ("error" in res) {
+ if (res.error.includes("meh")) continue;
+ if (res.error.includes("wtf")) {
+ console.error(res.error);
+ console.error(j.sounds);
+ }
+ // break;
+ }
+ }
}
}
}
-async function handleWord(word: string, j: any) {
+async function handleWord(word: string, j: any): AsyncRes<string> {
// TODO add categories but add a tag to see what classifying scheme we're using
//
const sounds = j.sounds || [];
const hasIpa = sounds.find((s: any) => "ipa" in s);
- if (!hasIpa) return;
+ if (!hasIpa) return { error: "meh no ipa" };
const freq = await getThaiFreq(word);
const wordId = pdb.addWord(word, "th", freq, null);
+ if (wordId == 478 || word === "และ") {
+ console.log("wtf man");
+ console.dir(j, { depth: null });
+ return { error: "i said wtf" };
+ }
const analyzed = await analyzeTHWord(word);
- for (let snd of sounds) if ("ipa" in snd) handleIpa(wordId, j, snd, analyzed);
+ for (let snd of sounds)
+ if ("ipa" in snd) {
+ const res = await handleIpa(wordId, j, snd, analyzed);
+ if ("error" in res) return res;
+ }
+ return { ok: "" };
}
async function handleIpa(
wordId: number | bigint,
j: any,
snd: any,
analyzed: ThaiNLPRes,
-) {
+): AsyncRes<string> {
const tags = JSON.stringify(snd.tags) || null;
// console.log("handleipa", analyzed.syllables.length);
// console.log(analyzed);
const wikiIpa = cleanIpa(snd.ipa);
const nlpIpa = cleanIpa(analyzed.ipa);
const ipa = wikiIpa || nlpIpa;
+ if (j.word === "และ") {
+ console.log("wtf!!");
+ return { error: "wtf is this" };
+ }
const wikiIpaSplit = wikiIpa.split(".");
const nlpIpaSplit = nlpIpa.split(".");
if (wikiIpaSplit.length !== nlpIpaSplit.length) {
@@ -73,14 +113,15 @@ async function handleIpa(
// console.log("syllable analysis mismatch", j.word);
// console.log({ syls: analyzed.syllables, ipa: wikiIpaSplit });
// console.dir(j, { depth: null });
- return;
+ return { error: "meh syllable analysis mismatch" };
}
- pdb.addPronunciation(wordId, ipa, analyzed.syllables.length, tags, null);
const writtenSyls = analyzed.syllables;
const pronouncedSyls = analyzed.realSyls;
let badSyls = false;
if (writtenSyls.length !== pronouncedSyls.length) badSyls = true;
+ pdb.addPronunciation(wordId, ipa, pronouncedSyls.length, tags, null);
+
for (let i = 0; i < pronouncedSyls.length; i++) {
const pronounced = pronouncedSyls[i]!.replace(/\u{E3A}/u, "");
const written = writtenSyls[i] || "";
@@ -93,14 +134,10 @@ async function handleIpa(
console.log(pronounced);
console.log(written);
}
- try {
- await handleSyllable(syllable, ipa, wordId, i, notes);
- } catch (e) {
- console.error("syl error", j.word, j.sounds);
- console.error({ analyzed, ipa, wikiIpaSplit });
- console.error(e);
- }
+ const res = await handleSyllable(syllable, ipa, wordId, i, notes);
+ if ("error" in res) return res;
}
+ return { ok: "" };
}
const thaiTones: Record<string, string> = {
"˧": "mid",
@@ -122,7 +159,7 @@ function parseTone(ipa: string, spelling: string): Tone {
const numbers = thaiToneNums[ipa]!;
return { letters: ipa, name, numbers };
} catch (e) {
- console.error("wrong tones!!", { s: spelling, ipa });
+ console.error("meh wrong tones!!", { s: spelling, ipa });
throw new Error("");
}
}
@@ -133,7 +170,7 @@ async function handleSyllable(
wordId: number | bigint,
idx: number,
notes: string | null,
-) {
+): AsyncRes<string> {
const sorsyl = await sorSyl(spelling, "th", ipa);
const weird = [
// "a̯n",
@@ -166,14 +203,16 @@ async function handleSyllable(
// // console.dir(j, { depth: null });
// }
if (sorsyl.syls.length !== 1) throw new Error("wtf sorsyl!");
- const syl = sorsyl.syls[0]!;
+ const syl = sorsyl.syls[0]!.ipa;
const tone = parseTone(syl.tone, spelling);
+ // TODO add actual ortographic data here not just ipa
try {
pdb.addSyllable(
wordId,
idx + 1,
+ null,
"th",
- syl.ipa,
+ syl.all,
syl.long,
spelling,
{ spelling: syl.onset, ipa: syl.onset },
@@ -184,16 +223,18 @@ async function handleSyllable(
tone,
notes,
);
+ return { ok: "" };
} catch (e) {
// console.log("well fuck", syl);
// console.error(e);
- console.log();
+ return { error: `meh ${e}` };
}
}
-async function handleIdiom(idiom: string) {
+async function handleIdiom(idiom: string): AsyncRes<string> {
pdb.addIdiom(idiom, "th");
// TODO later set idiom_words once all words are populated
// console.log();
+ return { ok: "" };
}
// ช้า ๆ
// งก ๆ