diff options
author | polwex <polwex@sortug.com> | 2025-06-02 23:05:36 +0700 |
---|---|---|
committer | polwex <polwex@sortug.com> | 2025-06-02 23:05:36 +0700 |
commit | 904b34de8f7748b7954d88784369b9cae6fa92fb (patch) | |
tree | 53bb5cb3377ae40d8bfa44087a0c712edd6c9d02 | |
parent | a03c92dc82ad527d7da6bbaa3c43000e2e5f0e69 (diff) |
all me here should merge
-rw-r--r-- | NOTES.md | 4 | ||||
-rw-r--r-- | src/lib/calls/nlp.ts | 112 | ||||
-rw-r--r-- | src/lib/db/perf.ts | 43 | ||||
-rw-r--r-- | src/lib/db/prosodydb.ts | 153 | ||||
-rw-r--r-- | src/lib/db/prosodyschema.sql | 67 | ||||
-rw-r--r-- | src/lib/db/seed.ts | 132 | ||||
-rw-r--r-- | src/lib/db/thaiseed.ts | 184 | ||||
-rw-r--r-- | src/lib/db/utils.ts | 29 | ||||
-rw-r--r-- | src/lib/types/phonetics.ts | 22 | ||||
-rw-r--r-- | src/lib/utils.ts | 6 |
10 files changed, 576 insertions, 176 deletions
diff --git a/NOTES.md b/NOTES.md new file mode 100644 index 0000000..c853835 --- /dev/null +++ b/NOTES.md @@ -0,0 +1,4 @@ +some weirdness: + +วันพฤหัสบดี +วันพฤหัส diff --git a/src/lib/calls/nlp.ts b/src/lib/calls/nlp.ts index 24e7cf3..3cff415 100644 --- a/src/lib/calls/nlp.ts +++ b/src/lib/calls/nlp.ts @@ -1,13 +1,35 @@ import { SyllableRes } from "../types/cards"; -type AnalyzeRes = { +export type ThaiNLPRes = { word: string; + normalized: string; syllables: string[]; + syllablesIpa: string[]; ipa: string; pos: string; }; +export type SorSylRes = { + word: string; + ipa: string; + clean_ipa: string; + syls: SorSyl[]; +}; +export type SorSyl = { + stressed: boolean; + long: boolean; + spelling: string; + ipa: string; + nucleus: string; + onset: string; + medial: string; + coda: string; + rhyme: string; + tone: string; + start_idx: number; + end_idx: number; +}; -export async function thaiData(word: string): Promise<AnalyzeRes[]> { +export async function thaiData(word: string): Promise<ThaiNLPRes[]> { const [head, tail] = await Promise.all([ analyzeTHWord(word), segmentateThai(word), @@ -15,7 +37,7 @@ export async function thaiData(word: string): Promise<AnalyzeRes[]> { return [head, ...tail]; } -export async function analyzeTHWord(word: string): Promise<AnalyzeRes> { +export async function analyzeTHWord(word: string): Promise<ThaiNLPRes> { const opts = { method: "POST", headers: { "Content-type": "application/json" }, @@ -26,7 +48,7 @@ export async function analyzeTHWord(word: string): Promise<AnalyzeRes> { const jj = await r1.json(); return jj; } -export async function segmentateThai(sentence: string): Promise<AnalyzeRes[]> { +export async function segmentateThai(sentence: string): Promise<ThaiNLPRes[]> { const opts = { method: "POST", headers: { "Content-type": "application/json" }, @@ -37,6 +59,70 @@ export async function segmentateThai(sentence: string): Promise<AnalyzeRes[]> { const jj = await r2.json(); return jj; } +export async function getThaiFreq(word: string): Promise<number> { + const opts = { + method: "POST", + headers: { "Content-type": "application/json" }, + body: JSON.stringify({ word }), + }; + // const r1 = await fetch(`http://localhost:8000/segmentate`, opts); + const r2 = await fetch("http://localhost:8001" + `/freq`, opts); + const jj = await r2.json(); + return jj; +} +export async function getThaiNext(word: string): Promise<string[]> { + const opts = { + method: "POST", + headers: { "Content-type": "application/json" }, + body: JSON.stringify({ word }), + }; + // const r1 = await fetch(`http://localhost:8000/segmentate`, opts); + const r2 = await fetch("http://localhost:8001" + `/next`, opts); + const jj = await r2.json(); + return jj; +} + +export async function getThaiPrev(word: string): Promise<string[]> { + const opts = { + method: "POST", + headers: { "Content-type": "application/json" }, + body: JSON.stringify({ word }), + }; + // const r1 = await fetch(`http://localhost:8000/segmentate`, opts); + const r2 = await fetch("http://localhost:8001" + `/prev`, opts); + const jj = await r2.json(); + return jj; +} + +export async function getThaiNext_bi( + word1: string, + word2: string, +): Promise<string[]> { + const opts = { + method: "POST", + headers: { "Content-type": "application/json" }, + body: JSON.stringify({ word1, word2 }), + }; + // const r1 = await fetch(`http://localhost:8000/segmentate`, opts); + const r2 = await fetch("http://localhost:8001" + `/next_bi`, opts); + const jj = await r2.json(); + return jj; +} + +export async function getThaiPrev_bi( + word1: string, + word2: string, +): Promise<string[]> { + const opts = { + method: "POST", + headers: { "Content-type": "application/json" }, + body: JSON.stringify({ word1, word2 }), + }; + // const r1 = await fetch(`http://localhost:8000/segmentate`, opts); + const r2 = await fetch("http://localhost:8001" + `/prev_bi`, opts); + const jj = await r2.json(); + return jj; +} export async function deconstructSyllable(ipa: string): Promise<SyllableRes> { const opts = { @@ -52,6 +138,24 @@ export async function deconstructSyllable(ipa: string): Promise<SyllableRes> { const jj = await r2.json(); return jj; } +export async function sorSyl( + word: string, + lang_code: string, + ipa: string, +): Promise<SorSylRes> { + const opts = { + method: "POST", + headers: { + "Content-type": "application/json", + "X-API-KEY": Bun.env.SORTUG_NLP_API_KEY!, + }, + body: JSON.stringify({ string: word, lang: lang_code, ipa }), + }; + // const r1 = await fetch(`http://localhost:8000/segmentate`, opts); + const r2 = await fetch("http://localhost:8104" + `/syls`, opts); + const jj = await r2.json(); + return jj; +} export async function findLemma(word: string, lang: string) { const opts = { diff --git a/src/lib/db/perf.ts b/src/lib/db/perf.ts index a5b57c3..d805314 100644 --- a/src/lib/db/perf.ts +++ b/src/lib/db/perf.ts @@ -1,4 +1,47 @@ /** + * Database Performance Optimizations Documentation + * =============================================== + * + * 1. SRS Card Fetching Optimization + * --------------------------------- + * Problem: When processing card reviews in the SRS system, the application was fetching an entire + * lesson's worth of cards just to retrieve a single updated card. This was inefficient, especially + * for lessons with many cards. + * + * Solution: Implemented a dedicated `fetchCardById` method in DatabaseHandler that retrieves only + * the specific card needed with all its associated data (expression, progress, etc.). This method + * is used in SRSStudyService.processReview to efficiently fetch just the updated card after a review. + * + * Impact: + * - Reduced database query load by eliminating unnecessary card fetches + * - Fixed the "Failed to fetch updated card data" error that occurred when processing reviews + * - Made card reviews more reliable and efficient + * + * Implementation details: + * 1. Added fetchCardById method to DatabaseHandler class + * 2. Updated SRSStudyService.processReview to use fetchCardById instead of fetchLesson + * 3. Maintained consistent timing measurements for performance monitoring + * + * 2. SQLite Optimization Techniques + * -------------------------------- + * - WAL (Write-Ahead Logging) mode enabled for better concurrency + * - Increased cache size to 8MB for improved read performance + * - Temp tables stored in memory rather than disk + * - Reduced synchronous mode to NORMAL for better write performance + * - Added strategic indexes on frequently queried columns + * + * 3. JSON Processing Optimization + * ------------------------------ + * - Measured and isolated JSON processing time from query execution time + * - Confirmed that database queries (~329ms) were the primary bottleneck rather than + * JSON processing (~0.8ms) + * + * 4. Query-Level Optimizations + * --------------------------- + * - Used proper indexing for user_progress, expressions, and cards_lessons tables + * - Optimized JOIN conditions to ensure efficient execution plans + * - Used parameterized queries to take advantage of SQLite's query cache + * * Database performance optimization suggestions */ diff --git a/src/lib/db/prosodydb.ts b/src/lib/db/prosodydb.ts index 52312bd..ec95359 100644 --- a/src/lib/db/prosodydb.ts +++ b/src/lib/db/prosodydb.ts @@ -1,11 +1,12 @@ import Database from "bun:sqlite"; +import { Phoneme, Tone } from "../types/phonetics"; type Str = string | null; type ItemType = "word" | "syllable" | "idiom"; class DatabaseHandler { db: Database; constructor() { - const dbPath = "/home/y/code/bun/ssr/waku/bulkdata/prosodynew.db"; + const dbPath = "/home/y/code/bun/ssr/waku/bulkdata/phon.db"; const db = new Database(dbPath, { create: true }); db.exec("PRAGMA journal_mode = WAL"); // Enable Write-Ahead Logging for better performance db.exec("PRAGMA foreign_keys = ON"); @@ -31,48 +32,39 @@ class DatabaseHandler { .run(code, name); } addPronunciation( - type: ItemType, - parentId: number | bigint, + wordId: number | bigint, ipa: string, syllables: number, tags: Str, notes: Str, ) { - try { - const query = this.db - .query( - `INSERT INTO pronunciation(type, parent_id,ipa, syllables, tag, notes) VALUES(?, ?, ?, ?, ?, ?)`, - ) - .run(type, parentId, ipa, syllables, tags, notes); - } catch (e) { - // console.error(e); - } + const query = this.db + .query( + `INSERT OR IGNORE INTO word_phonetics(word_id,ipa, syllables, tag, notes) VALUES(?, ?, ?, ?, ?)`, + ) + .run(wordId, ipa, syllables, tags, notes); } addWordRhyme(wordId: number | bigint, ipa: string, lang: string, notes: Str) { - try { - const query = this.db - .query( - `INSERT INTO word_rhymes(text, lang, notes) VALUES(?, ?, ?) + const query = this.db + .query( + `INSERT INTO word_rhymes(text, lang, notes) VALUES(?, ?, ?) ON CONFLICT(text,lang) DO UPDATE SET text = excluded.text RETURNING rowid `, - ) - .get(ipa, lang, notes) as { id: number }; - const query2 = this.db - .query( - ` - INSERT INTO words_idioms(word_id, idiom_id) VALUES(?, ?) + ) + .get(ipa, lang, notes) as { id: number }; + const query2 = this.db + .query( + ` + INSERT INTO words_wrhymes(word_id, wrhyme_id) VALUES(?, ?) `, - ) - .run(wordId, query.id); - } catch (e) { - // console.error(e); - } + ) + .run(wordId, query.id); } addIdiom(spelling: string, lang: string) { const query = this.db.query( - `INSERT INTO idioms(spelling, lang) VALUES(?, ?)`, + `INSERT OR IGNORE INTO idioms(spelling, lang) VALUES(?, ?)`, ); const res = query.run(spelling, lang); return res; @@ -100,49 +92,72 @@ class DatabaseHandler { this.findIdiomWords(row.spelling, row.id); } } - addWord(spelling: string, lang: string) { + addWord( + spelling: string, + lang: string, + frequency: number | null, + notes: Str, + ) { const query = this.db.query( - // `INSERT OR IGNORE INTO words(spelling, lang) VALUES(?, ?)`, - `INSERT INTO words(spelling, lang) VALUES(?, ?)`, + `INSERT OR IGNORE INTO words(spelling, lang, frequency, notes) VALUES(?, ?, ?, ?)`, + // `INSERT INTO words(spelling, lang) VALUES(?, ?)`, ); - const res = query.run(spelling, lang); + const res = query.run(spelling, lang, frequency, notes); const wordId = res.lastInsertRowid; return wordId; } addSyllable( wordId: number | bigint, - text: string, + sylIdx: number, lang: string, + ipa: string, long: boolean, - onset: Str, - medial: Str, - nucleus: string, - coda: Str, - rhyme: string, - tone: Str, + text: string, + onset: Phoneme, + medial: Phoneme, + nucleus: Phoneme, + coda: Phoneme, + rhyme: Phoneme, + tone: Tone | null, notes: Str, ) { const tx = this.db.transaction(() => { const query = this.db.query( - `INSERT INTO syllables(text, lang, long, onset, medial, nucleus, coda, rhyme, tone, notes) VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + `INSERT INTO syllables(lang, ipa, long, text, onset, medial, nucleus, coda, rhyme, notes) VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, ); + // TODO need a dual structure here for IPA and orto const res = query.run( - text, lang, + ipa, long, - onset, - medial, - nucleus, - coda, - rhyme, - tone, + text, + onset.spelling, + medial.spelling, + nucleus.spelling, + coda.spelling, + rhyme.spelling, notes, ); const sylId = res.lastInsertRowid; - + const ipaq = this.db.query(` + INSERT INTO syl_ipa(syl_id, ipa, onset, medial, nucleus, coda, rhyme, notes) + VALUES(?, ?, ?, ?, ?, ?, ?, ?)`); + ipaq.run( + sylId, + ipa, + onset.ipa, + medial.ipa, + nucleus.ipa, + coda.ipa, + rhyme.ipa, + null, + ); + // const res1 = this.db - .query(`INSERT INTO syllables_words(syl_id, word_id) VALUES(?, ?)`) - .run(sylId, wordId); + .query( + `INSERT INTO syllables_words(syl_id, word_id, idx) VALUES(?, ?, ?)`, + ) + .run(sylId, wordId, sylIdx); // return sylId; }); @@ -151,13 +166,13 @@ class DatabaseHandler { if (onset) { res1 = this.db .query( - `INSERT INTO onsets(text, lang) VALUES(?, ?) - ON CONFLICT(text, lang) DO UPDATE SET + `INSERT INTO onsets(ipa, lang, text) VALUES(?, ?, ?) + ON CONFLICT(ipa, lang, text) DO UPDATE SET text = excluded.text RETURNING rowid `, ) - .get(onset, lang); + .get(onset.ipa, lang, onset.spelling); this.db .query(`INSERT INTO onsets_syllables(syl_id, onset_id) VALUES(?, ?)`) .run(sylId, res1.id); @@ -165,65 +180,65 @@ class DatabaseHandler { if (medial) { res1 = this.db .query( - `INSERT INTO medials(text, lang) VALUES(?, ?) - ON CONFLICT(text, lang) DO UPDATE SET + `INSERT INTO medials(ipa, lang, text) VALUES(?, ?, ?) + ON CONFLICT(ipa, lang, text) DO UPDATE SET text = excluded.text RETURNING rowid `, ) - .get(medial, lang); + .get(medial.ipa, lang, medial.spelling); this.db .query(`INSERT INTO medials_syllables(syl_id, medial_id) VALUES(?, ?)`) .run(sylId, res1.id); } res1 = this.db .query( - `INSERT INTO nucleus(text, lang) VALUES(?, ?) - ON CONFLICT(text, lang) DO UPDATE SET + `INSERT INTO nucleus(ipa, lang, text) VALUES(?, ?, ?) + ON CONFLICT(ipa, lang, text) DO UPDATE SET text = excluded.text RETURNING rowid `, ) - .get(nucleus, lang); + .get(nucleus.ipa, lang, nucleus.spelling); this.db .query(`INSERT INTO nucleus_syllables(syl_id, nucleus_id) VALUES(?, ?)`) .run(sylId, res1.id); if (coda) { res1 = this.db .query( - `INSERT INTO codas(text, lang) VALUES(?, ?) - ON CONFLICT(text, lang) DO UPDATE SET + `INSERT INTO codas(ipa, lang, text) VALUES(?, ?, ?) + ON CONFLICT(ipa, lang, text) DO UPDATE SET text = excluded.text RETURNING rowid `, ) - .get(coda, lang); + .get(coda.ipa, lang, coda.spelling); this.db .query(`INSERT INTO codas_syllables(syl_id, coda_id) VALUES(?, ?)`) .run(sylId, res1.id); } res1 = this.db .query( - `INSERT INTO rhymes(text, lang) VALUES(?, ?) - ON CONFLICT(text, lang) DO UPDATE SET + `INSERT INTO rhymes(ipa, lang, text) VALUES(?, ?, ?) + ON CONFLICT(ipa, lang, text) DO UPDATE SET text = excluded.text RETURNING rowid `, ) - .get(rhyme, lang); + .get(rhyme.ipa, lang, rhyme.spelling); this.db .query(`INSERT INTO rhymes_syllables(syl_id, rhyme_id) VALUES(?, ?)`) .run(sylId, res1.id); if (tone) { res1 = this.db .query( - `INSERT INTO tones(text, lang) VALUES(?, ?) - ON CONFLICT(text, lang) DO UPDATE SET - text = excluded.text + `INSERT INTO tones(ipa, lang, name, nums) VALUES(?, ?, ?, ?) + ON CONFLICT(ipa, lang) DO UPDATE SET + ipa = excluded.ipa RETURNING rowid `, ) - .get(tone, lang); + .get(tone.letters, lang, tone.name, tone.numbers); this.db .query(`INSERT INTO tones_syllables(syl_id, tone_id) VALUES(?, ?)`) .run(sylId, res1.id); diff --git a/src/lib/db/prosodyschema.sql b/src/lib/db/prosodyschema.sql index e70b005..09dabc2 100644 --- a/src/lib/db/prosodyschema.sql +++ b/src/lib/db/prosodyschema.sql @@ -35,6 +35,7 @@ CREATE TABLE IF NOT EXISTS words( spelling TEXT NOT NULL, lang TEXT NOT NULL, frequency INTEGER, + notes TEXT, FOREIGN KEY (lang) REFERENCES languages(iso6392), CONSTRAINT spell_unique UNIQUE (spelling, lang) ); @@ -48,7 +49,7 @@ CREATE TABLE IF NOT EXISTS word_rhymes( notes TEXT, CONSTRAINT wrhyme_unique UNIQUE (text, lang) ); -CREATE TABLE IF NOT EXISTS words_rhymes( +CREATE TABLE IF NOT EXISTS words_wrhymes( word_id INTEGER NOT NULL, wrhyme_id INTEGER NOT NULL, FOREIGN KEY (word_id) REFERENCES words(id), @@ -58,57 +59,62 @@ CREATE TABLE IF NOT EXISTS words_rhymes( -- break up syllables CREATE TABLE IF NOT EXISTS syllables( id INTEGER PRIMARY KEY AUTOINCREMENT, - text TEXT NOT NULL, lang TEXT NOT NULL, + ipa TEXT NOT NULL, long INTEGER NOT NULL, - tone TEXT, - onset TEXT, - medial TEXT, - nucleus TEXT, - coda TEXT, - rhyme TEXT, + text TEXT NOT NULL, + onset TEXT NOT NULL, + medial TEXT NOT NULL, + nucleus TEXT NOT NULL, + coda TEXT NOT NULL, + rhyme TEXT NOT NULL, notes TEXT, FOREIGN KEY (lang) REFERENCES languages(iso6392), - CONSTRAINT spell_unique UNIQUE (text, lang) + CONSTRAINT syllable_unique UNIQUE (text, ipa, lang) ); CREATE TABLE IF NOT EXISTS tones( id INTEGER PRIMARY KEY AUTOINCREMENT, - text TEXT NOT NULL, + ipa TEXT NOT NULL, lang TEXT NOT NULL, - name TEXT, - num INTEGER, - CONSTRAINT tone_unique UNIQUE (text, lang) + name TEXT NOT NULL, + nums INTEGER NOT NULL, + CONSTRAINT tone_unique UNIQUE (ipa, lang) ); CREATE TABLE IF NOT EXISTS onsets( id INTEGER PRIMARY KEY AUTOINCREMENT, + ipa TEXT NOT NULL, text TEXT NOT NULL, lang TEXT NOT NULL, - CONSTRAINT onsets_unique UNIQUE (text, lang) + CONSTRAINT onsets_unique UNIQUE (ipa, text, lang) ); CREATE TABLE IF NOT EXISTS medials( id INTEGER PRIMARY KEY AUTOINCREMENT, + ipa TEXT NOT NULL, text TEXT NOT NULL, lang TEXT NOT NULL, - CONSTRAINT medials_unique UNIQUE (text, lang) + CONSTRAINT onsets_unique UNIQUE (ipa, text, lang) ); CREATE TABLE IF NOT EXISTS nucleus( id INTEGER PRIMARY KEY AUTOINCREMENT, + ipa TEXT NOT NULL, text TEXT NOT NULL, lang TEXT NOT NULL, - CONSTRAINT nucleus_unique UNIQUE (text, lang) + CONSTRAINT onsets_unique UNIQUE (ipa, text, lang) ); CREATE TABLE IF NOT EXISTS codas( id INTEGER PRIMARY KEY AUTOINCREMENT, + ipa TEXT NOT NULL, text TEXT NOT NULL, lang TEXT NOT NULL, - CONSTRAINT coda_unique UNIQUE (text, lang) + CONSTRAINT onsets_unique UNIQUE (ipa, text, lang) ); CREATE TABLE IF NOT EXISTS rhymes( id INTEGER PRIMARY KEY AUTOINCREMENT, + ipa TEXT NOT NULL, text TEXT NOT NULL, lang TEXT NOT NULL, - CONSTRAINT rhyme_unique UNIQUE (text, lang) + CONSTRAINT onsets_unique UNIQUE (ipa, text, lang) ); -- join tables @@ -153,9 +159,12 @@ CREATE TABLE IF NOT EXISTS rhymes_syllables( CREATE TABLE IF NOT EXISTS syllables_words( syl_id INTEGER NOT NULL, word_id INTEGER NOT NULL, + idx INTEGER NOT NULL, FOREIGN KEY (syl_id) REFERENCES syllables(id), FOREIGN KEY (word_id) REFERENCES words(id) ); + + CREATE TABLE IF NOT EXISTS words_idioms( word_id INTEGER NOT NULL, idiom_id INTEGER NOT NULL, @@ -165,14 +174,26 @@ CREATE TABLE IF NOT EXISTS words_idioms( -- -CREATE TABLE IF NOT EXISTS pronunciation( +CREATE TABLE IF NOT EXISTS syl_ipa( id INTEGER PRIMARY KEY AUTOINCREMENT, - type TEXT CHECK(type IN ('word', 'syllable', 'idiom')) NOT NULL, - parent_id INTEGER NOT NULL, + syl_id INTEGER NOT NULL, + ipa TEXT NOT NULL, + onset TEXT NOT NULL, + medial TEXT NOT NULL, + nucleus TEXT NOT NULL, + rhyme TEXT NOT NULL, + coda TEXT NOT NULL, + notes TEXT, + CONSTRAINT syl_ipa_unique UNIQUE (ipa, syl_id) +); + +CREATE TABLE IF NOT EXISTS word_phonetics( + id INTEGER PRIMARY KEY AUTOINCREMENT, + word_id INTEGER NOT NULL, ipa TEXT NOT NULL, syllables INTEGER NOT NULL, tag TEXT, notes TEXT, - CONSTRAINT ipa_unique UNIQUE (ipa, parent_id) + CONSTRAINT ipa_unique UNIQUE (ipa, word_id) ); -CREATE INDEX IF NOT EXISTS idx_words_ipa ON pronunciation(ipa, parent_id); +CREATE INDEX IF NOT EXISTS idx_words_ipa ON word_phonetics(ipa, word_id); diff --git a/src/lib/db/seed.ts b/src/lib/db/seed.ts index 4780dc3..c03da60 100644 --- a/src/lib/db/seed.ts +++ b/src/lib/db/seed.ts @@ -1,3 +1,4 @@ +import Database from "bun:sqlite"; import { readWiktionaryDump } from "../services/wiki"; import { getStressedSyllable, getSyllableCount } from "../utils"; import useful from "@/lib/useful_thai.json"; @@ -7,36 +8,6 @@ import { findLemma } from "../calls/nlp"; const SYMBOL_REGEX = new RegExp(/[\W\d]/); -async function handleFile( - filename: string, - func: (line: string, idx: number) => void, -) { - const file = Bun.file(filename); - const s = file.stream(); - const reader = s.getReader(); - const decoder = new TextDecoder(); - let leftover = ""; - let lineCount = 0; - while (true) { - const { value, done } = await reader.read(); - if (done) break; - const chunk = decoder.decode(value, { stream: true }); - const lines = (leftover + chunk).split("\n"); - - // Process each line except the last (which might be incomplete) - for (const line of lines.slice(0, -1)) { - lineCount++; - func(line, lineCount); - } - - // Save the last incomplete line to process in the next iteration - leftover = lines[lines.length - 1]; - } - - // Handle any remaining content after reading all chunks - if (leftover) func(leftover, lineCount + 1); -} - function goodPos(pos: string): boolean { const list = [ "CC", @@ -90,12 +61,12 @@ async function englishFreq() { } async function thaiFreq() { const files = [ - "/home/y/code/prosody/prosody/langdata/thai/data/1yin_freq.csv", - "/home/y/code/prosody/prosody/langdata/thai/data/2yin_freq.csv", - "/home/y/code/prosody/prosody/langdata/thai/data/3yin_freq.csv", - "/home/y/code/prosody/prosody/langdata/thai/data/4yin_freq.csv", - "/home/y/code/prosody/prosody/langdata/thai/data/5yin_freq.csv", - "/home/y/code/prosody/prosody/langdata/thai/data/6yin_freq.csv", + "/home/y/code/prosody/resources/langdata/thai/data/1yin_freq.csv", + "/home/y/code/prosody/resources/langdata/thai/data/2yin_freq.csv", + "/home/y/code/prosody/resources/langdata/thai/data/3yin_freq.csv", + "/home/y/code/prosody/resources/langdata/thai/data/4yin_freq.csv", + "/home/y/code/prosody/resources/langdata/thai/data/5yin_freq.csv", + "/home/y/code/prosody/resources/langdata/thai/data/6yin_freq.csv", ]; for (let f of files) { handleFile(f, (line, idx) => { @@ -508,52 +479,51 @@ function fixSyllables() { // const SORSYL_PATH = "/nix/store/lkyi9rrjbr619w3ivpkm89ccf93bvxx5-sorsyl-0.1.0/bin/sorsyl"; -async function redump() { - await pdb.init(); - let count = 0; - // const soundTypes = new Set<string>(); - // [ - // "tags", "ipa", "audio", "ogg_url", "mp3_url", "enpr", "rhymes", "homophone", "note", "zh-pron", "other", - // "text", "hangeul", "topics", "form", "audio-ipa" - // ] - const langs = ["en", "th", "zh", "es", "ja", "vn"]; - for await (const line of readWiktionaryDump()) { - try { - count++; - console.log({ count }); - // if (count > 50) break; - const j = JSON.parse(line); - // console.log(Object.keys(j), j.word); - // add language to db - pdb.addLanguage(j.lang_code, j.lang); - if (!langs.includes(j.lang_code)) continue; - // handleEtim(j); - // handleDerived(j); - // handleSenses(j.pos, j.senses); - // // - const isWord = j.word.trim().split(" ").length === 1; - if (isWord) await handleWord(j); - else await handleIdiom(j); - } catch (e) { - // console.log("error parsing", e); - // break; - } +async function redump(lang: string) { + let count = 0; + const langdb = new Database( + `/home/y/code/prosody/resources/wiktionary/${lang}.db`, + ); + const langrows: any = langdb.query("SELECT data FROM langs"); + for (const langrow of langrows) { + const j = JSON.parse(langrow.data); + console.log({ j }); + if (count > 10) break; } + // await pdb.init(); + + // // const soundTypes = new Set<string>(); + // // [ + // // "tags", "ipa", "audio", "ogg_url", "mp3_url", "enpr", "rhymes", "homophone", "note", "zh-pron", "other", + // // "text", "hangeul", "topics", "form", "audio-ipa" + // // ] + // const langs = ["en", "th", "zh", "es", "ja", "vn"]; + + // for await (const line of readWiktionaryDump()) { + // try { + // count++; + // console.log({ count }); + // // if (count > 50) break; + // const j = JSON.parse(line); + // // console.log(Object.keys(j), j.word); + // // add language to db + // pdb.addLanguage(j.lang_code, j.lang); + // if (!langs.includes(j.lang_code)) continue; + // // handleEtim(j); + // // handleDerived(j); + // // handleSenses(j.pos, j.senses); + // // // + // const isWord = j.word.trim().split(" ").length === 1; + // if (isWord) await handleWord(j); + // else await handleIdiom(j); + // } catch (e) { + // // console.log("error parsing", e); + // // break; + // } + // } } -type SorSyl = { - stressed: boolean; - long: boolean; - spelling: string; - ipa: string; - nucleus: string; - onset: string; - medial: string; - coda: string; - rhyme: string; - tone: string; -}; async function handleWord(j: any) { let ts = Date.now(); const analyzed = await findLemma(j.word, j.lang_code); @@ -615,9 +585,11 @@ async function handleIpa( // TODO ideally syllables would have spelling not IPA... harsh tho pdb.addSyllable( wordId, - syl.ipa, + idx, j.lang_code, + syl.ipa, syl.long, + "", syl.onset || null, syl.medial || null, syl.nucleus, @@ -689,7 +661,7 @@ async function handleSenses(pos: string, senses: any[]) { } } -redump(); +redump("th"); async function newtest() { // const query = pdb.db.query( diff --git a/src/lib/db/thaiseed.ts b/src/lib/db/thaiseed.ts new file mode 100644 index 0000000..687f0f3 --- /dev/null +++ b/src/lib/db/thaiseed.ts @@ -0,0 +1,184 @@ +import Database from "bun:sqlite"; +import { + analyzeTHWord, + deconstructSyllable, + segmentateThai, + type SorSyl, + type ThaiNLPRes, + sorSyl, + getThaiFreq, +} from "../calls/nlp"; +import pdb from "./prosodydb"; +import { cleanIpa } from "../utils"; +import { handleFile } from "./utils"; +import { Tone } from "../types/phonetics"; + +async function readDump(lang: string) { + await pdb.init(); + pdb.addLanguage("th", "thai"); + let count = 0; + const langdb = new Database( + `/home/y/code/prosody/resources/wiktionary/${lang}.db`, + ); + let langrows: any = langdb.query("SELECT data FROM langs"); + // langrows = langrows.slice(10); + for (const langrow of langrows) { + count++; + console.log(count); + // if (count <= 10000) continue; + // if (count > 30) break; + const j = JSON.parse(langrow.data); + const word = j.word.trim(); + if (!word) continue; + if (word.includes("ๆ")) await handleWord(word, j); + else { + const split = word.split(" "); + if (split.length > 1) await handleIdiom(word); + else await handleWord(word, j); + } + } +} + +async function handleWord(word: string, j: any) { + // TODO add categories but add a tag to see what classifying scheme we're using + // + const sounds = j.sounds || []; + const hasIpa = sounds.find((s: any) => "ipa" in s); + if (!hasIpa) return; + const freq = await getThaiFreq(word); + const wordId = pdb.addWord(word, "th", freq, null); + const analyzed = await analyzeTHWord(word); + // console.log(analyzed); + for (let snd of sounds) if ("ipa" in snd) handleIpa(wordId, j, snd, analyzed); +} +async function handleIpa( + wordId: number | bigint, + j: any, + snd: any, + analyzed: ThaiNLPRes, +) { + const tags = JSON.stringify(snd.tags) || null; + // console.log("handleipa", analyzed.syllables.length); + // console.log(analyzed); + const wikiIpa = cleanIpa(snd.ipa); + const nlpIpa = cleanIpa(analyzed.ipa); + const ipa = wikiIpa || nlpIpa; + const wikiIpaSplit = wikiIpa.split("."); + const nlpIpaSplit = nlpIpa.split("."); + if (wikiIpaSplit.length !== nlpIpaSplit.length) { + console.log("ipa mismatch"); + console.log(wikiIpa); + console.log(nlpIpa); + // return; + } + if (analyzed.syllables.length !== wikiIpaSplit.length) { + console.log("syllable analysis mismatch", j.word); + console.log({ syls: analyzed.syllables, ipa: wikiIpaSplit }); + // console.dir(j, { depth: null }); + return; + } + pdb.addPronunciation(wordId, ipa, analyzed.syllables.length, tags, null); + + for (let i = 0; i < analyzed.syllables.length; i++) { + const spelling = analyzed.syllables[i]!; + const ipa = wikiIpaSplit[i]!; + try { + await handleSyllable(spelling, ipa, wordId, i); + } catch (e) { + console.error("syl error", j.word, j.sounds); + console.error({ spelling, ipa, wikiIpaSplit }); + console.error(e); + } + } +} +const thaiTones: Record<string, string> = { + "˧": "mid", + "˨˩": "low", + "˥˩": "falling", + "˦˥": "high", + "˩˩˦": "rising", +}; +const thaiToneNums: Record<string, number> = { + "˧": 33, + "˨˩": 21, + "˥˩": 41, + "˦˥": 45, + "˩˩˦": 214, +}; +function parseTone(ipa: string, spelling: string): Tone { + try { + const name = thaiTones[ipa]!; + const numbers = thaiToneNums[ipa]!; + return { letters: ipa, name, numbers }; + } catch (e) { + console.error("wrong tones!!", { s: spelling, ipa }); + throw new Error(""); + } +} +async function handleSyllable( + spelling: string, + ipa: string, + wordId: number | bigint, + idx: number, +) { + const sorsyl = await sorSyl(spelling, "th", ipa); + if (sorsyl.syls.length !== 1) throw new Error("wtf sorsyl!"); + const syl = sorsyl.syls[0]!; + const tone = syl.tone ? parseTone(syl.tone, spelling) : null; + try { + pdb.addSyllable( + wordId, + idx + 1, + "th", + syl.ipa, + syl.long, + spelling, + { spelling: syl.onset, ipa: syl.onset }, + { spelling: syl.medial, ipa: syl.medial }, + { spelling: syl.nucleus, ipa: syl.nucleus }, + { spelling: syl.coda, ipa: syl.coda }, + { spelling: syl.rhyme, ipa: syl.rhyme }, + tone, + null, + ); + } catch (e) { + // console.log("well fuck", syl); + // console.error(e); + console.log(); + } +} +async function handleIdiom(idiom: string) { + pdb.addIdiom(idiom, "th"); + // TODO later set idiom_words once all words are populated + // console.log(); +} +// ช้า ๆ +// งก ๆ +// หงก ๆ + +async function getFrequency() { + const files = [ + "/home/y/code/prosody/resources/langdata/thai/data/1yin_freq.csv", + "/home/y/code/prosody/resources/langdata/thai/data/2yin_freq.csv", + "/home/y/code/prosody/resources/langdata/thai/data/3yin_freq.csv", + "/home/y/code/prosody/resources/langdata/thai/data/4yin_freq.csv", + "/home/y/code/prosody/resources/langdata/thai/data/5yin_freq.csv", + "/home/y/code/prosody/resources/langdata/thai/data/6yin_freq.csv", + ]; + const freqMap = new Map<number, string>(); + for (const file of files) { + await handleFile(file, (line, idx) => { + const [spelling, IPA, tone, length, frequency, ...rest] = line.split(","); + freqMap.set(Number(frequency!), spelling!); + }); + } + const orderedMap = new Map<string, number>(); + const keys = Array.from(freqMap.keys()).sort(); + for (let i = 0; i < keys.length; i++) { + const val = freqMap.get(keys[i]!)!; + orderedMap.set(val, i + 1); + } + return orderedMap; +} + +readDump("th"); diff --git a/src/lib/db/utils.ts b/src/lib/db/utils.ts new file mode 100644 index 0000000..1ac577f --- /dev/null +++ b/src/lib/db/utils.ts @@ -0,0 +1,29 @@ +export async function handleFile( + filename: string, + func: (line: string, idx: number) => void, +) { + const file = Bun.file(filename); + const s = file.stream(); + const reader = s.getReader(); + const decoder = new TextDecoder(); + let leftover = ""; + let lineCount = 0; + while (true) { + const { value, done } = await reader.read(); + if (done) break; + const chunk = decoder.decode(value, { stream: true }); + const lines = (leftover + chunk).split("\n"); + + // Process each line except the last (which might be incomplete) + for (const line of lines.slice(0, -1)) { + lineCount++; + func(line, lineCount); + } + + // Save the last incomplete line to process in the next iteration + leftover = lines[lines.length - 1]; + } + + // Handle any remaining content after reading all chunks + if (leftover) func(leftover, lineCount + 1); +} diff --git a/src/lib/types/phonetics.ts b/src/lib/types/phonetics.ts new file mode 100644 index 0000000..0009e78 --- /dev/null +++ b/src/lib/types/phonetics.ts @@ -0,0 +1,22 @@ +export type Tone = { + letters: string; + numbers: number; + name: string; +}; + +export type Phoneme = { + ipa: string; + spelling: string; +}; +export type Syllable = { + stressed: boolean; + long: boolean; + spelling: string; + ipa: string; + nucleus: Phoneme; + onset: Phoneme; + medial: Phoneme; + coda: Phoneme; + rhyme: Phoneme; + tone: Tone; +}; diff --git a/src/lib/utils.ts b/src/lib/utils.ts index 9bc74b8..0674dea 100644 --- a/src/lib/utils.ts +++ b/src/lib/utils.ts @@ -57,3 +57,9 @@ export function getRandomHexColor() { // Ensure the color code is always 6 digits by padding with zeros if needed return "#" + randomColor.padStart(6, "0"); } + +export function cleanIpa(ipa: string): string { + const r1 = /\.\//; + const r2 = /[\[\]\/]/g; + return ipa.replace(r1, "").replace(r2, ""); +} |