author     polwex <polwex@sortug.com>    2025-06-02 23:05:36 +0700
committer  polwex <polwex@sortug.com>    2025-06-02 23:05:36 +0700
commit     904b34de8f7748b7954d88784369b9cae6fa92fb (patch)
tree       53bb5cb3377ae40d8bfa44087a0c712edd6c9d02
parent     a03c92dc82ad527d7da6bbaa3c43000e2e5f0e69 (diff)
all me here should merge
-rw-r--r--  NOTES.md                        4
-rw-r--r--  src/lib/calls/nlp.ts          112
-rw-r--r--  src/lib/db/perf.ts             43
-rw-r--r--  src/lib/db/prosodydb.ts       153
-rw-r--r--  src/lib/db/prosodyschema.sql   67
-rw-r--r--  src/lib/db/seed.ts            132
-rw-r--r--  src/lib/db/thaiseed.ts        184
-rw-r--r--  src/lib/db/utils.ts            29
-rw-r--r--  src/lib/types/phonetics.ts     22
-rw-r--r--  src/lib/utils.ts                6
10 files changed, 576 insertions, 176 deletions
diff --git a/NOTES.md b/NOTES.md
new file mode 100644
index 0000000..c853835
--- /dev/null
+++ b/NOTES.md
@@ -0,0 +1,4 @@
+some weirdness:
+
+วันพฤหัสบดี
+วันพฤหัส
diff --git a/src/lib/calls/nlp.ts b/src/lib/calls/nlp.ts
index 24e7cf3..3cff415 100644
--- a/src/lib/calls/nlp.ts
+++ b/src/lib/calls/nlp.ts
@@ -1,13 +1,35 @@
import { SyllableRes } from "../types/cards";
-type AnalyzeRes = {
+export type ThaiNLPRes = {
word: string;
+ normalized: string;
syllables: string[];
+ syllablesIpa: string[];
ipa: string;
pos: string;
};
+export type SorSylRes = {
+ word: string;
+ ipa: string;
+ clean_ipa: string;
+ syls: SorSyl[];
+};
+export type SorSyl = {
+ stressed: boolean;
+ long: boolean;
+ spelling: string;
+ ipa: string;
+ nucleus: string;
+ onset: string;
+ medial: string;
+ coda: string;
+ rhyme: string;
+ tone: string;
+ start_idx: number;
+ end_idx: number;
+};
-export async function thaiData(word: string): Promise<AnalyzeRes[]> {
+export async function thaiData(word: string): Promise<ThaiNLPRes[]> {
const [head, tail] = await Promise.all([
analyzeTHWord(word),
segmentateThai(word),
@@ -15,7 +37,7 @@ export async function thaiData(word: string): Promise<AnalyzeRes[]> {
return [head, ...tail];
}
-export async function analyzeTHWord(word: string): Promise<AnalyzeRes> {
+export async function analyzeTHWord(word: string): Promise<ThaiNLPRes> {
const opts = {
method: "POST",
headers: { "Content-type": "application/json" },
@@ -26,7 +48,7 @@ export async function analyzeTHWord(word: string): Promise<AnalyzeRes> {
const jj = await r1.json();
return jj;
}
-export async function segmentateThai(sentence: string): Promise<AnalyzeRes[]> {
+export async function segmentateThai(sentence: string): Promise<ThaiNLPRes[]> {
const opts = {
method: "POST",
headers: { "Content-type": "application/json" },
@@ -37,6 +59,70 @@ export async function segmentateThai(sentence: string): Promise<AnalyzeRes[]> {
const jj = await r2.json();
return jj;
}
+export async function getThaiFreq(word: string): Promise<number> {
+ const opts = {
+ method: "POST",
+ headers: { "Content-type": "application/json" },
+ body: JSON.stringify({ word }),
+ };
+ // const r1 = await fetch(`http://localhost:8000/segmentate`, opts);
+ const r2 = await fetch("http://localhost:8001" + `/freq`, opts);
+ const jj = await r2.json();
+ return jj;
+}
+export async function getThaiNext(word: string): Promise<string[]> {
+ const opts = {
+ method: "POST",
+ headers: { "Content-type": "application/json" },
+ body: JSON.stringify({ word }),
+ };
+ // const r1 = await fetch(`http://localhost:8000/segmentate`, opts);
+ const r2 = await fetch("http://localhost:8001" + `/next`, opts);
+ const jj = await r2.json();
+ return jj;
+}
+
+export async function getThaiPrev(word: string): Promise<string[]> {
+ const opts = {
+ method: "POST",
+ headers: { "Content-type": "application/json" },
+ body: JSON.stringify({ word }),
+ };
+ // const r1 = await fetch(`http://localhost:8000/segmentate`, opts);
+ const r2 = await fetch("http://localhost:8001" + `/prev`, opts);
+ const jj = await r2.json();
+ return jj;
+}
+
+export async function getThaiNext_bi(
+ word1: string,
+ word2: string,
+): Promise<string[]> {
+ const opts = {
+ method: "POST",
+ headers: { "Content-type": "application/json" },
+ body: JSON.stringify({ word1, word2 }),
+ };
+ // const r1 = await fetch(`http://localhost:8000/segmentate`, opts);
+ const r2 = await fetch("http://localhost:8001" + `/next_bi`, opts);
+ const jj = await r2.json();
+ return jj;
+}
+
+export async function getThaiPrev_bi(
+ word1: string,
+ word2: string,
+): Promise<string[]> {
+ const opts = {
+ method: "POST",
+ headers: { "Content-type": "application/json" },
+ body: JSON.stringify({ word1, word2 }),
+ };
+ // const r1 = await fetch(`http://localhost:8000/segmentate`, opts);
+ const r2 = await fetch("http://localhost:8001" + `/prev_bi`, opts);
+ const jj = await r2.json();
+ return jj;
+}
export async function deconstructSyllable(ipa: string): Promise<SyllableRes> {
const opts = {
@@ -52,6 +138,24 @@ export async function deconstructSyllable(ipa: string): Promise<SyllableRes> {
const jj = await r2.json();
return jj;
}
+export async function sorSyl(
+ word: string,
+ lang_code: string,
+ ipa: string,
+): Promise<SorSylRes> {
+ const opts = {
+ method: "POST",
+ headers: {
+ "Content-type": "application/json",
+ "X-API-KEY": Bun.env.SORTUG_NLP_API_KEY!,
+ },
+ body: JSON.stringify({ string: word, lang: lang_code, ipa }),
+ };
+ // const r1 = await fetch(`http://localhost:8000/segmentate`, opts);
+ const r2 = await fetch("http://localhost:8104" + `/syls`, opts);
+ const jj = await r2.json();
+ return jj;
+}
export async function findLemma(word: string, lang: string) {
const opts = {
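
For reference, a minimal driver showing how the new n-gram helpers compose (a sketch only; it assumes the local NLP service on port 8001 referenced in this diff is running, and the word is just an example):

    import { getThaiFreq, getThaiNext, getThaiPrev } from "@/lib/calls/nlp";

    const word = "วันพฤหัส";
    const freq = await getThaiFreq(word);   // corpus frequency from the :8001 service
    const next = await getThaiNext(word);   // likely following words (unigram context)
    const prev = await getThaiPrev(word);   // likely preceding words
    console.log({ word, freq, next, prev });
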
diff --git a/src/lib/db/perf.ts b/src/lib/db/perf.ts
index a5b57c3..d805314 100644
--- a/src/lib/db/perf.ts
+++ b/src/lib/db/perf.ts
@@ -1,4 +1,47 @@
/**
+ * Database Performance Optimizations Documentation
+ * ===============================================
+ *
+ * 1. SRS Card Fetching Optimization
+ * ---------------------------------
+ * Problem: When processing card reviews in the SRS system, the application was fetching an entire
+ * lesson's worth of cards just to retrieve a single updated card. This was inefficient, especially
+ * for lessons with many cards.
+ *
+ * Solution: Implemented a dedicated `fetchCardById` method in DatabaseHandler that retrieves only
+ * the specific card needed with all its associated data (expression, progress, etc.). This method
+ * is used in SRSStudyService.processReview to efficiently fetch just the updated card after a review.
+ *
+ * Impact:
+ * - Reduced database query load by eliminating unnecessary card fetches
+ * - Fixed the "Failed to fetch updated card data" error that occurred when processing reviews
+ * - Made card reviews more reliable and efficient
+ *
+ * Implementation details:
+ * 1. Added fetchCardById method to DatabaseHandler class
+ * 2. Updated SRSStudyService.processReview to use fetchCardById instead of fetchLesson
+ * 3. Maintained consistent timing measurements for performance monitoring
+ *
+ * 2. SQLite Optimization Techniques
+ * --------------------------------
+ * - WAL (Write-Ahead Logging) mode enabled for better concurrency
+ * - Increased cache size to 8MB for improved read performance
+ * - Temp tables stored in memory rather than disk
+ * - Reduced synchronous mode to NORMAL for better write performance
+ * - Added strategic indexes on frequently queried columns
+ *
+ * 3. JSON Processing Optimization
+ * ------------------------------
+ * - Measured and isolated JSON processing time from query execution time
+ * - Confirmed that database queries (~329ms) were the primary bottleneck rather than
+ * JSON processing (~0.8ms)
+ *
+ * 4. Query-Level Optimizations
+ * ---------------------------
+ * - Used proper indexing for user_progress, expressions, and cards_lessons tables
+ * - Optimized JOIN conditions to ensure efficient execution plans
+ * - Used parameterized queries to take advantage of SQLite's query cache
+ *
* Database performance optimization suggestions
*/
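
Section 1 above describes fetchCardById only in prose; a minimal sketch of its shape follows. The join columns (expression_id, card_id) are assumptions made for illustration, not the actual schema:

    fetchCardById(cardId: number) {
      // One round trip: the card plus its expression and progress rows, instead of a whole lesson.
      return this.db
        .query(
          `SELECT c.*, e.*, up.*
             FROM cards c
             JOIN expressions e ON e.id = c.expression_id
        LEFT JOIN user_progress up ON up.card_id = c.id
            WHERE c.id = ?`,
        )
        .get(cardId);
    }

And the SQLite settings from section 2, written out with the same bun:sqlite calls prosodydb.ts already uses; the 8 MB cache figure comes straight from the notes:

    import Database from "bun:sqlite";

    const db = new Database("bulkdata/phon.db", { create: true });
    db.exec("PRAGMA journal_mode = WAL");   // readers no longer block the writer
    db.exec("PRAGMA synchronous = NORMAL"); // fewer fsyncs; still safe under WAL
    db.exec("PRAGMA cache_size = -8000");   // negative value = KiB, so roughly 8 MB of page cache
    db.exec("PRAGMA temp_store = MEMORY");  // temp tables and indices stay in RAM
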
diff --git a/src/lib/db/prosodydb.ts b/src/lib/db/prosodydb.ts
index 52312bd..ec95359 100644
--- a/src/lib/db/prosodydb.ts
+++ b/src/lib/db/prosodydb.ts
@@ -1,11 +1,12 @@
import Database from "bun:sqlite";
+import { Phoneme, Tone } from "../types/phonetics";
type Str = string | null;
type ItemType = "word" | "syllable" | "idiom";
class DatabaseHandler {
db: Database;
constructor() {
- const dbPath = "/home/y/code/bun/ssr/waku/bulkdata/prosodynew.db";
+ const dbPath = "/home/y/code/bun/ssr/waku/bulkdata/phon.db";
const db = new Database(dbPath, { create: true });
db.exec("PRAGMA journal_mode = WAL"); // Enable Write-Ahead Logging for better performance
db.exec("PRAGMA foreign_keys = ON");
@@ -31,48 +32,39 @@ class DatabaseHandler {
.run(code, name);
}
addPronunciation(
- type: ItemType,
- parentId: number | bigint,
+ wordId: number | bigint,
ipa: string,
syllables: number,
tags: Str,
notes: Str,
) {
- try {
- const query = this.db
- .query(
- `INSERT INTO pronunciation(type, parent_id,ipa, syllables, tag, notes) VALUES(?, ?, ?, ?, ?, ?)`,
- )
- .run(type, parentId, ipa, syllables, tags, notes);
- } catch (e) {
- // console.error(e);
- }
+ const query = this.db
+ .query(
+ `INSERT OR IGNORE INTO word_phonetics(word_id,ipa, syllables, tag, notes) VALUES(?, ?, ?, ?, ?)`,
+ )
+ .run(wordId, ipa, syllables, tags, notes);
}
addWordRhyme(wordId: number | bigint, ipa: string, lang: string, notes: Str) {
- try {
- const query = this.db
- .query(
- `INSERT INTO word_rhymes(text, lang, notes) VALUES(?, ?, ?)
+ const query = this.db
+ .query(
+ `INSERT INTO word_rhymes(text, lang, notes) VALUES(?, ?, ?)
ON CONFLICT(text,lang) DO UPDATE SET
text = excluded.text
RETURNING rowid
`,
- )
- .get(ipa, lang, notes) as { id: number };
- const query2 = this.db
- .query(
- `
- INSERT INTO words_idioms(word_id, idiom_id) VALUES(?, ?)
+ )
+ .get(ipa, lang, notes) as { id: number };
+ const query2 = this.db
+ .query(
+ `
+ INSERT INTO words_wrhymes(word_id, wrhyme_id) VALUES(?, ?)
`,
- )
- .run(wordId, query.id);
- } catch (e) {
- // console.error(e);
- }
+ )
+ .run(wordId, query.id);
}
addIdiom(spelling: string, lang: string) {
const query = this.db.query(
- `INSERT INTO idioms(spelling, lang) VALUES(?, ?)`,
+ `INSERT OR IGNORE INTO idioms(spelling, lang) VALUES(?, ?)`,
);
const res = query.run(spelling, lang);
return res;
@@ -100,49 +92,72 @@ class DatabaseHandler {
this.findIdiomWords(row.spelling, row.id);
}
}
- addWord(spelling: string, lang: string) {
+ addWord(
+ spelling: string,
+ lang: string,
+ frequency: number | null,
+ notes: Str,
+ ) {
const query = this.db.query(
- // `INSERT OR IGNORE INTO words(spelling, lang) VALUES(?, ?)`,
- `INSERT INTO words(spelling, lang) VALUES(?, ?)`,
+ `INSERT OR IGNORE INTO words(spelling, lang, frequency, notes) VALUES(?, ?, ?, ?)`,
+ // `INSERT INTO words(spelling, lang) VALUES(?, ?)`,
);
- const res = query.run(spelling, lang);
+ const res = query.run(spelling, lang, frequency, notes);
const wordId = res.lastInsertRowid;
return wordId;
}
addSyllable(
wordId: number | bigint,
- text: string,
+ sylIdx: number,
lang: string,
+ ipa: string,
long: boolean,
- onset: Str,
- medial: Str,
- nucleus: string,
- coda: Str,
- rhyme: string,
- tone: Str,
+ text: string,
+ onset: Phoneme,
+ medial: Phoneme,
+ nucleus: Phoneme,
+ coda: Phoneme,
+ rhyme: Phoneme,
+ tone: Tone | null,
notes: Str,
) {
const tx = this.db.transaction(() => {
const query = this.db.query(
- `INSERT INTO syllables(text, lang, long, onset, medial, nucleus, coda, rhyme, tone, notes) VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
+ `INSERT INTO syllables(lang, ipa, long, text, onset, medial, nucleus, coda, rhyme, notes) VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
);
+      // TODO need a dual structure here for IPA and orthography
const res = query.run(
- text,
lang,
+ ipa,
long,
- onset,
- medial,
- nucleus,
- coda,
- rhyme,
- tone,
+ text,
+ onset.spelling,
+ medial.spelling,
+ nucleus.spelling,
+ coda.spelling,
+ rhyme.spelling,
notes,
);
const sylId = res.lastInsertRowid;
-
+ const ipaq = this.db.query(`
+ INSERT INTO syl_ipa(syl_id, ipa, onset, medial, nucleus, coda, rhyme, notes)
+ VALUES(?, ?, ?, ?, ?, ?, ?, ?)`);
+ ipaq.run(
+ sylId,
+ ipa,
+ onset.ipa,
+ medial.ipa,
+ nucleus.ipa,
+ coda.ipa,
+ rhyme.ipa,
+ null,
+ );
+ //
const res1 = this.db
- .query(`INSERT INTO syllables_words(syl_id, word_id) VALUES(?, ?)`)
- .run(sylId, wordId);
+ .query(
+ `INSERT INTO syllables_words(syl_id, word_id, idx) VALUES(?, ?, ?)`,
+ )
+ .run(sylId, wordId, sylIdx);
//
return sylId;
});
@@ -151,13 +166,13 @@ class DatabaseHandler {
if (onset) {
res1 = this.db
.query(
- `INSERT INTO onsets(text, lang) VALUES(?, ?)
- ON CONFLICT(text, lang) DO UPDATE SET
+ `INSERT INTO onsets(ipa, lang, text) VALUES(?, ?, ?)
+ ON CONFLICT(ipa, lang, text) DO UPDATE SET
text = excluded.text
RETURNING rowid
`,
)
- .get(onset, lang);
+ .get(onset.ipa, lang, onset.spelling);
this.db
.query(`INSERT INTO onsets_syllables(syl_id, onset_id) VALUES(?, ?)`)
.run(sylId, res1.id);
@@ -165,65 +180,65 @@ class DatabaseHandler {
if (medial) {
res1 = this.db
.query(
- `INSERT INTO medials(text, lang) VALUES(?, ?)
- ON CONFLICT(text, lang) DO UPDATE SET
+ `INSERT INTO medials(ipa, lang, text) VALUES(?, ?, ?)
+ ON CONFLICT(ipa, lang, text) DO UPDATE SET
text = excluded.text
RETURNING rowid
`,
)
- .get(medial, lang);
+ .get(medial.ipa, lang, medial.spelling);
this.db
.query(`INSERT INTO medials_syllables(syl_id, medial_id) VALUES(?, ?)`)
.run(sylId, res1.id);
}
res1 = this.db
.query(
- `INSERT INTO nucleus(text, lang) VALUES(?, ?)
- ON CONFLICT(text, lang) DO UPDATE SET
+ `INSERT INTO nucleus(ipa, lang, text) VALUES(?, ?, ?)
+ ON CONFLICT(ipa, lang, text) DO UPDATE SET
text = excluded.text
RETURNING rowid
`,
)
- .get(nucleus, lang);
+ .get(nucleus.ipa, lang, nucleus.spelling);
this.db
.query(`INSERT INTO nucleus_syllables(syl_id, nucleus_id) VALUES(?, ?)`)
.run(sylId, res1.id);
if (coda) {
res1 = this.db
.query(
- `INSERT INTO codas(text, lang) VALUES(?, ?)
- ON CONFLICT(text, lang) DO UPDATE SET
+ `INSERT INTO codas(ipa, lang, text) VALUES(?, ?, ?)
+ ON CONFLICT(ipa, lang, text) DO UPDATE SET
text = excluded.text
RETURNING rowid
`,
)
- .get(coda, lang);
+ .get(coda.ipa, lang, coda.spelling);
this.db
.query(`INSERT INTO codas_syllables(syl_id, coda_id) VALUES(?, ?)`)
.run(sylId, res1.id);
}
res1 = this.db
.query(
- `INSERT INTO rhymes(text, lang) VALUES(?, ?)
- ON CONFLICT(text, lang) DO UPDATE SET
+ `INSERT INTO rhymes(ipa, lang, text) VALUES(?, ?, ?)
+ ON CONFLICT(ipa, lang, text) DO UPDATE SET
text = excluded.text
RETURNING rowid
`,
)
- .get(rhyme, lang);
+ .get(rhyme.ipa, lang, rhyme.spelling);
this.db
.query(`INSERT INTO rhymes_syllables(syl_id, rhyme_id) VALUES(?, ?)`)
.run(sylId, res1.id);
if (tone) {
res1 = this.db
.query(
- `INSERT INTO tones(text, lang) VALUES(?, ?)
- ON CONFLICT(text, lang) DO UPDATE SET
- text = excluded.text
+ `INSERT INTO tones(ipa, lang, name, nums) VALUES(?, ?, ?, ?)
+ ON CONFLICT(ipa, lang) DO UPDATE SET
+ ipa = excluded.ipa
RETURNING rowid
`,
)
- .get(tone, lang);
+ .get(tone.letters, lang, tone.name, tone.numbers);
this.db
.query(`INSERT INTO tones_syllables(syl_id, tone_id) VALUES(?, ?)`)
.run(sylId, res1.id);
diff --git a/src/lib/db/prosodyschema.sql b/src/lib/db/prosodyschema.sql
index e70b005..09dabc2 100644
--- a/src/lib/db/prosodyschema.sql
+++ b/src/lib/db/prosodyschema.sql
@@ -35,6 +35,7 @@ CREATE TABLE IF NOT EXISTS words(
spelling TEXT NOT NULL,
lang TEXT NOT NULL,
frequency INTEGER,
+ notes TEXT,
FOREIGN KEY (lang) REFERENCES languages(iso6392),
CONSTRAINT spell_unique UNIQUE (spelling, lang)
);
@@ -48,7 +49,7 @@ CREATE TABLE IF NOT EXISTS word_rhymes(
notes TEXT,
CONSTRAINT wrhyme_unique UNIQUE (text, lang)
);
-CREATE TABLE IF NOT EXISTS words_rhymes(
+CREATE TABLE IF NOT EXISTS words_wrhymes(
word_id INTEGER NOT NULL,
wrhyme_id INTEGER NOT NULL,
FOREIGN KEY (word_id) REFERENCES words(id),
@@ -58,57 +59,62 @@ CREATE TABLE IF NOT EXISTS words_rhymes(
-- break up syllables
CREATE TABLE IF NOT EXISTS syllables(
id INTEGER PRIMARY KEY AUTOINCREMENT,
- text TEXT NOT NULL,
lang TEXT NOT NULL,
+ ipa TEXT NOT NULL,
long INTEGER NOT NULL,
- tone TEXT,
- onset TEXT,
- medial TEXT,
- nucleus TEXT,
- coda TEXT,
- rhyme TEXT,
+ text TEXT NOT NULL,
+ onset TEXT NOT NULL,
+ medial TEXT NOT NULL,
+ nucleus TEXT NOT NULL,
+ coda TEXT NOT NULL,
+ rhyme TEXT NOT NULL,
notes TEXT,
FOREIGN KEY (lang) REFERENCES languages(iso6392),
- CONSTRAINT spell_unique UNIQUE (text, lang)
+ CONSTRAINT syllable_unique UNIQUE (text, ipa, lang)
);
CREATE TABLE IF NOT EXISTS tones(
id INTEGER PRIMARY KEY AUTOINCREMENT,
- text TEXT NOT NULL,
+ ipa TEXT NOT NULL,
lang TEXT NOT NULL,
- name TEXT,
- num INTEGER,
- CONSTRAINT tone_unique UNIQUE (text, lang)
+ name TEXT NOT NULL,
+ nums INTEGER NOT NULL,
+ CONSTRAINT tone_unique UNIQUE (ipa, lang)
);
CREATE TABLE IF NOT EXISTS onsets(
id INTEGER PRIMARY KEY AUTOINCREMENT,
+ ipa TEXT NOT NULL,
text TEXT NOT NULL,
lang TEXT NOT NULL,
- CONSTRAINT onsets_unique UNIQUE (text, lang)
+ CONSTRAINT onsets_unique UNIQUE (ipa, text, lang)
);
CREATE TABLE IF NOT EXISTS medials(
id INTEGER PRIMARY KEY AUTOINCREMENT,
+ ipa TEXT NOT NULL,
text TEXT NOT NULL,
lang TEXT NOT NULL,
- CONSTRAINT medials_unique UNIQUE (text, lang)
+  CONSTRAINT medials_unique UNIQUE (ipa, text, lang)
);
CREATE TABLE IF NOT EXISTS nucleus(
id INTEGER PRIMARY KEY AUTOINCREMENT,
+ ipa TEXT NOT NULL,
text TEXT NOT NULL,
lang TEXT NOT NULL,
- CONSTRAINT nucleus_unique UNIQUE (text, lang)
+  CONSTRAINT nucleus_unique UNIQUE (ipa, text, lang)
);
CREATE TABLE IF NOT EXISTS codas(
id INTEGER PRIMARY KEY AUTOINCREMENT,
+ ipa TEXT NOT NULL,
text TEXT NOT NULL,
lang TEXT NOT NULL,
- CONSTRAINT coda_unique UNIQUE (text, lang)
+  CONSTRAINT coda_unique UNIQUE (ipa, text, lang)
);
CREATE TABLE IF NOT EXISTS rhymes(
id INTEGER PRIMARY KEY AUTOINCREMENT,
+ ipa TEXT NOT NULL,
text TEXT NOT NULL,
lang TEXT NOT NULL,
- CONSTRAINT rhyme_unique UNIQUE (text, lang)
+  CONSTRAINT rhyme_unique UNIQUE (ipa, text, lang)
);
-- join tables
@@ -153,9 +159,12 @@ CREATE TABLE IF NOT EXISTS rhymes_syllables(
CREATE TABLE IF NOT EXISTS syllables_words(
syl_id INTEGER NOT NULL,
word_id INTEGER NOT NULL,
+ idx INTEGER NOT NULL,
FOREIGN KEY (syl_id) REFERENCES syllables(id),
FOREIGN KEY (word_id) REFERENCES words(id)
);
+
+
CREATE TABLE IF NOT EXISTS words_idioms(
word_id INTEGER NOT NULL,
idiom_id INTEGER NOT NULL,
@@ -165,14 +174,26 @@ CREATE TABLE IF NOT EXISTS words_idioms(
--
-CREATE TABLE IF NOT EXISTS pronunciation(
+CREATE TABLE IF NOT EXISTS syl_ipa(
id INTEGER PRIMARY KEY AUTOINCREMENT,
- type TEXT CHECK(type IN ('word', 'syllable', 'idiom')) NOT NULL,
- parent_id INTEGER NOT NULL,
+ syl_id INTEGER NOT NULL,
+ ipa TEXT NOT NULL,
+ onset TEXT NOT NULL,
+ medial TEXT NOT NULL,
+ nucleus TEXT NOT NULL,
+ rhyme TEXT NOT NULL,
+ coda TEXT NOT NULL,
+ notes TEXT,
+ CONSTRAINT syl_ipa_unique UNIQUE (ipa, syl_id)
+);
+
+CREATE TABLE IF NOT EXISTS word_phonetics(
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
+ word_id INTEGER NOT NULL,
ipa TEXT NOT NULL,
syllables INTEGER NOT NULL,
tag TEXT,
notes TEXT,
- CONSTRAINT ipa_unique UNIQUE (ipa, parent_id)
+ CONSTRAINT ipa_unique UNIQUE (ipa, word_id)
);
-CREATE INDEX IF NOT EXISTS idx_words_ipa ON pronunciation(ipa, parent_id);
+CREATE INDEX IF NOT EXISTS idx_words_ipa ON word_phonetics(ipa, word_id);
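
To see the new idx column on syllables_words in use, a hedged read-back query that fetches a word's syllables in order (it relies only on words.id, which the foreign keys above already reference):

    import Database from "bun:sqlite";

    const db = new Database("bulkdata/phon.db");
    const syllablesOf = db.query(
      `SELECT s.text, s.ipa, sw.idx
         FROM words w
         JOIN syllables_words sw ON sw.word_id = w.id
         JOIN syllables s ON s.id = sw.syl_id
        WHERE w.spelling = ? AND w.lang = ?
        ORDER BY sw.idx`,
    );
    console.log(syllablesOf.all("วันพฤหัส", "th"));
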
diff --git a/src/lib/db/seed.ts b/src/lib/db/seed.ts
index 4780dc3..c03da60 100644
--- a/src/lib/db/seed.ts
+++ b/src/lib/db/seed.ts
@@ -1,3 +1,4 @@
+import Database from "bun:sqlite";
import { readWiktionaryDump } from "../services/wiki";
import { getStressedSyllable, getSyllableCount } from "../utils";
import useful from "@/lib/useful_thai.json";
@@ -7,36 +8,6 @@ import { findLemma } from "../calls/nlp";
const SYMBOL_REGEX = new RegExp(/[\W\d]/);
-async function handleFile(
- filename: string,
- func: (line: string, idx: number) => void,
-) {
- const file = Bun.file(filename);
- const s = file.stream();
- const reader = s.getReader();
- const decoder = new TextDecoder();
- let leftover = "";
- let lineCount = 0;
- while (true) {
- const { value, done } = await reader.read();
- if (done) break;
- const chunk = decoder.decode(value, { stream: true });
- const lines = (leftover + chunk).split("\n");
-
- // Process each line except the last (which might be incomplete)
- for (const line of lines.slice(0, -1)) {
- lineCount++;
- func(line, lineCount);
- }
-
- // Save the last incomplete line to process in the next iteration
- leftover = lines[lines.length - 1];
- }
-
- // Handle any remaining content after reading all chunks
- if (leftover) func(leftover, lineCount + 1);
-}
-
function goodPos(pos: string): boolean {
const list = [
"CC",
@@ -90,12 +61,12 @@ async function englishFreq() {
}
async function thaiFreq() {
const files = [
- "/home/y/code/prosody/prosody/langdata/thai/data/1yin_freq.csv",
- "/home/y/code/prosody/prosody/langdata/thai/data/2yin_freq.csv",
- "/home/y/code/prosody/prosody/langdata/thai/data/3yin_freq.csv",
- "/home/y/code/prosody/prosody/langdata/thai/data/4yin_freq.csv",
- "/home/y/code/prosody/prosody/langdata/thai/data/5yin_freq.csv",
- "/home/y/code/prosody/prosody/langdata/thai/data/6yin_freq.csv",
+ "/home/y/code/prosody/resources/langdata/thai/data/1yin_freq.csv",
+ "/home/y/code/prosody/resources/langdata/thai/data/2yin_freq.csv",
+ "/home/y/code/prosody/resources/langdata/thai/data/3yin_freq.csv",
+ "/home/y/code/prosody/resources/langdata/thai/data/4yin_freq.csv",
+ "/home/y/code/prosody/resources/langdata/thai/data/5yin_freq.csv",
+ "/home/y/code/prosody/resources/langdata/thai/data/6yin_freq.csv",
];
for (let f of files) {
handleFile(f, (line, idx) => {
@@ -508,52 +479,51 @@ function fixSyllables() {
//
const SORSYL_PATH =
"/nix/store/lkyi9rrjbr619w3ivpkm89ccf93bvxx5-sorsyl-0.1.0/bin/sorsyl";
-async function redump() {
- await pdb.init();
- let count = 0;
- // const soundTypes = new Set<string>();
- // [
- // "tags", "ipa", "audio", "ogg_url", "mp3_url", "enpr", "rhymes", "homophone", "note", "zh-pron", "other",
- // "text", "hangeul", "topics", "form", "audio-ipa"
- // ]
- const langs = ["en", "th", "zh", "es", "ja", "vn"];
- for await (const line of readWiktionaryDump()) {
- try {
- count++;
- console.log({ count });
- // if (count > 50) break;
- const j = JSON.parse(line);
- // console.log(Object.keys(j), j.word);
- // add language to db
- pdb.addLanguage(j.lang_code, j.lang);
- if (!langs.includes(j.lang_code)) continue;
- // handleEtim(j);
- // handleDerived(j);
- // handleSenses(j.pos, j.senses);
- // //
- const isWord = j.word.trim().split(" ").length === 1;
- if (isWord) await handleWord(j);
- else await handleIdiom(j);
- } catch (e) {
- // console.log("error parsing", e);
- // break;
- }
+async function redump(lang: string) {
+ let count = 0;
+ const langdb = new Database(
+ `/home/y/code/prosody/resources/wiktionary/${lang}.db`,
+ );
+ const langrows: any = langdb.query("SELECT data FROM langs");
+  for (const langrow of langrows) {
+    count++;
+    const j = JSON.parse(langrow.data);
+    console.log({ j });
+    if (count > 10) break;
+  }
+ // await pdb.init();
+
+ // // const soundTypes = new Set<string>();
+ // // [
+ // // "tags", "ipa", "audio", "ogg_url", "mp3_url", "enpr", "rhymes", "homophone", "note", "zh-pron", "other",
+ // // "text", "hangeul", "topics", "form", "audio-ipa"
+ // // ]
+ // const langs = ["en", "th", "zh", "es", "ja", "vn"];
+
+ // for await (const line of readWiktionaryDump()) {
+ // try {
+ // count++;
+ // console.log({ count });
+ // // if (count > 50) break;
+ // const j = JSON.parse(line);
+ // // console.log(Object.keys(j), j.word);
+ // // add language to db
+ // pdb.addLanguage(j.lang_code, j.lang);
+ // if (!langs.includes(j.lang_code)) continue;
+ // // handleEtim(j);
+ // // handleDerived(j);
+ // // handleSenses(j.pos, j.senses);
+ // // //
+ // const isWord = j.word.trim().split(" ").length === 1;
+ // if (isWord) await handleWord(j);
+ // else await handleIdiom(j);
+ // } catch (e) {
+ // // console.log("error parsing", e);
+ // // break;
+ // }
+ // }
}
-type SorSyl = {
- stressed: boolean;
- long: boolean;
- spelling: string;
- ipa: string;
- nucleus: string;
- onset: string;
- medial: string;
- coda: string;
- rhyme: string;
- tone: string;
-};
async function handleWord(j: any) {
let ts = Date.now();
const analyzed = await findLemma(j.word, j.lang_code);
@@ -615,9 +585,11 @@ async function handleIpa(
// TODO ideally syllables would have spelling not IPA... harsh tho
pdb.addSyllable(
wordId,
- syl.ipa,
+ idx,
j.lang_code,
+ syl.ipa,
syl.long,
+ "",
syl.onset || null,
syl.medial || null,
syl.nucleus,
@@ -689,7 +661,7 @@ async function handleSenses(pos: string, senses: any[]) {
}
}
-redump();
+redump("th");
async function newtest() {
// const query = pdb.db.query(
diff --git a/src/lib/db/thaiseed.ts b/src/lib/db/thaiseed.ts
new file mode 100644
index 0000000..687f0f3
--- /dev/null
+++ b/src/lib/db/thaiseed.ts
@@ -0,0 +1,184 @@
+import Database from "bun:sqlite";
+import {
+ analyzeTHWord,
+ deconstructSyllable,
+ segmentateThai,
+ type SorSyl,
+ type ThaiNLPRes,
+ sorSyl,
+ getThaiFreq,
+} from "../calls/nlp";
+import pdb from "./prosodydb";
+import { cleanIpa } from "../utils";
+import { handleFile } from "./utils";
+import { Tone } from "../types/phonetics";
+
+async function readDump(lang: string) {
+ await pdb.init();
+ pdb.addLanguage("th", "thai");
+ let count = 0;
+ const langdb = new Database(
+ `/home/y/code/prosody/resources/wiktionary/${lang}.db`,
+ );
+ let langrows: any = langdb.query("SELECT data FROM langs");
+ // langrows = langrows.slice(10);
+ for (const langrow of langrows) {
+ count++;
+ console.log(count);
+ // if (count <= 10000) continue;
+ // if (count > 30) break;
+ const j = JSON.parse(langrow.data);
+ const word = j.word.trim();
+ if (!word) continue;
+ if (word.includes("ๆ")) await handleWord(word, j);
+ else {
+ const split = word.split(" ");
+ if (split.length > 1) await handleIdiom(word);
+ else await handleWord(word, j);
+ }
+ }
+}
+
+async function handleWord(word: string, j: any) {
+ // TODO add categories but add a tag to see what classifying scheme we're using
+ //
+ const sounds = j.sounds || [];
+ const hasIpa = sounds.find((s: any) => "ipa" in s);
+ if (!hasIpa) return;
+ const freq = await getThaiFreq(word);
+ const wordId = pdb.addWord(word, "th", freq, null);
+ const analyzed = await analyzeTHWord(word);
+ // console.log(analyzed);
+ for (let snd of sounds) if ("ipa" in snd) handleIpa(wordId, j, snd, analyzed);
+}
+async function handleIpa(
+ wordId: number | bigint,
+ j: any,
+ snd: any,
+ analyzed: ThaiNLPRes,
+) {
+ const tags = JSON.stringify(snd.tags) || null;
+ // console.log("handleipa", analyzed.syllables.length);
+ // console.log(analyzed);
+ const wikiIpa = cleanIpa(snd.ipa);
+ const nlpIpa = cleanIpa(analyzed.ipa);
+ const ipa = wikiIpa || nlpIpa;
+ const wikiIpaSplit = wikiIpa.split(".");
+ const nlpIpaSplit = nlpIpa.split(".");
+ if (wikiIpaSplit.length !== nlpIpaSplit.length) {
+ console.log("ipa mismatch");
+ console.log(wikiIpa);
+ console.log(nlpIpa);
+ // return;
+ }
+ if (analyzed.syllables.length !== wikiIpaSplit.length) {
+ console.log("syllable analysis mismatch", j.word);
+ console.log({ syls: analyzed.syllables, ipa: wikiIpaSplit });
+ // console.dir(j, { depth: null });
+ return;
+ }
+ pdb.addPronunciation(wordId, ipa, analyzed.syllables.length, tags, null);
+
+ for (let i = 0; i < analyzed.syllables.length; i++) {
+ const spelling = analyzed.syllables[i]!;
+ const ipa = wikiIpaSplit[i]!;
+ try {
+ await handleSyllable(spelling, ipa, wordId, i);
+ } catch (e) {
+ console.error("syl error", j.word, j.sounds);
+ console.error({ spelling, ipa, wikiIpaSplit });
+ console.error(e);
+ }
+ }
+}
+const thaiTones: Record<string, string> = {
+ "˧": "mid",
+ "˨˩": "low",
+ "˥˩": "falling",
+ "˦˥": "high",
+ "˩˩˦": "rising",
+};
+const thaiToneNums: Record<string, number> = {
+ "˧": 33,
+ "˨˩": 21,
+ "˥˩": 41,
+ "˦˥": 45,
+ "˩˩˦": 214,
+};
+function parseTone(ipa: string, spelling: string): Tone {
+ try {
+ const name = thaiTones[ipa]!;
+ const numbers = thaiToneNums[ipa]!;
+ return { letters: ipa, name, numbers };
+ } catch (e) {
+ console.error("wrong tones!!", { s: spelling, ipa });
+    throw new Error(`unknown tone contour: ${ipa}`);
+ }
+}
+async function handleSyllable(
+ spelling: string,
+ ipa: string,
+ wordId: number | bigint,
+ idx: number,
+) {
+ const sorsyl = await sorSyl(spelling, "th", ipa);
+ if (sorsyl.syls.length !== 1) throw new Error("wtf sorsyl!");
+ const syl = sorsyl.syls[0]!;
+ const tone = syl.tone ? parseTone(syl.tone, spelling) : null;
+ try {
+ pdb.addSyllable(
+ wordId,
+ idx + 1,
+ "th",
+ syl.ipa,
+ syl.long,
+ spelling,
+ { spelling: syl.onset, ipa: syl.onset },
+ { spelling: syl.medial, ipa: syl.medial },
+ { spelling: syl.nucleus, ipa: syl.nucleus },
+ { spelling: syl.coda, ipa: syl.coda },
+ { spelling: syl.rhyme, ipa: syl.rhyme },
+ tone,
+ null,
+ );
+ } catch (e) {
+ // console.log("well fuck", syl);
+ // console.error(e);
+ console.log();
+ }
+}
+async function handleIdiom(idiom: string) {
+ pdb.addIdiom(idiom, "th");
+ // TODO later set idiom_words once all words are populated
+ // console.log();
+}
+// ช้า ๆ
+// งก ๆ
+// หงก ๆ
+
+async function getFrequency() {
+ const files = [
+ "/home/y/code/prosody/resources/langdata/thai/data/1yin_freq.csv",
+ "/home/y/code/prosody/resources/langdata/thai/data/2yin_freq.csv",
+ "/home/y/code/prosody/resources/langdata/thai/data/3yin_freq.csv",
+ "/home/y/code/prosody/resources/langdata/thai/data/4yin_freq.csv",
+ "/home/y/code/prosody/resources/langdata/thai/data/5yin_freq.csv",
+ "/home/y/code/prosody/resources/langdata/thai/data/6yin_freq.csv",
+ ];
+ const freqMap = new Map<number, string>();
+ for (const file of files) {
+ await handleFile(file, (line, idx) => {
+ const [spelling, IPA, tone, length, frequency, ...rest] = line.split(",");
+ freqMap.set(Number(frequency!), spelling!);
+ });
+ }
+ const orderedMap = new Map<string, number>();
+  const keys = Array.from(freqMap.keys()).sort((a, b) => a - b);
+ for (let i = 0; i < keys.length; i++) {
+ const val = freqMap.get(keys[i]!)!;
+ orderedMap.set(val, i + 1);
+ }
+ return orderedMap;
+}
+
+readDump("th");
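
A worked example of the tone handling above, using the falling contour; the numbers come directly from the thaiTones and thaiToneNums tables in this file, and the syllable is only an illustration:

    import type { Tone } from "../types/phonetics";

    // parseTone("˥˩", "ไม่") resolves the contour through both lookup tables:
    const falling: Tone = { letters: "˥˩", name: "falling", numbers: 41 };
    // handleSyllable passes this to addSyllable, which upserts
    // tones(ipa = "˥˩", name = "falling", nums = 41) and links it via tones_syllables.
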
diff --git a/src/lib/db/utils.ts b/src/lib/db/utils.ts
new file mode 100644
index 0000000..1ac577f
--- /dev/null
+++ b/src/lib/db/utils.ts
@@ -0,0 +1,29 @@
+export async function handleFile(
+ filename: string,
+ func: (line: string, idx: number) => void,
+) {
+ const file = Bun.file(filename);
+ const s = file.stream();
+ const reader = s.getReader();
+ const decoder = new TextDecoder();
+ let leftover = "";
+ let lineCount = 0;
+ while (true) {
+ const { value, done } = await reader.read();
+ if (done) break;
+ const chunk = decoder.decode(value, { stream: true });
+ const lines = (leftover + chunk).split("\n");
+
+ // Process each line except the last (which might be incomplete)
+ for (const line of lines.slice(0, -1)) {
+ lineCount++;
+ func(line, lineCount);
+ }
+
+ // Save the last incomplete line to process in the next iteration
+ leftover = lines[lines.length - 1];
+ }
+
+ // Handle any remaining content after reading all chunks
+ if (leftover) func(leftover, lineCount + 1);
+}
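
handleFile streams the file and calls the callback once per complete line, so large CSV dumps never need to fit in memory. A minimal usage sketch (the path and columns are illustrative):

    import { handleFile } from "./utils";

    let rows = 0;
    await handleFile("/tmp/sample_freq.csv", (line, idx) => {
      const [spelling, ipa] = line.split(",");
      rows++;
      if (idx <= 3) console.log(idx, spelling, ipa);
    });
    console.log("total lines:", rows);
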
diff --git a/src/lib/types/phonetics.ts b/src/lib/types/phonetics.ts
new file mode 100644
index 0000000..0009e78
--- /dev/null
+++ b/src/lib/types/phonetics.ts
@@ -0,0 +1,22 @@
+export type Tone = {
+ letters: string;
+ numbers: number;
+ name: string;
+};
+
+export type Phoneme = {
+ ipa: string;
+ spelling: string;
+};
+export type Syllable = {
+ stressed: boolean;
+ long: boolean;
+ spelling: string;
+ ipa: string;
+ nucleus: Phoneme;
+ onset: Phoneme;
+ medial: Phoneme;
+ coda: Phoneme;
+ rhyme: Phoneme;
+ tone: Tone;
+};
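
For orientation, one hand-written literal of the new Syllable shape; the orthographic spellings of the individual phonemes are rough and only meant to show the Phoneme ipa/spelling pairing:

    import type { Syllable, Tone } from "@/lib/types/phonetics";

    const tone: Tone = { letters: "˥˩", name: "falling", numbers: 41 };
    const mai: Syllable = {
      stressed: true,
      long: false,
      spelling: "ไม่",
      ipa: "mâj",
      onset: { ipa: "m", spelling: "ม" },
      medial: { ipa: "", spelling: "" },
      nucleus: { ipa: "a", spelling: "ไ" },
      coda: { ipa: "j", spelling: "" },
      rhyme: { ipa: "aj", spelling: "ไ" },
      tone,
    };
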
diff --git a/src/lib/utils.ts b/src/lib/utils.ts
index 9bc74b8..0674dea 100644
--- a/src/lib/utils.ts
+++ b/src/lib/utils.ts
@@ -57,3 +57,9 @@ export function getRandomHexColor() {
// Ensure the color code is always 6 digits by padding with zeros if needed
return "#" + randomColor.padStart(6, "0");
}
+
+export function cleanIpa(ipa: string): string {
+ const r1 = /\.\//;
+ const r2 = /[\[\]\/]/g;
+ return ipa.replace(r1, "").replace(r2, "");
+}
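
A quick check on what cleanIpa strips, with illustrative Wiktionary-style transcriptions:

    import { cleanIpa } from "@/lib/utils";

    console.log(cleanIpa("/māː/"));       // "māː"        — slashes removed by the global character class
    console.log(cleanIpa("[wan˧.jaŋ˧]")); // "wan˧.jaŋ˧"  — brackets removed, syllable dots kept
    console.log(cleanIpa("/wan./"));      // "wan"        — the first regex also drops a stray "./" pair
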