all me here should merge

author: polwex <polwex@sortug.com> 2025-06-02 23:05:36 +0700
committer: polwex <polwex@sortug.com> 2025-06-02 23:05:36 +0700
commit: 904b34de8f7748b7954d88784369b9cae6fa92fb (patch)
tree: 53bb5cb3377ae40d8bfa44087a0c712edd6c9d02
parent: a03c92dc82ad527d7da6bbaa3c43000e2e5f0e69 (diff)
10 files changed, 576 insertions, 176 deletions
diff --git a/NOTES.md b/NOTES.md
new file mode 100644
index 0000000..c853835
--- /dev/null
+++ b/NOTES.md
@@ -0,0 +1,4 @@
+some weirdness:
+
+วันพฤหัสบดี
+วันพฤหัส
diff --git a/src/lib/calls/nlp.ts b/src/lib/calls/nlp.ts
index 24e7cf3..3cff415 100644
--- a/src/lib/calls/nlp.ts
+++ b/src/lib/calls/nlp.ts
@@ -1,13 +1,35 @@
 import { SyllableRes } from "../types/cards";
 
-type AnalyzeRes = {
+export type ThaiNLPRes = {
   word: string;
+  normalized: string;
   syllables: string[];
+  syllablesIpa: string[];
   ipa: string;
   pos: string;
 };
+export type SorSylRes = {
+  word: string;
+  ipa: string;
+  clean_ipa: string;
+  syls: SorSyl[];
+};
+export type SorSyl = {
+  stressed: boolean;
+  long: boolean;
+  spelling: string;
+  ipa: string;
+  nucleus: string;
+  onset: string;
+  medial: string;
+  coda: string;
+  rhyme: string;
+  tone: string;
+  start_idx: number;
+  end_idx: number;
+};
 
-export async function thaiData(word: string): Promise<AnalyzeRes[]> {
+export async function thaiData(word: string): Promise<ThaiNLPRes[]> {
   const [head, tail] = await Promise.all([
     analyzeTHWord(word),
     segmentateThai(word),
@@ -15,7 +37,7 @@ export async function thaiData(word: string): Promise<AnalyzeRes[]> {
   return [head, ...tail];
 }
 
-export async function analyzeTHWord(word: string): Promise<AnalyzeRes> {
+export async function analyzeTHWord(word: string): Promise<ThaiNLPRes> {
   const opts = {
     method: "POST",
     headers: { "Content-type": "application/json" },
@@ -26,7 +48,7 @@ export async function analyzeTHWord(word: string): Promise<AnalyzeRes> {
   const jj = await r1.json();
   return jj;
 }
-export async function segmentateThai(sentence: string): Promise<AnalyzeRes[]> {
+export async function segmentateThai(sentence: string): Promise<ThaiNLPRes[]> {
   const opts = {
     method: "POST",
     headers: { "Content-type": "application/json" },
@@ -37,6 +59,70 @@ export async function segmentateThai(sentence: string): Promise<AnalyzeRes[]> {
   const jj = await r2.json();
   return jj;
 }
+export async function getThaiFreq(word: string): Promise<number> {
+  const opts = {
+    method: "POST",
+    headers: { "Content-type": "application/json" },
+    body: JSON.stringify({ word }),
+  };
+  // POSTs { word } as JSON to the /freq endpoint on localhost:8001.
+  const r2 = await fetch("http://localhost:8001" + `/freq`, opts);
+  const jj = await r2.json(); // parsed body is returned as-is; neither status nor shape is checked
+  return jj;
+}
+export async function getThaiNext(word: string): Promise<string[]> {
+  const opts = {
+    method: "POST",
+    headers: { "Content-type": "application/json" },
+    body: JSON.stringify({ word }),
+  };
+  // POSTs { word } as JSON to the /next endpoint on localhost:8001.
+  const r2 = await fetch("http://localhost:8001" + `/next`, opts);
+  const jj = await r2.json(); // parsed body is returned as-is; neither status nor shape is checked
+  return jj;
+}
+
+export async function getThaiPrev(word: string): Promise<string[]> {
+  const opts = {
+    method: "POST",
+    headers: { "Content-type": "application/json" },
+    body: JSON.stringify({ word }),
+  };
+  // POSTs { word } as JSON to the /prev endpoint on localhost:8001.
+  const r2 = await fetch("http://localhost:8001" + `/prev`, opts);
+  const jj = await r2.json(); // parsed body is returned as-is; neither status nor shape is checked
+  return jj;
+}
+
+export async function getThaiNext_bi(
+  word1: string,
+  word2: string,
+): Promise<string[]> {
+  const opts = {
+    method: "POST",
+    headers: { "Content-type": "application/json" },
+    body: JSON.stringify({ word1, word2 }),
+  };
+  // POSTs { word1, word2 } as JSON to the /next_bi endpoint on localhost:8001.
+  const r2 = await fetch("http://localhost:8001" + `/next_bi`, opts);
+  const jj = await r2.json(); // parsed body is returned as-is; neither status nor shape is checked
+  return jj;
+}
+
+export async function getThaiPrev_bi(
+  word1: string,
+  word2: string,
+): Promise<string[]> {
+  const opts = {
+    method: "POST",
+    headers: { "Content-type": "application/json" },
+    body: JSON.stringify({ word1, word2 }),
+  };
+  // POSTs { word1, word2 } as JSON to the /prev_bi endpoint on localhost:8001.
+  const r2 = await fetch("http://localhost:8001" + `/prev_bi`, opts);
+  const jj = await r2.json(); // parsed body is returned as-is; neither status nor shape is checked
+  return jj;
+}
 
 export async function deconstructSyllable(ipa: string): Promise<SyllableRes> {
   const opts = {
@@ -52,6 +138,24 @@ export async function deconstructSyllable(ipa: string): Promise<SyllableRes> {
   const jj = await r2.json();
   return jj;
 }
+export async function sorSyl(
+  word: string,
+  lang_code: string,
+  ipa: string,
+): Promise<SorSylRes> {
+  const opts = {
+    method: "POST",
+    headers: {
+      "Content-type": "application/json",
+      "X-API-KEY": Bun.env.SORTUG_NLP_API_KEY!, // non-null assertion: nothing here checks that the variable is set
+    },
+    body: JSON.stringify({ string: word, lang: lang_code, ipa }),
+  };
+  // POSTs { string, lang, ipa } as JSON to the /syls endpoint on localhost:8104.
+  const r2 = await fetch("http://localhost:8104" + `/syls`, opts);
+  const jj = await r2.json(); // parsed body is returned as-is; neither status nor shape is checked
+  return jj;
+}
 
 export async function findLemma(word: string, lang: string) {
   const opts = {
diff --git a/src/lib/db/perf.ts b/src/lib/db/perf.ts
index a5b57c3..d805314 100644
--- a/src/lib/db/perf.ts
+++ b/src/lib/db/perf.ts
@@ -1,4 +1,47 @@
 /**
+ * Database Performance Optimizations Documentation
+ * ===============================================
+ * 
+ * 1. SRS Card Fetching Optimization
+ * ---------------------------------
+ * Problem: When processing card reviews in the SRS system, the application was fetching an entire
+ * lesson's worth of cards just to retrieve a single updated card. This was inefficient, especially
+ * for lessons with many cards.
+ * 
+ * Solution: Implemented a dedicated `fetchCardById` method in DatabaseHandler that retrieves only
+ * the specific card needed with all its associated data (expression, progress, etc.). This method
+ * is used in SRSStudyService.processReview to efficiently fetch just the updated card after a review.
+ * 
+ * Impact:
+ * - Reduced database query load by eliminating unnecessary card fetches
+ * - Fixed the "Failed to fetch updated card data" error that occurred when processing reviews
+ * - Made card reviews more reliable and efficient
+ * 
+ * Implementation details:
+ * 1. Added fetchCardById method to DatabaseHandler class
+ * 2. Updated SRSStudyService.processReview to use fetchCardById instead of fetchLesson
+ * 3. Maintained consistent timing measurements for performance monitoring
+ * 
+ * 2. SQLite Optimization Techniques
+ * --------------------------------
+ * - WAL (Write-Ahead Logging) mode enabled for better concurrency
+ * - Increased cache size to 8MB for improved read performance
+ * - Temp tables stored in memory rather than disk
+ * - Reduced synchronous mode to NORMAL for better write performance
+ * - Added strategic indexes on frequently queried columns
+ * 
+ * 3. JSON Processing Optimization
+ * ------------------------------
+ * - Measured and isolated JSON processing time from query execution time
+ * - Confirmed that database queries (~329ms) were the primary bottleneck rather than 
+ *   JSON processing (~0.8ms)
+ * 
+ * 4. Query-Level Optimizations
+ * ---------------------------
+ * - Used proper indexing for user_progress, expressions, and cards_lessons tables
+ * - Optimized JOIN conditions to ensure efficient execution plans
+ * - Used parameterized queries to take advantage of SQLite's query cache
+ *
  * Database performance optimization suggestions
  */
 
diff --git a/src/lib/db/prosodydb.ts b/src/lib/db/prosodydb.ts
index 52312bd..ec95359 100644
--- a/src/lib/db/prosodydb.ts
+++ b/src/lib/db/prosodydb.ts
@@ -1,11 +1,12 @@
 import Database from "bun:sqlite";
+import { Phoneme, Tone } from "../types/phonetics";
 type Str = string | null;
 type ItemType = "word" | "syllable" | "idiom";
 
 class DatabaseHandler {
   db: Database;
   constructor() {
-    const dbPath = "/home/y/code/bun/ssr/waku/bulkdata/prosodynew.db";
+    const dbPath = "/home/y/code/bun/ssr/waku/bulkdata/phon.db";
     const db = new Database(dbPath, { create: true });
     db.exec("PRAGMA journal_mode = WAL"); // Enable Write-Ahead Logging for better performance
     db.exec("PRAGMA foreign_keys = ON");
@@ -31,48 +32,39 @@ class DatabaseHandler {
       .run(code, name);
   }
   addPronunciation(
-    type: ItemType,
-    parentId: number | bigint,
+    wordId: number | bigint,
     ipa: string,
     syllables: number,
     tags: Str,
     notes: Str,
   ) {
-    try {
-      const query = this.db
-        .query(
-          `INSERT INTO pronunciation(type, parent_id,ipa, syllables, tag, notes) VALUES(?, ?, ?, ?, ?, ?)`,
-        )
-        .run(type, parentId, ipa, syllables, tags, notes);
-    } catch (e) {
-      // console.error(e);
-    }
+    const query = this.db
+      .query(
+        `INSERT OR IGNORE INTO word_phonetics(word_id,ipa, syllables, tag, notes) VALUES(?, ?, ?, ?, ?)`,
+      )
+      .run(wordId, ipa, syllables, tags, notes);
   }
   addWordRhyme(wordId: number | bigint, ipa: string, lang: string, notes: Str) {
-    try {
-      const query = this.db
-        .query(
-          `INSERT INTO word_rhymes(text, lang, notes) VALUES(?, ?, ?)
+    const query = this.db
+      .query(
+        `INSERT INTO word_rhymes(text, lang, notes) VALUES(?, ?, ?)
                  ON CONFLICT(text,lang) DO UPDATE SET
                  text = excluded.text
                  RETURNING rowid
           `,
-        )
-        .get(ipa, lang, notes) as { id: number };
-      const query2 = this.db
-        .query(
-          `
-            INSERT INTO words_idioms(word_id, idiom_id) VALUES(?, ?)
+      )
+      .get(ipa, lang, notes) as { id: number };
+    const query2 = this.db
+      .query(
+        `
+            INSERT INTO words_wrhymes(word_id, wrhyme_id) VALUES(?, ?)
           `,
-        )
-        .run(wordId, query.id);
-    } catch (e) {
-      // console.error(e);
-    }
+      )
+      .run(wordId, query.id);
   }
   addIdiom(spelling: string, lang: string) {
     const query = this.db.query(
-      `INSERT INTO idioms(spelling, lang) VALUES(?, ?)`,
+      `INSERT OR IGNORE INTO idioms(spelling, lang) VALUES(?, ?)`,
     );
     const res = query.run(spelling, lang);
     return res;
@@ -100,49 +92,72 @@ class DatabaseHandler {
       this.findIdiomWords(row.spelling, row.id);
     }
   }
-  addWord(spelling: string, lang: string) {
+  addWord(
+    spelling: string,
+    lang: string,
+    frequency: number | null,
+    notes: Str,
+  ) {
     const query = this.db.query(
-      // `INSERT OR IGNORE INTO words(spelling, lang) VALUES(?, ?)`,
-      `INSERT INTO words(spelling, lang) VALUES(?, ?)`,
+      `INSERT OR IGNORE INTO words(spelling, lang, frequency, notes) VALUES(?, ?, ?, ?)`,
+      // `INSERT INTO words(spelling, lang) VALUES(?, ?)`,
     );
-    const res = query.run(spelling, lang);
+    const res = query.run(spelling, lang, frequency, notes);
     const wordId = res.lastInsertRowid;
     return wordId;
   }
   addSyllable(
     wordId: number | bigint,
-    text: string,
+    sylIdx: number,
     lang: string,
+    ipa: string,
     long: boolean,
-    onset: Str,
-    medial: Str,
-    nucleus: string,
-    coda: Str,
-    rhyme: string,
-    tone: Str,
+    text: string,
+    onset: Phoneme,
+    medial: Phoneme,
+    nucleus: Phoneme,
+    coda: Phoneme,
+    rhyme: Phoneme,
+    tone: Tone | null,
     notes: Str,
   ) {
     const tx = this.db.transaction(() => {
       const query = this.db.query(
-        `INSERT INTO syllables(text, lang, long, onset, medial, nucleus, coda, rhyme, tone, notes) VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
+        `INSERT INTO syllables(lang, ipa, long, text, onset, medial, nucleus, coda, rhyme, notes) VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
       );
+      // TODO need a dual structure here for IPA and orto
       const res = query.run(
-        text,
         lang,
+        ipa,
         long,
-        onset,
-        medial,
-        nucleus,
-        coda,
-        rhyme,
-        tone,
+        text,
+        onset.spelling,
+        medial.spelling,
+        nucleus.spelling,
+        coda.spelling,
+        rhyme.spelling,
         notes,
       );
       const sylId = res.lastInsertRowid;
-
+      const ipaq = this.db.query(`
+        INSERT INTO syl_ipa(syl_id, ipa, onset, medial, nucleus, coda, rhyme, notes)
+        VALUES(?, ?, ?, ?, ?, ?, ?, ?)`);
+      ipaq.run(
+        sylId,
+        ipa,
+        onset.ipa,
+        medial.ipa,
+        nucleus.ipa,
+        coda.ipa,
+        rhyme.ipa,
+        null,
+      );
+      //
       const res1 = this.db
-        .query(`INSERT INTO syllables_words(syl_id, word_id) VALUES(?, ?)`)
-        .run(sylId, wordId);
+        .query(
+          `INSERT INTO syllables_words(syl_id, word_id, idx) VALUES(?, ?, ?)`,
+        )
+        .run(sylId, wordId, sylIdx);
       //
       return sylId;
     });
@@ -151,13 +166,13 @@ class DatabaseHandler {
     if (onset) {
       res1 = this.db
         .query(
-          `INSERT INTO onsets(text, lang) VALUES(?, ?)
-           ON CONFLICT(text, lang) DO UPDATE SET
+          `INSERT INTO onsets(ipa, lang, text) VALUES(?, ?, ?)
+           ON CONFLICT(ipa, lang, text) DO UPDATE SET
            text = excluded.text
            RETURNING rowid
           `,
         )
-        .get(onset, lang);
+        .get(onset.ipa, lang, onset.spelling);
       this.db
         .query(`INSERT INTO onsets_syllables(syl_id, onset_id) VALUES(?, ?)`)
         .run(sylId, res1.id);
@@ -165,65 +180,65 @@ class DatabaseHandler {
     if (medial) {
       res1 = this.db
         .query(
-          `INSERT INTO medials(text, lang) VALUES(?, ?)
-           ON CONFLICT(text, lang) DO UPDATE SET
+          `INSERT INTO medials(ipa, lang, text) VALUES(?, ?, ?)
+           ON CONFLICT(ipa, lang, text) DO UPDATE SET
            text = excluded.text
            RETURNING rowid
           `,
         )
-        .get(medial, lang);
+        .get(medial.ipa, lang, medial.spelling);
       this.db
         .query(`INSERT INTO medials_syllables(syl_id, medial_id) VALUES(?, ?)`)
         .run(sylId, res1.id);
     }
     res1 = this.db
       .query(
-        `INSERT INTO nucleus(text, lang) VALUES(?, ?)
-         ON CONFLICT(text, lang) DO UPDATE SET
+        `INSERT INTO nucleus(ipa, lang, text) VALUES(?, ?, ?)
+         ON CONFLICT(ipa, lang, text) DO UPDATE SET
          text = excluded.text
          RETURNING rowid
         `,
       )
-      .get(nucleus, lang);
+      .get(nucleus.ipa, lang, nucleus.spelling);
     this.db
       .query(`INSERT INTO nucleus_syllables(syl_id, nucleus_id) VALUES(?, ?)`)
       .run(sylId, res1.id);
     if (coda) {
       res1 = this.db
         .query(
-          `INSERT INTO codas(text, lang) VALUES(?, ?)
-           ON CONFLICT(text, lang) DO UPDATE SET
+          `INSERT INTO codas(ipa, lang, text) VALUES(?, ?, ?)
+           ON CONFLICT(ipa, lang, text) DO UPDATE SET
            text = excluded.text
            RETURNING rowid
           `,
         )
-        .get(coda, lang);
+        .get(coda.ipa, lang, coda.spelling);
       this.db
         .query(`INSERT INTO codas_syllables(syl_id, coda_id) VALUES(?, ?)`)
         .run(sylId, res1.id);
     }
     res1 = this.db
       .query(
-        `INSERT INTO rhymes(text, lang) VALUES(?, ?)
-         ON CONFLICT(text, lang) DO UPDATE SET
+        `INSERT INTO rhymes(ipa, lang, text) VALUES(?, ?, ?)
+         ON CONFLICT(ipa, lang, text) DO UPDATE SET
          text = excluded.text
          RETURNING rowid
         `,
       )
-      .get(rhyme, lang);
+      .get(rhyme.ipa, lang, rhyme.spelling);
     this.db
       .query(`INSERT INTO rhymes_syllables(syl_id, rhyme_id) VALUES(?, ?)`)
       .run(sylId, res1.id);
     if (tone) {
       res1 = this.db
         .query(
-          `INSERT INTO tones(text, lang) VALUES(?, ?)
-           ON CONFLICT(text, lang) DO UPDATE SET
-           text = excluded.text
+          `INSERT INTO tones(ipa, lang, name, nums) VALUES(?, ?, ?, ?)
+           ON CONFLICT(ipa, lang) DO UPDATE SET
+           ipa = excluded.ipa
            RETURNING rowid
           `,
         )
-        .get(tone, lang);
+        .get(tone.letters, lang, tone.name, tone.numbers);
       this.db
         .query(`INSERT INTO tones_syllables(syl_id, tone_id) VALUES(?, ?)`)
         .run(sylId, res1.id);
diff --git a/src/lib/db/prosodyschema.sql b/src/lib/db/prosodyschema.sql
index e70b005..09dabc2 100644
--- a/src/lib/db/prosodyschema.sql
+++ b/src/lib/db/prosodyschema.sql
@@ -35,6 +35,7 @@ CREATE TABLE IF NOT EXISTS words(
     spelling TEXT NOT NULL,
     lang TEXT NOT NULL,
     frequency INTEGER,
+    notes TEXT,
     FOREIGN KEY (lang) REFERENCES languages(iso6392),
     CONSTRAINT spell_unique UNIQUE (spelling, lang)
 );
@@ -48,7 +49,7 @@ CREATE TABLE IF NOT EXISTS word_rhymes(
     notes TEXT,
     CONSTRAINT wrhyme_unique UNIQUE (text, lang)
 );
-CREATE TABLE IF NOT EXISTS words_rhymes(
+CREATE TABLE IF NOT EXISTS words_wrhymes(
     word_id INTEGER NOT NULL,
     wrhyme_id INTEGER NOT NULL,
     FOREIGN KEY (word_id) REFERENCES words(id),
@@ -58,57 +59,62 @@ CREATE TABLE IF NOT EXISTS words_rhymes(
 -- break up syllables
 CREATE TABLE IF NOT EXISTS syllables(
     id INTEGER PRIMARY KEY AUTOINCREMENT,
-    text TEXT NOT NULL,
     lang TEXT NOT NULL,
+    ipa TEXT NOT NULL,
     long INTEGER NOT NULL,
-    tone TEXT,
-    onset TEXT,
-    medial TEXT,
-    nucleus TEXT,
-    coda TEXT,
-    rhyme TEXT,
+    text TEXT NOT NULL,
+    onset TEXT NOT NULL,
+    medial TEXT NOT NULL,
+    nucleus TEXT NOT NULL,
+    coda TEXT NOT NULL,
+    rhyme TEXT NOT NULL,
     notes TEXT,
     FOREIGN KEY (lang) REFERENCES languages(iso6392),
-    CONSTRAINT spell_unique UNIQUE (text, lang)
+    CONSTRAINT syllable_unique UNIQUE (text, ipa, lang)
 );
 
 CREATE TABLE IF NOT EXISTS tones(
     id INTEGER PRIMARY KEY AUTOINCREMENT,
-    text TEXT NOT NULL,
+    ipa TEXT NOT NULL,
     lang TEXT NOT NULL,
-    name TEXT,
-    num INTEGER,
-    CONSTRAINT tone_unique UNIQUE (text, lang)
+    name TEXT NOT NULL,
+    nums INTEGER NOT NULL,
+    CONSTRAINT tone_unique UNIQUE (ipa, lang)
 );
 CREATE TABLE IF NOT EXISTS onsets(
     id INTEGER PRIMARY KEY AUTOINCREMENT,
+    ipa  TEXT NOT NULL,
     text TEXT NOT NULL,
     lang TEXT NOT NULL,
-    CONSTRAINT onsets_unique UNIQUE (text, lang)
+    CONSTRAINT onsets_unique UNIQUE (ipa, text, lang)
 );
 CREATE TABLE IF NOT EXISTS medials(
     id INTEGER PRIMARY KEY AUTOINCREMENT,
+    ipa  TEXT NOT NULL,
     text TEXT NOT NULL,
     lang TEXT NOT NULL,
-    CONSTRAINT medials_unique UNIQUE (text, lang)
+    CONSTRAINT medials_unique UNIQUE (ipa, text, lang) -- one row per (ipa, text, lang)
 );
 CREATE TABLE IF NOT EXISTS nucleus(
     id INTEGER PRIMARY KEY AUTOINCREMENT,
+    ipa  TEXT NOT NULL,
     text TEXT NOT NULL,
     lang TEXT NOT NULL,
-    CONSTRAINT nucleus_unique UNIQUE (text, lang)
+    CONSTRAINT nucleus_unique UNIQUE (ipa, text, lang) -- one row per (ipa, text, lang)
 );
 CREATE TABLE IF NOT EXISTS codas(
     id INTEGER PRIMARY KEY AUTOINCREMENT,
+    ipa  TEXT NOT NULL,
     text TEXT NOT NULL,
     lang TEXT NOT NULL,
-    CONSTRAINT coda_unique UNIQUE (text, lang)
+    CONSTRAINT coda_unique UNIQUE (ipa, text, lang) -- one row per (ipa, text, lang)
 );
 CREATE TABLE IF NOT EXISTS rhymes(
     id INTEGER PRIMARY KEY AUTOINCREMENT,
+    ipa  TEXT NOT NULL,
     text TEXT NOT NULL,
     lang TEXT NOT NULL,
-    CONSTRAINT rhyme_unique UNIQUE (text, lang)
+    CONSTRAINT rhyme_unique UNIQUE (ipa, text, lang) -- one row per (ipa, text, lang)
 );
 
 -- join tables
@@ -153,9 +159,12 @@ CREATE TABLE IF NOT EXISTS rhymes_syllables(
 CREATE TABLE IF NOT EXISTS syllables_words(
     syl_id INTEGER NOT NULL,
     word_id INTEGER NOT NULL,
+    idx INTEGER NOT NULL,
     FOREIGN KEY (syl_id) REFERENCES syllables(id),
     FOREIGN KEY (word_id) REFERENCES words(id)
 );
+
+
 CREATE TABLE IF NOT EXISTS words_idioms(
     word_id INTEGER NOT NULL,
     idiom_id INTEGER NOT NULL,
@@ -165,14 +174,26 @@ CREATE TABLE IF NOT EXISTS words_idioms(
 
 
 -- 
-CREATE TABLE IF NOT EXISTS pronunciation(
+CREATE TABLE IF NOT EXISTS syl_ipa(
     id INTEGER PRIMARY KEY AUTOINCREMENT,
-    type TEXT CHECK(type IN ('word', 'syllable', 'idiom')) NOT NULL,
-    parent_id INTEGER NOT NULL,
+    syl_id INTEGER NOT NULL,
+    ipa TEXT NOT NULL,
+    onset TEXT NOT NULL,
+    medial TEXT NOT NULL,
+    nucleus TEXT NOT NULL,
+    rhyme TEXT NOT NULL,
+    coda TEXT NOT NULL,
+    notes TEXT,
+    CONSTRAINT syl_ipa_unique UNIQUE (ipa, syl_id)
+);
+
+CREATE TABLE IF NOT EXISTS word_phonetics(
+    id INTEGER PRIMARY KEY AUTOINCREMENT,
+    word_id INTEGER NOT NULL,
     ipa TEXT NOT NULL,
     syllables INTEGER NOT NULL,
     tag TEXT,
     notes TEXT,
-    CONSTRAINT ipa_unique UNIQUE (ipa, parent_id)
+    CONSTRAINT ipa_unique UNIQUE (ipa, word_id)
 );
-CREATE INDEX IF NOT EXISTS idx_words_ipa ON pronunciation(ipa, parent_id);
+CREATE INDEX IF NOT EXISTS idx_words_ipa ON word_phonetics(ipa, word_id);
diff --git a/src/lib/db/seed.ts b/src/lib/db/seed.ts
index 4780dc3..c03da60 100644
--- a/src/lib/db/seed.ts
+++ b/src/lib/db/seed.ts
@@ -1,3 +1,4 @@
+import Database from "bun:sqlite";
 import { readWiktionaryDump } from "../services/wiki";
 import { getStressedSyllable, getSyllableCount } from "../utils";
 import useful from "@/lib/useful_thai.json";
@@ -7,36 +8,6 @@ import { findLemma } from "../calls/nlp";
 
 const SYMBOL_REGEX = new RegExp(/[\W\d]/);
 
-async function handleFile(
-  filename: string,
-  func: (line: string, idx: number) => void,
-) {
-  const file = Bun.file(filename);
-  const s = file.stream();
-  const reader = s.getReader();
-  const decoder = new TextDecoder();
-  let leftover = "";
-  let lineCount = 0;
-  while (true) {
-    const { value, done } = await reader.read();
-    if (done) break;
-    const chunk = decoder.decode(value, { stream: true });
-    const lines = (leftover + chunk).split("\n");
-
-    // Process each line except the last (which might be incomplete)
-    for (const line of lines.slice(0, -1)) {
-      lineCount++;
-      func(line, lineCount);
-    }
-
-    // Save the last incomplete line to process in the next iteration
-    leftover = lines[lines.length - 1];
-  }
-
-  // Handle any remaining content after reading all chunks
-  if (leftover) func(leftover, lineCount + 1);
-}
-
 function goodPos(pos: string): boolean {
   const list = [
     "CC",
@@ -90,12 +61,12 @@ async function englishFreq() {
 }
 async function thaiFreq() {
   const files = [
-    "/home/y/code/prosody/prosody/langdata/thai/data/1yin_freq.csv",
-    "/home/y/code/prosody/prosody/langdata/thai/data/2yin_freq.csv",
-    "/home/y/code/prosody/prosody/langdata/thai/data/3yin_freq.csv",
-    "/home/y/code/prosody/prosody/langdata/thai/data/4yin_freq.csv",
-    "/home/y/code/prosody/prosody/langdata/thai/data/5yin_freq.csv",
-    "/home/y/code/prosody/prosody/langdata/thai/data/6yin_freq.csv",
+    "/home/y/code/prosody/resources/langdata/thai/data/1yin_freq.csv",
+    "/home/y/code/prosody/resources/langdata/thai/data/2yin_freq.csv",
+    "/home/y/code/prosody/resources/langdata/thai/data/3yin_freq.csv",
+    "/home/y/code/prosody/resources/langdata/thai/data/4yin_freq.csv",
+    "/home/y/code/prosody/resources/langdata/thai/data/5yin_freq.csv",
+    "/home/y/code/prosody/resources/langdata/thai/data/6yin_freq.csv",
   ];
   for (let f of files) {
     handleFile(f, (line, idx) => {
@@ -508,52 +479,51 @@ function fixSyllables() {
 //
 const SORSYL_PATH =
   "/nix/store/lkyi9rrjbr619w3ivpkm89ccf93bvxx5-sorsyl-0.1.0/bin/sorsyl";
-async function redump() {
-  await pdb.init();
-  let count = 0;
 
-  // const soundTypes = new Set<string>();
-  // [
-  //   "tags", "ipa", "audio", "ogg_url", "mp3_url", "enpr", "rhymes", "homophone", "note", "zh-pron", "other",
-  //   "text", "hangeul", "topics", "form", "audio-ipa"
-  // ]
-  const langs = ["en", "th", "zh", "es", "ja", "vn"];
-  for await (const line of readWiktionaryDump()) {
-    try {
-      count++;
-      console.log({ count });
-      // if (count > 50) break;
-      const j = JSON.parse(line);
-      // console.log(Object.keys(j), j.word);
-      // add language to db
-      pdb.addLanguage(j.lang_code, j.lang);
-      if (!langs.includes(j.lang_code)) continue;
-      // handleEtim(j);
-      // handleDerived(j);
-      // handleSenses(j.pos, j.senses);
-      // //
-      const isWord = j.word.trim().split(" ").length === 1;
-      if (isWord) await handleWord(j);
-      else await handleIdiom(j);
-    } catch (e) {
-      // console.log("error parsing", e);
-      // break;
-    }
+async function redump(lang: string) {
+  let count = 0; // number of rows printed so far
+  const langdb = new Database(
+    `/home/y/code/prosody/resources/wiktionary/${lang}.db`,
+  );
+  const langrows: any = langdb.query("SELECT data FROM langs");
+  for (const langrow of langrows) {
+    const j = JSON.parse(langrow.data);
+    console.log({ j });
+    if (++count > 10) break; // debug cap: the counter must advance or the loop prints every row
+  }
+  // await pdb.init();
+
+  // // const soundTypes = new Set<string>();
+  // // [
+  // //   "tags", "ipa", "audio", "ogg_url", "mp3_url", "enpr", "rhymes", "homophone", "note", "zh-pron", "other",
+  // //   "text", "hangeul", "topics", "form", "audio-ipa"
+  // // ]
+  // const langs = ["en", "th", "zh", "es", "ja", "vn"];
+
+  // for await (const line of readWiktionaryDump()) {
+  //   try {
+  //     count++;
+  //     console.log({ count });
+  //     // if (count > 50) break;
+  //     const j = JSON.parse(line);
+  //     // console.log(Object.keys(j), j.word);
+  //     // add language to db
+  //     pdb.addLanguage(j.lang_code, j.lang);
+  //     if (!langs.includes(j.lang_code)) continue;
+  //     // handleEtim(j);
+  //     // handleDerived(j);
+  //     // handleSenses(j.pos, j.senses);
+  //     // //
+  //     const isWord = j.word.trim().split(" ").length === 1;
+  //     if (isWord) await handleWord(j);
+  //     else await handleIdiom(j);
+  //   } catch (e) {
+  //     // console.log("error parsing", e);
+  //     // break;
+  //   }
+  // }
 }
 
-type SorSyl = {
-  stressed: boolean;
-  long: boolean;
-  spelling: string;
-  ipa: string;
-  nucleus: string;
-  onset: string;
-  medial: string;
-  coda: string;
-  rhyme: string;
-  tone: string;
-};
 async function handleWord(j: any) {
   let ts = Date.now();
   const analyzed = await findLemma(j.word, j.lang_code);
@@ -615,9 +585,11 @@ async function handleIpa(
       // TODO ideally syllables would have spelling not IPA... harsh tho
       pdb.addSyllable(
         wordId,
-        syl.ipa,
+        idx,
         j.lang_code,
+        syl.ipa,
         syl.long,
+        "",
         syl.onset || null,
         syl.medial || null,
         syl.nucleus,
@@ -689,7 +661,7 @@ async function handleSenses(pos: string, senses: any[]) {
   }
 }
 
-redump();
+redump("th");
 
 async function newtest() {
   // const query = pdb.db.query(
diff --git a/src/lib/db/thaiseed.ts b/src/lib/db/thaiseed.ts
new file mode 100644
index 0000000..687f0f3
--- /dev/null
+++ b/src/lib/db/thaiseed.ts
@@ -0,0 +1,184 @@
+import Database from "bun:sqlite";
+import {
+  analyzeTHWord,
+  deconstructSyllable,
+  segmentateThai,
+  type SorSyl,
+  type ThaiNLPRes,
+  sorSyl,
+  getThaiFreq,
+} from "../calls/nlp";
+import pdb from "./prosodydb";
+import { cleanIpa } from "../utils";
+import { handleFile } from "./utils";
+import { Tone } from "../types/phonetics";
+
+async function readDump(lang: string) {
+  await pdb.init();
+  pdb.addLanguage("th", "thai");
+  let count = 0;
+  const langdb = new Database(
+    `/home/y/code/prosody/resources/wiktionary/${lang}.db`,
+  );
+  let langrows: any = langdb.query("SELECT data FROM langs");
+  // langrows = langrows.slice(10);
+  for (const langrow of langrows) {
+    count++;
+    console.log(count);
+    // if (count <= 10000) continue;
+    // if (count > 30) break;
+    const j = JSON.parse(langrow.data);
+    const word = j.word.trim();
+    if (!word) continue;
+    if (word.includes("ๆ")) await handleWord(word, j);
+    else {
+      const split = word.split(" ");
+      if (split.length > 1) await handleIdiom(word);
+      else await handleWord(word, j);
+    }
+  }
+}
+
+async function handleWord(word: string, j: any) {
+  // Stores `word` and each of its IPA pronunciations; entries without an IPA sound are skipped.
+  // TODO add categories but add a tag to see what classifying scheme we're using
+  const sounds = j.sounds || [];
+  const hasIpa = sounds.find((s: any) => "ipa" in s);
+  if (!hasIpa) return;
+  const freq = await getThaiFreq(word);
+  const wordId = pdb.addWord(word, "th", freq, null);
+  const analyzed = await analyzeTHWord(word);
+  // Await each pronunciation so its async syllable work finishes (and any rejection surfaces) before returning.
+  for (const snd of sounds) if ("ipa" in snd) await handleIpa(wordId, j, snd, analyzed);
+}
+async function handleIpa(
+  wordId: number | bigint,
+  j: any,
+  snd: any,
+  analyzed: ThaiNLPRes,
+) {
+  const tags = JSON.stringify(snd.tags) || null;
+  // console.log("handleipa", analyzed.syllables.length);
+  // console.log(analyzed);
+  const wikiIpa = cleanIpa(snd.ipa);
+  const nlpIpa = cleanIpa(analyzed.ipa);
+  const ipa = wikiIpa || nlpIpa;
+  const wikiIpaSplit = wikiIpa.split(".");
+  const nlpIpaSplit = nlpIpa.split(".");
+  if (wikiIpaSplit.length !== nlpIpaSplit.length) {
+    console.log("ipa mismatch");
+    console.log(wikiIpa);
+    console.log(nlpIpa);
+    // return;
+  }
+  if (analyzed.syllables.length !== wikiIpaSplit.length) {
+    console.log("syllable analysis mismatch", j.word);
+    console.log({ syls: analyzed.syllables, ipa: wikiIpaSplit });
+    // console.dir(j, { depth: null });
+    return;
+  }
+  pdb.addPronunciation(wordId, ipa, analyzed.syllables.length, tags, null);
+
+  for (let i = 0; i < analyzed.syllables.length; i++) {
+    const spelling = analyzed.syllables[i]!;
+    const ipa = wikiIpaSplit[i]!;
+    try {
+      await handleSyllable(spelling, ipa, wordId, i);
+    } catch (e) {
+      console.error("syl error", j.word, j.sounds);
+      console.error({ spelling, ipa, wikiIpaSplit });
+      console.error(e);
+    }
+  }
+}
+const thaiTones: Record<string, string> = {
+  "˧": "mid",
+  "˨˩": "low",
+  "˥˩": "falling",
+  "˦˥": "high",
+  "˩˩˦": "rising",
+};
+const thaiToneNums: Record<string, number> = {
+  "˧": 33,
+  "˨˩": 21,
+  "˥˩": 41,
+  "˦˥": 45,
+  "˩˩˦": 214,
+};
+function parseTone(ipa: string, spelling: string): Tone {
+  // Maps IPA tone letters to a Tone; a Record lookup returns undefined (never throws), so check the miss explicitly.
+  const name = thaiTones[ipa];
+  const numbers = thaiToneNums[ipa];
+  if (name === undefined || numbers === undefined) {
+    console.error("wrong tones!!", { s: spelling, ipa });
+    throw new Error(`unknown Thai tone "${ipa}" for syllable "${spelling}"`);
+  }
+  return { letters: ipa, name, numbers };
+}
+async function handleSyllable(
+  spelling: string,
+  ipa: string,
+  wordId: number | bigint,
+  idx: number,
+) {
+  const sorsyl = await sorSyl(spelling, "th", ipa);
+  if (sorsyl.syls.length !== 1) throw new Error("wtf sorsyl!");
+  const syl = sorsyl.syls[0]!;
+  const tone = syl.tone ? parseTone(syl.tone, spelling) : null;
+  try {
+    pdb.addSyllable(
+      wordId,
+      idx + 1,
+      "th",
+      syl.ipa,
+      syl.long,
+      spelling,
+      { spelling: syl.onset, ipa: syl.onset },
+      { spelling: syl.medial, ipa: syl.medial },
+      { spelling: syl.nucleus, ipa: syl.nucleus },
+      { spelling: syl.coda, ipa: syl.coda },
+      { spelling: syl.rhyme, ipa: syl.rhyme },
+      tone,
+      null,
+    );
+  } catch (e) {
+    // console.log("well fuck", syl);
+    // console.error(e);
+    console.log();
+  }
+}
+async function handleIdiom(idiom: string) {
+  pdb.addIdiom(idiom, "th");
+  // TODO later set idiom_words once all words are populated
+  // console.log();
+}
+// ช้า ๆ
+// งก ๆ
+// หงก ๆ
+
+async function getFrequency() {
+  const files = [
+    "/home/y/code/prosody/resources/langdata/thai/data/1yin_freq.csv",
+    "/home/y/code/prosody/resources/langdata/thai/data/2yin_freq.csv",
+    "/home/y/code/prosody/resources/langdata/thai/data/3yin_freq.csv",
+    "/home/y/code/prosody/resources/langdata/thai/data/4yin_freq.csv",
+    "/home/y/code/prosody/resources/langdata/thai/data/5yin_freq.csv",
+    "/home/y/code/prosody/resources/langdata/thai/data/6yin_freq.csv",
+  ];
+  const freqMap = new Map<number, string>(); // frequency -> spelling; words sharing a frequency overwrite each other
+  for (const file of files) {
+    await handleFile(file, (line, idx) => {
+      const [spelling, IPA, tone, length, frequency, ...rest] = line.split(",");
+      if (spelling && Number.isFinite(Number(frequency))) freqMap.set(Number(frequency), spelling); // skip malformed rows
+    });
+  }
+  const orderedMap = new Map<string, number>();
+  const keys = Array.from(freqMap.keys()).sort((a, b) => a - b); // numeric sort; the default comparator compares as strings
+  for (let i = 0; i < keys.length; i++) {
+    const val = freqMap.get(keys[i]!)!;
+    orderedMap.set(val, i + 1); // rank 1 = lowest frequency value
+  }
+  return orderedMap; // spelling -> 1-based rank
+}
+
+readDump("th");
diff --git a/src/lib/db/utils.ts b/src/lib/db/utils.ts
new file mode 100644
index 0000000..1ac577f
--- /dev/null
+++ b/src/lib/db/utils.ts
@@ -0,0 +1,29 @@
+export async function handleFile(
+  filename: string,
+  func: (line: string, idx: number) => void,
+) {
+  const file = Bun.file(filename); // read as a stream; func is called once per "\n"-delimited line with a 1-based index
+  const s = file.stream();
+  const reader = s.getReader();
+  const decoder = new TextDecoder();
+  let leftover = "";
+  let lineCount = 0;
+  while (true) {
+    const { value, done } = await reader.read();
+    if (done) break;
+    const chunk = decoder.decode(value, { stream: true });
+    const lines = (leftover + chunk).split("\n");
+
+    // Process each line except the last (which might be incomplete)
+    for (const line of lines.slice(0, -1)) {
+      lineCount++;
+      func(line, lineCount);
+    }
+
+    // Save the last incomplete line to process in the next iteration
+    leftover = lines[lines.length - 1];
+  }
+  // Flush bytes the streaming decoder may still be buffering, then handle any remaining content
+  leftover += decoder.decode();
+  if (leftover) func(leftover, lineCount + 1);
+}
diff --git a/src/lib/types/phonetics.ts b/src/lib/types/phonetics.ts
new file mode 100644
index 0000000..0009e78
--- /dev/null
+++ b/src/lib/types/phonetics.ts
@@ -0,0 +1,22 @@
+export type Tone = {
+  letters: string;
+  numbers: number;
+  name: string;
+};
+
+export type Phoneme = {
+  ipa: string;
+  spelling: string;
+};
+export type Syllable = {
+  stressed: boolean;
+  long: boolean;
+  spelling: string;
+  ipa: string;
+  nucleus: Phoneme;
+  onset: Phoneme;
+  medial: Phoneme;
+  coda: Phoneme;
+  rhyme: Phoneme;
+  tone: Tone;
+};
diff --git a/src/lib/utils.ts b/src/lib/utils.ts
index 9bc74b8..0674dea 100644
--- a/src/lib/utils.ts
+++ b/src/lib/utils.ts
@@ -57,3 +57,9 @@ export function getRandomHexColor() {
   // Ensure the color code is always 6 digits by padding with zeros if needed
   return "#" + randomColor.padStart(6, "0");
 }
+
+export function cleanIpa(ipa: string): string {
+  const r1 = /\.\//;
+  const r2 = /[\[\]\/]/g;
+  return ipa.replace(r1, "").replace(r2, "");
+}
author	polwex <polwex@sortug.com>	2025-06-02 23:05:36 +0700
committer	polwex <polwex@sortug.com>	2025-06-02 23:05:36 +0700
commit	904b34de8f7748b7954d88784369b9cae6fa92fb (patch)
tree	53bb5cb3377ae40d8bfa44087a0c712edd6c9d02
parent	a03c92dc82ad527d7da6bbaa3c43000e2e5f0e69 (diff)