summaryrefslogtreecommitdiff
path: root/src/lib
diff options
context:
space:
mode:
authorpolwex <polwex@sortug.com>2025-05-29 12:10:22 +0700
committerpolwex <polwex@sortug.com>2025-05-29 12:10:22 +0700
commita3f24ea79b14394b24c4b60a010651eb29eeb872 (patch)
treecb1c4937084116f66a59727ee752afd974714c8e /src/lib
parent7abf2227438362ad30820ee236405ec1b57a40b6 (diff)
glorious new db
Diffstat (limited to 'src/lib')
-rw-r--r--src/lib/calls/nlp.ts54
-rw-r--r--src/lib/db/codes.js203
-rw-r--r--src/lib/db/index.ts8
-rw-r--r--src/lib/db/prosodydb.ts238
-rw-r--r--src/lib/db/prosodyschema.sql178
-rw-r--r--src/lib/db/seed.ts212
-rw-r--r--src/lib/db/senseschema.sql54
-rw-r--r--src/lib/lang/utils.ts28
-rw-r--r--src/lib/types/cards.ts13
9 files changed, 986 insertions, 2 deletions
diff --git a/src/lib/calls/nlp.ts b/src/lib/calls/nlp.ts
new file mode 100644
index 0000000..28562d0
--- /dev/null
+++ b/src/lib/calls/nlp.ts
@@ -0,0 +1,54 @@
+import { SyllableRes } from "../types/cards";
+
+type AnalyzeRes = {
+ word: string;
+ syllables: string[];
+ ipa: string;
+ pos: string;
+};
+
+export async function thaiData(word: string): Promise<AnalyzeRes[]> {
+ const [head, tail] = await Promise.all([
+ analyzeTHWord(word),
+ segmentateThai(word),
+ ]);
+ return [head, ...tail];
+}
+
+export async function analyzeTHWord(word: string): Promise<AnalyzeRes> {
+ const opts = {
+ method: "POST",
+ headers: { "Content-type": "application/json" },
+ body: JSON.stringify({ word }),
+ };
+ const r1 = await fetch("http://localhost:8001" + "/analyze", opts);
+ // const r2 = await fetch(`http://192.168.1.110:8000/analyze`, opts);
+ const jj = await r1.json();
+ return jj;
+}
+export async function segmentateThai(sentence: string): Promise<AnalyzeRes[]> {
+ const opts = {
+ method: "POST",
+ headers: { "Content-type": "application/json" },
+ body: JSON.stringify({ word: sentence }),
+ };
+ // const r1 = await fetch(`http://localhost:8000/segmentate`, opts);
+ const r2 = await fetch("http://localhost:8001" + `/segmentate`, opts);
+ const jj = await r2.json();
+ return jj;
+}
+
+export async function deconstructSyllable(ipa: string): Promise<SyllableRes> {
+ const opts = {
+ method: "POST",
+ headers: {
+ "Content-type": "application/json",
+ "X-API-KEY": Bun.env.SORTUG_NLP_API_KEY!,
+ },
+ body: JSON.stringify({ string: ipa }),
+ };
+ // const r1 = await fetch(`http://localhost:8000/segmentate`, opts);
+ const r2 = await fetch("http://localhost:8102" + `/lingpy`, opts);
+ const jj = await r2.json();
+ return jj;
+}
diff --git a/src/lib/db/codes.js b/src/lib/db/codes.js
new file mode 100644
index 0000000..bef3e1b
--- /dev/null
+++ b/src/lib/db/codes.js
@@ -0,0 +1,203 @@
+
+
// Accumulator for every code list fetched below; each entry is {code, name}.
const ALL_LANGUAGE_CODES = {
  iso639_1: [],
  iso639_2_T: [], // Terminology codes
  iso639_2_B: [], // Bibliographic codes
  iso639_3: [],
  iso639_5: [],
  bcp47_language_subtags: [], // Primary language subtags from IANA
};

// Source URLs, one per standard.
const LOC_ISO639_2_URL = 'https://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt';
// For ISO 639-3, SIL provides dated files. This is the structure of the comprehensive file.
// The actual filename changes with each update (e.g., iso-639-3_20240123.tab).
// You might need to go to https://iso639-3.sil.org/code_tables/download_tables and get the current link
// for the "Complete Code Tables" zip, then extract the main .tab file.
// For this script, I'll use a link to one specific (potentially older) version for demonstration.
// A more robust solution would involve downloading and unzipping the latest.
// This link points to the main table that includes mappings.
const SIL_ISO639_3_URL = 'https://iso639-3.sil.org/sites/iso639-3/files/downloads/iso-639-3_20240701.tab'; // Example: replace with current
const LOC_ISO639_5_URL = 'https://www.loc.gov/standards/iso639-5/iso639-5.tsv'; // TSV format
const IANA_LANGUAGE_SUBTAG_REGISTRY_URL = 'https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry';
+
/**
 * Download the Library of Congress ISO 639-2 list and populate the
 * iso639_1 / iso639_2_B / iso639_2_T arrays of ALL_LANGUAGE_CODES.
 * Line format: alpha3-b|alpha3-t|alpha2|english_name|french_name
 */
async function fetchAndParseISO639_1_2() {
  try {
    console.log('Fetching ISO 639-1 & 639-2 codes from LOC...');
    const response = await fetch(LOC_ISO639_2_URL);
    if (!response.ok) throw new Error(`Failed to fetch ISO 639-1/2: ${response.statusText}`);
    const body = await response.text();

    for (const row of body.trim().split('\n')) {
      const fields = row.split('|');
      if (fields.length < 4) continue;
      const alpha3_b = fields[0].trim(); // bibliographic code
      const alpha3_t = fields[1].trim(); // terminology code
      const alpha2 = fields[2].trim();   // two-letter code
      const englishName = fields[3].trim();

      if (alpha3_b) {
        ALL_LANGUAGE_CODES.iso639_2_B.push({ code: alpha3_b, name: englishName });
      }
      if (alpha3_t) {
        ALL_LANGUAGE_CODES.iso639_2_T.push({ code: alpha3_t, name: englishName });
      }
      if (alpha2) {
        ALL_LANGUAGE_CODES.iso639_1.push({ code: alpha2, name: englishName });
      }
    }
    console.log(`Fetched ${ALL_LANGUAGE_CODES.iso639_1.length} ISO 639-1 codes.`);
    console.log(`Fetched ${ALL_LANGUAGE_CODES.iso639_2_B.length} ISO 639-2/B codes.`);
    console.log(`Fetched ${ALL_LANGUAGE_CODES.iso639_2_T.length} ISO 639-2/T codes.`);
  } catch (error) {
    console.error('Error fetching ISO 639-1/2 codes:', error.message);
  }
}
+
/**
 * Download the SIL ISO 639-3 table (tab-separated) into
 * ALL_LANGUAGE_CODES.iso639_3. Also augments iso639_1 with Part1 codes
 * the LOC list may lack, then dedupes and sorts iso639_1 by code.
 */
async function fetchAndParseISO639_3() {
  try {
    console.log('Fetching ISO 639-3 codes from SIL...');
    const response = await fetch(SIL_ISO639_3_URL);
    if (!response.ok) throw new Error(`Failed to fetch ISO 639-3: ${response.statusText}`);
    const text = await response.text();

    const rows = text.trim().split('\n');
    // Header columns: Id | Part2B | Part2T | Part1 | Scope | Language_Type | Ref_Name | Comment
    const columns = rows.shift().split('\t');
    const idIndex = columns.indexOf('Id');
    const refNameIndex = columns.indexOf('Ref_Name');
    const part1Index = columns.indexOf('Part1'); // cross-reference to ISO 639-1

    if (idIndex === -1 || refNameIndex === -1) {
      throw new Error('ISO 639-3 header format mismatch. Expected "Id" and "Ref_Name" columns.');
    }

    for (const row of rows) {
      const fields = row.split('\t');
      const code = fields[idIndex]?.trim();
      const name = fields[refNameIndex]?.trim();
      const part1Code = fields[part1Index]?.trim();
      if (!code || !name) continue;

      ALL_LANGUAGE_CODES.iso639_3.push({ code, name });
      // SIL's table can carry Part1 codes the LOC file omits.
      if (part1Code && !ALL_LANGUAGE_CODES.iso639_1.some((c) => c.code === part1Code)) {
        ALL_LANGUAGE_CODES.iso639_1.push({ code: part1Code, name });
      }
    }
    console.log(`Fetched ${ALL_LANGUAGE_CODES.iso639_3.length} ISO 639-3 codes.`);
    // Deduplicate (last entry per code wins, as before) and sort.
    const byCode = new Map(ALL_LANGUAGE_CODES.iso639_1.map((item) => [item.code, item]));
    ALL_LANGUAGE_CODES.iso639_1 = [...byCode.values()].sort((a, b) => a.code.localeCompare(b.code));
    console.log(`Final unique ISO 639-1 count: ${ALL_LANGUAGE_CODES.iso639_1.length}.`);

  } catch (error) {
    console.error('Error fetching ISO 639-3 codes:', error.message);
    console.warn('Make sure the SIL_ISO639_3_URL is current or points to a valid .tab file.');
  }
}
+
/** Download the LOC ISO 639-5 TSV (language families and groups). */
async function fetchAndParseISO639_5() {
  try {
    console.log('Fetching ISO 639-5 codes from LOC...');
    const response = await fetch(LOC_ISO639_5_URL);
    if (!response.ok) throw new Error(`Failed to fetch ISO 639-5: ${response.statusText}`);
    const text = await response.text();

    // Columns: URI | Code | Label_en | Label_fr ... — first row is the header.
    const [, ...rows] = text.trim().split('\n');
    for (const row of rows) {
      const fields = row.split('\t');
      if (fields.length < 3) continue;
      const code = fields[1].trim();
      const name = fields[2].trim();
      if (code && name) {
        ALL_LANGUAGE_CODES.iso639_5.push({ code, name });
      }
    }
    console.log(`Fetched ${ALL_LANGUAGE_CODES.iso639_5.length} ISO 639-5 codes (language families/groups).`);
  } catch (error) {
    console.error('Error fetching ISO 639-5 codes:', error.message);
  }
}
+
/**
 * Download the IANA language-subtag registry and collect the primary
 * language subtags (Type: language), keeping each record's first
 * Description line as the name.
 */
async function fetchAndParseIANALanguageSubtags() {
  try {
    console.log('Fetching IANA Language Subtag Registry...');
    const response = await fetch(IANA_LANGUAGE_SUBTAG_REGISTRY_URL);
    if (!response.ok) throw new Error(`Failed to fetch IANA registry: ${response.statusText}`);
    const text = await response.text();

    // Registry records are separated by "%%" lines.
    for (const record of text.split('%%')) {
      let type = '';
      let subtag = '';
      let description = '';

      for (const line of record.trim().split('\n')) {
        if (line.startsWith('Type:')) {
          type = line.substring(5).trim();
        } else if (line.startsWith('Subtag:')) {
          subtag = line.substring(7).trim();
        } else if (line.startsWith('Description:') && !description) {
          // Descriptions may repeat or span lines; only the first is kept.
          description = line.substring(12).trim();
        }
      }

      if (type === 'language' && subtag && description) {
        ALL_LANGUAGE_CODES.bcp47_language_subtags.push({
          code: subtag,
          name: description
        });
      }
    }
    console.log(`Fetched ${ALL_LANGUAGE_CODES.bcp47_language_subtags.length} primary language subtags from IANA.`);
  } catch (error) {
    console.error('Error fetching IANA Language Subtag Registry:', error.message);
  }
}
+
+
/**
 * Fetch every code list concurrently, write the BCP-47 subtags to
 * bcp.json, then print a sample of each list.
 */
async function main() {
  console.log('Starting to fetch all language codes...\n');

  await Promise.all([
    fetchAndParseISO639_1_2(),
    fetchAndParseISO639_3(), // may augment iso639_1 with Part1 codes
    fetchAndParseISO639_5(),
    fetchAndParseIANALanguageSubtags()
  ]);
  await Bun.write("bcp.json", JSON.stringify(ALL_LANGUAGE_CODES.bcp47_language_subtags))

  console.log('\n\n--- All Fetched Language Codes ---');

  // Show counts plus the first 50 entries of each list.
  for (const [key, codes] of Object.entries(ALL_LANGUAGE_CODES)) {
    console.log(`\n--- ${key} (Total: ${codes.length}) ---`);
    for (const c of codes.slice(0, 50)) {
      console.log(`${c.code}: ${c.name}`);
    }
    if (codes.length > 50) console.log('... and more ...');
  }

  console.log('\nFetching complete.');
}

main().catch(console.error);
diff --git a/src/lib/db/index.ts b/src/lib/db/index.ts
index fcfab57..6bd417c 100644
--- a/src/lib/db/index.ts
+++ b/src/lib/db/index.ts
@@ -92,10 +92,16 @@ class DatabaseHandler {
const query = this.db.query(
`
SELECT * FROM expressions e
- WHERE e.syllables = 1 AND e.lang = ?
+ WHERE e.type = 'syllable' AND e.lang = ?
ORDER BY frequency DESC
LIMIT ${PAGE_SIZE} ${page ? "OFFSET " + getDBOffset(page, PAGE_SIZE) : ""}
`,
+ // `
+ // SELECT * FROM expressions e
+ // WHERE e.syllables = 1 AND e.lang = ?
+ // ORDER BY frequency DESC
+ // LIMIT ${PAGE_SIZE} ${page ? "OFFSET " + getDBOffset(page, PAGE_SIZE) : ""}
+ // `,
);
const results = query.all(lang);
console.log({ lang, page }, "results");
diff --git a/src/lib/db/prosodydb.ts b/src/lib/db/prosodydb.ts
new file mode 100644
index 0000000..b3b973b
--- /dev/null
+++ b/src/lib/db/prosodydb.ts
@@ -0,0 +1,238 @@
+import Database from "bun:sqlite";
+type Str = string | null;
+type ItemType = "word" | "syllable" | "idiom";
+
+class DatabaseHandler {
+ db: Database;
+ constructor() {
+ const dbPath = "/home/y/code/bun/ssr/waku/bulkdata/prosodynew.db";
+ const db = new Database(dbPath, { create: true });
+ db.exec("PRAGMA journal_mode = WAL"); // Enable Write-Ahead Logging for better performance
+ db.exec("PRAGMA foreign_keys = ON");
+ this.db = db;
+ }
+ async init() {
+ const file = Bun.file("./prosodyschema.sql");
+ const sql = await file.text();
+ this.db.exec(sql);
+ }
+ // selects
+ fetchWords(words: string[]) {
+ const query = this.db.query(
+ `SELECT id FROM words where spelling IN (${words.map((w) => `'${w}'`).join(", ")})`,
+ );
+ return query.all() as Array<{ id: number }>;
+ }
+ // inserts
+
+ addLanguage(code: string, name: string) {
+ const query = this.db
+ .query(`INSERT OR IGNORE INTO languages(iso6392, english) VALUES(?, ?)`)
+ .run(code, name);
+ }
+ addPronunciation(
+ type: ItemType,
+ parentId: number | bigint,
+ ipa: string,
+ syllables: number,
+ tags: Str,
+ notes: Str,
+ ) {
+ try {
+ const query = this.db
+ .query(
+ `INSERT INTO pronunciation(type, parent_id,ipa, syllables, tag, notes) VALUES(?, ?, ?, ?, ?, ?)`,
+ )
+ .run(type, parentId, ipa, syllables, tags, notes);
+ } catch (e) {
+ // console.error(e);
+ }
+ }
+ addWordRhyme(wordId: number | bigint, ipa: string, lang: string, notes: Str) {
+ console.log("wordrhyme", notes);
+ try {
+ const query = this.db
+ .query(
+ `INSERT INTO word_rhymes(text, lang, notes) VALUES(?, ?, ?)
+ ON CONFLICT(text,lang) DO UPDATE SET
+ text = excluded.text
+ RETURNING rowid
+ `,
+ )
+ .get(ipa, lang, notes) as { id: number };
+ const query2 = this.db
+ .query(
+ `
+ INSERT INTO words_idioms(word_id, idiom_id) VALUES(?, ?)
+ `,
+ )
+ .run(wordId, query.id);
+ } catch (e) {
+ // console.error(e);
+ }
+ }
+ addIdiom(spelling: string, lang: string) {
+ const query = this.db.query(
+ `INSERT INTO idioms(spelling, lang) VALUES(?, ?)`,
+ );
+ const res = query.run(spelling, lang);
+ return res;
+ }
+ findIdiomWords(spelling: string, idId: number | bigint) {
+ const split = spelling.split(" ");
+ const words = this.fetchWords(split);
+ console.log({ words });
+ const tx = this.db.transaction(() => {
+ for (const w of words) {
+ this.db
+ .query(
+ `
+ INSERT INTO words_idioms(word_id, idiom_id) VALUES(?, ?)
+ `,
+ )
+ .run(w.id, idId);
+ }
+ });
+ tx();
+ }
+ findIdiomsWords() {
+ const rows: any = this.db.query(`SELECT id, spelling FROM idioms`);
+ for (const row of rows) {
+ this.findIdiomWords(row.spelling, row.id);
+ }
+ }
+ addWord(spelling: string, lang: string) {
+ const query = this.db.query(
+ // `INSERT OR IGNORE INTO words(spelling, lang) VALUES(?, ?)`,
+ `INSERT INTO words(spelling, lang) VALUES(?, ?)`,
+ );
+ const res = query.run(spelling, lang);
+ const wordId = res.lastInsertRowid;
+ return wordId;
+ }
+ addSyllable(
+ wordId: number | bigint,
+ text: string,
+ lang: string,
+ long: boolean,
+ onset: Str,
+ medial: Str,
+ nucleus: string,
+ coda: Str,
+ rhyme: string,
+ tone: Str,
+ notes: Str,
+ ) {
+ const tx = this.db.transaction(() => {
+ const query = this.db.query(
+ `INSERT INTO syllables(text, lang, long, onset, medial, nucleus, coda, rhyme, tone, notes) VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
+ );
+ const res = query.run(
+ text,
+ lang,
+ long,
+ onset,
+ medial,
+ nucleus,
+ coda,
+ rhyme,
+ tone,
+ notes,
+ );
+ const sylId = res.lastInsertRowid;
+
+ const res1 = this.db
+ .query(`INSERT INTO syllables_words(syl_id, word_id) VALUES(?, ?)`)
+ .run(sylId, wordId);
+ //
+ return sylId;
+ });
+ const sylId = tx();
+ let res1: any;
+ if (onset) {
+ res1 = this.db
+ .query(
+ `INSERT INTO onsets(text, lang) VALUES(?, ?)
+ ON CONFLICT(text, lang) DO UPDATE SET
+ text = excluded.text
+ RETURNING rowid
+ `,
+ )
+ .get(onset, lang);
+ this.db
+ .query(`INSERT INTO onsets_syllables(syl_id, onset_id) VALUES(?, ?)`)
+ .run(sylId, res1.id);
+ }
+ if (medial) {
+ res1 = this.db
+ .query(
+ `INSERT INTO medials(text, lang) VALUES(?, ?)
+ ON CONFLICT(text, lang) DO UPDATE SET
+ text = excluded.text
+ RETURNING rowid
+ `,
+ )
+ .get(medial, lang);
+ this.db
+ .query(`INSERT INTO medials_syllables(syl_id, medial_id) VALUES(?, ?)`)
+ .run(sylId, res1.id);
+ }
+ res1 = this.db
+ .query(
+ `INSERT INTO nucleus(text, lang) VALUES(?, ?)
+ ON CONFLICT(text, lang) DO UPDATE SET
+ text = excluded.text
+ RETURNING rowid
+ `,
+ )
+ .get(nucleus, lang);
+ this.db
+ .query(`INSERT INTO nucleus_syllables(syl_id, nucleus_id) VALUES(?, ?)`)
+ .run(sylId, res1.id);
+ if (coda) {
+ res1 = this.db
+ .query(
+ `INSERT INTO codas(text, lang) VALUES(?, ?)
+ ON CONFLICT(text, lang) DO UPDATE SET
+ text = excluded.text
+ RETURNING rowid
+ `,
+ )
+ .get(coda, lang);
+ this.db
+ .query(`INSERT INTO codas_syllables(syl_id, coda_id) VALUES(?, ?)`)
+ .run(sylId, res1.id);
+ }
+ res1 = this.db
+ .query(
+ `INSERT INTO rhymes(text, lang) VALUES(?, ?)
+ ON CONFLICT(text, lang) DO UPDATE SET
+ text = excluded.text
+ RETURNING rowid
+ `,
+ )
+ .get(rhyme, lang);
+ this.db
+ .query(`INSERT INTO rhymes_syllables(syl_id, rhyme_id) VALUES(?, ?)`)
+ .run(sylId, res1.id);
+ if (tone) {
+ res1 = this.db
+ .query(
+ `INSERT INTO tones(text, lang) VALUES(?, ?)
+ ON CONFLICT(text, lang) DO UPDATE SET
+ text = excluded.text
+ RETURNING rowid
+ `,
+ )
+ .get(tone, lang);
+ this.db
+ .query(`INSERT INTO tones_syllables(syl_id, tone_id) VALUES(?, ?)`)
+ .run(sylId, res1.id);
+ }
+ }
+
+ // reads
+}
+const db = new DatabaseHandler();
+
+export default db;
diff --git a/src/lib/db/prosodyschema.sql b/src/lib/db/prosodyschema.sql
new file mode 100644
index 0000000..e70b005
--- /dev/null
+++ b/src/lib/db/prosodyschema.sql
@@ -0,0 +1,178 @@
-- Enable foreign key support
PRAGMA foreign_keys = ON;
PRAGMA journal_mode = WAL;
PRAGMA cache_size = -2000;
PRAGMA mmap_size = 30000000000;


-- proper prosody now
--
--
--
-- Languages keyed by ISO 639-2 code; `english` is the English name.
CREATE TABLE IF NOT EXISTS languages(
    iso6392 TEXT PRIMARY KEY,
    -- bcp47 TEXT PRIMARY KEY,
    -- iso6393 TEXT NOT NULL,
    english TEXT NOT NULL
    -- native TEXT,
    -- iso6391 TEXT,
    -- iso6395 TEXT,
    -- glottolog TEXT
);

-- Multi-word expressions, unique per (spelling, lang).
CREATE TABLE IF NOT EXISTS idioms(
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    spelling TEXT NOT NULL,
    lang TEXT NOT NULL,
    frequency INTEGER,
    FOREIGN KEY (lang) REFERENCES languages(iso6392),
    CONSTRAINT spell_unique UNIQUE (spelling, lang)
);

CREATE INDEX IF NOT EXISTS idx_idioms_spelling ON idioms(spelling);
-- Single words, unique per (spelling, lang).
CREATE TABLE IF NOT EXISTS words(
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    spelling TEXT NOT NULL,
    lang TEXT NOT NULL,
    frequency INTEGER,
    FOREIGN KEY (lang) REFERENCES languages(iso6392),
    CONSTRAINT spell_unique UNIQUE (spelling, lang)
);

CREATE INDEX IF NOT EXISTS idx_words_spelling ON words(spelling);

-- Word-level rhymes: one row per distinct (text, lang); words attach
-- to a rhyme via the words_rhymes join table below.
CREATE TABLE IF NOT EXISTS word_rhymes(
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    text TEXT NOT NULL,
    lang TEXT NOT NULL,
    notes TEXT,
    CONSTRAINT wrhyme_unique UNIQUE (text, lang)
);
CREATE TABLE IF NOT EXISTS words_rhymes(
    word_id INTEGER NOT NULL,
    wrhyme_id INTEGER NOT NULL,
    FOREIGN KEY (word_id) REFERENCES words(id),
    FOREIGN KEY (wrhyme_id) REFERENCES word_rhymes(id)
);

-- break up syllables
-- Syllables are stored denormalized here (part columns inline) AND
-- normalized into the part tables below via the *_syllables joins.
CREATE TABLE IF NOT EXISTS syllables(
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    text TEXT NOT NULL,
    lang TEXT NOT NULL,
    long INTEGER NOT NULL,
    tone TEXT,
    onset TEXT,
    medial TEXT,
    nucleus TEXT,
    coda TEXT,
    rhyme TEXT,
    notes TEXT,
    FOREIGN KEY (lang) REFERENCES languages(iso6392),
    CONSTRAINT spell_unique UNIQUE (text, lang)
);

-- Syllable parts: one row per distinct (text, lang) for each part kind.
CREATE TABLE IF NOT EXISTS tones(
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    text TEXT NOT NULL,
    lang TEXT NOT NULL,
    name TEXT,
    num INTEGER,
    CONSTRAINT tone_unique UNIQUE (text, lang)
);
CREATE TABLE IF NOT EXISTS onsets(
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    text TEXT NOT NULL,
    lang TEXT NOT NULL,
    CONSTRAINT onsets_unique UNIQUE (text, lang)
);
CREATE TABLE IF NOT EXISTS medials(
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    text TEXT NOT NULL,
    lang TEXT NOT NULL,
    CONSTRAINT medials_unique UNIQUE (text, lang)
);
CREATE TABLE IF NOT EXISTS nucleus(
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    text TEXT NOT NULL,
    lang TEXT NOT NULL,
    CONSTRAINT nucleus_unique UNIQUE (text, lang)
);
CREATE TABLE IF NOT EXISTS codas(
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    text TEXT NOT NULL,
    lang TEXT NOT NULL,
    CONSTRAINT coda_unique UNIQUE (text, lang)
);
CREATE TABLE IF NOT EXISTS rhymes(
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    text TEXT NOT NULL,
    lang TEXT NOT NULL,
    CONSTRAINT rhyme_unique UNIQUE (text, lang)
);

-- join tables

CREATE TABLE IF NOT EXISTS tones_syllables(
    syl_id INTEGER NOT NULL,
    tone_id INTEGER NOT NULL,
    FOREIGN KEY (syl_id) REFERENCES syllables(id),
    FOREIGN KEY (tone_id) REFERENCES tones(id)
);
CREATE TABLE IF NOT EXISTS onsets_syllables(
    syl_id INTEGER NOT NULL,
    onset_id INTEGER NOT NULL,
    FOREIGN KEY (syl_id) REFERENCES syllables(id),
    FOREIGN KEY (onset_id) REFERENCES onsets(id)
);
CREATE TABLE IF NOT EXISTS medials_syllables(
    syl_id INTEGER NOT NULL,
    medial_id INTEGER NOT NULL,
    FOREIGN KEY (syl_id) REFERENCES syllables(id),
    FOREIGN KEY (medial_id) REFERENCES medials(id)
);
CREATE TABLE IF NOT EXISTS nucleus_syllables(
    syl_id INTEGER NOT NULL,
    nucleus_id INTEGER NOT NULL,
    FOREIGN KEY (syl_id) REFERENCES syllables(id),
    FOREIGN KEY (nucleus_id) REFERENCES nucleus(id)
);
CREATE TABLE IF NOT EXISTS codas_syllables(
    syl_id INTEGER NOT NULL,
    coda_id INTEGER NOT NULL,
    FOREIGN KEY (syl_id) REFERENCES syllables(id),
    FOREIGN KEY (coda_id) REFERENCES codas(id)
);
CREATE TABLE IF NOT EXISTS rhymes_syllables(
    syl_id INTEGER NOT NULL,
    rhyme_id INTEGER NOT NULL,
    FOREIGN KEY (syl_id) REFERENCES syllables(id),
    FOREIGN KEY (rhyme_id) REFERENCES rhymes(id)
);

CREATE TABLE IF NOT EXISTS syllables_words(
    syl_id INTEGER NOT NULL,
    word_id INTEGER NOT NULL,
    FOREIGN KEY (syl_id) REFERENCES syllables(id),
    FOREIGN KEY (word_id) REFERENCES words(id)
);
CREATE TABLE IF NOT EXISTS words_idioms(
    word_id INTEGER NOT NULL,
    idiom_id INTEGER NOT NULL,
    FOREIGN KEY (idiom_id) REFERENCES idioms(id),
    FOREIGN KEY (word_id) REFERENCES words(id)
);


--
-- Pronunciations for any item kind; `type` says which table parent_id
-- points into (words/syllables/idioms), so no FK is possible here.
CREATE TABLE IF NOT EXISTS pronunciation(
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    type TEXT CHECK(type IN ('word', 'syllable', 'idiom')) NOT NULL,
    parent_id INTEGER NOT NULL,
    ipa TEXT NOT NULL,
    syllables INTEGER NOT NULL,
    tag TEXT,
    notes TEXT,
    CONSTRAINT ipa_unique UNIQUE (ipa, parent_id)
);
CREATE INDEX IF NOT EXISTS idx_words_ipa ON pronunciation(ipa, parent_id);
diff --git a/src/lib/db/seed.ts b/src/lib/db/seed.ts
index c4094de..7f4352f 100644
--- a/src/lib/db/seed.ts
+++ b/src/lib/db/seed.ts
@@ -2,6 +2,8 @@ import { readWiktionaryDump } from "../services/wiki";
import { getStressedSyllable, getSyllableCount } from "../utils";
import useful from "@/lib/useful_thai.json";
import db from ".";
+import pdb from "./prosodydb";
+import * as Sorsyl from "sorsyl";
const SYMBOL_REGEX = new RegExp(/[\W\d]/);
@@ -483,7 +485,16 @@ function addThaiSyllablesLesson() {
// }
// }
// }
-addThaiUseful();
+function fixSyllables() {
+ const res = db.db.query(`SELECT ipa, syllables FROM expressions;`).all();
+ for (let i = 0; i < 10; i++) {
+ // for (const row of res) {
+ const row = res[i];
+ console.log({ row });
+ }
+}
+// fixSyllables();
+// addThaiUseful();
// addThaiSyllablesLesson();
// adjustFrequency("th");
@@ -492,3 +503,202 @@ addThaiUseful();
// fillFromDump();
// thaiSyllables();
// thaiFreq();
+//
+//
+const SORSYL_PATH =
+ "/nix/store/lkyi9rrjbr619w3ivpkm89ccf93bvxx5-sorsyl-0.1.0/bin/sorsyl";
+async function redump() {
+ await pdb.init();
+ let count = 0;
+
+ // const soundTypes = new Set<string>();
+ // [
+ // "tags", "ipa", "audio", "ogg_url", "mp3_url", "enpr", "rhymes", "homophone", "note", "zh-pron", "other",
+ // "text", "hangeul", "topics", "form", "audio-ipa"
+ // ]
+ for await (const line of readWiktionaryDump()) {
+ try {
+ count++;
+ // if (count > 50) break;
+ const j = JSON.parse(line);
+ console.log(Object.keys(j), j.word);
+ // add language to db
+ pdb.addLanguage(j.lang_code, j.lang);
+ // handleEtim(j);
+ // handleDerived(j);
+ // handleSenses(j.pos, j.senses);
+ // //
+ const isWord = j.word.trim().split(" ").length === 1;
+ if (isWord) await handleWord(j);
+ else await handleIdiom(j);
+ } catch (e) {
+ console.log("error parsing", e);
+ // break;
+ }
+ }
+}
+
+type SorSyl = {
+ stressed: boolean;
+ long: boolean;
+ spelling: string;
+ ipa: string;
+ nucleus: string;
+ onset: string;
+ medial: string;
+ coda: string;
+ rhyme: string;
+ tone: string;
+};
+async function handleWord(j: any) {
+ const wordId = pdb.addWord(j.word, j.lang_code);
+ let ts = Date.now();
+
+ const hwikiRhyme = j.sounds.find((s) => "rhymes" in s);
+ const wikiRhyme = hwikiRhyme ? hwikiRhyme.rhymes : null;
+ for (let snd of j.sounds || []) {
+ if ("ipa" in snd) {
+ const tags = JSON.stringify(snd.tags) || null;
+ const ipa = snd.ipa;
+ try {
+ const hres = await fetch("http://localhost:8104/syls", {
+ method: "POST",
+ headers: { "content-type": "application/json" },
+ body: JSON.stringify({ string: j.word, lang: j.lang_code, ipa }),
+ });
+ const hjon = await hres.json();
+ console.log(Date.now() - ts, "elapsed in http");
+ ts = Date.now();
+ pdb.addPronunciation(
+ "word",
+ wordId,
+ hjon.clean_ipa,
+ hjon.syls.length,
+ tags,
+ null,
+ );
+ const wordRhyme = hjon.syls.reduce((acc: string, item: SorSyl) => {
+ if (!item.stressed && !acc) return acc;
+ if (item.stressed && !acc) return `${acc}${item.rhyme}`;
+ else return `${acc}${item.ipa}`;
+ }, "");
+ if (wordRhyme)
+ pdb.addWordRhyme(wordId, wordRhyme, j.lang_code, wikiRhyme);
+ else console.log("no rhyme?", hjon);
+ for (const syl of hjon.syls) {
+ // TODO ideally syllables would have spelling not IPA... harsh tho
+ pdb.addSyllable(
+ wordId,
+ syl.ipa,
+ j.lang_code,
+ syl.long,
+ syl.onset || null,
+ syl.medial || null,
+ syl.nucleus,
+ syl.coda || null,
+ syl.rhyme,
+ syl.tone || null,
+ null,
+ );
+ }
+ console.log(Date.now() - ts, "elapsed in db");
+ ts = Date.now();
+ } catch (e) {
+ console.error(e);
+ console.error(j);
+ // break;
+ }
+ }
+ }
+}
+async function handleIdiom(j: any) {
+ console.log(j.word, "idiom");
+ pdb.addIdiom(j.word, j.lang_code);
+ // TODO IPA of idioms...?
+}
+async function handleEtim(j: any) {
+ console.log(j.etymology_text, "etym");
+ console.log(j.etymology_templates, "etym");
+
+ // {
+ // name: "inh",
+ // args: {
+ // "1": "en",
+ // "2": "ang",
+ // "3": "frēo",
+ // "4": "",
+ // "5": "free",
+ // },
+ // expansion: "Old English frēo (“free”)",
+ // },
+
+ console.log(j.head_templates, "head");
+ // {
+ // name: "en-verb",
+ // args: {},
+ // expansion: "free (third-person singular simple present frees, present participle freeing, simple past and past participle freed)",
+ // }
+}
+async function handleDerived(j: any) {
+ const { forms, derived, related, antonyms, hyponyms, synonyms, descendants } =
+ j;
+ console.log("forms", forms);
+ // {form: string; tags: string[]}
+ console.log("derived", derived);
+ // {word: string}
+ console.log("related", related);
+ // {word: string, source?: string;}
+ console.log("ant", antonyms);
+ // {word: string, source?: string;}
+ console.log("hypo", hyponyms);
+ console.log("syno", synonyms);
+ // {word: string, source?: string;}
+ console.log("desc", descendants);
+}
+async function handleSenses(pos: string, senses: any[]) {
+ console.log("ex", senses[0].examples);
+ // {text: string; ref: string; type: "quote"}
+ console.log("info", senses[0].info_templates);
+ for (const s of senses) {
+ // s.glosses[]
+ // s.tags[]
+ }
+}
+
// Entry point: stream the wiktionary dump into the new prosody db
// (long-running; network + db heavy).
redump();
+
+async function newtest() {
+ // const query = pdb.db.query(
+ // `INSERT INTO syllables(text, lang, long, onset, medial, nucleus, coda, rhyme, tone, notes) VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
+ // );
+ // const res = query.run(
+ // "lol",
+ // "en",
+ // true,
+ // "l",
+ // "j",
+ // "o",
+ // "q",
+ // "joq",
+ // null,
+ // null,
+ // );
+ // const sylId = res.lastInsertRowid;
+ const res1 = pdb.db
+ .query(
+ `INSERT INTO onsets(text, lang) VALUES(?, ?)
+ ON CONFLICT(text, lang) DO UPDATE SET
+ text = excluded.text
+ RETURNING rowid
+ `,
+ )
+ .get("lll", "en");
+ console.log({ res1 });
+}
+// newtest();
+// TIL calling shell commands is terribly slow wtf
+// Bun.$.env({ FOO: ipa });
+// const res = await Bun.$`${SORSYL_PATH} $FOO`;
+// const syllables = JSON.parse(res.stdout.toString());
+// console.log(Date.now() - ts, "elapsed in py");
+// ts = Date.now();
diff --git a/src/lib/db/senseschema.sql b/src/lib/db/senseschema.sql
new file mode 100644
index 0000000..f07a208
--- /dev/null
+++ b/src/lib/db/senseschema.sql
@@ -0,0 +1,54 @@
-- Enable foreign key support
PRAGMA foreign_keys = ON;
PRAGMA journal_mode = WAL;
PRAGMA cache_size = -2000;
PRAGMA mmap_size = 30000000000;

-- a semantic entity
CREATE TABLE IF NOT EXISTS senses(
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    parent_id INTEGER NOT NULL,
    spelling TEXT NOT NULL,
    pos TEXT,
    etymology TEXT,
    senses JSONB,
    forms JSONB,
    related JSONB,
    confidence INTEGER NOT NULL DEFAULT 0,
    FOREIGN KEY (parent_id) REFERENCES expressions(id)
);
CREATE INDEX IF NOT EXISTS idx_words_pos ON senses(pos);
CREATE INDEX IF NOT EXISTS idx_senses_parent ON senses(parent_id);

-- Individual glosses belonging to a sense.
CREATE TABLE IF NOT EXISTS subsenses(
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    sid INTEGER NOT NULL, -- fixed: trailing comma was missing (invalid DDL)
    gloss TEXT NOT NULL,
    examples JSONB,
    FOREIGN KEY (sid) REFERENCES senses(id)
);

-- Derivation/relation records attached to a sense.
CREATE TABLE IF NOT EXISTS derivation(
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    sid INTEGER NOT NULL, -- fixed: trailing comma was missing (invalid DDL)
    type TEXT NOT NULL,
    text TEXT NOT NULL,
    tags JSONB,
    FOREIGN KEY (sid) REFERENCES senses(id)
);

-- Categories table (for noun and verb categories)
CREATE TABLE IF NOT EXISTS categories (
    name TEXT PRIMARY KEY
);

-- Word Categories junction table
CREATE TABLE IF NOT EXISTS word_categories (
    word_id INTEGER NOT NULL,
    -- TEXT, not INTEGER: it references categories(name), which is TEXT
    category TEXT NOT NULL,
    PRIMARY KEY (word_id, category),
    FOREIGN KEY (word_id) REFERENCES expressions(id),
    FOREIGN KEY (category) REFERENCES categories(name)
);
CREATE INDEX IF NOT EXISTS idx_word_categories_category_id ON word_categories(category);
diff --git a/src/lib/lang/utils.ts b/src/lib/lang/utils.ts
new file mode 100644
index 0000000..b6c5bae
--- /dev/null
+++ b/src/lib/lang/utils.ts
@@ -0,0 +1,28 @@
+const TONE_LETTER_REGEX = new RegExp(
+ /[\u{02E5}-\u{02EE}\u{A700}-\u{A71F}\u{A789}-\u{A78A}]/u,
+);
+const TONE_NUMBERS_REGEX = new RegExp(/\w(\d{2,})\s/);
+
+export function isTonal(text: string): boolean {
+ return !!TONE_LETTER_REGEX.exec(text) || !!TONE_NUMBERS_REGEX.exec(text);
+}
+
+export function isLongVowel(text: string): boolean {
+ return text.includes("ː");
+}
+
// In this order!
// Thai tone contours (Chao tone letters) mapped to one-letter labels —
// presumably M/L/H/F/R = mid/low/high/falling/rising; confirm with callers.
export const thaiTones = {
  "˧": "M",
  "˨˩": "L",
  "˦˥": "H",
  "˥˩": "F",
  "˩˩˦": "R",
};

// Mandarin tone contours (Chao tone letters) mapped to tone numbers 1–4.
export const mandarinTones = {
  "˥˥": 1,
  "˧˥": 2,
  "˨˩˦": 3,
  "˥˩": 4,
};
diff --git a/src/lib/types/cards.ts b/src/lib/types/cards.ts
index cef02d2..1a62a44 100644
--- a/src/lib/types/cards.ts
+++ b/src/lib/types/cards.ts
@@ -210,3 +210,16 @@ export type ExpressionSearchParams = {
frequency?: { num: number; above: boolean };
type?: ExpressionType;
};
+
// Response shape of the syllable-deconstruction service (see
// deconstructSyllable in src/lib/calls/nlp.ts).
export type SyllableRes = { input: string; result: SyllableToken[] };
// One IPA character paired with its structural tag.
export type SyllableToken = [IPACharacter, SyllablePart];
export type IPACharacter = string; // one char mostly
// Single-character tags for positions within a syllable, as emitted by
// the service; names suggest onset/vowel/coda roles — confirm upstream.
export enum SyllablePart {
  INITIAL = "#",
  OTHER_ONSET = "C",
  VOWEL = "V",
  OTHER_VOWEL = "v",
  FINAL_VOWEL = ">",
  OTHER_OFFSET = "c",
  CODA = "$",
}