From ba2dbc660c229d3e86662d35513dfa7c904d9870 Mon Sep 17 00:00:00 2001 From: polwex Date: Sun, 23 Nov 2025 13:29:28 +0700 Subject: wew --- packages/db/bun.lock | 25 ---- packages/db/index.ts | 1 + packages/db/package.json | 4 + packages/db/src/index.ts | 327 +++++++++++++++++++++++++++++++++------------ packages/db/src/schema.sql | 97 +------------- packages/db/src/server.ts | 66 +++++++++ 6 files changed, 320 insertions(+), 200 deletions(-) delete mode 100644 packages/db/bun.lock create mode 100644 packages/db/src/server.ts (limited to 'packages/db') diff --git a/packages/db/bun.lock b/packages/db/bun.lock deleted file mode 100644 index cedc049..0000000 --- a/packages/db/bun.lock +++ /dev/null @@ -1,25 +0,0 @@ -{ - "lockfileVersion": 1, - "workspaces": { - "": { - "name": "@sortug/sorlang-db", - "devDependencies": { - "@types/bun": "latest", - }, - "peerDependencies": { - "typescript": "^5", - }, - }, - }, - "packages": { - "@types/bun": ["@types/bun@1.3.3", "", { "dependencies": { "bun-types": "1.3.3" } }, "sha512-ogrKbJ2X5N0kWLLFKeytG0eHDleBYtngtlbu9cyBKFtNL3cnpDZkNdQj8flVf6WTZUX5ulI9AY1oa7ljhSrp+g=="], - - "@types/node": ["@types/node@24.10.1", "", { "dependencies": { "undici-types": "~7.16.0" } }, "sha512-GNWcUTRBgIRJD5zj+Tq0fKOJ5XZajIiBroOF0yvj2bSU1WvNdYS/dn9UxwsujGW4JX06dnHyjV2y9rRaybH0iQ=="], - - "bun-types": ["bun-types@1.3.3", "", { "dependencies": { "@types/node": "*" } }, "sha512-z3Xwlg7j2l9JY27x5Qn3Wlyos8YAp0kKRlrePAOjgjMGS5IG6E7Jnlx736vH9UVI4wUICwwhC9anYL++XeOgTQ=="], - - "typescript": ["typescript@5.9.3", "", { "bin": { "tsc": "bin/tsc", "tsserver": "bin/tsserver" } }, "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw=="], - - "undici-types": ["undici-types@7.16.0", "", {}, "sha512-Zz+aZWSj8LE6zoxD+xrjh4VfkIG8Ya6LvYkZqtUQGJPZjYl53ypCaUwWqo7eI0x66KBGeRo+mlBEkMSeSZ38Nw=="], - } -} diff --git a/packages/db/index.ts b/packages/db/index.ts index ceecd52..c5660a0 100644 --- a/packages/db/index.ts +++ b/packages/db/index.ts @@ -1,3 +1,4 @@ import DB from "./src"; export default DB; +export { handler } from "./src/server"; diff --git a/packages/db/package.json b/packages/db/package.json index a121b42..fc2c534 100644 --- a/packages/db/package.json +++ b/packages/db/package.json @@ -7,5 +7,9 @@ }, "peerDependencies": { "typescript": "^5" + }, + "dependencies": { + "@sortug/lib": "workspace:*", + "@sortug/langlib": "workspace:*" } } diff --git a/packages/db/src/index.ts b/packages/db/src/index.ts index 6a89ee2..1425a43 100644 --- a/packages/db/src/index.ts +++ b/packages/db/src/index.ts @@ -1,39 +1,242 @@ -import { Database } from "bun:sqlite"; -import type { FullWordDataDB } from "./types"; +import { SQL } from "bun"; import { SRSQueries } from "./srs"; +// NEW API +// +// https://bun.com/docs/runtime/sql +// import { sql } from "bun"; + +// // Basic insert with direct values +// const [user] = await sql` +// INSERT INTO users (name, email) +// VALUES (${name}, ${email}) +// RETURNING * +// `; + +// // Using object helper for cleaner syntax +// const userData = { +// name: "Alice", +// email: "alice@example.com", +// }; + +// const [newUser] = await sql` +// INSERT INTO users ${sql(userData)} +// RETURNING * +// `; +// // Expands to: INSERT INTO users (name, email) VALUES ('Alice', 'alice@example.com') + +const DB_PATH = "/home/y/code/bun/sorlang/bulkdata/db.db"; +import type { FullWordDataDB } from "@sortug/langlib"; export class Queries { - db: Database; - srs: SRSQueries; + db: SQL; + ready = false; + // srs: SRSQueries; constructor() { - const db = new Database("/home/y/code/bun/sorlang/bulkdata/unified.db"); - db.exec("PRAGMA journal_mode = WAL"); // Enable Write-Ahead Logging for better performance - db.exec("PRAGMA foreign_keys = ON"); - db.exec("PRAGMA cache_size = -8000"); // Increase cache size to 8MB - db.exec("PRAGMA mmap_size = 30000000000"); - db.exec("PRAGMA temp_store = MEMORY"); // Store temp tables in memory - db.exec("PRAGMA synchronous = NORMAL"); // Slightly less safe but faster - + console.log("setting sorlang DB"); + const db = new SQL({ + adapter: "sqlite", + filename: DB_PATH, + create: false, + }); this.db = db; - this.srs = new SRSQueries(db); + // this.srs = new SRSQueries(db); + } + async init() { + try { + await this.db`PRAGMA journal_mode = WAL`; // Enable Write-Ahead Logging for better performance + await this.db`PRAGMA foreign_keys = ON`; + await this.db`PRAGMA cache_size = -8000`; // Increase cache size to 8MB + await this.db`PRAGMA mmap_size = 30000000000`; + await this.db`PRAGMA temp_store = MEMORY`; // Store temp tables in memory + await this.db`PRAGMA synchronous = NORMAL`; // Slightly less safe but faster + this.ready = true; + } catch (e) { + console.error("error starting db", e); + } + } + + async fetchExpressionById(id: number) { + const query = await this.db` + SELECT * FROM expressions WHERE id = ${id} + `; + return query; + } + // TODO word_phonetics is MANY TO ONE vs expressions, so it's possible for word_phonetics to lack entries for syllables + async fetchExpressionBySpelling( + spelling: string, + lang: string, + ): Promise { + const data = await this.db` +SELECT + json_object( + 'id', e.id, + 'spelling', e.spelling, + 'frequency', e.frequency, + 'phonetic', ( + SELECT json_object( + 'id', wp.id, + 'tone_sequence', wp.tone_sequence, + 'syl_seq', wp.syllable_sequence, + 'syllable_count', wp.syllable_count, + 'ipa', wp.ipa, + 'syllables', ( + SELECT json_group_array( + json_object( + 'stressed', CASE WHEN sw.stressed = 1 THEN json('true') ELSE json('false') END, + 'long', CASE WHEN s.long = 1 THEN json('true') ELSE json('false') END, + 'spelling', s.text, + 'ipa', s.ipa, + 'onset', ( + SELECT json_object('ipa', o.ipa, 'spelling', o.text) + FROM onsets o WHERE o.id = s.onset + ), + 'nucleus', ( + SELECT json_object('ipa', n.ipa, 'spelling', n.text) + FROM nucleus n WHERE n.id = s.nucleus + ), + 'medial', ( + SELECT json_object('ipa', m.ipa, 'spelling', m.text) + FROM medials m WHERE m.id = s.medial + ), + 'coda', ( + SELECT json_object('ipa', c.ipa, 'spelling', c.text) + FROM codas c WHERE c.id = s.coda + ), + 'rhyme', ( + SELECT json_object('ipa', r.ipa, 'spelling', r.text) + FROM rhymes r WHERE r.id = s.rhyme + ), + 'tone', ( + SELECT json_object('letters', t.ipa, 'numbers', t.nums, 'name', t.name) + FROM tones t WHERE t.id = s.tone + ) + ) + ) + FROM syllables_words sw + JOIN syllables s ON sw.syl_id = s.id + WHERE sw.word_id = wp.id + ORDER BY sw.idx ASC + ) + ) + FROM word_phonetics wp + WHERE wp.word_id = e.id + -- Select the most general pronunciation (sense_id IS NULL) or the first available + ORDER BY (wp.sense_id IS NULL) DESC, wp.id ASC + LIMIT 1 + ), + 'senses', ( + SELECT json_group_array( + json_object( + 'id', sn.id, + 'confidence', sn.confidence, + 'etymology', sn.etymology, + 'pos', sn.pos, + 'glosses', ( + SELECT json_group_array(sub.gloss) + FROM subsenses sub + WHERE sub.sid = sn.id + ), + 'examples', ( + SELECT json_group_array( + json_object( + 'example', ex.example, + 'ref', ex.ref + ) + ) + FROM examples ex + WHERE ex.sid = sn.id + ), + 'categories', ( + SELECT json_group_array(wc.category) + FROM word_categories wc + WHERE wc.word_id = sn.id + ), + 'derivation', ( + SELECT json_group_array( + json_object( + 'type', d.type, + 'text', d.text, + 'tags', json(d.tags) + ) + ) + FROM derivation d + WHERE d.sid = sn.id + ) + ) + ) + FROM senses sn + WHERE sn.parent_id = e.id + ) + ) AS full_word_data +FROM expressions e +WHERE e.spelling = ${spelling} AND e.lang = ${lang}; + `; + + if (data.length > 1) { + console.log({ spelling, lang, data }); + throw new Error("more rows than 1 wtf"); + } + if (data.length === 0) { + console.log({ spelling, lang, data }); + return null; + } + const row = data[0]; + const json = JSON.parse(row.full_word_data); + const phonetic = JSON.parse(json.phonetic); + const obj = { ...json, phonetic }; + return obj; } - fetchExpressionById(id: number) { - const query = this.db.query( - ` - SELECT * FROM expressions WHERE id = ? - `, - ); - return query.get(id) as any; + + // TODO combine with this old query to get both phonetic and semantic data + // Thing is the tables have changed, used to be that the senses table had JSONB columns for senses, forms and related but those are now proper tables now with one-to-many relation to the senses table + + async fetchSenses(spelling: string, lang: string) { + const rows = await this.db` + WITH sense_data AS ( + SELECT + s.*, + GROUP_CONCAT(DISTINCT ss.id || ':' || ss.gloss, '|') as subsenses_data, + GROUP_CONCAT(DISTINCT ex.id || ':' || ex.example || ':' || COALESCE(ex.ref, ''), '|') as examples_data, + GROUP_CONCAT(DISTINCT d.id || ':' || d.type || ':' || d.text, '|') as derivation_data, + GROUP_CONCAT(DISTINCT wc.category, '|') as categories_data + FROM senses s + LEFT JOIN subsenses ss ON ss.sid = s.id + LEFT JOIN examples ex ON ex.sid = s.id + LEFT JOIN derivation d ON d.sid = s.id + LEFT JOIN word_categories wc ON wc.word_id = s.id + GROUP BY s.id + ) + SELECT e.*, + (SELECT + json_group_array(json_object( + 'id', sd.id, + 'pos', sd.pos, + 'etymology', sd.etymology, + 'confidence', sd.confidence, + 'subsenses_data', sd.subsenses_data, + 'examples_data', sd.examples_data, + 'derivation_data', sd.derivation_data, + 'categories_data', sd.categories_data + )) + FROM sense_data sd + WHERE sd.parent_id = e.id + ) as senses_array + FROM expressions e + WHERE e.spelling = ${spelling} AND e.lang = ${lang} + ORDER BY e.frequency DESC`; + + return rows; } - fetchWordsByToneAndSyls1(tones: Array) { + + // Tones and syls + async fetchWordsByToneAndSyls1(tones: Array) { const toneString = tones .reduce((acc: string, item) => { if (!item) return `${acc},%`; else return `${acc},${item}`; }, "") .slice(1); - const query = this.db.query( - ` + const data = await this.db` WITH word_tone_sequences AS ( SELECT wp.ipa, @@ -48,21 +251,19 @@ export class Queries { ) SELECT * FROM word_tone_sequences - WHERE tone_sequence LIKE ? - AND syllable_count = ? - `, - ); - return query.all(toneString, tones.length) as any[]; + WHERE tone_sequence LIKE ${toneString} + AND syllable_count = ${tones.length} + `; + return data; } - fetchWordsByToneAndSylsO(tones: Array) { + async fetchWordsByToneAndSylsO(tones: Array) { const toneString = tones .reduce((acc: string, item) => { if (!item) return `${acc},%`; else return `${acc},${item}`; }, "") .slice(1); - const query = this.db.query( - ` + const data = await this.db` WITH word_tone_sequences AS ( SELECT w.id as word_id, @@ -87,25 +288,25 @@ export class Queries { tone_sequence, syllable_count FROM word_tone_sequences - WHERE tone_sequence LIKE ? - AND syllable_count = ? + WHERE tone_sequence LIKE ${toneString} + AND syllable_count = ${tones.length} ORDER BY frequency ASC NULLS LAST; - `, - ); + `; // TODO combine with this old query to get both phonetic and semantic data // Thing is the tables have changed, used to be that the senses table had JSONB columns for senses, forms and related but those are now proper tables now with one-to-many relation to the senses table - return query.all(toneString, tones.length) as any[]; + return data; } - fetchWordsByToneAndSyls(tones: Array): FullWordDataDB[] { + async fetchWordsByToneAndSyls( + tones: Array, + ): Promise { const toneString = tones .reduce((acc: string, item) => { if (!item) return `${acc},%`; else return `${acc},${item}`; }, "") .slice(1); - const query = this.db.query( - ` + const data = await this.db` WITH word_tone_sequences AS ( SELECT w.id as word_id, @@ -161,54 +362,14 @@ export class Queries { ) SELECT * FROM word_tone_sequences - WHERE tone_sequence LIKE ? - AND syllable_count = ? + WHERE tone_sequence LIKE ${toneString} + AND syllable_count = ${tones.length} ORDER BY frequency ASC NULLS LAST; - `, - ); + `; // TODO combine with this old query to get both phonetic and semantic data // Thing is the tables have changed, used to be that the senses table had JSONB columns for senses, forms and related but those are now proper tables now with one-to-many relation to the senses table - return query.all(toneString, tones.length) as any[]; - } - // TODO combine with this old query to get both phonetic and semantic data - // Thing is the tables have changed, used to be that the senses table had JSONB columns for senses, forms and related but those are now proper tables now with one-to-many relation to the senses table - - fetchSenses(spelling: string, lang: string) { - const query = this.db.query(` - WITH sense_data AS ( - SELECT - s.*, - GROUP_CONCAT(DISTINCT ss.id || ':' || ss.gloss, '|') as subsenses_data, - GROUP_CONCAT(DISTINCT ex.id || ':' || ex.example || ':' || COALESCE(ex.ref, ''), '|') as examples_data, - GROUP_CONCAT(DISTINCT d.id || ':' || d.type || ':' || d.text, '|') as derivation_data, - GROUP_CONCAT(DISTINCT wc.category, '|') as categories_data - FROM senses s - LEFT JOIN subsenses ss ON ss.sid = s.id - LEFT JOIN examples ex ON ex.sid = s.id - LEFT JOIN derivation d ON d.sid = s.id - LEFT JOIN word_categories wc ON wc.word_id = s.id - GROUP BY s.id - ) - SELECT e.*, - (SELECT - json_group_array(json_object( - 'id', sd.id, - 'pos', sd.pos, - 'etymology', sd.etymology, - 'confidence', sd.confidence, - 'subsenses_data', sd.subsenses_data, - 'examples_data', sd.examples_data, - 'derivation_data', sd.derivation_data, - 'categories_data', sd.categories_data - )) - FROM sense_data sd - WHERE sd.parent_id = e.id - ) as senses_array - FROM expressions e - WHERE e.spelling = ? AND e.lang = ? - ORDER BY e.frequency DESC`); - return query.all(spelling, lang); + return data; } } export default Queries; diff --git a/packages/db/src/schema.sql b/packages/db/src/schema.sql index c4a7c76..85d0d77 100644 --- a/packages/db/src/schema.sql +++ b/packages/db/src/schema.sql @@ -1,82 +1,11 @@ -/** - * UNIFIED DATABASE SCHEMA FOR THAI LANGUAGE LEARNING APPLICATION - * - * This file consolidates 3 redundant database schemas: - * - schema.sql (main schema with courses and SRS) - * - prosodyschema.sql (phonetic analysis with syllable breakdown) - * - senseschema.sql (semantic analysis with subsenses and derivations) - * - * REDUNDANCY ANALYSIS: - * ==================== - * - ** MAJOR CONFLICTS RESOLVED: - * 1. Languages table: - * - schema.sql: uses 'code' as PRIMARY KEY - * - prosodyschema.sql: uses 'iso6392' as PRIMARY KEY - * - RESOLVED: Unified with both code systems supported - * - * 2. Senses table: - * - schema.sql: includes ipa/prosody JSONB fields - * - senseschema.sql: missing phonetic fields - * - RESOLVED: Enhanced version with all fields - * - * 3. Word/Expression entities: - * - schema.sql: uses 'expressions' table - * - prosodyschema.sql: uses 'words' table - * - RESOLVED: Standardized on 'expressions' terminology - * - * 4. Categories tables: - * - EXACT DUPLICATES in schema.sql and senseschema.sql - * - RESOLVED: Keep one instance, remove duplicate - * - * UNIQUE FEATURES PRESERVED: - * ======================== - * - * FROM schema.sql (Main Course Learning): - * - User management: users, cookies - * - Course structure: lessons, cards, cards_lessons, cards_expressions - * - SRS tracking: user_progress, attempts - * - Bookmarks and user features - * - * FROM prosodyschema.sql (Phonetic Analysis): - * - Detailed syllable breakdown: tones, onsets, medials, nucleus, codas, rhymes - * - Phonetic patterns: word_phonetics, syllables_words - * - Rhyme analysis: word_rhymes, words_wrhymes - * - * FROM senseschema.sql (Semantic Analysis): - * - Detailed definitions: subsenses - * - Etymology tracking: derivation - * - * FOREIGN KEY RELATIONSHIPS: - * ========================= - * - Updated all references from 'words.id' to 'expressions.id' - * - Unified language references to use languages.code - * - Maintained proper cascade relationships - * - * BENEFITS OF UNIFICATION: - * ======================= - * 1. Single source of truth for database structure - * 2. Eliminated conflicting table definitions - * 3. Preserved all unique functionality - * 4. Improved foreign key consistency - * 5. Better support for comprehensive Thai language analysis - */ --- Enable foreign key support and performance optimizations PRAGMA foreign_keys = ON; PRAGMA journal_mode = WAL; PRAGMA cache_size = -2000; PRAGMA mmap_size = 30000000000; +PRAGMA busy_timeout = 5000; + -/** - * UNIFIED LANGUAGES TABLE - * - * RESOLVES CONFLICT between: - * - schema.sql: code (PRIMARY KEY), name, native_name - * - prosodyschema.sql: iso6392 (PRIMARY KEY), english - * - * NOW SUPPORTS multiple language code systems for maximum compatibility - */ CREATE TABLE IF NOT EXISTS languages ( code TEXT PRIMARY KEY, -- Primary language code (ISO 639-1 preferred) name TEXT NOT NULL, -- English name @@ -85,14 +14,7 @@ CREATE TABLE IF NOT EXISTS languages ( CONSTRAINT name_unique UNIQUE (name) ); -/** - * CORE CONTENT TABLES - * Standardized on 'expressions' terminology from schema.sql - * Enhanced with fields from both schemas - */ --- Main expressions table (formerly 'words' in prosodyschema.sql) --- UNIFIED from schema.sql expressions and prosodyschema.sql words CREATE TABLE IF NOT EXISTS expressions ( id INTEGER PRIMARY KEY AUTOINCREMENT, spelling TEXT NOT NULL, @@ -104,8 +26,6 @@ CREATE TABLE IF NOT EXISTS expressions ( CONSTRAINT spell_unique UNIQUE (spelling, lang) ); --- Enhanced senses table with phonetic capabilities --- MERGED from schema.sql senses (with ipa/prosody) and senseschema.sql senses CREATE TABLE IF NOT EXISTS senses ( id INTEGER PRIMARY KEY AUTOINCREMENT, parent_id INTEGER NOT NULL, @@ -153,14 +73,6 @@ CREATE TABLE IF NOT EXISTS derivation ( FOREIGN KEY (sid) REFERENCES senses(id) ON DELETE CASCADE ); -/** - * PHONETIC ANALYSIS TABLES (FROM prosodyschema.sql) - * - * Comprehensive syllable breakdown system for Thai language analysis - * These tables provide detailed phonetic analysis beyond basic JSONB fields - */ - - -- Syllable component tables CREATE TABLE IF NOT EXISTS tones ( id INTEGER PRIMARY KEY AUTOINCREMENT, @@ -252,10 +164,11 @@ CREATE TABLE IF NOT EXISTS word_phonetics ( syllable_sequence TEXT NOT NULL, -- Comma-separated syllables stressed INTEGER, -- index of stressed syllable tone_sequence TEXT, -- Comma-separated tones - tag JSONB, -- Pattern/usage tag + tags TEXT, -- Usually dialect, array from wikisource, we join it with "/-/" notes TEXT, FOREIGN KEY (word_id) REFERENCES expressions(id), - FOREIGN KEY (sense_id) REFERENCES senses(id) + FOREIGN KEY (sense_id) REFERENCES senses(id), + CONSTRAINT phonetics_unique UNIQUE (ipa, tags) ); -- Rhyme analysis tables (UNIQUE to prosodyschema.sql) diff --git a/packages/db/src/server.ts b/packages/db/src/server.ts new file mode 100644 index 0000000..6479b7f --- /dev/null +++ b/packages/db/src/server.ts @@ -0,0 +1,66 @@ +import DB from "."; + +const db = new DB(); +const ready = db.init(); + +type Route = Partial< + Record< + Bun.Serve.HTTPMethod, + Bun.Serve.Handler, Response> + > +>; +export const handler: Route = { + async GET(req: Bun.BunRequest) { + await ready; + if (!db.ready) return Response.json({ error: "DB failed to initialize" }); + // const db = new DB(); + console.log("Handling HTTP Request on DB", req.url); + + try { + const url = new URL(req.url); + const params = url.searchParams; + const word = params.get("word"); + const lang = params.get("lang"); + if (!word) return Response.json({ error: "word param is required" }); + if (!lang) return Response.json({ error: "lang param is required" }); + const row = await db.fetchExpressionBySpelling(word, lang); + if (!row) return Response.json({ error: "No data found" }); + else return Response.json({ ok: row }); + } catch (e) { + return Response.json({ error: `${e}` }); + } + }, + async POST(req: Bun.BunRequest) { + await ready; + if (!db.ready) return Response.json({ error: "DB failed to initialize" }); + // const db = new DB(); + console.log("Handling HTTP Request on DB", req.url); + + try { + const reqBody = (await req.json()) as RequestBody; + if (!reqBody) return Response.json({ error: "No request body" }); + const returnData = + "getWordFull" in reqBody + ? db.fetchExpressionBySpelling( + reqBody.getWordFull.spelling, + reqBody.getWordFull.lang, + ) + : null; + const row = await returnData; + console.log({ row }); + if (!returnData || !row) return Response.json({ error: "No data found" }); + else return Response.json({ ok: row }); + } catch (e) { + return Response.json({ error: `${e}` }); + } + }, +}; + +type BySpelling = { spelling: string; lang: string }; +type ByLanguage = string; // iso6393 +type RequestBody = + | { getWordFull: BySpelling } + | { getWordsenses: BySpelling } + | { getWordPhonetics: BySpelling } + | { getLanguages: null } + | { getExpressions: ByLanguage }; -- cgit v1.2.3