diff options
Diffstat (limited to 'packages/db/src/index.ts')
| -rw-r--r-- | packages/db/src/index.ts | 327 |
1 files changed, 244 insertions, 83 deletions
diff --git a/packages/db/src/index.ts b/packages/db/src/index.ts index 6a89ee2..1425a43 100644 --- a/packages/db/src/index.ts +++ b/packages/db/src/index.ts @@ -1,39 +1,242 @@ -import { Database } from "bun:sqlite"; -import type { FullWordDataDB } from "./types"; +import { SQL } from "bun"; import { SRSQueries } from "./srs"; +// NEW API +// +// https://bun.com/docs/runtime/sql +// import { sql } from "bun"; + +// // Basic insert with direct values +// const [user] = await sql` +// INSERT INTO users (name, email) +// VALUES (${name}, ${email}) +// RETURNING * +// `; + +// // Using object helper for cleaner syntax +// const userData = { +// name: "Alice", +// email: "alice@example.com", +// }; + +// const [newUser] = await sql` +// INSERT INTO users ${sql(userData)} +// RETURNING * +// `; +// // Expands to: INSERT INTO users (name, email) VALUES ('Alice', 'alice@example.com') + +const DB_PATH = "/home/y/code/bun/sorlang/bulkdata/db.db"; +import type { FullWordDataDB } from "@sortug/langlib"; export class Queries { - db: Database; - srs: SRSQueries; + db: SQL; + ready = false; + // srs: SRSQueries; constructor() { - const db = new Database("/home/y/code/bun/sorlang/bulkdata/unified.db"); - db.exec("PRAGMA journal_mode = WAL"); // Enable Write-Ahead Logging for better performance - db.exec("PRAGMA foreign_keys = ON"); - db.exec("PRAGMA cache_size = -8000"); // Increase cache size to 8MB - db.exec("PRAGMA mmap_size = 30000000000"); - db.exec("PRAGMA temp_store = MEMORY"); // Store temp tables in memory - db.exec("PRAGMA synchronous = NORMAL"); // Slightly less safe but faster - + console.log("setting sorlang DB"); + const db = new SQL({ + adapter: "sqlite", + filename: DB_PATH, + create: false, + }); this.db = db; - this.srs = new SRSQueries(db); + // this.srs = new SRSQueries(db); + } + async init() { + try { + await this.db`PRAGMA journal_mode = WAL`; // Enable Write-Ahead Logging for better performance + await this.db`PRAGMA foreign_keys = ON`; + await this.db`PRAGMA cache_size = -8000`; // Increase cache size to 8MB + await this.db`PRAGMA mmap_size = 30000000000`; + await this.db`PRAGMA temp_store = MEMORY`; // Store temp tables in memory + await this.db`PRAGMA synchronous = NORMAL`; // Slightly less safe but faster + this.ready = true; + } catch (e) { + console.error("error starting db", e); + } + } + + async fetchExpressionById(id: number) { + const query = await this.db` + SELECT * FROM expressions WHERE id = ${id} + `; + return query; + } + // TODO word_phonetics is MANY TO ONE vs expressions, so it's possible for word_phonetics to lack entries for syllables + async fetchExpressionBySpelling( + spelling: string, + lang: string, + ): Promise<FullWordDataDB | null> { + const data = await this.db` +SELECT + json_object( + 'id', e.id, + 'spelling', e.spelling, + 'frequency', e.frequency, + 'phonetic', ( + SELECT json_object( + 'id', wp.id, + 'tone_sequence', wp.tone_sequence, + 'syl_seq', wp.syllable_sequence, + 'syllable_count', wp.syllable_count, + 'ipa', wp.ipa, + 'syllables', ( + SELECT json_group_array( + json_object( + 'stressed', CASE WHEN sw.stressed = 1 THEN json('true') ELSE json('false') END, + 'long', CASE WHEN s.long = 1 THEN json('true') ELSE json('false') END, + 'spelling', s.text, + 'ipa', s.ipa, + 'onset', ( + SELECT json_object('ipa', o.ipa, 'spelling', o.text) + FROM onsets o WHERE o.id = s.onset + ), + 'nucleus', ( + SELECT json_object('ipa', n.ipa, 'spelling', n.text) + FROM nucleus n WHERE n.id = s.nucleus + ), + 'medial', ( + SELECT json_object('ipa', m.ipa, 'spelling', m.text) + FROM medials m WHERE m.id = s.medial + ), + 'coda', ( + SELECT json_object('ipa', c.ipa, 'spelling', c.text) + FROM codas c WHERE c.id = s.coda + ), + 'rhyme', ( + SELECT json_object('ipa', r.ipa, 'spelling', r.text) + FROM rhymes r WHERE r.id = s.rhyme + ), + 'tone', ( + SELECT json_object('letters', t.ipa, 'numbers', t.nums, 'name', t.name) + FROM tones t WHERE t.id = s.tone + ) + ) + ) + FROM syllables_words sw + JOIN syllables s ON sw.syl_id = s.id + WHERE sw.word_id = wp.id + ORDER BY sw.idx ASC + ) + ) + FROM word_phonetics wp + WHERE wp.word_id = e.id + -- Select the most general pronunciation (sense_id IS NULL) or the first available + ORDER BY (wp.sense_id IS NULL) DESC, wp.id ASC + LIMIT 1 + ), + 'senses', ( + SELECT json_group_array( + json_object( + 'id', sn.id, + 'confidence', sn.confidence, + 'etymology', sn.etymology, + 'pos', sn.pos, + 'glosses', ( + SELECT json_group_array(sub.gloss) + FROM subsenses sub + WHERE sub.sid = sn.id + ), + 'examples', ( + SELECT json_group_array( + json_object( + 'example', ex.example, + 'ref', ex.ref + ) + ) + FROM examples ex + WHERE ex.sid = sn.id + ), + 'categories', ( + SELECT json_group_array(wc.category) + FROM word_categories wc + WHERE wc.word_id = sn.id + ), + 'derivation', ( + SELECT json_group_array( + json_object( + 'type', d.type, + 'text', d.text, + 'tags', json(d.tags) + ) + ) + FROM derivation d + WHERE d.sid = sn.id + ) + ) + ) + FROM senses sn + WHERE sn.parent_id = e.id + ) + ) AS full_word_data +FROM expressions e +WHERE e.spelling = ${spelling} AND e.lang = ${lang}; + `; + + if (data.length > 1) { + console.log({ spelling, lang, data }); + throw new Error("more rows than 1 wtf"); + } + if (data.length === 0) { + console.log({ spelling, lang, data }); + return null; + } + const row = data[0]; + const json = JSON.parse(row.full_word_data); + const phonetic = JSON.parse(json.phonetic); + const obj = { ...json, phonetic }; + return obj; } - fetchExpressionById(id: number) { - const query = this.db.query( - ` - SELECT * FROM expressions WHERE id = ? - `, - ); - return query.get(id) as any; + + // TODO combine with this old query to get both phonetic and semantic data + // Thing is the tables have changed, used to be that the senses table had JSONB columns for senses, forms and related but those are now proper tables now with one-to-many relation to the senses table + + async fetchSenses(spelling: string, lang: string) { + const rows = await this.db` + WITH sense_data AS ( + SELECT + s.*, + GROUP_CONCAT(DISTINCT ss.id || ':' || ss.gloss, '|') as subsenses_data, + GROUP_CONCAT(DISTINCT ex.id || ':' || ex.example || ':' || COALESCE(ex.ref, ''), '|') as examples_data, + GROUP_CONCAT(DISTINCT d.id || ':' || d.type || ':' || d.text, '|') as derivation_data, + GROUP_CONCAT(DISTINCT wc.category, '|') as categories_data + FROM senses s + LEFT JOIN subsenses ss ON ss.sid = s.id + LEFT JOIN examples ex ON ex.sid = s.id + LEFT JOIN derivation d ON d.sid = s.id + LEFT JOIN word_categories wc ON wc.word_id = s.id + GROUP BY s.id + ) + SELECT e.*, + (SELECT + json_group_array(json_object( + 'id', sd.id, + 'pos', sd.pos, + 'etymology', sd.etymology, + 'confidence', sd.confidence, + 'subsenses_data', sd.subsenses_data, + 'examples_data', sd.examples_data, + 'derivation_data', sd.derivation_data, + 'categories_data', sd.categories_data + )) + FROM sense_data sd + WHERE sd.parent_id = e.id + ) as senses_array + FROM expressions e + WHERE e.spelling = ${spelling} AND e.lang = ${lang} + ORDER BY e.frequency DESC`; + + return rows; } - fetchWordsByToneAndSyls1(tones: Array<string | null>) { + + // Tones and syls + async fetchWordsByToneAndSyls1(tones: Array<string | null>) { const toneString = tones .reduce((acc: string, item) => { if (!item) return `${acc},%`; else return `${acc},${item}`; }, "") .slice(1); - const query = this.db.query( - ` + const data = await this.db` WITH word_tone_sequences AS ( SELECT wp.ipa, @@ -48,21 +251,19 @@ export class Queries { ) SELECT * FROM word_tone_sequences - WHERE tone_sequence LIKE ? - AND syllable_count = ? - `, - ); - return query.all(toneString, tones.length) as any[]; + WHERE tone_sequence LIKE ${toneString} + AND syllable_count = ${tones.length} + `; + return data; } - fetchWordsByToneAndSylsO(tones: Array<string | null>) { + async fetchWordsByToneAndSylsO(tones: Array<string | null>) { const toneString = tones .reduce((acc: string, item) => { if (!item) return `${acc},%`; else return `${acc},${item}`; }, "") .slice(1); - const query = this.db.query( - ` + const data = await this.db` WITH word_tone_sequences AS ( SELECT w.id as word_id, @@ -87,25 +288,25 @@ export class Queries { tone_sequence, syllable_count FROM word_tone_sequences - WHERE tone_sequence LIKE ? - AND syllable_count = ? + WHERE tone_sequence LIKE ${toneString} + AND syllable_count = ${tones.length} ORDER BY frequency ASC NULLS LAST; - `, - ); + `; // TODO combine with this old query to get both phonetic and semantic data // Thing is the tables have changed, used to be that the senses table had JSONB columns for senses, forms and related but those are now proper tables now with one-to-many relation to the senses table - return query.all(toneString, tones.length) as any[]; + return data; } - fetchWordsByToneAndSyls(tones: Array<string | null>): FullWordDataDB[] { + async fetchWordsByToneAndSyls( + tones: Array<string | null>, + ): Promise<FullWordDataDB[]> { const toneString = tones .reduce((acc: string, item) => { if (!item) return `${acc},%`; else return `${acc},${item}`; }, "") .slice(1); - const query = this.db.query( - ` + const data = await this.db` WITH word_tone_sequences AS ( SELECT w.id as word_id, @@ -161,54 +362,14 @@ export class Queries { ) SELECT * FROM word_tone_sequences - WHERE tone_sequence LIKE ? - AND syllable_count = ? + WHERE tone_sequence LIKE ${toneString} + AND syllable_count = ${tones.length} ORDER BY frequency ASC NULLS LAST; - `, - ); + `; // TODO combine with this old query to get both phonetic and semantic data // Thing is the tables have changed, used to be that the senses table had JSONB columns for senses, forms and related but those are now proper tables now with one-to-many relation to the senses table - return query.all(toneString, tones.length) as any[]; - } - // TODO combine with this old query to get both phonetic and semantic data - // Thing is the tables have changed, used to be that the senses table had JSONB columns for senses, forms and related but those are now proper tables now with one-to-many relation to the senses table - - fetchSenses(spelling: string, lang: string) { - const query = this.db.query(` - WITH sense_data AS ( - SELECT - s.*, - GROUP_CONCAT(DISTINCT ss.id || ':' || ss.gloss, '|') as subsenses_data, - GROUP_CONCAT(DISTINCT ex.id || ':' || ex.example || ':' || COALESCE(ex.ref, ''), '|') as examples_data, - GROUP_CONCAT(DISTINCT d.id || ':' || d.type || ':' || d.text, '|') as derivation_data, - GROUP_CONCAT(DISTINCT wc.category, '|') as categories_data - FROM senses s - LEFT JOIN subsenses ss ON ss.sid = s.id - LEFT JOIN examples ex ON ex.sid = s.id - LEFT JOIN derivation d ON d.sid = s.id - LEFT JOIN word_categories wc ON wc.word_id = s.id - GROUP BY s.id - ) - SELECT e.*, - (SELECT - json_group_array(json_object( - 'id', sd.id, - 'pos', sd.pos, - 'etymology', sd.etymology, - 'confidence', sd.confidence, - 'subsenses_data', sd.subsenses_data, - 'examples_data', sd.examples_data, - 'derivation_data', sd.derivation_data, - 'categories_data', sd.categories_data - )) - FROM sense_data sd - WHERE sd.parent_id = e.id - ) as senses_array - FROM expressions e - WHERE e.spelling = ? AND e.lang = ? - ORDER BY e.frequency DESC`); - return query.all(spelling, lang); + return data; } } export default Queries; |
