import { SQL } from "bun"; import { SRSQueries } from "./srs"; // NEW API // // https://bun.com/docs/runtime/sql // import { sql } from "bun"; // // Basic insert with direct values // const [user] = await sql` // INSERT INTO users (name, email) // VALUES (${name}, ${email}) // RETURNING * // `; // // Using object helper for cleaner syntax // const userData = { // name: "Alice", // email: "alice@example.com", // }; // const [newUser] = await sql` // INSERT INTO users ${sql(userData)} // RETURNING * // `; // // Expands to: INSERT INTO users (name, email) VALUES ('Alice', 'alice@example.com') const DB_PATH = "/home/y/code/bun/sorlang/bulkdata/db.db"; import type { FullWordDataDB } from "@sortug/langlib"; export class Queries { db: SQL; ready = false; // srs: SRSQueries; constructor() { console.log("setting sorlang DB"); const db = new SQL({ adapter: "sqlite", filename: DB_PATH, create: false, }); this.db = db; // this.srs = new SRSQueries(db); } async init() { try { await this.db`PRAGMA journal_mode = WAL`; // Enable Write-Ahead Logging for better performance await this.db`PRAGMA foreign_keys = ON`; await this.db`PRAGMA cache_size = -8000`; // Increase cache size to 8MB await this.db`PRAGMA mmap_size = 30000000000`; await this.db`PRAGMA temp_store = MEMORY`; // Store temp tables in memory await this.db`PRAGMA synchronous = NORMAL`; // Slightly less safe but faster this.ready = true; } catch (e) { console.error("error starting db", e); } } async fetchExpressionById(id: number) { const query = await this.db` SELECT * FROM expressions WHERE id = ${id} `; return query; } // TODO word_phonetics is MANY TO ONE vs expressions, so it's possible for word_phonetics to lack entries for syllables async fetchExpressionBySpelling( spelling: string, lang: string, ): Promise { const data = await this.db` SELECT json_object( 'id', e.id, 'spelling', e.spelling, 'frequency', e.frequency, 'phonetic', ( SELECT json_object( 'id', wp.id, 'tone_sequence', wp.tone_sequence, 'syl_seq', wp.syllable_sequence, 'syllable_count', wp.syllable_count, 'ipa', wp.ipa, 'syllables', ( SELECT json_group_array( json_object( 'stressed', CASE WHEN sw.stressed = 1 THEN json('true') ELSE json('false') END, 'long', CASE WHEN s.long = 1 THEN json('true') ELSE json('false') END, 'spelling', s.text, 'ipa', s.ipa, 'onset', ( SELECT json_object('ipa', o.ipa, 'spelling', o.text) FROM onsets o WHERE o.id = s.onset ), 'nucleus', ( SELECT json_object('ipa', n.ipa, 'spelling', n.text) FROM nucleus n WHERE n.id = s.nucleus ), 'medial', ( SELECT json_object('ipa', m.ipa, 'spelling', m.text) FROM medials m WHERE m.id = s.medial ), 'coda', ( SELECT json_object('ipa', c.ipa, 'spelling', c.text) FROM codas c WHERE c.id = s.coda ), 'rhyme', ( SELECT json_object('ipa', r.ipa, 'spelling', r.text) FROM rhymes r WHERE r.id = s.rhyme ), 'tone', ( SELECT json_object('letters', t.ipa, 'numbers', t.nums, 'name', t.name) FROM tones t WHERE t.id = s.tone ) ) ) FROM syllables_words sw JOIN syllables s ON sw.syl_id = s.id WHERE sw.word_id = wp.id ORDER BY sw.idx ASC ) ) FROM word_phonetics wp WHERE wp.word_id = e.id -- Select the most general pronunciation (sense_id IS NULL) or the first available ORDER BY (wp.sense_id IS NULL) DESC, wp.id ASC LIMIT 1 ), 'senses', ( SELECT json_group_array( json_object( 'id', sn.id, 'confidence', sn.confidence, 'etymology', sn.etymology, 'pos', sn.pos, 'glosses', ( SELECT json_group_array(sub.gloss) FROM subsenses sub WHERE sub.sid = sn.id ), 'examples', ( SELECT json_group_array( json_object( 'example', ex.example, 'ref', ex.ref ) ) FROM examples ex WHERE ex.sid = sn.id ), 'categories', ( SELECT json_group_array(wc.category) FROM word_categories wc WHERE wc.word_id = sn.id ), 'derivation', ( SELECT json_group_array( json_object( 'type', d.type, 'text', d.text, 'tags', json(d.tags) ) ) FROM derivation d WHERE d.sid = sn.id ) ) ) FROM senses sn WHERE sn.parent_id = e.id ) ) AS full_word_data FROM expressions e WHERE e.spelling = ${spelling} AND e.lang = ${lang}; `; if (data.length > 1) { console.log({ spelling, lang, data }); throw new Error("more rows than 1 wtf"); } if (data.length === 0) { console.log({ spelling, lang, data }); return null; } const row = data[0]; const json = JSON.parse(row.full_word_data); const phonetic = JSON.parse(json.phonetic); const obj = { ...json, phonetic }; return obj; } // TODO combine with this old query to get both phonetic and semantic data // Thing is the tables have changed, used to be that the senses table had JSONB columns for senses, forms and related but those are now proper tables now with one-to-many relation to the senses table async fetchSenses(spelling: string, lang: string) { const rows = await this.db` WITH sense_data AS ( SELECT s.*, GROUP_CONCAT(DISTINCT ss.id || ':' || ss.gloss, '|') as subsenses_data, GROUP_CONCAT(DISTINCT ex.id || ':' || ex.example || ':' || COALESCE(ex.ref, ''), '|') as examples_data, GROUP_CONCAT(DISTINCT d.id || ':' || d.type || ':' || d.text, '|') as derivation_data, GROUP_CONCAT(DISTINCT wc.category, '|') as categories_data FROM senses s LEFT JOIN subsenses ss ON ss.sid = s.id LEFT JOIN examples ex ON ex.sid = s.id LEFT JOIN derivation d ON d.sid = s.id LEFT JOIN word_categories wc ON wc.word_id = s.id GROUP BY s.id ) SELECT e.*, (SELECT json_group_array(json_object( 'id', sd.id, 'pos', sd.pos, 'etymology', sd.etymology, 'confidence', sd.confidence, 'subsenses_data', sd.subsenses_data, 'examples_data', sd.examples_data, 'derivation_data', sd.derivation_data, 'categories_data', sd.categories_data )) FROM sense_data sd WHERE sd.parent_id = e.id ) as senses_array FROM expressions e WHERE e.spelling = ${spelling} AND e.lang = ${lang} ORDER BY e.frequency DESC`; return rows; } // Tones and syls async fetchWordsByToneAndSyls1(tones: Array) { const toneString = tones .reduce((acc: string, item) => { if (!item) return `${acc},%`; else return `${acc},${item}`; }, "") .slice(1); const data = await this.db` WITH word_tone_sequences AS ( SELECT wp.ipa, GROUP_CONCAT(sy.text ORDER BY sw.idx) as syl_seq, GROUP_CONCAT(t.name ORDER BY sw.idx) as tone_sequence, COUNT(sw.syl_id) as syllable_count FROM word_phonetics wp JOIN syllables_words sw ON wp.id = sw.word_id JOIN syllables sy ON sw.syl_id = sy.id JOIN tones t ON sy.tone = t.id GROUP BY wp.ipa ) SELECT * FROM word_tone_sequences WHERE tone_sequence LIKE ${toneString} AND syllable_count = ${tones.length} `; return data; } async fetchWordsByToneAndSylsO(tones: Array) { const toneString = tones .reduce((acc: string, item) => { if (!item) return `${acc},%`; else return `${acc},${item}`; }, "") .slice(1); const data = await this.db` WITH word_tone_sequences AS ( SELECT w.id as word_id, w.spelling, wp.ipa, w.frequency, GROUP_CONCAT(sy.text ORDER BY sw.idx) as syl_seq, GROUP_CONCAT(t.name ORDER BY sw.idx) as tone_sequence, COUNT(sw.syl_id) as syllable_count FROM expressions w JOIN word_phonetics wp ON w.id = wp.word_id JOIN syllables_words sw ON wp.id = sw.word_id JOIN syllables sy ON sw.syl_id = sy.id JOIN tones t ON sy.tone = t.id GROUP BY w.id, w.spelling, w.lang, w.frequency ) SELECT word_id, spelling, ipa, frequency, syl_seq, tone_sequence, syllable_count FROM word_tone_sequences WHERE tone_sequence LIKE ${toneString} AND syllable_count = ${tones.length} ORDER BY frequency ASC NULLS LAST; `; // TODO combine with this old query to get both phonetic and semantic data // Thing is the tables have changed, used to be that the senses table had JSONB columns for senses, forms and related but those are now proper tables now with one-to-many relation to the senses table return data; } async fetchWordsByToneAndSyls( tones: Array, ): Promise { const toneString = tones .reduce((acc: string, item) => { if (!item) return `${acc},%`; else return `${acc},${item}`; }, "") .slice(1); const data = await this.db` WITH word_tone_sequences AS ( SELECT w.id as word_id, w.spelling, wp.ipa, w.frequency, GROUP_CONCAT(sy.text ORDER BY sw.idx) as syl_seq, GROUP_CONCAT(t.name ORDER BY sw.idx) as tone_sequence, COUNT(sw.syl_id) as syllable_count, (SELECT json_group_array(json_object( 'id', s.id, 'pos', s.pos, 'etymology', s.etymology, 'confidence', s.confidence, 'glosses', ( SELECT json_group_array(ss.gloss) FROM subsenses ss WHERE ss.sid = s.id ), 'examples', ( SELECT json_group_array(json_object( 'example', ex.example, 'ref', ex.ref )) FROM examples ex WHERE ex.sid = s.id ), 'derivation', ( SELECT json_group_array(json_object( 'type', d.type, 'text', d.text, 'tags', d.tags )) FROM derivation d WHERE d.sid = s.id ), 'categories', ( SELECT json_group_array(wc.category) FROM word_categories wc WHERE wc.word_id = s.id ) )) FROM senses s WHERE s.parent_id = w.id ) as senses_array FROM expressions w JOIN word_phonetics wp ON w.id = wp.word_id JOIN syllables_words sw ON wp.id = sw.word_id JOIN syllables sy ON sw.syl_id = sy.id JOIN tones t ON sy.tone = t.id GROUP BY w.id, w.spelling, w.lang, w.frequency ) SELECT * FROM word_tone_sequences WHERE tone_sequence LIKE ${toneString} AND syllable_count = ${tones.length} ORDER BY frequency ASC NULLS LAST; `; // TODO combine with this old query to get both phonetic and semantic data // Thing is the tables have changed, used to be that the senses table had JSONB columns for senses, forms and related but those are now proper tables now with one-to-many relation to the senses table return data; } } export default Queries; // ` // WITH word_tone_sequences AS ( // SELECT // w.id as word_id, // w.spelling, // wp.ipa, // w.frequency, // s.etymology, // s.pos, // GROUP_CONCAT(s.text ORDER BY sw.idx) as syl_seq, // GROUP_CONCAT(t.name ORDER BY sw.idx) as tone_sequence, // // GROUP_CONCAT(t.name ORDER BY s.id) as senses, // // GROUP_CONCAT(ex.example ORDER BY s.id) as examples, // // GROUP_CONCAT(cat.category) as tags, // COUNT(sw.syl_id) as syllable_count // FROM expressions w // JOIN word_phonetics wp ON w.id = wp.word_id // JOIN syllables_words sw ON wp.id = sw.word_id // JOIN syllables s ON sw.syl_id = s.id // JOIN tones t ON s.tone = t.id // JOIN senses s ON s.parent_id = w.id // JOIN word_categories cat ON s.id = cat.word_id // JOIN subsenses ss ON ss.sid = s.id // JOIN examples ex ON ex.sid = s.id // GROUP BY w.id, w.spelling, w.lang, w.frequency // ) // SELECT // word_id, // spelling, // ipa, // frequency, // syl_seq, // tone_sequence, // syllable_count // FROM word_tone_sequences // WHERE tone_sequence LIKE ? // AND syllable_count = ? // ORDER BY frequency ASC NULLS LAST; // `