import { Database } from "bun:sqlite"; import type { FullWordDataDB } from "./types"; import { SRSQueries } from "./srs"; export class Queries { db: Database; srs: SRSQueries; constructor() { const db = new Database("/home/y/code/bun/sorlang/bulkdata/unified.db"); db.exec("PRAGMA journal_mode = WAL"); // Enable Write-Ahead Logging for better performance db.exec("PRAGMA foreign_keys = ON"); db.exec("PRAGMA cache_size = -8000"); // Increase cache size to 8MB db.exec("PRAGMA mmap_size = 30000000000"); db.exec("PRAGMA temp_store = MEMORY"); // Store temp tables in memory db.exec("PRAGMA synchronous = NORMAL"); // Slightly less safe but faster this.db = db; this.srs = new SRSQueries(db); } fetchExpressionById(id: number) { const query = this.db.query( ` SELECT * FROM expressions WHERE id = ? `, ); return query.get(id) as any; } fetchWordsByToneAndSyls1(tones: Array) { const toneString = tones .reduce((acc: string, item) => { if (!item) return `${acc},%`; else return `${acc},${item}`; }, "") .slice(1); const query = this.db.query( ` WITH word_tone_sequences AS ( SELECT wp.ipa, GROUP_CONCAT(sy.text ORDER BY sw.idx) as syl_seq, GROUP_CONCAT(t.name ORDER BY sw.idx) as tone_sequence, COUNT(sw.syl_id) as syllable_count FROM word_phonetics wp JOIN syllables_words sw ON wp.id = sw.word_id JOIN syllables sy ON sw.syl_id = sy.id JOIN tones t ON sy.tone = t.id GROUP BY wp.ipa ) SELECT * FROM word_tone_sequences WHERE tone_sequence LIKE ? AND syllable_count = ? `, ); return query.all(toneString, tones.length) as any[]; } fetchWordsByToneAndSylsO(tones: Array) { const toneString = tones .reduce((acc: string, item) => { if (!item) return `${acc},%`; else return `${acc},${item}`; }, "") .slice(1); const query = this.db.query( ` WITH word_tone_sequences AS ( SELECT w.id as word_id, w.spelling, wp.ipa, w.frequency, GROUP_CONCAT(sy.text ORDER BY sw.idx) as syl_seq, GROUP_CONCAT(t.name ORDER BY sw.idx) as tone_sequence, COUNT(sw.syl_id) as syllable_count FROM expressions w JOIN word_phonetics wp ON w.id = wp.word_id JOIN syllables_words sw ON wp.id = sw.word_id JOIN syllables sy ON sw.syl_id = sy.id JOIN tones t ON sy.tone = t.id GROUP BY w.id, w.spelling, w.lang, w.frequency ) SELECT word_id, spelling, ipa, frequency, syl_seq, tone_sequence, syllable_count FROM word_tone_sequences WHERE tone_sequence LIKE ? AND syllable_count = ? ORDER BY frequency ASC NULLS LAST; `, ); // TODO combine with this old query to get both phonetic and semantic data // Thing is the tables have changed, used to be that the senses table had JSONB columns for senses, forms and related but those are now proper tables now with one-to-many relation to the senses table return query.all(toneString, tones.length) as any[]; } fetchWordsByToneAndSyls(tones: Array): FullWordDataDB[] { const toneString = tones .reduce((acc: string, item) => { if (!item) return `${acc},%`; else return `${acc},${item}`; }, "") .slice(1); const query = this.db.query( ` WITH word_tone_sequences AS ( SELECT w.id as word_id, w.spelling, wp.ipa, w.frequency, GROUP_CONCAT(sy.text ORDER BY sw.idx) as syl_seq, GROUP_CONCAT(t.name ORDER BY sw.idx) as tone_sequence, COUNT(sw.syl_id) as syllable_count, (SELECT json_group_array(json_object( 'id', s.id, 'pos', s.pos, 'etymology', s.etymology, 'confidence', s.confidence, 'glosses', ( SELECT json_group_array(ss.gloss) FROM subsenses ss WHERE ss.sid = s.id ), 'examples', ( SELECT json_group_array(json_object( 'example', ex.example, 'ref', ex.ref )) FROM examples ex WHERE ex.sid = s.id ), 'derivation', ( SELECT json_group_array(json_object( 'type', d.type, 'text', d.text, 'tags', d.tags )) FROM derivation d WHERE d.sid = s.id ), 'categories', ( SELECT json_group_array(wc.category) FROM word_categories wc WHERE wc.word_id = s.id ) )) FROM senses s WHERE s.parent_id = w.id ) as senses_array FROM expressions w JOIN word_phonetics wp ON w.id = wp.word_id JOIN syllables_words sw ON wp.id = sw.word_id JOIN syllables sy ON sw.syl_id = sy.id JOIN tones t ON sy.tone = t.id GROUP BY w.id, w.spelling, w.lang, w.frequency ) SELECT * FROM word_tone_sequences WHERE tone_sequence LIKE ? AND syllable_count = ? ORDER BY frequency ASC NULLS LAST; `, ); // TODO combine with this old query to get both phonetic and semantic data // Thing is the tables have changed, used to be that the senses table had JSONB columns for senses, forms and related but those are now proper tables now with one-to-many relation to the senses table return query.all(toneString, tones.length) as any[]; } // TODO combine with this old query to get both phonetic and semantic data // Thing is the tables have changed, used to be that the senses table had JSONB columns for senses, forms and related but those are now proper tables now with one-to-many relation to the senses table fetchSenses(spelling: string, lang: string) { const query = this.db.query(` WITH sense_data AS ( SELECT s.*, GROUP_CONCAT(DISTINCT ss.id || ':' || ss.gloss, '|') as subsenses_data, GROUP_CONCAT(DISTINCT ex.id || ':' || ex.example || ':' || COALESCE(ex.ref, ''), '|') as examples_data, GROUP_CONCAT(DISTINCT d.id || ':' || d.type || ':' || d.text, '|') as derivation_data, GROUP_CONCAT(DISTINCT wc.category, '|') as categories_data FROM senses s LEFT JOIN subsenses ss ON ss.sid = s.id LEFT JOIN examples ex ON ex.sid = s.id LEFT JOIN derivation d ON d.sid = s.id LEFT JOIN word_categories wc ON wc.word_id = s.id GROUP BY s.id ) SELECT e.*, (SELECT json_group_array(json_object( 'id', sd.id, 'pos', sd.pos, 'etymology', sd.etymology, 'confidence', sd.confidence, 'subsenses_data', sd.subsenses_data, 'examples_data', sd.examples_data, 'derivation_data', sd.derivation_data, 'categories_data', sd.categories_data )) FROM sense_data sd WHERE sd.parent_id = e.id ) as senses_array FROM expressions e WHERE e.spelling = ? AND e.lang = ? ORDER BY e.frequency DESC`); return query.all(spelling, lang); } } export default Queries; // ` // WITH word_tone_sequences AS ( // SELECT // w.id as word_id, // w.spelling, // wp.ipa, // w.frequency, // s.etymology, // s.pos, // GROUP_CONCAT(s.text ORDER BY sw.idx) as syl_seq, // GROUP_CONCAT(t.name ORDER BY sw.idx) as tone_sequence, // // GROUP_CONCAT(t.name ORDER BY s.id) as senses, // // GROUP_CONCAT(ex.example ORDER BY s.id) as examples, // // GROUP_CONCAT(cat.category) as tags, // COUNT(sw.syl_id) as syllable_count // FROM expressions w // JOIN word_phonetics wp ON w.id = wp.word_id // JOIN syllables_words sw ON wp.id = sw.word_id // JOIN syllables s ON sw.syl_id = s.id // JOIN tones t ON s.tone = t.id // JOIN senses s ON s.parent_id = w.id // JOIN word_categories cat ON s.id = cat.word_id // JOIN subsenses ss ON ss.sid = s.id // JOIN examples ex ON ex.sid = s.id // GROUP BY w.id, w.spelling, w.lang, w.frequency // ) // SELECT // word_id, // spelling, // ipa, // frequency, // syl_seq, // tone_sequence, // syllable_count // FROM word_tone_sequences // WHERE tone_sequence LIKE ? // AND syllable_count = ? // ORDER BY frequency ASC NULLS LAST; // `