diff options
Diffstat (limited to 'packages/db/src/index.ts')
| -rw-r--r-- | packages/db/src/index.ts | 253 |
1 files changed, 253 insertions, 0 deletions
diff --git a/packages/db/src/index.ts b/packages/db/src/index.ts new file mode 100644 index 0000000..6a89ee2 --- /dev/null +++ b/packages/db/src/index.ts @@ -0,0 +1,253 @@ +import { Database } from "bun:sqlite"; +import type { FullWordDataDB } from "./types"; +import { SRSQueries } from "./srs"; + +export class Queries { + db: Database; + srs: SRSQueries; + constructor() { + const db = new Database("/home/y/code/bun/sorlang/bulkdata/unified.db"); + db.exec("PRAGMA journal_mode = WAL"); // Enable Write-Ahead Logging for better performance + db.exec("PRAGMA foreign_keys = ON"); + db.exec("PRAGMA cache_size = -8000"); // Increase cache size to 8MB + db.exec("PRAGMA mmap_size = 30000000000"); + db.exec("PRAGMA temp_store = MEMORY"); // Store temp tables in memory + db.exec("PRAGMA synchronous = NORMAL"); // Slightly less safe but faster + + this.db = db; + this.srs = new SRSQueries(db); + } + fetchExpressionById(id: number) { + const query = this.db.query( + ` + SELECT * FROM expressions WHERE id = ? + `, + ); + return query.get(id) as any; + } + fetchWordsByToneAndSyls1(tones: Array<string | null>) { + const toneString = tones + .reduce((acc: string, item) => { + if (!item) return `${acc},%`; + else return `${acc},${item}`; + }, "") + .slice(1); + const query = this.db.query( + ` + WITH word_tone_sequences AS ( + SELECT + wp.ipa, + GROUP_CONCAT(sy.text ORDER BY sw.idx) as syl_seq, + GROUP_CONCAT(t.name ORDER BY sw.idx) as tone_sequence, + COUNT(sw.syl_id) as syllable_count + FROM word_phonetics wp + JOIN syllables_words sw ON wp.id = sw.word_id + JOIN syllables sy ON sw.syl_id = sy.id + JOIN tones t ON sy.tone = t.id + GROUP BY wp.ipa + ) + SELECT * + FROM word_tone_sequences + WHERE tone_sequence LIKE ? + AND syllable_count = ? + `, + ); + return query.all(toneString, tones.length) as any[]; + } + fetchWordsByToneAndSylsO(tones: Array<string | null>) { + const toneString = tones + .reduce((acc: string, item) => { + if (!item) return `${acc},%`; + else return `${acc},${item}`; + }, "") + .slice(1); + const query = this.db.query( + ` + WITH word_tone_sequences AS ( + SELECT + w.id as word_id, + w.spelling, + wp.ipa, + w.frequency, + GROUP_CONCAT(sy.text ORDER BY sw.idx) as syl_seq, + GROUP_CONCAT(t.name ORDER BY sw.idx) as tone_sequence, + COUNT(sw.syl_id) as syllable_count + FROM expressions w + JOIN word_phonetics wp ON w.id = wp.word_id + JOIN syllables_words sw ON wp.id = sw.word_id + JOIN syllables sy ON sw.syl_id = sy.id + JOIN tones t ON sy.tone = t.id + GROUP BY w.id, w.spelling, w.lang, w.frequency + ) + SELECT word_id, + spelling, + ipa, + frequency, + syl_seq, + tone_sequence, + syllable_count + FROM word_tone_sequences + WHERE tone_sequence LIKE ? + AND syllable_count = ? + ORDER BY frequency ASC NULLS LAST; + `, + ); + // TODO combine with this old query to get both phonetic and semantic data + // Thing is the tables have changed, used to be that the senses table had JSONB columns for senses, forms and related but those are now proper tables now with one-to-many relation to the senses table + + return query.all(toneString, tones.length) as any[]; + } + fetchWordsByToneAndSyls(tones: Array<string | null>): FullWordDataDB[] { + const toneString = tones + .reduce((acc: string, item) => { + if (!item) return `${acc},%`; + else return `${acc},${item}`; + }, "") + .slice(1); + const query = this.db.query( + ` + WITH word_tone_sequences AS ( + SELECT + w.id as word_id, + w.spelling, + wp.ipa, + w.frequency, + GROUP_CONCAT(sy.text ORDER BY sw.idx) as syl_seq, + GROUP_CONCAT(t.name ORDER BY sw.idx) as tone_sequence, + COUNT(sw.syl_id) as syllable_count, + (SELECT + json_group_array(json_object( + 'id', s.id, + 'pos', s.pos, + 'etymology', s.etymology, + 'confidence', s.confidence, + 'glosses', ( + SELECT json_group_array(ss.gloss) + FROM subsenses ss + WHERE ss.sid = s.id + ), + 'examples', ( + SELECT json_group_array(json_object( + 'example', ex.example, + 'ref', ex.ref + )) + FROM examples ex + WHERE ex.sid = s.id + ), + 'derivation', ( + SELECT json_group_array(json_object( + 'type', d.type, + 'text', d.text, + 'tags', d.tags + )) + FROM derivation d + WHERE d.sid = s.id + ), + 'categories', ( + SELECT json_group_array(wc.category) + FROM word_categories wc + WHERE wc.word_id = s.id + ) + )) + FROM senses s + WHERE s.parent_id = w.id + ) as senses_array + FROM expressions w + JOIN word_phonetics wp ON w.id = wp.word_id + JOIN syllables_words sw ON wp.id = sw.word_id + JOIN syllables sy ON sw.syl_id = sy.id + JOIN tones t ON sy.tone = t.id + GROUP BY w.id, w.spelling, w.lang, w.frequency + ) + SELECT * + FROM word_tone_sequences + WHERE tone_sequence LIKE ? + AND syllable_count = ? + ORDER BY frequency ASC NULLS LAST; + `, + ); + // TODO combine with this old query to get both phonetic and semantic data + // Thing is the tables have changed, used to be that the senses table had JSONB columns for senses, forms and related but those are now proper tables now with one-to-many relation to the senses table + + return query.all(toneString, tones.length) as any[]; + } + // TODO combine with this old query to get both phonetic and semantic data + // Thing is the tables have changed, used to be that the senses table had JSONB columns for senses, forms and related but those are now proper tables now with one-to-many relation to the senses table + + fetchSenses(spelling: string, lang: string) { + const query = this.db.query(` + WITH sense_data AS ( + SELECT + s.*, + GROUP_CONCAT(DISTINCT ss.id || ':' || ss.gloss, '|') as subsenses_data, + GROUP_CONCAT(DISTINCT ex.id || ':' || ex.example || ':' || COALESCE(ex.ref, ''), '|') as examples_data, + GROUP_CONCAT(DISTINCT d.id || ':' || d.type || ':' || d.text, '|') as derivation_data, + GROUP_CONCAT(DISTINCT wc.category, '|') as categories_data + FROM senses s + LEFT JOIN subsenses ss ON ss.sid = s.id + LEFT JOIN examples ex ON ex.sid = s.id + LEFT JOIN derivation d ON d.sid = s.id + LEFT JOIN word_categories wc ON wc.word_id = s.id + GROUP BY s.id + ) + SELECT e.*, + (SELECT + json_group_array(json_object( + 'id', sd.id, + 'pos', sd.pos, + 'etymology', sd.etymology, + 'confidence', sd.confidence, + 'subsenses_data', sd.subsenses_data, + 'examples_data', sd.examples_data, + 'derivation_data', sd.derivation_data, + 'categories_data', sd.categories_data + )) + FROM sense_data sd + WHERE sd.parent_id = e.id + ) as senses_array + FROM expressions e + WHERE e.spelling = ? AND e.lang = ? + ORDER BY e.frequency DESC`); + return query.all(spelling, lang); + } +} +export default Queries; +// ` +// WITH word_tone_sequences AS ( +// SELECT +// w.id as word_id, +// w.spelling, +// wp.ipa, +// w.frequency, +// s.etymology, +// s.pos, +// GROUP_CONCAT(s.text ORDER BY sw.idx) as syl_seq, +// GROUP_CONCAT(t.name ORDER BY sw.idx) as tone_sequence, +// // GROUP_CONCAT(t.name ORDER BY s.id) as senses, +// // GROUP_CONCAT(ex.example ORDER BY s.id) as examples, +// // GROUP_CONCAT(cat.category) as tags, +// COUNT(sw.syl_id) as syllable_count +// FROM expressions w +// JOIN word_phonetics wp ON w.id = wp.word_id +// JOIN syllables_words sw ON wp.id = sw.word_id +// JOIN syllables s ON sw.syl_id = s.id +// JOIN tones t ON s.tone = t.id +// JOIN senses s ON s.parent_id = w.id +// JOIN word_categories cat ON s.id = cat.word_id +// JOIN subsenses ss ON ss.sid = s.id +// JOIN examples ex ON ex.sid = s.id +// GROUP BY w.id, w.spelling, w.lang, w.frequency +// ) +// SELECT +// word_id, +// spelling, +// ipa, +// frequency, +// syl_seq, +// tone_sequence, +// syllable_count +// FROM word_tone_sequences +// WHERE tone_sequence LIKE ? +// AND syllable_count = ? +// ORDER BY frequency ASC NULLS LAST; +// ` |
