summaryrefslogtreecommitdiff
path: root/packages/db/src/index.ts
diff options
context:
space:
mode:
Diffstat (limited to 'packages/db/src/index.ts')
-rw-r--r--packages/db/src/index.ts253
1 files changed, 253 insertions, 0 deletions
diff --git a/packages/db/src/index.ts b/packages/db/src/index.ts
new file mode 100644
index 0000000..6a89ee2
--- /dev/null
+++ b/packages/db/src/index.ts
@@ -0,0 +1,253 @@
+import { Database } from "bun:sqlite";
+import type { FullWordDataDB } from "./types";
+import { SRSQueries } from "./srs";
+
+export class Queries {
+ db: Database;
+ srs: SRSQueries;
+ constructor() {
+ const db = new Database("/home/y/code/bun/sorlang/bulkdata/unified.db");
+ db.exec("PRAGMA journal_mode = WAL"); // Enable Write-Ahead Logging for better performance
+ db.exec("PRAGMA foreign_keys = ON");
+ db.exec("PRAGMA cache_size = -8000"); // Increase cache size to 8MB
+ db.exec("PRAGMA mmap_size = 30000000000");
+ db.exec("PRAGMA temp_store = MEMORY"); // Store temp tables in memory
+ db.exec("PRAGMA synchronous = NORMAL"); // Slightly less safe but faster
+
+ this.db = db;
+ this.srs = new SRSQueries(db);
+ }
+ fetchExpressionById(id: number) {
+ const query = this.db.query(
+ `
+ SELECT * FROM expressions WHERE id = ?
+ `,
+ );
+ return query.get(id) as any;
+ }
+ fetchWordsByToneAndSyls1(tones: Array<string | null>) {
+ const toneString = tones
+ .reduce((acc: string, item) => {
+ if (!item) return `${acc},%`;
+ else return `${acc},${item}`;
+ }, "")
+ .slice(1);
+ const query = this.db.query(
+ `
+ WITH word_tone_sequences AS (
+ SELECT
+ wp.ipa,
+ GROUP_CONCAT(sy.text ORDER BY sw.idx) as syl_seq,
+ GROUP_CONCAT(t.name ORDER BY sw.idx) as tone_sequence,
+ COUNT(sw.syl_id) as syllable_count
+ FROM word_phonetics wp
+ JOIN syllables_words sw ON wp.id = sw.word_id
+ JOIN syllables sy ON sw.syl_id = sy.id
+ JOIN tones t ON sy.tone = t.id
+ GROUP BY wp.ipa
+ )
+ SELECT *
+ FROM word_tone_sequences
+ WHERE tone_sequence LIKE ?
+ AND syllable_count = ?
+ `,
+ );
+ return query.all(toneString, tones.length) as any[];
+ }
+ fetchWordsByToneAndSylsO(tones: Array<string | null>) {
+ const toneString = tones
+ .reduce((acc: string, item) => {
+ if (!item) return `${acc},%`;
+ else return `${acc},${item}`;
+ }, "")
+ .slice(1);
+ const query = this.db.query(
+ `
+ WITH word_tone_sequences AS (
+ SELECT
+ w.id as word_id,
+ w.spelling,
+ wp.ipa,
+ w.frequency,
+ GROUP_CONCAT(sy.text ORDER BY sw.idx) as syl_seq,
+ GROUP_CONCAT(t.name ORDER BY sw.idx) as tone_sequence,
+ COUNT(sw.syl_id) as syllable_count
+ FROM expressions w
+ JOIN word_phonetics wp ON w.id = wp.word_id
+ JOIN syllables_words sw ON wp.id = sw.word_id
+ JOIN syllables sy ON sw.syl_id = sy.id
+ JOIN tones t ON sy.tone = t.id
+ GROUP BY w.id, w.spelling, w.lang, w.frequency
+ )
+ SELECT word_id,
+ spelling,
+ ipa,
+ frequency,
+ syl_seq,
+ tone_sequence,
+ syllable_count
+ FROM word_tone_sequences
+ WHERE tone_sequence LIKE ?
+ AND syllable_count = ?
+ ORDER BY frequency ASC NULLS LAST;
+ `,
+ );
+ // TODO combine with this old query to get both phonetic and semantic data
+ // Thing is the tables have changed, used to be that the senses table had JSONB columns for senses, forms and related but those are now proper tables now with one-to-many relation to the senses table
+
+ return query.all(toneString, tones.length) as any[];
+ }
+ fetchWordsByToneAndSyls(tones: Array<string | null>): FullWordDataDB[] {
+ const toneString = tones
+ .reduce((acc: string, item) => {
+ if (!item) return `${acc},%`;
+ else return `${acc},${item}`;
+ }, "")
+ .slice(1);
+ const query = this.db.query(
+ `
+ WITH word_tone_sequences AS (
+ SELECT
+ w.id as word_id,
+ w.spelling,
+ wp.ipa,
+ w.frequency,
+ GROUP_CONCAT(sy.text ORDER BY sw.idx) as syl_seq,
+ GROUP_CONCAT(t.name ORDER BY sw.idx) as tone_sequence,
+ COUNT(sw.syl_id) as syllable_count,
+ (SELECT
+ json_group_array(json_object(
+ 'id', s.id,
+ 'pos', s.pos,
+ 'etymology', s.etymology,
+ 'confidence', s.confidence,
+ 'glosses', (
+ SELECT json_group_array(ss.gloss)
+ FROM subsenses ss
+ WHERE ss.sid = s.id
+ ),
+ 'examples', (
+ SELECT json_group_array(json_object(
+ 'example', ex.example,
+ 'ref', ex.ref
+ ))
+ FROM examples ex
+ WHERE ex.sid = s.id
+ ),
+ 'derivation', (
+ SELECT json_group_array(json_object(
+ 'type', d.type,
+ 'text', d.text,
+ 'tags', d.tags
+ ))
+ FROM derivation d
+ WHERE d.sid = s.id
+ ),
+ 'categories', (
+ SELECT json_group_array(wc.category)
+ FROM word_categories wc
+ WHERE wc.word_id = s.id
+ )
+ ))
+ FROM senses s
+ WHERE s.parent_id = w.id
+ ) as senses_array
+ FROM expressions w
+ JOIN word_phonetics wp ON w.id = wp.word_id
+ JOIN syllables_words sw ON wp.id = sw.word_id
+ JOIN syllables sy ON sw.syl_id = sy.id
+ JOIN tones t ON sy.tone = t.id
+ GROUP BY w.id, w.spelling, w.lang, w.frequency
+ )
+ SELECT *
+ FROM word_tone_sequences
+ WHERE tone_sequence LIKE ?
+ AND syllable_count = ?
+ ORDER BY frequency ASC NULLS LAST;
+ `,
+ );
+ // TODO combine with this old query to get both phonetic and semantic data
+ // Thing is the tables have changed, used to be that the senses table had JSONB columns for senses, forms and related but those are now proper tables now with one-to-many relation to the senses table
+
+ return query.all(toneString, tones.length) as any[];
+ }
+ // TODO combine with this old query to get both phonetic and semantic data
+ // Thing is the tables have changed, used to be that the senses table had JSONB columns for senses, forms and related but those are now proper tables now with one-to-many relation to the senses table
+
+ fetchSenses(spelling: string, lang: string) {
+ const query = this.db.query(`
+ WITH sense_data AS (
+ SELECT
+ s.*,
+ GROUP_CONCAT(DISTINCT ss.id || ':' || ss.gloss, '|') as subsenses_data,
+ GROUP_CONCAT(DISTINCT ex.id || ':' || ex.example || ':' || COALESCE(ex.ref, ''), '|') as examples_data,
+ GROUP_CONCAT(DISTINCT d.id || ':' || d.type || ':' || d.text, '|') as derivation_data,
+ GROUP_CONCAT(DISTINCT wc.category, '|') as categories_data
+ FROM senses s
+ LEFT JOIN subsenses ss ON ss.sid = s.id
+ LEFT JOIN examples ex ON ex.sid = s.id
+ LEFT JOIN derivation d ON d.sid = s.id
+ LEFT JOIN word_categories wc ON wc.word_id = s.id
+ GROUP BY s.id
+ )
+ SELECT e.*,
+ (SELECT
+ json_group_array(json_object(
+ 'id', sd.id,
+ 'pos', sd.pos,
+ 'etymology', sd.etymology,
+ 'confidence', sd.confidence,
+ 'subsenses_data', sd.subsenses_data,
+ 'examples_data', sd.examples_data,
+ 'derivation_data', sd.derivation_data,
+ 'categories_data', sd.categories_data
+ ))
+ FROM sense_data sd
+ WHERE sd.parent_id = e.id
+ ) as senses_array
+ FROM expressions e
+ WHERE e.spelling = ? AND e.lang = ?
+ ORDER BY e.frequency DESC`);
+ return query.all(spelling, lang);
+ }
+}
+export default Queries;
+// `
+// WITH word_tone_sequences AS (
+// SELECT
+// w.id as word_id,
+// w.spelling,
+// wp.ipa,
+// w.frequency,
+// s.etymology,
+// s.pos,
+// GROUP_CONCAT(s.text ORDER BY sw.idx) as syl_seq,
+// GROUP_CONCAT(t.name ORDER BY sw.idx) as tone_sequence,
+// // GROUP_CONCAT(t.name ORDER BY s.id) as senses,
+// // GROUP_CONCAT(ex.example ORDER BY s.id) as examples,
+// // GROUP_CONCAT(cat.category) as tags,
+// COUNT(sw.syl_id) as syllable_count
+// FROM expressions w
+// JOIN word_phonetics wp ON w.id = wp.word_id
+// JOIN syllables_words sw ON wp.id = sw.word_id
+// JOIN syllables s ON sw.syl_id = s.id
+// JOIN tones t ON s.tone = t.id
+// JOIN senses s ON s.parent_id = w.id
+// JOIN word_categories cat ON s.id = cat.word_id
+// JOIN subsenses ss ON ss.sid = s.id
+// JOIN examples ex ON ex.sid = s.id
+// GROUP BY w.id, w.spelling, w.lang, w.frequency
+// )
+// SELECT
+// word_id,
+// spelling,
+// ipa,
+// frequency,
+// syl_seq,
+// tone_sequence,
+// syllable_count
+// FROM word_tone_sequences
+// WHERE tone_sequence LIKE ?
+// AND syllable_count = ?
+// ORDER BY frequency ASC NULLS LAST;
+// `