1 files changed, 253 insertions, 0 deletions
diff --git a/packages/db/src/index.ts b/packages/db/src/index.ts
new file mode 100644
index 0000000..6a89ee2
--- /dev/null
+++ b/packages/db/src/index.ts
@@ -0,0 +1,253 @@
+import { Database } from "bun:sqlite";
+import type { FullWordDataDB } from "./types";
+import { SRSQueries } from "./srs";
+
+export class Queries {
+  db: Database;
+  srs: SRSQueries;
+  constructor() {
+    const db = new Database("/home/y/code/bun/sorlang/bulkdata/unified.db");
+    db.exec("PRAGMA journal_mode = WAL"); // Enable Write-Ahead Logging for better performance
+    db.exec("PRAGMA foreign_keys = ON");
+    db.exec("PRAGMA cache_size = -8000"); // Increase cache size to 8MB
+    db.exec("PRAGMA mmap_size = 30000000000");
+    db.exec("PRAGMA temp_store = MEMORY"); // Store temp tables in memory
+    db.exec("PRAGMA synchronous = NORMAL"); // Slightly less safe but faster
+
+    this.db = db;
+    this.srs = new SRSQueries(db);
+  }
+  fetchExpressionById(id: number) {
+    const query = this.db.query(
+      `
+      SELECT * FROM expressions WHERE id = ?
+      `,
+    );
+    return query.get(id) as any;
+  }
+  fetchWordsByToneAndSyls1(tones: Array<string | null>) {
+    const toneString = tones
+      .reduce((acc: string, item) => {
+        if (!item) return `${acc},%`;
+        else return `${acc},${item}`;
+      }, "")
+      .slice(1);
+    const query = this.db.query(
+      `
+      WITH word_tone_sequences AS (
+          SELECT 
+              wp.ipa,
+              GROUP_CONCAT(sy.text ORDER BY sw.idx) as syl_seq,
+              GROUP_CONCAT(t.name ORDER BY sw.idx) as tone_sequence,
+              COUNT(sw.syl_id) as syllable_count
+          FROM word_phonetics wp
+          JOIN syllables_words sw ON wp.id = sw.word_id
+          JOIN syllables sy ON sw.syl_id = sy.id
+          JOIN tones t ON sy.tone = t.id
+          GROUP BY wp.ipa
+      )        
+        SELECT *
+        FROM word_tone_sequences
+        WHERE tone_sequence LIKE ?
+        AND syllable_count = ?
+      `,
+    );
+    return query.all(toneString, tones.length) as any[];
+  }
+  fetchWordsByToneAndSylsO(tones: Array<string | null>) {
+    const toneString = tones
+      .reduce((acc: string, item) => {
+        if (!item) return `${acc},%`;
+        else return `${acc},${item}`;
+      }, "")
+      .slice(1);
+    const query = this.db.query(
+      `
+      WITH word_tone_sequences AS (
+          SELECT 
+              w.id as word_id,
+              w.spelling,
+              wp.ipa,
+              w.frequency,
+              GROUP_CONCAT(sy.text ORDER BY sw.idx) as syl_seq,
+              GROUP_CONCAT(t.name ORDER BY sw.idx) as tone_sequence,
+              COUNT(sw.syl_id) as syllable_count
+          FROM expressions w
+          JOIN word_phonetics wp ON w.id = wp.word_id
+          JOIN syllables_words sw ON wp.id = sw.word_id
+          JOIN syllables sy ON sw.syl_id = sy.id
+          JOIN tones t ON sy.tone = t.id
+          GROUP BY w.id, w.spelling, w.lang, w.frequency
+      )        
+        SELECT word_id,
+               spelling,
+               ipa,
+               frequency,
+               syl_seq,
+               tone_sequence,
+               syllable_count
+        FROM word_tone_sequences
+        WHERE tone_sequence LIKE ?
+        AND syllable_count = ?
+        ORDER BY frequency ASC NULLS LAST;
+      `,
+    );
+    // TODO combine with this old query to get both phonetic and semantic data
+    // Thing is the tables have changed, used to be that the senses table had JSONB columns for senses, forms and related but those are now proper tables now with one-to-many relation to the senses table
+
+    return query.all(toneString, tones.length) as any[];
+  }
+  fetchWordsByToneAndSyls(tones: Array<string | null>): FullWordDataDB[] {
+    const toneString = tones
+      .reduce((acc: string, item) => {
+        if (!item) return `${acc},%`;
+        else return `${acc},${item}`;
+      }, "")
+      .slice(1);
+    const query = this.db.query(
+      `
+      WITH word_tone_sequences AS (
+          SELECT 
+              w.id as word_id,
+              w.spelling,
+              wp.ipa,
+              w.frequency,
+              GROUP_CONCAT(sy.text ORDER BY sw.idx) as syl_seq,
+              GROUP_CONCAT(t.name ORDER BY sw.idx) as tone_sequence,
+              COUNT(sw.syl_id) as syllable_count,
+              (SELECT
+                json_group_array(json_object(
+                  'id', s.id,
+                  'pos', s.pos,
+                  'etymology', s.etymology,
+                  'confidence', s.confidence,
+                  'glosses', (
+                    SELECT json_group_array(ss.gloss)
+                    FROM subsenses ss
+                    WHERE ss.sid = s.id
+                  ),
+                  'examples', (
+                    SELECT json_group_array(json_object(
+                      'example', ex.example,
+                      'ref', ex.ref
+                    ))
+                    FROM examples ex
+                    WHERE ex.sid = s.id
+                  ),
+                  'derivation', (
+                    SELECT json_group_array(json_object(
+                      'type', d.type,
+                      'text', d.text,
+                      'tags', d.tags
+                    ))
+                    FROM derivation d
+                    WHERE d.sid = s.id
+                  ),
+                  'categories', (
+                    SELECT json_group_array(wc.category)
+                    FROM word_categories wc
+                    WHERE wc.word_id = s.id
+                  )
+                ))
+                FROM senses s
+                WHERE s.parent_id = w.id
+              ) as senses_array
+          FROM expressions w
+          JOIN word_phonetics wp ON w.id = wp.word_id
+          JOIN syllables_words sw ON wp.id = sw.word_id
+          JOIN syllables sy ON sw.syl_id = sy.id
+          JOIN tones t ON sy.tone = t.id
+          GROUP BY w.id, w.spelling, w.lang, w.frequency
+      )        
+        SELECT *
+        FROM word_tone_sequences
+        WHERE tone_sequence LIKE ?
+        AND syllable_count = ?
+        ORDER BY frequency ASC NULLS LAST;
+      `,
+    );
+    // TODO combine with this old query to get both phonetic and semantic data
+    // Thing is the tables have changed, used to be that the senses table had JSONB columns for senses, forms and related but those are now proper tables now with one-to-many relation to the senses table
+
+    return query.all(toneString, tones.length) as any[];
+  }
+  // TODO combine with this old query to get both phonetic and semantic data
+  // Thing is the tables have changed, used to be that the senses table had JSONB columns for senses, forms and related but those are now proper tables now with one-to-many relation to the senses table
+
+  fetchSenses(spelling: string, lang: string) {
+    const query = this.db.query(`
+      WITH sense_data AS (
+        SELECT
+          s.*,
+          GROUP_CONCAT(DISTINCT ss.id || ':' || ss.gloss, '|') as subsenses_data,
+          GROUP_CONCAT(DISTINCT ex.id || ':' || ex.example || ':' || COALESCE(ex.ref, ''), '|') as examples_data,
+          GROUP_CONCAT(DISTINCT d.id || ':' || d.type || ':' || d.text, '|') as derivation_data,
+          GROUP_CONCAT(DISTINCT wc.category, '|') as categories_data
+        FROM senses s
+        LEFT JOIN subsenses ss ON ss.sid = s.id
+        LEFT JOIN examples ex ON ex.sid = s.id
+        LEFT JOIN derivation d ON d.sid = s.id
+        LEFT JOIN word_categories wc ON wc.word_id = s.id
+        GROUP BY s.id
+      )
+      SELECT e.*,
+        (SELECT
+          json_group_array(json_object(
+            'id', sd.id,
+            'pos', sd.pos,
+            'etymology', sd.etymology,
+            'confidence', sd.confidence,
+            'subsenses_data', sd.subsenses_data,
+            'examples_data', sd.examples_data,
+            'derivation_data', sd.derivation_data,
+            'categories_data', sd.categories_data
+          ))
+          FROM sense_data sd
+          WHERE sd.parent_id = e.id
+        ) as senses_array
+      FROM expressions e
+      WHERE e.spelling = ? AND e.lang = ?
+      ORDER BY e.frequency DESC`);
+    return query.all(spelling, lang);
+  }
+}
+export default Queries;
+// `
+//       WITH word_tone_sequences AS (
+//           SELECT
+//               w.id as word_id,
+//               w.spelling,
+//               wp.ipa,
+//               w.frequency,
+//               s.etymology,
+//               s.pos,
+//               GROUP_CONCAT(s.text ORDER BY sw.idx) as syl_seq,
+//               GROUP_CONCAT(t.name ORDER BY sw.idx) as tone_sequence,
+//               // GROUP_CONCAT(t.name ORDER BY s.id) as senses,
+//               // GROUP_CONCAT(ex.example ORDER BY s.id) as examples,
+//               // GROUP_CONCAT(cat.category) as tags,
+//               COUNT(sw.syl_id) as syllable_count
+//           FROM expressions w
+//           JOIN word_phonetics wp ON w.id = wp.word_id
+//           JOIN syllables_words sw ON wp.id = sw.word_id
+//           JOIN syllables s ON sw.syl_id = s.id
+//           JOIN tones t ON s.tone = t.id
+//           JOIN senses s ON s.parent_id = w.id
+//           JOIN word_categories cat ON s.id = cat.word_id
+//           JOIN subsenses ss ON ss.sid = s.id
+//           JOIN examples ex ON ex.sid = s.id
+//           GROUP BY w.id, w.spelling, w.lang, w.frequency
+//       )
+//         SELECT
+//             word_id,
+//             spelling,
+//             ipa,
+//             frequency,
+//             syl_seq,
+//             tone_sequence,
+//             syllable_count
+//         FROM word_tone_sequences
+//         WHERE tone_sequence LIKE ?
+//         AND syllable_count = ?
+//         ORDER BY frequency ASC NULLS LAST;
+//       `