summaryrefslogtreecommitdiff
path: root/packages/db/src/migration.ts
diff options
context:
space:
mode:
Diffstat (limited to 'packages/db/src/migration.ts')
-rw-r--r--packages/db/src/migration.ts430
1 files changed, 430 insertions, 0 deletions
diff --git a/packages/db/src/migration.ts b/packages/db/src/migration.ts
new file mode 100644
index 0000000..c4150f8
--- /dev/null
+++ b/packages/db/src/migration.ts
@@ -0,0 +1,430 @@
+import { Database } from "bun:sqlite";
+
+const PROSODY_DB = "/home/y/code/bun/ssr/claudesorlang/bulkdata/prosody.db";
+const THAIPHON_DB = "/home/y/code/bun/ssr/claudesorlang/bulkdata/thaiphon.db";
+const UNIFIED_DB = "/home/y/code/bun/ssr/claudesorlang/bulkdata/unified.db";
+
+async function migrateSenses() {
+ console.log("Starting database migration to unified.db...\n");
+ const expressionIdMap: Map<number, number> = new Map();
+ const senseIdMap: Map<number, number> = new Map();
+ const wordExpressionMap: Map<number, number> = new Map();
+
+ // Open all databases
+ const prosodyDb = new Database(PROSODY_DB, { readonly: true });
+ const thaiphonDb = new Database(THAIPHON_DB, { readonly: true });
+ const unifiedDb = new Database(UNIFIED_DB);
+
+ // Enable foreign keys and WAL mode for unified database
+ unifiedDb.run("PRAGMA foreign_keys = ON");
+ unifiedDb.run("PRAGMA journal_mode = WAL");
+
+ try {
+ // Start transaction for atomic migration
+ unifiedDb.run("BEGIN TRANSACTION");
+
+ console.log("šŸ“¦ Migrating from prosody.db...");
+
+ // 1. Migrate languages (combining both sources, preferring prosody.db)
+ console.log(" → Migrating categories...");
+ // categories
+ const prosodyCats = prosodyDb
+ .query("SELECT * FROM categories")
+ .all() as any[];
+ const insertCat = unifiedDb.prepare(
+ `INSERT INTO categories (name) VALUES (?)
+ ON CONFLICT(name) DO NOTHING
+ RETURNING rowid
+ `,
+ );
+ for (const cat of prosodyCats) {
+ const cr = insertCat.run(cat.name.toLowerCase().trim());
+ // console.log({ cr });
+ if (cr.changes !== 1) console.log({ cr });
+ }
+ console.log(` āœ“ Migrated ${prosodyCats.length} categories`);
+
+ console.log(" → Migrating languages...");
+ const prosodyLangs = prosodyDb
+ .query("SELECT * FROM languages")
+ .all() as any[];
+
+ // Add thaiphon languages first
+ // Override with prosody languages (they have more complete data)
+ for (const lang of prosodyLangs) {
+ // Insert merged languages
+ const insertLang = unifiedDb.prepare(
+ "INSERT OR IGNORE INTO languages (code, name, native_name, iso6392) VALUES (?, ?, ?, ?)",
+ );
+
+ insertLang.run(lang.code, lang.name, lang.native_name, null);
+ }
+ console.log(` āœ“ Migrated ${prosodyLangs.length} languages`);
+
+ // 2. Migrate expressions from prosody.db
+ console.log(" → Migrating expressions...");
+ const expressions = prosodyDb
+ .query("SELECT * FROM expressions")
+ .all() as any[];
+ const insertExpression = unifiedDb.prepare(
+ "INSERT INTO expressions (spelling, lang, frequency, type, notes) VALUES (?, ?, ?, ?, ?)",
+ );
+
+ for (const expr of expressions) {
+ const ei = insertExpression.run(
+ expr.spelling,
+ expr.lang,
+ expr.frequency,
+ expr.type,
+ expr.notes,
+ );
+ if (ei.changes > 0)
+ expressionIdMap.set(expr.id, Number(ei.lastInsertRowid));
+ }
+
+ const insertExample = unifiedDb.prepare(
+ "INSERT INTO examples(sid, example, ref) VALUES (?, ?, ?)",
+ );
+ const insertSubsense = unifiedDb.prepare(
+ "INSERT INTO subsenses (sid, gloss) VALUES (?, ?)",
+ );
+ const insertDerivation = unifiedDb.prepare(
+ "INSERT INTO derivation (sid, type, text) VALUES (?, ?, ?)",
+ );
+ const insertCatw = unifiedDb.prepare(
+ "INSERT OR IGNORE INTO word_categories(category, word_id) VALUES (?, ?)",
+ );
+ const insertWp = unifiedDb.prepare(
+ "INSERT INTO word_phonetics(word_id, sense_id, ipa, ipa_sequence, syllable_count, syllable_sequence, stressed, tone_sequence, tag, notes) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
+ );
+ console.log(" → Migrating senses...");
+ const senses = prosodyDb.query("SELECT * FROM senses").all() as any[];
+ const insertSense = unifiedDb.prepare(
+ "INSERT INTO senses (parent_id, spelling, pos, etymology, confidence) VALUES (?, ?, ?, ?, ?)",
+ );
+
+ for (const sense of senses) {
+ const parentId = expressionIdMap.get(sense.parent_id);
+ const senseRow = insertSense.run(
+ parentId,
+ sense.spelling,
+ sense.pos,
+ sense.etymology,
+ sense.confidence,
+ );
+ const senseId = senseRow.lastInsertRowid;
+ senseIdMap.set(sense.id, Number(senseId));
+ const deriv = (
+ name: string,
+ type: Array<{ word: string }> | undefined,
+ ) => {
+ if (type)
+ for (const t of type) insertDerivation.run(senseId, name, t.word);
+ };
+ const subsenses = JSON.parse(sense.senses);
+ for (const ss of subsenses) {
+ const {
+ examples,
+ glosses,
+ tags,
+ synonyms,
+ antonyms,
+ hypernyms,
+ meronyms,
+ hyponyms,
+ holonyms,
+ } = ss;
+ if (examples)
+ for (const ex of examples) {
+ const ref = ex.ref || null;
+
+ if (ex.text) insertExample.run(senseId, ex.text, ref);
+ // else console.log("bad example", ex);
+ }
+ if (glosses)
+ for (const gloss of glosses) insertSubsense.run(senseId, gloss);
+ if (tags)
+ for (const tag of tags) {
+ const lw = tag.toLowerCase().trim();
+ insertCat.run(lw);
+ insertCatw.run(lw, senseId);
+ }
+ if (synonyms) deriv("synonym", synonyms);
+ if (antonyms) deriv("antonym", antonyms);
+ if (hypernyms) deriv("hypernym", hypernyms);
+ if (meronyms) deriv("meronym", meronyms);
+ if (hyponyms) deriv("hyponym", hyponyms);
+ if (holonyms) deriv("holonym", holonyms);
+ }
+ const prosody = JSON.parse(sense.prosody);
+ const stressed = prosody?.stressedSyllable || null;
+ if (!sense.ipa) {
+ console.log("no ipa", sense);
+ continue;
+ }
+ const ipa = JSON.parse(sense.ipa);
+ for (const ip of ipa) {
+ insertWp.run(
+ parentId,
+ senseId,
+ ip.ipa,
+ ip.ipa,
+ 0,
+ "",
+ stressed,
+ "",
+ ip.tag,
+ null,
+ );
+ }
+ }
+ console.log(` āœ“ Migrated ${senses.length} senses`);
+
+ console.log("\nšŸ“¦ Migrating from thaiphon.db...");
+
+ // 6. Migrate phonetic components from thaiphon.db
+ console.log(" → Migrating phonetic components...");
+
+ // Tones
+ const tones = thaiphonDb.query("SELECT * FROM tones").all() as any;
+ const insertTone = unifiedDb.prepare(
+ "INSERT INTO tones (id, ipa, lang, name, nums) VALUES (?, ?, ?, ?, ?)",
+ );
+ for (const tone of tones) {
+ insertTone.run(tone.id, tone.ipa, tone.lang, tone.name, tone.nums);
+ }
+ console.log(` āœ“ Migrated ${tones.length} tones`);
+
+ // Onsets
+ const onsets = thaiphonDb.query("SELECT * FROM onsets").all() as any;
+ const insertOnset = unifiedDb.prepare(
+ "INSERT INTO onsets (id, ipa, text, lang) VALUES (?, ?, ?, ?)",
+ );
+ for (const onset of onsets) {
+ insertOnset.run(onset.id, onset.ipa, onset.text, onset.lang);
+ }
+ console.log(` āœ“ Migrated ${onsets.length} onsets`);
+
+ // Medials
+ const medials = thaiphonDb.query("SELECT * FROM medials").all() as any;
+ const insertMedial = unifiedDb.prepare(
+ "INSERT INTO medials (id, ipa, text, lang) VALUES (?, ?, ?, ?)",
+ );
+ for (const medial of medials) {
+ insertMedial.run(medial.id, medial.ipa, medial.text, medial.lang) as any;
+ }
+ console.log(` āœ“ Migrated ${medials.length} medials`);
+
+ // Nucleus
+ const nucleus = thaiphonDb.query("SELECT * FROM nucleus").all() as any;
+ const insertNucleus = unifiedDb.prepare(
+ "INSERT INTO nucleus (id, ipa, text, lang) VALUES (?, ?, ?, ?)",
+ );
+ for (const nucl of nucleus) {
+ insertNucleus.run(nucl.id, nucl.ipa, nucl.text, nucl.lang);
+ }
+ console.log(` āœ“ Migrated ${nucleus.length} nucleus`);
+
+ // Codas
+ const codas = thaiphonDb.query("SELECT * FROM codas").all() as any;
+ const insertCoda = unifiedDb.prepare(
+ "INSERT INTO codas (id, ipa, text, lang) VALUES (?, ?, ?, ?)",
+ );
+ for (const coda of codas) {
+ insertCoda.run(coda.id, coda.ipa, coda.text, coda.lang) as any;
+ }
+ console.log(` āœ“ Migrated ${codas.length} codas`);
+
+ // Rhymes
+ const rhymes = thaiphonDb.query("SELECT * FROM rhymes").all() as any;
+ const insertRhyme = unifiedDb.prepare(
+ "INSERT INTO rhymes (id, ipa, text, lang) VALUES (?, ?, ?, ?)",
+ );
+ for (const rhyme of rhymes) {
+ insertRhyme.run(rhyme.id, rhyme.ipa, rhyme.text, rhyme.lang);
+ }
+ console.log(` āœ“ Migrated ${rhymes.length} rhymes`);
+
+ // 7. Migrate syllables from thaiphon.db
+ console.log(" → Migrating syllables...");
+ const syllables = thaiphonDb.query("SELECT * FROM syllables").all() as any;
+ const insertSyllable = unifiedDb.prepare(
+ "INSERT INTO syllables (id, lang, ipa, long, text, onset, medial, nucleus, coda, rhyme, tone, notes) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
+ );
+ for (const syl of syllables) {
+ insertSyllable.run(
+ syl.id,
+ syl.lang,
+ syl.ipa,
+ syl.long,
+ syl.text,
+ syl.onset,
+ syl.medial,
+ syl.nucleus,
+ syl.coda,
+ syl.rhyme,
+ syl.tone,
+ syl.notes,
+ );
+ }
+ console.log(` āœ“ Migrated ${syllables.length} syllables`);
+
+ // 8. Migrate words from thaiphon.db (as expressions if not exists)
+ console.log(" → Migrating words as expressions...");
+ const words = thaiphonDb.query("SELECT * FROM words").all() as any[];
+ const insertWordAsExpr = unifiedDb.query(
+ `INSERT INTO expressions (spelling, lang, frequency, type, notes) VALUES (?, ?, ?, ?, ?)
+ ON CONFLICT(spelling, lang) DO UPDATE SET spelling = excluded.spelling
+ RETURNING rowid`,
+ );
+
+ // Also keep track for syllables_words mapping
+ let prev: any = null;
+ for (const word of words) {
+ const result = insertWordAsExpr.get(
+ word.spelling,
+ word.lang,
+ word.frequency,
+ "word",
+ word.notes,
+ ) as { id: number };
+ // 41085
+ console.log({ result });
+ // if (prev === result.id) {
+ // console.log(word, result);
+ // throw new Error("fucked up again");
+ // }
+ // prev = result.id;
+ wordExpressionMap.set(word.id, result.id);
+ }
+ console.log(` āœ“ Processed ${words.length} words`);
+
+ console.log(" → Migrating word phonetics...");
+ const wordPhonetics = thaiphonDb
+ .query("SELECT * FROM word_phonetics")
+ .all() as any[];
+ const insertWordPhonetic = unifiedDb.prepare(
+ "INSERT INTO word_phonetics (word_id, sense_id, ipa, syllable_count, syllable_sequence, tone_sequence, ipa_sequence, tag, notes) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)",
+ );
+ // TODO find the sense id later I guess
+ const wpMap: Map<number, number> = new Map();
+ for (const wp of wordPhonetics) {
+ const newWordId = wordExpressionMap.get(wp.word_id);
+ if (newWordId) {
+ const wr = insertWordPhonetic.run(
+ newWordId,
+ null,
+ wp.ipa,
+ wp.syllable_count,
+ wp.syllable_sequence,
+ wp.tone_sequence,
+ wp.ipa_sequence,
+ wp.tag,
+ wp.notes,
+ );
+ if (wr.changes !== 0) wpMap.set(wp.word_id, Number(wr.lastInsertRowid));
+ }
+ }
+
+ // 9. Migrate syllables_words junction
+ console.log(" → Migrating syllables_words...");
+ const sylWords = thaiphonDb
+ .query("SELECT * FROM syllables_words")
+ .all() as any;
+ const insertSylWord = unifiedDb.prepare(
+ "INSERT OR IGNORE INTO syllables_words (syl_id, word_id, idx, stressed) VALUES (?, ?, ?, ?)",
+ );
+ for (const sw of sylWords) {
+ const newWordId = wpMap.get(sw.word_id);
+ if (newWordId)
+ insertSylWord.run(sw.syl_id, newWordId, sw.idx, sw.stressed);
+ }
+ console.log(
+ ` āœ“ Migrated ${sylWords.length} syllable-word relationships`,
+ );
+
+ unifiedDb.run("COMMIT");
+
+ console.log("\nāœ… Migration completed successfully!");
+ console.log(" Unified database is ready at:", UNIFIED_DB);
+ } catch (error) {
+ console.error("\nāŒ Migration failed:", error);
+ unifiedDb.run("ROLLBACK");
+ throw error;
+ } finally {
+ // Close all databases
+ prosodyDb.close();
+ // thaiphonDb.close();
+ unifiedDb.close();
+ }
+}
+
+// Run the migration
+migrateSenses();
+// migrateTones();
+// test();
+//
+
+async function test() {
+ console.log("Starting database migration to unified.db...\n");
+ // Open all databases
+ const prosodyDb = new Database(PROSODY_DB, { readonly: true });
+ const thaiphonDb = new Database(THAIPHON_DB, { readonly: true });
+ const unifiedDb = new Database(UNIFIED_DB, { readonly: true });
+
+ // Enable foreign keys and WAL mode for unified database
+ unifiedDb.run("PRAGMA foreign_keys = ON");
+ unifiedDb.run("PRAGMA journal_mode = WAL");
+ const oldE = prosodyDb.query("SELECT COUNT(*) FROM expressions").get();
+ const oldS = prosodyDb.query("SELECT COUNT(*) FROM senses").get();
+ const oldW = thaiphonDb.query("SELECT COUNT(*) FROM words").get();
+ const oldWp = thaiphonDb.query("SELECT COUNT(*) FROM word_phonetics").get();
+ const oldSw = thaiphonDb.query("SELECT COUNT(*) FROM syllables_words").get();
+ const oldSy = thaiphonDb.query("SELECT COUNT(*) FROM syllables").get();
+ const oldTo = thaiphonDb.query("SELECT COUNT(*) FROM tones").get();
+ const oldCod = thaiphonDb.query("SELECT COUNT(*) FROM codas").get();
+ const oldNuc = thaiphonDb.query("SELECT COUNT(*) FROM nucleus").get();
+ const oldOns = thaiphonDb.query("SELECT COUNT(*) FROM onsets").get();
+ const oldRhy = thaiphonDb.query("SELECT COUNT(*) FROM rhymes").get();
+ const oldWrh = thaiphonDb.query("SELECT COUNT(*) FROM word_rhymes").get();
+ const oldWry = thaiphonDb.query("SELECT COUNT(*) FROM words_wrhymes").get();
+ const newSw = unifiedDb.query("SELECT COUNT(*) FROM syllables_words").get();
+ const newSy = unifiedDb.query("SELECT COUNT(*) FROM syllables").get();
+ const newTo = unifiedDb.query("SELECT COUNT(*) FROM tones").get();
+ const newCod = unifiedDb.query("SELECT COUNT(*) FROM codas").get();
+ const newNuc = unifiedDb.query("SELECT COUNT(*) FROM nucleus").get();
+ const newOns = unifiedDb.query("SELECT COUNT(*) FROM onsets").get();
+ const newRhy = unifiedDb.query("SELECT COUNT(*) FROM rhymes").get();
+ const newWrh = unifiedDb.query("SELECT COUNT(*) FROM word_rhymes").get();
+ const newWry = unifiedDb.query("SELECT COUNT(*) FROM words_wrhymes").get();
+ const newE = unifiedDb.query("SELECT COUNT(*) FROM expressions").get();
+ const newS = unifiedDb.query("SELECT COUNT(*) FROM senses").get();
+ const newWp = unifiedDb.query("SELECT COUNT(*) FROM word_phonetics").get();
+ console.log({
+ oldE,
+ oldS,
+ oldW,
+ oldWp,
+ newE,
+ newS,
+ newWp,
+ oldSw,
+ oldSy,
+ oldTo,
+ oldCod,
+ oldNuc,
+ oldOns,
+ oldRhy,
+ oldWrh,
+ oldWry,
+ newSw,
+ newSy,
+ newTo,
+ newCod,
+ newNuc,
+ newOns,
+ newRhy,
+ newWrh,
+ newWry,
+ });
+}