import { Database } from "bun:sqlite"; const PROSODY_DB = "/home/y/code/bun/ssr/claudesorlang/bulkdata/prosody.db"; const THAIPHON_DB = "/home/y/code/bun/ssr/claudesorlang/bulkdata/thaiphon.db"; const UNIFIED_DB = "/home/y/code/bun/ssr/claudesorlang/bulkdata/unified.db"; async function migrateSenses() { console.log("Starting database migration to unified.db...\n"); const expressionIdMap: Map = new Map(); const senseIdMap: Map = new Map(); const wordExpressionMap: Map = new Map(); // Open all databases const prosodyDb = new Database(PROSODY_DB, { readonly: true }); const thaiphonDb = new Database(THAIPHON_DB, { readonly: true }); const unifiedDb = new Database(UNIFIED_DB); // Enable foreign keys and WAL mode for unified database unifiedDb.run("PRAGMA foreign_keys = ON"); unifiedDb.run("PRAGMA journal_mode = WAL"); try { // Start transaction for atomic migration unifiedDb.run("BEGIN TRANSACTION"); console.log("šŸ“¦ Migrating from prosody.db..."); // 1. Migrate languages (combining both sources, preferring prosody.db) console.log(" → Migrating categories..."); // categories const prosodyCats = prosodyDb .query("SELECT * FROM categories") .all() as any[]; const insertCat = unifiedDb.prepare( `INSERT INTO categories (name) VALUES (?) ON CONFLICT(name) DO NOTHING RETURNING rowid `, ); for (const cat of prosodyCats) { const cr = insertCat.run(cat.name.toLowerCase().trim()); // console.log({ cr }); if (cr.changes !== 1) console.log({ cr }); } console.log(` āœ“ Migrated ${prosodyCats.length} categories`); console.log(" → Migrating languages..."); const prosodyLangs = prosodyDb .query("SELECT * FROM languages") .all() as any[]; // Add thaiphon languages first // Override with prosody languages (they have more complete data) for (const lang of prosodyLangs) { // Insert merged languages const insertLang = unifiedDb.prepare( "INSERT OR IGNORE INTO languages (code, name, native_name, iso6392) VALUES (?, ?, ?, ?)", ); insertLang.run(lang.code, lang.name, lang.native_name, null); } console.log(` āœ“ Migrated ${prosodyLangs.length} languages`); // 2. Migrate expressions from prosody.db console.log(" → Migrating expressions..."); const expressions = prosodyDb .query("SELECT * FROM expressions") .all() as any[]; const insertExpression = unifiedDb.prepare( "INSERT INTO expressions (spelling, lang, frequency, type, notes) VALUES (?, ?, ?, ?, ?)", ); for (const expr of expressions) { const ei = insertExpression.run( expr.spelling, expr.lang, expr.frequency, expr.type, expr.notes, ); if (ei.changes > 0) expressionIdMap.set(expr.id, Number(ei.lastInsertRowid)); } const insertExample = unifiedDb.prepare( "INSERT INTO examples(sid, example, ref) VALUES (?, ?, ?)", ); const insertSubsense = unifiedDb.prepare( "INSERT INTO subsenses (sid, gloss) VALUES (?, ?)", ); const insertDerivation = unifiedDb.prepare( "INSERT INTO derivation (sid, type, text) VALUES (?, ?, ?)", ); const insertCatw = unifiedDb.prepare( "INSERT OR IGNORE INTO word_categories(category, word_id) VALUES (?, ?)", ); const insertWp = unifiedDb.prepare( "INSERT INTO word_phonetics(word_id, sense_id, ipa, ipa_sequence, syllable_count, syllable_sequence, stressed, tone_sequence, tag, notes) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", ); console.log(" → Migrating senses..."); const senses = prosodyDb.query("SELECT * FROM senses").all() as any[]; const insertSense = unifiedDb.prepare( "INSERT INTO senses (parent_id, spelling, pos, etymology, confidence) VALUES (?, ?, ?, ?, ?)", ); for (const sense of senses) { const parentId = expressionIdMap.get(sense.parent_id); const senseRow = insertSense.run( parentId, sense.spelling, sense.pos, sense.etymology, sense.confidence, ); const senseId = senseRow.lastInsertRowid; senseIdMap.set(sense.id, Number(senseId)); const deriv = ( name: string, type: Array<{ word: string }> | undefined, ) => { if (type) for (const t of type) insertDerivation.run(senseId, name, t.word); }; const subsenses = JSON.parse(sense.senses); for (const ss of subsenses) { const { examples, glosses, tags, synonyms, antonyms, hypernyms, meronyms, hyponyms, holonyms, } = ss; if (examples) for (const ex of examples) { const ref = ex.ref || null; if (ex.text) insertExample.run(senseId, ex.text, ref); // else console.log("bad example", ex); } if (glosses) for (const gloss of glosses) insertSubsense.run(senseId, gloss); if (tags) for (const tag of tags) { const lw = tag.toLowerCase().trim(); insertCat.run(lw); insertCatw.run(lw, senseId); } if (synonyms) deriv("synonym", synonyms); if (antonyms) deriv("antonym", antonyms); if (hypernyms) deriv("hypernym", hypernyms); if (meronyms) deriv("meronym", meronyms); if (hyponyms) deriv("hyponym", hyponyms); if (holonyms) deriv("holonym", holonyms); } const prosody = JSON.parse(sense.prosody); const stressed = prosody?.stressedSyllable || null; if (!sense.ipa) { console.log("no ipa", sense); continue; } const ipa = JSON.parse(sense.ipa); for (const ip of ipa) { insertWp.run( parentId, senseId, ip.ipa, ip.ipa, 0, "", stressed, "", ip.tag, null, ); } } console.log(` āœ“ Migrated ${senses.length} senses`); console.log("\nšŸ“¦ Migrating from thaiphon.db..."); // 6. Migrate phonetic components from thaiphon.db console.log(" → Migrating phonetic components..."); // Tones const tones = thaiphonDb.query("SELECT * FROM tones").all() as any; const insertTone = unifiedDb.prepare( "INSERT INTO tones (id, ipa, lang, name, nums) VALUES (?, ?, ?, ?, ?)", ); for (const tone of tones) { insertTone.run(tone.id, tone.ipa, tone.lang, tone.name, tone.nums); } console.log(` āœ“ Migrated ${tones.length} tones`); // Onsets const onsets = thaiphonDb.query("SELECT * FROM onsets").all() as any; const insertOnset = unifiedDb.prepare( "INSERT INTO onsets (id, ipa, text, lang) VALUES (?, ?, ?, ?)", ); for (const onset of onsets) { insertOnset.run(onset.id, onset.ipa, onset.text, onset.lang); } console.log(` āœ“ Migrated ${onsets.length} onsets`); // Medials const medials = thaiphonDb.query("SELECT * FROM medials").all() as any; const insertMedial = unifiedDb.prepare( "INSERT INTO medials (id, ipa, text, lang) VALUES (?, ?, ?, ?)", ); for (const medial of medials) { insertMedial.run(medial.id, medial.ipa, medial.text, medial.lang) as any; } console.log(` āœ“ Migrated ${medials.length} medials`); // Nucleus const nucleus = thaiphonDb.query("SELECT * FROM nucleus").all() as any; const insertNucleus = unifiedDb.prepare( "INSERT INTO nucleus (id, ipa, text, lang) VALUES (?, ?, ?, ?)", ); for (const nucl of nucleus) { insertNucleus.run(nucl.id, nucl.ipa, nucl.text, nucl.lang); } console.log(` āœ“ Migrated ${nucleus.length} nucleus`); // Codas const codas = thaiphonDb.query("SELECT * FROM codas").all() as any; const insertCoda = unifiedDb.prepare( "INSERT INTO codas (id, ipa, text, lang) VALUES (?, ?, ?, ?)", ); for (const coda of codas) { insertCoda.run(coda.id, coda.ipa, coda.text, coda.lang) as any; } console.log(` āœ“ Migrated ${codas.length} codas`); // Rhymes const rhymes = thaiphonDb.query("SELECT * FROM rhymes").all() as any; const insertRhyme = unifiedDb.prepare( "INSERT INTO rhymes (id, ipa, text, lang) VALUES (?, ?, ?, ?)", ); for (const rhyme of rhymes) { insertRhyme.run(rhyme.id, rhyme.ipa, rhyme.text, rhyme.lang); } console.log(` āœ“ Migrated ${rhymes.length} rhymes`); // 7. Migrate syllables from thaiphon.db console.log(" → Migrating syllables..."); const syllables = thaiphonDb.query("SELECT * FROM syllables").all() as any; const insertSyllable = unifiedDb.prepare( "INSERT INTO syllables (id, lang, ipa, long, text, onset, medial, nucleus, coda, rhyme, tone, notes) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", ); for (const syl of syllables) { insertSyllable.run( syl.id, syl.lang, syl.ipa, syl.long, syl.text, syl.onset, syl.medial, syl.nucleus, syl.coda, syl.rhyme, syl.tone, syl.notes, ); } console.log(` āœ“ Migrated ${syllables.length} syllables`); // 8. Migrate words from thaiphon.db (as expressions if not exists) console.log(" → Migrating words as expressions..."); const words = thaiphonDb.query("SELECT * FROM words").all() as any[]; const insertWordAsExpr = unifiedDb.query( `INSERT INTO expressions (spelling, lang, frequency, type, notes) VALUES (?, ?, ?, ?, ?) ON CONFLICT(spelling, lang) DO UPDATE SET spelling = excluded.spelling RETURNING rowid`, ); // Also keep track for syllables_words mapping let prev: any = null; for (const word of words) { const result = insertWordAsExpr.get( word.spelling, word.lang, word.frequency, "word", word.notes, ) as { id: number }; // 41085 console.log({ result }); // if (prev === result.id) { // console.log(word, result); // throw new Error("fucked up again"); // } // prev = result.id; wordExpressionMap.set(word.id, result.id); } console.log(` āœ“ Processed ${words.length} words`); console.log(" → Migrating word phonetics..."); const wordPhonetics = thaiphonDb .query("SELECT * FROM word_phonetics") .all() as any[]; const insertWordPhonetic = unifiedDb.prepare( "INSERT INTO word_phonetics (word_id, sense_id, ipa, syllable_count, syllable_sequence, tone_sequence, ipa_sequence, tag, notes) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)", ); // TODO find the sense id later I guess const wpMap: Map = new Map(); for (const wp of wordPhonetics) { const newWordId = wordExpressionMap.get(wp.word_id); if (newWordId) { const wr = insertWordPhonetic.run( newWordId, null, wp.ipa, wp.syllable_count, wp.syllable_sequence, wp.tone_sequence, wp.ipa_sequence, wp.tag, wp.notes, ); if (wr.changes !== 0) wpMap.set(wp.word_id, Number(wr.lastInsertRowid)); } } // 9. Migrate syllables_words junction console.log(" → Migrating syllables_words..."); const sylWords = thaiphonDb .query("SELECT * FROM syllables_words") .all() as any; const insertSylWord = unifiedDb.prepare( "INSERT OR IGNORE INTO syllables_words (syl_id, word_id, idx, stressed) VALUES (?, ?, ?, ?)", ); for (const sw of sylWords) { const newWordId = wpMap.get(sw.word_id); if (newWordId) insertSylWord.run(sw.syl_id, newWordId, sw.idx, sw.stressed); } console.log( ` āœ“ Migrated ${sylWords.length} syllable-word relationships`, ); unifiedDb.run("COMMIT"); console.log("\nāœ… Migration completed successfully!"); console.log(" Unified database is ready at:", UNIFIED_DB); } catch (error) { console.error("\nāŒ Migration failed:", error); unifiedDb.run("ROLLBACK"); throw error; } finally { // Close all databases prosodyDb.close(); // thaiphonDb.close(); unifiedDb.close(); } } // Run the migration migrateSenses(); // migrateTones(); // test(); // async function test() { console.log("Starting database migration to unified.db...\n"); // Open all databases const prosodyDb = new Database(PROSODY_DB, { readonly: true }); const thaiphonDb = new Database(THAIPHON_DB, { readonly: true }); const unifiedDb = new Database(UNIFIED_DB, { readonly: true }); // Enable foreign keys and WAL mode for unified database unifiedDb.run("PRAGMA foreign_keys = ON"); unifiedDb.run("PRAGMA journal_mode = WAL"); const oldE = prosodyDb.query("SELECT COUNT(*) FROM expressions").get(); const oldS = prosodyDb.query("SELECT COUNT(*) FROM senses").get(); const oldW = thaiphonDb.query("SELECT COUNT(*) FROM words").get(); const oldWp = thaiphonDb.query("SELECT COUNT(*) FROM word_phonetics").get(); const oldSw = thaiphonDb.query("SELECT COUNT(*) FROM syllables_words").get(); const oldSy = thaiphonDb.query("SELECT COUNT(*) FROM syllables").get(); const oldTo = thaiphonDb.query("SELECT COUNT(*) FROM tones").get(); const oldCod = thaiphonDb.query("SELECT COUNT(*) FROM codas").get(); const oldNuc = thaiphonDb.query("SELECT COUNT(*) FROM nucleus").get(); const oldOns = thaiphonDb.query("SELECT COUNT(*) FROM onsets").get(); const oldRhy = thaiphonDb.query("SELECT COUNT(*) FROM rhymes").get(); const oldWrh = thaiphonDb.query("SELECT COUNT(*) FROM word_rhymes").get(); const oldWry = thaiphonDb.query("SELECT COUNT(*) FROM words_wrhymes").get(); const newSw = unifiedDb.query("SELECT COUNT(*) FROM syllables_words").get(); const newSy = unifiedDb.query("SELECT COUNT(*) FROM syllables").get(); const newTo = unifiedDb.query("SELECT COUNT(*) FROM tones").get(); const newCod = unifiedDb.query("SELECT COUNT(*) FROM codas").get(); const newNuc = unifiedDb.query("SELECT COUNT(*) FROM nucleus").get(); const newOns = unifiedDb.query("SELECT COUNT(*) FROM onsets").get(); const newRhy = unifiedDb.query("SELECT COUNT(*) FROM rhymes").get(); const newWrh = unifiedDb.query("SELECT COUNT(*) FROM word_rhymes").get(); const newWry = unifiedDb.query("SELECT COUNT(*) FROM words_wrhymes").get(); const newE = unifiedDb.query("SELECT COUNT(*) FROM expressions").get(); const newS = unifiedDb.query("SELECT COUNT(*) FROM senses").get(); const newWp = unifiedDb.query("SELECT COUNT(*) FROM word_phonetics").get(); console.log({ oldE, oldS, oldW, oldWp, newE, newS, newWp, oldSw, oldSy, oldTo, oldCod, oldNuc, oldOns, oldRhy, oldWrh, oldWry, newSw, newSy, newTo, newCod, newNuc, newOns, newRhy, newWrh, newWry, }); }