diff options
Diffstat (limited to 'packages/db/src/migration.ts')
| -rw-r--r-- | packages/db/src/migration.ts | 430 |
1 files changed, 430 insertions, 0 deletions
diff --git a/packages/db/src/migration.ts b/packages/db/src/migration.ts new file mode 100644 index 0000000..c4150f8 --- /dev/null +++ b/packages/db/src/migration.ts @@ -0,0 +1,430 @@ +import { Database } from "bun:sqlite"; + +const PROSODY_DB = "/home/y/code/bun/ssr/claudesorlang/bulkdata/prosody.db"; +const THAIPHON_DB = "/home/y/code/bun/ssr/claudesorlang/bulkdata/thaiphon.db"; +const UNIFIED_DB = "/home/y/code/bun/ssr/claudesorlang/bulkdata/unified.db"; + +async function migrateSenses() { + console.log("Starting database migration to unified.db...\n"); + const expressionIdMap: Map<number, number> = new Map(); + const senseIdMap: Map<number, number> = new Map(); + const wordExpressionMap: Map<number, number> = new Map(); + + // Open all databases + const prosodyDb = new Database(PROSODY_DB, { readonly: true }); + const thaiphonDb = new Database(THAIPHON_DB, { readonly: true }); + const unifiedDb = new Database(UNIFIED_DB); + + // Enable foreign keys and WAL mode for unified database + unifiedDb.run("PRAGMA foreign_keys = ON"); + unifiedDb.run("PRAGMA journal_mode = WAL"); + + try { + // Start transaction for atomic migration + unifiedDb.run("BEGIN TRANSACTION"); + + console.log("š¦ Migrating from prosody.db..."); + + // 1. Migrate languages (combining both sources, preferring prosody.db) + console.log(" ā Migrating categories..."); + // categories + const prosodyCats = prosodyDb + .query("SELECT * FROM categories") + .all() as any[]; + const insertCat = unifiedDb.prepare( + `INSERT INTO categories (name) VALUES (?) + ON CONFLICT(name) DO NOTHING + RETURNING rowid + `, + ); + for (const cat of prosodyCats) { + const cr = insertCat.run(cat.name.toLowerCase().trim()); + // console.log({ cr }); + if (cr.changes !== 1) console.log({ cr }); + } + console.log(` ā Migrated ${prosodyCats.length} categories`); + + console.log(" ā Migrating languages..."); + const prosodyLangs = prosodyDb + .query("SELECT * FROM languages") + .all() as any[]; + + // Add thaiphon languages first + // Override with prosody languages (they have more complete data) + for (const lang of prosodyLangs) { + // Insert merged languages + const insertLang = unifiedDb.prepare( + "INSERT OR IGNORE INTO languages (code, name, native_name, iso6392) VALUES (?, ?, ?, ?)", + ); + + insertLang.run(lang.code, lang.name, lang.native_name, null); + } + console.log(` ā Migrated ${prosodyLangs.length} languages`); + + // 2. Migrate expressions from prosody.db + console.log(" ā Migrating expressions..."); + const expressions = prosodyDb + .query("SELECT * FROM expressions") + .all() as any[]; + const insertExpression = unifiedDb.prepare( + "INSERT INTO expressions (spelling, lang, frequency, type, notes) VALUES (?, ?, ?, ?, ?)", + ); + + for (const expr of expressions) { + const ei = insertExpression.run( + expr.spelling, + expr.lang, + expr.frequency, + expr.type, + expr.notes, + ); + if (ei.changes > 0) + expressionIdMap.set(expr.id, Number(ei.lastInsertRowid)); + } + + const insertExample = unifiedDb.prepare( + "INSERT INTO examples(sid, example, ref) VALUES (?, ?, ?)", + ); + const insertSubsense = unifiedDb.prepare( + "INSERT INTO subsenses (sid, gloss) VALUES (?, ?)", + ); + const insertDerivation = unifiedDb.prepare( + "INSERT INTO derivation (sid, type, text) VALUES (?, ?, ?)", + ); + const insertCatw = unifiedDb.prepare( + "INSERT OR IGNORE INTO word_categories(category, word_id) VALUES (?, ?)", + ); + const insertWp = unifiedDb.prepare( + "INSERT INTO word_phonetics(word_id, sense_id, ipa, ipa_sequence, syllable_count, syllable_sequence, stressed, tone_sequence, tag, notes) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", + ); + console.log(" ā Migrating senses..."); + const senses = prosodyDb.query("SELECT * FROM senses").all() as any[]; + const insertSense = unifiedDb.prepare( + "INSERT INTO senses (parent_id, spelling, pos, etymology, confidence) VALUES (?, ?, ?, ?, ?)", + ); + + for (const sense of senses) { + const parentId = expressionIdMap.get(sense.parent_id); + const senseRow = insertSense.run( + parentId, + sense.spelling, + sense.pos, + sense.etymology, + sense.confidence, + ); + const senseId = senseRow.lastInsertRowid; + senseIdMap.set(sense.id, Number(senseId)); + const deriv = ( + name: string, + type: Array<{ word: string }> | undefined, + ) => { + if (type) + for (const t of type) insertDerivation.run(senseId, name, t.word); + }; + const subsenses = JSON.parse(sense.senses); + for (const ss of subsenses) { + const { + examples, + glosses, + tags, + synonyms, + antonyms, + hypernyms, + meronyms, + hyponyms, + holonyms, + } = ss; + if (examples) + for (const ex of examples) { + const ref = ex.ref || null; + + if (ex.text) insertExample.run(senseId, ex.text, ref); + // else console.log("bad example", ex); + } + if (glosses) + for (const gloss of glosses) insertSubsense.run(senseId, gloss); + if (tags) + for (const tag of tags) { + const lw = tag.toLowerCase().trim(); + insertCat.run(lw); + insertCatw.run(lw, senseId); + } + if (synonyms) deriv("synonym", synonyms); + if (antonyms) deriv("antonym", antonyms); + if (hypernyms) deriv("hypernym", hypernyms); + if (meronyms) deriv("meronym", meronyms); + if (hyponyms) deriv("hyponym", hyponyms); + if (holonyms) deriv("holonym", holonyms); + } + const prosody = JSON.parse(sense.prosody); + const stressed = prosody?.stressedSyllable || null; + if (!sense.ipa) { + console.log("no ipa", sense); + continue; + } + const ipa = JSON.parse(sense.ipa); + for (const ip of ipa) { + insertWp.run( + parentId, + senseId, + ip.ipa, + ip.ipa, + 0, + "", + stressed, + "", + ip.tag, + null, + ); + } + } + console.log(` ā Migrated ${senses.length} senses`); + + console.log("\nš¦ Migrating from thaiphon.db..."); + + // 6. Migrate phonetic components from thaiphon.db + console.log(" ā Migrating phonetic components..."); + + // Tones + const tones = thaiphonDb.query("SELECT * FROM tones").all() as any; + const insertTone = unifiedDb.prepare( + "INSERT INTO tones (id, ipa, lang, name, nums) VALUES (?, ?, ?, ?, ?)", + ); + for (const tone of tones) { + insertTone.run(tone.id, tone.ipa, tone.lang, tone.name, tone.nums); + } + console.log(` ā Migrated ${tones.length} tones`); + + // Onsets + const onsets = thaiphonDb.query("SELECT * FROM onsets").all() as any; + const insertOnset = unifiedDb.prepare( + "INSERT INTO onsets (id, ipa, text, lang) VALUES (?, ?, ?, ?)", + ); + for (const onset of onsets) { + insertOnset.run(onset.id, onset.ipa, onset.text, onset.lang); + } + console.log(` ā Migrated ${onsets.length} onsets`); + + // Medials + const medials = thaiphonDb.query("SELECT * FROM medials").all() as any; + const insertMedial = unifiedDb.prepare( + "INSERT INTO medials (id, ipa, text, lang) VALUES (?, ?, ?, ?)", + ); + for (const medial of medials) { + insertMedial.run(medial.id, medial.ipa, medial.text, medial.lang) as any; + } + console.log(` ā Migrated ${medials.length} medials`); + + // Nucleus + const nucleus = thaiphonDb.query("SELECT * FROM nucleus").all() as any; + const insertNucleus = unifiedDb.prepare( + "INSERT INTO nucleus (id, ipa, text, lang) VALUES (?, ?, ?, ?)", + ); + for (const nucl of nucleus) { + insertNucleus.run(nucl.id, nucl.ipa, nucl.text, nucl.lang); + } + console.log(` ā Migrated ${nucleus.length} nucleus`); + + // Codas + const codas = thaiphonDb.query("SELECT * FROM codas").all() as any; + const insertCoda = unifiedDb.prepare( + "INSERT INTO codas (id, ipa, text, lang) VALUES (?, ?, ?, ?)", + ); + for (const coda of codas) { + insertCoda.run(coda.id, coda.ipa, coda.text, coda.lang) as any; + } + console.log(` ā Migrated ${codas.length} codas`); + + // Rhymes + const rhymes = thaiphonDb.query("SELECT * FROM rhymes").all() as any; + const insertRhyme = unifiedDb.prepare( + "INSERT INTO rhymes (id, ipa, text, lang) VALUES (?, ?, ?, ?)", + ); + for (const rhyme of rhymes) { + insertRhyme.run(rhyme.id, rhyme.ipa, rhyme.text, rhyme.lang); + } + console.log(` ā Migrated ${rhymes.length} rhymes`); + + // 7. Migrate syllables from thaiphon.db + console.log(" ā Migrating syllables..."); + const syllables = thaiphonDb.query("SELECT * FROM syllables").all() as any; + const insertSyllable = unifiedDb.prepare( + "INSERT INTO syllables (id, lang, ipa, long, text, onset, medial, nucleus, coda, rhyme, tone, notes) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", + ); + for (const syl of syllables) { + insertSyllable.run( + syl.id, + syl.lang, + syl.ipa, + syl.long, + syl.text, + syl.onset, + syl.medial, + syl.nucleus, + syl.coda, + syl.rhyme, + syl.tone, + syl.notes, + ); + } + console.log(` ā Migrated ${syllables.length} syllables`); + + // 8. Migrate words from thaiphon.db (as expressions if not exists) + console.log(" ā Migrating words as expressions..."); + const words = thaiphonDb.query("SELECT * FROM words").all() as any[]; + const insertWordAsExpr = unifiedDb.query( + `INSERT INTO expressions (spelling, lang, frequency, type, notes) VALUES (?, ?, ?, ?, ?) + ON CONFLICT(spelling, lang) DO UPDATE SET spelling = excluded.spelling + RETURNING rowid`, + ); + + // Also keep track for syllables_words mapping + let prev: any = null; + for (const word of words) { + const result = insertWordAsExpr.get( + word.spelling, + word.lang, + word.frequency, + "word", + word.notes, + ) as { id: number }; + // 41085 + console.log({ result }); + // if (prev === result.id) { + // console.log(word, result); + // throw new Error("fucked up again"); + // } + // prev = result.id; + wordExpressionMap.set(word.id, result.id); + } + console.log(` ā Processed ${words.length} words`); + + console.log(" ā Migrating word phonetics..."); + const wordPhonetics = thaiphonDb + .query("SELECT * FROM word_phonetics") + .all() as any[]; + const insertWordPhonetic = unifiedDb.prepare( + "INSERT INTO word_phonetics (word_id, sense_id, ipa, syllable_count, syllable_sequence, tone_sequence, ipa_sequence, tag, notes) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)", + ); + // TODO find the sense id later I guess + const wpMap: Map<number, number> = new Map(); + for (const wp of wordPhonetics) { + const newWordId = wordExpressionMap.get(wp.word_id); + if (newWordId) { + const wr = insertWordPhonetic.run( + newWordId, + null, + wp.ipa, + wp.syllable_count, + wp.syllable_sequence, + wp.tone_sequence, + wp.ipa_sequence, + wp.tag, + wp.notes, + ); + if (wr.changes !== 0) wpMap.set(wp.word_id, Number(wr.lastInsertRowid)); + } + } + + // 9. Migrate syllables_words junction + console.log(" ā Migrating syllables_words..."); + const sylWords = thaiphonDb + .query("SELECT * FROM syllables_words") + .all() as any; + const insertSylWord = unifiedDb.prepare( + "INSERT OR IGNORE INTO syllables_words (syl_id, word_id, idx, stressed) VALUES (?, ?, ?, ?)", + ); + for (const sw of sylWords) { + const newWordId = wpMap.get(sw.word_id); + if (newWordId) + insertSylWord.run(sw.syl_id, newWordId, sw.idx, sw.stressed); + } + console.log( + ` ā Migrated ${sylWords.length} syllable-word relationships`, + ); + + unifiedDb.run("COMMIT"); + + console.log("\nā
Migration completed successfully!"); + console.log(" Unified database is ready at:", UNIFIED_DB); + } catch (error) { + console.error("\nā Migration failed:", error); + unifiedDb.run("ROLLBACK"); + throw error; + } finally { + // Close all databases + prosodyDb.close(); + // thaiphonDb.close(); + unifiedDb.close(); + } +} + +// Run the migration +migrateSenses(); +// migrateTones(); +// test(); +// + +async function test() { + console.log("Starting database migration to unified.db...\n"); + // Open all databases + const prosodyDb = new Database(PROSODY_DB, { readonly: true }); + const thaiphonDb = new Database(THAIPHON_DB, { readonly: true }); + const unifiedDb = new Database(UNIFIED_DB, { readonly: true }); + + // Enable foreign keys and WAL mode for unified database + unifiedDb.run("PRAGMA foreign_keys = ON"); + unifiedDb.run("PRAGMA journal_mode = WAL"); + const oldE = prosodyDb.query("SELECT COUNT(*) FROM expressions").get(); + const oldS = prosodyDb.query("SELECT COUNT(*) FROM senses").get(); + const oldW = thaiphonDb.query("SELECT COUNT(*) FROM words").get(); + const oldWp = thaiphonDb.query("SELECT COUNT(*) FROM word_phonetics").get(); + const oldSw = thaiphonDb.query("SELECT COUNT(*) FROM syllables_words").get(); + const oldSy = thaiphonDb.query("SELECT COUNT(*) FROM syllables").get(); + const oldTo = thaiphonDb.query("SELECT COUNT(*) FROM tones").get(); + const oldCod = thaiphonDb.query("SELECT COUNT(*) FROM codas").get(); + const oldNuc = thaiphonDb.query("SELECT COUNT(*) FROM nucleus").get(); + const oldOns = thaiphonDb.query("SELECT COUNT(*) FROM onsets").get(); + const oldRhy = thaiphonDb.query("SELECT COUNT(*) FROM rhymes").get(); + const oldWrh = thaiphonDb.query("SELECT COUNT(*) FROM word_rhymes").get(); + const oldWry = thaiphonDb.query("SELECT COUNT(*) FROM words_wrhymes").get(); + const newSw = unifiedDb.query("SELECT COUNT(*) FROM syllables_words").get(); + const newSy = unifiedDb.query("SELECT COUNT(*) FROM syllables").get(); + const newTo = unifiedDb.query("SELECT COUNT(*) FROM tones").get(); + const newCod = unifiedDb.query("SELECT COUNT(*) FROM codas").get(); + const newNuc = unifiedDb.query("SELECT COUNT(*) FROM nucleus").get(); + const newOns = unifiedDb.query("SELECT COUNT(*) FROM onsets").get(); + const newRhy = unifiedDb.query("SELECT COUNT(*) FROM rhymes").get(); + const newWrh = unifiedDb.query("SELECT COUNT(*) FROM word_rhymes").get(); + const newWry = unifiedDb.query("SELECT COUNT(*) FROM words_wrhymes").get(); + const newE = unifiedDb.query("SELECT COUNT(*) FROM expressions").get(); + const newS = unifiedDb.query("SELECT COUNT(*) FROM senses").get(); + const newWp = unifiedDb.query("SELECT COUNT(*) FROM word_phonetics").get(); + console.log({ + oldE, + oldS, + oldW, + oldWp, + newE, + newS, + newWp, + oldSw, + oldSy, + oldTo, + oldCod, + oldNuc, + oldOns, + oldRhy, + oldWrh, + oldWry, + newSw, + newSy, + newTo, + newCod, + newNuc, + newOns, + newRhy, + newWrh, + newWry, + }); +} |
