From a3f24ea79b14394b24c4b60a010651eb29eeb872 Mon Sep 17 00:00:00 2001 From: polwex Date: Thu, 29 May 2025 12:10:22 +0700 Subject: glorious new db --- src/lib/db/codes.js | 203 ++++++++++++++++++++++++++++++++++++ src/lib/db/index.ts | 8 +- src/lib/db/prosodydb.ts | 238 +++++++++++++++++++++++++++++++++++++++++++ src/lib/db/prosodyschema.sql | 178 ++++++++++++++++++++++++++++++++ src/lib/db/seed.ts | 212 +++++++++++++++++++++++++++++++++++++- src/lib/db/senseschema.sql | 54 ++++++++++ 6 files changed, 891 insertions(+), 2 deletions(-) create mode 100644 src/lib/db/codes.js create mode 100644 src/lib/db/prosodydb.ts create mode 100644 src/lib/db/prosodyschema.sql create mode 100644 src/lib/db/senseschema.sql (limited to 'src/lib/db') diff --git a/src/lib/db/codes.js b/src/lib/db/codes.js new file mode 100644 index 0000000..bef3e1b --- /dev/null +++ b/src/lib/db/codes.js @@ -0,0 +1,203 @@ + + +const ALL_LANGUAGE_CODES = { + iso639_1: [], + iso639_2_T: [], // Terminology codes + iso639_2_B: [], // Bibliographic codes + iso639_3: [], + iso639_5: [], + bcp47_language_subtags: [], // Primary language subtags from IANA +}; + +const LOC_ISO639_2_URL = 'https://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt'; +// For ISO 639-3, SIL provides dated files. This is the structure of the comprehensive file. +// The actual filename changes with each update (e.g., iso-639-3_20240123.tab). +// You might need to go to https://iso639-3.sil.org/code_tables/download_tables and get the current link +// for the "Complete Code Tables" zip, then extract the main .tab file. +// For this script, I'll use a link to one specific (potentially older) version for demonstration. +// A more robust solution would involve downloading and unzipping the latest. +// This link points to the main table that includes mappings. +const SIL_ISO639_3_URL = 'https://iso639-3.sil.org/sites/iso639-3/files/downloads/iso-639-3_20240701.tab'; // Example: replace with current +const LOC_ISO639_5_URL = 'https://www.loc.gov/standards/iso639-5/iso639-5.tsv'; // TSV format +const IANA_LANGUAGE_SUBTAG_REGISTRY_URL = 'https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry'; + +async function fetchAndParseISO639_1_2() { + try { + console.log('Fetching ISO 639-1 & 639-2 codes from LOC...'); + const response = await fetch(LOC_ISO639_2_URL); + if (!response.ok) throw new Error(`Failed to fetch ISO 639-1/2: ${response.statusText}`); + const text = await response.text(); + + const lines = text.trim().split('\n'); + lines.forEach(line => { + // Format: alpha3-b|alpha3-t|alpha2|english_name|french_name + const parts = line.split('|'); + if (parts.length >= 4) { + const alpha3_b = parts[0].trim(); + const alpha3_t = parts[1].trim(); + const alpha2 = parts[2].trim(); + const englishName = parts[3].trim(); + + if (alpha3_b) { // Bibliographic code + ALL_LANGUAGE_CODES.iso639_2_B.push({ code: alpha3_b, name: englishName }); + } + if (alpha3_t) { // Terminology code + ALL_LANGUAGE_CODES.iso639_2_T.push({ code: alpha3_t, name: englishName }); + } + if (alpha2) { // Alpha-2 code + ALL_LANGUAGE_CODES.iso639_1.push({ code: alpha2, name: englishName }); + } + } + }); + console.log(`Fetched ${ALL_LANGUAGE_CODES.iso639_1.length} ISO 639-1 codes.`); + console.log(`Fetched ${ALL_LANGUAGE_CODES.iso639_2_B.length} ISO 639-2/B codes.`); + console.log(`Fetched ${ALL_LANGUAGE_CODES.iso639_2_T.length} ISO 639-2/T codes.`); + } catch (error) { + console.error('Error fetching ISO 639-1/2 codes:', error.message); + } +} + +async function fetchAndParseISO639_3() { + try { + console.log('Fetching ISO 639-3 codes from SIL...'); + const response = await fetch(SIL_ISO639_3_URL); + if (!response.ok) throw new Error(`Failed to fetch ISO 639-3: ${response.statusText}`); + const text = await response.text(); + + const lines = text.trim().split('\n'); + const header = lines.shift().split('\t'); // Remove header line + // Expected header fields (order matters): + // Id (3-letter code) | Part2B | Part2T | Part1 | Scope | Language_Type | Ref_Name | Comment + const idIndex = header.indexOf('Id'); + const refNameIndex = header.indexOf('Ref_Name'); + const part1Index = header.indexOf('Part1'); // For cross-referencing ISO 639-1 + + if (idIndex === -1 || refNameIndex === -1) { + throw new Error('ISO 639-3 header format mismatch. Expected "Id" and "Ref_Name" columns.'); + } + + lines.forEach(line => { + const parts = line.split('\t'); + const code = parts[idIndex]?.trim(); + const name = parts[refNameIndex]?.trim(); + const part1Code = parts[part1Index]?.trim(); + + if (code && name) { + ALL_LANGUAGE_CODES.iso639_3.push({ code, name }); + + // Also, let's try to get more complete ISO 639-1 from this source + // as it might be more comprehensive than LOC's where 639-1 is only if 639-2 exists + if (part1Code && !ALL_LANGUAGE_CODES.iso639_1.find(c => c.code === part1Code)) { + ALL_LANGUAGE_CODES.iso639_1.push({ code: part1Code, name }); + } + } + }); + console.log(`Fetched ${ALL_LANGUAGE_CODES.iso639_3.length} ISO 639-3 codes.`); + // Deduplicate and sort ISO 639-1 after potential additions + const uniqueIso639_1 = {}; + ALL_LANGUAGE_CODES.iso639_1.forEach(item => uniqueIso639_1[item.code] = item); + ALL_LANGUAGE_CODES.iso639_1 = Object.values(uniqueIso639_1).sort((a, b) => a.code.localeCompare(b.code)); + console.log(`Final unique ISO 639-1 count: ${ALL_LANGUAGE_CODES.iso639_1.length}.`); + + } catch (error) { + console.error('Error fetching ISO 639-3 codes:', error.message); + console.warn('Make sure the SIL_ISO639_3_URL is current or points to a valid .tab file.'); + } +} + +async function fetchAndParseISO639_5() { + try { + console.log('Fetching ISO 639-5 codes from LOC...'); + const response = await fetch(LOC_ISO639_5_URL); + if (!response.ok) throw new Error(`Failed to fetch ISO 639-5: ${response.statusText}`); + const text = await response.text(); + + const lines = text.trim().split('\n'); + lines.shift(); // Remove header line: URI Code Label_en + + lines.forEach(line => { + const parts = line.split('\t'); + // URI | Code | Label_en | Label_fr ... + if (parts.length >= 3) { + const code = parts[1].trim(); + const name = parts[2].trim(); + if (code && name) { + ALL_LANGUAGE_CODES.iso639_5.push({ code, name }); + } + } + }); + console.log(`Fetched ${ALL_LANGUAGE_CODES.iso639_5.length} ISO 639-5 codes (language families/groups).`); + } catch (error) { + console.error('Error fetching ISO 639-5 codes:', error.message); + } +} + +async function fetchAndParseIANALanguageSubtags() { + try { + console.log('Fetching IANA Language Subtag Registry...'); + const response = await fetch(IANA_LANGUAGE_SUBTAG_REGISTRY_URL); + if (!response.ok) throw new Error(`Failed to fetch IANA registry: ${response.statusText}`); + const text = await response.text(); + + const entries = text.split('%%'); // Entries are separated by %% + entries.forEach(entry => { + const lines = entry.trim().split('\n'); + let type = ''; + let subtag = ''; + let description = ''; + + lines.forEach(line => { + if (line.startsWith('Type:')) { + type = line.substring(5).trim(); + } else if (line.startsWith('Subtag:')) { + subtag = line.substring(7).trim(); + } else if (line.startsWith('Description:')) { + // Description can span multiple lines, but for simplicity, we take the first + if (!description) description = line.substring(12).trim(); + } + }); + + if (type === 'language' && subtag && description) { + ALL_LANGUAGE_CODES.bcp47_language_subtags.push({ + code: subtag, + name: description + }); + } + }); + console.log(`Fetched ${ALL_LANGUAGE_CODES.bcp47_language_subtags.length} primary language subtags from IANA.`); + } catch (error) { + console.error('Error fetching IANA Language Subtag Registry:', error.message); + } +} + + +async function main() { + console.log('Starting to fetch all language codes...\n'); + + await Promise.all([ + fetchAndParseISO639_1_2(), + fetchAndParseISO639_3(), // Run this after 1_2 to potentially augment 639-1 + fetchAndParseISO639_5(), + fetchAndParseIANALanguageSubtags() + ]); + await Bun.write("bcp.json", JSON.stringify(ALL_LANGUAGE_CODES.bcp47_language_subtags)) + + console.log('\n\n--- All Fetched Language Codes ---'); + + // Example: Print counts and first few of each + for (const [key, codes] of Object.entries(ALL_LANGUAGE_CODES)) { + console.log(`\n--- ${key} (Total: ${codes.length}) ---`); + codes.slice(0, 50).forEach(c => console.log(`${c.code}: ${c.name}`)); + if (codes.length > 50) console.log('... and more ...'); + } + + // You can now use ALL_LANGUAGE_CODES object for your needs + // e.g., save to a JSON file + // import fs from 'fs'; + // fs.writeFileSync('all_language_codes.json', JSON.stringify(ALL_LANGUAGE_CODES, null, 2)); + // console.log('\n\nSaved all codes to all_language_codes.json'); + + console.log('\nFetching complete.'); +} + +main().catch(console.error); diff --git a/src/lib/db/index.ts b/src/lib/db/index.ts index fcfab57..6bd417c 100644 --- a/src/lib/db/index.ts +++ b/src/lib/db/index.ts @@ -92,10 +92,16 @@ class DatabaseHandler { const query = this.db.query( ` SELECT * FROM expressions e - WHERE e.syllables = 1 AND e.lang = ? + WHERE e.type = 'syllable' AND e.lang = ? ORDER BY frequency DESC LIMIT ${PAGE_SIZE} ${page ? "OFFSET " + getDBOffset(page, PAGE_SIZE) : ""} `, + // ` + // SELECT * FROM expressions e + // WHERE e.syllables = 1 AND e.lang = ? + // ORDER BY frequency DESC + // LIMIT ${PAGE_SIZE} ${page ? "OFFSET " + getDBOffset(page, PAGE_SIZE) : ""} + // `, ); const results = query.all(lang); console.log({ lang, page }, "results"); diff --git a/src/lib/db/prosodydb.ts b/src/lib/db/prosodydb.ts new file mode 100644 index 0000000..b3b973b --- /dev/null +++ b/src/lib/db/prosodydb.ts @@ -0,0 +1,238 @@ +import Database from "bun:sqlite"; +type Str = string | null; +type ItemType = "word" | "syllable" | "idiom"; + +class DatabaseHandler { + db: Database; + constructor() { + const dbPath = "/home/y/code/bun/ssr/waku/bulkdata/prosodynew.db"; + const db = new Database(dbPath, { create: true }); + db.exec("PRAGMA journal_mode = WAL"); // Enable Write-Ahead Logging for better performance + db.exec("PRAGMA foreign_keys = ON"); + this.db = db; + } + async init() { + const file = Bun.file("./prosodyschema.sql"); + const sql = await file.text(); + this.db.exec(sql); + } + // selects + fetchWords(words: string[]) { + const query = this.db.query( + `SELECT id FROM words where spelling IN (${words.map((w) => `'${w}'`).join(", ")})`, + ); + return query.all() as Array<{ id: number }>; + } + // inserts + + addLanguage(code: string, name: string) { + const query = this.db + .query(`INSERT OR IGNORE INTO languages(iso6392, english) VALUES(?, ?)`) + .run(code, name); + } + addPronunciation( + type: ItemType, + parentId: number | bigint, + ipa: string, + syllables: number, + tags: Str, + notes: Str, + ) { + try { + const query = this.db + .query( + `INSERT INTO pronunciation(type, parent_id,ipa, syllables, tag, notes) VALUES(?, ?, ?, ?, ?, ?)`, + ) + .run(type, parentId, ipa, syllables, tags, notes); + } catch (e) { + // console.error(e); + } + } + addWordRhyme(wordId: number | bigint, ipa: string, lang: string, notes: Str) { + console.log("wordrhyme", notes); + try { + const query = this.db + .query( + `INSERT INTO word_rhymes(text, lang, notes) VALUES(?, ?, ?) + ON CONFLICT(text,lang) DO UPDATE SET + text = excluded.text + RETURNING rowid + `, + ) + .get(ipa, lang, notes) as { id: number }; + const query2 = this.db + .query( + ` + INSERT INTO words_idioms(word_id, idiom_id) VALUES(?, ?) + `, + ) + .run(wordId, query.id); + } catch (e) { + // console.error(e); + } + } + addIdiom(spelling: string, lang: string) { + const query = this.db.query( + `INSERT INTO idioms(spelling, lang) VALUES(?, ?)`, + ); + const res = query.run(spelling, lang); + return res; + } + findIdiomWords(spelling: string, idId: number | bigint) { + const split = spelling.split(" "); + const words = this.fetchWords(split); + console.log({ words }); + const tx = this.db.transaction(() => { + for (const w of words) { + this.db + .query( + ` + INSERT INTO words_idioms(word_id, idiom_id) VALUES(?, ?) + `, + ) + .run(w.id, idId); + } + }); + tx(); + } + findIdiomsWords() { + const rows: any = this.db.query(`SELECT id, spelling FROM idioms`); + for (const row of rows) { + this.findIdiomWords(row.spelling, row.id); + } + } + addWord(spelling: string, lang: string) { + const query = this.db.query( + // `INSERT OR IGNORE INTO words(spelling, lang) VALUES(?, ?)`, + `INSERT INTO words(spelling, lang) VALUES(?, ?)`, + ); + const res = query.run(spelling, lang); + const wordId = res.lastInsertRowid; + return wordId; + } + addSyllable( + wordId: number | bigint, + text: string, + lang: string, + long: boolean, + onset: Str, + medial: Str, + nucleus: string, + coda: Str, + rhyme: string, + tone: Str, + notes: Str, + ) { + const tx = this.db.transaction(() => { + const query = this.db.query( + `INSERT INTO syllables(text, lang, long, onset, medial, nucleus, coda, rhyme, tone, notes) VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + ); + const res = query.run( + text, + lang, + long, + onset, + medial, + nucleus, + coda, + rhyme, + tone, + notes, + ); + const sylId = res.lastInsertRowid; + + const res1 = this.db + .query(`INSERT INTO syllables_words(syl_id, word_id) VALUES(?, ?)`) + .run(sylId, wordId); + // + return sylId; + }); + const sylId = tx(); + let res1: any; + if (onset) { + res1 = this.db + .query( + `INSERT INTO onsets(text, lang) VALUES(?, ?) + ON CONFLICT(text, lang) DO UPDATE SET + text = excluded.text + RETURNING rowid + `, + ) + .get(onset, lang); + this.db + .query(`INSERT INTO onsets_syllables(syl_id, onset_id) VALUES(?, ?)`) + .run(sylId, res1.id); + } + if (medial) { + res1 = this.db + .query( + `INSERT INTO medials(text, lang) VALUES(?, ?) + ON CONFLICT(text, lang) DO UPDATE SET + text = excluded.text + RETURNING rowid + `, + ) + .get(medial, lang); + this.db + .query(`INSERT INTO medials_syllables(syl_id, medial_id) VALUES(?, ?)`) + .run(sylId, res1.id); + } + res1 = this.db + .query( + `INSERT INTO nucleus(text, lang) VALUES(?, ?) + ON CONFLICT(text, lang) DO UPDATE SET + text = excluded.text + RETURNING rowid + `, + ) + .get(nucleus, lang); + this.db + .query(`INSERT INTO nucleus_syllables(syl_id, nucleus_id) VALUES(?, ?)`) + .run(sylId, res1.id); + if (coda) { + res1 = this.db + .query( + `INSERT INTO codas(text, lang) VALUES(?, ?) + ON CONFLICT(text, lang) DO UPDATE SET + text = excluded.text + RETURNING rowid + `, + ) + .get(coda, lang); + this.db + .query(`INSERT INTO codas_syllables(syl_id, coda_id) VALUES(?, ?)`) + .run(sylId, res1.id); + } + res1 = this.db + .query( + `INSERT INTO rhymes(text, lang) VALUES(?, ?) + ON CONFLICT(text, lang) DO UPDATE SET + text = excluded.text + RETURNING rowid + `, + ) + .get(rhyme, lang); + this.db + .query(`INSERT INTO rhymes_syllables(syl_id, rhyme_id) VALUES(?, ?)`) + .run(sylId, res1.id); + if (tone) { + res1 = this.db + .query( + `INSERT INTO tones(text, lang) VALUES(?, ?) + ON CONFLICT(text, lang) DO UPDATE SET + text = excluded.text + RETURNING rowid + `, + ) + .get(tone, lang); + this.db + .query(`INSERT INTO tones_syllables(syl_id, tone_id) VALUES(?, ?)`) + .run(sylId, res1.id); + } + } + + // reads +} +const db = new DatabaseHandler(); + +export default db; diff --git a/src/lib/db/prosodyschema.sql b/src/lib/db/prosodyschema.sql new file mode 100644 index 0000000..e70b005 --- /dev/null +++ b/src/lib/db/prosodyschema.sql @@ -0,0 +1,178 @@ +-- Enable foreign key support +PRAGMA foreign_keys = ON; +PRAGMA journal_mode = WAL; +PRAGMA cache_size = -2000; +PRAGMA mmap_size = 30000000000; + + +-- proper prosody now +-- +-- +-- +CREATE TABLE IF NOT EXISTS languages( + iso6392 TEXT PRIMARY KEY, + -- bcp47 TEXT PRIMARY KEY, + -- iso6393 TEXT NOT NULL, + english TEXT NOT NULL + -- native TEXT, + -- iso6391 TEXT, + -- iso6395 TEXT, + -- glottolog TEXT +); + +CREATE TABLE IF NOT EXISTS idioms( + id INTEGER PRIMARY KEY AUTOINCREMENT, + spelling TEXT NOT NULL, + lang TEXT NOT NULL, + frequency INTEGER, + FOREIGN KEY (lang) REFERENCES languages(iso6392), + CONSTRAINT spell_unique UNIQUE (spelling, lang) +); + +CREATE INDEX IF NOT EXISTS idx_idioms_spelling ON idioms(spelling); +CREATE TABLE IF NOT EXISTS words( + id INTEGER PRIMARY KEY AUTOINCREMENT, + spelling TEXT NOT NULL, + lang TEXT NOT NULL, + frequency INTEGER, + FOREIGN KEY (lang) REFERENCES languages(iso6392), + CONSTRAINT spell_unique UNIQUE (spelling, lang) +); + +CREATE INDEX IF NOT EXISTS idx_words_spelling ON words(spelling); + +CREATE TABLE IF NOT EXISTS word_rhymes( + id INTEGER PRIMARY KEY AUTOINCREMENT, + text TEXT NOT NULL, + lang TEXT NOT NULL, + notes TEXT, + CONSTRAINT wrhyme_unique UNIQUE (text, lang) +); +CREATE TABLE IF NOT EXISTS words_rhymes( + word_id INTEGER NOT NULL, + wrhyme_id INTEGER NOT NULL, + FOREIGN KEY (word_id) REFERENCES words(id), + FOREIGN KEY (wrhyme_id) REFERENCES word_rhymes(id) +); + +-- break up syllables +CREATE TABLE IF NOT EXISTS syllables( + id INTEGER PRIMARY KEY AUTOINCREMENT, + text TEXT NOT NULL, + lang TEXT NOT NULL, + long INTEGER NOT NULL, + tone TEXT, + onset TEXT, + medial TEXT, + nucleus TEXT, + coda TEXT, + rhyme TEXT, + notes TEXT, + FOREIGN KEY (lang) REFERENCES languages(iso6392), + CONSTRAINT spell_unique UNIQUE (text, lang) +); + +CREATE TABLE IF NOT EXISTS tones( + id INTEGER PRIMARY KEY AUTOINCREMENT, + text TEXT NOT NULL, + lang TEXT NOT NULL, + name TEXT, + num INTEGER, + CONSTRAINT tone_unique UNIQUE (text, lang) +); +CREATE TABLE IF NOT EXISTS onsets( + id INTEGER PRIMARY KEY AUTOINCREMENT, + text TEXT NOT NULL, + lang TEXT NOT NULL, + CONSTRAINT onsets_unique UNIQUE (text, lang) +); +CREATE TABLE IF NOT EXISTS medials( + id INTEGER PRIMARY KEY AUTOINCREMENT, + text TEXT NOT NULL, + lang TEXT NOT NULL, + CONSTRAINT medials_unique UNIQUE (text, lang) +); +CREATE TABLE IF NOT EXISTS nucleus( + id INTEGER PRIMARY KEY AUTOINCREMENT, + text TEXT NOT NULL, + lang TEXT NOT NULL, + CONSTRAINT nucleus_unique UNIQUE (text, lang) +); +CREATE TABLE IF NOT EXISTS codas( + id INTEGER PRIMARY KEY AUTOINCREMENT, + text TEXT NOT NULL, + lang TEXT NOT NULL, + CONSTRAINT coda_unique UNIQUE (text, lang) +); +CREATE TABLE IF NOT EXISTS rhymes( + id INTEGER PRIMARY KEY AUTOINCREMENT, + text TEXT NOT NULL, + lang TEXT NOT NULL, + CONSTRAINT rhyme_unique UNIQUE (text, lang) +); + +-- join tables + +CREATE TABLE IF NOT EXISTS tones_syllables( + syl_id INTEGER NOT NULL, + tone_id INTEGER NOT NULL, + FOREIGN KEY (syl_id) REFERENCES syllables(id), + FOREIGN KEY (tone_id) REFERENCES tones(id) +); +CREATE TABLE IF NOT EXISTS onsets_syllables( + syl_id INTEGER NOT NULL, + onset_id INTEGER NOT NULL, + FOREIGN KEY (syl_id) REFERENCES syllables(id), + FOREIGN KEY (onset_id) REFERENCES onsets(id) +); +CREATE TABLE IF NOT EXISTS medials_syllables( + syl_id INTEGER NOT NULL, + medial_id INTEGER NOT NULL, + FOREIGN KEY (syl_id) REFERENCES syllables(id), + FOREIGN KEY (medial_id) REFERENCES medials(id) +); +CREATE TABLE IF NOT EXISTS nucleus_syllables( + syl_id INTEGER NOT NULL, + nucleus_id INTEGER NOT NULL, + FOREIGN KEY (syl_id) REFERENCES syllables(id), + FOREIGN KEY (nucleus_id) REFERENCES nucleus(id) +); +CREATE TABLE IF NOT EXISTS codas_syllables( + syl_id INTEGER NOT NULL, + coda_id INTEGER NOT NULL, + FOREIGN KEY (syl_id) REFERENCES syllables(id), + FOREIGN KEY (coda_id) REFERENCES codas(id) +); +CREATE TABLE IF NOT EXISTS rhymes_syllables( + syl_id INTEGER NOT NULL, + rhyme_id INTEGER NOT NULL, + FOREIGN KEY (syl_id) REFERENCES syllables(id), + FOREIGN KEY (rhyme_id) REFERENCES rhymes(id) +); + +CREATE TABLE IF NOT EXISTS syllables_words( + syl_id INTEGER NOT NULL, + word_id INTEGER NOT NULL, + FOREIGN KEY (syl_id) REFERENCES syllables(id), + FOREIGN KEY (word_id) REFERENCES words(id) +); +CREATE TABLE IF NOT EXISTS words_idioms( + word_id INTEGER NOT NULL, + idiom_id INTEGER NOT NULL, + FOREIGN KEY (idiom_id) REFERENCES idioms(id), + FOREIGN KEY (word_id) REFERENCES words(id) +); + + +-- +CREATE TABLE IF NOT EXISTS pronunciation( + id INTEGER PRIMARY KEY AUTOINCREMENT, + type TEXT CHECK(type IN ('word', 'syllable', 'idiom')) NOT NULL, + parent_id INTEGER NOT NULL, + ipa TEXT NOT NULL, + syllables INTEGER NOT NULL, + tag TEXT, + notes TEXT, + CONSTRAINT ipa_unique UNIQUE (ipa, parent_id) +); +CREATE INDEX IF NOT EXISTS idx_words_ipa ON pronunciation(ipa, parent_id); diff --git a/src/lib/db/seed.ts b/src/lib/db/seed.ts index c4094de..7f4352f 100644 --- a/src/lib/db/seed.ts +++ b/src/lib/db/seed.ts @@ -2,6 +2,8 @@ import { readWiktionaryDump } from "../services/wiki"; import { getStressedSyllable, getSyllableCount } from "../utils"; import useful from "@/lib/useful_thai.json"; import db from "."; +import pdb from "./prosodydb"; +import * as Sorsyl from "sorsyl"; const SYMBOL_REGEX = new RegExp(/[\W\d]/); @@ -483,7 +485,16 @@ function addThaiSyllablesLesson() { // } // } // } -addThaiUseful(); +function fixSyllables() { + const res = db.db.query(`SELECT ipa, syllables FROM expressions;`).all(); + for (let i = 0; i < 10; i++) { + // for (const row of res) { + const row = res[i]; + console.log({ row }); + } +} +// fixSyllables(); +// addThaiUseful(); // addThaiSyllablesLesson(); // adjustFrequency("th"); @@ -492,3 +503,202 @@ addThaiUseful(); // fillFromDump(); // thaiSyllables(); // thaiFreq(); +// +// +const SORSYL_PATH = + "/nix/store/lkyi9rrjbr619w3ivpkm89ccf93bvxx5-sorsyl-0.1.0/bin/sorsyl"; +async function redump() { + await pdb.init(); + let count = 0; + + // const soundTypes = new Set(); + // [ + // "tags", "ipa", "audio", "ogg_url", "mp3_url", "enpr", "rhymes", "homophone", "note", "zh-pron", "other", + // "text", "hangeul", "topics", "form", "audio-ipa" + // ] + for await (const line of readWiktionaryDump()) { + try { + count++; + // if (count > 50) break; + const j = JSON.parse(line); + console.log(Object.keys(j), j.word); + // add language to db + pdb.addLanguage(j.lang_code, j.lang); + // handleEtim(j); + // handleDerived(j); + // handleSenses(j.pos, j.senses); + // // + const isWord = j.word.trim().split(" ").length === 1; + if (isWord) await handleWord(j); + else await handleIdiom(j); + } catch (e) { + console.log("error parsing", e); + // break; + } + } +} + +type SorSyl = { + stressed: boolean; + long: boolean; + spelling: string; + ipa: string; + nucleus: string; + onset: string; + medial: string; + coda: string; + rhyme: string; + tone: string; +}; +async function handleWord(j: any) { + const wordId = pdb.addWord(j.word, j.lang_code); + let ts = Date.now(); + + const hwikiRhyme = j.sounds.find((s) => "rhymes" in s); + const wikiRhyme = hwikiRhyme ? hwikiRhyme.rhymes : null; + for (let snd of j.sounds || []) { + if ("ipa" in snd) { + const tags = JSON.stringify(snd.tags) || null; + const ipa = snd.ipa; + try { + const hres = await fetch("http://localhost:8104/syls", { + method: "POST", + headers: { "content-type": "application/json" }, + body: JSON.stringify({ string: j.word, lang: j.lang_code, ipa }), + }); + const hjon = await hres.json(); + console.log(Date.now() - ts, "elapsed in http"); + ts = Date.now(); + pdb.addPronunciation( + "word", + wordId, + hjon.clean_ipa, + hjon.syls.length, + tags, + null, + ); + const wordRhyme = hjon.syls.reduce((acc: string, item: SorSyl) => { + if (!item.stressed && !acc) return acc; + if (item.stressed && !acc) return `${acc}${item.rhyme}`; + else return `${acc}${item.ipa}`; + }, ""); + if (wordRhyme) + pdb.addWordRhyme(wordId, wordRhyme, j.lang_code, wikiRhyme); + else console.log("no rhyme?", hjon); + for (const syl of hjon.syls) { + // TODO ideally syllables would have spelling not IPA... harsh tho + pdb.addSyllable( + wordId, + syl.ipa, + j.lang_code, + syl.long, + syl.onset || null, + syl.medial || null, + syl.nucleus, + syl.coda || null, + syl.rhyme, + syl.tone || null, + null, + ); + } + console.log(Date.now() - ts, "elapsed in db"); + ts = Date.now(); + } catch (e) { + console.error(e); + console.error(j); + // break; + } + } + } +} +async function handleIdiom(j: any) { + console.log(j.word, "idiom"); + pdb.addIdiom(j.word, j.lang_code); + // TODO IPA of idioms...? +} +async function handleEtim(j: any) { + console.log(j.etymology_text, "etym"); + console.log(j.etymology_templates, "etym"); + + // { + // name: "inh", + // args: { + // "1": "en", + // "2": "ang", + // "3": "frēo", + // "4": "", + // "5": "free", + // }, + // expansion: "Old English frēo (“free”)", + // }, + + console.log(j.head_templates, "head"); + // { + // name: "en-verb", + // args: {}, + // expansion: "free (third-person singular simple present frees, present participle freeing, simple past and past participle freed)", + // } +} +async function handleDerived(j: any) { + const { forms, derived, related, antonyms, hyponyms, synonyms, descendants } = + j; + console.log("forms", forms); + // {form: string; tags: string[]} + console.log("derived", derived); + // {word: string} + console.log("related", related); + // {word: string, source?: string;} + console.log("ant", antonyms); + // {word: string, source?: string;} + console.log("hypo", hyponyms); + console.log("syno", synonyms); + // {word: string, source?: string;} + console.log("desc", descendants); +} +async function handleSenses(pos: string, senses: any[]) { + console.log("ex", senses[0].examples); + // {text: string; ref: string; type: "quote"} + console.log("info", senses[0].info_templates); + for (const s of senses) { + // s.glosses[] + // s.tags[] + } +} + +redump(); + +async function newtest() { + // const query = pdb.db.query( + // `INSERT INTO syllables(text, lang, long, onset, medial, nucleus, coda, rhyme, tone, notes) VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + // ); + // const res = query.run( + // "lol", + // "en", + // true, + // "l", + // "j", + // "o", + // "q", + // "joq", + // null, + // null, + // ); + // const sylId = res.lastInsertRowid; + const res1 = pdb.db + .query( + `INSERT INTO onsets(text, lang) VALUES(?, ?) + ON CONFLICT(text, lang) DO UPDATE SET + text = excluded.text + RETURNING rowid + `, + ) + .get("lll", "en"); + console.log({ res1 }); +} +// newtest(); +// TIL calling shell commands is terribly slow wtf +// Bun.$.env({ FOO: ipa }); +// const res = await Bun.$`${SORSYL_PATH} $FOO`; +// const syllables = JSON.parse(res.stdout.toString()); +// console.log(Date.now() - ts, "elapsed in py"); +// ts = Date.now(); diff --git a/src/lib/db/senseschema.sql b/src/lib/db/senseschema.sql new file mode 100644 index 0000000..f07a208 --- /dev/null +++ b/src/lib/db/senseschema.sql @@ -0,0 +1,54 @@ +-- Enable foreign key support +PRAGMA foreign_keys = ON; +PRAGMA journal_mode = WAL; +PRAGMA cache_size = -2000; +PRAGMA mmap_size = 30000000000; + +-- a semantic entity +CREATE TABLE IF NOT EXISTS senses( + id INTEGER PRIMARY KEY AUTOINCREMENT, + parent_id INTEGER NOT NULL, + spelling TEXT NOT NULL, + pos TEXT, + etymology TEXT, + senses JSONB, + forms JSONB, + related JSONB, + confidence INTEGER NOT NULL DEFAULT 0, + FOREIGN KEY (parent_id) REFERENCES expressions(id) +); +CREATE INDEX IF NOT EXISTS idx_words_pos ON senses(pos); +CREATE INDEX IF NOT EXISTS idx_senses_parent ON senses(parent_id); + +CREATE TABLE IF NOT EXISTS subsenses( + id INTEGER PRIMARY KEY AUTOINCREMENT, + sid INTEGER NOT NULL + gloss TEXT NOT NULL, + examples JSONB, + FOREIGN KEY (sid) REFERENCES senses(id) +); + +CREATE TABLE IF NOT EXISTS derivation( + id INTEGER PRIMARY KEY AUTOINCREMENT, + sid INTEGER NOT NULL + type TEXT NOT NULL, + text TEXT NOT NULL, + tags JSONB, + FOREIGN KEY (sid) REFERENCES senses(id) +); + +-- Categories table (for noun and verb categories) +CREATE TABLE IF NOT EXISTS categories ( + name TEXT PRIMARY KEY +); + +-- Word Categories junction table +CREATE TABLE IF NOT EXISTS word_categories ( + word_id INTEGER NOT NULL, + category INTEGER NOT NULL, + PRIMARY KEY (word_id, category), + FOREIGN KEY (word_id) REFERENCES expressions(id), + FOREIGN KEY (category) REFERENCES categories(name) +); +CREATE INDEX IF NOT EXISTS idx_word_categories_category_id ON word_categories(category); + -- cgit v1.2.3