import Database from "bun:sqlite";
import { readWiktionaryDump } from "../services/wiki";
import { getStressedSyllable, getSyllableCount } from "../utils";
import useful from "@/lib/useful_thai.json";
import db from ".";
import pdb from "./prosodydb";
import { findLemma } from "../calls/nlp";

/** Matches any non-word character or digit; used to reject spellings containing symbols. */
const SYMBOL_REGEX = new RegExp(/[\W\d]/);

/** Part-of-speech tags accepted by {@link goodPos} (these look like Penn Treebank tags — TODO confirm). */
const GOOD_POS_TAGS: ReadonlySet<string> = new Set([
  "CC",
  "DT",
  "EX",
  "IN",
  "LS",
  "MD",
  "PDT",
  "POS",
  "PRP",
  "PRP$",
  "RP",
  "TO",
  "WDT",
  "WP",
  "WP$",
]);

/**
 * Tells whether a part-of-speech tag is on the accepted list.
 *
 * @param pos - Tag to check (exact, case-sensitive match).
 * @returns `true` when `pos` is one of the accepted tags.
 */
function goodPos(pos: string): boolean {
  return GOOD_POS_TAGS.has(pos);
}

// function englishKaggle() {
//   handleFile("../datasets/words_pos.csv", (line, idx) => {
//     const [_, spelling, pos] = line.split(",");
//     if (!goodPos(pos)) return;
//     const rowid = addWord(db, spelling, "", "en-us", "word", null);
//     const category = poss[pos] || "unknown;";
//     addCat(db, rowid, category);
//   });
// }

// async function englishIPA() {
//   handleFile("ipa/en-us/ipadict.txt", (line, idx) => {
//     const [spelling, ipa] = line.split(/\s+/);
//     if (!spelling || !ipa) return;
//     const hasSymbols = spelling.match(SYMBOL_REGEX);
//     if (hasSymbols) return;
//     const split = spelling.split(" ");
//     const type = split.length > 1 ? "expression" : "word";
//     const subtype = null;
//     addWord(db, spelling, ipa, "en-us", type, subtype);
//   });
// }

/**
 * Loads English frequencies from `unigram_freq.csv`, storing the line index
 * passed by `handleFile` as the frequency value of each spelling.
 *
 * NOTE(review): `handleFile` is neither imported nor defined in this file as
 * reviewed — confirm it is available before re-enabling this loader.
 */
async function englishFreq() {
  handleFile(
    "/home/y/code/prosody/hanchu/datasets/unigram_freq.csv",
    (line: string, idx: number) => {
      const [spelling] = line.split(",");
      db.addFrequency(spelling, idx);
    },
  );
}

/**
 * Loads Thai frequencies from the six `Nyin_freq.csv` files; the fifth CSV
 * column is converted with `Number()` and stored for the spelling in the
 * first column.
 */
async function thaiFreq() {
  const files = [
    "/home/y/code/prosody/resources/langdata/thai/data/1yin_freq.csv",
    "/home/y/code/prosody/resources/langdata/thai/data/2yin_freq.csv",
    "/home/y/code/prosody/resources/langdata/thai/data/3yin_freq.csv",
    "/home/y/code/prosody/resources/langdata/thai/data/4yin_freq.csv",
    "/home/y/code/prosody/resources/langdata/thai/data/5yin_freq.csv",
    "/home/y/code/prosody/resources/langdata/thai/data/6yin_freq.csv",
  ];
  for (const file of files) {
    handleFile(file, (line: string, _idx: number) => {
      // Columns: spelling, IPA, tone, length, frequency, ...
      const [spelling, , , , frequency] = line.split(",");
      db.addFrequency(spelling, Number(frequency));
    });
  }
}

/** Tone letter (as written in `1yin_freq.csv`) to numeric tone id. */
const thaiTones: Record<string, number> = {
  M: 1,
  L: 2,
  F: 3,
  H: 4,
  R: 5,
};

/** IPA tone mark (as written in `1yinjie.csv`) to numeric tone id. */
const thaiTones2: Record<string, number> = {
  "˧": 1,
  "˨˩": 2,
  "˥˩": 3,
  "˦˥": 4,
  "˩˩˦": 5,
};

/**
 * Upserts single-syllable Thai entries (`type: "syllable"`, `confidence: 10`)
 * from two CSV files: one carrying explicit tone/length/frequency columns and
 * one carrying only spelling and IPA, from which length and tone are derived.
 */
async function thaiSyllables() {
  handleFile(
    "/home/y/code/prosody/prosody/langdata/thai/data/1yin_freq.csv",
    (line: string, _idx: number) => {
      const [spelling, IPA, toneS, length, frequency] = line.split(",");
      // Skip blank or truncated rows instead of upserting an empty entry.
      if (!spelling || !IPA) return;
      const isLong = length === "長";
      const tone = thaiTones[toneS];
      const prosody = JSON.stringify({ isLong, tone, lang: "th" });
      db.upsertWord({
        spelling,
        lang: "th",
        ipa: JSON.stringify([{ ipa: IPA, tags: ["sortug"] }]),
        prosody,
        syllables: 1,
        type: "syllable",
        frequency: Number(frequency),
        confidence: 10,
      });
    },
  );
  handleFile(
    "/home/y/code/prosody/prosody/langdata/thai/data/1yinjie.csv",
    (line: string, _idx: number) => {
      const [spelling, IPA] = line.split(",");
      // Without this guard `IPA.includes` throws on rows lacking a second column.
      if (!spelling || !IPA) return;
      const isLong = IPA.includes("ː");
      // 0 means "no known tone mark found"; when several marks match, the last one wins.
      let tone = 0;
      for (const [mark, toneId] of Object.entries(thaiTones2)) {
        if (IPA.includes(mark)) tone = toneId;
      }
      const prosody = JSON.stringify({ isLong, tone, lang: "th" });
      db.upsertWord({
        spelling,
        lang: "th",
        ipa: JSON.stringify([{ ipa: IPA, tags: ["sortug"] }]),
        prosody,
        syllables: 1,
        type: "syllable",
        confidence: 10,
      });
    },
  );
}

// // Save the last incomplete line to process in the next iteration
// }

// TODO no conjunctions or adpositions in Wordnet!!
// function englishWordnet() {
//   // LEFT JOIN lexes_pronunciations ukpr ON ukpr.wordid = words.wordid AND uspr.variety = 'GB'
//   // LEFT JOIN pronunciations ukp ON ukp.pronunciationid = ukpr.pronunciationid
//   const queryString = `
//   WITH ranked_ipa AS (
//     SELECT
//     lp.wordid,
//     pr.pronunciation,
//     lp.variety,
//     ROW_NUMBER() OVER (
//       PARTITION BY lp.wordid
//       ORDER BY
//         CASE
//           WHEN lp.variety = 'US' THEN 1
//           WHEN lp.variety IS NULL THEN 2
//           WHEN lp.variety IS 'GB' THEN 3
//           ELSE 4
//         END
//     ) AS rank
//     FROM lexes_pronunciations lp
//     JOIN pronunciations pr ON pr.pronunciationid = lp.pronunciationid
//   )
//   SELECT words.wordid, word, rp.pronunciation as ipa, domainname
//   FROM words
//   LEFT JOIN ranked_ipa rp ON rp.wordid = words.wordid AND rp.rank = 1
//   LEFT JOIN senses ON senses.wordid = words.wordid
//   LEFT JOIN synsets ON synsets.synsetid = senses.synsetid
//   LEFT JOIN domains ON domains.domainid = synsets.domainid
//   GROUP BY words.wordid
//   `;
//   const query = wndb.query(queryString);
//   const res: Array<{
//     word: string;
//     ipa: string;
//     domainname: string;
//   }> = query.all() as any;
//   console.log("res", res.length);
//   for (const r of res) {
//     console.log(r, "r");
//     // if (r.word === 'abrasive') throw new Error('stop right here');
//     const ok = filterWord(r.word);
//     if (!ok) continue;
//     const split = r.word.split(" ");
//     const type = split.length > 1 ? "expression" : "word";
//     const subtype = null;
//     const wordid = addWord(db, r.word, r.ipa, "en-us", type, subtype);
//     const category = domains[r.domainname] || "unknown;";
//     addCat(db, wordid, category);
//   }
// }

/**
 * Accepts a spelling only when it contains no character matched by
 * {@link SYMBOL_REGEX} (i.e. no non-word character and no digit).
 *
 * @param s - Spelling to check.
 * @returns `true` when the spelling is symbol-free.
 */
function filterWord(s: string) {
  return !SYMBOL_REGEX.test(s);
}

// function checkWordNet(word: string) {
//   const query = wndb.query(`SELECT * FROM words WHERE word = $word`);
//   const res = query.get({ $word: word });
//   return !!res;
// }

// function englishCards() {
//   const lesson_id = addLesson(db, "First Lesson, some easy stuff");
//   const texts = [
//     "I",
//     "friend",
//     "my friend",
//     "you",
//     "your friend",
//     "my friends' friend",
//     "you are my friend",
//     "I am your friend",
//     "your friend is my friend",
//     "my friend is your friend",
//     "he is my friend",
//     "this is mine",
//     "this is yours",
//     "this is my friends'",
//     "no",
//     "you are not my friend",
//     "this is not yours",
//     "your friend is not my friend",
//     "that is mine",
//     "this is mine, that is yours",
//     "he is not your friend",
//     "no, I am not",
//     "that is not me",
//     "that is not mine, that is my friends'",
//   ];
//   for (const text of texts) {
//     addCard(db, lesson_id, text);
//   }
// }

// englishWordnet();
// englishFreq();
// englishCards();
// englishKaggle();

/**
 * Streams the Wiktionary dump line by line. Every entry registers its language
 * and part of speech; English (`en`) and Thai (`th`) entries are additionally
 * stored as a word/expression plus one sense row.
 *
 * Failures on a single line are logged and the loop continues.
 */
async function fillFromDump() {
  await db.init();
  // const log = Bun.file("./stuff.log");
  // const logWriter = log.writer();
  let count = 0;
  // Only populated by the commented-out field survey below; logged at the end.
  const fields = new Set<string>();
  // let biggest = 0;
  for await (const line of readWiktionaryDump()) {
    try {
      count++;
      console.log({ count });
      // if (count > 80) break;
      // if (line.length > biggest) {
      //   biggest = line.length;
      //   Bun.write("./biggest.log", line, { createPath: true });
      // }
      const j = JSON.parse(line);
      db.addLanguage(j.lang_code, j.lang);
      db.addCat(j.pos);
      // for (let key of Object.keys(j)) {
      //   if (!fields.has(key)) {
      //     fields.add(key);
      //     logWriter.write(`${line}\n`);
      //   }
      // }
      if (j.lang_code === "en" || j.lang_code === "th") {
        console.log("saving", j.word);
        // console.log(j.sounds);
        const related = {
          derived: j.derived,
          antonyms: j.antonyms,
          synonyms: j.synonyms,
          related: j.related,
        };
        let rhyme = "";
        // First IPA transcription seen; drives syllable count and stress detection.
        let ipaExample = "";
        const ipa: any[] = [];
        for (const snd of j.sounds || []) {
          if ("ipa" in snd) {
            ipa.push(snd);
            if (!ipaExample) ipaExample = snd.ipa;
          }
          // The last sound entry carrying `rhymes` wins.
          if ("rhymes" in snd) rhyme = snd.rhymes;
        }
        const isWord = j.word.trim().split(" ").length === 1;
        const type: any = isWord ? "word" : "expression";
        const syllables = ipaExample ? getSyllableCount(ipaExample) : 0;
        console.log({ ipaExample, syllables });
        const prosody: any = {};
        if (ipaExample) {
          const stressedSyllable = getStressedSyllable(ipaExample);
          if ("ok" in stressedSyllable)
            prosody.stressedSyllable = stressedSyllable.ok;
        }
        if (rhyme) prosody.rhyme = rhyme;
        try {
          const row = db.addWord({
            spelling: j.word,
            lang: j.lang_code,
            ipa: JSON.stringify(ipa),
            prosody: JSON.stringify(prosody),
            syllables,
            type,
          });
          let parent_id: number | bigint;
          if (row.changes === 1) parent_id = row.lastInsertRowid;
          else {
            // Nothing was inserted, so look up the existing expression's id instead.
            const data: any = db.fetchExpressionBySpelling(
              j.word,
              j.lang_code,
            );
            parent_id = data.id;
          }
          db.addSense({
            id: count - 1,
            parent_id,
            spelling: j.word,
            etymology: j.etymology_text || "",
            pos: j.pos,
            ipa: JSON.stringify(ipa),
            prosody: JSON.stringify(prosody),
            senses: JSON.stringify(j.senses),
            forms: JSON.stringify(j.forms || []),
            related: JSON.stringify(related),
          });
        } catch (e) {
          console.log("error inserting", e);
        }
      }
      // langset.add(j.lang_code);
      // if (j.lang === "Translingual") continue;
      // if (j.lang_code === "en") en++;
      // if (j.lang_code === "th") thai++;
      // if (j.lang_code === "zh") zh++;
      // if (j.word === "cat") {
      //   console.log(j.word);
      //   console.log(Object.keys(j));
      //   console.log(j);
      //   console.log("senses", j.senses);
      //   console.log("forms", j.forms);
      //   // console.log("ett", j.etymology_templates);
      //   // console.log("derived", j.derived);
      //   // const meaning: Meaning = {etymology: j.etymology_text}
      //   // const wd = { lang: j.lang_code, spelling: j.word, ipa, {} };
      //   break;
      // }
    } catch (e) {
      console.log("error parsing", e);
    }
  }
  console.log("fields", fields);
}

/**
 * Adds one "Syllable" card to lesson 5 for every Thai expression fetched with
 * `confidence: "10"` and `syllables: "1"`.
 */
function addDecks() {
  // const lesson_id = db.addLesson({
  //   name: "Thai Syllables",
  //   description: "All the syllables in the Thai language ordered by frequency",
  //   lang: "th",
  // });
  const syllables: any[] = db.fetchExpressionRaw({
    confidence: "10",
    syllables: "1",
    lang: "th",
  });
  for (const expression of syllables) {
    db.addCard({
      lesson_id: 5,
      eid: expression.id,
      text: "Syllable",
      mnote: "from Sortug Development",
    });
  }
}

/**
 * Replaces each expression's raw frequency with its dense rank among the
 * distinct frequencies of that language (1 = highest raw value). Rows with a
 * falsy frequency are left untouched.
 *
 * @param lang - Language code whose expressions are re-ranked.
 * @throws Error if a row's frequency is missing from the rank table.
 */
function adjustFrequency(lang: string) {
  const all: any[] = db.fetchExpressionRaw({ lang });
  const frequencies = new Set<number>();
  for (const row of all) {
    if (row.frequency) frequencies.add(row.frequency);
  }
  const freqArray = Array.from(frequencies).sort((a, b) => b - a);
  console.log(freqArray);
  // Precompute value -> rank once rather than scanning the array for every row.
  const rankOf = new Map<number, number>();
  freqArray.forEach((value, position) => rankOf.set(value, position + 1));
  for (const row of all) {
    if (!row.frequency) continue;
    const rank = rankOf.get(row.frequency);
    if (rank === undefined) throw new Error("wtf" + row.frequency);
    db.updateWord(row.id, { frequency: rank });
  }
}

// -- INSERT INTO lessons(name, description) values('8000 Super Useful Expressions', 'David Martins Facebook list of coloquial Thai expressions');
// -- INSERT INTO lang_lessons(lesson_id, lang) VALUES(1, 'th');
// -- INSERT INTO lessons(name, description) values('Thai Syllables', 'All syllables in Thai phonology');
// -- INSERT INTO lang_lessons(lesson_id, lang) VALUES(2, 'th');

/**
 * Imports the expressions from `useful_thai.json`. Each top-level key becomes a
 * category; each expression is stored with its category links, one sense and
 * one card in lesson 1, all inside a single transaction per expression.
 *
 * NOTE(review): the value returned by `db.addWord` is used directly as an id
 * here, while `fillFromDump` reads `.changes` / `.lastInsertRowid` from it —
 * confirm which shape `addWord` actually returns.
 */
function addThaiUseful() {
  // Running counter across all levels; used in the card text.
  let idx = 0;
  for (const level in useful) {
    db.addCat(level);
    const exps = (useful as any)[level];
    console.log(level, exps.length);
    for (const exp of exps) {
      // Transcriptions are "/"-delimited; drop the empty fragments around the slashes.
      const split = exp.ipa.split("/").filter((s: string) => s.trim());
      const ipa = split.map((ip: any) => ({ ipa: ip, tags: [] }));
      try {
        idx++;
        const tx = db.db.transaction(() => {
          const wid = db.addWord({
            spelling: exp.spelling,
            lang: "th",
            type: "expression",
            ipa: JSON.stringify(ipa),
          });
          console.log({ wid });
          db.addWCat(wid, level);
          if (exp.register) {
            db.addCat(exp.register);
            db.addWCat(wid, exp.register);
          }
          const glosses = [exp.english];
          if (exp.note) glosses.push(exp.note);
          db.addSense({
            parent_id: wid,
            spelling: exp.spelling,
            senses: JSON.stringify([{ glosses }]),
          });
          db.addCard({
            text: `Super Useful ${idx}`,
            eid: wid as any,
            lesson_id: 1,
          });
        });
        tx();
      } catch (e) {
        console.log({ exp });
        console.error(`${e}`);
        // break;
      }
    }
  }
}

/** Adds one "Syllable" card to lesson 2 for every Thai expression of type `syllable`. */
function addThaiSyllablesLesson() {
  const res = db.db
    .query(
      "SELECT id FROM expressions e WHERE e.type = 'syllable' and e.lang = 'th'",
    )
    .all() as any[];
  for (const row of res) {
    db.addCard({ text: "Syllable", eid: row.id, lesson_id: 2 });
  }
}

// function fixIpa() {
//   const res = db.db.query(`SELECT id, ipa FROM expressions`).all() as any[];
//   for (const row of res) {
//     try {
//       const jon = JSON.parse(row.ipa);
//     } catch (_) {
//       const clean: string = row.ipa.replace("...", "").trim();
//       db.db.query(`UPDATE expressions SET ipa = ? WHERE `).run(JSON.stringify(ipa));
//     }
//   }
// }

/** Debug helper: prints up to the first ten `(ipa, syllables)` rows of `expressions`. */
function fixSyllables() {
  const res = db.db.query(`SELECT ipa, syllables FROM expressions;`).all();
  // slice() avoids logging `undefined` rows when the table holds fewer than ten.
  for (const row of res.slice(0, 10)) {
    // for (const row of res) {
    console.log({ row });
  }
}

// fixSyllables();
// addThaiUseful();
// addThaiSyllablesLesson();
// adjustFrequency("th");
// addDecks();
// fillFromDump();
// thaiSyllables();
// thaiFreq();
//
// const SORSYL_PATH = "/nix/store/lkyi9rrjbr619w3ivpkm89ccf93bvxx5-sorsyl-0.1.0/bin/sorsyl";

/**
 * Opens the per-language Wiktionary SQLite file and prints a preview of the
 * parsed JSON stored in `langs.data`, stopping once `count` exceeds 10. The
 * full re-import loop is currently commented out.
 *
 * @param lang - Language code selecting `<lang>.db`.
 */
async function redump(lang: string) {
  let count = 0;
  const langdb = new Database(
    `/home/y/code/prosody/resources/wiktionary/${lang}.db`,
  );
  try {
    const langrows: any = langdb.query("SELECT data FROM langs");
    for (const langrow of langrows) {
      // The counter was never advanced before, which left the break below unreachable.
      count++;
      const j = JSON.parse(langrow.data);
      console.log({ j });
      if (count > 10) break;
    }
  } finally {
    // Release the file handle even when a row fails to parse.
    langdb.close();
  }
  // await pdb.init();
  // // const soundTypes = new Set();
  // // [
  // // "tags", "ipa", "audio", "ogg_url", "mp3_url", "enpr", "rhymes", "homophone", "note", "zh-pron", "other",
  // // "text", "hangeul", "topics", "form", "audio-ipa"
  // // ]
  // const langs = ["en", "th", "zh", "es", "ja", "vn"];
  // for await (const line of readWiktionaryDump()) {
  //   try {
  //     count++;
  //     console.log({ count });
  //     // if (count > 50) break;
  //     const j = JSON.parse(line);
  //     // console.log(Object.keys(j), j.word);
  //     // add language to db
  //     pdb.addLanguage(j.lang_code, j.lang);
  //     if (!langs.includes(j.lang_code)) continue;
  //     // handleEtim(j);
  //     // handleDerived(j);
  //     // handleSenses(j.pos, j.senses);
  //     //
  //     //
  //     const isWord = j.word.trim().split(" ").length === 1;
  //     if (isWord) await handleWord(j);
  //     else await handleIdiom(j);
  //   } catch (e) {
  //     // console.log("error parsing", e);
  //     // break;
  //   }
  // }
}

/**
 * Stores a dump entry as a word in the prosody DB when `findLemma` returns
 * exactly one segment whose root lemma equals the analysed input, then stores
 * every IPA pronunciation listed in the entry's `sounds`.
 *
 * @param j - Parsed Wiktionary dump entry.
 */
async function handleWord(j: any) {
  const analyzed = await findLemma(j.word, j.lang_code);
  // console.log(analyzed.segments.length);
  if (analyzed.segments.length !== 1) {
    console.error("wtf bruh", analyzed);
    return;
  }
  const seg = analyzed.segments[0];
  if (!seg) {
    console.log("no seg", analyzed);
    return;
  }
  const isLemma = analyzed.input === seg.root.lemma;
  if (!isLemma) {
    // return console.error("not lemma", {
    //   ...seg,
    //   word: j.word,
    //   input: analyzed.input,
    // });
    return;
  }
  const wordId = pdb.addWord(j.word, j.lang_code);
  const sounds = j.sounds || [];
  const hwikiRhyme = sounds.find((s: any) => "rhymes" in s);
  const wikiRhyme = hwikiRhyme ? hwikiRhyme.rhymes : null;
  for (const snd of sounds) {
    // Awaited so this function only resolves after every pronunciation is handled.
    if ("ipa" in snd) await handleIpa(wordId, j, snd, wikiRhyme);
  }
}

/**
 * Sends one IPA transcription to the local syllabifier service, then stores
 * the pronunciation, the derived word rhyme and each syllable in the prosody DB.
 * Failures are logged and do not propagate.
 *
 * @param wordId - Id of the word row the pronunciation belongs to.
 * @param j - Parsed Wiktionary dump entry (`word` and `lang_code` are read).
 * @param snd - Sound entry carrying `ipa` and optionally `tags`.
 * @param wikiRhyme - Rhyme given by Wiktionary for the entry, if any.
 */
async function handleIpa(
  wordId: number | bigint,
  j: any,
  snd: any,
  wikiRhyme: string | null,
) {
  const tags = JSON.stringify(snd.tags) || null;
  const ipa = snd.ipa;
  try {
    const hres = await fetch("http://localhost:8104/syls", {
      method: "POST",
      headers: { "content-type": "application/json" },
      body: JSON.stringify({ string: j.word, lang: j.lang_code, ipa }),
    });
    if (!hres.ok) throw new Error(`syllabifier responded ${hres.status}`);
    const hjon = await hres.json();
    // console.log(Date.now() - ts, "elapsed in http");
    // ts = Date.now();
    pdb.addPronunciation(
      "word",
      wordId,
      hjon.clean_ipa,
      hjon.syls.length,
      tags,
      null,
    );
    // Syllables before the first stressed one are skipped; that one contributes
    // its rhyme, and every syllable after it contributes its full IPA.
    const wordRhyme = hjon.syls.reduce((acc: string, item: SorSyl) => {
      if (!item.stressed && !acc) return acc;
      if (item.stressed && !acc) return `${acc}${item.rhyme}`;
      else return `${acc}${item.ipa}`;
    }, "");
    if (wordRhyme) pdb.addWordRhyme(wordId, wordRhyme, j.lang_code, wikiRhyme);
    // else console.log("no rhyme?", hjon);
    // `idx` used to be an undeclared identifier here, so this loop always threw.
    // NOTE(review): a 0-based position is passed — confirm what addSyllable expects.
    for (const [idx, syl] of hjon.syls.entries()) {
      // TODO ideally syllables would have spelling not IPA... harsh tho
      pdb.addSyllable(
        wordId,
        idx,
        j.lang_code,
        syl.ipa,
        syl.long,
        "",
        syl.onset || null,
        syl.medial || null,
        syl.nucleus,
        syl.coda || null,
        syl.rhyme,
        syl.tone || null,
        null,
      );
    }
    // console.log(Date.now() - ts, "elapsed in db");
    // ts = Date.now();
  } catch (e) {
    // Best-effort per pronunciation, but report the failure instead of hiding it.
    console.error("handleIpa failed", { word: j.word, ipa }, e);
    // console.error({ snd });
    // break;
  }
}

/**
 * Stores a multi-word dump entry as an idiom in the prosody DB.
 *
 * @param j - Parsed Wiktionary dump entry.
 */
async function handleIdiom(j: any) {
  console.log(j.word, "idiom");
  pdb.addIdiom(j.word, j.lang_code);
  // TODO IPA of idioms...?
}

/**
 * Debug helper: prints the etymology text, etymology templates and head
 * templates of a dump entry.
 *
 * @param j - Parsed Wiktionary dump entry.
 */
async function handleEtim(j: any) {
  console.log(j.etymology_text, "etym");
  console.log(j.etymology_templates, "etym");
  // {
  //   name: "inh",
  //   args: {
  //     "1": "en",
  //     "2": "ang",
  //     "3": "frēo",
  //     "4": "",
  //     "5": "free",
  //   },
  //   expansion: "Old English frēo (“free”)",
  // },
  console.log(j.head_templates, "head");
  // {
  //   name: "en-verb",
  //   args: {},
  //   expansion: "free (third-person singular simple present frees, present participle freeing, simple past and past participle freed)",
  // }
}

/**
 * Debug helper: prints the relation lists (forms, derived, related, antonyms,
 * hyponyms, synonyms, descendants) of a dump entry.
 *
 * @param j - Parsed Wiktionary dump entry.
 */
async function handleDerived(j: any) {
  const { forms, derived, related, antonyms, hyponyms, synonyms, descendants } =
    j;
  console.log("forms", forms); // {form: string; tags: string[]}
  console.log("derived", derived); // {word: string}
  console.log("related", related); // {word: string, source?: string;}
  console.log("ant", antonyms); // {word: string, source?: string;}
  console.log("hypo", hyponyms);
  console.log("syno", synonyms); // {word: string, source?: string;}
  console.log("desc", descendants);
}

/**
 * Debug helper: prints the examples and info templates of the first sense.
 *
 * @param pos - Part of speech of the entry (currently unused).
 * @param senses - Sense objects of the entry; may be empty.
 */
async function handleSenses(pos: string, senses: any[]) {
  // Optional chaining keeps an entry without senses from throwing.
  console.log("ex", senses[0]?.examples); // {text: string; ref: string; type: "quote"}
  console.log("info", senses[0]?.info_templates);
  for (const s of senses) {
    // s.glosses[]
    // s.tags[]
  }
}

// Script entry point; `void` marks the promise as deliberately not awaited.
void redump("th");

/**
 * Scratch test: upserts the onset `("lll", "en")` and prints the row returned
 * by the `RETURNING rowid` clause.
 */
async function newtest() {
  // const query = pdb.db.query(
  //   `INSERT INTO syllables(text, lang, long, onset, medial, nucleus, coda, rhyme, tone, notes) VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
  // );
  // const res = query.run(
  //   "lol",
  //   "en",
  //   true,
  //   "l",
  //   "j",
  //   "o",
  //   "q",
  //   "joq",
  //   null,
  //   null,
  // );
  // const sylId = res.lastInsertRowid;
  const res1 = pdb.db
    .query(
      `INSERT INTO onsets(text, lang) VALUES(?, ?)
    ON CONFLICT(text, lang) DO UPDATE SET
    text = excluded.text
    RETURNING rowid
    `,
    )
    .get("lll", "en");
  console.log({ res1 });
}

// newtest();

// TIL calling shell commands is terribly slow wtf
// Bun.$.env({ FOO: ipa });
// const res = await Bun.$`${SORSYL_PATH} $FOO`;
// const syllables = JSON.parse(res.stdout.toString());
// console.log(Date.now() - ts, "elapsed in py");
// ts = Date.now();