// Reconstructed from commit a3f24ea79b14394b24c4b60a010651eb29eeb872
// ("glorious new db", polwex, 2025-05-29), which touches src/lib/db/seed.ts.
// The diff had its newlines collapsed; this is the post-patch state of the
// parts of seed.ts that the diff shows. Regions of seed.ts that the diff does
// not show are marked below and are NOT reproduced here.

import { readWiktionaryDump } from "../services/wiki";
import { getStressedSyllable, getSyllableCount } from "../utils";
import useful from "@/lib/useful_thai.json";
import db from ".";
import pdb from "./prosodydb";
import * as Sorsyl from "sorsyl";

const SYMBOL_REGEX = new RegExp(/[\W\d]/);

// ... (seed.ts content between SYMBOL_REGEX and fixSyllables, including
// addThaiSyllablesLesson, is not visible in the diff) ...

/**
 * Debug helper: logs up to the first 10 `(ipa, syllables)` rows of the
 * `expressions` table.
 */
function fixSyllables() {
  const res = db.db.query(`SELECT ipa, syllables FROM expressions;`).all();
  // Only peek at the first few rows; slice() avoids logging `undefined`
  // when the table holds fewer than 10 rows.
  // for (const row of res) {
  for (const row of res.slice(0, 10)) {
    console.log({ row });
  }
}
// fixSyllables();
// addThaiUseful();
// addThaiSyllablesLesson();
// adjustFrequency("th");

// ... (seed.ts lines between the two hunks are not visible in the diff) ...

// fillFromDump();
// thaiSyllables();
// thaiFreq();
//
//
// Only referenced by the commented-out shell experiment at the bottom of
// this file.
const SORSYL_PATH =
  "/nix/store/lkyi9rrjbr619w3ivpkm89ccf93bvxx5-sorsyl-0.1.0/bin/sorsyl";

/** Local HTTP endpoint that handleWord posts `{ string, lang, ipa }` to. */
const SYLS_ENDPOINT = "http://localhost:8104/syls";

/**
 * Streams the Wiktionary dump line by line and loads every entry into the
 * prosody database: the entry's language is registered, then the entry is
 * routed to handleWord (no spaces in `word`) or handleIdiom (otherwise).
 *
 * A failure on one line is logged and the loop moves on to the next line.
 */
async function redump() {
  await pdb.init();
  // Only used by the commented-out early exit below.
  let count = 0;

  // const soundTypes = new Set();
  // [
  //   "tags", "ipa", "audio", "ogg_url", "mp3_url", "enpr", "rhymes", "homophone", "note", "zh-pron", "other",
  //   "text", "hangeul", "topics", "form", "audio-ipa"
  // ]
  for await (const line of readWiktionaryDump()) {
    try {
      count++;
      // if (count > 50) break;
      const j = JSON.parse(line);
      console.log(Object.keys(j), j.word);
      // add language to db
      pdb.addLanguage(j.lang_code, j.lang);
      // handleEtim(j);
      // handleDerived(j);
      // handleSenses(j.pos, j.senses);
      // //
      const isWord = j.word.trim().split(" ").length === 1;
      if (isWord) await handleWord(j);
      else await handleIdiom(j);
    } catch (e) {
      console.log("error parsing", e);
      // break;
    }
  }
}

type SorSyl = {
  stressed: boolean;
  long: boolean;
  spelling: string;
  ipa: string;
  nucleus: string;
  onset: string;
  medial: string;
  coda: string;
  rhyme: string;
  tone: string;
};

/** The fields of a dump entry's `sounds` items that this file reads. */
type WikiSound = {
  ipa?: string;
  tags?: string[];
  rhymes?: string;
};

/**
 * The fields of the syllabifier response that this file reads.
 * NOTE(review): shape inferred from usage only; confirm against the service.
 */
type SylsResponse = {
  clean_ipa: string;
  syls: SorSyl[];
};

/**
 * Builds a word-level rhyme string: syllables before the first stressed one
 * are skipped, the first stressed syllable contributes its `rhyme`, and every
 * syllable after that contributes its full `ipa`. Returns "" when no stressed
 * syllable produces a non-empty rhyme.
 */
function buildWordRhyme(syls: SorSyl[]): string {
  return syls.reduce((acc: string, item: SorSyl) => {
    if (acc) return `${acc}${item.ipa}`;
    return item.stressed ? `${acc}${item.rhyme}` : acc;
  }, "");
}

/**
 * Stores a single-word dump entry. The word is added first; then, for every
 * `sounds` item that carries an `ipa` key, the IPA is sent to the local
 * syllabifier and the resulting pronunciation, word rhyme and syllables are
 * written to the prosody database.
 *
 * Failures for one pronunciation (network, HTTP status, unexpected payload)
 * are logged together with the entry and do not stop the remaining ones.
 *
 * @param j Parsed dump entry; `word` and `lang_code` are read, `sounds` is
 *          optional.
 */
async function handleWord(j: any) {
  const wordId = pdb.addWord(j.word, j.lang_code);
  let ts = Date.now();

  // Entries without pronunciation data have no `sounds` key at all, so
  // normalise once instead of dereferencing `j.sounds` directly.
  const sounds: WikiSound[] = Array.isArray(j.sounds) ? j.sounds : [];
  const wikiRhyme = sounds.find((s) => "rhymes" in s)?.rhymes ?? null;

  for (const snd of sounds) {
    if (!("ipa" in snd)) continue;
    // JSON.stringify(undefined) yields undefined; store null in that case.
    const tags = JSON.stringify(snd.tags) || null;
    const ipa = snd.ipa;
    try {
      const hres = await fetch(SYLS_ENDPOINT, {
        method: "POST",
        headers: { "content-type": "application/json" },
        body: JSON.stringify({ string: j.word, lang: j.lang_code, ipa }),
      });
      // Surface service failures as such rather than as a TypeError on a
      // missing `syls` field further down.
      if (!hres.ok) {
        throw new Error(`syllabifier returned HTTP ${hres.status}`);
      }
      const hjon = (await hres.json()) as SylsResponse;
      console.log(Date.now() - ts, "elapsed in http");
      ts = Date.now();
      pdb.addPronunciation(
        "word",
        wordId,
        hjon.clean_ipa,
        hjon.syls.length,
        tags,
        null,
      );
      const wordRhyme = buildWordRhyme(hjon.syls);
      if (wordRhyme)
        pdb.addWordRhyme(wordId, wordRhyme, j.lang_code, wikiRhyme);
      else console.log("no rhyme?", hjon);
      for (const syl of hjon.syls) {
        // TODO ideally syllables would have spelling not IPA... harsh tho
        // Empty-string parts are stored as null.
        pdb.addSyllable(
          wordId,
          syl.ipa,
          j.lang_code,
          syl.long,
          syl.onset || null,
          syl.medial || null,
          syl.nucleus,
          syl.coda || null,
          syl.rhyme,
          syl.tone || null,
          null,
        );
      }
      console.log(Date.now() - ts, "elapsed in db");
      ts = Date.now();
    } catch (e) {
      console.error(e);
      console.error(j);
      // break;
    }
  }
}

/** Stores a multi-word dump entry as an idiom; no pronunciation is recorded. */
async function handleIdiom(j: any) {
  console.log(j.word, "idiom");
  pdb.addIdiom(j.word, j.lang_code);
  // TODO IPA of idioms...?
}

/** Exploration helper: logs the etymology and head-template fields of an entry. */
async function handleEtim(j: any) {
  console.log(j.etymology_text, "etym");
  console.log(j.etymology_templates, "etym");

  // {
  //   name: "inh",
  //   args: {
  //     "1": "en",
  //     "2": "ang",
  //     "3": "frēo",
  //     "4": "",
  //     "5": "free",
  //   },
  //   expansion: "Old English frēo (“free”)",
  // },

  console.log(j.head_templates, "head");
  // {
  //   name: "en-verb",
  //   args: {},
  //   expansion: "free (third-person singular simple present frees, present participle freeing, simple past and past participle freed)",
  // }
}

/** Exploration helper: logs the relation lists (forms, derived, ...) of an entry. */
async function handleDerived(j: any) {
  const { forms, derived, related, antonyms, hyponyms, synonyms, descendants } =
    j;
  console.log("forms", forms);
  // {form: string; tags: string[]}
  console.log("derived", derived);
  // {word: string}
  console.log("related", related);
  // {word: string, source?: string;}
  console.log("ant", antonyms);
  // {word: string, source?: string;}
  console.log("hypo", hyponyms);
  console.log("syno", synonyms);
  // {word: string, source?: string;}
  console.log("desc", descendants);
}

/**
 * Exploration helper: logs the examples and info templates of the first
 * sense, or `undefined` for both when `senses` is empty.
 */
async function handleSenses(pos: string, senses: any[]) {
  console.log("ex", senses[0]?.examples);
  // {text: string; ref: string; type: "quote"}
  console.log("info", senses[0]?.info_templates);
  // Per-sense fields still to be handled:
  // s.glosses[]
  // s.tags[]
}

// Runs the import when this module is loaded. Failures outside the per-line
// try/catch (pdb.init, the dump reader) are reported instead of being left as
// an unhandled rejection.
redump().catch((e: unknown) => {
  console.error("redump failed", e);
  process.exitCode = 1;
});

/**
 * Scratch test of an upsert into `onsets` that returns the row id whether the
 * row was inserted or already present.
 */
async function newtest() {
  // const query = pdb.db.query(
  //   `INSERT INTO syllables(text, lang, long, onset, medial, nucleus, coda, rhyme, tone, notes) VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
  // );
  // const res = query.run(
  //   "lol",
  //   "en",
  //   true,
  //   "l",
  //   "j",
  //   "o",
  //   "q",
  //   "joq",
  //   null,
  //   null,
  // );
  // const sylId = res.lastInsertRowid;
  const res1 = pdb.db
    .query(
      `INSERT INTO onsets(text, lang) VALUES(?, ?)
    ON CONFLICT(text, lang) DO UPDATE SET
    text = excluded.text
    RETURNING rowid
    `,
    )
    .get("lll", "en");
  console.log({ res1 });
}
// newtest();
// TIL calling shell commands is terribly slow wtf
// Bun.$.env({ FOO: ipa });
// const res = await Bun.$`${SORSYL_PATH} $FOO`;
// const syllables = JSON.parse(res.stdout.toString());
// console.log(Date.now() - ts, "elapsed in py");
// ts = Date.now();