summaryrefslogtreecommitdiff
path: root/src/lib/db/seed.ts
diff options
context:
space:
mode:
authorpolwex <polwex@sortug.com>2025-05-29 12:10:22 +0700
committerpolwex <polwex@sortug.com>2025-05-29 12:10:22 +0700
commita3f24ea79b14394b24c4b60a010651eb29eeb872 (patch)
treecb1c4937084116f66a59727ee752afd974714c8e /src/lib/db/seed.ts
parent7abf2227438362ad30820ee236405ec1b57a40b6 (diff)
glorious new db
Diffstat (limited to 'src/lib/db/seed.ts')
-rw-r--r--src/lib/db/seed.ts212
1 files changed, 211 insertions, 1 deletions
diff --git a/src/lib/db/seed.ts b/src/lib/db/seed.ts
index c4094de..7f4352f 100644
--- a/src/lib/db/seed.ts
+++ b/src/lib/db/seed.ts
@@ -2,6 +2,8 @@ import { readWiktionaryDump } from "../services/wiki";
import { getStressedSyllable, getSyllableCount } from "../utils";
import useful from "@/lib/useful_thai.json";
import db from ".";
+import pdb from "./prosodydb";
+import * as Sorsyl from "sorsyl";
const SYMBOL_REGEX = new RegExp(/[\W\d]/);
@@ -483,7 +485,16 @@ function addThaiSyllablesLesson() {
// }
// }
// }
-addThaiUseful();
+/**
+ * Debug helper: reads the `ipa` and `syllables` columns of every row in
+ * `expressions` and logs a small sample of them. Despite its name it does
+ * not modify anything yet.
+ *
+ * Only the first `SAMPLE_SIZE` rows are printed. The sample is taken with
+ * `slice`, so a table holding fewer rows is never indexed past its end.
+ */
+function fixSyllables() {
+  const SAMPLE_SIZE = 10;
+  const res = db.db.query(`SELECT ipa, syllables FROM expressions;`).all();
+  // for (const row of res) {
+  for (const row of res.slice(0, SAMPLE_SIZE)) {
+    console.log({ row });
+  }
+}
+// fixSyllables();
+// addThaiUseful();
// addThaiSyllablesLesson();
// adjustFrequency("th");
@@ -492,3 +503,202 @@ addThaiUseful();
// fillFromDump();
// thaiSyllables();
// thaiFreq();
+//
+//
+/**
+ * Location of the `sorsyl` executable.
+ *
+ * The default is a Nix store path, which only exists on a machine where that
+ * exact store entry is present, so a non-empty `SORSYL_PATH` environment
+ * variable takes precedence over it.
+ */
+const SORSYL_PATH =
+  process.env.SORSYL_PATH ||
+  "/nix/store/lkyi9rrjbr619w3ivpkm89ccf93bvxx5-sorsyl-0.1.0/bin/sorsyl";
+/**
+ * Streams the Wiktionary dump and loads every entry into the prosody DB.
+ *
+ * Each line yielded by `readWiktionaryDump()` is parsed as JSON and its
+ * language is registered with `pdb.addLanguage`. The entry is then routed to
+ * `handleWord` when its trimmed `word` contains no space, and to
+ * `handleIdiom` otherwise.
+ *
+ * A failure while processing one line is logged together with the line
+ * number and the loop carries on with the next line.
+ */
+async function redump() {
+  await pdb.init();
+  let count = 0;
+
+  // const soundTypes = new Set<string>();
+  // [
+  //   "tags", "ipa", "audio", "ogg_url", "mp3_url", "enpr", "rhymes", "homophone", "note", "zh-pron", "other",
+  //   "text", "hangeul", "topics", "form", "audio-ipa"
+  // ]
+  for await (const line of readWiktionaryDump()) {
+    // 1-based position of the line in the dump; only used to locate failures.
+    count++;
+    try {
+      // if (count > 50) break;
+      const j = JSON.parse(line);
+      console.log(Object.keys(j), j.word);
+      // add language to db
+      pdb.addLanguage(j.lang_code, j.lang);
+      // handleEtim(j);
+      // handleDerived(j);
+      // handleSenses(j.pos, j.senses);
+      // //
+      // Fail with a descriptive error instead of an opaque `.trim()` TypeError.
+      if (typeof j.word !== "string")
+        throw new TypeError(`entry has no string "word" field`);
+      const isWord = j.word.trim().split(" ").length === 1;
+      if (isWord) await handleWord(j);
+      else await handleIdiom(j);
+    } catch (e) {
+      // The catch covers JSON parsing as well as the handlers above.
+      console.log("error parsing", `line ${count}`, e);
+      // break;
+    }
+  }
+}
+
+/**
+ * One syllable entry as read from the `syls` array of the `/syls` HTTP
+ * response in `handleWord`.
+ * NOTE(review): the response is not validated at runtime, so these field
+ * types are an assumption about the service — confirm against it.
+ */
+type SorSyl = {
+ // Read by handleWord to decide where the word-level rhyme starts.
+ stressed: boolean;
+ long: boolean;
+ spelling: string;
+ // Appended to the word-level rhyme once the rhyme has started.
+ ipa: string;
+ nucleus: string;
+ onset: string;
+ medial: string;
+ coda: string;
+ // Opens the word-level rhyme when the syllable is stressed.
+ rhyme: string;
+ tone: string;
+};
+/**
+ * Builds the word-level rhyme from a syllable list.
+ *
+ * While nothing has been collected, unstressed syllables are skipped and a
+ * stressed syllable contributes its `rhyme`; once the result is non-empty,
+ * every following syllable contributes its whole `ipa`.
+ */
+function buildWordRhyme(syls: SorSyl[]): string {
+  let rhyme = "";
+  for (const syl of syls) {
+    if (rhyme) rhyme = `${rhyme}${syl.ipa}`;
+    else if (syl.stressed) rhyme = `${rhyme}${syl.rhyme}`;
+  }
+  return rhyme;
+}
+
+/**
+ * Stores one dump entry as a word.
+ *
+ * The word is added with `pdb.addWord`. Then, for every item of `j.sounds`
+ * that has an `ipa` key, the word, language and IPA are POSTed to the
+ * service at `http://localhost:8104/syls`, and the response is used to
+ * store a pronunciation, the word rhyme (when one can be built) and one row
+ * per syllable.
+ *
+ * A failure while handling one pronunciation is logged with the entry and
+ * the next pronunciation is processed. Errors from `pdb.addWord` propagate
+ * to the caller.
+ *
+ * @param j Parsed dump entry; `word`, `lang_code` and `sounds` are read.
+ */
+async function handleWord(j: any) {
+  const wordId = pdb.addWord(j.word, j.lang_code);
+  let ts = Date.now();
+
+  // `sounds` may be missing from an entry; fall back to an empty list so the
+  // rhyme lookup cannot throw (the loop below was already guarded this way).
+  const sounds: any[] = Array.isArray(j.sounds) ? j.sounds : [];
+  const hwikiRhyme = sounds.find(
+    (s) => s !== null && typeof s === "object" && "rhymes" in s,
+  );
+  const wikiRhyme = hwikiRhyme ? hwikiRhyme.rhymes : null;
+  for (const snd of sounds) {
+    // `in` throws on primitives, so skip anything that is not an object.
+    if (snd === null || typeof snd !== "object" || !("ipa" in snd)) continue;
+    // JSON.stringify(undefined) is undefined, which is stored as null.
+    const tags = JSON.stringify(snd.tags) || null;
+    const ipa = snd.ipa;
+    try {
+      const hres = await fetch("http://localhost:8104/syls", {
+        method: "POST",
+        headers: { "content-type": "application/json" },
+        body: JSON.stringify({ string: j.word, lang: j.lang_code, ipa }),
+      });
+      // An error response would otherwise fail later with a confusing
+      // JSON or property-access error.
+      if (!hres.ok)
+        throw new Error(`/syls responded ${hres.status} ${hres.statusText}`);
+      const hjon = await hres.json();
+      if (!Array.isArray(hjon?.syls))
+        throw new Error(`/syls response has no "syls" array`);
+      console.log(Date.now() - ts, "elapsed in http");
+      ts = Date.now();
+      pdb.addPronunciation(
+        "word",
+        wordId,
+        hjon.clean_ipa,
+        hjon.syls.length,
+        tags,
+        null,
+      );
+      const wordRhyme = buildWordRhyme(hjon.syls);
+      if (wordRhyme)
+        pdb.addWordRhyme(wordId, wordRhyme, j.lang_code, wikiRhyme);
+      else console.log("no rhyme?", hjon);
+      for (const syl of hjon.syls) {
+        // TODO ideally syllables would have spelling not IPA... harsh tho
+        pdb.addSyllable(
+          wordId,
+          syl.ipa,
+          j.lang_code,
+          syl.long,
+          syl.onset || null,
+          syl.medial || null,
+          syl.nucleus,
+          syl.coda || null,
+          syl.rhyme,
+          syl.tone || null,
+          null,
+        );
+      }
+      console.log(Date.now() - ts, "elapsed in db");
+      ts = Date.now();
+    } catch (e) {
+      console.error(e);
+      console.error(j);
+      // break;
+    }
+  }
+}
+/**
+ * Logs a dump entry as an idiom and stores it through `pdb.addIdiom`.
+ * No pronunciation is recorded for it.
+ */
+async function handleIdiom(j: any) {
+  const { word, lang_code: langCode } = j;
+  console.log(word, "idiom");
+  pdb.addIdiom(word, langCode);
+  // TODO IPA of idioms...?
+}
+/**
+ * Debug helper: prints the etymology and headword template fields of a dump
+ * entry. Nothing is stored.
+ */
+async function handleEtim(j: any) {
+  // [field read from the entry, label printed after its value]
+  const fields: [string, string][] = [
+    ["etymology_text", "etym"],
+    ["etymology_templates", "etym"],
+    // Example data (presumably one `etymology_templates` item):
+    // {
+    //   name: "inh",
+    //   args: {
+    //     "1": "en",
+    //     "2": "ang",
+    //     "3": "frēo",
+    //     "4": "",
+    //     "5": "free",
+    //   },
+    //   expansion: "Old English frēo (“free”)",
+    // },
+    ["head_templates", "head"],
+    // Example data (presumably one `head_templates` item):
+    // {
+    //   name: "en-verb",
+    //   args: {},
+    //   expansion: "free (third-person singular simple present frees, present participle freeing, simple past and past participle freed)",
+    // }
+  ];
+  for (const [field, label] of fields) {
+    console.log(j[field], label);
+  }
+}
+/**
+ * Debug helper: prints the word-relation lists of a dump entry, each one
+ * preceded by a short label. Nothing is stored.
+ */
+async function handleDerived(j: any) {
+  // [label printed first, field read from the entry]
+  const sections: [string, string][] = [
+    ["forms", "forms"], // {form: string; tags: string[]}
+    ["derived", "derived"], // {word: string}
+    ["related", "related"], // {word: string, source?: string;}
+    ["ant", "antonyms"], // {word: string, source?: string;}
+    ["hypo", "hyponyms"],
+    ["syno", "synonyms"], // {word: string, source?: string;}
+    ["desc", "descendants"],
+  ];
+  for (const [label, field] of sections) {
+    console.log(label, j[field]);
+  }
+}
+/**
+ * Debug helper: prints the `examples` and `info_templates` of the first
+ * sense of a dump entry. Nothing is stored.
+ *
+ * Returns without logging when `senses` is missing or empty, so such an
+ * entry no longer rejects with a TypeError.
+ *
+ * @param pos Part of speech of the entry; currently unused.
+ * @param senses The entry's senses.
+ */
+async function handleSenses(pos: string, senses: any[]) {
+  const first = senses?.[0];
+  if (first == null) return;
+  console.log("ex", first.examples);
+  // {text: string; ref: string; type: "quote"}
+  console.log("info", first.info_templates);
+  for (const s of senses) {
+    // Placeholder for per-sense handling:
+    // s.glosses[]
+    // s.tags[]
+  }
+}
+
+// Entry point: run the import. redump() logs per-line failures itself, but
+// pdb.init() or the dump reader can still reject, so report that here
+// instead of leaving the promise floating.
+redump().catch((e: unknown) => {
+  console.error("redump failed", e);
+  process.exitCode = 1;
+});
+
+/**
+ * Scratch test: runs an insert-or-update with `RETURNING rowid` on `onsets`
+ * for ("lll", "en") and logs the row that comes back.
+ */
+async function newtest() {
+  // Earlier experiment with a plain insert into `syllables`:
+  // const query = pdb.db.query(
+  //   `INSERT INTO syllables(text, lang, long, onset, medial, nucleus, coda, rhyme, tone, notes) VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
+  // );
+  // const res = query.run(
+  //   "lol",
+  //   "en",
+  //   true,
+  //   "l",
+  //   "j",
+  //   "o",
+  //   "q",
+  //   "joq",
+  //   null,
+  //   null,
+  // );
+  // const sylId = res.lastInsertRowid;
+  const upsertOnset = `INSERT INTO onsets(text, lang) VALUES(?, ?)
+ ON CONFLICT(text, lang) DO UPDATE SET
+ text = excluded.text
+ RETURNING rowid
+ `;
+  const statement = pdb.db.query(upsertOnset);
+  const res1 = statement.get("lll", "en");
+  console.log({ res1 });
+}
+// newtest();
+// Note: calling shell commands (the sorsyl binary via Bun.$) turned out to be very slow; the snippet is kept below for reference.
+// Bun.$.env({ FOO: ipa });
+// const res = await Bun.$`${SORSYL_PATH} $FOO`;
+// const syllables = JSON.parse(res.stdout.toString());
+// console.log(Date.now() - ts, "elapsed in py");
+// ts = Date.now();