summary | refs | log | tree | commit | diff
path: root/src/lib/db/seed.ts
diff options
context:
space:
mode:
Diffstat (limited to 'src/lib/db/seed.ts')
-rw-r--r-- src/lib/db/seed.ts 132
1 files changed, 52 insertions, 80 deletions
diff --git a/src/lib/db/seed.ts b/src/lib/db/seed.ts
index 4780dc3..c03da60 100644
--- a/src/lib/db/seed.ts
+++ b/src/lib/db/seed.ts
@@ -1,3 +1,4 @@
+import Database from "bun:sqlite";
import { readWiktionaryDump } from "../services/wiki";
import { getStressedSyllable, getSyllableCount } from "../utils";
import useful from "@/lib/useful_thai.json";
@@ -7,36 +8,6 @@ import { findLemma } from "../calls/nlp";
const SYMBOL_REGEX = new RegExp(/[\W\d]/);
-async function handleFile(
- filename: string,
- func: (line: string, idx: number) => void,
-) {
- const file = Bun.file(filename);
- const s = file.stream();
- const reader = s.getReader();
- const decoder = new TextDecoder();
- let leftover = "";
- let lineCount = 0;
- while (true) {
- const { value, done } = await reader.read();
- if (done) break;
- const chunk = decoder.decode(value, { stream: true });
- const lines = (leftover + chunk).split("\n");
-
- // Process each line except the last (which might be incomplete)
- for (const line of lines.slice(0, -1)) {
- lineCount++;
- func(line, lineCount);
- }
-
- // Save the last incomplete line to process in the next iteration
- leftover = lines[lines.length - 1];
- }
-
- // Handle any remaining content after reading all chunks
- if (leftover) func(leftover, lineCount + 1);
-}
-
function goodPos(pos: string): boolean {
const list = [
"CC",
@@ -90,12 +61,12 @@ async function englishFreq() {
}
async function thaiFreq() {
const files = [
- "/home/y/code/prosody/prosody/langdata/thai/data/1yin_freq.csv",
- "/home/y/code/prosody/prosody/langdata/thai/data/2yin_freq.csv",
- "/home/y/code/prosody/prosody/langdata/thai/data/3yin_freq.csv",
- "/home/y/code/prosody/prosody/langdata/thai/data/4yin_freq.csv",
- "/home/y/code/prosody/prosody/langdata/thai/data/5yin_freq.csv",
- "/home/y/code/prosody/prosody/langdata/thai/data/6yin_freq.csv",
+ "/home/y/code/prosody/resources/langdata/thai/data/1yin_freq.csv",
+ "/home/y/code/prosody/resources/langdata/thai/data/2yin_freq.csv",
+ "/home/y/code/prosody/resources/langdata/thai/data/3yin_freq.csv",
+ "/home/y/code/prosody/resources/langdata/thai/data/4yin_freq.csv",
+ "/home/y/code/prosody/resources/langdata/thai/data/5yin_freq.csv",
+ "/home/y/code/prosody/resources/langdata/thai/data/6yin_freq.csv",
];
for (let f of files) {
handleFile(f, (line, idx) => {
@@ -508,52 +479,51 @@ function fixSyllables() {
//
const SORSYL_PATH =
"/nix/store/lkyi9rrjbr619w3ivpkm89ccf93bvxx5-sorsyl-0.1.0/bin/sorsyl";
-async function redump() {
- await pdb.init();
- let count = 0;
- // const soundTypes = new Set<string>();
- // [
- // "tags", "ipa", "audio", "ogg_url", "mp3_url", "enpr", "rhymes", "homophone", "note", "zh-pron", "other",
- // "text", "hangeul", "topics", "form", "audio-ipa"
- // ]
- const langs = ["en", "th", "zh", "es", "ja", "vn"];
- for await (const line of readWiktionaryDump()) {
- try {
- count++;
- console.log({ count });
- // if (count > 50) break;
- const j = JSON.parse(line);
- // console.log(Object.keys(j), j.word);
- // add language to db
- pdb.addLanguage(j.lang_code, j.lang);
- if (!langs.includes(j.lang_code)) continue;
- // handleEtim(j);
- // handleDerived(j);
- // handleSenses(j.pos, j.senses);
- // //
- const isWord = j.word.trim().split(" ").length === 1;
- if (isWord) await handleWord(j);
- else await handleIdiom(j);
- } catch (e) {
- // console.log("error parsing", e);
- // break;
- }
+async function redump(lang: string) {
+ let count = 0;
+ const langdb = new Database(
+ `/home/y/code/prosody/resources/wiktionary/${lang}.db`,
+ );
+ const langrows: any = langdb.query("SELECT data FROM langs");
+ for (const langrow of langrows) {
+ const j = JSON.parse(langrow.data);
+ console.log({ j });
+ if (count > 10) break;
}
+ // await pdb.init();
+
+ // // const soundTypes = new Set<string>();
+ // // [
+ // // "tags", "ipa", "audio", "ogg_url", "mp3_url", "enpr", "rhymes", "homophone", "note", "zh-pron", "other",
+ // // "text", "hangeul", "topics", "form", "audio-ipa"
+ // // ]
+ // const langs = ["en", "th", "zh", "es", "ja", "vn"];
+
+ // for await (const line of readWiktionaryDump()) {
+ // try {
+ // count++;
+ // console.log({ count });
+ // // if (count > 50) break;
+ // const j = JSON.parse(line);
+ // // console.log(Object.keys(j), j.word);
+ // // add language to db
+ // pdb.addLanguage(j.lang_code, j.lang);
+ // if (!langs.includes(j.lang_code)) continue;
+ // // handleEtim(j);
+ // // handleDerived(j);
+ // // handleSenses(j.pos, j.senses);
+ // // //
+ // const isWord = j.word.trim().split(" ").length === 1;
+ // if (isWord) await handleWord(j);
+ // else await handleIdiom(j);
+ // } catch (e) {
+ // // console.log("error parsing", e);
+ // // break;
+ // }
+ // }
}
-type SorSyl = {
- stressed: boolean;
- long: boolean;
- spelling: string;
- ipa: string;
- nucleus: string;
- onset: string;
- medial: string;
- coda: string;
- rhyme: string;
- tone: string;
-};
async function handleWord(j: any) {
let ts = Date.now();
const analyzed = await findLemma(j.word, j.lang_code);
@@ -615,9 +585,11 @@ async function handleIpa(
// TODO ideally syllables would have spelling not IPA... harsh tho
pdb.addSyllable(
wordId,
- syl.ipa,
+ idx,
j.lang_code,
+ syl.ipa,
syl.long,
+ "",
syl.onset || null,
syl.medial || null,
syl.nucleus,
@@ -689,7 +661,7 @@ async function handleSenses(pos: string, senses: any[]) {
}
}
-redump();
+redump("th");
async function newtest() {
// const query = pdb.db.query(