all me here should merge

author: polwex <polwex@sortug.com> 2025-06-02 23:05:36 +0700
committer: polwex <polwex@sortug.com> 2025-06-02 23:05:36 +0700
commit: 904b34de8f7748b7954d88784369b9cae6fa92fb (patch)
tree: 53bb5cb3377ae40d8bfa44087a0c712edd6c9d02 /src/lib/db/seed.ts
parent: a03c92dc82ad527d7da6bbaa3c43000e2e5f0e69 (diff)
1 file changed, 52 insertions, 80 deletions
diff --git a/src/lib/db/seed.ts b/src/lib/db/seed.ts
index 4780dc3..c03da60 100644
--- a/src/lib/db/seed.ts
+++ b/src/lib/db/seed.ts
@@ -1,3 +1,4 @@
+import Database from "bun:sqlite";
 import { readWiktionaryDump } from "../services/wiki";
 import { getStressedSyllable, getSyllableCount } from "../utils";
 import useful from "@/lib/useful_thai.json";
@@ -7,36 +8,6 @@ import { findLemma } from "../calls/nlp";
 
 const SYMBOL_REGEX = new RegExp(/[\W\d]/);
 
-async function handleFile(
-  filename: string,
-  func: (line: string, idx: number) => void,
-) {
-  const file = Bun.file(filename);
-  const s = file.stream();
-  const reader = s.getReader();
-  const decoder = new TextDecoder();
-  let leftover = "";
-  let lineCount = 0;
-  while (true) {
-    const { value, done } = await reader.read();
-    if (done) break;
-    const chunk = decoder.decode(value, { stream: true });
-    const lines = (leftover + chunk).split("\n");
-
-    // Process each line except the last (which might be incomplete)
-    for (const line of lines.slice(0, -1)) {
-      lineCount++;
-      func(line, lineCount);
-    }
-
-    // Save the last incomplete line to process in the next iteration
-    leftover = lines[lines.length - 1];
-  }
-
-  // Handle any remaining content after reading all chunks
-  if (leftover) func(leftover, lineCount + 1);
-}
-
 function goodPos(pos: string): boolean {
   const list = [
     "CC",
@@ -90,12 +61,12 @@ async function englishFreq() {
 }
 async function thaiFreq() {
   const files = [
-    "/home/y/code/prosody/prosody/langdata/thai/data/1yin_freq.csv",
-    "/home/y/code/prosody/prosody/langdata/thai/data/2yin_freq.csv",
-    "/home/y/code/prosody/prosody/langdata/thai/data/3yin_freq.csv",
-    "/home/y/code/prosody/prosody/langdata/thai/data/4yin_freq.csv",
-    "/home/y/code/prosody/prosody/langdata/thai/data/5yin_freq.csv",
-    "/home/y/code/prosody/prosody/langdata/thai/data/6yin_freq.csv",
+    "/home/y/code/prosody/resources/langdata/thai/data/1yin_freq.csv",
+    "/home/y/code/prosody/resources/langdata/thai/data/2yin_freq.csv",
+    "/home/y/code/prosody/resources/langdata/thai/data/3yin_freq.csv",
+    "/home/y/code/prosody/resources/langdata/thai/data/4yin_freq.csv",
+    "/home/y/code/prosody/resources/langdata/thai/data/5yin_freq.csv",
+    "/home/y/code/prosody/resources/langdata/thai/data/6yin_freq.csv",
   ];
   for (let f of files) {
     handleFile(f, (line, idx) => {
@@ -508,52 +479,51 @@ function fixSyllables() {
 //
 const SORSYL_PATH =
   "/nix/store/lkyi9rrjbr619w3ivpkm89ccf93bvxx5-sorsyl-0.1.0/bin/sorsyl";
-async function redump() {
-  await pdb.init();
-  let count = 0;
 
-  // const soundTypes = new Set<string>();
-  // [
-  //   "tags", "ipa", "audio", "ogg_url", "mp3_url", "enpr", "rhymes", "homophone", "note", "zh-pron", "other",
-  //   "text", "hangeul", "topics", "form", "audio-ipa"
-  // ]
-  const langs = ["en", "th", "zh", "es", "ja", "vn"];
-  for await (const line of readWiktionaryDump()) {
-    try {
-      count++;
-      console.log({ count });
-      // if (count > 50) break;
-      const j = JSON.parse(line);
-      // console.log(Object.keys(j), j.word);
-      // add language to db
-      pdb.addLanguage(j.lang_code, j.lang);
-      if (!langs.includes(j.lang_code)) continue;
-      // handleEtim(j);
-      // handleDerived(j);
-      // handleSenses(j.pos, j.senses);
-      // //
-      const isWord = j.word.trim().split(" ").length === 1;
-      if (isWord) await handleWord(j);
-      else await handleIdiom(j);
-    } catch (e) {
-      // console.log("error parsing", e);
-      // break;
-    }
+async function redump(lang: string) {
+  let count = 0;
+  const langdb = new Database(
+    `/home/y/code/prosody/resources/wiktionary/${lang}.db`,
+  );
+  const langrows: any = langdb.query("SELECT data FROM langs");
+  for (const langrow of langrows) {
+    const j = JSON.parse(langrow.data);
+    console.log({ j });
+    if (count > 10) break;
   }
+  // await pdb.init();
+
+  // // const soundTypes = new Set<string>();
+  // // [
+  // //   "tags", "ipa", "audio", "ogg_url", "mp3_url", "enpr", "rhymes", "homophone", "note", "zh-pron", "other",
+  // //   "text", "hangeul", "topics", "form", "audio-ipa"
+  // // ]
+  // const langs = ["en", "th", "zh", "es", "ja", "vn"];
+
+  // for await (const line of readWiktionaryDump()) {
+  //   try {
+  //     count++;
+  //     console.log({ count });
+  //     // if (count > 50) break;
+  //     const j = JSON.parse(line);
+  //     // console.log(Object.keys(j), j.word);
+  //     // add language to db
+  //     pdb.addLanguage(j.lang_code, j.lang);
+  //     if (!langs.includes(j.lang_code)) continue;
+  //     // handleEtim(j);
+  //     // handleDerived(j);
+  //     // handleSenses(j.pos, j.senses);
+  //     // //
+  //     const isWord = j.word.trim().split(" ").length === 1;
+  //     if (isWord) await handleWord(j);
+  //     else await handleIdiom(j);
+  //   } catch (e) {
+  //     // console.log("error parsing", e);
+  //     // break;
+  //   }
+  // }
 }
 
-type SorSyl = {
-  stressed: boolean;
-  long: boolean;
-  spelling: string;
-  ipa: string;
-  nucleus: string;
-  onset: string;
-  medial: string;
-  coda: string;
-  rhyme: string;
-  tone: string;
-};
 async function handleWord(j: any) {
   let ts = Date.now();
   const analyzed = await findLemma(j.word, j.lang_code);
@@ -615,9 +585,11 @@ async function handleIpa(
       // TODO ideally syllables would have spelling not IPA... harsh tho
       pdb.addSyllable(
         wordId,
-        syl.ipa,
+        idx,
         j.lang_code,
+        syl.ipa,
         syl.long,
+        "",
         syl.onset || null,
         syl.medial || null,
         syl.nucleus,
@@ -689,7 +661,7 @@ async function handleSenses(pos: string, senses: any[]) {
   }
 }
 
-redump();
+redump("th");
 
 async function newtest() {
   // const query = pdb.db.query(
author	polwex <polwex@sortug.com>	2025-06-02 23:05:36 +0700
committer	polwex <polwex@sortug.com>	2025-06-02 23:05:36 +0700
commit	904b34de8f7748b7954d88784369b9cae6fa92fb (patch)
tree	53bb5cb3377ae40d8bfa44087a0c712edd6c9d02 /src/lib/db/seed.ts
parent	a03c92dc82ad527d7da6bbaa3c43000e2e5f0e69 (diff)