diff options
author | polwex <polwex@sortug.com> | 2025-06-02 23:05:36 +0700 |
---|---|---|
committer | polwex <polwex@sortug.com> | 2025-06-02 23:05:36 +0700 |
commit | 904b34de8f7748b7954d88784369b9cae6fa92fb (patch) | |
tree | 53bb5cb3377ae40d8bfa44087a0c712edd6c9d02 /src/lib/db/seed.ts | |
parent | a03c92dc82ad527d7da6bbaa3c43000e2e5f0e69 (diff) |
all me here should merge
Diffstat (limited to 'src/lib/db/seed.ts')
-rw-r--r-- | src/lib/db/seed.ts | 132 |
1 files changed, 52 insertions, 80 deletions
diff --git a/src/lib/db/seed.ts b/src/lib/db/seed.ts index 4780dc3..c03da60 100644 --- a/src/lib/db/seed.ts +++ b/src/lib/db/seed.ts @@ -1,3 +1,4 @@ +import Database from "bun:sqlite"; import { readWiktionaryDump } from "../services/wiki"; import { getStressedSyllable, getSyllableCount } from "../utils"; import useful from "@/lib/useful_thai.json"; @@ -7,36 +8,6 @@ import { findLemma } from "../calls/nlp"; const SYMBOL_REGEX = new RegExp(/[\W\d]/); -async function handleFile( - filename: string, - func: (line: string, idx: number) => void, -) { - const file = Bun.file(filename); - const s = file.stream(); - const reader = s.getReader(); - const decoder = new TextDecoder(); - let leftover = ""; - let lineCount = 0; - while (true) { - const { value, done } = await reader.read(); - if (done) break; - const chunk = decoder.decode(value, { stream: true }); - const lines = (leftover + chunk).split("\n"); - - // Process each line except the last (which might be incomplete) - for (const line of lines.slice(0, -1)) { - lineCount++; - func(line, lineCount); - } - - // Save the last incomplete line to process in the next iteration - leftover = lines[lines.length - 1]; - } - - // Handle any remaining content after reading all chunks - if (leftover) func(leftover, lineCount + 1); -} - function goodPos(pos: string): boolean { const list = [ "CC", @@ -90,12 +61,12 @@ async function englishFreq() { } async function thaiFreq() { const files = [ - "/home/y/code/prosody/prosody/langdata/thai/data/1yin_freq.csv", - "/home/y/code/prosody/prosody/langdata/thai/data/2yin_freq.csv", - "/home/y/code/prosody/prosody/langdata/thai/data/3yin_freq.csv", - "/home/y/code/prosody/prosody/langdata/thai/data/4yin_freq.csv", - "/home/y/code/prosody/prosody/langdata/thai/data/5yin_freq.csv", - "/home/y/code/prosody/prosody/langdata/thai/data/6yin_freq.csv", + "/home/y/code/prosody/resources/langdata/thai/data/1yin_freq.csv", + "/home/y/code/prosody/resources/langdata/thai/data/2yin_freq.csv", + "/home/y/code/prosody/resources/langdata/thai/data/3yin_freq.csv", + "/home/y/code/prosody/resources/langdata/thai/data/4yin_freq.csv", + "/home/y/code/prosody/resources/langdata/thai/data/5yin_freq.csv", + "/home/y/code/prosody/resources/langdata/thai/data/6yin_freq.csv", ]; for (let f of files) { handleFile(f, (line, idx) => { @@ -508,52 +479,51 @@ function fixSyllables() { // const SORSYL_PATH = "/nix/store/lkyi9rrjbr619w3ivpkm89ccf93bvxx5-sorsyl-0.1.0/bin/sorsyl"; -async function redump() { - await pdb.init(); - let count = 0; - // const soundTypes = new Set<string>(); - // [ - // "tags", "ipa", "audio", "ogg_url", "mp3_url", "enpr", "rhymes", "homophone", "note", "zh-pron", "other", - // "text", "hangeul", "topics", "form", "audio-ipa" - // ] - const langs = ["en", "th", "zh", "es", "ja", "vn"]; - for await (const line of readWiktionaryDump()) { - try { - count++; - console.log({ count }); - // if (count > 50) break; - const j = JSON.parse(line); - // console.log(Object.keys(j), j.word); - // add language to db - pdb.addLanguage(j.lang_code, j.lang); - if (!langs.includes(j.lang_code)) continue; - // handleEtim(j); - // handleDerived(j); - // handleSenses(j.pos, j.senses); - // // - const isWord = j.word.trim().split(" ").length === 1; - if (isWord) await handleWord(j); - else await handleIdiom(j); - } catch (e) { - // console.log("error parsing", e); - // break; - } +async function redump(lang: string) { + let count = 0; + const langdb = new Database( + `/home/y/code/prosody/resources/wiktionary/${lang}.db`, + ); + const langrows: any = langdb.query("SELECT data FROM langs"); + for (const langrow of langrows) { + const j = JSON.parse(langrow.data); + console.log({ j }); + if (count > 10) break; } + // await pdb.init(); + + // // const soundTypes = new Set<string>(); + // // [ + // // "tags", "ipa", "audio", "ogg_url", "mp3_url", "enpr", "rhymes", "homophone", "note", "zh-pron", "other", + // // "text", "hangeul", "topics", "form", "audio-ipa" + // // ] + // const langs = ["en", "th", "zh", "es", "ja", "vn"]; + + // for await (const line of readWiktionaryDump()) { + // try { + // count++; + // console.log({ count }); + // // if (count > 50) break; + // const j = JSON.parse(line); + // // console.log(Object.keys(j), j.word); + // // add language to db + // pdb.addLanguage(j.lang_code, j.lang); + // if (!langs.includes(j.lang_code)) continue; + // // handleEtim(j); + // // handleDerived(j); + // // handleSenses(j.pos, j.senses); + // // // + // const isWord = j.word.trim().split(" ").length === 1; + // if (isWord) await handleWord(j); + // else await handleIdiom(j); + // } catch (e) { + // // console.log("error parsing", e); + // // break; + // } + // } } -type SorSyl = { - stressed: boolean; - long: boolean; - spelling: string; - ipa: string; - nucleus: string; - onset: string; - medial: string; - coda: string; - rhyme: string; - tone: string; -}; async function handleWord(j: any) { let ts = Date.now(); const analyzed = await findLemma(j.word, j.lang_code); @@ -615,9 +585,11 @@ async function handleIpa( // TODO ideally syllables would have spelling not IPA... harsh tho pdb.addSyllable( wordId, - syl.ipa, + idx, j.lang_code, + syl.ipa, syl.long, + "", syl.onset || null, syl.medial || null, syl.nucleus, @@ -689,7 +661,7 @@ async function handleSenses(pos: string, senses: any[]) { } } -redump(); +redump("th"); async function newtest() { // const query = pdb.db.query( |