summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author polwex <polwex@sortug.com> 2025-06-03 19:40:34 +0700
committer polwex <polwex@sortug.com> 2025-06-03 19:40:34 +0700
commit b91b758041cbc7b8bf7e2a4aee8d6228a75d8105 (patch)
tree 4fa343ed394034b16841ecfcb6411b1574d24b25
parent 175ddca375cef765cec8ca5bbc527a205c40bf25 (diff)
m
-rw-r--r-- src/lib/calls/nlp.ts 14
-rw-r--r-- src/lib/db/enseed.ts 195
-rw-r--r-- src/lib/db/prosodydb.ts 10
3 files changed, 137 insertions, 82 deletions
diff --git a/src/lib/calls/nlp.ts b/src/lib/calls/nlp.ts
index 1e84e93..2810744 100644
--- a/src/lib/calls/nlp.ts
+++ b/src/lib/calls/nlp.ts
@@ -176,3 +176,17 @@ export async function findLemma(word: string, lang: string) {
const jj = await r2.json();
return jj;
}
+export async function charsiuG2P(word: string, lang: string) {
+ const opts = {
+ method: "POST",
+ headers: {
+ "Content-type": "application/json",
+ "X-API-KEY": Bun.env.SORTUG_NLP_API_KEY!,
+ },
+ body: JSON.stringify({ string: word, lang }),
+ };
+ // const r1 = await fetch(`http://localhost:8000/segmentate`, opts);
+ const r2 = await fetch("http://localhost:8105" + `/ipa`, opts);
+ const jj = await r2.json();
+ return jj;
+}
diff --git a/src/lib/db/enseed.ts b/src/lib/db/enseed.ts
index 39dec44..9ef61ed 100644
--- a/src/lib/db/enseed.ts
+++ b/src/lib/db/enseed.ts
@@ -1,24 +1,15 @@
import Database from "bun:sqlite";
-import {
- analyzeTHWord,
- deconstructSyllable,
- segmentateThai,
- type SorSyl,
- type ThaiNLPRes,
- sorSyl,
- getThaiFreq,
- SorBSyl,
-} from "../calls/nlp";
+import { sorSyl, SorBSyl, charsiuG2P, SorSylRes } from "../calls/nlp";
import pdb from "./prosodydb";
import { cleanIpa } from "../utils";
import { handleFile } from "./utils";
-import { Tone } from "../types/phonetics";
+import { Phoneme, Tone } from "../types/phonetics";
import { AsyncRes } from "../types";
const errors: string[] = [];
async function readDump(lang: string) {
await pdb.init();
- pdb.addLanguage("th", "thai");
+ pdb.addLanguage("en", "english");
let count = 0;
const langdb = new Database(
`/home/y/code/prosody/resources/wiktionary/${lang}.db`,
@@ -37,8 +28,8 @@ async function readDump(lang: string) {
const split = word.split(" ");
const res =
split.length > 1
- ? await handleIdiom(lang, word)
- : await handleWord(lang, word, j, freqMap);
+ ? await handleIdiom(word, lang)
+ : await handleWord(word, lang, j, freqMap);
if ("error" in res) {
console.error(res.error);
break;
@@ -48,50 +39,69 @@ async function readDump(lang: string) {
}
async function handleWord(
- lang: string,
word: string,
+ lang: string,
j: any,
freqMap: Map<string, number>,
): AsyncRes<string> {
- // TODO add categories but add a tag to see what classifying scheme we're using
- //
+ const frequency = freqMap.get(word) || null;
+ const promises = await getIpa(word, lang, j);
+ const phonetics = await Promise.all(promises);
+
+ // pdb.superAdd({ word, lang, frequency, wordNotes: null, phonetics });
+ return { ok: "" };
+}
+
+type IPAData = {
+ ipa: string;
+ syllable_count: number;
+ syllable_sequence: string;
+ tone_sequence: string;
+ ipa_sequence: string;
+ tags: string | null;
+ notes: string | null;
+ wordRhyme: string | null;
+ syllables: SylData[];
+};
+async function getIpa(
+ word: string,
+ lang: string,
+ j: any,
+): Promise<Promise<IPAData>[]> {
const sounds = j.sounds || [];
const hasIpa = sounds.find((s: any) => "ipa" in s);
- const hwikiRhyme = sounds.find((s: any) => "rhymes" in s);
- const wikiRhyme = hwikiRhyme ? hwikiRhyme.rhymes : null;
if (!hasIpa) {
- // console.error("no ipa!!", word);
+ console.log("no ipa", word);
// console.dir(j, { depth: null });
- return { error: "meh no ipa" };
+ console.dir(sounds, { depth: null });
+ // TODO fetch from idk charsiu
+ // const ipa = await charsiuG2P(word, lang);
+ // console.log("charsiu", ipa);
}
- const freq = freqMap.get(word) || null;
- // const wordId = pdb.addWord(word, lang, freq, null);
- // WIPE
- const wordId = 0;
- // console.log(analyzed);
- for (let snd of sounds)
- if ("ipa" in snd) {
- const res = await handleIpa(wordId, word, lang, j, snd, wikiRhyme);
- if ("error" in res) return res;
- }
- return { ok: "" };
+ const hwikiRhyme = sounds.find((s: any) => "rhymes" in s);
+ const wikiRhyme = hwikiRhyme ? hwikiRhyme.rhymes : null;
+ const ipaData: Promise<IPAData>[] = sounds.reduce(
+ (acc: Promise<IPAData>[], snd: any) => {
+ if ("ipa" in snd) {
+ const data = getIpaData(word, lang, snd, wikiRhyme);
+ return [...acc, data];
+ } else return acc;
+ },
+ [],
+ );
+ return ipaData;
}
-async function handleIpa(
- wordId: number | bigint,
+
+async function getIpaData(
word: string,
lang: string,
- j: any,
snd: any,
wikiRhyme: string | null,
-) {
+): Promise<IPAData> {
+ console.log("geting ipa...");
const tags = JSON.stringify(snd.tags) || null;
- const ipa = snd.ipa;
+ const ipa = cleanIpa(snd.ipa);
const syls = await sorSyl(word, lang, ipa);
- // console.log(syls, "sorsyl");
-
- console.log(word);
- console.log(ipa);
- pdb.addPronunciation(wordId, ipa, syls.syls.length, tags, null);
// set word rhyme
const wordRhyme = syls.syls.reduce((acc: string, itemm: SorBSyl) => {
const item = itemm.ipa;
@@ -99,47 +109,76 @@ async function handleIpa(
if (item.stressed && !acc) return `${acc}${item.rhyme}`;
else return `${acc}${item.all}`;
}, "");
- if (wordRhyme) pdb.addWordRhyme(wordId, wordRhyme, j.lang_code, wikiRhyme);
+ console.log({ word, wikiRhyme, wordRhyme });
- for (let i = 0; i < syls.syls.length; i++) {
- const syl = syls.syls[i]!;
- const res = await handleSyllable(syl, wordId, i);
- if ("error" in res) return res;
- }
- return { ok: "" };
+ const tone_sequence = "";
+ const seqs = syls.syls.reduce(
+ (acc, item, idx) => {
+ const startString = idx === 0 ? "" : ",";
+ const { ipa, spelling } = item;
+ acc.ipa += `${startString}${ipa.all}`;
+ acc.syls += `${startString}${spelling.all}`;
+ return acc;
+ },
+ { syls: "", ipa: "" },
+ );
+ const syllable_sequence = seqs.syls;
+ const ipa_sequence = seqs.ipa;
+ const syllables = getSyllables(syls);
+ return {
+ ipa,
+ syllable_count: syls.syls.length,
+ syllable_sequence,
+ tone_sequence,
+ ipa_sequence,
+ tags,
+ notes: null,
+ wordRhyme: null,
+ syllables,
+ };
}
-async function handleSyllable(
- syl: SorBSyl,
- wordId: number | bigint,
- idx: number,
-): AsyncRes<string> {
- try {
- pdb.addSyllable(
- wordId,
- idx + 1,
- syl.ipa.stressed,
- "th",
- syl.ipa.all,
- syl.ipa.long,
- syl.spelling.all,
- { spelling: syl.spelling.onset, ipa: syl.ipa.onset },
- { spelling: syl.spelling.medial, ipa: syl.ipa.medial },
- { spelling: syl.spelling.nucleus, ipa: syl.ipa.nucleus },
- { spelling: syl.spelling.coda, ipa: syl.ipa.coda },
- { spelling: syl.spelling.rhyme, ipa: syl.ipa.rhyme },
- { letters: "", numbers: 0, name: "" },
- null,
- );
- return { ok: "" };
- } catch (e) {
- // console.log("well fuck", syl);
- // console.error(e);
- return { error: `${e}` };
+type SylData = {
+ idx: number;
+ stressed: boolean | null;
+ spelling: string;
+ ipa: string;
+ long: boolean;
+ onset: Phoneme;
+ medial: Phoneme;
+ nucleus: Phoneme;
+ coda: Phoneme;
+ rhyme: Phoneme;
+ tone: Tone;
+ notes: string | null;
+};
+function getSyllables(syl: SorSylRes): SylData[] {
+ let syls: SylData[] = [];
+ for (let i = 0; i < syl.syls.length; i++) {
+ const syllable = syl.syls[i]!;
+ const res = getSyllable(syllable, i);
+ syls.push(res);
}
+ return syls;
+}
+function getSyllable(syl: SorBSyl, idx: number): SylData {
+ return {
+ idx: idx + 1,
+ stressed: null,
+ spelling: syl.spelling.all,
+ ipa: syl.ipa.all,
+ long: syl.ipa.long,
+ onset: { spelling: syl.spelling.onset, ipa: syl.ipa.onset },
+ medial: { spelling: syl.spelling.medial, ipa: syl.ipa.medial },
+ nucleus: { spelling: syl.spelling.nucleus, ipa: syl.ipa.nucleus },
+ coda: { spelling: syl.spelling.coda, ipa: syl.ipa.coda },
+ rhyme: { spelling: syl.spelling.rhyme, ipa: syl.ipa.rhyme },
+ tone: { name: "", letters: "", numbers: 0 },
+ notes: null,
+ };
}
-async function handleIdiom(lang: string, idiom: string): AsyncRes<string> {
+async function handleIdiom(idiom: string, lang: string): AsyncRes<string> {
try {
- pdb.addIdiom(idiom, lang);
+ // pdb.addIdiom(idiom, lang);
// TODO later set idiom_words once all words are populated
// console.log();
return { ok: "" };
diff --git a/src/lib/db/prosodydb.ts b/src/lib/db/prosodydb.ts
index 7c067d2..26687a2 100644
--- a/src/lib/db/prosodydb.ts
+++ b/src/lib/db/prosodydb.ts
@@ -8,10 +8,13 @@ class DatabaseHandler {
db: Database;
constructor() {
// const dbPath = "/home/y/code/bun/ssr/waku/bulkdata/phon.db";
- const dbPath = "/home/y/code/bun/ssr/waku/bulkdata/thaiphon.db";
+ const dbPath = "/home/y/code/bun/ssr/waku/bulkdata/enphon.db";
const db = new Database(dbPath, { create: true });
db.exec("PRAGMA journal_mode = WAL"); // Enable Write-Ahead Logging for better performance
db.exec("PRAGMA foreign_keys = ON");
+ db.exec("PRAGMA cache_size = -8000"); // Increase cache size to 8MB
+ db.exec("PRAGMA temp_store = MEMORY"); // Store temp tables in memory
+ db.exec("PRAGMA synchronous = NORMAL"); // Slightly less safe but faster
this.db = db;
}
async init() {
@@ -62,9 +65,8 @@ class DatabaseHandler {
FROM words w
JOIN word_phonetics wp ON wp.word_id = w.id
JOIN syllables_words sw ON sw.word_id = w.id
- WHERE w.frequency IS NOT NULL
- AND w.lang = ?
- ORDER BY w.frequency ASC
+ WHERE w.lang = ?
+ ORDER BY w.frequency ASC NULLS LAST
LIMIT 300
`,
);