diff options
author | polwex <polwex@sortug.com> | 2025-05-29 12:46:01 +0700 |
---|---|---|
committer | polwex <polwex@sortug.com> | 2025-05-29 12:46:01 +0700 |
commit | 06e8d0a0d636f539f20ece3d9d767190d0a71b3b (patch) | |
tree | 4564a30b20e6bafc89ecdf0e818b23e4734f3ec1 | |
parent | a3f24ea79b14394b24c4b60a010651eb29eeb872 (diff) |
m
-rw-r--r-- | src/lib/calls/nlp.ts | 15 | ||||
-rw-r--r-- | src/lib/db/seed.ts | 128 |
2 files changed, 89 insertions, 54 deletions
diff --git a/src/lib/calls/nlp.ts b/src/lib/calls/nlp.ts index 28562d0..24e7cf3 100644 --- a/src/lib/calls/nlp.ts +++ b/src/lib/calls/nlp.ts @@ -52,3 +52,18 @@ export async function deconstructSyllable(ipa: string): Promise<SyllableRes> { const jj = await r2.json(); return jj; } + +export async function findLemma(word: string, lang: string) { + const opts = { + method: "POST", + headers: { + "Content-type": "application/json", + "X-API-KEY": Bun.env.SORTUG_NLP_API_KEY!, + }, + body: JSON.stringify({ string: word, lang }), + }; + // const r1 = await fetch(`http://localhost:8000/segmentate`, opts); + const r2 = await fetch("http://localhost:8102" + `/spacy`, opts); + const jj = await r2.json(); + return jj; +} diff --git a/src/lib/db/seed.ts b/src/lib/db/seed.ts index 7f4352f..0e291c3 100644 --- a/src/lib/db/seed.ts +++ b/src/lib/db/seed.ts @@ -4,6 +4,7 @@ import useful from "@/lib/useful_thai.json"; import db from "."; import pdb from "./prosodydb"; import * as Sorsyl from "sorsyl"; +import { findLemma } from "../calls/nlp"; const SYMBOL_REGEX = new RegExp(/[\W\d]/); @@ -516,6 +517,7 @@ async function redump() { // "tags", "ipa", "audio", "ogg_url", "mp3_url", "enpr", "rhymes", "homophone", "note", "zh-pron", "other", // "text", "hangeul", "topics", "form", "audio-ipa" // ] + const langs = ["en", "th", "zh", "es", "ja", "vn"]; for await (const line of readWiktionaryDump()) { try { count++; @@ -524,6 +526,7 @@ async function redump() { console.log(Object.keys(j), j.word); // add language to db pdb.addLanguage(j.lang_code, j.lang); + if (!langs.includes(j.lang_code)) continue; // handleEtim(j); // handleDerived(j); // handleSenses(j.pos, j.senses); @@ -551,64 +554,81 @@ type SorSyl = { tone: string; }; async function handleWord(j: any) { - const wordId = pdb.addWord(j.word, j.lang_code); let ts = Date.now(); + const analyzed = await findLemma(j.word, j.lang_code); + if (analyzed.segments.length !== 1) + return console.error("wtf bruh", analyzed); + const seg = analyzed.segments[0]; + const isLemma = analyzed.input === seg.lemma; + if (!isLemma) + return console.error("not lemma", { + ...seg, + word: j.word, + input: analyzed.input, + }); + const wordId = pdb.addWord(j.word, j.lang_code); - const hwikiRhyme = j.sounds.find((s) => "rhymes" in s); + const sounds = j.sounds || []; + const hwikiRhyme = sounds.find((s: any) => "rhymes" in s); const wikiRhyme = hwikiRhyme ? hwikiRhyme.rhymes : null; - for (let snd of j.sounds || []) { - if ("ipa" in snd) { - const tags = JSON.stringify(snd.tags) || null; - const ipa = snd.ipa; - try { - const hres = await fetch("http://localhost:8104/syls", { - method: "POST", - headers: { "content-type": "application/json" }, - body: JSON.stringify({ string: j.word, lang: j.lang_code, ipa }), - }); - const hjon = await hres.json(); - console.log(Date.now() - ts, "elapsed in http"); - ts = Date.now(); - pdb.addPronunciation( - "word", - wordId, - hjon.clean_ipa, - hjon.syls.length, - tags, - null, - ); - const wordRhyme = hjon.syls.reduce((acc: string, item: SorSyl) => { - if (!item.stressed && !acc) return acc; - if (item.stressed && !acc) return `${acc}${item.rhyme}`; - else return `${acc}${item.ipa}`; - }, ""); - if (wordRhyme) - pdb.addWordRhyme(wordId, wordRhyme, j.lang_code, wikiRhyme); - else console.log("no rhyme?", hjon); - for (const syl of hjon.syls) { - // TODO ideally syllables would have spelling not IPA... harsh tho - pdb.addSyllable( - wordId, - syl.ipa, - j.lang_code, - syl.long, - syl.onset || null, - syl.medial || null, - syl.nucleus, - syl.coda || null, - syl.rhyme, - syl.tone || null, - null, - ); - } - console.log(Date.now() - ts, "elapsed in db"); - ts = Date.now(); - } catch (e) { - console.error(e); - console.error(j); - // break; - } + for (let snd of sounds) { + if ("ipa" in snd) handleIpa(wordId, j, snd, wikiRhyme); + } +} +async function handleIpa( + wordId: number | bigint, + j: any, + snd: any, + wikiRhyme: string | null, +) { + const tags = JSON.stringify(snd.tags) || null; + const ipa = snd.ipa; + try { + const hres = await fetch("http://localhost:8104/syls", { + method: "POST", + headers: { "content-type": "application/json" }, + body: JSON.stringify({ string: j.word, lang: j.lang_code, ipa }), + }); + const hjon = await hres.json(); + // console.log(Date.now() - ts, "elapsed in http"); + // ts = Date.now(); + pdb.addPronunciation( + "word", + wordId, + hjon.clean_ipa, + hjon.syls.length, + tags, + null, + ); + const wordRhyme = hjon.syls.reduce((acc: string, item: SorSyl) => { + if (!item.stressed && !acc) return acc; + if (item.stressed && !acc) return `${acc}${item.rhyme}`; + else return `${acc}${item.ipa}`; + }, ""); + if (wordRhyme) pdb.addWordRhyme(wordId, wordRhyme, j.lang_code, wikiRhyme); + else console.log("no rhyme?", hjon); + for (const syl of hjon.syls) { + // TODO ideally syllables would have spelling not IPA... harsh tho + pdb.addSyllable( + wordId, + syl.ipa, + j.lang_code, + syl.long, + syl.onset || null, + syl.medial || null, + syl.nucleus, + syl.coda || null, + syl.rhyme, + syl.tone || null, + null, + ); } + // console.log(Date.now() - ts, "elapsed in db"); + // ts = Date.now(); + } catch (e) { + console.error(e); + console.error({ snd }); + // break; } } async function handleIdiom(j: any) { |