summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author polwex <polwex@sortug.com> 2025-05-29 12:46:01 +0700
committer polwex <polwex@sortug.com> 2025-05-29 12:46:01 +0700
commit 06e8d0a0d636f539f20ece3d9d767190d0a71b3b (patch)
tree 4564a30b20e6bafc89ecdf0e818b23e4734f3ec1
parent a3f24ea79b14394b24c4b60a010651eb29eeb872 (diff)
m
-rw-r--r-- src/lib/calls/nlp.ts 15
-rw-r--r-- src/lib/db/seed.ts 128
2 files changed, 89 insertions, 54 deletions
diff --git a/src/lib/calls/nlp.ts b/src/lib/calls/nlp.ts
index 28562d0..24e7cf3 100644
--- a/src/lib/calls/nlp.ts
+++ b/src/lib/calls/nlp.ts
@@ -52,3 +52,18 @@ export async function deconstructSyllable(ipa: string): Promise<SyllableRes> {
const jj = await r2.json();
return jj;
}
+
+/**
+ * Posts `word` and its language code to the local NLP service's `/spacy`
+ * endpoint and resolves with the parsed JSON reply.
+ *
+ * @param word - Text to analyze; sent as the `string` field of the JSON body.
+ * @param lang - Language code; sent as the `lang` field of the JSON body.
+ * @returns The decoded JSON response (its shape is defined by the service).
+ * @throws Error when the service answers with a non-2xx status.
+ */
+export async function findLemma(word: string, lang: string) {
+ const opts = {
+ method: "POST",
+ headers: {
+ "Content-type": "application/json",
+ "X-API-KEY": Bun.env.SORTUG_NLP_API_KEY!,
+ },
+ body: JSON.stringify({ string: word, lang }),
+ };
+ const res = await fetch("http://localhost:8102/spacy", opts);
+ // Fail loudly instead of handing an error payload to callers as if it were an analysis.
+ if (!res.ok)
+ throw new Error(`spacy request failed: ${res.status} ${res.statusText}`);
+ return await res.json();
+}
diff --git a/src/lib/db/seed.ts b/src/lib/db/seed.ts
index 7f4352f..0e291c3 100644
--- a/src/lib/db/seed.ts
+++ b/src/lib/db/seed.ts
@@ -4,6 +4,7 @@ import useful from "@/lib/useful_thai.json";
import db from ".";
import pdb from "./prosodydb";
import * as Sorsyl from "sorsyl";
+import { findLemma } from "../calls/nlp";
const SYMBOL_REGEX = new RegExp(/[\W\d]/);
@@ -516,6 +517,7 @@ async function redump() {
// "tags", "ipa", "audio", "ogg_url", "mp3_url", "enpr", "rhymes", "homophone", "note", "zh-pron", "other",
// "text", "hangeul", "topics", "form", "audio-ipa"
// ]
+ // Language codes to import, compared against each entry's `lang_code`.
+ // Vietnamese is the ISO 639-1 code "vi"; "vn" is the country code.
+ const langs = ["en", "th", "zh", "es", "ja", "vi"];
for await (const line of readWiktionaryDump()) {
try {
count++;
@@ -524,6 +526,7 @@ async function redump() {
console.log(Object.keys(j), j.word);
// add language to db
pdb.addLanguage(j.lang_code, j.lang);
+ if (!langs.includes(j.lang_code)) continue;
// handleEtim(j);
// handleDerived(j);
// handleSenses(j.pos, j.senses);
@@ -551,64 +554,81 @@ type SorSyl = {
tone: string;
};
+/**
+ * Stores one dump entry as a word, but only when the NLP service reports it
+ * as a single segment whose lemma equals the analyzed input. Every entry of
+ * `sounds` that has an `ipa` field is then passed to `handleIpa`.
+ *
+ * @param j - Parsed dump entry; `word`, `lang_code` and the optional `sounds` array are read.
+ */
async function handleWord(j: any) {
- const wordId = pdb.addWord(j.word, j.lang_code);
let ts = Date.now();
+ const analyzed = await findLemma(j.word, j.lang_code);
+ if (analyzed.segments.length !== 1)
+ return console.error("wtf bruh", analyzed);
+ const seg = analyzed.segments[0];
+ const isLemma = analyzed.input === seg.lemma;
+ if (!isLemma)
+ return console.error("not lemma", {
+ ...seg,
+ word: j.word,
+ input: analyzed.input,
+ });
+ const wordId = pdb.addWord(j.word, j.lang_code);
- const hwikiRhyme = j.sounds.find((s) => "rhymes" in s);
+ const sounds = j.sounds || [];
+ const hwikiRhyme = sounds.find((s: any) => "rhymes" in s);
const wikiRhyme = hwikiRhyme ? hwikiRhyme.rhymes : null;
- for (let snd of j.sounds || []) {
- if ("ipa" in snd) {
- const tags = JSON.stringify(snd.tags) || null;
- const ipa = snd.ipa;
- try {
- const hres = await fetch("http://localhost:8104/syls", {
- method: "POST",
- headers: { "content-type": "application/json" },
- body: JSON.stringify({ string: j.word, lang: j.lang_code, ipa }),
- });
- const hjon = await hres.json();
- console.log(Date.now() - ts, "elapsed in http");
- ts = Date.now();
- pdb.addPronunciation(
- "word",
- wordId,
- hjon.clean_ipa,
- hjon.syls.length,
- tags,
- null,
- );
- const wordRhyme = hjon.syls.reduce((acc: string, item: SorSyl) => {
- if (!item.stressed && !acc) return acc;
- if (item.stressed && !acc) return `${acc}${item.rhyme}`;
- else return `${acc}${item.ipa}`;
- }, "");
- if (wordRhyme)
- pdb.addWordRhyme(wordId, wordRhyme, j.lang_code, wikiRhyme);
- else console.log("no rhyme?", hjon);
- for (const syl of hjon.syls) {
- // TODO ideally syllables would have spelling not IPA... harsh tho
- pdb.addSyllable(
- wordId,
- syl.ipa,
- j.lang_code,
- syl.long,
- syl.onset || null,
- syl.medial || null,
- syl.nucleus,
- syl.coda || null,
- syl.rhyme,
- syl.tone || null,
- null,
- );
- }
- console.log(Date.now() - ts, "elapsed in db");
- ts = Date.now();
- } catch (e) {
- console.error(e);
- console.error(j);
- // break;
- }
+ for (let snd of sounds) {
+ // Await each call: handleIpa is async, and without `await` this function
+ // would resolve while its HTTP requests and DB writes were still pending.
+ if ("ipa" in snd) await handleIpa(wordId, j, snd, wikiRhyme);
+ }
+}
+/**
+ * Sends one IPA pronunciation of a word to the local `/syls` service, then
+ * stores the pronunciation, the word-level rhyme and every returned syllable.
+ * Any failure is logged and swallowed, so the caller never sees a rejection.
+ *
+ * @param wordId - Id of the owning word, as returned by `pdb.addWord`.
+ * @param j - Dump entry; `word` and `lang_code` are read.
+ * @param snd - Entry of `j.sounds` that has an `ipa` field; `tags` is read if present.
+ * @param wikiRhyme - Rhyme taken from the entry's `sounds`, or null when none is listed.
+ */
+async function handleIpa(
+ wordId: number | bigint,
+ j: any,
+ snd: any,
+ wikiRhyme: string | null,
+) {
+ const tags = JSON.stringify(snd.tags) || null;
+ const ipa = snd.ipa;
+ try {
+ const hres = await fetch("http://localhost:8104/syls", {
+ method: "POST",
+ headers: { "content-type": "application/json" },
+ body: JSON.stringify({ string: j.word, lang: j.lang_code, ipa }),
+ });
+ // Surface HTTP failures explicitly; the catch below logs them with context.
+ if (!hres.ok) throw new Error(`syls request failed: ${hres.status}`);
+ const hjon = await hres.json();
+ // console.log(Date.now() - ts, "elapsed in http");
+ // ts = Date.now();
+ pdb.addPronunciation(
+ "word",
+ wordId,
+ hjon.clean_ipa,
+ hjon.syls.length,
+ tags,
+ null,
+ );
+ // Word rhyme: the `rhyme` of the first stressed syllable followed by the
+ // full `ipa` of every later syllable; stays "" when nothing is stressed.
+ const wordRhyme = hjon.syls.reduce((acc: string, item: SorSyl) => {
+ if (!item.stressed && !acc) return acc;
+ if (item.stressed && !acc) return `${acc}${item.rhyme}`;
+ else return `${acc}${item.ipa}`;
+ }, "");
+ if (wordRhyme) pdb.addWordRhyme(wordId, wordRhyme, j.lang_code, wikiRhyme);
+ else console.log("no rhyme?", hjon);
+ for (const syl of hjon.syls) {
+ // TODO ideally syllables would have spelling not IPA... harsh tho
+ pdb.addSyllable(
+ wordId,
+ syl.ipa,
+ j.lang_code,
+ syl.long,
+ syl.onset || null,
+ syl.medial || null,
+ syl.nucleus,
+ syl.coda || null,
+ syl.rhyme,
+ syl.tone || null,
+ null,
+ );
}
+ // console.log(Date.now() - ts, "elapsed in db");
+ // ts = Date.now();
+ } catch (e) {
+ console.error(e);
+ // Include the word and language so a failure can be traced to its dump entry.
+ console.error({ word: j.word, lang: j.lang_code, snd });
+ // break;
}
}
async function handleIdiom(j: any) {