summaryrefslogtreecommitdiff
path: root/src/lib
diff options
context:
space:
mode:
authorpolwex <polwex@sortug.com>2025-06-03 15:41:31 +0700
committerpolwex <polwex@sortug.com>2025-06-03 15:41:31 +0700
commit175ddca375cef765cec8ca5bbc527a205c40bf25 (patch)
treef2e47a5d85e4d5e0297613e5a17cebce7d09b09b /src/lib
parent2401217a4019938d1c1cc61b6e33ccb233eb6e74 (diff)
preeeeettty much done FUCK YES
Diffstat (limited to 'src/lib')
-rw-r--r--src/lib/calls/nlp.ts1
-rw-r--r--src/lib/db/prosodydb.ts215
-rw-r--r--src/lib/db/prosodyschema.sql98
-rw-r--r--src/lib/db/thaiseed.ts253
-rw-r--r--src/lib/db/thaiseedold.ts301
-rw-r--r--src/lib/types/phonetics.ts4
-rw-r--r--src/lib/utils.ts5
7 files changed, 732 insertions, 145 deletions
diff --git a/src/lib/calls/nlp.ts b/src/lib/calls/nlp.ts
index f19c976..1e84e93 100644
--- a/src/lib/calls/nlp.ts
+++ b/src/lib/calls/nlp.ts
@@ -1,4 +1,5 @@
import { SyllableRes } from "../types/cards";
+import { randomFromArray } from "../utils";
export type ThaiNLPRes = {
word: string;
diff --git a/src/lib/db/prosodydb.ts b/src/lib/db/prosodydb.ts
index d6da389..7c067d2 100644
--- a/src/lib/db/prosodydb.ts
+++ b/src/lib/db/prosodydb.ts
@@ -1,5 +1,5 @@
import Database from "bun:sqlite";
-import { Phoneme, Tone } from "../types/phonetics";
+import { MutationOrder, Phoneme, Tone } from "../types/phonetics";
import { ProsodyWord, ProsodyWordDB } from "../types/cards";
type Str = string | null;
type ItemType = "word" | "syllable" | "idiom";
@@ -113,6 +113,7 @@ class DatabaseHandler {
w.spelling,
wp.ipa,
w.frequency,
+ GROUP_CONCAT(s.text ORDER BY sw.idx) as syl_seq,
GROUP_CONCAT(t.name ORDER BY sw.idx) as tone_sequence,
COUNT(sw.syl_id) as syllable_count
FROM words w
@@ -127,17 +128,166 @@ class DatabaseHandler {
spelling,
ipa,
frequency,
+ syl_seq,
tone_sequence,
syllable_count
FROM word_tone_sequences
WHERE tone_sequence LIKE ?
AND syllable_count = ?
- ORDER BY frequency DESC NULLS LAST;
+ ORDER BY frequency ASC NULLS LAST;
`,
);
return query.all(toneString.slice(1), tones.length) as any[];
}
+ // fetchWordsByToneAndSyls(tones: Array<string | null>) {
+ // const toneString = tones.reduce((acc: string, item) => {
+ // if (!item) return `${acc},%`;
+ // else return `${acc},${item}`;
+ // }, "");
+ // console.log({ toneString });
+ // const query = this.db.query(
+ // `
+ // WITH word_tone_sequences AS (
+ // SELECT
+ // w.id as word_id,
+ // w.spelling,
+ // wp.ipa,
+ // w.frequency,
+ // GROUP_CONCAT(s.text ORDER BY sw.idx) as syl_seq,
+ // GROUP_CONCAT(t.name ORDER BY sw.idx) as tone_sequence,
+ // COUNT(sw.syl_id) as syllable_count
+ // FROM words w
+ // JOIN word_phonetics wp ON w.id = wp.word_id
+ // JOIN syllables_words sw ON w.id = sw.word_id
+ // JOIN syllables s ON sw.syl_id = s.id
+ // JOIN tones t ON s.tone = t.id
+ // GROUP BY w.id, w.spelling, w.lang, w.frequency
+ // )
+ // SELECT
+ // word_id,
+ // spelling,
+ // ipa,
+ // frequency,
+ // syl_seq,
+ // tone_sequence,
+ // syllable_count
+ // FROM word_tone_sequences
+ // WHERE tone_sequence LIKE ?
+ // AND syllable_count = ?
+ // ORDER BY frequency DESC NULLS LAST;
+ // `,
+ // );
+ // return query.all(toneString.slice(1), tones.length) as any[];
+ // }
+ fetchWordsByToneSylsWords(order: MutationOrder) {
+ console.log({ order });
+ type Acc = { tones: string; syls: string };
+ const strings = order.reduce(
+ (acc: Acc, item, idx) => {
+ const startString = idx === 0 ? "" : ",";
+ if ("change" in item)
+ return {
+ tones: `${acc.tones}${startString}${item.change}`,
+ syls: `${acc.syls}${startString}%`,
+ };
+ else
+ return {
+ tones: `${acc.tones}${startString}%`,
+ syls: `${acc.syls}${startString}${item.keep}`,
+ };
+ },
+ { tones: "", syls: "" },
+ );
+ const query = this.db.query(`
+ SELECT
+ w.id as word_id,
+ w.spelling,
+ w.lang,
+ w.frequency,
+ wp.ipa,
+ wp.syllable_sequence,
+ wp.tone_sequence,
+ wp.ipa_sequence,
+ GROUP_CONCAT(s.text ORDER BY sw.idx) as syllable_pattern,
+ GROUP_CONCAT(t.name ORDER BY sw.idx) as tone_pattern
+ FROM words w
+ JOIN syllables_words sw ON w.id = sw.word_id
+ JOIN syllables s ON sw.syl_id = s.id
+ JOIN tones t ON s.tone = t.id
+ JOIN word_phonetics wp ON wp.word_id= w.id
+ WHERE wp.syllable_sequence LIKE ?1
+ AND tone_sequence LIKE ?2
+ AND syllable_count = ?3
+ GROUP BY w.id, w.spelling, w.lang, w.frequency
+ ORDER BY w.frequency ASC NULLS LAST; `);
+ return query.all(strings.syls, strings.tones, order.length) as any[];
+ }
// inserts
+ superAdd(p: {
+ word: string;
+ lang: string;
+ frequency: number | null;
+ wordNotes: Str;
+ phonetics: Array<{
+ ipa: string;
+ syllable_count: number;
+ syllable_sequence: string;
+ tone_sequence: string;
+ ipa_sequence: string;
+ tags: Str;
+ notes: Str;
+ wordRhyme: Str;
+ syllables: Array<{
+ idx: number;
+ stressed: boolean | null;
+ spelling: string;
+ ipa: string;
+ long: boolean;
+ onset: Phoneme;
+ medial: Phoneme;
+ nucleus: Phoneme;
+ coda: Phoneme;
+ rhyme: Phoneme;
+ tone: Tone;
+ notes: Str;
+ }>;
+ }>;
+ }) {
+ const tx = this.db.transaction(() => {
+ const wordId = this.addWord(p.word, p.lang, p.frequency, p.wordNotes);
+ for (const ph of p.phonetics) {
+ this.addPronunciation(
+ wordId,
+ ph.ipa,
+ ph.syllable_count,
+ ph.syllable_sequence,
+ ph.tone_sequence,
+ ph.ipa_sequence,
+ ph.tags,
+ ph.notes,
+ );
+ for (const syl of ph.syllables) {
+ this.addSyllable(
+ wordId,
+ syl.idx,
+ syl.stressed,
+ p.lang,
+ syl.ipa,
+ syl.long,
+ syl.spelling,
+ syl.onset,
+ syl.medial,
+ syl.nucleus,
+ syl.coda,
+ syl.rhyme,
+ syl.tone,
+ syl.notes,
+ );
+ }
+ }
+ });
+ tx();
+ }
addLanguage(code: string, name: string) {
const query = this.db
@@ -147,15 +297,44 @@ class DatabaseHandler {
addPronunciation(
wordId: number | bigint,
ipa: string,
- syllables: number,
+ syllable_count: number,
+ syllable_sequence: string,
+ tone_sequence: string,
+ ipa_sequence: string,
tags: Str,
notes: Str,
) {
+ console.log({
+ wordId,
+ ipa,
+ syllable_count,
+ syllable_sequence,
+ tone_sequence,
+ ipa_sequence,
+ });
const query = this.db
.query(
- `INSERT OR IGNORE INTO word_phonetics(word_id,ipa, syllables, tag, notes) VALUES(?, ?, ?, ?, ?)`,
+ `INSERT OR IGNORE INTO word_phonetics(
+ word_id,
+ ipa,
+ syllable_count,
+ syllable_sequence,
+ tone_sequence,
+ ipa_sequence,
+ tag,
+ notes)
+ VALUES(?, ?, ?, ?, ?, ?, ?, ?)`,
)
- .run(wordId, ipa, syllables, tags, notes);
+ .run(
+ wordId,
+ ipa,
+ syllable_count,
+ syllable_sequence,
+ tone_sequence,
+ ipa_sequence,
+ tags,
+ notes,
+ );
}
addWordRhyme(wordId: number | bigint, ipa: string, lang: string, notes: Str) {
const query = this.db
@@ -212,12 +391,14 @@ class DatabaseHandler {
notes: Str,
) {
const query = this.db.query(
- `INSERT OR IGNORE INTO words(spelling, lang, frequency, notes) VALUES(?, ?, ?, ?)`,
- // `INSERT INTO words(spelling, lang) VALUES(?, ?)`,
+ `INSERT INTO words(spelling, lang, frequency, notes) VALUES(?, ?, ?, ?)
+ ON CONFLICT(spelling, lang) DO UPDATE SET
+ lang = excluded.lang
+ RETURNING rowid
+ `,
);
- const res = query.run(spelling, lang, frequency, notes);
- const wordId = res.lastInsertRowid;
- return wordId;
+ const res = query.get(spelling, lang, frequency, notes) as { id: number };
+ return res.id;
}
addSyllable(
wordId: number | bigint,
@@ -292,9 +473,15 @@ class DatabaseHandler {
.get(tone.letters, lang, tone.name, tone.numbers) as { id: number };
const query = this.db.query(
- `INSERT INTO syllables(lang, ipa, long, text, onset, medial, nucleus, coda, rhyme, tone, notes) VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
+ `INSERT INTO syllables(
+ lang, ipa, long, text, onset, medial, nucleus, coda, rhyme, tone, notes)
+ VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+ ON CONFLICT(text, ipa, lang) DO UPDATE SET
+ lang = excluded.lang
+ RETURNING rowid
+ `,
);
- const res = query.run(
+ const res = query.get(
lang,
ipa,
long,
@@ -306,8 +493,8 @@ class DatabaseHandler {
rhymeId.id,
toneId.id,
notes,
- );
- const sylId = res.lastInsertRowid;
+ ) as { id: number };
+ const sylId = res.id;
//
const res1 = this.db
.query(
diff --git a/src/lib/db/prosodyschema.sql b/src/lib/db/prosodyschema.sql
index c6a04fa..5554a02 100644
--- a/src/lib/db/prosodyschema.sql
+++ b/src/lib/db/prosodyschema.sql
@@ -150,9 +150,103 @@ CREATE TABLE IF NOT EXISTS word_phonetics(
id INTEGER PRIMARY KEY AUTOINCREMENT,
word_id INTEGER NOT NULL,
ipa TEXT NOT NULL,
- syllables INTEGER NOT NULL,
+ syllable_count INTEGER NOT NULL,
+ syllable_sequence TEXT NOT NULL, -- "家,鄉"
+ tone_sequence TEXT NOT NULL, -- "rising,rising"
+ ipa_sequence TEXT NOT NULL, -- IPA representation
tag TEXT,
notes TEXT,
- CONSTRAINT ipa_unique UNIQUE (ipa, word_id)
+ FOREIGN KEY (word_id) REFERENCES words(id)
);
CREATE INDEX IF NOT EXISTS idx_words_ipa ON word_phonetics(ipa, word_id);
+
+-- -- Query 2: Even simpler with pattern table
+-- -- Pattern [{ change: "rising" }, { change: "falling" }] - any 2-syllable word with rising,falling tones
+-- SELECT
+-- w.spelling,
+-- w.frequency,
+-- wp.syllable_sequence,
+-- wp.tone_sequence
+-- FROM words w
+-- JOIN word_patterns wp ON w.id = wp.word_id
+-- WHERE wp.syllable_count = 2
+-- AND wp.tone_sequence = 'rising,falling'
+-- ORDER BY w.frequency DESC NULLS LAST;
+
+-- -- Query 3: Mixed pattern [{ keep: "家" }, { change: "falling" }, { keep: "人" }]
+-- SELECT DISTINCT
+-- w.spelling,
+-- w.frequency,
+-- wp.syllable_sequence,
+-- wp.tone_sequence
+-- FROM words w
+-- JOIN word_patterns wp ON w.id = wp.word_id
+-- WHERE wp.syllable_count = 3
+-- AND wp.syllable_sequence LIKE '家,%,人' -- Simple pattern matching
+-- AND EXISTS (
+-- SELECT 1 FROM word_syllable_positions wsp
+-- WHERE wsp.word_id = w.id
+-- AND wsp.position = 1
+-- AND wsp.tone_name = 'falling'
+-- )
+-- ORDER BY w.frequency DESC NULLS LAST;
+
+-- -- Query 4: Super fast rhyme finding
+-- -- Find all words that end with same syllable as "家鄉" (end with "鄉")
+-- SELECT
+-- w.spelling,
+-- w.frequency,
+-- wp.syllable_sequence
+-- FROM words w
+-- JOIN word_patterns wp ON w.id = wp.word_id
+-- WHERE wp.syllable_sequence LIKE '%,鄉' -- Ends with 鄉
+-- AND wp.syllable_count >= 2
+-- ORDER BY w.frequency DESC NULLS LAST;
+
+
+
+
+-- SELECT
+-- w.id as word_id,
+-- w.spelling,
+-- w.lang,
+-- w.frequency
+-- FROM words w
+-- JOIN word_phonetics wp ON wp.word_id= w.id
+-- WHERE wp.syllable_sequence LIKE '%,ใจ'
+-- AND wp.tone_sequence LIKE 'rising,%'
+-- AND wp.syllable_count = 2
+-- GROUP BY w.id, w.spelling, w.lang, w.frequency
+-- ORDER BY w.frequency DESC NULLS LAST;
+--
+-- Indexes for fast pattern matching
+CREATE INDEX IF NOT EXISTS idx_word_patterns_syllables ON word_phonetics(syllable_sequence);
+CREATE INDEX IF NOT EXISTS idx_word_patterns_tones ON word_phonetics(tone_sequence);
+CREATE INDEX IF NOT EXISTS idx_word_patterns_count ON word_phonetics(syllable_count);
+CREATE INDEX IF NOT EXISTS idx_word_patterns_mixed ON word_phonetics(syllable_count, syllable_sequence, tone_sequence);
+
+
+CREATE INDEX IF NOT EXISTS idx_syllables_words_word_idx ON syllables_words(word_id, idx);
+CREATE INDEX IF NOT EXISTS idx_syllables_words_idx_word ON syllables_words(idx, word_id);
+CREATE INDEX IF NOT EXISTS idx_syllables_words_syl ON syllables_words(syl_id);
+
+-- 2. Syllables table indexes for text and language lookups
+CREATE INDEX IF NOT EXISTS idx_syllables_text_lang ON syllables(text, lang);
+CREATE INDEX IF NOT EXISTS idx_syllables_lang_text ON syllables(lang, text);
+CREATE INDEX IF NOT EXISTS idx_syllables_tone ON syllables(tone);
+CREATE INDEX IF NOT EXISTS idx_syllables_text_tone ON syllables(text, tone);
+
+-- 3. Tones table indexes
+CREATE INDEX IF NOT EXISTS idx_tones_name_lang ON tones(name, lang);
+CREATE INDEX IF NOT EXISTS idx_tones_nums_lang ON tones(nums, lang);
+CREATE INDEX IF NOT EXISTS idx_tones_lang_name ON tones(lang, name);
+
+-- 4. Words table indexes
+CREATE INDEX IF NOT EXISTS idx_words_lang_freq ON words(lang, frequency DESC);
+CREATE INDEX IF NOT EXISTS idx_words_id_lang ON words(id, lang);
+
+-- 5. Composite indexes for common query patterns
+CREATE INDEX IF NOT EXISTS idx_syllables_compound ON syllables(lang, text, tone);
+CREATE INDEX IF NOT EXISTS idx_syllables_words_compound ON syllables_words(word_id, idx, syl_id);
+
+
diff --git a/src/lib/db/thaiseed.ts b/src/lib/db/thaiseed.ts
index 6c69d9c..32434da 100644
--- a/src/lib/db/thaiseed.ts
+++ b/src/lib/db/thaiseed.ts
@@ -11,7 +11,7 @@ import {
import pdb from "./prosodydb";
import { cleanIpa } from "../utils";
import { handleFile } from "./utils";
-import { Tone } from "../types/phonetics";
+import { Phoneme, Tone } from "../types/phonetics";
import { AsyncRes } from "../types";
async function readDump(lang: string) {
@@ -25,7 +25,7 @@ async function readDump(lang: string) {
// langrows = langrows.slice(10);
for (const langrow of langrows) {
count++;
- // console.log(count);
+ console.log(count);
// if (count <= 10000) continue;
// if (count > 100) break;
const j = JSON.parse(langrow.data);
@@ -68,65 +68,101 @@ async function readDump(lang: string) {
async function handleWord(word: string, j: any): AsyncRes<string> {
// TODO add categories but add a tag to see what classifying scheme we're using
//
- const sounds = j.sounds || [];
- const hasIpa = sounds.find((s: any) => "ipa" in s);
- if (!hasIpa) return { error: "meh no ipa" };
- const freq = await getThaiFreq(word);
- const wordId = pdb.addWord(word, "th", freq, null);
- if (wordId == 478 || word === "และ") {
- console.log("wtf man");
- console.dir(j, { depth: null });
- return { error: "i said wtf" };
- }
+ const frequency = await getThaiFreq(word);
const analyzed = await analyzeTHWord(word);
- for (let snd of sounds)
- if ("ipa" in snd) {
- const res = await handleIpa(wordId, j, snd, analyzed);
- if ("error" in res) return res;
- }
+ const phonetics = await Promise.all(getIpa(j, analyzed));
+
+ pdb.superAdd({ word, lang: "th", frequency, wordNotes: null, phonetics });
return { ok: "" };
}
-async function handleIpa(
- wordId: number | bigint,
- j: any,
- snd: any,
- analyzed: ThaiNLPRes,
-): AsyncRes<string> {
+function getIpa(j: any, analyzed: ThaiNLPRes) {
+ const sounds = j.sounds || [];
+ const hasIpa = sounds.find((s: any) => "ipa" in s);
+ if (!hasIpa) return [];
+ const ipaData: Promise<IPAData>[] = sounds.reduce(
+ async (acc: Promise<IPAData>[], snd: any) => {
+ if ("ipa" in snd) {
+ const data = getIpaData(snd, analyzed);
+ return [...acc, data];
+ } else return acc;
+ },
+ [],
+ );
+ return ipaData;
+}
+type IPAData = {
+ ipa: string;
+ syllable_count: number;
+ syllable_sequence: string;
+ tone_sequence: string;
+ ipa_sequence: string;
+ tags: string | null;
+ notes: string | null;
+ wordRhyme: string | null;
+ syllables: SylData[];
+};
+async function getIpaData(snd: any, analyzed: ThaiNLPRes): Promise<IPAData> {
const tags = JSON.stringify(snd.tags) || null;
// console.log("handleipa", analyzed.syllables.length);
// console.log(analyzed);
const wikiIpa = cleanIpa(snd.ipa);
const nlpIpa = cleanIpa(analyzed.ipa);
const ipa = wikiIpa || nlpIpa;
- if (j.word === "และ") {
- console.log("wtf!!");
- return { error: "wtf is this" };
- }
+ // if (j.word === "และ") {
+ // console.log("wtf!!");
+ // return { error: "wtf is this" };
+ // }
const wikiIpaSplit = wikiIpa.split(".");
const nlpIpaSplit = nlpIpa.split(".");
if (wikiIpaSplit.length !== nlpIpaSplit.length) {
- // console.log("ipa mismatch");
- // console.log(wikiIpa);
- // console.log(nlpIpa);
+ console.log("ipa mismatch");
+ console.log(wikiIpa);
+ console.log(nlpIpa);
}
if (analyzed.realSyls.length !== wikiIpaSplit.length) {
- // console.log("syllable analysis mismatch", j.word);
- // console.log({ syls: analyzed.syllables, ipa: wikiIpaSplit });
- // console.dir(j, { depth: null });
- return { error: "meh syllable analysis mismatch" };
+ console.log("syllable analysis mismatch", analyzed.word);
+ console.log({ syls: analyzed.syllables, ipa: wikiIpaSplit });
+ throw new Error("syllable mismatch");
}
const writtenSyls = analyzed.syllables;
- const pronouncedSyls = analyzed.realSyls;
+ const pronouncedSyls = analyzed.realSyls.map((s) =>
+ s.replace(/\u{E3A}/u, ""),
+ );
+
+ const tone_sequence = wikiIpaSplit
+ .map((s) => parseTone(s, analyzed.word))
+ .map((t) => t.name)
+ .join(",");
+ const syllable_sequence = pronouncedSyls.join(",");
+ const ipa_sequence = wikiIpaSplit.join(",");
+ const syllables = await Promise.all(
+ getSyllables(writtenSyls, pronouncedSyls, wikiIpaSplit),
+ );
+ return {
+ ipa,
+ syllable_count: pronouncedSyls.length,
+ syllable_sequence,
+ tone_sequence,
+ ipa_sequence,
+ tags,
+ notes: null,
+ wordRhyme: null,
+ syllables,
+ };
+}
+function getSyllables(
+ writtenSyls: string[],
+ pronouncedSyls: string[],
+ ipaSyls: string[],
+) {
let badSyls = false;
if (writtenSyls.length !== pronouncedSyls.length) badSyls = true;
-
- pdb.addPronunciation(wordId, ipa, pronouncedSyls.length, tags, null);
-
+ let syls: Promise<SylData>[] = [];
for (let i = 0; i < pronouncedSyls.length; i++) {
- const pronounced = pronouncedSyls[i]!.replace(/\u{E3A}/u, "");
+ const pronounced = pronouncedSyls[i]!;
const written = writtenSyls[i] || "";
const syllable = badSyls ? pronounced : written;
- const ipa = wikiIpaSplit[i]!;
+ const ipa = ipaSyls[i]!;
// TODO insert both??
const notes = pronounced === written ? null : `Pronounced ${pronounced}`;
if (pronounced !== syllable) {
@@ -134,10 +170,10 @@ async function handleIpa(
console.log(pronounced);
console.log(written);
}
- const res = await handleSyllable(syllable, ipa, wordId, i, notes);
- if ("error" in res) return res;
+ const res = getSyllable(syllable, ipa, i, notes);
+ syls.push(res);
}
- return { ok: "" };
+ return syls;
}
const thaiTones: Record<string, string> = {
"˧": "mid",
@@ -153,8 +189,22 @@ const thaiToneNums: Record<string, number> = {
"˦˥": 45,
"˩˩˦": 214,
};
+const toneRegex = new RegExp(Object.keys(thaiToneNums).join("|"));
+
function parseTone(ipa: string, spelling: string): Tone {
try {
+ const match = ipa.match(toneRegex)!;
+ const m = match[0]!;
+ const name = thaiTones[m]!;
+ const numbers = thaiToneNums[m]!;
+ return { letters: ipa, name, numbers };
+ } catch (e) {
+ console.error("meh wrong tones!!", { s: spelling, ipa });
+ throw new Error("");
+ }
+}
+function parseToneS(ipa: string, spelling: string): Tone {
+ try {
const name = thaiTones[ipa]!;
const numbers = thaiToneNums[ipa]!;
return { letters: ipa, name, numbers };
@@ -164,71 +214,44 @@ function parseTone(ipa: string, spelling: string): Tone {
}
}
-async function handleSyllable(
+type SylData = {
+ idx: number;
+ stressed: boolean | null;
+ spelling: string;
+ ipa: string;
+ long: boolean;
+ onset: Phoneme;
+ medial: Phoneme;
+ nucleus: Phoneme;
+ coda: Phoneme;
+ rhyme: Phoneme;
+ tone: Tone;
+ notes: string | null;
+};
+async function getSyllable(
spelling: string,
ipa: string,
- wordId: number | bigint,
idx: number,
notes: string | null,
-): AsyncRes<string> {
+): Promise<SylData> {
const sorsyl = await sorSyl(spelling, "th", ipa);
- const weird = [
- // "a̯n",
- // "a̯",
- // "a̯p",
- // "a̯w",
- // "a̯j",
- // "a̯ŋ",
- // "a̯k",
- // "a̯t",
- // "a̯m",
- // "a̯ʔ",
- // "ʔ",
- "s",
- "l",
- "f",
- "a̯s",
- "js",
- "t͡ɕʰ",
- "ks",
- "ns",
- "a̯l",
- "a̯f",
- "mk",
- ];
- // const weirder = sorsyl.syls.find((s) => weird.includes(s.coda));
- // if (weirder) {
- // console.log("syllable", spelling);
- // // console.dir(sorsyl, { depth: null });
- // // console.dir(j, { depth: null });
- // }
if (sorsyl.syls.length !== 1) throw new Error("wtf sorsyl!");
const syl = sorsyl.syls[0]!.ipa;
- const tone = parseTone(syl.tone, spelling);
- // TODO add actual ortographic data here not just ipa
- try {
- pdb.addSyllable(
- wordId,
- idx + 1,
- null,
- "th",
- syl.all,
- syl.long,
- spelling,
- { spelling: syl.onset, ipa: syl.onset },
- { spelling: syl.medial, ipa: syl.medial },
- { spelling: syl.nucleus, ipa: syl.nucleus },
- { spelling: syl.coda, ipa: syl.coda },
- { spelling: syl.rhyme, ipa: syl.rhyme },
- tone,
- notes,
- );
- return { ok: "" };
- } catch (e) {
- // console.log("well fuck", syl);
- // console.error(e);
- return { error: `meh ${e}` };
- }
+ const tone = parseToneS(syl.tone, spelling);
+ return {
+ idx: idx + 1,
+ stressed: null,
+ spelling,
+ ipa: syl.all,
+ long: syl.long,
+ onset: { spelling: syl.onset, ipa: syl.onset },
+ medial: { spelling: syl.medial, ipa: syl.medial },
+ nucleus: { spelling: syl.nucleus, ipa: syl.nucleus },
+ coda: { spelling: syl.coda, ipa: syl.coda },
+ rhyme: { spelling: syl.rhyme, ipa: syl.rhyme },
+ tone,
+ notes,
+ };
}
async function handleIdiom(idiom: string): AsyncRes<string> {
pdb.addIdiom(idiom, "th");
@@ -236,33 +259,5 @@ async function handleIdiom(idiom: string): AsyncRes<string> {
// console.log();
return { ok: "" };
}
-// ช้า ๆ
-// งก ๆ
-// หงก ๆ
-
-async function getFrequency() {
- const files = [
- "/home/y/code/prosody/resources/langdata/thai/data/1yin_freq.csv",
- "/home/y/code/prosody/resources/langdata/thai/data/2yin_freq.csv",
- "/home/y/code/prosody/resources/langdata/thai/data/3yin_freq.csv",
- "/home/y/code/prosody/resources/langdata/thai/data/4yin_freq.csv",
- "/home/y/code/prosody/resources/langdata/thai/data/5yin_freq.csv",
- "/home/y/code/prosody/resources/langdata/thai/data/6yin_freq.csv",
- ];
- const freqMap = new Map<number, string>();
- for (const file of files) {
- await handleFile(file, (line, idx) => {
- const [spelling, IPA, tone, length, frequency, ...rest] = line.split(",");
- freqMap.set(Number(frequency!), spelling!);
- });
- }
- const orderedMap = new Map<string, number>();
- const keys = Array.from(freqMap.keys()).sort();
- for (let i = 0; i < keys.length; i++) {
- const val = freqMap.get(keys[i]!)!;
- orderedMap.set(val, i + 1);
- }
- return orderedMap;
-}
readDump("th");
diff --git a/src/lib/db/thaiseedold.ts b/src/lib/db/thaiseedold.ts
new file mode 100644
index 0000000..b9522dd
--- /dev/null
+++ b/src/lib/db/thaiseedold.ts
@@ -0,0 +1,301 @@
+import Database from "bun:sqlite";
+import {
+ analyzeTHWord,
+ deconstructSyllable,
+ segmentateThai,
+ type SorSyl,
+ type ThaiNLPRes,
+ sorSyl,
+ getThaiFreq,
+} from "../calls/nlp";
+import pdb from "./prosodydb";
+import { cleanIpa } from "../utils";
+import { handleFile } from "./utils";
+import { Tone } from "../types/phonetics";
+import { AsyncRes } from "../types";
+
+async function readDump(lang: string) {
+ await pdb.init();
+ pdb.addLanguage("th", "thai");
+ let count = 0;
+ const langdb = new Database(
+ `/home/y/code/prosody/resources/wiktionary/${lang}.db`,
+ );
+ let langrows: any = langdb.query("SELECT data FROM langs");
+ // langrows = langrows.slice(10);
+ for (const langrow of langrows) {
+ count++;
+ console.log(count);
+ // if (count <= 10000) continue;
+ // if (count > 100) break;
+ const j = JSON.parse(langrow.data);
+ const word = j.word.trim();
+ if (!word) continue;
+
+ if (word.includes("ๆ")) {
+ const res = await handleWord(word, j);
+ if ("error" in res) {
+ if (res.error.includes("meh")) continue;
+ if (res.error.includes("wtf")) {
+ console.error(res.error);
+ console.error(j.sounds);
+ }
+ break;
+ }
+ } else {
+ const split = word.split(" ");
+ if (split.length > 1) {
+ const res = await handleIdiom(word);
+ if ("error" in res) {
+ console.error(res.error);
+ break;
+ }
+ } else {
+ const res = await handleWord(word, j);
+ if ("error" in res) {
+ if (res.error.includes("meh")) continue;
+ if (res.error.includes("wtf")) {
+ console.error(res.error);
+ console.error(j.sounds);
+ }
+ // break;
+ }
+ }
+ }
+ }
+}
+
+// if (wordId == 478 || word === "และ") {
+// // console.log("wtf man");
+// // console.dir(j, { depth: null });
+// // return { error: "i said wtf" };
+// }
+async function handleWord(word: string, j: any): AsyncRes<string> {
+ // TODO add categories but add a tag to see what classifying scheme we're using
+ //
+ const sounds = j.sounds || [];
+ const hasIpa = sounds.find((s: any) => "ipa" in s);
+ if (!hasIpa) return { error: "meh no ipa" };
+ const freq = await getThaiFreq(word);
+ const wordId = pdb.addWord(word, "th", freq, null);
+ const analyzed = await analyzeTHWord(word);
+ for (let snd of sounds)
+ if ("ipa" in snd) {
+ const res = await handleIpa(wordId, j, snd, analyzed);
+ if ("error" in res) return res;
+ }
+ return { ok: "" };
+}
+async function handleIpa(
+ wordId: number | bigint,
+ j: any,
+ snd: any,
+ analyzed: ThaiNLPRes,
+): AsyncRes<string> {
+ console.log();
+ const tags = JSON.stringify(snd.tags) || null;
+ // console.log("handleipa", analyzed.syllables.length);
+ // console.log(analyzed);
+ const wikiIpa = cleanIpa(snd.ipa);
+ const nlpIpa = cleanIpa(analyzed.ipa);
+ const ipa = wikiIpa || nlpIpa;
+ // if (j.word === "และ") {
+ // console.log("wtf!!");
+ // return { error: "wtf is this" };
+ // }
+ const wikiIpaSplit = wikiIpa.split(".");
+ const nlpIpaSplit = nlpIpa.split(".");
+ if (wikiIpaSplit.length !== nlpIpaSplit.length) {
+ // console.log("ipa mismatch");
+ // console.log(wikiIpa);
+ // console.log(nlpIpa);
+ }
+ if (analyzed.realSyls.length !== wikiIpaSplit.length) {
+ // console.log("syllable analysis mismatch", j.word);
+ // console.log({ syls: analyzed.syllables, ipa: wikiIpaSplit });
+ // console.dir(j, { depth: null });
+ return { error: "meh syllable analysis mismatch" };
+ }
+ const writtenSyls = analyzed.syllables;
+ const pronouncedSyls = analyzed.realSyls.map((s) =>
+ s.replace(/\u{E3A}/u, ""),
+ );
+ let badSyls = false;
+ if (writtenSyls.length !== pronouncedSyls.length) badSyls = true;
+
+ const tone_sequence = wikiIpaSplit
+ .map((s) => parseTone(s, j.word))
+ .map((t) => t.name)
+ .join(",");
+ const syl_sequence = pronouncedSyls.join(",");
+ const ipa_sequence = wikiIpaSplit.join(",");
+ pdb.addPronunciation(
+ wordId,
+ ipa,
+ pronouncedSyls.length,
+ syl_sequence,
+ tone_sequence,
+ ipa_sequence,
+ tags,
+ null,
+ );
+
+ for (let i = 0; i < pronouncedSyls.length; i++) {
+ const pronounced = pronouncedSyls[i]!;
+ const written = writtenSyls[i] || "";
+ const syllable = badSyls ? pronounced : written;
+ const ipa = wikiIpaSplit[i]!;
+ // TODO insert both??
+ const notes = pronounced === written ? null : `Pronounced ${pronounced}`;
+ if (pronounced !== syllable) {
+ console.log("diff");
+ console.log(pronounced);
+ console.log(written);
+ }
+ const res = await handleSyllable(syllable, ipa, wordId, i, notes);
+ if ("error" in res) return res;
+ }
+ return { ok: "" };
+}
+const thaiTones: Record<string, string> = {
+ "˧": "mid",
+ "˨˩": "low",
+ "˥˩": "falling",
+ "˦˥": "high",
+ "˩˩˦": "rising",
+};
+const thaiToneNums: Record<string, number> = {
+ "˧": 33,
+ "˨˩": 21,
+ "˥˩": 41,
+ "˦˥": 45,
+ "˩˩˦": 214,
+};
+const toneRegex = new RegExp(Object.keys(thaiToneNums).join("|"));
+
+function parseTone(ipa: string, spelling: string): Tone {
+ try {
+ const match = ipa.match(toneRegex)!;
+ const m = match[0]!;
+ const name = thaiTones[m]!;
+ const numbers = thaiToneNums[m]!;
+ return { letters: ipa, name, numbers };
+ } catch (e) {
+ console.error("meh wrong tones!!", { s: spelling, ipa });
+ throw new Error("");
+ }
+}
+function parseToneS(ipa: string, spelling: string): Tone {
+ try {
+ const name = thaiTones[ipa]!;
+ const numbers = thaiToneNums[ipa]!;
+ return { letters: ipa, name, numbers };
+ } catch (e) {
+ console.error("meh wrong tones!!", { s: spelling, ipa });
+ throw new Error("");
+ }
+}
+
+async function handleSyllable(
+ spelling: string,
+ ipa: string,
+ wordId: number | bigint,
+ idx: number,
+ notes: string | null,
+): AsyncRes<string> {
+ const sorsyl = await sorSyl(spelling, "th", ipa);
+ // console.log("ssyl", sorsyl.syls);
+ const weird = [
+ // "a̯n",
+ // "a̯",
+ // "a̯p",
+ // "a̯w",
+ // "a̯j",
+ // "a̯ŋ",
+ // "a̯k",
+ // "a̯t",
+ // "a̯m",
+ // "a̯ʔ",
+ // "ʔ",
+ "s",
+ "l",
+ "f",
+ "a̯s",
+ "js",
+ "t͡ɕʰ",
+ "ks",
+ "ns",
+ "a̯l",
+ "a̯f",
+ "mk",
+ ];
+ // const weirder = sorsyl.syls.find((s) => weird.includes(s.coda));
+ // if (weirder) {
+ // console.log("syllable", spelling);
+ // // console.dir(sorsyl, { depth: null });
+ // // console.dir(j, { depth: null });
+ // }
+ if (sorsyl.syls.length !== 1) throw new Error("wtf sorsyl!");
+ const syl = sorsyl.syls[0]!.ipa;
+ const tone = parseToneS(syl.tone, spelling);
+ // TODO add actual ortographic data here not just ipa
+ try {
+ pdb.addSyllable(
+ wordId,
+ idx + 1,
+ null,
+ "th",
+ syl.all,
+ syl.long,
+ spelling,
+ { spelling: syl.onset, ipa: syl.onset },
+ { spelling: syl.medial, ipa: syl.medial },
+ { spelling: syl.nucleus, ipa: syl.nucleus },
+ { spelling: syl.coda, ipa: syl.coda },
+ { spelling: syl.rhyme, ipa: syl.rhyme },
+ tone,
+ notes,
+ );
+ return { ok: "" };
+ } catch (e) {
+ // console.log("well fuck", syl);
+ // console.error(e);
+ return { error: `meh ${e}` };
+ }
+}
+async function handleIdiom(idiom: string): AsyncRes<string> {
+ pdb.addIdiom(idiom, "th");
+ // TODO later set idiom_words once all words are populated
+ // console.log();
+ return { ok: "" };
+}
+// ช้า ๆ
+// งก ๆ
+// หงก ๆ
+
+async function getFrequency() {
+ const files = [
+ "/home/y/code/prosody/resources/langdata/thai/data/1yin_freq.csv",
+ "/home/y/code/prosody/resources/langdata/thai/data/2yin_freq.csv",
+ "/home/y/code/prosody/resources/langdata/thai/data/3yin_freq.csv",
+ "/home/y/code/prosody/resources/langdata/thai/data/4yin_freq.csv",
+ "/home/y/code/prosody/resources/langdata/thai/data/5yin_freq.csv",
+ "/home/y/code/prosody/resources/langdata/thai/data/6yin_freq.csv",
+ ];
+ const freqMap = new Map<number, string>();
+ for (const file of files) {
+ await handleFile(file, (line, idx) => {
+ const [spelling, IPA, tone, length, frequency, ...rest] = line.split(",");
+ freqMap.set(Number(frequency!), spelling!);
+ });
+ }
+ const orderedMap = new Map<string, number>();
+ const keys = Array.from(freqMap.keys()).sort();
+ for (let i = 0; i < keys.length; i++) {
+ const val = freqMap.get(keys[i]!)!;
+ orderedMap.set(val, i + 1);
+ }
+ return orderedMap;
+}
+
+readDump("th");
diff --git a/src/lib/types/phonetics.ts b/src/lib/types/phonetics.ts
index 0009e78..f7289c7 100644
--- a/src/lib/types/phonetics.ts
+++ b/src/lib/types/phonetics.ts
@@ -20,3 +20,7 @@ export type Syllable = {
rhyme: Phoneme;
tone: Tone;
};
+
+export type ToneQuery = Array<string | null>;
+export type MutationType = { change: string } | { keep: string };
+export type MutationOrder = MutationType[];
diff --git a/src/lib/utils.ts b/src/lib/utils.ts
index 0674dea..0f0c084 100644
--- a/src/lib/utils.ts
+++ b/src/lib/utils.ts
@@ -63,3 +63,8 @@ export function cleanIpa(ipa: string): string {
const r2 = /[\[\]\/]/g;
return ipa.replace(r1, "").replace(r2, "");
}
+
+export function randomFromArray<T>(arr: T[]): T {
+ const idx = Math.floor(Math.random() * arr.length);
+ return arr[idx]!;
+}