summaryrefslogtreecommitdiff
path: root/src/lib
diff options
context:
space:
mode:
authorpolwex <polwex@sortug.com>2025-05-29 12:10:22 +0700
committerpolwex <polwex@sortug.com>2025-05-29 12:10:22 +0700
commita3f24ea79b14394b24c4b60a010651eb29eeb872 (patch)
treecb1c4937084116f66a59727ee752afd974714c8e /src/lib
parent7abf2227438362ad30820ee236405ec1b57a40b6 (diff)
glorious new db
Diffstat (limited to 'src/lib')
-rw-r--r--src/lib/calls/nlp.ts54
-rw-r--r--src/lib/db/codes.js203
-rw-r--r--src/lib/db/index.ts8
-rw-r--r--src/lib/db/prosodydb.ts238
-rw-r--r--src/lib/db/prosodyschema.sql178
-rw-r--r--src/lib/db/seed.ts212
-rw-r--r--src/lib/db/senseschema.sql54
-rw-r--r--src/lib/lang/utils.ts28
-rw-r--r--src/lib/types/cards.ts13
9 files changed, 986 insertions, 2 deletions
diff --git a/src/lib/calls/nlp.ts b/src/lib/calls/nlp.ts
new file mode 100644
index 0000000..28562d0
--- /dev/null
+++ b/src/lib/calls/nlp.ts
@@ -0,0 +1,54 @@
+import { SyllableRes } from "../types/cards";
+
+type AnalyzeRes = {
+ word: string;
+ syllables: string[];
+ ipa: string;
+ pos: string;
+};
+
+export async function thaiData(word: string): Promise<AnalyzeRes[]> {
+ const [head, tail] = await Promise.all([
+ analyzeTHWord(word),
+ segmentateThai(word),
+ ]);
+ return [head, ...tail];
+}
+
+export async function analyzeTHWord(word: string): Promise<AnalyzeRes> {
+ const opts = {
+ method: "POST",
+ headers: { "Content-type": "application/json" },
+ body: JSON.stringify({ word }),
+ };
+ const r1 = await fetch("http://localhost:8001" + "/analyze", opts);
+ // const r2 = await fetch(`http://192.168.1.110:8000/analyze`, opts);
+ const jj = await r1.json();
+ return jj;
+}
+export async function segmentateThai(sentence: string): Promise<AnalyzeRes[]> {
+ const opts = {
+ method: "POST",
+ headers: { "Content-type": "application/json" },
+ body: JSON.stringify({ word: sentence }),
+ };
+ // const r1 = await fetch(`http://localhost:8000/segmentate`, opts);
+ const r2 = await fetch("http://localhost:8001" + `/segmentate`, opts);
+ const jj = await r2.json();
+ return jj;
+}
+
+export async function deconstructSyllable(ipa: string): Promise<SyllableRes> {
+ const opts = {
+ method: "POST",
+ headers: {
+ "Content-type": "application/json",
+ "X-API-KEY": Bun.env.SORTUG_NLP_API_KEY!,
+ },
+ body: JSON.stringify({ string: ipa }),
+ };
+ // const r1 = await fetch(`http://localhost:8000/segmentate`, opts);
+ const r2 = await fetch("http://localhost:8102" + `/lingpy`, opts);
+ const jj = await r2.json();
+ return jj;
+}
diff --git a/src/lib/db/codes.js b/src/lib/db/codes.js
new file mode 100644
index 0000000..bef3e1b
--- /dev/null
+++ b/src/lib/db/codes.js
@@ -0,0 +1,203 @@
+
+
// Accumulator for every code list fetched below; each entry is {code, name}.
const ALL_LANGUAGE_CODES = {
  iso639_1: [],
  iso639_2_T: [], // Terminology codes
  iso639_2_B: [], // Bibliographic codes
  iso639_3: [],
  iso639_5: [],
  bcp47_language_subtags: [], // Primary language subtags from IANA
};

// Source URLs, one per standard.
const LOC_ISO639_2_URL = 'https://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt';
// For ISO 639-3, SIL provides dated files. This is the structure of the comprehensive file.
// The actual filename changes with each update (e.g., iso-639-3_20240123.tab).
// You might need to go to https://iso639-3.sil.org/code_tables/download_tables and get the current link
// for the "Complete Code Tables" zip, then extract the main .tab file.
// For this script, I'll use a link to one specific (potentially older) version for demonstration.
// A more robust solution would involve downloading and unzipping the latest.
// This link points to the main table that includes mappings.
const SIL_ISO639_3_URL = 'https://iso639-3.sil.org/sites/iso639-3/files/downloads/iso-639-3_20240701.tab'; // Example: replace with current
const LOC_ISO639_5_URL = 'https://www.loc.gov/standards/iso639-5/iso639-5.tsv'; // TSV format
const IANA_LANGUAGE_SUBTAG_REGISTRY_URL = 'https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry';
+
/**
 * Download the Library of Congress ISO 639-2 list and populate the
 * iso639_1 / iso639_2_B / iso639_2_T arrays of ALL_LANGUAGE_CODES.
 * Line format: alpha3-b|alpha3-t|alpha2|english_name|french_name
 */
async function fetchAndParseISO639_1_2() {
  try {
    console.log('Fetching ISO 639-1 & 639-2 codes from LOC...');
    const response = await fetch(LOC_ISO639_2_URL);
    if (!response.ok) throw new Error(`Failed to fetch ISO 639-1/2: ${response.statusText}`);
    const body = await response.text();

    for (const row of body.trim().split('\n')) {
      const fields = row.split('|');
      if (fields.length < 4) continue;
      const alpha3_b = fields[0].trim(); // bibliographic code
      const alpha3_t = fields[1].trim(); // terminology code
      const alpha2 = fields[2].trim();   // two-letter code
      const englishName = fields[3].trim();

      if (alpha3_b) {
        ALL_LANGUAGE_CODES.iso639_2_B.push({ code: alpha3_b, name: englishName });
      }
      if (alpha3_t) {
        ALL_LANGUAGE_CODES.iso639_2_T.push({ code: alpha3_t, name: englishName });
      }
      if (alpha2) {
        ALL_LANGUAGE_CODES.iso639_1.push({ code: alpha2, name: englishName });
      }
    }
    console.log(`Fetched ${ALL_LANGUAGE_CODES.iso639_1.length} ISO 639-1 codes.`);
    console.log(`Fetched ${ALL_LANGUAGE_CODES.iso639_2_B.length} ISO 639-2/B codes.`);
    console.log(`Fetched ${ALL_LANGUAGE_CODES.iso639_2_T.length} ISO 639-2/T codes.`);
  } catch (error) {
    console.error('Error fetching ISO 639-1/2 codes:', error.message);
  }
}
+
/**
 * Download the SIL ISO 639-3 table (tab-separated) into
 * ALL_LANGUAGE_CODES.iso639_3. Also augments iso639_1 with Part1 codes
 * the LOC list may lack, then dedupes and sorts iso639_1 by code.
 */
async function fetchAndParseISO639_3() {
  try {
    console.log('Fetching ISO 639-3 codes from SIL...');
    const response = await fetch(SIL_ISO639_3_URL);
    if (!response.ok) throw new Error(`Failed to fetch ISO 639-3: ${response.statusText}`);
    const text = await response.text();

    const rows = text.trim().split('\n');
    // Header columns: Id | Part2B | Part2T | Part1 | Scope | Language_Type | Ref_Name | Comment
    const columns = rows.shift().split('\t');
    const idIndex = columns.indexOf('Id');
    const refNameIndex = columns.indexOf('Ref_Name');
    const part1Index = columns.indexOf('Part1'); // cross-reference to ISO 639-1

    if (idIndex === -1 || refNameIndex === -1) {
      throw new Error('ISO 639-3 header format mismatch. Expected "Id" and "Ref_Name" columns.');
    }

    for (const row of rows) {
      const fields = row.split('\t');
      const code = fields[idIndex]?.trim();
      const name = fields[refNameIndex]?.trim();
      const part1Code = fields[part1Index]?.trim();
      if (!code || !name) continue;

      ALL_LANGUAGE_CODES.iso639_3.push({ code, name });
      // SIL's table can carry Part1 codes the LOC file omits.
      if (part1Code && !ALL_LANGUAGE_CODES.iso639_1.some((c) => c.code === part1Code)) {
        ALL_LANGUAGE_CODES.iso639_1.push({ code: part1Code, name });
      }
    }
    console.log(`Fetched ${ALL_LANGUAGE_CODES.iso639_3.length} ISO 639-3 codes.`);
    // Deduplicate (last entry per code wins, as before) and sort.
    const byCode = new Map(ALL_LANGUAGE_CODES.iso639_1.map((item) => [item.code, item]));
    ALL_LANGUAGE_CODES.iso639_1 = [...byCode.values()].sort((a, b) => a.code.localeCompare(b.code));
    console.log(`Final unique ISO 639-1 count: ${ALL_LANGUAGE_CODES.iso639_1.length}.`);

  } catch (error) {
    console.error('Error fetching ISO 639-3 codes:', error.message);
    console.warn('Make sure the SIL_ISO639_3_URL is current or points to a valid .tab file.');
  }
}
+
/** Download the LOC ISO 639-5 TSV (language families and groups). */
async function fetchAndParseISO639_5() {
  try {
    console.log('Fetching ISO 639-5 codes from LOC...');
    const response = await fetch(LOC_ISO639_5_URL);
    if (!response.ok) throw new Error(`Failed to fetch ISO 639-5: ${response.statusText}`);
    const text = await response.text();

    // Columns: URI | Code | Label_en | Label_fr ... — first row is the header.
    const [, ...rows] = text.trim().split('\n');
    for (const row of rows) {
      const fields = row.split('\t');
      if (fields.length < 3) continue;
      const code = fields[1].trim();
      const name = fields[2].trim();
      if (code && name) {
        ALL_LANGUAGE_CODES.iso639_5.push({ code, name });
      }
    }
    console.log(`Fetched ${ALL_LANGUAGE_CODES.iso639_5.length} ISO 639-5 codes (language families/groups).`);
  } catch (error) {
    console.error('Error fetching ISO 639-5 codes:', error.message);
  }
}
+
/**
 * Download the IANA language-subtag registry and collect the primary
 * language subtags (Type: language), keeping each record's first
 * Description line as the name.
 */
async function fetchAndParseIANALanguageSubtags() {
  try {
    console.log('Fetching IANA Language Subtag Registry...');
    const response = await fetch(IANA_LANGUAGE_SUBTAG_REGISTRY_URL);
    if (!response.ok) throw new Error(`Failed to fetch IANA registry: ${response.statusText}`);
    const text = await response.text();

    // Registry records are separated by "%%" lines.
    for (const record of text.split('%%')) {
      let type = '';
      let subtag = '';
      let description = '';

      for (const line of record.trim().split('\n')) {
        if (line.startsWith('Type:')) {
          type = line.substring(5).trim();
        } else if (line.startsWith('Subtag:')) {
          subtag = line.substring(7).trim();
        } else if (line.startsWith('Description:') && !description) {
          // Descriptions may repeat or span lines; only the first is kept.
          description = line.substring(12).trim();
        }
      }

      if (type === 'language' && subtag && description) {
        ALL_LANGUAGE_CODES.bcp47_language_subtags.push({
          code: subtag,
          name: description
        });
      }
    }
    console.log(`Fetched ${ALL_LANGUAGE_CODES.bcp47_language_subtags.length} primary language subtags from IANA.`);
  } catch (error) {
    console.error('Error fetching IANA Language Subtag Registry:', error.message);
  }
}
+
+
/**
 * Fetch every code list concurrently, write the BCP-47 subtags to
 * bcp.json, then print a sample of each list.
 */
async function main() {
  console.log('Starting to fetch all language codes...\n');

  await Promise.all([
    fetchAndParseISO639_1_2(),
    fetchAndParseISO639_3(), // may augment iso639_1 with Part1 codes
    fetchAndParseISO639_5(),
    fetchAndParseIANALanguageSubtags()
  ]);
  await Bun.write("bcp.json", JSON.stringify(ALL_LANGUAGE_CODES.bcp47_language_subtags))

  console.log('\n\n--- All Fetched Language Codes ---');

  // Show counts plus the first 50 entries of each list.
  for (const [key, codes] of Object.entries(ALL_LANGUAGE_CODES)) {
    console.log(`\n--- ${key} (Total: ${codes.length}) ---`);
    for (const c of codes.slice(0, 50)) {
      console.log(`${c.code}: ${c.name}`);
    }
    if (codes.length > 50) console.log('... and more ...');
  }

  console.log('\nFetching complete.');
}

main().catch(console.error);
diff --git a/src/lib/db/index.ts b/src/lib/db/index.ts
index fcfab57..6bd417c 100644
--- a/src/lib/db/index.ts
+++ b/src/lib/db/index.ts
@@ -92,10 +92,16 @@ class DatabaseHandler {
const query = this.db.query(
`
SELECT * FROM expressions e
- WHERE e.syllables = 1 AND e.lang = ?
+ WHERE e.type = 'syllable' AND e.lang = ?
ORDER BY frequency DESC
LIMIT ${PAGE_SIZE} ${page ? "OFFSET " + getDBOffset(page, PAGE_SIZE) : ""}
`,
+ // `
+ // SELECT * FROM expressions e
+ // WHERE e.syllables = 1 AND e.lang = ?
+ // ORDER BY frequency DESC
+ // LIMIT ${PAGE_SIZE} ${page ? "OFFSET " + getDBOffset(page, PAGE_SIZE) : ""}
+ // `,
);
const results = query.all(lang);
console.log({ lang, page }, "results");
diff --git a/src/lib/db/prosodydb.ts b/src/lib/db/prosodydb.ts
new file mode 100644
index 0000000..b3b973b
--- /dev/null
+++ b/src/lib/db/prosodydb.ts
@@ -0,0 +1,238 @@
+import Database from "bun:sqlite";
+type Str = string | null;
+type ItemType = "word" | "syllable" | "idiom";
+
+class DatabaseHandler {
+ db: Database;
+ constructor() {
+ const dbPath = "/home/y/code/bun/ssr/waku/bulkdata/prosodynew.db";
+ const db = new Database(dbPath, { create: true });
+ db.exec("PRAGMA journal_mode = WAL"); // Enable Write-Ahead Logging for better performance
+ db.exec("PRAGMA foreign_keys = ON");
+ this.db = db;
+ }
+ async init() {
+ const file = Bun.file("./prosodyschema.sql");
+ const sql = await file.text();
+ this.db.exec(sql);
+ }
+ // selects
+ fetchWords(words: string[]) {
+ const query = this.db.query(
+ `SELECT id FROM words where spelling IN (${words.map((w) => `'${w}'`).join(", ")})`,
+ );
+ return query.all() as Array<{ id: number }>;
+ }
+ // inserts
+
+ addLanguage(code: string, name: string) {
+ const query = this.db
+ .query(`INSERT OR IGNORE INTO languages(iso6392, english) VALUES(?, ?)`)
+ .run(code, name);
+ }
+ addPronunciation(
+ type: ItemType,
+ parentId: number | bigint,
+ ipa: string,
+ syllables: number,
+ tags: Str,
+ notes: Str,
+ ) {
+ try {
+ const query = this.db
+ .query(
+ `INSERT INTO pronunciation(type, parent_id,ipa, syllables, tag, notes) VALUES(?, ?, ?, ?, ?, ?)`,
+ )
+ .run(type, parentId, ipa, syllables, tags, notes);
+ } catch (e) {
+ // console.error(e);
+ }
+ }
+ addWordRhyme(wordId: number | bigint, ipa: string, lang: string, notes: Str) {
+ console.log("wordrhyme", notes);
+ try {
+ const query = this.db
+ .query(
+ `INSERT INTO word_rhymes(text, lang, notes) VALUES(?, ?, ?)
+ ON CONFLICT(text,lang) DO UPDATE SET
+ text = excluded.text
+ RETURNING rowid
+ `,
+ )
+ .get(ipa, lang, notes) as { id: number };
+ const query2 = this.db
+ .query(
+ `
+ INSERT INTO words_idioms(word_id, idiom_id) VALUES(?, ?)
+ `,
+ )
+ .run(wordId, query.id);
+ } catch (e) {
+ // console.error(e);
+ }
+ }
+ addIdiom(spelling: string, lang: string) {
+ const query = this.db.query(
+ `INSERT INTO idioms(spelling, lang) VALUES(?, ?)`,
+ );
+ const res = query.run(spelling, lang);
+ return res;
+ }
+ findIdiomWords(spelling: string, idId: number | bigint) {
+ const split = spelling.split(" ");
+ const words = this.fetchWords(split);
+ console.log({ words });
+ const tx = this.db.transaction(() => {
+ for (const w of words) {
+ this.db
+ .query(
+ `
+ INSERT INTO words_idioms(word_id, idiom_id) VALUES(?, ?)
+ `,
+ )
+ .run(w.id, idId);
+ }
+ });
+ tx();
+ }
+ findIdiomsWords() {
+ const rows: any = this.db.query(`SELECT id, spelling FROM idioms`);
+ for (const row of rows) {
+ this.findIdiomWords(row.spelling, row.id);
+ }
+ }
+ addWord(spelling: string, lang: string) {
+ const query = this.db.query(
+ // `INSERT OR IGNORE INTO words(spelling, lang) VALUES(?, ?)`,
+ `INSERT INTO words(spelling, lang) VALUES(?, ?)`,
+ );
+ const res = query.run(spelling, lang);
+ const wordId = res.lastInsertRowid;
+ return wordId;
+ }
+ addSyllable(
+ wordId: number | bigint,
+ text: string,
+ lang: string,
+ long: boolean,
+ onset: Str,
+ medial: Str,
+ nucleus: string,
+ coda: Str,
+ rhyme: string,
+ tone: Str,
+ notes: Str,
+ ) {
+ const tx = this.db.transaction(() => {
+ const query = this.db.query(
+ `INSERT INTO syllables(text, lang, long, onset, medial, nucleus, coda, rhyme, tone, notes) VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
+ );
+ const res = query.run(
+ text,
+ lang,
+ long,
+ onset,
+ medial,
+ nucleus,
+ coda,
+ rhyme,
+ tone,
+ notes,
+ );
+ const sylId = res.lastInsertRowid;
+
+ const res1 = this.db
+ .query(`INSERT INTO syllables_words(syl_id, word_id) VALUES(?, ?)`)
+ .run(sylId, wordId);
+ //
+ return sylId;
+ });
+ const sylId = tx();
+ let res1: any;
+ if (onset) {
+ res1 = this.db
+ .query(
+ `INSERT INTO onsets(text, lang) VALUES(?, ?)
+ ON CONFLICT(text, lang) DO UPDATE SET
+ text = excluded.text
+ RETURNING rowid
+ `,
+ )
+ .get(onset, lang);
+ this.db
+ .query(`INSERT INTO onsets_syllables(syl_id, onset_id) VALUES(?, ?)`)
+ .run(sylId, res1.id);
+ }
+ if (medial) {
+ res1 = this.db
+ .query(
+ `INSERT INTO medials(text, lang) VALUES(?, ?)
+ ON CONFLICT(text, lang) DO UPDATE SET
+ text = excluded.text
+ RETURNING rowid
+ `,
+ )
+ .get(medial, lang);
+ this.db
+ .query(`INSERT INTO medials_syllables(syl_id, medial_id) VALUES(?, ?)`)
+ .run(sylId, res1.id);
+ }
+ res1 = this.db
+ .query(
+ `INSERT INTO nucleus(text, lang) VALUES(?, ?)
+ ON CONFLICT(text, lang) DO UPDATE SET
+ text = excluded.text
+ RETURNING rowid
+ `,
+ )
+ .get(nucleus, lang);
+ this.db
+ .query(`INSERT INTO nucleus_syllables(syl_id, nucleus_id) VALUES(?, ?)`)
+ .run(sylId, res1.id);
+ if (coda) {
+ res1 = this.db
+ .query(
+ `INSERT INTO codas(text, lang) VALUES(?, ?)
+ ON CONFLICT(text, lang) DO UPDATE SET
+ text = excluded.text
+ RETURNING rowid
+ `,
+ )
+ .get(coda, lang);
+ this.db
+ .query(`INSERT INTO codas_syllables(syl_id, coda_id) VALUES(?, ?)`)
+ .run(sylId, res1.id);
+ }
+ res1 = this.db
+ .query(
+ `INSERT INTO rhymes(text, lang) VALUES(?, ?)
+ ON CONFLICT(text, lang) DO UPDATE SET
+ text = excluded.text
+ RETURNING rowid
+ `,
+ )
+ .get(rhyme, lang);
+ this.db
+ .query(`INSERT INTO rhymes_syllables(syl_id, rhyme_id) VALUES(?, ?)`)
+ .run(sylId, res1.id);
+ if (tone) {
+ res1 = this.db
+ .query(
+ `INSERT INTO tones(text, lang) VALUES(?, ?)
+ ON CONFLICT(text, lang) DO UPDATE SET
+ text = excluded.text
+ RETURNING rowid
+ `,
+ )
+ .get(tone, lang);
+ this.db
+ .query(`INSERT INTO tones_syllables(syl_id, tone_id) VALUES(?, ?)`)
+ .run(sylId, res1.id);
+ }
+ }
+
+ // reads
+}
+const db = new DatabaseHandler();
+
+export default db;
diff --git a/src/lib/db/prosodyschema.sql b/src/lib/db/prosodyschema.sql
new file mode 100644
index 0000000..e70b005
--- /dev/null
+++ b/src/lib/db/prosodyschema.sql
@@ -0,0 +1,178 @@
-- Enable foreign key support
PRAGMA foreign_keys = ON;
PRAGMA journal_mode = WAL;
PRAGMA cache_size = -2000;
PRAGMA mmap_size = 30000000000;


-- proper prosody now
--
--
--
-- Languages keyed by ISO 639-2 code; `english` is the English name.
CREATE TABLE IF NOT EXISTS languages(
    iso6392 TEXT PRIMARY KEY,
    -- bcp47 TEXT PRIMARY KEY,
    -- iso6393 TEXT NOT NULL,
    english TEXT NOT NULL
    -- native TEXT,
    -- iso6391 TEXT,
    -- iso6395 TEXT,
    -- glottolog TEXT
);

-- Multi-word expressions, unique per (spelling, lang).
CREATE TABLE IF NOT EXISTS idioms(
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    spelling TEXT NOT NULL,
    lang TEXT NOT NULL,
    frequency INTEGER,
    FOREIGN KEY (lang) REFERENCES languages(iso6392),
    CONSTRAINT spell_unique UNIQUE (spelling, lang)
);

CREATE INDEX IF NOT EXISTS idx_idioms_spelling ON idioms(spelling);
-- Single words, unique per (spelling, lang).
CREATE TABLE IF NOT EXISTS words(
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    spelling TEXT NOT NULL,
    lang TEXT NOT NULL,
    frequency INTEGER,
    FOREIGN KEY (lang) REFERENCES languages(iso6392),
    CONSTRAINT spell_unique UNIQUE (spelling, lang)
);

CREATE INDEX IF NOT EXISTS idx_words_spelling ON words(spelling);

-- Word-level rhymes: one row per distinct (text, lang); words attach
-- to a rhyme via the words_rhymes join table below.
CREATE TABLE IF NOT EXISTS word_rhymes(
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    text TEXT NOT NULL,
    lang TEXT NOT NULL,
    notes TEXT,
    CONSTRAINT wrhyme_unique UNIQUE (text, lang)
);
CREATE TABLE IF NOT EXISTS words_rhymes(
    word_id INTEGER NOT NULL,
    wrhyme_id INTEGER NOT NULL,
    FOREIGN KEY (word_id) REFERENCES words(id),
    FOREIGN KEY (wrhyme_id) REFERENCES word_rhymes(id)
);

-- break up syllables
-- Syllables are stored denormalized here (part columns inline) AND
-- normalized into the part tables below via the *_syllables joins.
CREATE TABLE IF NOT EXISTS syllables(
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    text TEXT NOT NULL,
    lang TEXT NOT NULL,
    long INTEGER NOT NULL,
    tone TEXT,
    onset TEXT,
    medial TEXT,
    nucleus TEXT,
    coda TEXT,
    rhyme TEXT,
    notes TEXT,
    FOREIGN KEY (lang) REFERENCES languages(iso6392),
    CONSTRAINT spell_unique UNIQUE (text, lang)
);

-- Syllable parts: one row per distinct (text, lang) for each part kind.
CREATE TABLE IF NOT EXISTS tones(
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    text TEXT NOT NULL,
    lang TEXT NOT NULL,
    name TEXT,
    num INTEGER,
    CONSTRAINT tone_unique UNIQUE (text, lang)
);
CREATE TABLE IF NOT EXISTS onsets(
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    text TEXT NOT NULL,
    lang TEXT NOT NULL,
    CONSTRAINT onsets_unique UNIQUE (text, lang)
);
CREATE TABLE IF NOT EXISTS medials(
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    text TEXT NOT NULL,
    lang TEXT NOT NULL,
    CONSTRAINT medials_unique UNIQUE (text, lang)
);
CREATE TABLE IF NOT EXISTS nucleus(
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    text TEXT NOT NULL,
    lang TEXT NOT NULL,
    CONSTRAINT nucleus_unique UNIQUE (text, lang)
);
CREATE TABLE IF NOT EXISTS codas(
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    text TEXT NOT NULL,
    lang TEXT NOT NULL,
    CONSTRAINT coda_unique UNIQUE (text, lang)
);
CREATE TABLE IF NOT EXISTS rhymes(
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    text TEXT NOT NULL,
    lang TEXT NOT NULL,
    CONSTRAINT rhyme_unique UNIQUE (text, lang)
);

-- join tables

CREATE TABLE IF NOT EXISTS tones_syllables(
    syl_id INTEGER NOT NULL,
    tone_id INTEGER NOT NULL,
    FOREIGN KEY (syl_id) REFERENCES syllables(id),
    FOREIGN KEY (tone_id) REFERENCES tones(id)
);
CREATE TABLE IF NOT EXISTS onsets_syllables(
    syl_id INTEGER NOT NULL,
    onset_id INTEGER NOT NULL,
    FOREIGN KEY (syl_id) REFERENCES syllables(id),
    FOREIGN KEY (onset_id) REFERENCES onsets(id)
);
CREATE TABLE IF NOT EXISTS medials_syllables(
    syl_id INTEGER NOT NULL,
    medial_id INTEGER NOT NULL,
    FOREIGN KEY (syl_id) REFERENCES syllables(id),
    FOREIGN KEY (medial_id) REFERENCES medials(id)
);
CREATE TABLE IF NOT EXISTS nucleus_syllables(
    syl_id INTEGER NOT NULL,
    nucleus_id INTEGER NOT NULL,
    FOREIGN KEY (syl_id) REFERENCES syllables(id),
    FOREIGN KEY (nucleus_id) REFERENCES nucleus(id)
);
CREATE TABLE IF NOT EXISTS codas_syllables(
    syl_id INTEGER NOT NULL,
    coda_id INTEGER NOT NULL,
    FOREIGN KEY (syl_id) REFERENCES syllables(id),
    FOREIGN KEY (coda_id) REFERENCES codas(id)
);
CREATE TABLE IF NOT EXISTS rhymes_syllables(
    syl_id INTEGER NOT NULL,
    rhyme_id INTEGER NOT NULL,
    FOREIGN KEY (syl_id) REFERENCES syllables(id),
    FOREIGN KEY (rhyme_id) REFERENCES rhymes(id)
);

CREATE TABLE IF NOT EXISTS syllables_words(
    syl_id INTEGER NOT NULL,
    word_id INTEGER NOT NULL,
    FOREIGN KEY (syl_id) REFERENCES syllables(id),
    FOREIGN KEY (word_id) REFERENCES words(id)
);
CREATE TABLE IF NOT EXISTS words_idioms(
    word_id INTEGER NOT NULL,
    idiom_id INTEGER NOT NULL,
    FOREIGN KEY (idiom_id) REFERENCES idioms(id),
    FOREIGN KEY (word_id) REFERENCES words(id)
);


--
-- Pronunciations for any item kind; `type` says which table parent_id
-- points into (words/syllables/idioms), so no FK is possible here.
CREATE TABLE IF NOT EXISTS pronunciation(
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    type TEXT CHECK(type IN ('word', 'syllable', 'idiom')) NOT NULL,
    parent_id INTEGER NOT NULL,
    ipa TEXT NOT NULL,
    syllables INTEGER NOT NULL,
    tag TEXT,
    notes TEXT,
    CONSTRAINT ipa_unique UNIQUE (ipa, parent_id)
);
CREATE INDEX IF NOT EXISTS idx_words_ipa ON pronunciation(ipa, parent_id);
diff --git a/src/lib/db/seed.ts b/src/lib/db/seed.ts
index c4094de..7f4352f 100644
--- a/src/lib/db/seed.ts
+++ b/src/lib/db/seed.ts
@@ -2,6 +2,8 @@ import { readWiktionaryDump } from "../services/wiki";
import { getStressedSyllable, getSyllableCount } from "../utils";
import useful from "@/lib/useful_thai.json";
import db from ".";
+import pdb from "./prosodydb";
+import * as Sorsyl from "sorsyl";
const SYMBOL_REGEX = new RegExp(/[\W\d]/);
@@ -483,7 +485,16 @@ function addThaiSyllablesLesson() {
// }
// }
// }
-addThaiUseful();
+function fixSyllables() {
+ const res = db.db.query(`SELECT ipa, syllables FROM expressions;`).all();
+ for (let i = 0; i < 10; i++) {
+ // for (const row of res) {
+ const row = res[i];
+ console.log({ row });
+ }
+}
+// fixSyllables();
+// addThaiUseful();
// addThaiSyllablesLesson();
// adjustFrequency("th");
@@ -492,3 +503,202 @@ addThaiUseful();
// fillFromDump();
// thaiSyllables();
// thaiFreq();
+//
+//
+const SORSYL_PATH =
+ "/nix/store/lkyi9rrjbr619w3ivpkm89ccf93bvxx5-sorsyl-0.1.0/bin/sorsyl";
+async function redump() {
+ await pdb.init();
+ let count = 0;
+
+ // const soundTypes = new Set<string>();
+ // [
+ // "tags", "ipa", "audio", "ogg_url", "mp3_url", "enpr", "rhymes", "homophone", "note", "zh-pron", "other",
+ // "text", "hangeul", "topics", "form", "audio-ipa"
+ // ]
+ for await (const line of readWiktionaryDump()) {
+ try {
+ count++;
+ // if (count > 50) break;
+ const j = JSON.parse(line);
+ console.log(Object.keys(j), j.word);
+ // add language to db
+ pdb.addLanguage(j.lang_code, j.lang);
+ // handleEtim(j);
+ // handleDerived(j);
+ // handleSenses(j.pos, j.senses);
+ // //
+ const isWord = j.word.trim().split(" ").length === 1;
+ if (isWord) await handleWord(j);
+ else await handleIdiom(j);
+ } catch (e) {
+ console.log("error parsing", e);
+ // break;
+ }
+ }
+}
+
+type SorSyl = {
+ stressed: boolean;
+ long: boolean;
+ spelling: string;
+ ipa: string;
+ nucleus: string;
+ onset: string;
+ medial: string;
+ coda: string;
+ rhyme: string;
+ tone: string;
+};
+async function handleWord(j: any) {
+ const wordId = pdb.addWord(j.word, j.lang_code);
+ let ts = Date.now();
+
+ const hwikiRhyme = j.sounds.find((s) => "rhymes" in s);
+ const wikiRhyme = hwikiRhyme ? hwikiRhyme.rhymes : null;
+ for (let snd of j.sounds || []) {
+ if ("ipa" in snd) {
+ const tags = JSON.stringify(snd.tags) || null;
+ const ipa = snd.ipa;
+ try {
+ const hres = await fetch("http://localhost:8104/syls", {
+ method: "POST",
+ headers: { "content-type": "application/json" },
+ body: JSON.stringify({ string: j.word, lang: j.lang_code, ipa }),
+ });
+ const hjon = await hres.json();
+ console.log(Date.now() - ts, "elapsed in http");
+ ts = Date.now();
+ pdb.addPronunciation(
+ "word",
+ wordId,
+ hjon.clean_ipa,
+ hjon.syls.length,
+ tags,
+ null,
+ );
+ const wordRhyme = hjon.syls.reduce((acc: string, item: SorSyl) => {
+ if (!item.stressed && !acc) return acc;
+ if (item.stressed && !acc) return `${acc}${item.rhyme}`;
+ else return `${acc}${item.ipa}`;
+ }, "");
+ if (wordRhyme)
+ pdb.addWordRhyme(wordId, wordRhyme, j.lang_code, wikiRhyme);
+ else console.log("no rhyme?", hjon);
+ for (const syl of hjon.syls) {
+ // TODO ideally syllables would have spelling not IPA... harsh tho
+ pdb.addSyllable(
+ wordId,
+ syl.ipa,
+ j.lang_code,
+ syl.long,
+ syl.onset || null,
+ syl.medial || null,
+ syl.nucleus,
+ syl.coda || null,
+ syl.rhyme,
+ syl.tone || null,
+ null,
+ );
+ }
+ console.log(Date.now() - ts, "elapsed in db");
+ ts = Date.now();
+ } catch (e) {
+ console.error(e);
+ console.error(j);
+ // break;
+ }
+ }
+ }
+}
+async function handleIdiom(j: any) {
+ console.log(j.word, "idiom");
+ pdb.addIdiom(j.word, j.lang_code);
+ // TODO IPA of idioms...?
+}
+async function handleEtim(j: any) {
+ console.log(j.etymology_text, "etym");
+ console.log(j.etymology_templates, "etym");
+
+ // {
+ // name: "inh",
+ // args: {
+ // "1": "en",
+ // "2": "ang",
+ // "3": "frēo",
+ // "4": "",
+ // "5": "free",
+ // },
+ // expansion: "Old English frēo (“free”)",
+ // },
+
+ console.log(j.head_templates, "head");
+ // {
+ // name: "en-verb",
+ // args: {},
+ // expansion: "free (third-person singular simple present frees, present participle freeing, simple past and past participle freed)",
+ // }
+}
+async function handleDerived(j: any) {
+ const { forms, derived, related, antonyms, hyponyms, synonyms, descendants } =
+ j;
+ console.log("forms", forms);
+ // {form: string; tags: string[]}
+ console.log("derived", derived);
+ // {word: string}
+ console.log("related", related);
+ // {word: string, source?: string;}
+ console.log("ant", antonyms);
+ // {word: string, source?: string;}
+ console.log("hypo", hyponyms);
+ console.log("syno", synonyms);
+ // {word: string, source?: string;}
+ console.log("desc", descendants);
+}
+async function handleSenses(pos: string, senses: any[]) {
+ console.log("ex", senses[0].examples);
+ // {text: string; ref: string; type: "quote"}
+ console.log("info", senses[0].info_templates);
+ for (const s of senses) {
+ // s.glosses[]
+ // s.tags[]
+ }
+}
+
// Entry point: stream the wiktionary dump into the new prosody db
// (long-running; network + db heavy).
redump();
+
+async function newtest() {
+ // const query = pdb.db.query(
+ // `INSERT INTO syllables(text, lang, long, onset, medial, nucleus, coda, rhyme, tone, notes) VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
+ // );
+ // const res = query.run(
+ // "lol",
+ // "en",
+ // true,
+ // "l",
+ // "j",
+ // "o",
+ // "q",
+ // "joq",
+ // null,
+ // null,
+ // );
+ // const sylId = res.lastInsertRowid;
+ const res1 = pdb.db
+ .query(
+ `INSERT INTO onsets(text, lang) VALUES(?, ?)
+ ON CONFLICT(text, lang) DO UPDATE SET
+ text = excluded.text
+ RETURNING rowid
+ `,
+ )
+ .get("lll", "en");
+ console.log({ res1 });
+}
+// newtest();
+// TIL calling shell commands is terribly slow wtf
+// Bun.$.env({ FOO: ipa });
+// const res = await Bun.$`${SORSYL_PATH} $FOO`;
+// const syllables = JSON.parse(res.stdout.toString());
+// console.log(Date.now() - ts, "elapsed in py");
+// ts = Date.now();
diff --git a/src/lib/db/senseschema.sql b/src/lib/db/senseschema.sql
new file mode 100644
index 0000000..f07a208
--- /dev/null
+++ b/src/lib/db/senseschema.sql
@@ -0,0 +1,54 @@
-- Enable foreign key support
PRAGMA foreign_keys = ON;
PRAGMA journal_mode = WAL;
PRAGMA cache_size = -2000;
PRAGMA mmap_size = 30000000000;

-- a semantic entity
CREATE TABLE IF NOT EXISTS senses(
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    parent_id INTEGER NOT NULL,
    spelling TEXT NOT NULL,
    pos TEXT,
    etymology TEXT,
    senses JSONB,
    forms JSONB,
    related JSONB,
    confidence INTEGER NOT NULL DEFAULT 0,
    FOREIGN KEY (parent_id) REFERENCES expressions(id)
);
CREATE INDEX IF NOT EXISTS idx_words_pos ON senses(pos);
CREATE INDEX IF NOT EXISTS idx_senses_parent ON senses(parent_id);

-- Individual glosses belonging to a sense.
CREATE TABLE IF NOT EXISTS subsenses(
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    sid INTEGER NOT NULL, -- fixed: trailing comma was missing (invalid DDL)
    gloss TEXT NOT NULL,
    examples JSONB,
    FOREIGN KEY (sid) REFERENCES senses(id)
);

-- Derivation/relation records attached to a sense.
CREATE TABLE IF NOT EXISTS derivation(
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    sid INTEGER NOT NULL, -- fixed: trailing comma was missing (invalid DDL)
    type TEXT NOT NULL,
    text TEXT NOT NULL,
    tags JSONB,
    FOREIGN KEY (sid) REFERENCES senses(id)
);

-- Categories table (for noun and verb categories)
CREATE TABLE IF NOT EXISTS categories (
    name TEXT PRIMARY KEY
);

-- Word Categories junction table
CREATE TABLE IF NOT EXISTS word_categories (
    word_id INTEGER NOT NULL,
    -- TEXT, not INTEGER: it references categories(name), which is TEXT
    category TEXT NOT NULL,
    PRIMARY KEY (word_id, category),
    FOREIGN KEY (word_id) REFERENCES expressions(id),
    FOREIGN KEY (category) REFERENCES categories(name)
);
CREATE INDEX IF NOT EXISTS idx_word_categories_category_id ON word_categories(category);
diff --git a/src/lib/lang/utils.ts b/src/lib/lang/utils.ts
new file mode 100644
index 0000000..b6c5bae
--- /dev/null
+++ b/src/lib/lang/utils.ts
@@ -0,0 +1,28 @@
+const TONE_LETTER_REGEX = new RegExp(
+ /[\u{02E5}-\u{02EE}\u{A700}-\u{A71F}\u{A789}-\u{A78A}]/u,
+);
+const TONE_NUMBERS_REGEX = new RegExp(/\w(\d{2,})\s/);
+
+export function isTonal(text: string): boolean {
+ return !!TONE_LETTER_REGEX.exec(text) || !!TONE_NUMBERS_REGEX.exec(text);
+}
+
+export function isLongVowel(text: string): boolean {
+ return text.includes("ː");
+}
+
// In this order!
// Thai tone contours (Chao tone letters) mapped to one-letter labels —
// presumably M/L/H/F/R = mid/low/high/falling/rising; confirm with callers.
export const thaiTones = {
  "˧": "M",
  "˨˩": "L",
  "˦˥": "H",
  "˥˩": "F",
  "˩˩˦": "R",
};

// Mandarin tone contours (Chao tone letters) mapped to tone numbers 1–4.
export const mandarinTones = {
  "˥˥": 1,
  "˧˥": 2,
  "˨˩˦": 3,
  "˥˩": 4,
};
diff --git a/src/lib/types/cards.ts b/src/lib/types/cards.ts
index cef02d2..1a62a44 100644
--- a/src/lib/types/cards.ts
+++ b/src/lib/types/cards.ts
@@ -210,3 +210,16 @@ export type ExpressionSearchParams = {
frequency?: { num: number; above: boolean };
type?: ExpressionType;
};
+
// Response shape of the syllable-deconstruction service (see
// deconstructSyllable in src/lib/calls/nlp.ts).
export type SyllableRes = { input: string; result: SyllableToken[] };
// One IPA character paired with its structural tag.
export type SyllableToken = [IPACharacter, SyllablePart];
export type IPACharacter = string; // one char mostly
// Single-character tags for positions within a syllable, as emitted by
// the service; names suggest onset/vowel/coda roles — confirm upstream.
export enum SyllablePart {
  INITIAL = "#",
  OTHER_ONSET = "C",
  VOWEL = "V",
  OTHER_VOWEL = "v",
  FINAL_VOWEL = ">",
  OTHER_OFFSET = "c",
  CODA = "$",
}