From e839a5f61f0faa21ca8b4bd5767f7575d5e576ee Mon Sep 17 00:00:00 2001 From: polwex Date: Wed, 21 May 2025 14:00:28 +0700 Subject: the card flip animation is legit --- src/lib/db/index.ts | 92 ++++++---- src/lib/db/schema.sql | 11 +- src/lib/db/seed.ts | 494 ++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 562 insertions(+), 35 deletions(-) create mode 100644 src/lib/db/seed.ts (limited to 'src/lib/db') diff --git a/src/lib/db/index.ts b/src/lib/db/index.ts index a710a1e..b43edc3 100644 --- a/src/lib/db/index.ts +++ b/src/lib/db/index.ts @@ -1,8 +1,9 @@ import Database from "bun:sqlite"; import { getDBOffset, wordFactorial } from "../utils"; -import type { AddSense, AddWord, State } from "../types"; +import type { AddSense, AddWord, Result, State } from "../types"; import { DEFAULT_SRS } from "../services/srs"; import { DBWord, WordData } from "@/zoom/logic/types"; +import { DeckResponse } from "../types/cards"; const PAGE_SIZE = 100; @@ -57,16 +58,35 @@ class DatabaseHandler { // read // fetchLanguage(lang: string, page?: number) { - const query = this.db.query( - ` - SELECT * FROM lessons - JOIN languages l ON l.code = lessons.lang - WHERE lessons.lang= ? - `, - ); + const query = this.db.query(` + SELECT + l.id, + l.name, + l.position, + l.description, + COUNT(cl.card_id) AS count + FROM + lessons l + INNER JOIN + lang_lessons ll ON l.id = ll.lesson_id + LEFT JOIN + cards_lessons cl ON l.id = cl.lesson_id + WHERE + ll.lang = ? + GROUP BY + l.id, l.name, l.position, l.description + ORDER BY + l.position, l.name; + `); const res = query.all(lang); console.log(res, "results"); - return res; + return res as Array<{ + id: number; + count: number; + name: string; + description: string; + position: number; + }>; } fetchSyllables(lang: string, page?: number) { const query = this.db.query( @@ -135,7 +155,7 @@ class DatabaseHandler { GROUP_CONCAT(c.name, ',') AS category, FROM expressions JOIN word_categories wc ON wc.word_id = words.id - JOIN categories c ON c.id = wc.category_id + JOIN categories c ON c.name = wc.category WHERE spelling = $spelling GROUP BY words.id `); @@ -151,7 +171,7 @@ class DatabaseHandler { GROUP_CONCAT(c.name, ',') AS category FROM expressions e JOIN word_categories wc ON wc.word_id = e.id - JOIN categories c ON c.id = wc.category_id + JOIN categories c ON c.name= wc.category ORDER BY e.frequency DESC LIMIT $count OFFSET $offset @@ -297,7 +317,7 @@ class DatabaseHandler { LEFT JOIN word_categories wc ON wc.word_id = e.id LEFT JOIN - categories cat ON cat.id = wc.category_id + categories cat ON cat.name = wc.category GROUP BY l.id, c.id, e.id ORDER BY @@ -309,7 +329,12 @@ class DatabaseHandler { } // SELECT l.id, l.text, cards.text, cards.note FROM cards_lessons cl LEFT JOIN lessons l ON l.id = cl.lesson_id LEFT JOIN cards ON cards.id = cl.card_id ORDER BY l.id ASC LIMIT 20 OFFSET 0; - fetchLesson(userId: number, lessonId: number, count?: number, page?: number) { + fetchLesson( + userId: number, + lessonId: number, + count?: number, + page?: number, + ): Result { const p = page ? page : 1; const size = count ? count : PAGE_SIZE; const offset = getDBOffset(p, size); @@ -318,7 +343,7 @@ class DatabaseHandler { console.log(tomorrow.getTime()); const queryString = ` SELECT - l.name, l.description, l.lang as llang, cards.text, cards.note, cards.id as cid, + l.name, l.description, ll.lang as llang, cards.text, cards.note, cards.id as cid, up.id as upid, up.repetition_count, up.ease_factor, @@ -342,6 +367,7 @@ class DatabaseHandler { WHERE cl_inner.lesson_id = l.id) AS total_card_count FROM cards_lessons cl JOIN lessons l ON l.id = cl.lesson_id + JOIN lang_lessons ll ON l.id = ll.lesson_id JOIN cards ON cards.id = cl.card_id JOIN cards_expressions ce ON cards.id = ce.card_id JOIN expressions e ON e.id = ce.expression_id @@ -367,7 +393,7 @@ class DatabaseHandler { const query = this.db.query(queryString); const res = query.all(userId, lessonId, tomorrow.getTime(), size, offset); console.log(res.length); - if (res.length === 0) return null; + if (res.length === 0) return { error: "Lesson not found" }; const row: any = res[0]; // console.log({ row }); const lesson = { @@ -377,8 +403,10 @@ class DatabaseHandler { language: row.llang, cardCount: row.total_card_count, }; + // TODO IPA, prosody, senses... should we unify the format on the wikisource standard? const cards = res.map((row: any) => { // TODO parse here...? + // console.log({ row }); const sense_array = JSON.parse(row.senses_array); const senses = sense_array.map((s: any) => { const senses = JSON.parse(s.senses); @@ -417,7 +445,7 @@ class DatabaseHandler { }; return card; }); - return { lesson, cards }; + return { ok: { lesson, cards } }; } fetchCard(cid: number, userid: number) { const query = this.db.query(` @@ -448,7 +476,7 @@ class DatabaseHandler { VALUES (${columns.map((c) => "?").join(",")}) `; const query = this.db.query(queryString).run(...Object.values(params)); - return query; + return query.lastInsertRowid; } addSense(params: AddSense) { const columns = Object.keys(params); @@ -521,25 +549,28 @@ class DatabaseHandler { }) { const { text, mnote, eid, lesson_id } = params; const note = mnote ? mnote : null; - const query = this.db.query(` + const tx = this.db.transaction(() => { + const query = this.db.query(` INSERT INTO cards(text, note) VALUES(?, ?) `); - const res = query.run(text, note); - const cid = res.lastInsertRowid; - const query2 = this.db.query(` + const res = query.run(text, note); + const cid = res.lastInsertRowid; + const query2 = this.db.query(` INSERT OR IGNORE INTO cards_expressions(card_id, expression_id) VALUES(?, ?) `); - query2.run(cid, eid); - if (lesson_id) { - const query = this.db.query(` + query2.run(cid, eid); + if (lesson_id) { + const query = this.db.query(` INSERT INTO cards_lessons(card_id, lesson_id) VALUES(?, ?) `); - query.run(cid, lesson_id); - } + query.run(cid, lesson_id); + } + }); + return tx(); } addCardO(lesson_id: number | bigint | null, text: string, mnote?: string) { // wtf is this fucntion when did I write this @@ -634,12 +665,9 @@ class DatabaseHandler { } addWCat(wordId: number | bigint, category: string) { const queryString = ` - INSERT - INTO word_categories(word_id, category_id) - VALUES($wordId, ( - SELECT id FROM categories - WHERE name = $category - )) + INSERT OR IGNORE + INTO word_categories(word_id, category) + VALUES($wordId, $category) `; const query = this.db.query(queryString); const res = query.run({ wordId, category }); diff --git a/src/lib/db/schema.sql b/src/lib/db/schema.sql index 1b678c5..8d1b288 100644 --- a/src/lib/db/schema.sql +++ b/src/lib/db/schema.sql @@ -114,9 +114,14 @@ CREATE TABLE IF NOT EXISTS lessons( id INTEGER PRIMARY KEY AUTOINCREMENT, name TEXT NOT NULL, position INTEGER NOT NULL DEFAULT 0, - description TEXT, - lang TEXT, - FOREIGN KEY (lang) REFERENCES languages(code) + description TEXT +); +CREATE TABLE IF NOT EXISTS lang_lessons( + lesson_id INTEGER NOT NULL, + lang TEXT NOT NULL, + PRIMARY KEY (lang, lesson_id), + FOREIGN KEY (lang) REFERENCES languages(code), + FOREIGN KEY (lesson_id) REFERENCES lessons(id) ); CREATE TABLE IF NOT EXISTS cards( id INTEGER PRIMARY KEY AUTOINCREMENT, diff --git a/src/lib/db/seed.ts b/src/lib/db/seed.ts new file mode 100644 index 0000000..c4094de --- /dev/null +++ b/src/lib/db/seed.ts @@ -0,0 +1,494 @@ +import { readWiktionaryDump } from "../services/wiki"; +import { getStressedSyllable, getSyllableCount } from "../utils"; +import useful from "@/lib/useful_thai.json"; +import db from "."; + +const SYMBOL_REGEX = new RegExp(/[\W\d]/); + +async function handleFile( + filename: string, + func: (line: string, idx: number) => void, +) { + const file = Bun.file(filename); + const s = file.stream(); + const reader = s.getReader(); + const decoder = new TextDecoder(); + let leftover = ""; + let lineCount = 0; + while (true) { + const { value, done } = await reader.read(); + if (done) break; + const chunk = decoder.decode(value, { stream: true }); + const lines = (leftover + chunk).split("\n"); + + // Process each line except the last (which might be incomplete) + for (const line of lines.slice(0, -1)) { + lineCount++; + func(line, lineCount); + } + + // Save the last incomplete line to process in the next iteration + leftover = lines[lines.length - 1]; + } + + // Handle any remaining content after reading all chunks + if (leftover) func(leftover, lineCount + 1); +} + +function goodPos(pos: string): boolean { + const list = [ + "CC", + "DT", + "EX", + "IN", + "LS", + "MD", + "PDT", + "POS", + "PRP", + "PRP$", + "RP", + "TO", + "WDT", + "WP", + "WP$", + ]; + return list.includes(pos); +} +// function englishKaggle() { +// handleFile("../datasets/words_pos.csv", (line, idx) => { +// const [_, spelling, pos] = line.split(","); +// if (!goodPos(pos)) return; +// const rowid = addWord(db, spelling, "", "en-us", "word", null); +// const category = poss[pos] || "unknown;"; +// addCat(db, rowid, category); +// }); +// } +// async function englishIPA() { +// handleFile("ipa/en-us/ipadict.txt", (line, idx) => { +// const [spelling, ipa] = line.split(/\s+/); +// if (!spelling || !ipa) return; +// const hasSymbols = spelling.match(SYMBOL_REGEX); +// if (hasSymbols) return; +// const split = spelling.split(" "); +// const type = split.length > 1 ? "expression" : "word"; +// const subtype = null; +// addWord(db, spelling, ipa, "en-us", type, subtype); +// }); +// } + +async function englishFreq() { + handleFile( + "/home/y/code/prosody/hanchu/datasets/unigram_freq.csv", + (line, idx) => { + const [spelling, _frequency] = line.split(","); + db.addFrequency(spelling, idx); + }, + ); +} +async function thaiFreq() { + const files = [ + "/home/y/code/prosody/prosody/langdata/thai/data/1yin_freq.csv", + "/home/y/code/prosody/prosody/langdata/thai/data/2yin_freq.csv", + "/home/y/code/prosody/prosody/langdata/thai/data/3yin_freq.csv", + "/home/y/code/prosody/prosody/langdata/thai/data/4yin_freq.csv", + "/home/y/code/prosody/prosody/langdata/thai/data/5yin_freq.csv", + "/home/y/code/prosody/prosody/langdata/thai/data/6yin_freq.csv", + ]; + for (let f of files) { + handleFile(f, (line, idx) => { + const [spelling, IPA, tone, length, frequency, ...rest] = line.split(","); + db.addFrequency(spelling, Number(frequency)); + }); + } +} + +const thaiTones: Record = { + M: 1, + L: 2, + F: 3, + H: 4, + R: 5, +}; +const thaiTones2: Record = { + "˧": 1, + "˨˩": 2, + "˥˩": 3, + "˦˥": 4, + "˩˩˦": 5, +}; +async function thaiSyllables() { + handleFile( + "/home/y/code/prosody/prosody/langdata/thai/data/1yin_freq.csv", + (line, idx) => { + const [spelling, IPA, toneS, length, frequency, ...rest] = + line.split(","); + const isLong = length === "長"; + const tone = thaiTones[toneS]; + const prosody = JSON.stringify({ isLong, tone, lang: "th" }); + db.upsertWord({ + spelling, + lang: "th", + ipa: JSON.stringify([{ ipa: IPA, tags: ["sortug"] }]), + prosody, + syllables: 1, + type: "syllable", + frequency: Number(frequency), + confidence: 10, + }); + }, + ); + handleFile( + "/home/y/code/prosody/prosody/langdata/thai/data/1yinjie.csv", + (line, idx) => { + const [spelling, IPA] = line.split(","); + const isLong = IPA.includes("ː"); + let tone = 0; + const toneMarks = Object.keys(thaiTones2); + for (let tm of toneMarks) { + if (IPA.includes(tm)) tone = thaiTones2[tm]; + } + const prosody = JSON.stringify({ isLong, tone, lang: "th" }); + db.upsertWord({ + spelling, + lang: "th", + ipa: JSON.stringify([{ ipa: IPA, tags: ["sortug"] }]), + prosody, + syllables: 1, + type: "syllable", + confidence: 10, + }); + }, + ); +} + +// // Save the last incomplete line to process in the next iteration +// } +// TODO no conjunctions or adpositions in Wordnet!! +// function englishWordnet() { +// // LEFT JOIN lexes_pronunciations ukpr ON ukpr.wordid = words.wordid AND uspr.variety = 'GB' +// // LEFT JOIN pronunciations ukp ON ukp.pronunciationid = ukpr.pronunciationid +// const queryString = ` +// WITH ranked_ipa AS ( +// SELECT +// lp.wordid, +// pr.pronunciation, +// lp.variety, +// ROW_NUMBER() OVER ( +// PARTITION BY lp.wordid +// ORDER BY +// CASE +// WHEN lp.variety = 'US' THEN 1 +// WHEN lp.variety IS NULL THEN 2 +// WHEN lp.variety IS 'GB' THEN 3 +// ELSE 4 +// END +// ) AS rank +// FROM lexes_pronunciations lp +// JOIN pronunciations pr ON pr.pronunciationid = lp.pronunciationid +// ) +// SELECT words.wordid, word, rp.pronunciation as ipa, domainname +// FROM words +// LEFT JOIN ranked_ipa rp ON rp.wordid = words.wordid AND rp.rank = 1 +// LEFT JOIN senses ON senses.wordid = words.wordid +// LEFT JOIN synsets ON synsets.synsetid = senses.synsetid +// LEFT JOIN domains ON domains.domainid = synsets.domainid +// GROUP BY words.wordid +// `; +// const query = wndb.query(queryString); +// const res: Array<{ +// word: string; +// ipa: string; +// domainname: string; +// }> = query.all() as any; +// console.log("res", res.length); +// for (const r of res) { +// console.log(r, "r"); +// // if (r.word === 'abrasive') throw new Error('stop right here'); +// const ok = filterWord(r.word); +// if (!ok) continue; +// const split = r.word.split(" "); +// const type = split.length > 1 ? "expression" : "word"; +// const subtype = null; +// const wordid = addWord(db, r.word, r.ipa, "en-us", type, subtype); +// const category = domains[r.domainname] || "unknown;"; +// addCat(db, wordid, category); +// } +// } +function filterWord(s: string) { + const hasSymbols = s.match(SYMBOL_REGEX); + if (hasSymbols) return false; + else return true; +} + +// function checkWordNet(word: string) { +// const query = wndb.query(`SELECT * FROM words WHERE word = $word`); +// const res = query.get({ $word: word }); +// return !!res; +// } + +// function englishCards() { +// const lesson_id = addLesson(db, "First Lesson, some easy stuff"); +// const texts = [ +// "I", +// "friend", +// "my friend", +// "you", +// "your friend", +// "my friends' friend", +// "you are my friend", +// "I am your friend", +// "your friend is my friend", +// "my friend is your friend", +// "he is my friend", +// "this is mine", +// "this is yours", +// "this is my friends'", +// "no", +// "you are not my friend", +// "this is not yours", +// "your friend is not my friend", +// "that is mine", +// "this is mine, that is yours", +// "he is not your friend", +// "no, I am not", +// "that is not me", +// "that is not mine, that is my friends'", +// ]; +// for (const text of texts) { +// addCard(db, lesson_id, text); +// } +// } +// englishWordnet(); +// englishFreq(); +// englishCards(); +// englishKaggle(); + +async function fillFromDump() { + await db.init(); + // const log = Bun.file("./stuff.log"); + // const logWriter = log.writer(); + let count = 0; + const fields = new Set(); + // let biggest = 0; + for await (const line of readWiktionaryDump()) { + try { + count++; + // if (count > 80) break; + // if (line.length > biggest) { + // biggest = line.length; + // Bun.write("./biggest.log", line, { createPath: true }); + // } + const j = JSON.parse(line); + db.addLanguage(j.lang_code, j.lang); + db.addCat(j.pos); + // for (let key of Object.keys(j)) { + // if (!fields.has(key)) { + // fields.add(key); + // logWriter.write(`${line}\n`); + // } + // } + if (j.lang_code === "en" || j.lang_code === "th") { + console.log("saving", j.word); + // console.log(j.sounds); + const related = { + derived: j.derived, + antonyms: j.antonyms, + synonyms: j.synonyms, + related: j.related, + }; + let rhyme = ""; + let ipaExample = ""; + let ipa: any[] = []; + for (let snd of j.sounds || []) { + if ("ipa" in snd) { + ipa.push(snd); + if (!ipaExample) ipaExample = snd.ipa; + } + if ("rhymes" in snd) rhyme = snd.rhymes; + } + const isWord = j.word.trim().split(" ").length === 1; + const type: any = isWord ? "word" : "expression"; + const syllables = ipaExample ? getSyllableCount(ipaExample) : 0; + console.log({ ipaExample, syllables }); + let prosody: any = {}; + if (ipaExample) { + const stressedSyllable = getStressedSyllable(ipaExample); + if ("ok" in stressedSyllable) + prosody.stressedSyllable = stressedSyllable.ok; + } + if (rhyme) prosody.rhyme = rhyme; + try { + const row = db.addWord({ + spelling: j.word, + lang: j.lang_code, + ipa: JSON.stringify(ipa), + prosody: JSON.stringify(prosody), + syllables, + type, + }); + let parent_id: number | bigint; + if (row.changes === 1) parent_id = row.lastInsertRowid; + else { + const data: any = db.fetchExpressionBySpelling(j.word, j.lang_code); + parent_id = data.id; + } + const senseRow = db.addSense({ + id: count - 1, + parent_id, + spelling: j.word, + etymology: j.etymology_text || "", + pos: j.pos, + ipa: JSON.stringify(ipa), + prosody: JSON.stringify(prosody), + senses: JSON.stringify(j.senses), + forms: JSON.stringify(j.forms || []), + related: JSON.stringify(related), + }); + } catch (e) { + console.log("error inserting", e); + } + } + // langset.add(j.lang_code); + // if (j.lang === "Translingual") continue; + // if (j.lang_code === "en") en++; + // if (j.lang_code === "th") thai++; + // if (j.lang_code === "zh") zh++; + + // if (j.word === "cat") { + // console.log(j.word); + // console.log(Object.keys(j)); + // console.log(j); + // console.log("senses", j.senses); + // console.log("forms", j.forms); + // // console.log("ett", j.etymology_templates); + // // console.log("derived", j.derived); + // // const meaning: Meaning = {etymology: j.etymology_text} + // // const wd = { lang: j.lang_code, spelling: j.word, ipa, {} }; + // break; + // } + } catch (e) { + console.log("error parsing", e); + } + } + console.log("fields", fields); +} + +function addDecks() { + // const lesson_id = db.addLesson({ + // name: "Thai Syllables", + // description: "All the syllables in the Thai language ordered by frequency", + // lang: "th", + // }); + const syllables: any[] = db.fetchExpressionRaw({ + confidence: "10", + syllables: "1", + lang: "th", + }); + for (let expression of syllables) { + db.addCard({ + lesson_id: 5, + eid: expression.id, + text: "Syllable", + mnote: "from Sortug Development", + }); + } +} +function adjustFrequency(lang: string) { + const frequencies: Set = new Set(); + const all: any[] = db.fetchExpressionRaw({ lang }); + for (let row of all) { + if (row.frequency) frequencies.add(row.frequency); + } + const freqArray = Array.from(frequencies).sort((a, b) => b - a); + console.log(freqArray); + for (let row of all) { + if (row.frequency) { + const f = freqArray.indexOf(row.frequency); + if (f === -1) throw new Error("wtf" + row.frequency); + db.updateWord(row.id, { frequency: f + 1 }); + } + } +} + +// -- INSERT INTO lessons(name, description) values('8000 Super Useful Expressions', 'David Martins Facebook list of coloquial Thai expressions'); +// -- INSERT INTO lang_lessons(lesson_id, lang) VALUES(1, 'th'); +// -- INSERT INTO lessons(name, description) values('Thai Syllables', 'All syllables in Thai phonology'); +// -- INSERT INTO lang_lessons(lesson_id, lang) VALUES(2, 'th'); +function addThaiUseful() { + let idx = 0; + for (const level in useful) { + db.addCat(level); + const exps = (useful as any)[level]; + console.log(level, exps.length); + for (const exp of exps) { + const split = exp.ipa.split("/").filter((s) => s.trim()); + const ipa = split.map((ip: any) => ({ ipa: ip, tags: [] })); + try { + idx++; + const tx = db.db.transaction(() => { + const wid = db.addWord({ + spelling: exp.spelling, + lang: "th", + type: "expression", + ipa: JSON.stringify(ipa), + }); + console.log({ wid }); + db.addWCat(wid, level); + if (exp.register) { + db.addCat(exp.register); + db.addWCat(wid, exp.register); + } + const glosses = [exp.english]; + if (exp.note) glosses.push(exp.note); + db.addSense({ + parent_id: wid, + spelling: exp.spelling, + senses: JSON.stringify([{ glosses }]), + }); + db.addCard({ + text: `Super Useful ${idx}`, + eid: wid as any, + lesson_id: 1, + }); + }); + tx(); + } catch (e) { + console.log({ exp }); + console.error(`${e}`); + // break; + } + } + } +} + +function addThaiSyllablesLesson() { + const res = db.db + .query( + "SELECT id FROM expressions e WHERE e.type = 'syllable' and e.lang = 'th'", + ) + .all() as any[]; + for (const row of res) { + db.addCard({ text: "Syllable", eid: row.id, lesson_id: 2 }); + } +} +// function fixIpa() { +// const res = db.db.query(`SELECT id, ipa FROM expressions`).all() as any[]; +// for (const row of res) { +// try { +// const jon = JSON.parse(row.ipa); +// } catch (_) { +// const clean: string = row.ipa.replace("...", "").trim(); +// db.db.query(`UPDATE expressions SET ipa = ? WHERE `).run(JSON.stringify(ipa)); +// } +// } +// } +addThaiUseful(); +// addThaiSyllablesLesson(); + +// adjustFrequency("th"); + +// addDecks(); +// fillFromDump(); +// thaiSyllables(); +// thaiFreq(); -- cgit v1.2.3