From d0bcc2a81c9a9a3ead837b6872af23a8c65cef04 Mon Sep 17 00:00:00 2001 From: polwex Date: Wed, 23 Oct 2024 23:54:41 +0700 Subject: [PATCH] m --- schema.sql | 6 +- server/db.ts | 103 +++++++++++++++++++++-------- server/seeding.ts | 165 ++++++++++++++++++++++++++++++---------------- server/server.ts | 33 ++++++---- server/utils.ts | 10 +-- 5 files changed, 218 insertions(+), 99 deletions(-) diff --git a/schema.sql b/schema.sql index 3a8d38d..112ffa0 100644 --- a/schema.sql +++ b/schema.sql @@ -6,11 +6,12 @@ PRAGMA mmap_size = 30000000000; -- Words table +-- TODO restore a separate words table? CREATE TABLE expressions( id INTEGER PRIMARY KEY AUTOINCREMENT, spelling TEXT NOT NULL, - ipa TEXT NOT NULL, language_id INTEGER NOT NULL, + ipa TEXT, frequency INTEGER, type TEXT NOT NULL, subtype TEXT, @@ -95,6 +96,9 @@ INSERT INTO categories (name, part_of_speech_id) VALUES ('nominative', 5), ('accusative', 5), ('genitive', 5), +('interrogative', 5), +-- not really a pronoun but whatever +('determiner', 5), -- adpositions ('preposition', 6), ('postposition', 6), diff --git a/server/db.ts b/server/db.ts index ce7645d..82c76a4 100644 --- a/server/db.ts +++ b/server/db.ts @@ -27,7 +27,7 @@ export function fetchFrequent(db: Database, count: number, page: number) { spelling, ipa, frequency, - GROUP_CONCAT(c.name, ',') AS category, + GROUP_CONCAT(c.name, ',') AS category FROM expressions e JOIN word_categories wc ON wc.word_id = e.id JOIN categories c ON c.id = wc.category_id @@ -37,37 +37,66 @@ export function fetchFrequent(db: Database, count: number, page: number) { `); return query.get({ count, offset }); } + +export function fetchExpressionsByCard(db: Database, cid: number) { + const queryString = ` + SELECT + e.spelling, e.id as eid, e.ipa + FROM cards_expressions ce + JOIN expressions e ON ce.expression_id = e.id + WHERE ce.card_id = $cid AND e.spelling IS NOT NULL + ORDER BY e.frequency DESC + `; + const query = db.query(queryString); + return query.all({ cid }); +} export function fetchLessons(db: Database, count: number, page: number) { const p = page < 1 ? 1 : page; const offset = (p - 1) * count; - const queryString = ` - SELECT - l.id, l.text as ltext, cards.text as ctext, cards.note as cnote, cards.id as cid, - e.spelling, e.ipa, e.frequency, e.id as eid, - GROUP_CONCAT(cg.name, ',') AS category - FROM expressions e - JOIN cards_expressions ce ON e.id = ce.expression_id - JOIN cards ON cards.id = cl.card_id - JOIN cards_lessons cl ON cl.card_id = cards.id - JOIN lessons l ON l.id = cl.lesson_id - JOIN expressions e ON e.id = ce.expression_id - JOIN word_categories wc ON wc.word_id = e.id - JOIN categories cg ON cg.id = wc.category_id - LIMIT $count - OFFSET $offset - `; // const queryString = ` // SELECT // l.id, l.text as ltext, cards.text as ctext, cards.note as cnote, cards.id as cid // FROM cards_lessons cl - // JOIN lessons l ON l.id = cl.lesson_id // JOIN cards ON cards.id = cl.card_id + // JOIN lessons l ON l.id = cl.lesson_id // LIMIT $count // OFFSET $offset // `; + const queryString = ` + SELECT + l.id AS lesson_id, + l.text AS lesson_text, + c.id AS card_id, + c.text AS card_text, + c.note AS card_note, + e.id AS expression_id, + e.spelling AS expression_spelling, + e.ipa AS expression_ipa, + e.type AS expression_type, + e.subtype AS expression_subtype, + GROUP_CONCAT(cat.name, ', ') AS categories + FROM + lessons l + JOIN + cards_lessons cl ON l.id = cl.lesson_id + JOIN + cards c ON c.id = cl.card_id + JOIN + cards_expressions ce ON c.id = ce.card_id + JOIN + expressions e ON e.id = ce.expression_id + LEFT JOIN + word_categories wc ON wc.word_id = e.id + LEFT JOIN + categories cat ON cat.id = wc.category_id + GROUP BY + l.id, c.id, e.id + ORDER BY + l.id ASC, c.id ASC, e.id ASC + LIMIT $count OFFSET $offset; + `; const query = db.query(queryString); - const res = query.all({ count, offset }); - return res; + return query.all({ count, offset }); } // SELECT l.id, l.text, cards.text, cards.note FROM cards_lessons cl LEFT JOIN lessons l ON l.id = cl.lesson_id LEFT JOIN cards ON cards.id = cl.card_id ORDER BY l.id ASC LIMIT 20 OFFSET 0; @@ -86,7 +115,6 @@ export function fetchLesson(db: Database, lesson: number) { JOIN categories cg ON cg.id = wc.category_id WHERE l.id = $lesson `; - console.log(queryString); const query = db.query(queryString); return query.all({ lesson }); } @@ -137,6 +165,7 @@ export function addCard( )) `); const wtr = db.transaction((pairs) => { + // console.log("adding to ce", { pairs, cid, text }); for (const pair of pairs) wquery.run(pair); }); const words = text @@ -145,7 +174,7 @@ export function addCard( .trim() .split(" "); const combinations = wordFactorial(words); - const richWords = combinations.map((spelling) => { + const richWords = Array.from(combinations).map((spelling) => { return { spelling, cid }; }); wtr(richWords); @@ -187,7 +216,11 @@ export function addWord( const res = query.run({ spelling, ipa, language, type, subtype }); return res.lastInsertRowid; } -export function addCat(db: Database, wordId: number | bigint, domain: string) { +export function addCat( + db: Database, + wordId: number | bigint, + category: string, +) { const queryString = ` INSERT INTO word_categories(word_id, category_id) @@ -196,13 +229,23 @@ export function addCat(db: Database, wordId: number | bigint, domain: string) { WHERE name = $category )) `; - const category = domains[domain] || "unknown"; const query = db.query(queryString); const res = query.run({ wordId, category }); return res.lastInsertRowid; } -const domains: Record = { +export const poss: Record = { + CC: "conjunction", + DT: "determiner", + IN: "preposition", + MD: "auxiliar", + PRP: "nominative", // TODO oi + PRP$: "gemitive", + WDT: "determiner", + WP: "interrogative", + WP$: "interrogative", +}; +export const domains: Record = { "adj.all": "adjective", "adj.pert": "adjective", "adj.ppl": "adjective", @@ -261,5 +304,13 @@ export function addFrequency( `; const query = db.query(queryString); const res = query.run({ spelling, frequency }); - console.log(res, "added frequency"); +} +export function addIPA(db: Database, spelling: string, ipa: string) { + const queryString = ` + UPDATE expressions + SET ipa= $ipa + WHERE expressions.spelling = $spelling + `; + const query = db.query(queryString); + const res = query.run({ spelling, ipa }); } diff --git a/server/seeding.ts b/server/seeding.ts index 38bb881..593c806 100644 --- a/server/seeding.ts +++ b/server/seeding.ts @@ -1,5 +1,13 @@ import { Database } from "bun:sqlite"; -import { addCard, addCat, addFrequency, addLesson, addWord } from "./db"; +import { + addCard, + addCat, + addFrequency, + addLesson, + addWord, + domains, + poss, +} from "./db"; // const db = new Database('../db/data.db'); const db = new Database("../db/data.db", { strict: true }); @@ -8,31 +16,11 @@ db.exec("PRAGMA journal_mode = WAL;"); const SYMBOL_REGEX = new RegExp(/[\W\d]/); -// async function englishIPA() { -// const file = Bun.file('ipa/en-us/ipadict.txt'); -// const s = file.stream(); -// const reader = s.getReader(); -// const decoder = new TextDecoder(); -// let leftover = ''; -// while (true) { -// const { value, done } = await reader.read(); -// if (done) break; -// const chunk = decoder.decode(value, { stream: true }); -// const lines = (leftover + chunk).split('\n'); - -// // Process each line except the last (which might be incomplete) -// for (const line of lines.slice(0, -1)) saveLine(line); - -// // Save the last incomplete line to process in the next iteration -// leftover = lines[lines.length - 1]; -// } - -// // Handle any remaining content after reading all chunks -// if (leftover) saveLine(leftover); -// } - -async function englishFreq() { - const file = Bun.file("../datasets/unigram_freq.csv"); +async function handleFile( + filename: string, + func: (line: string, idx: number) => void, +) { + const file = Bun.file(filename); const s = file.stream(); const reader = s.getReader(); const decoder = new TextDecoder(); @@ -47,8 +35,7 @@ async function englishFreq() { // Process each line except the last (which might be incomplete) for (const line of lines.slice(0, -1)) { lineCount++; - const [spelling, _frequency] = line.split(","); - addFrequency(db, spelling, lineCount); + func(line, lineCount); } // Save the last incomplete line to process in the next iteration @@ -56,48 +43,115 @@ async function englishFreq() { } // Handle any remaining content after reading all chunks - if (leftover) addFrequency(db, leftover, lineCount + 1); + if (leftover) func(leftover, lineCount + 1); +} + +function goodPos(pos: string): boolean { + const list = [ + "CC", + "DT", + "EX", + "IN", + "LS", + "MD", + "PDT", + "POS", + "PRP", + "PRP$", + "RP", + "TO", + "WDT", + "WP", + "WP$", + ]; + return list.includes(pos); +} +function englishKaggle() { + handleFile("../datasets/words_pos.csv", (line, idx) => { + const [_, spelling, pos] = line.split(","); + if (!goodPos(pos)) return; + const rowid = addWord(db, spelling, "", "en-us", "word", null); + const category = poss[pos] || "unknown;"; + addCat(db, rowid, category); + }); +} +async function englishIPA() { + handleFile("ipa/en-us/ipadict.txt", (line, idx) => { + const [spelling, ipa] = line.split(/\s+/); + if (!spelling || !ipa) return; + const hasSymbols = spelling.match(SYMBOL_REGEX); + if (hasSymbols) return; + const split = spelling.split(" "); + const type = split.length > 1 ? "expression" : "word"; + const subtype = null; + addWord(db, spelling, ipa, "en-us", type, subtype); + }); +} + +async function englishFreq() { + handleFile("../datasets/unigram_freq.csv", (line, idx) => { + const [spelling, _frequency] = line.split(","); + addFrequency(db, spelling, idx); + }); + + // Save the last incomplete line to process in the next iteration } // TODO no conjunctions or adpositions in Wordnet!! -function englishIPA() { +function englishWordnet() { + // LEFT JOIN lexes_pronunciations ukpr ON ukpr.wordid = words.wordid AND uspr.variety = 'GB' + // LEFT JOIN pronunciations ukp ON ukp.pronunciationid = ukpr.pronunciationid const queryString = ` - SELECT words.wordid, word, pronunciation, domainname FROM words - JOIN lexes_pronunciations lp ON lp.wordid = words.wordid - JOIN pronunciations pr ON pr.pronunciationid = lp.pronunciationid - JOIN senses ON senses.wordid = words.wordid - JOIN synsets ON synsets.synsetid = senses.synsetid - JOIN domains ON domains.domainid = synsets.domainid + WITH ranked_ipa AS ( + SELECT + lp.wordid, + pr.pronunciation, + lp.variety, + ROW_NUMBER() OVER ( + PARTITION BY lp.wordid + ORDER BY + CASE + WHEN lp.variety = 'US' THEN 1 + WHEN lp.variety IS NULL THEN 2 + WHEN lp.variety IS 'GB' THEN 3 + ELSE 4 + END + ) AS rank + FROM lexes_pronunciations lp + JOIN pronunciations pr ON pr.pronunciationid = lp.pronunciationid + ) + SELECT words.wordid, word, rp.pronunciation as ipa, domainname + FROM words + LEFT JOIN ranked_ipa rp ON rp.wordid = words.wordid AND rp.rank = 1 + LEFT JOIN senses ON senses.wordid = words.wordid + LEFT JOIN synsets ON synsets.synsetid = senses.synsetid + LEFT JOIN domains ON domains.domainid = synsets.domainid GROUP BY words.wordid `; const query = wndb.query(queryString); const res: Array<{ word: string; - pronunciation: string; + ipa: string; domainname: string; }> = query.all() as any; + console.log("res", res.length); for (const r of res) { - console.log("adding word", r); + console.log(r, "r"); // if (r.word === 'abrasive') throw new Error('stop right here'); + const ok = filterWord(r.word); + if (!ok) continue; const split = r.word.split(" "); const type = split.length > 1 ? "expression" : "word"; const subtype = null; - const wordid = addWord(db, r.word, r.pronunciation, "en-us", type, subtype); - addCat(db, wordid, r.domainname); + const wordid = addWord(db, r.word, r.ipa, "en-us", type, subtype); + const category = domains[r.domainname] || "unknown;"; + addCat(db, wordid, category); } } -// function saveLine(line: string) { -// const [spelling, ipa] = line.split(/\s+/); -// if (!spelling || !ipa) return; -// const hasSymbols = spelling.match(SYMBOL_REGEX); -// if (hasSymbols) return; -// const isWord = checkWordNet(spelling); -// console.log(spelling, isWord); -// if (!isWord) return; -// const split = spelling.split(' '); -// const type = split.length > 1 ? 'expression' : 'word'; -// const subtype = null; -// addWord(db, spelling, ipa, 'en-us', type, subtype); -// } +function filterWord(s: string) { + const hasSymbols = s.match(SYMBOL_REGEX); + if (hasSymbols) return false; + else return true; +} // function checkWordNet(word: string) { // const query = wndb.query(`SELECT * FROM words WHERE word = $word`); @@ -137,6 +191,7 @@ function englishCards() { addCard(db, lesson_id, text); } } -// englishIPA(); +// englishWordnet(); // englishFreq(); -englishCards(); +// englishCards(); +englishKaggle(); diff --git a/server/server.ts b/server/server.ts index 8582117..38c1e2d 100644 --- a/server/server.ts +++ b/server/server.ts @@ -2,6 +2,7 @@ import { Database } from "bun:sqlite"; import { addUser, fetchCard, + fetchExpressionsByCard, fetchLesson, fetchLessons, fetchResource, @@ -58,7 +59,12 @@ type LessonsType = Record< { id: number; text: string; - cards: Array<{ text: string; note: string | null; id: number }>; + cards: Array<{ + text: string; + note: string | null; + id: number; + words: Array<{ spelling: string; ipa: string; category: string }>; + }>; } >; type LessonsDBType = { @@ -85,17 +91,20 @@ function handleGetLessons(user: number, url: URL) { const page = params.get("page") || "0"; const data: LessonsDBType[] = fetchLessons(db, 20, Number(page)) as any; console.log(data, "fetchlessons"); - const lessons = data.reduce((acc: LessonsType, item: LessonsDBType) => { - let cur = acc[item.id] || { id: item.id, text: item.ltext, cards: [] }; - const cards = [ - ...cur.cards, - { text: item.ctext, note: item.cnote, id: item.cid }, - ]; - const def = { ...cur, cards }; - return { ...acc, [item.id]: def }; - }, {} as LessonsType); - console.log(lessons, "lesons"); - return Response.json({ ok: lessons }); + console.log(data.length); + // const lessons = data.reduce((acc: LessonsType, item: LessonsDBType) => { + // let cur = acc[item.id] || { id: item.id, text: item.ltext, cards: [] }; + // const words = fetchExpressionsByCard(db, item.cid) as any[]; + // console.log(words, item.cid); + // const cards = [ + // ...cur.cards, + // { text: item.ctext, note: item.cnote, id: item.cid, words }, + // ]; + // const def = { ...cur, cards }; + // return { ...acc, [item.id]: def }; + // }, {} as LessonsType); + // return Response.json({ ok: lessons }); + return Response.json({ ok: data }); } async function handlePost(req: Request, user: number, url: URL) { diff --git a/server/utils.ts b/server/utils.ts index 07a9872..e55a9a9 100644 --- a/server/utils.ts +++ b/server/utils.ts @@ -1,10 +1,10 @@ -export function wordFactorial(words: string[]): string[] { - const combinations: string[] = []; +export function wordFactorial(words: string[]): Set { + const combinations: Set = new Set([]); for (let i = 0; i < words.length; i++) { - let inner = ''; + let inner = ""; for (let ii = i; ii < words.length; ii++) { - inner += (ii > i ? ' ' : '') + words[ii]; - combinations.push(inner); + inner += (ii > i ? " " : "") + words[ii].toLowerCase(); + combinations.add(inner); } } return combinations;