This commit is contained in:
polwex 2024-10-23 23:54:41 +07:00
parent 9bbb3b3cfa
commit d0bcc2a81c
5 changed files with 218 additions and 99 deletions

View File

@ -6,11 +6,12 @@ PRAGMA mmap_size = 30000000000;
-- Words table -- Words table
-- TODO restore a separate words table?
CREATE TABLE expressions( CREATE TABLE expressions(
id INTEGER PRIMARY KEY AUTOINCREMENT, id INTEGER PRIMARY KEY AUTOINCREMENT,
spelling TEXT NOT NULL, spelling TEXT NOT NULL,
ipa TEXT NOT NULL,
language_id INTEGER NOT NULL, language_id INTEGER NOT NULL,
ipa TEXT,
frequency INTEGER, frequency INTEGER,
type TEXT NOT NULL, type TEXT NOT NULL,
subtype TEXT, subtype TEXT,
@ -95,6 +96,9 @@ INSERT INTO categories (name, part_of_speech_id) VALUES
('nominative', 5), ('nominative', 5),
('accusative', 5), ('accusative', 5),
('genitive', 5), ('genitive', 5),
('interrogative', 5),
-- not really a pronoun but whatever
('determiner', 5),
-- adpositions -- adpositions
('preposition', 6), ('preposition', 6),
('postposition', 6), ('postposition', 6),

View File

@ -27,7 +27,7 @@ export function fetchFrequent(db: Database, count: number, page: number) {
spelling, spelling,
ipa, ipa,
frequency, frequency,
GROUP_CONCAT(c.name, ',') AS category, GROUP_CONCAT(c.name, ',') AS category
FROM expressions e FROM expressions e
JOIN word_categories wc ON wc.word_id = e.id JOIN word_categories wc ON wc.word_id = e.id
JOIN categories c ON c.id = wc.category_id JOIN categories c ON c.id = wc.category_id
@ -37,37 +37,66 @@ export function fetchFrequent(db: Database, count: number, page: number) {
`); `);
return query.get({ count, offset }); return query.get({ count, offset });
} }
/**
 * Looks up the expressions linked to one card through `cards_expressions`.
 *
 * @param db - Database handle the statement is prepared on.
 * @param cid - Card id, bound to the `$cid` placeholder.
 * @returns The rows produced by the statement (`spelling`, `eid`, `ipa`),
 *   ordered by `frequency` descending.
 */
export function fetchExpressionsByCard(db: Database, cid: number) {
  const statement = db.query(`
SELECT
e.spelling, e.id as eid, e.ipa
FROM cards_expressions ce
JOIN expressions e ON ce.expression_id = e.id
WHERE ce.card_id = $cid AND e.spelling IS NOT NULL
ORDER BY e.frequency DESC
`);
  return statement.all({ cid });
}
export function fetchLessons(db: Database, count: number, page: number) { export function fetchLessons(db: Database, count: number, page: number) {
const p = page < 1 ? 1 : page; const p = page < 1 ? 1 : page;
const offset = (p - 1) * count; const offset = (p - 1) * count;
const queryString = `
SELECT
l.id, l.text as ltext, cards.text as ctext, cards.note as cnote, cards.id as cid,
e.spelling, e.ipa, e.frequency, e.id as eid,
GROUP_CONCAT(cg.name, ',') AS category
FROM expressions e
JOIN cards_expressions ce ON e.id = ce.expression_id
JOIN cards ON cards.id = cl.card_id
JOIN cards_lessons cl ON cl.card_id = cards.id
JOIN lessons l ON l.id = cl.lesson_id
JOIN expressions e ON e.id = ce.expression_id
JOIN word_categories wc ON wc.word_id = e.id
JOIN categories cg ON cg.id = wc.category_id
LIMIT $count
OFFSET $offset
`;
// const queryString = ` // const queryString = `
// SELECT // SELECT
// l.id, l.text as ltext, cards.text as ctext, cards.note as cnote, cards.id as cid // l.id, l.text as ltext, cards.text as ctext, cards.note as cnote, cards.id as cid
// FROM cards_lessons cl // FROM cards_lessons cl
// JOIN lessons l ON l.id = cl.lesson_id
// JOIN cards ON cards.id = cl.card_id // JOIN cards ON cards.id = cl.card_id
// JOIN lessons l ON l.id = cl.lesson_id
// LIMIT $count // LIMIT $count
// OFFSET $offset // OFFSET $offset
// `; // `;
const queryString = `
SELECT
l.id AS lesson_id,
l.text AS lesson_text,
c.id AS card_id,
c.text AS card_text,
c.note AS card_note,
e.id AS expression_id,
e.spelling AS expression_spelling,
e.ipa AS expression_ipa,
e.type AS expression_type,
e.subtype AS expression_subtype,
GROUP_CONCAT(cat.name, ', ') AS categories
FROM
lessons l
JOIN
cards_lessons cl ON l.id = cl.lesson_id
JOIN
cards c ON c.id = cl.card_id
JOIN
cards_expressions ce ON c.id = ce.card_id
JOIN
expressions e ON e.id = ce.expression_id
LEFT JOIN
word_categories wc ON wc.word_id = e.id
LEFT JOIN
categories cat ON cat.id = wc.category_id
GROUP BY
l.id, c.id, e.id
ORDER BY
l.id ASC, c.id ASC, e.id ASC
LIMIT $count OFFSET $offset;
`;
const query = db.query(queryString); const query = db.query(queryString);
const res = query.all({ count, offset }); return query.all({ count, offset });
return res;
} }
// SELECT l.id, l.text, cards.text, cards.note FROM cards_lessons cl LEFT JOIN lessons l ON l.id = cl.lesson_id LEFT JOIN cards ON cards.id = cl.card_id ORDER BY l.id ASC LIMIT 20 OFFSET 0; // SELECT l.id, l.text, cards.text, cards.note FROM cards_lessons cl LEFT JOIN lessons l ON l.id = cl.lesson_id LEFT JOIN cards ON cards.id = cl.card_id ORDER BY l.id ASC LIMIT 20 OFFSET 0;
@ -86,7 +115,6 @@ export function fetchLesson(db: Database, lesson: number) {
JOIN categories cg ON cg.id = wc.category_id JOIN categories cg ON cg.id = wc.category_id
WHERE l.id = $lesson WHERE l.id = $lesson
`; `;
console.log(queryString);
const query = db.query(queryString); const query = db.query(queryString);
return query.all({ lesson }); return query.all({ lesson });
} }
@ -137,6 +165,7 @@ export function addCard(
)) ))
`); `);
const wtr = db.transaction((pairs) => { const wtr = db.transaction((pairs) => {
// console.log("adding to ce", { pairs, cid, text });
for (const pair of pairs) wquery.run(pair); for (const pair of pairs) wquery.run(pair);
}); });
const words = text const words = text
@ -145,7 +174,7 @@ export function addCard(
.trim() .trim()
.split(" "); .split(" ");
const combinations = wordFactorial(words); const combinations = wordFactorial(words);
const richWords = combinations.map((spelling) => { const richWords = Array.from(combinations).map((spelling) => {
return { spelling, cid }; return { spelling, cid };
}); });
wtr(richWords); wtr(richWords);
@ -187,7 +216,11 @@ export function addWord(
const res = query.run({ spelling, ipa, language, type, subtype }); const res = query.run({ spelling, ipa, language, type, subtype });
return res.lastInsertRowid; return res.lastInsertRowid;
} }
export function addCat(db: Database, wordId: number | bigint, domain: string) { export function addCat(
db: Database,
wordId: number | bigint,
category: string,
) {
const queryString = ` const queryString = `
INSERT INSERT
INTO word_categories(word_id, category_id) INTO word_categories(word_id, category_id)
@ -196,13 +229,23 @@ export function addCat(db: Database, wordId: number | bigint, domain: string) {
WHERE name = $category WHERE name = $category
)) ))
`; `;
const category = domains[domain] || "unknown";
const query = db.query(queryString); const query = db.query(queryString);
const res = query.run({ wordId, category }); const res = query.run({ wordId, category });
return res.lastInsertRowid; return res.lastInsertRowid;
} }
/**
 * Maps part-of-speech tags to the category name stored for a word.
 * Tags without an entry are left to the caller's fallback.
 * NOTE(review): the keys look like Penn Treebank tags — confirm against the
 * dataset that produces them.
 */
export const poss: Record<string, string> = {
  CC: "conjunction",
  DT: "determiner",
  IN: "preposition",
  // NOTE(review): confirm this spelling matches a row in the `categories` table.
  MD: "auxiliar",
  PRP: "nominative", // TODO oi
  // Previously misspelled "gemitive"; the category in the schema is 'genitive'.
  PRP$: "genitive",
  WDT: "determiner",
  WP: "interrogative",
  WP$: "interrogative",
};
export const domains: Record<string, string> = {
"adj.all": "adjective", "adj.all": "adjective",
"adj.pert": "adjective", "adj.pert": "adjective",
"adj.ppl": "adjective", "adj.ppl": "adjective",
@ -261,5 +304,13 @@ export function addFrequency(
`; `;
const query = db.query(queryString); const query = db.query(queryString);
const res = query.run({ spelling, frequency }); const res = query.run({ spelling, frequency });
console.log(res, "added frequency"); }
export function addIPA(db: Database, spelling: string, ipa: string) {
const queryString = `
UPDATE expressions
SET ipa= $ipa
WHERE expressions.spelling = $spelling
`;
const query = db.query(queryString);
const res = query.run({ spelling, ipa });
} }

View File

@ -1,5 +1,13 @@
import { Database } from "bun:sqlite"; import { Database } from "bun:sqlite";
import { addCard, addCat, addFrequency, addLesson, addWord } from "./db"; import {
addCard,
addCat,
addFrequency,
addLesson,
addWord,
domains,
poss,
} from "./db";
// const db = new Database('../db/data.db'); // const db = new Database('../db/data.db');
const db = new Database("../db/data.db", { strict: true }); const db = new Database("../db/data.db", { strict: true });
@ -8,31 +16,11 @@ db.exec("PRAGMA journal_mode = WAL;");
const SYMBOL_REGEX = new RegExp(/[\W\d]/); const SYMBOL_REGEX = new RegExp(/[\W\d]/);
// async function englishIPA() { async function handleFile(
// const file = Bun.file('ipa/en-us/ipadict.txt'); filename: string,
// const s = file.stream(); func: (line: string, idx: number) => void,
// const reader = s.getReader(); ) {
// const decoder = new TextDecoder(); const file = Bun.file(filename);
// let leftover = '';
// while (true) {
// const { value, done } = await reader.read();
// if (done) break;
// const chunk = decoder.decode(value, { stream: true });
// const lines = (leftover + chunk).split('\n');
// // Process each line except the last (which might be incomplete)
// for (const line of lines.slice(0, -1)) saveLine(line);
// // Save the last incomplete line to process in the next iteration
// leftover = lines[lines.length - 1];
// }
// // Handle any remaining content after reading all chunks
// if (leftover) saveLine(leftover);
// }
async function englishFreq() {
const file = Bun.file("../datasets/unigram_freq.csv");
const s = file.stream(); const s = file.stream();
const reader = s.getReader(); const reader = s.getReader();
const decoder = new TextDecoder(); const decoder = new TextDecoder();
@ -47,8 +35,7 @@ async function englishFreq() {
// Process each line except the last (which might be incomplete) // Process each line except the last (which might be incomplete)
for (const line of lines.slice(0, -1)) { for (const line of lines.slice(0, -1)) {
lineCount++; lineCount++;
const [spelling, _frequency] = line.split(","); func(line, lineCount);
addFrequency(db, spelling, lineCount);
} }
// Save the last incomplete line to process in the next iteration // Save the last incomplete line to process in the next iteration
@ -56,48 +43,115 @@ async function englishFreq() {
} }
// Handle any remaining content after reading all chunks // Handle any remaining content after reading all chunks
if (leftover) addFrequency(db, leftover, lineCount + 1); if (leftover) func(leftover, lineCount + 1);
}
/**
 * Part-of-speech tags accepted by `goodPos`.
 * Built once at module load instead of on every call, since the check runs
 * once per input line.
 * NOTE(review): these look like the closed-class Penn Treebank tags — confirm.
 */
const GOOD_POS_TAGS: ReadonlySet<string> = new Set([
  "CC",
  "DT",
  "EX",
  "IN",
  "LS",
  "MD",
  "PDT",
  "POS",
  "PRP",
  "PRP$",
  "RP",
  "TO",
  "WDT",
  "WP",
  "WP$",
]);

/**
 * Tells whether a part-of-speech tag is on the import allow-list.
 *
 * @param pos - Tag to check; the comparison is exact and case-sensitive.
 * @returns `true` when the tag is one of the accepted tags, `false` otherwise.
 */
function goodPos(pos: string): boolean {
  return GOOD_POS_TAGS.has(pos);
}
/**
 * Imports words from `../datasets/words_pos.csv`.
 *
 * Each line is split on commas; the second and third fields are read as the
 * spelling and the part-of-speech tag. Lines whose tag is rejected by
 * `goodPos` are skipped. Accepted words are added with an empty IPA string,
 * language "en-us" and type "word", then linked to the category that `poss`
 * gives for the tag.
 *
 * @returns A promise that settles when the whole file has been processed, so
 *   callers can wait for the import or observe its failure.
 */
async function englishKaggle(): Promise<void> {
  await handleFile("../datasets/words_pos.csv", (line) => {
    // The first field is not used.
    const [, spelling, pos] = line.split(",");
    // Skip blank/short lines instead of inserting an empty spelling.
    if (!spelling || !goodPos(pos)) return;
    const rowid = addWord(db, spelling, "", "en-us", "word", null);
    // Fallback is "unknown": the previous literal "unknown;" carried a stray
    // semicolon and could not match a category named "unknown".
    const category = poss[pos] ?? "unknown";
    addCat(db, rowid, category);
  });
}
/**
 * Imports spelling/IPA pairs from `ipa/en-us/ipadict.txt`.
 *
 * Each line is split on runs of whitespace; the first two fields are taken as
 * the spelling and its IPA transcription. Lines missing either field, and
 * spellings matching `SYMBOL_REGEX`, are skipped. Everything else is added
 * with language "en-us".
 *
 * @returns A promise that settles when the whole file has been processed.
 *   Previously the function returned before `handleFile` had finished.
 */
async function englishIPA(): Promise<void> {
  await handleFile("ipa/en-us/ipadict.txt", (line) => {
    const [spelling, ipa] = line.split(/\s+/);
    if (!spelling || !ipa) return;
    if (SYMBOL_REGEX.test(spelling)) return;
    // NOTE(review): the spelling was produced by a whitespace split, so it
    // cannot contain a space here and the type is effectively always "word".
    const type = spelling.includes(" ") ? "expression" : "word";
    addWord(db, spelling, ipa, "en-us", type, null);
  });
}
/**
 * Assigns frequency values from `../datasets/unigram_freq.csv`.
 *
 * For every line, the text before the first comma is taken as the spelling
 * and the line number supplied by `handleFile` is stored as its frequency;
 * the remaining columns are ignored.
 * NOTE(review): if the CSV starts with a header row it is processed like any
 * other line — confirm whether it should be skipped.
 *
 * @returns A promise that settles when the whole file has been processed.
 *   Previously the function returned before `handleFile` had finished.
 */
async function englishFreq(): Promise<void> {
  await handleFile("../datasets/unigram_freq.csv", (line, idx) => {
    const [spelling] = line.split(",");
    // Blank lines carry no spelling to update.
    if (!spelling) return;
    addFrequency(db, spelling, idx);
  });
}
// TODO no conjunctions or adpositions in Wordnet!! // TODO no conjunctions or adpositions in Wordnet!!
function englishIPA() { function englishWordnet() {
// LEFT JOIN lexes_pronunciations ukpr ON ukpr.wordid = words.wordid AND uspr.variety = 'GB'
// LEFT JOIN pronunciations ukp ON ukp.pronunciationid = ukpr.pronunciationid
const queryString = ` const queryString = `
SELECT words.wordid, word, pronunciation, domainname FROM words WITH ranked_ipa AS (
JOIN lexes_pronunciations lp ON lp.wordid = words.wordid SELECT
JOIN pronunciations pr ON pr.pronunciationid = lp.pronunciationid lp.wordid,
JOIN senses ON senses.wordid = words.wordid pr.pronunciation,
JOIN synsets ON synsets.synsetid = senses.synsetid lp.variety,
JOIN domains ON domains.domainid = synsets.domainid ROW_NUMBER() OVER (
PARTITION BY lp.wordid
ORDER BY
CASE
WHEN lp.variety = 'US' THEN 1
WHEN lp.variety IS NULL THEN 2
WHEN lp.variety IS 'GB' THEN 3
ELSE 4
END
) AS rank
FROM lexes_pronunciations lp
JOIN pronunciations pr ON pr.pronunciationid = lp.pronunciationid
)
SELECT words.wordid, word, rp.pronunciation as ipa, domainname
FROM words
LEFT JOIN ranked_ipa rp ON rp.wordid = words.wordid AND rp.rank = 1
LEFT JOIN senses ON senses.wordid = words.wordid
LEFT JOIN synsets ON synsets.synsetid = senses.synsetid
LEFT JOIN domains ON domains.domainid = synsets.domainid
GROUP BY words.wordid GROUP BY words.wordid
`; `;
const query = wndb.query(queryString); const query = wndb.query(queryString);
const res: Array<{ const res: Array<{
word: string; word: string;
pronunciation: string; ipa: string;
domainname: string; domainname: string;
}> = query.all() as any; }> = query.all() as any;
console.log("res", res.length);
for (const r of res) { for (const r of res) {
console.log("adding word", r); console.log(r, "r");
// if (r.word === 'abrasive') throw new Error('stop right here'); // if (r.word === 'abrasive') throw new Error('stop right here');
const ok = filterWord(r.word);
if (!ok) continue;
const split = r.word.split(" "); const split = r.word.split(" ");
const type = split.length > 1 ? "expression" : "word"; const type = split.length > 1 ? "expression" : "word";
const subtype = null; const subtype = null;
const wordid = addWord(db, r.word, r.pronunciation, "en-us", type, subtype); const wordid = addWord(db, r.word, r.ipa, "en-us", type, subtype);
addCat(db, wordid, r.domainname); const category = domains[r.domainname] || "unknown;";
addCat(db, wordid, category);
} }
} }
// function saveLine(line: string) { function filterWord(s: string) {
// const [spelling, ipa] = line.split(/\s+/); const hasSymbols = s.match(SYMBOL_REGEX);
// if (!spelling || !ipa) return; if (hasSymbols) return false;
// const hasSymbols = spelling.match(SYMBOL_REGEX); else return true;
// if (hasSymbols) return; }
// const isWord = checkWordNet(spelling);
// console.log(spelling, isWord);
// if (!isWord) return;
// const split = spelling.split(' ');
// const type = split.length > 1 ? 'expression' : 'word';
// const subtype = null;
// addWord(db, spelling, ipa, 'en-us', type, subtype);
// }
// function checkWordNet(word: string) { // function checkWordNet(word: string) {
// const query = wndb.query(`SELECT * FROM words WHERE word = $word`); // const query = wndb.query(`SELECT * FROM words WHERE word = $word`);
@ -137,6 +191,7 @@ function englishCards() {
addCard(db, lesson_id, text); addCard(db, lesson_id, text);
} }
} }
// englishIPA(); // englishWordnet();
// englishFreq(); // englishFreq();
englishCards(); // englishCards();
englishKaggle();

View File

@ -2,6 +2,7 @@ import { Database } from "bun:sqlite";
import { import {
addUser, addUser,
fetchCard, fetchCard,
fetchExpressionsByCard,
fetchLesson, fetchLesson,
fetchLessons, fetchLessons,
fetchResource, fetchResource,
@ -58,7 +59,12 @@ type LessonsType = Record<
{ {
id: number; id: number;
text: string; text: string;
cards: Array<{ text: string; note: string | null; id: number }>; cards: Array<{
text: string;
note: string | null;
id: number;
words: Array<{ spelling: string; ipa: string; category: string }>;
}>;
} }
>; >;
type LessonsDBType = { type LessonsDBType = {
@ -85,17 +91,20 @@ function handleGetLessons(user: number, url: URL) {
const page = params.get("page") || "0"; const page = params.get("page") || "0";
const data: LessonsDBType[] = fetchLessons(db, 20, Number(page)) as any; const data: LessonsDBType[] = fetchLessons(db, 20, Number(page)) as any;
console.log(data, "fetchlessons"); console.log(data, "fetchlessons");
const lessons = data.reduce((acc: LessonsType, item: LessonsDBType) => { console.log(data.length);
let cur = acc[item.id] || { id: item.id, text: item.ltext, cards: [] }; // const lessons = data.reduce((acc: LessonsType, item: LessonsDBType) => {
const cards = [ // let cur = acc[item.id] || { id: item.id, text: item.ltext, cards: [] };
...cur.cards, // const words = fetchExpressionsByCard(db, item.cid) as any[];
{ text: item.ctext, note: item.cnote, id: item.cid }, // console.log(words, item.cid);
]; // const cards = [
const def = { ...cur, cards }; // ...cur.cards,
return { ...acc, [item.id]: def }; // { text: item.ctext, note: item.cnote, id: item.cid, words },
}, {} as LessonsType); // ];
console.log(lessons, "lesons"); // const def = { ...cur, cards };
return Response.json({ ok: lessons }); // return { ...acc, [item.id]: def };
// }, {} as LessonsType);
// return Response.json({ ok: lessons });
return Response.json({ ok: data });
} }
async function handlePost(req: Request, user: number, url: URL) { async function handlePost(req: Request, user: number, url: URL) {

View File

@ -1,10 +1,10 @@
export function wordFactorial(words: string[]): string[] { export function wordFactorial(words: string[]): Set<string> {
const combinations: string[] = []; const combinations: Set<string> = new Set([]);
for (let i = 0; i < words.length; i++) { for (let i = 0; i < words.length; i++) {
let inner = ''; let inner = "";
for (let ii = i; ii < words.length; ii++) { for (let ii = i; ii < words.length; ii++) {
inner += (ii > i ? ' ' : '') + words[ii]; inner += (ii > i ? " " : "") + words[ii].toLowerCase();
combinations.push(inner); combinations.add(inner);
} }
} }
return combinations; return combinations;