This commit is contained in:
polwex 2024-10-23 23:54:41 +07:00
parent 9bbb3b3cfa
commit d0bcc2a81c
5 changed files with 218 additions and 99 deletions

View File

@ -6,11 +6,12 @@ PRAGMA mmap_size = 30000000000;
-- Words table
-- TODO restore a separate words table?
CREATE TABLE expressions(
id INTEGER PRIMARY KEY AUTOINCREMENT,
spelling TEXT NOT NULL,
ipa TEXT NOT NULL,
language_id INTEGER NOT NULL,
ipa TEXT,
frequency INTEGER,
type TEXT NOT NULL,
subtype TEXT,
@ -95,6 +96,9 @@ INSERT INTO categories (name, part_of_speech_id) VALUES
('nominative', 5),
('accusative', 5),
('genitive', 5),
('interrogative', 5),
-- not really a pronoun but whatever
('determiner', 5),
-- adpositions
('preposition', 6),
('postposition', 6),

View File

@ -27,7 +27,7 @@ export function fetchFrequent(db: Database, count: number, page: number) {
spelling,
ipa,
frequency,
GROUP_CONCAT(c.name, ',') AS category,
GROUP_CONCAT(c.name, ',') AS category
FROM expressions e
JOIN word_categories wc ON wc.word_id = e.id
JOIN categories c ON c.id = wc.category_id
@ -37,37 +37,66 @@ export function fetchFrequent(db: Database, count: number, page: number) {
`);
return query.get({ count, offset });
}
/**
 * Looks up the expressions attached to a single card.
 *
 * Joins `cards_expressions` to `expressions`, keeps rows whose spelling is
 * not NULL, and orders them by the `frequency` column in descending order.
 *
 * @param db - Database handle the statement is prepared on.
 * @param cid - Id of the card whose expressions are wanted.
 * @returns Whatever `.all()` yields: one row per linked expression carrying
 *   `spelling`, `eid` (the expression id) and `ipa`.
 */
export function fetchExpressionsByCard(db: Database, cid: number) {
  const sql = `
SELECT
e.spelling, e.id as eid, e.ipa
FROM cards_expressions ce
JOIN expressions e ON ce.expression_id = e.id
WHERE ce.card_id = $cid AND e.spelling IS NOT NULL
ORDER BY e.frequency DESC
`;
  const params = { cid };
  return db.query(sql).all(params);
}
/**
 * Fetches one page of lesson data, flattened to one row per
 * (lesson, card, expression) combination.
 *
 * Each row carries `lesson_id`, `lesson_text`, `card_id`, `card_text`,
 * `card_note`, `expression_id`, `expression_spelling`, `expression_ipa`,
 * `expression_type`, `expression_subtype` and `categories` (the expression's
 * category names joined with ", "; NULL when it has none, because the
 * category tables are LEFT JOINed).
 *
 * Note that `LIMIT`/`OFFSET` are applied to these flattened rows, not to
 * lessons, so a page may end in the middle of a lesson.
 *
 * @param db - Database handle the statement is prepared on.
 * @param count - Maximum number of rows to return (page size).
 * @param page - 1-based page number; values below 1 or non-finite values
 *   (for example `Number("abc")`) select the first page.
 * @returns The rows produced by `.all()` for the requested page.
 */
export function fetchLessons(db: Database, count: number, page: number) {
  // Clamp to a whole page number >= 1 so the OFFSET bound to the statement is
  // never negative, fractional or NaN.
  const p = Number.isFinite(page) && page >= 1 ? Math.floor(page) : 1;
  const offset = (p - 1) * count;
  const queryString = `
    SELECT
      l.id AS lesson_id,
      l.text AS lesson_text,
      c.id AS card_id,
      c.text AS card_text,
      c.note AS card_note,
      e.id AS expression_id,
      e.spelling AS expression_spelling,
      e.ipa AS expression_ipa,
      e.type AS expression_type,
      e.subtype AS expression_subtype,
      GROUP_CONCAT(cat.name, ', ') AS categories
    FROM
      lessons l
    JOIN
      cards_lessons cl ON l.id = cl.lesson_id
    JOIN
      cards c ON c.id = cl.card_id
    JOIN
      cards_expressions ce ON c.id = ce.card_id
    JOIN
      expressions e ON e.id = ce.expression_id
    LEFT JOIN
      word_categories wc ON wc.word_id = e.id
    LEFT JOIN
      categories cat ON cat.id = wc.category_id
    GROUP BY
      l.id, c.id, e.id
    ORDER BY
      l.id ASC, c.id ASC, e.id ASC
    LIMIT $count OFFSET $offset;
  `;
  const query = db.query(queryString);
  return query.all({ count, offset });
}
// SELECT l.id, l.text, cards.text, cards.note FROM cards_lessons cl LEFT JOIN lessons l ON l.id = cl.lesson_id LEFT JOIN cards ON cards.id = cl.card_id ORDER BY l.id ASC LIMIT 20 OFFSET 0;
@ -86,7 +115,6 @@ export function fetchLesson(db: Database, lesson: number) {
JOIN categories cg ON cg.id = wc.category_id
WHERE l.id = $lesson
`;
console.log(queryString);
const query = db.query(queryString);
return query.all({ lesson });
}
@ -137,6 +165,7 @@ export function addCard(
))
`);
const wtr = db.transaction((pairs) => {
// console.log("adding to ce", { pairs, cid, text });
for (const pair of pairs) wquery.run(pair);
});
const words = text
@ -145,7 +174,7 @@ export function addCard(
.trim()
.split(" ");
const combinations = wordFactorial(words);
const richWords = combinations.map((spelling) => {
const richWords = Array.from(combinations).map((spelling) => {
return { spelling, cid };
});
wtr(richWords);
@ -187,7 +216,11 @@ export function addWord(
const res = query.run({ spelling, ipa, language, type, subtype });
return res.lastInsertRowid;
}
export function addCat(db: Database, wordId: number | bigint, domain: string) {
export function addCat(
db: Database,
wordId: number | bigint,
category: string,
) {
const queryString = `
INSERT
INTO word_categories(word_id, category_id)
@ -196,13 +229,23 @@ export function addCat(db: Database, wordId: number | bigint, domain: string) {
WHERE name = $category
))
`;
const category = domains[domain] || "unknown";
const query = db.query(queryString);
const res = query.run({ wordId, category });
return res.lastInsertRowid;
}
const domains: Record<string, string> = {
/**
 * Maps part-of-speech tags (they look like Penn Treebank tags; confirm
 * against the dataset) to the category names stored in the `categories`
 * table. The values are looked up by name when a word is categorised, so each
 * one must match a seeded category name exactly.
 */
export const poss: Record<string, string> = {
  CC: "conjunction",
  DT: "determiner",
  IN: "preposition",
  // NOTE(review): spelled "auxiliar" here; confirm this matches the seeded
  // category name (it may be meant to be "auxiliary").
  MD: "auxiliar",
  PRP: "nominative", // TODO (carried over from the original): revisit this mapping
  // Was "gemitive", which matches no category; the seeded name is "genitive".
  PRP$: "genitive",
  WDT: "determiner",
  WP: "interrogative",
  WP$: "interrogative",
};
export const domains: Record<string, string> = {
"adj.all": "adjective",
"adj.pert": "adjective",
"adj.ppl": "adjective",
@ -261,5 +304,13 @@ export function addFrequency(
`;
const query = db.query(queryString);
const res = query.run({ spelling, frequency });
console.log(res, "added frequency");
}
/**
 * Writes an IPA transcription onto every `expressions` row with the given
 * spelling.
 *
 * The match is on `spelling` alone, so all rows sharing that spelling are
 * updated.
 *
 * @param db - Database handle the statement is prepared on.
 * @param spelling - Spelling used to select the rows to update.
 * @param ipa - Transcription stored in the `ipa` column.
 */
export function addIPA(db: Database, spelling: string, ipa: string) {
  const statement = db.query(`
UPDATE expressions
SET ipa= $ipa
WHERE expressions.spelling = $spelling
`);
  statement.run({ spelling, ipa });
}

View File

@ -1,5 +1,13 @@
import { Database } from "bun:sqlite";
import { addCard, addCat, addFrequency, addLesson, addWord } from "./db";
import {
addCard,
addCat,
addFrequency,
addLesson,
addWord,
domains,
poss,
} from "./db";
// const db = new Database('../db/data.db');
const db = new Database("../db/data.db", { strict: true });
@ -8,31 +16,11 @@ db.exec("PRAGMA journal_mode = WAL;");
const SYMBOL_REGEX = new RegExp(/[\W\d]/);
// async function englishIPA() {
// const file = Bun.file('ipa/en-us/ipadict.txt');
// const s = file.stream();
// const reader = s.getReader();
// const decoder = new TextDecoder();
// let leftover = '';
// while (true) {
// const { value, done } = await reader.read();
// if (done) break;
// const chunk = decoder.decode(value, { stream: true });
// const lines = (leftover + chunk).split('\n');
// // Process each line except the last (which might be incomplete)
// for (const line of lines.slice(0, -1)) saveLine(line);
// // Save the last incomplete line to process in the next iteration
// leftover = lines[lines.length - 1];
// }
// // Handle any remaining content after reading all chunks
// if (leftover) saveLine(leftover);
// }
async function englishFreq() {
const file = Bun.file("../datasets/unigram_freq.csv");
async function handleFile(
filename: string,
func: (line: string, idx: number) => void,
) {
const file = Bun.file(filename);
const s = file.stream();
const reader = s.getReader();
const decoder = new TextDecoder();
@ -47,8 +35,7 @@ async function englishFreq() {
// Process each line except the last (which might be incomplete)
for (const line of lines.slice(0, -1)) {
lineCount++;
const [spelling, _frequency] = line.split(",");
addFrequency(db, spelling, lineCount);
func(line, lineCount);
}
// Save the last incomplete line to process in the next iteration
@ -56,48 +43,115 @@ async function englishFreq() {
}
// Handle any remaining content after reading all chunks
if (leftover) addFrequency(db, leftover, lineCount + 1);
if (leftover) func(leftover, lineCount + 1);
}
/**
 * Part-of-speech tags accepted by `goodPos`. Built once at module load
 * instead of on every call, since `goodPos` runs once per imported line.
 */
const GOOD_POS_TAGS: ReadonlySet<string> = new Set([
  "CC",
  "DT",
  "EX",
  "IN",
  "LS",
  "MD",
  "PDT",
  "POS",
  "PRP",
  "PRP$",
  "RP",
  "TO",
  "WDT",
  "WP",
  "WP$",
]);

/**
 * Tells whether a part-of-speech tag is one of the tags worth importing.
 *
 * @param pos - Tag to check; the comparison is exact and case-sensitive.
 * @returns `true` when `pos` is in the accepted list, `false` otherwise.
 */
function goodPos(pos: string): boolean {
  return GOOD_POS_TAGS.has(pos);
}
/**
 * Imports words from `../datasets/words_pos.csv`, keeping only rows whose
 * part-of-speech tag passes `goodPos`.
 *
 * Each line is split on commas; the second field is the spelling and the
 * third the tag. Accepted rows are inserted with `addWord` (empty IPA,
 * language "en-us", type "word") and linked to a category with `addCat`.
 *
 * @returns A promise that settles once `handleFile` has processed the file,
 *   so callers can await the import or observe its failure.
 */
async function englishKaggle(): Promise<void> {
  await handleFile("../datasets/words_pos.csv", (line) => {
    const [, spelling, rawPos] = line.split(",");
    // Trim so a trailing "\r" from CRLF line endings does not hide a valid tag.
    const pos = rawPos?.trim();
    if (!spelling || !pos || !goodPos(pos)) return;
    const rowid = addWord(db, spelling, "", "en-us", "word", null);
    // Fallback was "unknown;" (stray semicolon), which cannot match a category
    // name. NOTE(review): presumably a category named "unknown" exists; verify.
    const category = poss[pos] ?? "unknown";
    addCat(db, rowid, category);
  });
}
/**
 * Imports spelling/IPA pairs from `ipa/en-us/ipadict.txt`.
 *
 * Each line is split on runs of whitespace; the first token is the spelling
 * and the second the transcription. Lines missing either token, or whose
 * spelling matches `SYMBOL_REGEX`, are skipped. The rest are inserted with
 * `addWord` for language "en-us".
 *
 * @returns A promise that settles once `handleFile` has processed the file.
 *   Previously the promise was dropped, so this function resolved before the
 *   import had run.
 */
async function englishIPA(): Promise<void> {
  await handleFile("ipa/en-us/ipadict.txt", (line) => {
    const [spelling, ipa] = line.split(/\s+/);
    if (!spelling || !ipa) return;
    if (SYMBOL_REGEX.test(spelling)) return;
    // The spelling comes from a whitespace split, so it is always a single
    // token; the old multi-word "expression" branch could never be taken.
    addWord(db, spelling, ipa, "en-us", "word", null);
  });
}
/**
 * Imports word frequencies from `../datasets/unigram_freq.csv`.
 *
 * Only the first comma-separated field (the spelling) is read. The value
 * stored through `addFrequency` is the line index supplied by `handleFile`,
 * not the count column of the file.
 *
 * @returns A promise that settles once `handleFile` has processed the file.
 *   Previously the promise was dropped, so this function resolved before the
 *   import had run.
 */
async function englishFreq(): Promise<void> {
  await handleFile("../datasets/unigram_freq.csv", (line, idx) => {
    const [spelling] = line.split(",");
    addFrequency(db, spelling, idx);
  });
}
// TODO no conjunctions or adpositions in Wordnet!!
/**
 * Imports words from the WordNet database (`wndb`) into the application
 * database.
 *
 * For every WordNet word the query picks a single pronunciation, preferring
 * variety 'US', then rows with no variety, then 'GB', then anything else.
 * Words that fail `filterWord` are skipped. The rest are inserted with
 * `addWord` (type "expression" when the word contains a space, otherwise
 * "word") and linked with `addCat` to the category mapped from their WordNet
 * domain name.
 *
 * NOTE(review): pronunciations, senses and domains are LEFT JOINed, so `ipa`
 * and `domainname` can be NULL at runtime even though they are typed as
 * strings here; confirm `addWord` accepts a null IPA.
 */
function englishWordnet() {
  const queryString = `
    WITH ranked_ipa AS (
      SELECT
        lp.wordid,
        pr.pronunciation,
        lp.variety,
        ROW_NUMBER() OVER (
          PARTITION BY lp.wordid
          ORDER BY
            CASE
              WHEN lp.variety = 'US' THEN 1
              WHEN lp.variety IS NULL THEN 2
              WHEN lp.variety = 'GB' THEN 3
              ELSE 4
            END
        ) AS rank
      FROM lexes_pronunciations lp
      JOIN pronunciations pr ON pr.pronunciationid = lp.pronunciationid
    )
    SELECT words.wordid, word, rp.pronunciation as ipa, domainname
    FROM words
    LEFT JOIN ranked_ipa rp ON rp.wordid = words.wordid AND rp.rank = 1
    LEFT JOIN senses ON senses.wordid = words.wordid
    LEFT JOIN synsets ON synsets.synsetid = senses.synsetid
    LEFT JOIN domains ON domains.domainid = synsets.domainid
    GROUP BY words.wordid
  `;
  const query = wndb.query(queryString);
  const res = query.all() as Array<{
    word: string;
    ipa: string;
    domainname: string;
  }>;
  console.log("res", res.length);
  for (const r of res) {
    if (!filterWord(r.word)) continue;
    const type = r.word.includes(" ") ? "expression" : "word";
    const subtype = null;
    const wordid = addWord(db, r.word, r.ipa, "en-us", type, subtype);
    // Fallback was "unknown;" (stray semicolon), which cannot match a category
    // name. NOTE(review): presumably a category named "unknown" exists; verify.
    const category = domains[r.domainname] || "unknown";
    addCat(db, wordid, category);
  }
}
// function saveLine(line: string) {
// const [spelling, ipa] = line.split(/\s+/);
// if (!spelling || !ipa) return;
// const hasSymbols = spelling.match(SYMBOL_REGEX);
// if (hasSymbols) return;
// const isWord = checkWordNet(spelling);
// console.log(spelling, isWord);
// if (!isWord) return;
// const split = spelling.split(' ');
// const type = split.length > 1 ? 'expression' : 'word';
// const subtype = null;
// addWord(db, spelling, ipa, 'en-us', type, subtype);
// }
/**
 * Decides whether a word is clean enough to import.
 *
 * @param s - Candidate word or expression.
 * @returns `false` when `s` contains anything matched by `SYMBOL_REGEX`
 *   (a non-word character or a digit), `true` otherwise.
 */
function filterWord(s: string) {
  return s.match(SYMBOL_REGEX) === null;
}
// function checkWordNet(word: string) {
// const query = wndb.query(`SELECT * FROM words WHERE word = $word`);
@ -137,6 +191,7 @@ function englishCards() {
addCard(db, lesson_id, text);
}
}
// englishIPA();
// englishWordnet();
// englishFreq();
englishCards();
// englishCards();
englishKaggle();

View File

@ -2,6 +2,7 @@ import { Database } from "bun:sqlite";
import {
addUser,
fetchCard,
fetchExpressionsByCard,
fetchLesson,
fetchLessons,
fetchResource,
@ -58,7 +59,12 @@ type LessonsType = Record<
{
id: number;
text: string;
cards: Array<{ text: string; note: string | null; id: number }>;
cards: Array<{
text: string;
note: string | null;
id: number;
words: Array<{ spelling: string; ipa: string; category: string }>;
}>;
}
>;
type LessonsDBType = {
@ -85,17 +91,20 @@ function handleGetLessons(user: number, url: URL) {
const page = params.get("page") || "0";
const data: LessonsDBType[] = fetchLessons(db, 20, Number(page)) as any;
console.log(data, "fetchlessons");
const lessons = data.reduce((acc: LessonsType, item: LessonsDBType) => {
let cur = acc[item.id] || { id: item.id, text: item.ltext, cards: [] };
const cards = [
...cur.cards,
{ text: item.ctext, note: item.cnote, id: item.cid },
];
const def = { ...cur, cards };
return { ...acc, [item.id]: def };
}, {} as LessonsType);
console.log(lessons, "lesons");
return Response.json({ ok: lessons });
console.log(data.length);
// const lessons = data.reduce((acc: LessonsType, item: LessonsDBType) => {
// let cur = acc[item.id] || { id: item.id, text: item.ltext, cards: [] };
// const words = fetchExpressionsByCard(db, item.cid) as any[];
// console.log(words, item.cid);
// const cards = [
// ...cur.cards,
// { text: item.ctext, note: item.cnote, id: item.cid, words },
// ];
// const def = { ...cur, cards };
// return { ...acc, [item.id]: def };
// }, {} as LessonsType);
// return Response.json({ ok: lessons });
return Response.json({ ok: data });
}
async function handlePost(req: Request, user: number, url: URL) {

View File

@ -1,10 +1,10 @@
export function wordFactorial(words: string[]): string[] {
const combinations: string[] = [];
export function wordFactorial(words: string[]): Set<string> {
const combinations: Set<string> = new Set([]);
for (let i = 0; i < words.length; i++) {
let inner = '';
let inner = "";
for (let ii = i; ii < words.length; ii++) {
inner += (ii > i ? ' ' : '') + words[ii];
combinations.push(inner);
inner += (ii > i ? " " : "") + words[ii].toLowerCase();
combinations.add(inner);
}
}
return combinations;