m
This commit is contained in:
parent
9bbb3b3cfa
commit
d0bcc2a81c
@ -6,11 +6,12 @@ PRAGMA mmap_size = 30000000000;
|
||||
|
||||
|
||||
-- Words table
|
||||
-- TODO restore a separate words table?
|
||||
CREATE TABLE expressions(
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
spelling TEXT NOT NULL,
|
||||
ipa TEXT NOT NULL,
|
||||
language_id INTEGER NOT NULL,
|
||||
ipa TEXT,
|
||||
frequency INTEGER,
|
||||
type TEXT NOT NULL,
|
||||
subtype TEXT,
|
||||
@ -95,6 +96,9 @@ INSERT INTO categories (name, part_of_speech_id) VALUES
|
||||
('nominative', 5),
|
||||
('accusative', 5),
|
||||
('genitive', 5),
|
||||
('interrogative', 5),
|
||||
-- not really a pronoun but whatever
|
||||
('determiner', 5),
|
||||
-- adpositions
|
||||
('preposition', 6),
|
||||
('postposition', 6),
|
||||
|
103
server/db.ts
103
server/db.ts
@ -27,7 +27,7 @@ export function fetchFrequent(db: Database, count: number, page: number) {
|
||||
spelling,
|
||||
ipa,
|
||||
frequency,
|
||||
GROUP_CONCAT(c.name, ',') AS category,
|
||||
GROUP_CONCAT(c.name, ',') AS category
|
||||
FROM expressions e
|
||||
JOIN word_categories wc ON wc.word_id = e.id
|
||||
JOIN categories c ON c.id = wc.category_id
|
||||
@ -37,37 +37,66 @@ export function fetchFrequent(db: Database, count: number, page: number) {
|
||||
`);
|
||||
return query.get({ count, offset });
|
||||
}
|
||||
|
||||
/**
 * Lists the expressions linked to a single card.
 *
 * Joins `cards_expressions` to `expressions`, keeps only rows whose spelling
 * is not NULL, and sorts them by `frequency` in descending order.
 *
 * @param db  Database handle the statement is prepared on.
 * @param cid Card id, bound to the `$cid` placeholder.
 * @returns All matching rows, each carrying `spelling`, `eid` and `ipa`.
 */
export function fetchExpressionsByCard(db: Database, cid: number) {
  const sql = `
    SELECT
      e.spelling, e.id as eid, e.ipa
    FROM cards_expressions ce
    JOIN expressions e ON ce.expression_id = e.id
    WHERE ce.card_id = $cid AND e.spelling IS NOT NULL
    ORDER BY e.frequency DESC
  `;
  return db.query(sql).all({ cid });
}
|
||||
/**
 * Fetches one page of flattened lesson / card / expression rows.
 *
 * Each returned row describes one expression of one card of one lesson,
 * together with a comma-separated list of the expression's category names.
 * Expressions without any category are still returned (the category joins
 * are LEFT JOINs), in which case `categories` is NULL.
 *
 * Note that `LIMIT`/`OFFSET` are applied to these flattened rows, not to
 * lessons: `count` is the number of (lesson, card, expression) rows per page.
 *
 * @param db    Database handle the statement is prepared on.
 * @param count Maximum number of rows to return, bound to `$count`.
 * @param page  1-based page number; values below 1 are treated as page 1.
 * @returns The rows of the requested page, ordered by lesson, card and
 *          expression id.
 */
export function fetchLessons(db: Database, count: number, page: number) {
  // Clamp to the first page so the computed offset is never negative.
  const p = page < 1 ? 1 : page;
  const offset = (p - 1) * count;
  const queryString = `
    SELECT
      l.id AS lesson_id,
      l.text AS lesson_text,
      c.id AS card_id,
      c.text AS card_text,
      c.note AS card_note,
      e.id AS expression_id,
      e.spelling AS expression_spelling,
      e.ipa AS expression_ipa,
      e.type AS expression_type,
      e.subtype AS expression_subtype,
      GROUP_CONCAT(cat.name, ', ') AS categories
    FROM
      lessons l
    JOIN
      cards_lessons cl ON l.id = cl.lesson_id
    JOIN
      cards c ON c.id = cl.card_id
    JOIN
      cards_expressions ce ON c.id = ce.card_id
    JOIN
      expressions e ON e.id = ce.expression_id
    LEFT JOIN
      word_categories wc ON wc.word_id = e.id
    LEFT JOIN
      categories cat ON cat.id = wc.category_id
    GROUP BY
      l.id, c.id, e.id
    ORDER BY
      l.id ASC, c.id ASC, e.id ASC
    LIMIT $count OFFSET $offset;
  `;
  const query = db.query(queryString);
  return query.all({ count, offset });
}
|
||||
|
||||
// SELECT l.id, l.text, cards.text, cards.note FROM cards_lessons cl LEFT JOIN lessons l ON l.id = cl.lesson_id LEFT JOIN cards ON cards.id = cl.card_id ORDER BY l.id ASC LIMIT 20 OFFSET 0;
|
||||
@ -86,7 +115,6 @@ export function fetchLesson(db: Database, lesson: number) {
|
||||
JOIN categories cg ON cg.id = wc.category_id
|
||||
WHERE l.id = $lesson
|
||||
`;
|
||||
console.log(queryString);
|
||||
const query = db.query(queryString);
|
||||
return query.all({ lesson });
|
||||
}
|
||||
@ -137,6 +165,7 @@ export function addCard(
|
||||
))
|
||||
`);
|
||||
const wtr = db.transaction((pairs) => {
|
||||
// console.log("adding to ce", { pairs, cid, text });
|
||||
for (const pair of pairs) wquery.run(pair);
|
||||
});
|
||||
const words = text
|
||||
@ -145,7 +174,7 @@ export function addCard(
|
||||
.trim()
|
||||
.split(" ");
|
||||
const combinations = wordFactorial(words);
|
||||
const richWords = combinations.map((spelling) => {
|
||||
const richWords = Array.from(combinations).map((spelling) => {
|
||||
return { spelling, cid };
|
||||
});
|
||||
wtr(richWords);
|
||||
@ -187,7 +216,11 @@ export function addWord(
|
||||
const res = query.run({ spelling, ipa, language, type, subtype });
|
||||
return res.lastInsertRowid;
|
||||
}
|
||||
export function addCat(db: Database, wordId: number | bigint, domain: string) {
|
||||
export function addCat(
|
||||
db: Database,
|
||||
wordId: number | bigint,
|
||||
category: string,
|
||||
) {
|
||||
const queryString = `
|
||||
INSERT
|
||||
INTO word_categories(word_id, category_id)
|
||||
@ -196,13 +229,23 @@ export function addCat(db: Database, wordId: number | bigint, domain: string) {
|
||||
WHERE name = $category
|
||||
))
|
||||
`;
|
||||
const category = domains[domain] || "unknown";
|
||||
const query = db.query(queryString);
|
||||
const res = query.run({ wordId, category });
|
||||
return res.lastInsertRowid;
|
||||
}
|
||||
|
||||
const domains: Record<string, string> = {
|
||||
/**
 * Maps part-of-speech tags (the keys look like Penn Treebank tags) to the
 * category name stored for the word.
 *
 * The values are used as category names when linking a word to a category,
 * so they must match the names seeded into the `categories` table exactly.
 */
export const poss: Record<string, string> = {
  CC: "conjunction",
  DT: "determiner",
  IN: "preposition",
  // NOTE(review): spelled "auxiliar" here; confirm it matches the name seeded
  // in the categories table (not visible in this file).
  MD: "auxiliar",
  // TODO: PRP covers all personal pronouns, not only nominative ones.
  PRP: "nominative",
  // Was misspelled "gemitive", which can never match the seeded 'genitive'.
  PRP$: "genitive",
  WDT: "determiner",
  WP: "interrogative",
  WP$: "interrogative",
};
|
||||
export const domains: Record<string, string> = {
|
||||
"adj.all": "adjective",
|
||||
"adj.pert": "adjective",
|
||||
"adj.ppl": "adjective",
|
||||
@ -261,5 +304,13 @@ export function addFrequency(
|
||||
`;
|
||||
const query = db.query(queryString);
|
||||
const res = query.run({ spelling, frequency });
|
||||
console.log(res, "added frequency");
|
||||
}
|
||||
/**
 * Writes an IPA transcription onto existing expressions.
 *
 * Runs an UPDATE that sets `ipa` on every `expressions` row whose `spelling`
 * equals the given one. Nothing is returned.
 *
 * @param db       Database handle the statement is prepared on.
 * @param spelling Spelling to match, bound to `$spelling`.
 * @param ipa      Transcription to store, bound to `$ipa`.
 */
export function addIPA(db: Database, spelling: string, ipa: string) {
  const statement = db.query(`
    UPDATE expressions
    SET ipa= $ipa
    WHERE expressions.spelling = $spelling
  `);
  statement.run({ spelling, ipa });
}
|
||||
|
@ -1,5 +1,13 @@
|
||||
import { Database } from "bun:sqlite";
|
||||
import { addCard, addCat, addFrequency, addLesson, addWord } from "./db";
|
||||
import {
|
||||
addCard,
|
||||
addCat,
|
||||
addFrequency,
|
||||
addLesson,
|
||||
addWord,
|
||||
domains,
|
||||
poss,
|
||||
} from "./db";
|
||||
|
||||
// const db = new Database('../db/data.db');
|
||||
const db = new Database("../db/data.db", { strict: true });
|
||||
@ -8,31 +16,11 @@ db.exec("PRAGMA journal_mode = WAL;");
|
||||
|
||||
const SYMBOL_REGEX = new RegExp(/[\W\d]/);
|
||||
|
||||
// async function englishIPA() {
|
||||
// const file = Bun.file('ipa/en-us/ipadict.txt');
|
||||
// const s = file.stream();
|
||||
// const reader = s.getReader();
|
||||
// const decoder = new TextDecoder();
|
||||
// let leftover = '';
|
||||
// while (true) {
|
||||
// const { value, done } = await reader.read();
|
||||
// if (done) break;
|
||||
// const chunk = decoder.decode(value, { stream: true });
|
||||
// const lines = (leftover + chunk).split('\n');
|
||||
|
||||
// // Process each line except the last (which might be incomplete)
|
||||
// for (const line of lines.slice(0, -1)) saveLine(line);
|
||||
|
||||
// // Save the last incomplete line to process in the next iteration
|
||||
// leftover = lines[lines.length - 1];
|
||||
// }
|
||||
|
||||
// // Handle any remaining content after reading all chunks
|
||||
// if (leftover) saveLine(leftover);
|
||||
// }
|
||||
|
||||
async function englishFreq() {
|
||||
const file = Bun.file("../datasets/unigram_freq.csv");
|
||||
async function handleFile(
|
||||
filename: string,
|
||||
func: (line: string, idx: number) => void,
|
||||
) {
|
||||
const file = Bun.file(filename);
|
||||
const s = file.stream();
|
||||
const reader = s.getReader();
|
||||
const decoder = new TextDecoder();
|
||||
@ -47,8 +35,7 @@ async function englishFreq() {
|
||||
// Process each line except the last (which might be incomplete)
|
||||
for (const line of lines.slice(0, -1)) {
|
||||
lineCount++;
|
||||
const [spelling, _frequency] = line.split(",");
|
||||
addFrequency(db, spelling, lineCount);
|
||||
func(line, lineCount);
|
||||
}
|
||||
|
||||
// Save the last incomplete line to process in the next iteration
|
||||
@ -56,48 +43,115 @@ async function englishFreq() {
|
||||
}
|
||||
|
||||
// Handle any remaining content after reading all chunks
|
||||
if (leftover) addFrequency(db, leftover, lineCount + 1);
|
||||
if (leftover) func(leftover, lineCount + 1);
|
||||
}
|
||||
|
||||
/**
 * Part-of-speech tags accepted by {@link goodPos}.
 *
 * Built once at module load instead of on every call, because `goodPos` is
 * invoked for every line of the imported CSV.
 */
const GOOD_POS_TAGS: ReadonlySet<string> = new Set([
  "CC",
  "DT",
  "EX",
  "IN",
  "LS",
  "MD",
  "PDT",
  "POS",
  "PRP",
  "PRP$",
  "RP",
  "TO",
  "WDT",
  "WP",
  "WP$",
]);

/**
 * Tells whether a part-of-speech tag is one of the accepted tags.
 *
 * The comparison is exact and case-sensitive.
 *
 * @param pos Tag to check.
 * @returns `true` when `pos` is in the accepted list, `false` otherwise.
 */
function goodPos(pos: string): boolean {
  return GOOD_POS_TAGS.has(pos);
}
|
||||
/**
 * Imports words from `../datasets/words_pos.csv`, keeping only lines whose
 * part-of-speech tag passes `goodPos`.
 *
 * Each line is split on commas; the second field is used as the spelling and
 * the third as the tag. Accepted words are inserted with an empty IPA string,
 * language "en-us" and type "word", then linked to the category that `poss`
 * maps the tag to (or "unknown" when the tag has no mapping).
 *
 * @returns The promise from `handleFile`, so callers can await the import or
 *          handle its failure instead of leaving it floating.
 */
function englishKaggle() {
  return handleFile("../datasets/words_pos.csv", (line) => {
    const [, spelling, pos] = line.split(",");
    // Skip blank or malformed lines that lack a spelling or a tag.
    if (!spelling || !pos || !goodPos(pos)) return;
    const rowid = addWord(db, spelling, "", "en-us", "word", null);
    // The fallback used to be "unknown;" (stray semicolon inside the string).
    // NOTE(review): confirm an "unknown" category exists in the categories table.
    const category = poss[pos] ?? "unknown";
    addCat(db, rowid, category);
  });
}
|
||||
/**
 * Imports spellings and IPA transcriptions from `ipa/en-us/ipadict.txt`.
 *
 * Each line is split on whitespace; the first field is the spelling and the
 * second the transcription. Lines missing either field, and spellings that
 * match `SYMBOL_REGEX`, are skipped. Every remaining entry is inserted with
 * language "en-us".
 *
 * @returns A promise that settles once the whole file has been processed.
 */
async function englishIPA() {
  // Await the read so this async function does not resolve before the import
  // has finished, and so a read failure rejects the returned promise.
  await handleFile("ipa/en-us/ipadict.txt", (line) => {
    const [spelling, ipa] = line.split(/\s+/);
    if (!spelling || !ipa) return;
    if (SYMBOL_REGEX.test(spelling)) return;
    // The spelling comes from a whitespace split, so it can never contain a
    // space: the former `split(" ").length > 1` check always produced "word".
    addWord(db, spelling, ipa, "en-us", "word", null);
  });
}
|
||||
|
||||
/**
 * Assigns frequency values from `../datasets/unigram_freq.csv`.
 *
 * The first comma-separated field of each line is the spelling; the value
 * stored as its frequency is the line's index as reported by `handleFile`,
 * not the count column of the CSV.
 *
 * @returns A promise that settles once the whole file has been processed.
 */
async function englishFreq() {
  // Await the read so this async function does not resolve before the import
  // has finished, and so a read failure rejects the returned promise.
  await handleFile("../datasets/unigram_freq.csv", (line, idx) => {
    const [spelling] = line.split(",");
    // Blank lines yield an empty spelling; there is nothing to update for them.
    // NOTE(review): if the CSV starts with a header row it is treated as a
    // word here - confirm against the dataset.
    if (!spelling) return;
    addFrequency(db, spelling, idx);
  });
}
|
||||
// TODO no conjunctions or adpositions in Wordnet!!
|
||||
/**
 * Imports words from the WordNet database (`wndb`) into the application
 * database.
 *
 * For every word the query picks a single pronunciation, preferring the 'US'
 * variety, then entries with no variety, then 'GB', then anything else, and
 * joins the domain name of the word's senses. Words matching `SYMBOL_REGEX`
 * (via `filterWord`) are skipped. Each remaining word is inserted with
 * language "en-us" and linked to the category that `domains` maps its domain
 * name to, or "unknown" when there is no mapping.
 */
function englishWordnet() {
  const queryString = `
    WITH ranked_ipa AS (
      SELECT
        lp.wordid,
        pr.pronunciation,
        lp.variety,
        ROW_NUMBER() OVER (
          PARTITION BY lp.wordid
          ORDER BY
            CASE
              WHEN lp.variety = 'US' THEN 1
              WHEN lp.variety IS NULL THEN 2
              WHEN lp.variety IS 'GB' THEN 3
              ELSE 4
            END
        ) AS rank
      FROM lexes_pronunciations lp
      JOIN pronunciations pr ON pr.pronunciationid = lp.pronunciationid
    )
    SELECT words.wordid, word, rp.pronunciation as ipa, domainname
    FROM words
    LEFT JOIN ranked_ipa rp ON rp.wordid = words.wordid AND rp.rank = 1
    LEFT JOIN senses ON senses.wordid = words.wordid
    LEFT JOIN synsets ON synsets.synsetid = senses.synsetid
    LEFT JOIN domains ON domains.domainid = synsets.domainid
    GROUP BY words.wordid
  `;
  // NOTE(review): GROUP BY words.wordid keeps one row per word, so a word with
  // senses in several domains gets only one of their names; ipa and domainname
  // may also be NULL because of the LEFT JOINs - confirm addWord accepts that.
  type WordnetRow = { word: string; ipa: string; domainname: string };
  const query = wndb.query(queryString);
  const res = query.all() as WordnetRow[];
  console.log("res", res.length);
  for (const r of res) {
    console.log(r, "r");
    if (!filterWord(r.word)) continue;
    // NOTE(review): filterWord rejects any spelling with a non-word character,
    // which seems to include spaces, so "expression" looks unreachable here.
    const type = r.word.split(" ").length > 1 ? "expression" : "word";
    const wordid = addWord(db, r.word, r.ipa, "en-us", type, null);
    // The fallback used to be "unknown;" (stray semicolon inside the string).
    const category = domains[r.domainname] ?? "unknown";
    addCat(db, wordid, category);
  }
}
|
||||
// function saveLine(line: string) {
|
||||
// const [spelling, ipa] = line.split(/\s+/);
|
||||
// if (!spelling || !ipa) return;
|
||||
// const hasSymbols = spelling.match(SYMBOL_REGEX);
|
||||
// if (hasSymbols) return;
|
||||
// const isWord = checkWordNet(spelling);
|
||||
// console.log(spelling, isWord);
|
||||
// if (!isWord) return;
|
||||
// const split = spelling.split(' ');
|
||||
// const type = split.length > 1 ? 'expression' : 'word';
|
||||
// const subtype = null;
|
||||
// addWord(db, spelling, ipa, 'en-us', type, subtype);
|
||||
// }
|
||||
/**
 * Decides whether a spelling should be imported.
 *
 * @param s Spelling to check.
 * @returns `false` when `s` matches `SYMBOL_REGEX`, `true` otherwise.
 */
function filterWord(s: string) {
  return !SYMBOL_REGEX.test(s);
}
|
||||
|
||||
// function checkWordNet(word: string) {
|
||||
// const query = wndb.query(`SELECT * FROM words WHERE word = $word`);
|
||||
@ -137,6 +191,7 @@ function englishCards() {
|
||||
addCard(db, lesson_id, text);
|
||||
}
|
||||
}
|
||||
// englishIPA();
|
||||
// englishWordnet();
|
||||
// englishFreq();
|
||||
englishCards();
|
||||
// englishCards();
|
||||
englishKaggle();
|
||||
|
@ -2,6 +2,7 @@ import { Database } from "bun:sqlite";
|
||||
import {
|
||||
addUser,
|
||||
fetchCard,
|
||||
fetchExpressionsByCard,
|
||||
fetchLesson,
|
||||
fetchLessons,
|
||||
fetchResource,
|
||||
@ -58,7 +59,12 @@ type LessonsType = Record<
|
||||
{
|
||||
id: number;
|
||||
text: string;
|
||||
cards: Array<{ text: string; note: string | null; id: number }>;
|
||||
cards: Array<{
|
||||
text: string;
|
||||
note: string | null;
|
||||
id: number;
|
||||
words: Array<{ spelling: string; ipa: string; category: string }>;
|
||||
}>;
|
||||
}
|
||||
>;
|
||||
type LessonsDBType = {
|
||||
@ -85,17 +91,20 @@ function handleGetLessons(user: number, url: URL) {
|
||||
const page = params.get("page") || "0";
|
||||
const data: LessonsDBType[] = fetchLessons(db, 20, Number(page)) as any;
|
||||
console.log(data, "fetchlessons");
|
||||
const lessons = data.reduce((acc: LessonsType, item: LessonsDBType) => {
|
||||
let cur = acc[item.id] || { id: item.id, text: item.ltext, cards: [] };
|
||||
const cards = [
|
||||
...cur.cards,
|
||||
{ text: item.ctext, note: item.cnote, id: item.cid },
|
||||
];
|
||||
const def = { ...cur, cards };
|
||||
return { ...acc, [item.id]: def };
|
||||
}, {} as LessonsType);
|
||||
console.log(lessons, "lesons");
|
||||
return Response.json({ ok: lessons });
|
||||
console.log(data.length);
|
||||
// const lessons = data.reduce((acc: LessonsType, item: LessonsDBType) => {
|
||||
// let cur = acc[item.id] || { id: item.id, text: item.ltext, cards: [] };
|
||||
// const words = fetchExpressionsByCard(db, item.cid) as any[];
|
||||
// console.log(words, item.cid);
|
||||
// const cards = [
|
||||
// ...cur.cards,
|
||||
// { text: item.ctext, note: item.cnote, id: item.cid, words },
|
||||
// ];
|
||||
// const def = { ...cur, cards };
|
||||
// return { ...acc, [item.id]: def };
|
||||
// }, {} as LessonsType);
|
||||
// return Response.json({ ok: lessons });
|
||||
return Response.json({ ok: data });
|
||||
}
|
||||
|
||||
async function handlePost(req: Request, user: number, url: URL) {
|
||||
|
@ -1,10 +1,10 @@
|
||||
export function wordFactorial(words: string[]): string[] {
|
||||
const combinations: string[] = [];
|
||||
export function wordFactorial(words: string[]): Set<string> {
|
||||
const combinations: Set<string> = new Set([]);
|
||||
for (let i = 0; i < words.length; i++) {
|
||||
let inner = '';
|
||||
let inner = "";
|
||||
for (let ii = i; ii < words.length; ii++) {
|
||||
inner += (ii > i ? ' ' : '') + words[ii];
|
||||
combinations.push(inner);
|
||||
inner += (ii > i ? " " : "") + words[ii].toLowerCase();
|
||||
combinations.add(inner);
|
||||
}
|
||||
}
|
||||
return combinations;
|
||||
|
Loading…
Reference in New Issue
Block a user