hanchu/server/seeding.ts

198 lines
5.4 KiB
TypeScript
Raw Normal View History

2024-10-22 08:45:52 +00:00
import { Database } from "bun:sqlite";
2024-10-23 16:54:41 +00:00
import {
addCard,
addCat,
addFrequency,
addLesson,
addWord,
domains,
poss,
} from "./db";
2024-10-22 04:35:21 +00:00
// const db = new Database('../db/data.db');
2024-10-22 08:45:52 +00:00
// Target database that the seeding functions in this file write into.
const db = new Database("../db/data.db", { strict: true });
// WordNet dataset that englishWordnet() reads from.
const wndb = new Database("../datasets/en-wordnet/data.sqlite");
// Switch the target database to write-ahead-log journaling.
db.exec("PRAGMA journal_mode = WAL;");

// Matches one non-word character (anything outside [A-Za-z0-9_]) or one digit.
// Spellings that match are rejected by the seeding functions, so underscores
// pass while spaces, apostrophes, hyphens and numbers do not.
const SYMBOL_REGEX = /[\W\d]/;
2024-10-23 16:54:41 +00:00
/**
 * Streams a text file and invokes `func` once per line.
 *
 * The file is read chunk by chunk through `Bun.file(...).stream()`, decoded
 * with a streaming `TextDecoder`, and split on "\n". A single trailing "\r"
 * is stripped from each line so CRLF files do not leak a carriage return
 * into the last field of every record.
 *
 * @param filename Path of the file to read.
 * @param func Callback receiving the line text (without its terminator) and
 *   its 1-based line number. It is called synchronously; its return value is
 *   ignored.
 * @returns A promise that resolves once every line has been delivered.
 */
async function handleFile(
  filename: string,
  func: (line: string, idx: number) => void,
): Promise<void> {
  const reader = Bun.file(filename).stream().getReader();
  const decoder = new TextDecoder();
  const stripCr = (s: string): string =>
    s.endsWith("\r") ? s.slice(0, -1) : s;

  let leftover = "";
  let lineCount = 0;

  while (true) {
    const { value, done } = await reader.read();
    if (done) break;

    const lines = (leftover + decoder.decode(value, { stream: true })).split(
      "\n",
    );
    // The last piece has no terminating "\n" yet, so it may be incomplete:
    // hold it back and prepend it to the next chunk.
    leftover = lines.pop() ?? "";

    for (const line of lines) {
      lineCount++;
      func(stripCr(line), lineCount);
    }
  }

  // Flush the decoder so bytes buffered from a multi-byte sequence at the
  // very end of the stream are not silently dropped.
  const tail = stripCr(leftover + decoder.decode());
  // A non-empty tail means the file did not end with a newline.
  if (tail) func(tail, lineCount + 1);
}
/**
 * Tells whether a part-of-speech tag is one of the tags this script imports.
 *
 * The accepted tags are exactly the ones listed in the switch below; the
 * comparison is case-sensitive and no trimming is done.
 * NOTE(review): the labels look like Penn Treebank tags for closed-class
 * (function) words — confirm against the dataset's documentation.
 *
 * @param pos Tag to check.
 * @returns `true` for an accepted tag, `false` for anything else.
 */
function goodPos(pos: string): boolean {
  switch (pos) {
    case "CC":
    case "DT":
    case "EX":
    case "IN":
    case "LS":
    case "MD":
    case "PDT":
    case "POS":
    case "PRP":
    case "PRP$":
    case "RP":
    case "TO":
    case "WDT":
    case "WP":
    case "WP$":
      return true;
    default:
      return false;
  }
}
/**
 * Seeds words from `../datasets/words_pos.csv`.
 *
 * Every line is split on commas; the second field is used as the spelling and
 * the third as the part-of-speech tag. Lines whose tag is rejected by
 * `goodPos` are skipped. Accepted words are inserted with an empty IPA string
 * and language "en-us", then categorised through the `poss` lookup.
 *
 * @returns The promise from `handleFile`, so callers can wait for the import
 *   to finish and observe read errors instead of getting an unhandled
 *   rejection.
 */
function englishKaggle(): Promise<void> {
  return handleFile("../datasets/words_pos.csv", (line) => {
    // The first CSV column is not used.
    const [, spelling, pos] = line.split(",");
    if (!goodPos(pos)) return;
    const rowid = addWord(db, spelling, "", "en-us", "word", null);
    // NOTE(review): the fallback literal ends with a semicolon ("unknown;").
    // It is kept as-is because the same value is used elsewhere in this file;
    // confirm whether the semicolon is intentional.
    const category = poss[pos] || "unknown;";
    addCat(db, rowid, category);
  });
}
/**
 * Seeds words with pronunciations from `ipa/en-us/ipadict.txt`.
 *
 * Each line is split on runs of whitespace; the first token is the spelling
 * and the second the IPA transcription. Lines missing either token, or whose
 * spelling is rejected by `filterWord`, are skipped.
 *
 * @returns A promise that resolves once the whole file has been processed.
 */
async function englishIPA(): Promise<void> {
  await handleFile("ipa/en-us/ipadict.txt", (line) => {
    const [spelling, ipa] = line.split(/\s+/);
    if (!spelling || !ipa) return;
    if (!filterWord(spelling)) return;
    // `spelling` is a single whitespace-delimited token, so it can never
    // contain a space: every entry from this file is a plain "word".
    // NOTE(review): if the dictionary has multi-word entries or several
    // transcriptions per line, the whitespace split above truncates them.
    addWord(db, spelling, ipa, "en-us", "word", null);
  });
}
/**
 * Records a frequency value for every entry of `../datasets/unigram_freq.csv`.
 *
 * The first comma-separated field of each line is the spelling. The value
 * stored is the 1-based line number, not the count in the second column.
 * NOTE(review): if the CSV starts with a header row, that row is recorded as
 * well — confirm against the dataset.
 *
 * @returns A promise that resolves once the whole file has been processed.
 */
async function englishFreq(): Promise<void> {
  await handleFile("../datasets/unigram_freq.csv", (line, idx) => {
    const [spelling] = line.split(",");
    addFrequency(db, spelling, idx);
  });
}
// TODO no conjunctions or adpositions in Wordnet!!
2024-10-23 16:54:41 +00:00
/**
 * Seeds words from the WordNet database (`wndb`) into the target database.
 *
 * For every WordNet word the query selects one pronunciation, preferring
 * variety 'US', then no variety, then 'GB', then anything else, together with
 * a domain name reached through senses and synsets. Words rejected by
 * `filterWord` are skipped; spellings containing a space are stored as
 * "expression", all others as "word". The category comes from the `domains`
 * lookup keyed by the domain name.
 */
function englishWordnet() {
  // Shape of one result row.
  // NOTE(review): `ipa` and `domainname` come from LEFT JOINs and so may be
  // NULL at runtime; the declared types are kept from the original code —
  // confirm what `addWord` accepts before widening them.
  type WordnetRow = {
    wordid: number;
    word: string;
    ipa: string;
    domainname: string;
  };

  // Earlier draft of a dedicated GB pronunciation join, kept for reference:
  // LEFT JOIN lexes_pronunciations ukpr ON ukpr.wordid = words.wordid AND uspr.variety = 'GB'
  // LEFT JOIN pronunciations ukp ON ukp.pronunciationid = ukpr.pronunciationid
  //
  // NOTE(review): `domainname` is a bare column under GROUP BY words.wordid,
  // so for words with several senses SQLite chooses which domain is reported.
  const queryString = `
WITH ranked_ipa AS (
SELECT
lp.wordid,
pr.pronunciation,
lp.variety,
ROW_NUMBER() OVER (
PARTITION BY lp.wordid
ORDER BY
CASE
WHEN lp.variety = 'US' THEN 1
WHEN lp.variety IS NULL THEN 2
WHEN lp.variety IS 'GB' THEN 3
ELSE 4
END
) AS rank
FROM lexes_pronunciations lp
JOIN pronunciations pr ON pr.pronunciationid = lp.pronunciationid
)
SELECT words.wordid, word, rp.pronunciation as ipa, domainname
FROM words
LEFT JOIN ranked_ipa rp ON rp.wordid = words.wordid AND rp.rank = 1
LEFT JOIN senses ON senses.wordid = words.wordid
LEFT JOIN synsets ON synsets.synsetid = senses.synsetid
LEFT JOIN domains ON domains.domainid = synsets.domainid
GROUP BY words.wordid
`;
  const rows = wndb.query(queryString).all() as WordnetRow[];
  console.log("res", rows.length);

  for (const row of rows) {
    if (!filterWord(row.word)) continue;
    const type = row.word.split(" ").length > 1 ? "expression" : "word";
    const rowid = addWord(db, row.word, row.ipa, "en-us", type, null);
    // NOTE(review): the fallback literal ends with a semicolon ("unknown;");
    // kept unchanged — confirm whether that is intentional.
    const category = domains[row.domainname] || "unknown;";
    addCat(db, rowid, category);
  }
}
2024-10-23 16:54:41 +00:00
/**
 * Decides whether a spelling is acceptable for import.
 *
 * @param s Spelling to check.
 * @returns `true` when no character of `s` matches `SYMBOL_REGEX`,
 *   `false` as soon as one does. The empty string is accepted.
 */
function filterWord(s: string): boolean {
  // `search` yields -1 when the pattern does not occur anywhere in `s`.
  return s.search(SYMBOL_REGEX) === -1;
}
2024-10-22 04:35:21 +00:00
// function checkWordNet(word: string) {
// const query = wndb.query(`SELECT * FROM words WHERE word = $word`);
// const res = query.get({ $word: word });
// return !!res;
// }
/**
 * Creates the lesson "First Lesson, some easy stuff" and adds one card per
 * phrase below, in the order listed.
 */
function englishCards() {
  const lessonId = addLesson(db, "First Lesson, some easy stuff");

  const phrases = [
    "I",
    "friend",
    "my friend",
    "you",
    "your friend",
    "my friends' friend",
    "you are my friend",
    "I am your friend",
    "your friend is my friend",
    "my friend is your friend",
    "he is my friend",
    "this is mine",
    "this is yours",
    "this is my friends'",
    "no",
    "you are not my friend",
    "this is not yours",
    "your friend is not my friend",
    "that is mine",
    "this is mine, that is yours",
    "he is not your friend",
    "no, I am not",
    "that is not me",
    "that is not mine, that is my friends'",
  ];

  phrases.forEach((phrase) => {
    addCard(db, lessonId, phrase);
  });
}
2024-10-23 16:54:41 +00:00
// englishWordnet();
2024-10-22 08:45:52 +00:00
// englishFreq();
2024-10-23 16:54:41 +00:00
// englishCards();
// Script entry point: only the Kaggle part-of-speech import is enabled here.
englishKaggle();