// hanchu/server/seeding.ts
import { Database } from "bun:sqlite";
import {
  addCard,
  addCat,
  addFrequency,
  addLesson,
  addWord,
  domains,
  poss,
} from "./db";
// const db = new Database('../db/data.db');
const db = new Database("../db/data.db", { strict: true });
const wndb = new Database("../datasets/en-wordnet/data.sqlite");
db.exec("PRAGMA journal_mode = WAL;");
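// Matches any digit or non-word character; entries containing these are
// skipped so that only plain alphabetic spellings get seeded.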
const SYMBOL_REGEX = /[\W\d]/;
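// Stream a file chunk by chunk and call `func` for each complete line,
// buffering any partial line until the next chunk arrives. Line numbers
// passed to `func` are 1-based.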
async function handleFile(
  filename: string,
  func: (line: string, idx: number) => void,
) {
  const file = Bun.file(filename);
  const s = file.stream();
  const reader = s.getReader();
  const decoder = new TextDecoder();
  let leftover = "";
  let lineCount = 0;
  while (true) {
    const { value, done } = await reader.read();
    if (done) break;
    const chunk = decoder.decode(value, { stream: true });
    const lines = (leftover + chunk).split("\n");
    // Process each line except the last (which might be incomplete)
    for (const line of lines.slice(0, -1)) {
      lineCount++;
      func(line, lineCount);
    }
    // Save the last incomplete line to process in the next iteration
    leftover = lines[lines.length - 1];
  }
  // Handle any remaining content after reading all chunks
  if (leftover) func(leftover, lineCount + 1);
}
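// Penn Treebank tags for closed-class "function" words: conjunctions,
// determiners, prepositions, pronouns, particles, modals, and similar.
// WordNet only covers nouns, verbs, adjectives, and adverbs (see the TODO
// below), so these words are seeded from the Kaggle POS list instead.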
function goodPos(pos: string): boolean {
  const list = [
    "CC",
    "DT",
    "EX",
    "IN",
    "LS",
    "MD",
    "PDT",
    "POS",
    "PRP",
    "PRP$",
    "RP",
    "TO",
    "WDT",
    "WP",
    "WP$",
  ];
  return list.includes(pos);
}
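// Seed function words from ../datasets/words_pos.csv; each kept word is
// tagged with the category mapped from its POS tag via `poss`.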
async function englishKaggle() {
  await handleFile("../datasets/words_pos.csv", (line, idx) => {
    const [_, spelling, pos] = line.split(",");
    if (!goodPos(pos)) return;
    const rowid = addWord(db, spelling, "", "en-us", "word", null);
    const category = poss[pos] || "unknown";
    addCat(db, rowid, category);
  });
}
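// Seed spelling/IPA pairs from the en-us pronunciation dictionary, skipping
// entries that contain digits or symbols.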
async function englishIPA() {
  await handleFile("ipa/en-us/ipadict.txt", (line, idx) => {
    const [spelling, ipa] = line.split(/\s+/);
    if (!spelling || !ipa) return;
    const hasSymbols = spelling.match(SYMBOL_REGEX);
    if (hasSymbols) return;
    const split = spelling.split(" ");
    const type = split.length > 1 ? "expression" : "word";
    const subtype = null;
    addWord(db, spelling, ipa, "en-us", type, subtype);
  });
}
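// Record word frequency ranks from ../datasets/unigram_freq.csv. The line
// index is stored as the rank, which assumes the file is already sorted by
// descending frequency; the count column itself is ignored.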
async function englishFreq() {
  await handleFile("../datasets/unigram_freq.csv", (line, idx) => {
    const [spelling, _frequency] = line.split(",");
    addFrequency(db, spelling, idx);
  });
}
// TODO no conjunctions or adpositions in Wordnet!!
function englishWordnet() {
// LEFT JOIN lexes_pronunciations ukpr ON ukpr.wordid = words.wordid AND uspr.variety = 'GB'
// LEFT JOIN pronunciations ukp ON ukp.pronunciationid = ukpr.pronunciationid
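  // The ranked_ipa CTE picks a single pronunciation per word, preferring the
  // US variety, then entries with no variety, then GB.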
  const queryString = `
    WITH ranked_ipa AS (
      SELECT
        lp.wordid,
        pr.pronunciation,
        lp.variety,
        ROW_NUMBER() OVER (
          PARTITION BY lp.wordid
          ORDER BY
            CASE
              WHEN lp.variety = 'US' THEN 1
              WHEN lp.variety IS NULL THEN 2
              WHEN lp.variety = 'GB' THEN 3
              ELSE 4
            END
        ) AS rank
      FROM lexes_pronunciations lp
      JOIN pronunciations pr ON pr.pronunciationid = lp.pronunciationid
    )
    SELECT words.wordid, word, rp.pronunciation as ipa, domainname
    FROM words
    LEFT JOIN ranked_ipa rp ON rp.wordid = words.wordid AND rp.rank = 1
    LEFT JOIN senses ON senses.wordid = words.wordid
    LEFT JOIN synsets ON synsets.synsetid = senses.synsetid
    LEFT JOIN domains ON domains.domainid = synsets.domainid
    GROUP BY words.wordid
  `;
  const query = wndb.query(queryString);
  const res: Array<{
    word: string;
    ipa: string;
    domainname: string;
  }> = query.all() as any;
  console.log("res", res.length);
  for (const r of res) {
    console.log(r, "r");
    // if (r.word === 'abrasive') throw new Error('stop right here');
    const ok = filterWord(r.word);
    if (!ok) continue;
    const split = r.word.split(" ");
    const type = split.length > 1 ? "expression" : "word";
    const subtype = null;
    const wordid = addWord(db, r.word, r.ipa, "en-us", type, subtype);
    const category = domains[r.domainname] || "unknown";
    addCat(db, wordid, category);
  }
}
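// Accept only spellings with no digits or non-word characters.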
function filterWord(s: string): boolean {
  return !SYMBOL_REGEX.test(s);
}
// function checkWordNet(word: string) {
// const query = wndb.query(`SELECT * FROM words WHERE word = $word`);
// const res = query.get({ $word: word });
// return !!res;
// }
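// Seed a first lesson of simple pronoun/possessive practice cards.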
function englishCards() {
  const lesson_id = addLesson(db, "First Lesson, some easy stuff");
  const texts = [
    "I",
    "friend",
    "my friend",
    "you",
    "your friend",
    "my friends' friend",
    "you are my friend",
    "I am your friend",
    "your friend is my friend",
    "my friend is your friend",
    "he is my friend",
    "this is mine",
    "this is yours",
    "this is my friends'",
    "no",
    "you are not my friend",
    "this is not yours",
    "your friend is not my friend",
    "that is mine",
    "this is mine, that is yours",
    "he is not your friend",
    "no, I am not",
    "that is not me",
    "that is not mine, that is my friends'",
  ];
  for (const text of texts) {
    addCard(db, lesson_id, text);
  }
}
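// Seeding steps are run one at a time by toggling the calls below. The script
// is presumably executed from the server directory (e.g. `bun seeding.ts`),
// since the database and dataset paths above are relative to it.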
// englishWordnet();
// englishFreq();
// englishCards();
await englishKaggle();