hanchu/server/seeding.ts

143 lines
4.5 KiB
TypeScript
Raw Normal View History

2024-10-22 08:45:52 +00:00
import { Database } from "bun:sqlite";
import { addCard, addCat, addFrequency, addLesson, addWord } from "./db";
2024-10-22 04:35:21 +00:00
// const db = new Database('../db/data.db');
2024-10-22 08:45:52 +00:00
// Application database that every seeding function in this file writes to.
const db = new Database("../db/data.db", { strict: true });
// WordNet SQLite dataset; only queried (see englishIPA), never written to here.
const wndb = new Database("../datasets/en-wordnet/data.sqlite");
// Switch the application database to write-ahead-log journaling.
db.exec("PRAGMA journal_mode = WAL;");
2024-10-22 04:35:21 +00:00
// Matches any single character that is a digit or a non-word character,
// i.e. anything other than an ASCII letter or underscore.
const SYMBOL_REGEX = /[\W\d]/;
// async function englishIPA() {
// const file = Bun.file('ipa/en-us/ipadict.txt');
// const s = file.stream();
// const reader = s.getReader();
// const decoder = new TextDecoder();
// let leftover = '';
// while (true) {
// const { value, done } = await reader.read();
// if (done) break;
// const chunk = decoder.decode(value, { stream: true });
// const lines = (leftover + chunk).split('\n');
// // Process each line except the last (which might be incomplete)
// for (const line of lines.slice(0, -1)) saveLine(line);
// // Save the last incomplete line to process in the next iteration
// leftover = lines[lines.length - 1];
// }
// // Handle any remaining content after reading all chunks
// if (leftover) saveLine(leftover);
// }
/**
 * Seeds word-frequency ranks from `../datasets/unigram_freq.csv`.
 *
 * The file is streamed and decoded chunk by chunk. For every line, only the
 * first comma-separated column (the spelling) is stored, together with the
 * line's 1-based position among the non-empty lines as its rank. The second
 * column is ignored.
 *
 * NOTE(review): if the CSV starts with a header row it is stored like any
 * other line and takes rank 1 — confirm against the dataset.
 *
 * @returns a promise that resolves once the whole file has been processed.
 */
async function englishFreq(): Promise<void> {
  const file = Bun.file("../datasets/unigram_freq.csv");
  const reader = file.stream().getReader();
  const decoder = new TextDecoder();
  let leftover = "";
  let lineCount = 0;

  // Stores one CSV line; shared by the streaming loop and the final partial
  // line so both are parsed the same way.
  const saveLine = (line: string): void => {
    // trim() also strips the "\r" left behind by CRLF line endings.
    const spelling = (line.split(",")[0] ?? "").trim();
    // Blank lines carry no word and must not consume a rank.
    if (!spelling) return;
    lineCount++;
    addFrequency(db, spelling, lineCount);
  };

  while (true) {
    const { value, done } = await reader.read();
    if (done) break;
    const chunk = decoder.decode(value, { stream: true });
    const lines = (leftover + chunk).split("\n");
    // The last piece may be an incomplete line; carry it into the next chunk.
    leftover = lines.pop() ?? "";
    for (const line of lines) saveLine(line);
  }

  // Flush any bytes the decoder is still buffering, then handle a final line
  // that was not terminated by a newline.
  leftover += decoder.decode();
  if (leftover) saveLine(leftover);
}
// TODO no conjunctions or adpositions in Wordnet!!
/**
 * Copies WordNet words that have a pronunciation into the application
 * database.
 *
 * The query joins words with their pronunciations and with the domain of a
 * synset they belong to, grouped by word id so each word id yields one row.
 * Every row is logged, stored through addWord with language "en-us", and
 * then its domain name is attached through addCat. Entries containing a
 * space are stored as "expression", everything else as "word".
 */
function englishIPA() {
  // Columns of the result row that this function reads.
  type WordRow = { word: string; pronunciation: string; domainname: string };

  const queryString = `
SELECT words.wordid, word, pronunciation, domainname FROM words
JOIN lexes_pronunciations lp ON lp.wordid = words.wordid
JOIN pronunciations pr ON pr.pronunciationid = lp.pronunciationid
JOIN senses ON senses.wordid = words.wordid
JOIN synsets ON synsets.synsetid = senses.synsetid
JOIN domains ON domains.domainid = synsets.domainid
GROUP BY words.wordid
`;
  const rows = wndb.query(queryString).all() as WordRow[];

  rows.forEach((row) => {
    console.log("adding word", row);
    const isExpression = row.word.split(" ").length > 1;
    const wordid = addWord(
      db,
      row.word,
      row.pronunciation,
      "en-us",
      isExpression ? "expression" : "word",
      null, // no subtype is recorded for WordNet entries
    );
    addCat(db, wordid, row.domainname);
  });
}
// function saveLine(line: string) {
// const [spelling, ipa] = line.split(/\s+/);
// if (!spelling || !ipa) return;
// const hasSymbols = spelling.match(SYMBOL_REGEX);
// if (hasSymbols) return;
// const isWord = checkWordNet(spelling);
// console.log(spelling, isWord);
// if (!isWord) return;
// const split = spelling.split(' ');
// const type = split.length > 1 ? 'expression' : 'word';
// const subtype = null;
// addWord(db, spelling, ipa, 'en-us', type, subtype);
// }
// function checkWordNet(word: string) {
// const query = wndb.query(`SELECT * FROM words WHERE word = $word`);
// const res = query.get({ $word: word });
// return !!res;
// }
/**
 * Creates the lesson "First Lesson, some easy stuff" and adds one card per
 * phrase below, in the listed order.
 */
function englishCards() {
  const phrases = [
    "I",
    "friend",
    "my friend",
    "you",
    "your friend",
    "my friends' friend",
    "you are my friend",
    "I am your friend",
    "your friend is my friend",
    "my friend is your friend",
    "he is my friend",
    "this is mine",
    "this is yours",
    "this is my friends'",
    "no",
    "you are not my friend",
    "this is not yours",
    "your friend is not my friend",
    "that is mine",
    "this is mine, that is yours",
    "he is not your friend",
    "no, I am not",
    "that is not me",
    "that is not mine, that is my friends'",
  ];

  const lessonId = addLesson(db, "First Lesson, some easy stuff");
  phrases.forEach((phrase) => {
    addCard(db, lessonId, phrase);
  });
}
2024-10-22 08:45:52 +00:00
// englishIPA();
// englishFreq();
2024-10-22 04:35:21 +00:00
// Script entry point: only the card seeding is currently enabled.
englishCards();