2024-10-22 08:45:52 +00:00
|
|
|
import { Database } from "bun:sqlite";
|
|
|
|
import { addCard, addCat, addFrequency, addLesson, addWord } from "./db";
|
2024-10-22 04:35:21 +00:00
|
|
|
|
|
|
|
// const db = new Database('../db/data.db');
|
2024-10-22 08:45:52 +00:00
|
|
|
// Application database that the seeding functions below write into.
// Opened with bun:sqlite's `strict` option enabled.
const db = new Database("../db/data.db", { strict: true });

// WordNet dataset; queried by englishIPA() for words, pronunciations and domains.
const wndb = new Database("../datasets/en-wordnet/data.sqlite");

// Switch the application database to write-ahead logging.
db.exec("PRAGMA journal_mode = WAL;");

// Matches a single non-word character or digit.
const SYMBOL_REGEX = /[\W\d]/;
|
|
|
|
|
|
|
|
// async function englishIPA() {
|
|
|
|
// const file = Bun.file('ipa/en-us/ipadict.txt');
|
|
|
|
// const s = file.stream();
|
|
|
|
// const reader = s.getReader();
|
|
|
|
// const decoder = new TextDecoder();
|
|
|
|
// let leftover = '';
|
|
|
|
// while (true) {
|
|
|
|
// const { value, done } = await reader.read();
|
|
|
|
// if (done) break;
|
|
|
|
// const chunk = decoder.decode(value, { stream: true });
|
|
|
|
// const lines = (leftover + chunk).split('\n');
|
|
|
|
|
|
|
|
// // Process each line except the last (which might be incomplete)
|
|
|
|
// for (const line of lines.slice(0, -1)) saveLine(line);
|
|
|
|
|
|
|
|
// // Save the last incomplete line to process in the next iteration
|
|
|
|
// leftover = lines[lines.length - 1];
|
|
|
|
// }
|
|
|
|
|
|
|
|
// // Handle any remaining content after reading all chunks
|
|
|
|
// if (leftover) saveLine(leftover);
|
|
|
|
// }
|
|
|
|
|
|
|
|
/**
 * Streams `../datasets/unigram_freq.csv` and records one frequency entry per line.
 *
 * Each line is split on commas; the first field is used as the spelling and the
 * 1-based line number is passed to `addFrequency` as its third argument. The
 * second CSV column is not used.
 *
 * The file is read chunk by chunk, so a line may be cut in half at a chunk
 * boundary; the incomplete tail is carried over and prepended to the next chunk.
 *
 * NOTE(review): if the CSV begins with a header row it is recorded like any
 * other line — confirm whether that is intended.
 */
async function englishFreq(): Promise<void> {
  const file = Bun.file("../datasets/unigram_freq.csv");
  const reader = file.stream().getReader();
  const decoder = new TextDecoder();
  let leftover = "";
  let lineCount = 0;

  // Single parsing path for every line, including the final unterminated one,
  // so the last entry is stored by its spelling rather than as the raw CSV line.
  const recordLine = (line: string) => {
    lineCount++;
    const [spelling] = line.split(",");
    addFrequency(db, spelling, lineCount);
  };

  while (true) {
    const { value, done } = await reader.read();
    if (done) break;
    const chunk = decoder.decode(value, { stream: true });
    const lines = (leftover + chunk).split("\n");
    // The last piece may be an incomplete line; hold it back for the next chunk.
    leftover = lines.pop() ?? "";
    for (const line of lines) recordLine(line);
  }

  // Flush any bytes the streaming decoder is still buffering.
  leftover += decoder.decode();

  // Handle a final line that was not terminated by a newline.
  if (leftover) recordLine(leftover);
}
|
|
|
|
// TODO no conjunctions or adpositions in Wordnet!!
|
|
|
|
/**
 * Copies words with their pronunciation and domain from the WordNet database
 * into the application database.
 *
 * The query joins words to pronunciations, senses, synsets and domains and
 * groups by `words.wordid`. Each resulting row is logged, inserted with
 * `addWord` (locale "en-us", type "expression" when the word contains a space,
 * otherwise "word", subtype null) and then tagged via `addCat` with the row's
 * domain name.
 *
 * NOTE(review): for words with several pronunciations or domains, which one
 * survives the GROUP BY is presumably left to SQLite — confirm.
 */
function englishIPA() {
  type PronunciationRow = {
    word: string;
    pronunciation: string;
    domainname: string;
  };

  const sql = `
  SELECT words.wordid, word, pronunciation, domainname FROM words
  JOIN lexes_pronunciations lp ON lp.wordid = words.wordid
  JOIN pronunciations pr ON pr.pronunciationid = lp.pronunciationid
  JOIN senses ON senses.wordid = words.wordid
  JOIN synsets ON synsets.synsetid = senses.synsetid
  JOIN domains ON domains.domainid = synsets.domainid
  GROUP BY words.wordid
  `;
  const rows = wndb.query(sql).all() as PronunciationRow[];

  for (const row of rows) {
    console.log("adding word", row);
    // Anything containing a space is stored as an expression rather than a word.
    const isExpression = row.word.split(" ").length > 1;
    const wordid = addWord(
      db,
      row.word,
      row.pronunciation,
      "en-us",
      isExpression ? "expression" : "word",
      null,
    );
    addCat(db, wordid, row.domainname);
  }
}
|
|
|
|
// function saveLine(line: string) {
|
|
|
|
// const [spelling, ipa] = line.split(/\s+/);
|
|
|
|
// if (!spelling || !ipa) return;
|
|
|
|
// const hasSymbols = spelling.match(SYMBOL_REGEX);
|
|
|
|
// if (hasSymbols) return;
|
|
|
|
// const isWord = checkWordNet(spelling);
|
|
|
|
// console.log(spelling, isWord);
|
|
|
|
// if (!isWord) return;
|
|
|
|
// const split = spelling.split(' ');
|
|
|
|
// const type = split.length > 1 ? 'expression' : 'word';
|
|
|
|
// const subtype = null;
|
|
|
|
// addWord(db, spelling, ipa, 'en-us', type, subtype);
|
|
|
|
// }
|
|
|
|
|
|
|
|
// function checkWordNet(word: string) {
|
|
|
|
// const query = wndb.query(`SELECT * FROM words WHERE word = $word`);
|
|
|
|
// const res = query.get({ $word: word });
|
|
|
|
// return !!res;
|
|
|
|
// }
|
|
|
|
|
|
|
|
/**
 * Creates the lesson "First Lesson, some easy stuff" and adds one card per
 * phrase below to it, in the listed order.
 */
function englishCards() {
  const lessonId = addLesson(db, "First Lesson, some easy stuff");

  const phrases = [
    "I",
    "friend",
    "my friend",
    "you",
    "your friend",
    "my friends' friend",
    "you are my friend",
    "I am your friend",
    "your friend is my friend",
    "my friend is your friend",
    "he is my friend",
    "this is mine",
    "this is yours",
    "this is my friends'",
    "no",
    "you are not my friend",
    "this is not yours",
    "your friend is not my friend",
    "that is mine",
    "this is mine, that is yours",
    "he is not your friend",
    "no, I am not",
    "that is not me",
    "that is not mine, that is my friends'",
  ];

  phrases.forEach((phrase) => {
    addCard(db, lessonId, phrase);
  });
}
|
2024-10-22 08:45:52 +00:00
|
|
|
// englishIPA();
|
|
|
|
// englishFreq();
|
2024-10-22 04:35:21 +00:00
|
|
|
// Script entry point: only the lesson/card seeding step runs here; the
// englishIPA() and englishFreq() calls above are commented out.
englishCards();
|