2024-10-22 08:45:52 +00:00
|
|
|
import { Database } from "bun:sqlite";
|
2024-10-23 16:54:41 +00:00
|
|
|
import {
|
|
|
|
addCard,
|
|
|
|
addCat,
|
|
|
|
addFrequency,
|
|
|
|
addLesson,
|
|
|
|
addWord,
|
|
|
|
domains,
|
|
|
|
poss,
|
|
|
|
} from "./db";
|
2024-10-22 04:35:21 +00:00
|
|
|
|
|
|
|
// const db = new Database('../db/data.db');

// Target database for the imported words/lessons. `strict: true` makes
// bun:sqlite throw on missing named bind parameters instead of silently
// binding NULL.
const db = new Database("../db/data.db", { strict: true });

// Read-only source: the English WordNet SQLite export used by englishWordnet().
const wndb = new Database("../datasets/en-wordnet/data.sqlite");

// WAL journal mode improves write throughput for the bulk inserts below.
db.exec("PRAGMA journal_mode = WAL;");
|
2024-10-22 04:35:21 +00:00
|
|
|
|
|
|
|
// Matches any non-word character or digit; used to reject tokens containing
// punctuation, apostrophes, or numbers before inserting them as words.
// (The regex literal is sufficient — wrapping it in `new RegExp(...)` was
// redundant and just cloned the same pattern.)
const SYMBOL_REGEX = /[\W\d]/;
|
|
|
|
|
2024-10-23 16:54:41 +00:00
|
|
|
async function handleFile(
|
|
|
|
filename: string,
|
|
|
|
func: (line: string, idx: number) => void,
|
|
|
|
) {
|
|
|
|
const file = Bun.file(filename);
|
2024-10-22 04:35:21 +00:00
|
|
|
const s = file.stream();
|
|
|
|
const reader = s.getReader();
|
|
|
|
const decoder = new TextDecoder();
|
2024-10-22 08:45:52 +00:00
|
|
|
let leftover = "";
|
2024-10-22 04:35:21 +00:00
|
|
|
let lineCount = 0;
|
|
|
|
while (true) {
|
|
|
|
const { value, done } = await reader.read();
|
|
|
|
if (done) break;
|
|
|
|
const chunk = decoder.decode(value, { stream: true });
|
2024-10-22 08:45:52 +00:00
|
|
|
const lines = (leftover + chunk).split("\n");
|
2024-10-22 04:35:21 +00:00
|
|
|
|
|
|
|
// Process each line except the last (which might be incomplete)
|
|
|
|
for (const line of lines.slice(0, -1)) {
|
|
|
|
lineCount++;
|
2024-10-23 16:54:41 +00:00
|
|
|
func(line, lineCount);
|
2024-10-22 04:35:21 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Save the last incomplete line to process in the next iteration
|
|
|
|
leftover = lines[lines.length - 1];
|
|
|
|
}
|
|
|
|
|
|
|
|
// Handle any remaining content after reading all chunks
|
2024-10-23 16:54:41 +00:00
|
|
|
if (leftover) func(leftover, lineCount + 1);
|
|
|
|
}
|
|
|
|
|
|
|
|
function goodPos(pos: string): boolean {
|
|
|
|
const list = [
|
|
|
|
"CC",
|
|
|
|
"DT",
|
|
|
|
"EX",
|
|
|
|
"IN",
|
|
|
|
"LS",
|
|
|
|
"MD",
|
|
|
|
"PDT",
|
|
|
|
"POS",
|
|
|
|
"PRP",
|
|
|
|
"PRP$",
|
|
|
|
"RP",
|
|
|
|
"TO",
|
|
|
|
"WDT",
|
|
|
|
"WP",
|
|
|
|
"WP$",
|
|
|
|
];
|
|
|
|
return list.includes(pos);
|
|
|
|
}
|
|
|
|
function englishKaggle() {
|
|
|
|
handleFile("../datasets/words_pos.csv", (line, idx) => {
|
|
|
|
const [_, spelling, pos] = line.split(",");
|
|
|
|
if (!goodPos(pos)) return;
|
|
|
|
const rowid = addWord(db, spelling, "", "en-us", "word", null);
|
|
|
|
const category = poss[pos] || "unknown;";
|
|
|
|
addCat(db, rowid, category);
|
|
|
|
});
|
|
|
|
}
|
|
|
|
async function englishIPA() {
|
|
|
|
handleFile("ipa/en-us/ipadict.txt", (line, idx) => {
|
|
|
|
const [spelling, ipa] = line.split(/\s+/);
|
|
|
|
if (!spelling || !ipa) return;
|
|
|
|
const hasSymbols = spelling.match(SYMBOL_REGEX);
|
|
|
|
if (hasSymbols) return;
|
|
|
|
const split = spelling.split(" ");
|
|
|
|
const type = split.length > 1 ? "expression" : "word";
|
|
|
|
const subtype = null;
|
|
|
|
addWord(db, spelling, ipa, "en-us", type, subtype);
|
|
|
|
});
|
|
|
|
}
|
|
|
|
|
|
|
|
async function englishFreq() {
|
|
|
|
handleFile("../datasets/unigram_freq.csv", (line, idx) => {
|
|
|
|
const [spelling, _frequency] = line.split(",");
|
|
|
|
addFrequency(db, spelling, idx);
|
|
|
|
});
|
|
|
|
|
|
|
|
// Save the last incomplete line to process in the next iteration
|
2024-10-22 04:35:21 +00:00
|
|
|
}
|
|
|
|
// TODO no conjunctions or adpositions in Wordnet!!
// Imports English words from the WordNet export (`wndb`) into `db`, attaching
// the best available pronunciation (US preferred) and a category derived from
// each word's WordNet domain via the `domains` mapping.
function englishWordnet() {
  // Earlier experiment joining a separate GB pronunciation column, kept for
  // reference. NOTE(review): the ON clause mixes aliases (ukpr vs uspr) —
  // looks like a typo, which may be why it was abandoned.
  // LEFT JOIN lexes_pronunciations ukpr ON ukpr.wordid = words.wordid AND uspr.variety = 'GB'
  // LEFT JOIN pronunciations ukp ON ukp.pronunciationid = ukpr.pronunciationid

  // The ranked_ipa CTE numbers each word's pronunciations by variety
  // preference (US, then unspecified, then GB, then anything else); the outer
  // query keeps only rank 1. NOTE(review): `lp.variety IS 'GB'` uses SQLite's
  // IS operator against a non-NULL literal — it behaves like `=` here, but
  // `=` would be the conventional spelling; confirm before touching the SQL.
  // NOTE(review): `GROUP BY words.wordid` with bare selected columns relies
  // on SQLite picking an arbitrary sense/domain row per word — presumably
  // acceptable, but verify that is the intended category selection.
  const queryString = `
    WITH ranked_ipa AS (
      SELECT
        lp.wordid,
        pr.pronunciation,
        lp.variety,
        ROW_NUMBER() OVER (
          PARTITION BY lp.wordid
          ORDER BY
            CASE
              WHEN lp.variety = 'US' THEN 1
              WHEN lp.variety IS NULL THEN 2
              WHEN lp.variety IS 'GB' THEN 3
              ELSE 4
            END
        ) AS rank
      FROM lexes_pronunciations lp
      JOIN pronunciations pr ON pr.pronunciationid = lp.pronunciationid
    )
    SELECT words.wordid, word, rp.pronunciation as ipa, domainname
    FROM words
    LEFT JOIN ranked_ipa rp ON rp.wordid = words.wordid AND rp.rank = 1
    LEFT JOIN senses ON senses.wordid = words.wordid
    LEFT JOIN synsets ON synsets.synsetid = senses.synsetid
    LEFT JOIN domains ON domains.domainid = synsets.domainid
    GROUP BY words.wordid
  `;
  const query = wndb.query(queryString);
  // NOTE(review): `ipa` and `domainname` come from LEFT JOINs, so either can
  // be null at runtime despite the `string` annotation — the `as any` cast
  // hides this; downstream `||` / addWord handle it only implicitly.
  const res: Array<{
    word: string;
    ipa: string;
    domainname: string;
  }> = query.all() as any;
  console.log("res", res.length);
  for (const r of res) {
    // Per-row debug logging — very noisy on a full WordNet import.
    console.log(r, "r");
    // if (r.word === 'abrasive') throw new Error('stop right here');
    // Skip words containing digits or non-word symbols.
    const ok = filterWord(r.word);
    if (!ok) continue;
    // Multi-word entries ("give up") are stored as expressions.
    const split = r.word.split(" ");
    const type = split.length > 1 ? "expression" : "word";
    const subtype = null;
    const wordid = addWord(db, r.word, r.ipa, "en-us", type, subtype);
    // NOTE(review): "unknown;" (trailing semicolon) looks like a typo for
    // "unknown"; the same literal appears in englishKaggle() — confirm
    // before changing, existing rows may already use it.
    const category = domains[r.domainname] || "unknown;";
    addCat(db, wordid, category);
  }
}
|
2024-10-23 16:54:41 +00:00
|
|
|
function filterWord(s: string) {
|
|
|
|
const hasSymbols = s.match(SYMBOL_REGEX);
|
|
|
|
if (hasSymbols) return false;
|
|
|
|
else return true;
|
|
|
|
}
|
2024-10-22 04:35:21 +00:00
|
|
|
|
|
|
|
// function checkWordNet(word: string) {
|
|
|
|
// const query = wndb.query(`SELECT * FROM words WHERE word = $word`);
|
|
|
|
// const res = query.get({ $word: word });
|
|
|
|
// return !!res;
|
|
|
|
// }
|
|
|
|
|
|
|
|
function englishCards() {
|
2024-10-22 08:45:52 +00:00
|
|
|
const lesson_id = addLesson(db, "First Lesson, some easy stuff");
|
2024-10-22 04:35:21 +00:00
|
|
|
const texts = [
|
2024-10-22 08:45:52 +00:00
|
|
|
"I",
|
|
|
|
"friend",
|
|
|
|
"my friend",
|
|
|
|
"you",
|
|
|
|
"your friend",
|
2024-10-22 04:35:21 +00:00
|
|
|
"my friends' friend",
|
2024-10-22 08:45:52 +00:00
|
|
|
"you are my friend",
|
|
|
|
"I am your friend",
|
|
|
|
"your friend is my friend",
|
|
|
|
"my friend is your friend",
|
|
|
|
"he is my friend",
|
|
|
|
"this is mine",
|
|
|
|
"this is yours",
|
2024-10-22 04:35:21 +00:00
|
|
|
"this is my friends'",
|
2024-10-22 08:45:52 +00:00
|
|
|
"no",
|
|
|
|
"you are not my friend",
|
|
|
|
"this is not yours",
|
|
|
|
"your friend is not my friend",
|
|
|
|
"that is mine",
|
|
|
|
"this is mine, that is yours",
|
|
|
|
"he is not your friend",
|
|
|
|
"no, I am not",
|
|
|
|
"that is not me",
|
2024-10-22 04:35:21 +00:00
|
|
|
"that is not mine, that is my friends'",
|
|
|
|
];
|
|
|
|
for (const text of texts) {
|
|
|
|
addCard(db, lesson_id, text);
|
|
|
|
}
|
|
|
|
}
|
2024-10-23 16:54:41 +00:00
|
|
|
// Entry point. The import steps are one-off and toggled manually by
// (un)commenting the calls below; currently only the Kaggle POS import runs.
// englishWordnet();
// englishFreq();
// englishCards();
englishKaggle();
|