144 lines
4.5 KiB
TypeScript
144 lines
4.5 KiB
TypeScript
|
import { Database } from 'bun:sqlite';
|
||
|
import { addCard, addCat, addFrequency, addLesson, addWord } from './db';
|
||
|
import Wordnet from 'en-wordnet';
|
||
|
|
||
|
// const db = new Database('../db/data.db');
|
||
|
// Destination database that the seeding functions in this file write into.
const db = new Database('../db/data.db', { strict: true });
// Switch the destination database to write-ahead logging.
db.exec('PRAGMA journal_mode = WAL;');

// WordNet SQLite dataset; queried by englishIPA().
const wndb = new Database('../datasets/en-wordnet/data.sqlite');

// Matches a single character that is either a non-word character (\W) or a
// digit (\d). In the visible code it is only referenced by commented-out code.
const SYMBOL_REGEX = /[\W\d]/;
|
||
|
|
||
|
// async function englishIPA() {
|
||
|
// const file = Bun.file('ipa/en-us/ipadict.txt');
|
||
|
// const s = file.stream();
|
||
|
// const reader = s.getReader();
|
||
|
// const decoder = new TextDecoder();
|
||
|
// let leftover = '';
|
||
|
// while (true) {
|
||
|
// const { value, done } = await reader.read();
|
||
|
// if (done) break;
|
||
|
// const chunk = decoder.decode(value, { stream: true });
|
||
|
// const lines = (leftover + chunk).split('\n');
|
||
|
|
||
|
// // Process each line except the last (which might be incomplete)
|
||
|
// for (const line of lines.slice(0, -1)) saveLine(line);
|
||
|
|
||
|
// // Save the last incomplete line to process in the next iteration
|
||
|
// leftover = lines[lines.length - 1];
|
||
|
// }
|
||
|
|
||
|
// // Handle any remaining content after reading all chunks
|
||
|
// if (leftover) saveLine(leftover);
|
||
|
// }
|
||
|
|
||
|
/**
 * Streams `../datasets/unigram_freq.csv` and records a frequency rank for
 * every row via `addFrequency(db, spelling, rank)`.
 *
 * The spelling is the first comma-separated column of a row; the rank is the
 * 1-based position of the row among the non-blank rows of the file. The
 * second column (the raw frequency) is not used.
 *
 * NOTE(review): if the CSV starts with a header row, that row is recorded as
 * rank 1 like any other line — confirm against the dataset.
 *
 * @returns A promise that resolves once the whole file has been processed.
 */
async function englishFreq(): Promise<void> {
  const file = Bun.file('../datasets/unigram_freq.csv');
  const reader = file.stream().getReader();
  const decoder = new TextDecoder();
  let leftover = '';
  let lineCount = 0;

  // Parses one CSV row and stores its rank. Blank rows are skipped and do not
  // consume a rank.
  const saveLine = (rawLine: string): void => {
    // Tolerate CRLF line endings.
    const line = rawLine.endsWith('\r') ? rawLine.slice(0, -1) : rawLine;
    const [spelling = ''] = line.split(',');
    if (spelling.trim() === '') return;
    lineCount++;
    addFrequency(db, spelling, lineCount);
  };

  while (true) {
    const { value, done } = await reader.read();
    if (done) break;
    const chunk = decoder.decode(value, { stream: true });
    const lines = (leftover + chunk).split('\n');

    // The last piece may be an incomplete line; keep it for the next chunk.
    leftover = lines.pop() ?? '';
    for (const line of lines) saveLine(line);
  }

  // Flush any bytes the streaming decoder is still holding, then handle a
  // final row that was not terminated by a newline. It goes through the same
  // parsing as every other row so only the spelling column is stored.
  leftover += decoder.decode();
  if (leftover) saveLine(leftover);
}
|
||
|
// TODO: WordNet contains no conjunctions or adpositions, so englishIPA() never imports them.
|
||
|
/**
 * Copies words from the WordNet database (`wndb`) into `db`.
 *
 * The query joins each word with its pronunciation and with the domain of
 * its synsets, grouped by word id, so one row is produced per word id.
 * The grouped query selects non-aggregated columns, so the pronunciation
 * and domain come from one of the grouped rows (presumably an arbitrary
 * one, per SQLite's bare-column behaviour — confirm if the choice matters).
 *
 * Each row is logged, saved with `addWord` under the 'en-us' language tag,
 * and the returned id is passed to `addCat` together with the domain name.
 */
function englishIPA() {
  type WordRow = {
    word: string;
    pronunciation: string;
    domainname: string;
  };

  const sql = `
    SELECT words.wordid, word, pronunciation, domainname FROM words
    JOIN lexes_pronunciations lp ON lp.wordid = words.wordid
    JOIN pronunciations pr ON pr.pronunciationid = lp.pronunciationid
    JOIN senses ON senses.wordid = words.wordid
    JOIN synsets ON synsets.synsetid = senses.synsetid
    JOIN domains ON domains.domainid = synsets.domainid
    GROUP BY words.wordid
  `;
  const rows = wndb.query(sql).all() as WordRow[];

  rows.forEach((row) => {
    console.log('adding word', row);
    // An entry containing a space is treated as a multi-word expression.
    const kind = row.word.includes(' ') ? 'expression' : 'word';
    const newWordId = addWord(db, row.word, row.pronunciation, 'en-us', kind, null);
    addCat(db, newWordId, row.domainname);
  });
}
|
||
|
// function saveLine(line: string) {
|
||
|
// const [spelling, ipa] = line.split(/\s+/);
|
||
|
// if (!spelling || !ipa) return;
|
||
|
// const hasSymbols = spelling.match(SYMBOL_REGEX);
|
||
|
// if (hasSymbols) return;
|
||
|
// const isWord = checkWordNet(spelling);
|
||
|
// console.log(spelling, isWord);
|
||
|
// if (!isWord) return;
|
||
|
// const split = spelling.split(' ');
|
||
|
// const type = split.length > 1 ? 'expression' : 'word';
|
||
|
// const subtype = null;
|
||
|
// addWord(db, spelling, ipa, 'en-us', type, subtype);
|
||
|
// }
|
||
|
|
||
|
// function checkWordNet(word: string) {
|
||
|
// const query = wndb.query(`SELECT * FROM words WHERE word = $word`);
|
||
|
// const res = query.get({ $word: word });
|
||
|
// return !!res;
|
||
|
// }
|
||
|
|
||
|
/**
 * Creates the first lesson via `addLesson` and adds one card per phrase
 * below via `addCard`, in the listed order.
 */
function englishCards() {
  const phrases = [
    'I',
    'friend',
    'my friend',
    'you',
    'your friend',
    "my friends' friend",
    'you are my friend',
    'I am your friend',
    'your friend is my friend',
    'my friend is your friend',
    'he is my friend',
    'this is mine',
    'this is yours',
    "this is my friends'",
    'no',
    'you are not my friend',
    'this is not yours',
    'your friend is not my friend',
    'that is mine',
    'this is mine, that is yours',
    'he is not your friend',
    'no, I am not',
    'that is not me',
    "that is not mine, that is my friends'",
  ];

  const lessonId = addLesson(db, 'First Lesson, some easy stuff');
  phrases.forEach((phrase) => {
    addCard(db, lessonId, phrase);
  });
}
|
||
|
// Script entry point: run the seeders in order. englishFreq is async and is
// not awaited, so the rest of its work continues after englishCards has been
// called.
englishIPA();
void englishFreq();
englishCards();
|