import { Database } from 'bun:sqlite';
import { addCard, addCat, addFrequency, addLesson, addWord } from './db';
import Wordnet from 'en-wordnet';

// const db = new Database('../db/data.db');
const db = new Database('../db/data.db', { strict: true });
const wndb = new Database('../datasets/en-wordnet/data.sqlite');
db.exec('PRAGMA journal_mode = WAL;');
const SYMBOL_REGEX = new RegExp(/[\W\d]/);

// async function englishIPA() {
//   const file = Bun.file('ipa/en-us/ipadict.txt');
//   const s = file.stream();
//   const reader = s.getReader();
//   const decoder = new TextDecoder();
//   let leftover = '';
//   while (true) {
//     const { value, done } = await reader.read();
//     if (done) break;
//     const chunk = decoder.decode(value, { stream: true });
//     const lines = (leftover + chunk).split('\n');
//     // Process each line except the last (which might be incomplete)
//     for (const line of lines.slice(0, -1)) saveLine(line);
//     // Save the last incomplete line to process in the next iteration
//     leftover = lines[lines.length - 1];
//   }
//   // Handle any remaining content after reading all chunks
//   if (leftover) saveLine(leftover);
// }

/**
 * Streams `../datasets/unigram_freq.csv` and records one frequency entry per
 * line through `addFrequency`.
 *
 * Only the first comma-separated field of each line (the spelling) is used.
 * The value stored alongside it is the 1-based line number within the file,
 * not the count column of the CSV.
 *
 * NOTE(review): every line is recorded, so if the CSV starts with a header
 * row it is stored as entry 1 — confirm whether that is intended.
 *
 * @returns a promise that resolves once the whole file has been processed.
 */
async function englishFreq(): Promise<void> {
  const file = Bun.file('../datasets/unigram_freq.csv');
  const reader = file.stream().getReader();
  const decoder = new TextDecoder();
  let leftover = '';
  let lineCount = 0;

  // Single place where a complete line is turned into a frequency entry, so
  // the final unterminated line is handled exactly like every other line.
  const recordLine = (line: string) => {
    lineCount++;
    const [spelling] = line.split(',');
    addFrequency(db, spelling, lineCount);
  };

  while (true) {
    const { value, done } = await reader.read();
    if (done) break;
    const chunk = decoder.decode(value, { stream: true });
    const lines = (leftover + chunk).split('\n');
    // The last piece may be an incomplete line; carry it into the next chunk.
    leftover = lines.pop() ?? '';
    for (const line of lines) recordLine(line);
  }

  // Flush any bytes the streaming decoder is still holding back.
  leftover += decoder.decode();
  // A file without a trailing newline leaves its last line here.
  if (leftover) recordLine(leftover);
}

// TODO no conjunctions or adpositions in
// Wordnet!!

/**
 * Copies words from the WordNet SQLite database into the application
 * database.
 *
 * The query keeps one row per `words.wordid` (GROUP BY) and only returns words
 * that have a pronunciation, a sense, a synset and a domain, because every
 * join is an inner join. Each row is stored with `addWord` under the `en-us`
 * language tag, then categorised with `addCat` using the row's domain name.
 * Entries containing a space are stored as `expression`, all others as `word`.
 */
function englishIPA() {
  type PronouncedWord = {
    word: string;
    pronunciation: string;
    domainname: string;
  };

  // Leading and trailing spaces are kept so the statement text is unchanged.
  const sql =
    ' SELECT words.wordid, word, pronunciation, domainname' +
    ' FROM words' +
    ' JOIN lexes_pronunciations lp ON lp.wordid = words.wordid' +
    ' JOIN pronunciations pr ON pr.pronunciationid = lp.pronunciationid' +
    ' JOIN senses ON senses.wordid = words.wordid' +
    ' JOIN synsets ON synsets.synsetid = senses.synsetid' +
    ' JOIN domains ON domains.domainid = synsets.domainid' +
    ' GROUP BY words.wordid ';
  const rows = wndb.query(sql).all() as PronouncedWord[];

  for (const row of rows) {
    console.log('adding word', row);
    // if (row.word === 'abrasive') throw new Error('stop right here');
    const kind = row.word.includes(' ') ? 'expression' : 'word';
    const wordid = addWord(db, row.word, row.pronunciation, 'en-us', kind, null);
    addCat(db, wordid, row.domainname);
  }
}

// function saveLine(line: string) {
//   const [spelling, ipa] = line.split(/\s+/);
//   if (!spelling || !ipa) return;
//   const hasSymbols = spelling.match(SYMBOL_REGEX);
//   if (hasSymbols) return;
//   const isWord = checkWordNet(spelling);
//   console.log(spelling, isWord);
//   if (!isWord) return;
//   const split = spelling.split(' ');
//   const type = split.length > 1 ?
//     'expression' : 'word';
//   const subtype = null;
//   addWord(db, spelling, ipa, 'en-us', type, subtype);
// }

// function checkWordNet(word: string) {
//   const query = wndb.query(`SELECT * FROM words WHERE word = $word`);
//   const res = query.get({ $word: word });
//   return !!res;
// }

/**
 * Creates the first lesson and adds one card per phrase to it, in the order
 * the phrases are listed.
 */
function englishCards() {
  const lessonId = addLesson(db, 'First Lesson, some easy stuff');
  const texts = [
    'I',
    'friend',
    'my friend',
    'you',
    'your friend',
    "my friends' friend",
    'you are my friend',
    'I am your friend',
    'your friend is my friend',
    'my friend is your friend',
    'he is my friend',
    'this is mine',
    'this is yours',
    "this is my friends'",
    'no',
    'you are not my friend',
    'this is not yours',
    'your friend is not my friend',
    'that is mine',
    'this is mine, that is yours',
    'he is not your friend',
    'no, I am not',
    'that is not me',
    "that is not mine, that is my friends'",
  ];
  for (const text of texts) {
    addCard(db, lessonId, text);
  }
}

// Run the seeding steps strictly in order. englishFreq is async, so it is
// awaited: the cards are only added once the frequency import has finished,
// and a failure in that import rejects here rather than in a floating promise.
englishIPA();
await englishFreq();
englishCards();