hanchu/server/seeding.ts

144 lines
4.5 KiB
TypeScript
Raw Normal View History

2024-10-22 04:35:21 +00:00
import { Database } from 'bun:sqlite';
import { addCard, addCat, addFrequency, addLesson, addWord } from './db';
import Wordnet from 'en-wordnet';
// const db = new Database('../db/data.db');
// Application database that the seeding functions write into (opened in strict mode).
const db = new Database('../db/data.db', { strict: true });
// WordNet SQLite dump, used as the source of words, pronunciations and domains.
const wndb = new Database('../datasets/en-wordnet/data.sqlite');
// Put the application database into write-ahead-logging journal mode.
db.exec('PRAGMA journal_mode = WAL;');
// Matches any single non-word character or digit.
const SYMBOL_REGEX = /[\W\d]/;
// async function englishIPA() {
// const file = Bun.file('ipa/en-us/ipadict.txt');
// const s = file.stream();
// const reader = s.getReader();
// const decoder = new TextDecoder();
// let leftover = '';
// while (true) {
// const { value, done } = await reader.read();
// if (done) break;
// const chunk = decoder.decode(value, { stream: true });
// const lines = (leftover + chunk).split('\n');
// // Process each line except the last (which might be incomplete)
// for (const line of lines.slice(0, -1)) saveLine(line);
// // Save the last incomplete line to process in the next iteration
// leftover = lines[lines.length - 1];
// }
// // Handle any remaining content after reading all chunks
// if (leftover) saveLine(leftover);
// }
/**
 * Streams `../datasets/unigram_freq.csv` and records, for every line, the
 * first comma-separated field together with the line's 1-based position in
 * the file by calling `addFrequency(db, spelling, lineNumber)`.
 *
 * The file is read chunk by chunk; a chunk may end in the middle of a line,
 * so the unterminated tail is carried over and prepended to the next chunk.
 *
 * NOTE(review): every line is counted, so if the CSV starts with a header row
 * that row is stored as line 1 — confirm whether it should be skipped.
 *
 * @returns A promise that resolves once the whole file has been processed.
 */
async function englishFreq(): Promise<void> {
  const file = Bun.file('../datasets/unigram_freq.csv');
  const reader = file.stream().getReader();
  const decoder = new TextDecoder();
  let leftover = '';
  let lineCount = 0;

  // Single code path for every line so that the final, unterminated line is
  // parsed exactly like the others (only the spelling column is stored).
  const saveLine = (line: string) => {
    lineCount++;
    const [spelling] = line.split(',');
    addFrequency(db, spelling, lineCount);
  };

  while (true) {
    const { value, done } = await reader.read();
    if (done) break;
    const chunk = decoder.decode(value, { stream: true });
    const lines = (leftover + chunk).split('\n');
    // The last element is either '' (chunk ended on a newline) or an
    // incomplete line; keep it for the next iteration.
    leftover = lines.pop() ?? '';
    for (const line of lines) saveLine(line);
  }

  // Flush any bytes the streaming decoder is still holding (a multi-byte
  // character split across the final chunk boundary).
  leftover += decoder.decode();
  // Handle a last line that was not terminated by a newline.
  if (leftover) saveLine(leftover);
}
// TODO: WordNet contains no conjunctions or adpositions!
/**
 * Copies words from the WordNet database into the application database.
 *
 * Selects one row per WordNet word id (the query groups by `words.wordid`)
 * that has a pronunciation and a domain, stores it through `addWord` with
 * language `'en-us'`, and tags the new word with its domain name via `addCat`.
 * Entries whose text contains a space are stored as `'expression'`, all
 * others as `'word'`; the subtype is always `null`.
 */
function englishIPA() {
  // Columns of the result set that this function reads.
  type PronouncedWord = {
    word: string;
    pronunciation: string;
    domainname: string;
  };

  const sql = `
SELECT words.wordid, word, pronunciation, domainname FROM words
JOIN lexes_pronunciations lp ON lp.wordid = words.wordid
JOIN pronunciations pr ON pr.pronunciationid = lp.pronunciationid
JOIN senses ON senses.wordid = words.wordid
JOIN synsets ON synsets.synsetid = senses.synsetid
JOIN domains ON domains.domainid = synsets.domainid
GROUP BY words.wordid
`;
  const rows = wndb.query(sql).all() as PronouncedWord[];

  rows.forEach((row) => {
    console.log('adding word', row);
    // A space in the entry means it is a multi-word expression.
    const kind = row.word.includes(' ') ? 'expression' : 'word';
    const newWordId = addWord(db, row.word, row.pronunciation, 'en-us', kind, null);
    addCat(db, newWordId, row.domainname);
  });
}
// function saveLine(line: string) {
// const [spelling, ipa] = line.split(/\s+/);
// if (!spelling || !ipa) return;
// const hasSymbols = spelling.match(SYMBOL_REGEX);
// if (hasSymbols) return;
// const isWord = checkWordNet(spelling);
// console.log(spelling, isWord);
// if (!isWord) return;
// const split = spelling.split(' ');
// const type = split.length > 1 ? 'expression' : 'word';
// const subtype = null;
// addWord(db, spelling, ipa, 'en-us', type, subtype);
// }
// function checkWordNet(word: string) {
// const query = wndb.query(`SELECT * FROM words WHERE word = $word`);
// const res = query.get({ $word: word });
// return !!res;
// }
/**
 * Seeds the first lesson: creates a lesson titled
 * 'First Lesson, some easy stuff' with `addLesson` and attaches one card per
 * phrase below, in the listed order, with `addCard`.
 */
function englishCards() {
  const phrases: string[] = [
    'I',
    'friend',
    'my friend',
    'you',
    'your friend',
    "my friends' friend",
    'you are my friend',
    'I am your friend',
    'your friend is my friend',
    'my friend is your friend',
    'he is my friend',
    'this is mine',
    'this is yours',
    "this is my friends'",
    'no',
    'you are not my friend',
    'this is not yours',
    'your friend is not my friend',
    'that is mine',
    'this is mine, that is yours',
    'he is not your friend',
    'no, I am not',
    'that is not me',
    "that is not mine, that is my friends'",
  ];

  const lessonId = addLesson(db, 'First Lesson, some easy stuff');
  phrases.forEach((phrase) => {
    addCard(db, lessonId, phrase);
  });
}
// Run the seeding steps in order: words and categories first.
englishIPA();
// NOTE(review): englishFreq is async and its promise is not awaited here, so
// englishCards() below starts before the frequency import has finished, and a
// rejection from it would be unhandled — consider awaiting it.
englishFreq();
// Create the first lesson and its cards.
englishCards();