// hanchu/server/seeding.ts

import { Database } from "bun:sqlite";
import { addCard, addCat, addFrequency, addLesson, addWord } from "./db";

// Database handles shared by the seeding functions in this file.
// const db = new Database('../db/data.db');
// Application database: the target passed to addWord/addCat/addFrequency/addLesson/addCard.
// NOTE(review): `strict: true` is a bun:sqlite option affecting parameter binding — confirm
// that the helpers in ./db bind their parameters in the strict-mode style.
// NOTE(review): both paths are relative, so presumably the script must be started from
// the server directory — confirm.
const db = new Database("../db/data.db", { strict: true });
// WordNet dataset; queried by englishIPA().
const wndb = new Database("../datasets/en-wordnet/data.sqlite");
// Switch the application database to write-ahead logging.
db.exec("PRAGMA journal_mode = WAL;");

/**
 * Matches any single character that is a digit or is not a "word" character
 * (`\W` is everything outside `[A-Za-z0-9_]`, so spaces, punctuation and
 * non-ASCII letters all match). A string passes `.test()` when it contains at
 * least one such character.
 */
const SYMBOL_REGEX = /[\W\d]/;

// async function englishIPA() {
//   const file = Bun.file('ipa/en-us/ipadict.txt');
//   const s = file.stream();
//   const reader = s.getReader();
//   const decoder = new TextDecoder();
//   let leftover = '';
//   while (true) {
//     const { value, done } = await reader.read();
//     if (done) break;
//     const chunk = decoder.decode(value, { stream: true });
//     const lines = (leftover + chunk).split('\n');

//     // Process each line except the last (which might be incomplete)
//     for (const line of lines.slice(0, -1)) saveLine(line);

//     // Save the last incomplete line to process in the next iteration
//     leftover = lines[lines.length - 1];
//   }

//   // Handle any remaining content after reading all chunks
//   if (leftover) saveLine(leftover);
// }

/**
 * Streams `../datasets/unigram_freq.csv` and stores one frequency entry per line.
 *
 * The file is read chunk by chunk and split on `\n`. For every line, the first
 * comma-separated column (the spelling) is passed to `addFrequency` together
 * with the line's 1-based position in the file.
 *
 * Lines whose spelling column is empty (blank lines, a stray `\r`) still
 * advance the line counter but are not stored.
 *
 * NOTE(review): if the CSV begins with a header row, that row is stored like
 * any other line — confirm against the dataset.
 *
 * @returns A promise that resolves once the whole file has been processed.
 */
async function englishFreq(): Promise<void> {
  const reader = Bun.file("../datasets/unigram_freq.csv").stream().getReader();
  const decoder = new TextDecoder();
  let leftover = "";
  let lineCount = 0;

  // Single code path for every line, so the final (unterminated) line is
  // parsed exactly like the others instead of being stored as "word,count".
  const recordLine = (line: string): void => {
    lineCount++;
    // Only the first column is needed; trim() drops the "\r" of CRLF files.
    const spelling = line.split(",")[0]?.trim();
    if (!spelling) return;
    addFrequency(db, spelling, lineCount);
  };

  while (true) {
    const { value, done } = await reader.read();
    if (done) break;
    const lines = (leftover + decoder.decode(value, { stream: true })).split("\n");

    // The last piece may be an incomplete line; keep it for the next chunk.
    leftover = lines.pop() ?? "";
    for (const line of lines) recordLine(line);
  }

  // Flush any bytes the streaming decoder is still buffering, then handle
  // whatever remains after the final newline.
  leftover += decoder.decode();
  if (leftover) recordLine(leftover);
}
// TODO: WordNet contains no conjunctions or adpositions, so those words are never added by englishIPA().
/**
 * Copies words from the WordNet database into the application database.
 *
 * The query joins each word with its pronunciation and with the domain of one
 * of its synsets, grouped by word id so that every word yields a single row
 * (which pronunciation/domain survives the grouping is left to SQLite).
 * Each row is logged, stored through `addWord` with the "en-us" language tag,
 * and then categorised through `addCat` using the row's domain name.
 * Entries containing a space are stored as "expression", all others as "word".
 */
function englishIPA() {
  type WordRow = {
    word: string;
    pronunciation: string;
    domainname: string;
  };

  const sql = `
    SELECT words.wordid, word, pronunciation, domainname FROM words
    JOIN lexes_pronunciations lp ON lp.wordid = words.wordid 
    JOIN pronunciations pr ON pr.pronunciationid = lp.pronunciationid 
    JOIN senses ON senses.wordid = words.wordid
    JOIN synsets ON synsets.synsetid = senses.synsetid
    JOIN domains ON domains.domainid = synsets.domainid
    GROUP BY words.wordid
  `;
  const rows = wndb.query(sql).all() as WordRow[];

  for (const row of rows) {
    console.log("adding word", row);
    // if (row.word === 'abrasive') throw new Error('stop right here');
    // Multi-word entries are expressions; there is no subtype yet, hence null.
    const kind = row.word.includes(" ") ? "expression" : "word";
    const insertedId = addWord(db, row.word, row.pronunciation, "en-us", kind, null);
    addCat(db, insertedId, row.domainname);
  }
}
// function saveLine(line: string) {
//   const [spelling, ipa] = line.split(/\s+/);
//   if (!spelling || !ipa) return;
//   const hasSymbols = spelling.match(SYMBOL_REGEX);
//   if (hasSymbols) return;
//   const isWord = checkWordNet(spelling);
//   console.log(spelling, isWord);
//   if (!isWord) return;
//   const split = spelling.split(' ');
//   const type = split.length > 1 ? 'expression' : 'word';
//   const subtype = null;
//   addWord(db, spelling, ipa, 'en-us', type, subtype);
// }

// function checkWordNet(word: string) {
//   const query = wndb.query(`SELECT * FROM words WHERE word = $word`);
//   const res = query.get({ $word: word });
//   return !!res;
// }

/**
 * Creates the first English lesson and attaches its cards.
 *
 * A lesson titled "First Lesson, some easy stuff" is created through
 * `addLesson`, then every phrase below is added to it, in order, through
 * `addCard`.
 */
function englishCards() {
  const phrases = [
    "I",
    "friend",
    "my friend",
    "you",
    "your friend",
    "my friends' friend",
    "you are my friend",
    "I am your friend",
    "your friend is my friend",
    "my friend is your friend",
    "he is my friend",
    "this is mine",
    "this is yours",
    "this is my friends'",
    "no",
    "you are not my friend",
    "this is not yours",
    "your friend is not my friend",
    "that is mine",
    "this is mine, that is yours",
    "he is not your friend",
    "no, I am not",
    "that is not me",
    "that is not mine, that is my friends'",
  ];

  const lessonId = addLesson(db, "First Lesson, some easy stuff");
  phrases.forEach((phrase) => {
    addCard(db, lessonId, phrase);
  });
}
// Seeding steps are toggled by hand: only the uncommented calls below run
// when this script is executed.
// NOTE(review): englishFreq() is async; when re-enabling it, await it (or
// attach a .catch) so a failure while reading the CSV is not silently lost.
// englishIPA();
// englishFreq();
englishCards();
m 2024-10-22 08:45:52 +00:00			`import { Database } from "bun:sqlite";`
			`import { addCard, addCat, addFrequency, addLesson, addWord } from "./db";`
m 2024-10-22 04:35:21 +00:00
			`// const db = new Database('../db/data.db');`
m 2024-10-22 08:45:52 +00:00			`const db = new Database("../db/data.db", { strict: true });`
			`const wndb = new Database("../datasets/en-wordnet/data.sqlite");`
			`db.exec("PRAGMA journal_mode = WAL;");`
m 2024-10-22 04:35:21 +00:00
			`const SYMBOL_REGEX = new RegExp(/[\W\d]/);`

			`// async function englishIPA() {`
			`// const file = Bun.file('ipa/en-us/ipadict.txt');`
			`// const s = file.stream();`
			`// const reader = s.getReader();`
			`// const decoder = new TextDecoder();`
			`// let leftover = '';`
			`// while (true) {`
			`// const { value, done } = await reader.read();`
			`// if (done) break;`
			`// const chunk = decoder.decode(value, { stream: true });`
			`// const lines = (leftover + chunk).split('\n');`

			`// // Process each line except the last (which might be incomplete)`
			`// for (const line of lines.slice(0, -1)) saveLine(line);`

			`// // Save the last incomplete line to process in the next iteration`
			`// leftover = lines[lines.length - 1];`
			`// }`

			`// // Handle any remaining content after reading all chunks`
			`// if (leftover) saveLine(leftover);`
			`// }`

			`async function englishFreq() {`
m 2024-10-22 08:45:52 +00:00			`const file = Bun.file("../datasets/unigram_freq.csv");`
m 2024-10-22 04:35:21 +00:00			`const s = file.stream();`
			`const reader = s.getReader();`
			`const decoder = new TextDecoder();`
m 2024-10-22 08:45:52 +00:00			`let leftover = "";`
m 2024-10-22 04:35:21 +00:00			`let lineCount = 0;`
			`while (true) {`
			`const { value, done } = await reader.read();`
			`if (done) break;`
			`const chunk = decoder.decode(value, { stream: true });`
m 2024-10-22 08:45:52 +00:00			`const lines = (leftover + chunk).split("\n");`
m 2024-10-22 04:35:21 +00:00
			`// Process each line except the last (which might be incomplete)`
			`for (const line of lines.slice(0, -1)) {`
			`lineCount++;`
m 2024-10-22 08:45:52 +00:00			`const [spelling, _frequency] = line.split(",");`
m 2024-10-22 04:35:21 +00:00			`addFrequency(db, spelling, lineCount);`
			`}`

			`// Save the last incomplete line to process in the next iteration`
			`leftover = lines[lines.length - 1];`
			`}`

			`// Handle any remaining content after reading all chunks`
			`if (leftover) addFrequency(db, leftover, lineCount + 1);`
			`}`
			`// TODO no conjunctions or adpositions in Wordnet!!`
			`function englishIPA() {`
			const queryString = `
			`SELECT words.wordid, word, pronunciation, domainname FROM words`
			`JOIN lexes_pronunciations lp ON lp.wordid = words.wordid`
			`JOIN pronunciations pr ON pr.pronunciationid = lp.pronunciationid`
			`JOIN senses ON senses.wordid = words.wordid`
			`JOIN synsets ON synsets.synsetid = senses.synsetid`
			`JOIN domains ON domains.domainid = synsets.domainid`
			`GROUP BY words.wordid`
			`;
			`const query = wndb.query(queryString);`
			`const res: Array<{`
			`word: string;`
			`pronunciation: string;`
			`domainname: string;`
			`}> = query.all() as any;`
			`for (const r of res) {`
m 2024-10-22 08:45:52 +00:00			`console.log("adding word", r);`
m 2024-10-22 04:35:21 +00:00			`// if (r.word === 'abrasive') throw new Error('stop right here');`
m 2024-10-22 08:45:52 +00:00			`const split = r.word.split(" ");`
			`const type = split.length > 1 ? "expression" : "word";`
m 2024-10-22 04:35:21 +00:00			`const subtype = null;`
m 2024-10-22 08:45:52 +00:00			`const wordid = addWord(db, r.word, r.pronunciation, "en-us", type, subtype);`
m 2024-10-22 04:35:21 +00:00			`addCat(db, wordid, r.domainname);`
			`}`
			`}`
			`// function saveLine(line: string) {`
			`// const [spelling, ipa] = line.split(/\s+/);`
			`// if (!spelling \|\| !ipa) return;`
			`// const hasSymbols = spelling.match(SYMBOL_REGEX);`
			`// if (hasSymbols) return;`
			`// const isWord = checkWordNet(spelling);`
			`// console.log(spelling, isWord);`
			`// if (!isWord) return;`
			`// const split = spelling.split(' ');`
			`// const type = split.length > 1 ? 'expression' : 'word';`
			`// const subtype = null;`
			`// addWord(db, spelling, ipa, 'en-us', type, subtype);`
			`// }`

			`// function checkWordNet(word: string) {`
			// const query = wndb.query(`SELECT * FROM words WHERE word = $word`);
			`// const res = query.get({ $word: word });`
			`// return !!res;`
			`// }`

			`function englishCards() {`
m 2024-10-22 08:45:52 +00:00			`const lesson_id = addLesson(db, "First Lesson, some easy stuff");`
m 2024-10-22 04:35:21 +00:00			`const texts = [`
m 2024-10-22 08:45:52 +00:00			`"I",`
			`"friend",`
			`"my friend",`
			`"you",`
			`"your friend",`
m 2024-10-22 04:35:21 +00:00			`"my friends' friend",`
m 2024-10-22 08:45:52 +00:00			`"you are my friend",`
			`"I am your friend",`
			`"your friend is my friend",`
			`"my friend is your friend",`
			`"he is my friend",`
			`"this is mine",`
			`"this is yours",`
m 2024-10-22 04:35:21 +00:00			`"this is my friends'",`
m 2024-10-22 08:45:52 +00:00			`"no",`
			`"you are not my friend",`
			`"this is not yours",`
			`"your friend is not my friend",`
			`"that is mine",`
			`"this is mine, that is yours",`
			`"he is not your friend",`
			`"no, I am not",`
			`"that is not me",`
m 2024-10-22 04:35:21 +00:00			`"that is not mine, that is my friends'",`
			`];`
			`for (const text of texts) {`
			`addCard(db, lesson_id, text);`
			`}`
			`}`
m 2024-10-22 08:45:52 +00:00			`// englishIPA();`
			`// englishFreq();`
m 2024-10-22 04:35:21 +00:00			`englishCards();`