hanchu/server/seeding.ts

import { Database } from "bun:sqlite";
import {
  addCard,
  addCat,
  addFrequency,
  addLesson,
  addWord,
  domains,
  poss,
} from "./db";

// Earlier form of the application-database handle, opened without options:
// const db = new Database('../db/data.db');
// Application database that every seeder in this file writes into.
// `strict: true` selects bun:sqlite's strict parameter-binding mode (see the Bun docs).
// NOTE(review): both paths are relative, so presumably the script has to be
// started from the server directory — confirm.
const db = new Database("../db/data.db", { strict: true });
// WordNet dataset; in this file it is only queried (by englishWordnet).
const wndb = new Database("../datasets/en-wordnet/data.sqlite");
// Put the application database into write-ahead-logging journal mode.
db.exec("PRAGMA journal_mode = WAL;");

/**
 * Matches any single character that is a digit or is not an ASCII word
 * character. Because `\W` is not Unicode-aware here, this also matches spaces,
 * apostrophes, hyphens and accented letters; the underscore does not match.
 */
const SYMBOL_REGEX = /[\W\d]/;

/**
 * Streams a text file chunk by chunk and calls `func` once per line.
 *
 * Lines are split on "\n"; a trailing "\r" is removed so that files with
 * Windows (CRLF) line endings yield the same lines as LF files. Empty lines
 * in the middle of the file are still passed to `func`; an empty final line
 * (i.e. the file ends with a newline) is not.
 *
 * @param filename Path handed to `Bun.file`.
 * @param func Callback receiving the line text (without its line terminator)
 *   and its 1-based line number. It is called synchronously; its return value
 *   is ignored.
 * @returns A promise that resolves once every line has been handed to `func`.
 */
async function handleFile(
  filename: string,
  func: (line: string, idx: number) => void,
): Promise<void> {
  const reader = Bun.file(filename).stream().getReader();
  const decoder = new TextDecoder();
  // Drop the "\r" that a CRLF line ending leaves behind after splitting on "\n".
  const stripCr = (text: string): string =>
    text.endsWith("\r") ? text.slice(0, -1) : text;

  let leftover = "";
  let lineCount = 0;

  while (true) {
    const { value, done } = await reader.read();
    if (done) break;

    // `stream: true` keeps a multi-byte character that straddles two chunks
    // buffered inside the decoder instead of emitting a replacement character.
    const lines = (leftover + decoder.decode(value, { stream: true })).split(
      "\n",
    );

    // The last piece may be an incomplete line; carry it into the next chunk.
    leftover = lines.pop() ?? "";

    for (const line of lines) {
      lineCount++;
      func(stripCr(line), lineCount);
    }
  }

  // Flush whatever the decoder is still holding, then emit the final line if
  // the file did not end with a newline.
  const tail = stripCr(leftover + decoder.decode());
  if (tail) func(tail, lineCount + 1);
}

/**
 * Tells whether a part-of-speech tag is one of the tags englishKaggle imports.
 *
 * NOTE(review): the accepted tags look like Penn Treebank tags for
 * closed-class (function) words — confirm against the dataset's tag set.
 *
 * @param pos Tag exactly as it appears in the data (case-sensitive).
 * @returns `true` for an accepted tag, `false` for anything else.
 */
function goodPos(pos: string): boolean {
  switch (pos) {
    case "CC":
    case "DT":
    case "EX":
    case "IN":
    case "LS":
    case "MD":
    case "PDT":
    case "POS":
    case "PRP":
    case "PRP$":
    case "RP":
    case "TO":
    case "WDT":
    case "WP":
    case "WP$":
      return true;
    default:
      return false;
  }
}
/**
 * Imports words from `../datasets/words_pos.csv` into the application
 * database, keeping only rows whose part-of-speech tag passes `goodPos`.
 *
 * Each row is split on commas: the first column is ignored, the second is the
 * spelling and the third is the tag. Accepted words are inserted with an empty
 * IPA string, language "en-us" and type "word", then tagged with the category
 * that `poss` maps the tag to.
 *
 * @returns A promise that settles when the whole file has been processed, so
 *   callers can await completion and observe read or insert failures.
 */
async function englishKaggle(): Promise<void> {
  await handleFile("../datasets/words_pos.csv", (line) => {
    const [, spelling, rawPos] = line.split(",");
    // Trim so a stray "\r" (CRLF file) or padding cannot hide a valid tag.
    const pos = rawPos?.trim();
    // Rows with fewer than three columns have no tag and are skipped.
    if (spelling === undefined || pos === undefined || !goodPos(pos)) return;
    const rowid = addWord(db, spelling, "", "en-us", "word", null);
    // NOTE(review): the fallback literal really is "unknown;" (semicolon
    // inside the string) — looks like a typo, but it is stored data; confirm
    // before changing it.
    const category = poss[pos] || "unknown;";
    addCat(db, rowid, category);
  });
}
/**
 * Imports spellings and their IPA transcription from `ipa/en-us/ipadict.txt`.
 *
 * Each line is split on runs of whitespace: the first token is the spelling,
 * the second the transcription, and any further tokens are ignored. Lines
 * missing either token, and spellings that match `SYMBOL_REGEX`, are skipped.
 *
 * @returns A promise that settles when the whole file has been processed.
 */
async function englishIPA(): Promise<void> {
  await handleFile("ipa/en-us/ipadict.txt", (line) => {
    const [spelling, ipa] = line.split(/\s+/);
    if (!spelling || !ipa) return;
    if (SYMBOL_REGEX.test(spelling)) return;
    // A token produced by splitting on whitespace cannot contain a space, so
    // every entry from this file is a single "word", never an "expression".
    addWord(db, spelling, ipa, "en-us", "word", null);
  });
}

/**
 * Records a frequency value for every word in `../datasets/unigram_freq.csv`.
 *
 * The value passed to `addFrequency` is the 1-based line number of the word,
 * not the count stored in the file's second column.
 * NOTE(review): this presumably relies on the file being sorted by descending
 * frequency, and a header row (if the file has one) is passed through like
 * any other line — confirm both against the dataset.
 *
 * @returns A promise that settles when the whole file has been processed.
 */
async function englishFreq(): Promise<void> {
  await handleFile("../datasets/unigram_freq.csv", (line, idx) => {
    const [spelling] = line.split(",");
    // Blank lines carry no word; skip them.
    if (!spelling) return;
    addFrequency(db, spelling, idx);
  });
}
// TODO no conjunctions or adpositions in Wordnet!!
/**
 * Copies words from the WordNet database (`wndb`) into the application
 * database (`db`): one `addWord` call per accepted word, followed by an
 * `addCat` call with the category that `domains` maps its domain name to.
 *
 * The query returns one row per `words.wordid` with:
 *  - `ipa`: the pronunciation ranked first for that word, preferring variety
 *    'US', then a NULL variety, then 'GB', then anything else;
 *  - `domainname`: taken from the word's senses/synsets/domains joins.
 *
 * NOTE(review): `domainname` is a bare column under `GROUP BY words.wordid`,
 * so for a word with several senses SQLite picks the value from one of the
 * grouped rows — confirm that an arbitrary domain is acceptable.
 * NOTE(review): every join is a LEFT JOIN, so `ipa` and `domainname` can be
 * null at runtime even though the row type below declares them as `string`;
 * the `as any` cast hides this. `wordid` is selected but not declared.
 */
function englishWordnet() {
  // Earlier attempt at joining the GB pronunciation separately, kept for reference:
  // LEFT JOIN lexes_pronunciations ukpr ON ukpr.wordid = words.wordid AND uspr.variety = 'GB'
  // LEFT JOIN pronunciations ukp ON ukp.pronunciationid = ukpr.pronunciationid
  const queryString = `
    WITH ranked_ipa  AS (
      SELECT
        lp.wordid,
        pr.pronunciation,
        lp.variety,
        ROW_NUMBER() OVER (
          PARTITION BY lp.wordid
          ORDER BY
              CASE
                  WHEN lp.variety = 'US' THEN 1
                  WHEN lp.variety IS NULL THEN 2
                  WHEN lp.variety IS 'GB' THEN 3
                  ELSE 4
              END
        ) AS rank
        FROM lexes_pronunciations lp
        JOIN pronunciations pr ON pr.pronunciationid = lp.pronunciationid
    )
    SELECT words.wordid, word, rp.pronunciation as ipa, domainname 
    FROM words
    LEFT JOIN ranked_ipa rp ON rp.wordid = words.wordid AND rp.rank = 1
    LEFT JOIN senses ON senses.wordid = words.wordid
    LEFT JOIN synsets ON synsets.synsetid = senses.synsetid
    LEFT JOIN domains ON domains.domainid = synsets.domainid
    GROUP BY words.wordid
  `;
  const query = wndb.query(queryString);
  // The whole result set is loaded into memory at once.
  const res: Array<{
    word: string;
    ipa: string;
    domainname: string;
  }> = query.all() as any;
  console.log("res", res.length);
  for (const r of res) {
    // Debug output: prints every row before it is filtered.
    console.log(r, "r");
    // if (r.word === 'abrasive') throw new Error('stop right here');
    // Skip words containing digits or non-word characters.
    const ok = filterWord(r.word);
    if (!ok) continue;
    // NOTE(review): filterWord rejects any word containing a space (a space
    // matches SYMBOL_REGEX), so `type` can only ever be "word" here and
    // multi-word expressions are never inserted — confirm this is intended.
    const split = r.word.split(" ");
    const type = split.length > 1 ? "expression" : "word";
    const subtype = null;
    const wordid = addWord(db, r.word, r.ipa, "en-us", type, subtype);
    // NOTE(review): the fallback literal is "unknown;" with the semicolon
    // inside the string — looks like a typo; confirm before changing it.
    const category = domains[r.domainname] || "unknown;";
    addCat(db, wordid, category);
  }
}
/**
 * Decides whether a spelling is clean enough to import.
 *
 * @param s Candidate spelling.
 * @returns `false` when `s` contains any character matched by `SYMBOL_REGEX`,
 *   `true` otherwise (including for the empty string).
 */
function filterWord(s: string) {
  return !SYMBOL_REGEX.test(s);
}

// function checkWordNet(word: string) {
//   const query = wndb.query(`SELECT * FROM words WHERE word = $word`);
//   const res = query.get({ $word: word });
//   return !!res;
// }

/**
 * Creates the lesson "First Lesson, some easy stuff" and adds one card per
 * phrase below to it, in the order listed.
 */
function englishCards() {
  const phrases = [
    "I",
    "friend",
    "my friend",
    "you",
    "your friend",
    "my friends' friend",
    "you are my friend",
    "I am your friend",
    "your friend is my friend",
    "my friend is your friend",
    "he is my friend",
    "this is mine",
    "this is yours",
    "this is my friends'",
    "no",
    "you are not my friend",
    "this is not yours",
    "your friend is not my friend",
    "that is mine",
    "this is mine, that is yours",
    "he is not your friend",
    "no, I am not",
    "that is not me",
    "that is not mine, that is my friends'",
  ];

  // Every card is attached to the lesson created here.
  const lessonId = addLesson(db, "First Lesson, some easy stuff");
  phrases.forEach((phrase) => {
    addCard(db, lessonId, phrase);
  });
}
// Manual switchboard: the seeder left uncommented below runs when this file
// is executed. Only englishKaggle is currently enabled.
// englishWordnet();
// englishFreq();
// englishCards();
englishKaggle();
m 2024-10-22 08:45:52 +00:00			`import { Database } from "bun:sqlite";`
m 2024-10-23 16:54:41 +00:00			`import {`
			`addCard,`
			`addCat,`
			`addFrequency,`
			`addLesson,`
			`addWord,`
			`domains,`
			`poss,`
			`} from "./db";`
m 2024-10-22 04:35:21 +00:00
			`// const db = new Database('../db/data.db');`
m 2024-10-22 08:45:52 +00:00			`const db = new Database("../db/data.db", { strict: true });`
			`const wndb = new Database("../datasets/en-wordnet/data.sqlite");`
			`db.exec("PRAGMA journal_mode = WAL;");`
m 2024-10-22 04:35:21 +00:00
			`const SYMBOL_REGEX = new RegExp(/[\W\d]/);`

m 2024-10-23 16:54:41 +00:00			`async function handleFile(`
			`filename: string,`
			`func: (line: string, idx: number) => void,`
			`) {`
			`const file = Bun.file(filename);`
m 2024-10-22 04:35:21 +00:00			`const s = file.stream();`
			`const reader = s.getReader();`
			`const decoder = new TextDecoder();`
m 2024-10-22 08:45:52 +00:00			`let leftover = "";`
m 2024-10-22 04:35:21 +00:00			`let lineCount = 0;`
			`while (true) {`
			`const { value, done } = await reader.read();`
			`if (done) break;`
			`const chunk = decoder.decode(value, { stream: true });`
m 2024-10-22 08:45:52 +00:00			`const lines = (leftover + chunk).split("\n");`
m 2024-10-22 04:35:21 +00:00
			`// Process each line except the last (which might be incomplete)`
			`for (const line of lines.slice(0, -1)) {`
			`lineCount++;`
m 2024-10-23 16:54:41 +00:00			`func(line, lineCount);`
m 2024-10-22 04:35:21 +00:00			`}`

			`// Save the last incomplete line to process in the next iteration`
			`leftover = lines[lines.length - 1];`
			`}`

			`// Handle any remaining content after reading all chunks`
m 2024-10-23 16:54:41 +00:00			`if (leftover) func(leftover, lineCount + 1);`
			`}`

			`function goodPos(pos: string): boolean {`
			`const list = [`
			`"CC",`
			`"DT",`
			`"EX",`
			`"IN",`
			`"LS",`
			`"MD",`
			`"PDT",`
			`"POS",`
			`"PRP",`
			`"PRP$",`
			`"RP",`
			`"TO",`
			`"WDT",`
			`"WP",`
			`"WP$",`
			`];`
			`return list.includes(pos);`
			`}`
			`function englishKaggle() {`
			`handleFile("../datasets/words_pos.csv", (line, idx) => {`
			`const [_, spelling, pos] = line.split(",");`
			`if (!goodPos(pos)) return;`
			`const rowid = addWord(db, spelling, "", "en-us", "word", null);`
			`const category = poss[pos] \|\| "unknown;";`
			`addCat(db, rowid, category);`
			`});`
			`}`
			`async function englishIPA() {`
			`handleFile("ipa/en-us/ipadict.txt", (line, idx) => {`
			`const [spelling, ipa] = line.split(/\s+/);`
			`if (!spelling \|\| !ipa) return;`
			`const hasSymbols = spelling.match(SYMBOL_REGEX);`
			`if (hasSymbols) return;`
			`const split = spelling.split(" ");`
			`const type = split.length > 1 ? "expression" : "word";`
			`const subtype = null;`
			`addWord(db, spelling, ipa, "en-us", type, subtype);`
			`});`
			`}`

			`async function englishFreq() {`
			`handleFile("../datasets/unigram_freq.csv", (line, idx) => {`
			`const [spelling, _frequency] = line.split(",");`
			`addFrequency(db, spelling, idx);`
			`});`

			`// Save the last incomplete line to process in the next iteration`
m 2024-10-22 04:35:21 +00:00			`}`
			`// TODO no conjunctions or adpositions in Wordnet!!`
m 2024-10-23 16:54:41 +00:00			`function englishWordnet() {`
			`// LEFT JOIN lexes_pronunciations ukpr ON ukpr.wordid = words.wordid AND uspr.variety = 'GB'`
			`// LEFT JOIN pronunciations ukp ON ukp.pronunciationid = ukpr.pronunciationid`
m 2024-10-22 04:35:21 +00:00			const queryString = `
m 2024-10-23 16:54:41 +00:00			`WITH ranked_ipa AS (`
			`SELECT`
			`lp.wordid,`
			`pr.pronunciation,`
			`lp.variety,`
			`ROW_NUMBER() OVER (`
			`PARTITION BY lp.wordid`
			`ORDER BY`
			`CASE`
			`WHEN lp.variety = 'US' THEN 1`
			`WHEN lp.variety IS NULL THEN 2`
			`WHEN lp.variety IS 'GB' THEN 3`
			`ELSE 4`
			`END`
			`) AS rank`
			`FROM lexes_pronunciations lp`
			`JOIN pronunciations pr ON pr.pronunciationid = lp.pronunciationid`
			`)`
			`SELECT words.wordid, word, rp.pronunciation as ipa, domainname`
			`FROM words`
			`LEFT JOIN ranked_ipa rp ON rp.wordid = words.wordid AND rp.rank = 1`
			`LEFT JOIN senses ON senses.wordid = words.wordid`
			`LEFT JOIN synsets ON synsets.synsetid = senses.synsetid`
			`LEFT JOIN domains ON domains.domainid = synsets.domainid`
m 2024-10-22 04:35:21 +00:00			`GROUP BY words.wordid`
			`;
			`const query = wndb.query(queryString);`
			`const res: Array<{`
			`word: string;`
m 2024-10-23 16:54:41 +00:00			`ipa: string;`
m 2024-10-22 04:35:21 +00:00			`domainname: string;`
			`}> = query.all() as any;`
m 2024-10-23 16:54:41 +00:00			`console.log("res", res.length);`
m 2024-10-22 04:35:21 +00:00			`for (const r of res) {`
m 2024-10-23 16:54:41 +00:00			`console.log(r, "r");`
m 2024-10-22 04:35:21 +00:00			`// if (r.word === 'abrasive') throw new Error('stop right here');`
m 2024-10-23 16:54:41 +00:00			`const ok = filterWord(r.word);`
			`if (!ok) continue;`
m 2024-10-22 08:45:52 +00:00			`const split = r.word.split(" ");`
			`const type = split.length > 1 ? "expression" : "word";`
m 2024-10-22 04:35:21 +00:00			`const subtype = null;`
m 2024-10-23 16:54:41 +00:00			`const wordid = addWord(db, r.word, r.ipa, "en-us", type, subtype);`
			`const category = domains[r.domainname] \|\| "unknown;";`
			`addCat(db, wordid, category);`
m 2024-10-22 04:35:21 +00:00			`}`
			`}`
m 2024-10-23 16:54:41 +00:00			`function filterWord(s: string) {`
			`const hasSymbols = s.match(SYMBOL_REGEX);`
			`if (hasSymbols) return false;`
			`else return true;`
			`}`
m 2024-10-22 04:35:21 +00:00
			`// function checkWordNet(word: string) {`
			// const query = wndb.query(`SELECT * FROM words WHERE word = $word`);
			`// const res = query.get({ $word: word });`
			`// return !!res;`
			`// }`

			`function englishCards() {`
m 2024-10-22 08:45:52 +00:00			`const lesson_id = addLesson(db, "First Lesson, some easy stuff");`
m 2024-10-22 04:35:21 +00:00			`const texts = [`
m 2024-10-22 08:45:52 +00:00			`"I",`
			`"friend",`
			`"my friend",`
			`"you",`
			`"your friend",`
m 2024-10-22 04:35:21 +00:00			`"my friends' friend",`
m 2024-10-22 08:45:52 +00:00			`"you are my friend",`
			`"I am your friend",`
			`"your friend is my friend",`
			`"my friend is your friend",`
			`"he is my friend",`
			`"this is mine",`
			`"this is yours",`
m 2024-10-22 04:35:21 +00:00			`"this is my friends'",`
m 2024-10-22 08:45:52 +00:00			`"no",`
			`"you are not my friend",`
			`"this is not yours",`
			`"your friend is not my friend",`
			`"that is mine",`
			`"this is mine, that is yours",`
			`"he is not your friend",`
			`"no, I am not",`
			`"that is not me",`
m 2024-10-22 04:35:21 +00:00			`"that is not mine, that is my friends'",`
			`];`
			`for (const text of texts) {`
			`addCard(db, lesson_id, text);`
			`}`
			`}`
m 2024-10-23 16:54:41 +00:00			`// englishWordnet();`
m 2024-10-22 08:45:52 +00:00			`// englishFreq();`
m 2024-10-23 16:54:41 +00:00			`// englishCards();`
			`englishKaggle();`