m

2024-10-23 23:54:41 +07:00 · 2024-10-23 23:54:41 +07:00 · d0bcc2a81c
commit d0bcc2a81c
parent 9bbb3b3cfa
5 changed files with 218 additions and 99 deletions
--- a/schema.sql
+++ b/schema.sql
@ -6,11 +6,12 @@ PRAGMA mmap_size = 30000000000;
 -- Words table
 -- TODO restore a separate words table?
 CREATE TABLE expressions(
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    spelling TEXT NOT NULL,
    ipa      TEXT NOT NULL,
    language_id INTEGER NOT NULL,
    ipa      TEXT,
    frequency INTEGER,
    type TEXT NOT NULL,
    subtype TEXT,
@ -95,6 +96,9 @@ INSERT INTO categories (name, part_of_speech_id) VALUES
 ('nominative', 5),
 ('accusative', 5),
 ('genitive', 5),
 ('interrogative', 5),
 -- not really a pronoun but whatever
 ('determiner', 5),
 -- adpositions
 ('preposition', 6),
 ('postposition', 6),
--- a/server/db.ts
+++ b/server/db.ts
@ -27,7 +27,7 @@ export function fetchFrequent(db: Database, count: number, page: number) {
      spelling,
      ipa,
      frequency,
-      GROUP_CONCAT(c.name, ',') AS category,
+      GROUP_CONCAT(c.name, ',') AS category
    FROM expressions e
    JOIN word_categories wc ON wc.word_id = e.id
    JOIN categories c ON c.id = wc.category_id
@ -37,37 +37,66 @@ export function fetchFrequent(db: Database, count: number, page: number) {
    `);
  return query.get({ count, offset });
 }
 /**
  * Lists the expressions attached to a single card.
  *
  * Joins `cards_expressions` to `expressions`, keeps rows whose spelling is
  * not NULL, and orders them by `frequency` descending.
  *
  * @param db  Open database handle; only its `query` method is used here.
  * @param cid Card id bound to the `$cid` placeholder.
  * @returns Whatever the prepared statement's `all` call yields; each row
  *          carries the selected columns `spelling`, `eid` and `ipa`.
  */
 export function fetchExpressionsByCard(db: Database, cid: number) {
  const statement = db.query(`
    SELECT 
      e.spelling, e.id as eid, e.ipa
    FROM cards_expressions ce
    JOIN expressions e ON ce.expression_id = e.id
    WHERE ce.card_id = $cid AND e.spelling IS NOT NULL
    ORDER BY e.frequency DESC
    `);
  return statement.all({ cid });
 }
 export function fetchLessons(db: Database, count: number, page: number) {
  const p = page < 1 ? 1 : page;
  const offset = (p - 1) * count;
  const queryString = `
    SELECT 
      l.id, l.text as ltext, cards.text as ctext, cards.note as cnote, cards.id as cid,
      e.spelling, e.ipa, e.frequency, e.id as eid,
      GROUP_CONCAT(cg.name, ',') AS category
    FROM expressions e 
    JOIN cards_expressions ce ON e.id = ce.expression_id
    JOIN cards ON cards.id = cl.card_id
    JOIN cards_lessons cl ON cl.card_id = cards.id
    JOIN lessons l ON l.id = cl.lesson_id
    JOIN expressions e ON e.id = ce.expression_id
    JOIN word_categories wc ON wc.word_id = e.id
    JOIN categories cg ON cg.id = wc.category_id
    LIMIT $count
    OFFSET $offset
    `;
  // const queryString = `
  //   SELECT
  //     l.id, l.text as ltext, cards.text as ctext, cards.note as cnote, cards.id as cid
  //   FROM cards_lessons cl
  //   JOIN lessons l ON l.id = cl.lesson_id
  //   JOIN cards ON cards.id = cl.card_id
  //   JOIN lessons l ON l.id = cl.lesson_id
  //   LIMIT $count
  //   OFFSET $offset
  //   `;
  const queryString = `
    SELECT 
        l.id AS lesson_id, 
        l.text AS lesson_text, 
        c.id AS card_id, 
        c.text AS card_text, 
        c.note AS card_note, 
        e.id AS expression_id, 
        e.spelling AS expression_spelling, 
        e.ipa AS expression_ipa, 
        e.type AS expression_type, 
        e.subtype AS expression_subtype,
        GROUP_CONCAT(cat.name, ', ') AS categories
    FROM 
        lessons l
    JOIN 
        cards_lessons cl ON l.id = cl.lesson_id
    JOIN 
        cards c ON c.id = cl.card_id
    JOIN 
        cards_expressions ce ON c.id = ce.card_id
    JOIN 
        expressions e ON e.id = ce.expression_id
    LEFT JOIN 
        word_categories wc ON wc.word_id = e.id
    LEFT JOIN 
        categories cat ON cat.id = wc.category_id
    GROUP BY 
        l.id, c.id, e.id
    ORDER BY 
        l.id ASC, c.id ASC, e.id ASC
    LIMIT $count OFFSET $offset;
    `;
  const query = db.query(queryString);
-  const res = query.all({ count, offset });
+  return query.all({ count, offset });
  return res;
 }
 // SELECT l.id, l.text, cards.text, cards.note FROM cards_lessons cl LEFT JOIN lessons l ON l.id = cl.lesson_id LEFT JOIN cards ON cards.id = cl.card_id ORDER BY l.id ASC LIMIT 20 OFFSET 0;
@ -86,7 +115,6 @@ export function fetchLesson(db: Database, lesson: number) {
    JOIN categories cg ON cg.id = wc.category_id
    WHERE l.id = $lesson 
    `;
  console.log(queryString);
  const query = db.query(queryString);
  return query.all({ lesson });
 }
@ -137,6 +165,7 @@ export function addCard(
    ))
    `);
  const wtr = db.transaction((pairs) => {
    // console.log("adding to ce", { pairs, cid, text });
    for (const pair of pairs) wquery.run(pair);
  });
  const words = text
@ -145,7 +174,7 @@ export function addCard(
    .trim()
    .split(" ");
  const combinations = wordFactorial(words);
-  const richWords = combinations.map((spelling) => {
+  const richWords = Array.from(combinations).map((spelling) => {
    return { spelling, cid };
  });
  wtr(richWords);
@ -187,7 +216,11 @@ export function addWord(
  const res = query.run({ spelling, ipa, language, type, subtype });
  return res.lastInsertRowid;
 }
-export function addCat(db: Database, wordId: number | bigint, domain: string) {
+export function addCat(
  db: Database,
  wordId: number | bigint,
  category: string,
 ) {
  const queryString = `
  INSERT
  INTO word_categories(word_id, category_id)
@ -196,13 +229,23 @@ export function addCat(db: Database, wordId: number | bigint, domain: string) {
      WHERE name = $category
  ))
  `;
  const category = domains[domain] || "unknown";
  const query = db.query(queryString);
  const res = query.run({ wordId, category });
  return res.lastInsertRowid;
 }
-const domains: Record<string, string> = {
+export const poss: Record<string, string> = {
  CC: "conjunction",
  DT: "determiner",
  IN: "preposition",
  MD: "auxiliar",
  PRP: "nominative", // TODO oi
  PRP$: "gemitive",
  WDT: "determiner",
  WP: "interrogative",
  WP$: "interrogative",
 };
 export const domains: Record<string, string> = {
  "adj.all": "adjective",
  "adj.pert": "adjective",
  "adj.ppl": "adjective",
@ -261,5 +304,13 @@ export function addFrequency(
  `;
  const query = db.query(queryString);
  const res = query.run({ spelling, frequency });
-  console.log(res, "added frequency");
+}
 /**
  * Sets the `ipa` column on every `expressions` row whose spelling equals
  * the given spelling.
  *
  * @param db       Open database handle; only its `query` method is used here.
  * @param spelling Value bound to `$spelling` in the WHERE clause.
  * @param ipa      Value bound to `$ipa` and written to the `ipa` column.
  * @returns Nothing; the result of the statement's `run` call is discarded.
  */
 export function addIPA(db: Database, spelling: string, ipa: string) {
  const statement = db.query(`
  UPDATE expressions
  SET ipa= $ipa
  WHERE expressions.spelling = $spelling
  `);
  statement.run({ spelling, ipa });
 }
--- a/server/seeding.ts
+++ b/server/seeding.ts
@ -1,5 +1,13 @@
 import { Database } from "bun:sqlite";
-import { addCard, addCat, addFrequency, addLesson, addWord } from "./db";
+import {
  addCard,
  addCat,
  addFrequency,
  addLesson,
  addWord,
  domains,
  poss,
 } from "./db";
 // const db = new Database('../db/data.db');
 const db = new Database("../db/data.db", { strict: true });
@ -8,31 +16,11 @@ db.exec("PRAGMA journal_mode = WAL;");
 // Matches any single character that is a digit or a non-word character
 // (i.e. anything other than an ASCII letter or underscore).
 const SYMBOL_REGEX = /[\W\d]/;
-// async function englishIPA() {
+async function handleFile(
-//   const file = Bun.file('ipa/en-us/ipadict.txt');
+  filename: string,
-//   const s = file.stream();
+  func: (line: string, idx: number) => void,
-//   const reader = s.getReader();
+) {
-//   const decoder = new TextDecoder();
+  const file = Bun.file(filename);
 //   let leftover = '';
 //   while (true) {
 //     const { value, done } = await reader.read();
 //     if (done) break;
 //     const chunk = decoder.decode(value, { stream: true });
 //     const lines = (leftover + chunk).split('\n');
 //     // Process each line except the last (which might be incomplete)
 //     for (const line of lines.slice(0, -1)) saveLine(line);
 //     // Save the last incomplete line to process in the next iteration
 //     leftover = lines[lines.length - 1];
 //   }
 //   // Handle any remaining content after reading all chunks
 //   if (leftover) saveLine(leftover);
 // }
 async function englishFreq() {
  const file = Bun.file("../datasets/unigram_freq.csv");
  const s = file.stream();
  const reader = s.getReader();
  const decoder = new TextDecoder();
@ -47,8 +35,7 @@ async function englishFreq() {
    // Process each line except the last (which might be incomplete)
    for (const line of lines.slice(0, -1)) {
      lineCount++;
-      const [spelling, _frequency] = line.split(",");
+      func(line, lineCount);
      addFrequency(db, spelling, lineCount);
    }
    // Save the last incomplete line to process in the next iteration
@ -56,48 +43,115 @@ async function englishFreq() {
  }
  // Handle any remaining content after reading all chunks
-  if (leftover) addFrequency(db, leftover, lineCount + 1);
+  if (leftover) func(leftover, lineCount + 1);
 }
 /**
  * Tells whether a part-of-speech tag is on the accepted list.
  *
  * The comparison is exact and case-sensitive.
  *
  * @param pos Tag to check.
  * @returns `true` for one of the fifteen listed tags, `false` otherwise.
  */
 function goodPos(pos: string): boolean {
  switch (pos) {
    case "CC":
    case "DT":
    case "EX":
    case "IN":
    case "LS":
    case "MD":
    case "PDT":
    case "POS":
    case "PRP":
    case "PRP$":
    case "RP":
    case "TO":
    case "WDT":
    case "WP":
    case "WP$":
      return true;
    default:
      return false;
  }
 }
 /**
  * Seeds words from `../datasets/words_pos.csv`.
  *
  * Each line is split on commas; the second field is taken as the spelling
  * and the third as the part-of-speech tag. Lines whose tag is rejected by
  * `goodPos` are skipped. Accepted words are inserted with an empty IPA
  * string, language "en-us" and type "word", then linked to the category
  * that `poss` maps the tag to.
  *
  * @returns The value returned by `handleFile`, so callers can await the end
  *          of the import or observe a failure instead of it being dropped.
  */
 function englishKaggle() {
  return handleFile("../datasets/words_pos.csv", (line) => {
    const [, spelling, pos] = line.split(",");
    if (!goodPos(pos)) return;
    const rowid = addWord(db, spelling, "", "en-us", "word", null);
    // Fallback is plain "unknown" (no trailing semicolon) so the name can
    // match a category row — assumes such a row exists; confirm in schema.
    const category = poss[pos] ?? "unknown";
    addCat(db, rowid, category);
  });
 }
 /**
  * Seeds expressions with pronunciations from `ipa/en-us/ipadict.txt`.
  *
  * Each line is split on runs of whitespace; the first token is the spelling
  * and the second the IPA string. Lines missing either token, or whose
  * spelling matches `SYMBOL_REGEX`, are skipped.
  *
  * The returned promise now settles only after `handleFile` has finished,
  * so callers can await completion and see failures.
  *
  * NOTE(review): because the spelling is a whitespace-delimited token it can
  * never contain a space, so `type` is always "word" here; the check is kept
  * to preserve the original intent.
  */
 async function englishIPA() {
  await handleFile("ipa/en-us/ipadict.txt", (line) => {
    const [spelling, ipa] = line.split(/\s+/);
    if (!spelling || !ipa) return;
    if (SYMBOL_REGEX.test(spelling)) return;
    const type = spelling.includes(" ") ? "expression" : "word";
    addWord(db, spelling, ipa, "en-us", type, null);
  });
 }
 /**
  * Applies frequency information from `../datasets/unigram_freq.csv`.
  *
  * For each line, the first comma-separated field is used as the spelling
  * and the line index supplied by `handleFile` is stored via `addFrequency`;
  * the second CSV column is not read.
  *
  * The returned promise now settles only after `handleFile` has finished,
  * so callers can await completion and see failures.
  */
 async function englishFreq() {
  await handleFile("../datasets/unigram_freq.csv", (line, idx) => {
    const [spelling] = line.split(",");
    addFrequency(db, spelling, idx);
  });
 }
 // TODO no conjunctions or adpositions in Wordnet!!
-function englishIPA() {
+function englishWordnet() {
  // LEFT JOIN lexes_pronunciations ukpr ON ukpr.wordid = words.wordid AND uspr.variety = 'GB'
  // LEFT JOIN pronunciations ukp ON ukp.pronunciationid = ukpr.pronunciationid
  const queryString = `
-    SELECT words.wordid, word, pronunciation, domainname FROM words
+    WITH ranked_ipa  AS (
-    JOIN lexes_pronunciations lp ON lp.wordid = words.wordid 
+      SELECT
-    JOIN pronunciations pr ON pr.pronunciationid = lp.pronunciationid 
+        lp.wordid,
-    JOIN senses ON senses.wordid = words.wordid
+        pr.pronunciation,
-    JOIN synsets ON synsets.synsetid = senses.synsetid
+        lp.variety,
-    JOIN domains ON domains.domainid = synsets.domainid
+        ROW_NUMBER() OVER (
          PARTITION BY lp.wordid
          ORDER BY
              CASE
                  WHEN lp.variety = 'US' THEN 1
                  WHEN lp.variety IS NULL THEN 2
                  WHEN lp.variety IS 'GB' THEN 3
                  ELSE 4
              END
        ) AS rank
        FROM lexes_pronunciations lp
        JOIN pronunciations pr ON pr.pronunciationid = lp.pronunciationid
    )
    SELECT words.wordid, word, rp.pronunciation as ipa, domainname 
    FROM words
    LEFT JOIN ranked_ipa rp ON rp.wordid = words.wordid AND rp.rank = 1
    LEFT JOIN senses ON senses.wordid = words.wordid
    LEFT JOIN synsets ON synsets.synsetid = senses.synsetid
    LEFT JOIN domains ON domains.domainid = synsets.domainid
    GROUP BY words.wordid
  `;
  const query = wndb.query(queryString);
  const res: Array<{
    word: string;
-    pronunciation: string;
+    ipa: string;
    domainname: string;
  }> = query.all() as any;
  console.log("res", res.length);
  for (const r of res) {
-    console.log("adding word", r);
+    console.log(r, "r");
    // if (r.word === 'abrasive') throw new Error('stop right here');
    const ok = filterWord(r.word);
    if (!ok) continue;
    const split = r.word.split(" ");
    const type = split.length > 1 ? "expression" : "word";
    const subtype = null;
-    const wordid = addWord(db, r.word, r.pronunciation, "en-us", type, subtype);
+    const wordid = addWord(db, r.word, r.ipa, "en-us", type, subtype);
-    addCat(db, wordid, r.domainname);
+    const category = domains[r.domainname] || "unknown;";
    addCat(db, wordid, category);
  }
 }
-// function saveLine(line: string) {
+function filterWord(s: string) {
-//   const [spelling, ipa] = line.split(/\s+/);
+  const hasSymbols = s.match(SYMBOL_REGEX);
-//   if (!spelling || !ipa) return;
+  if (hasSymbols) return false;
-//   const hasSymbols = spelling.match(SYMBOL_REGEX);
+  else return true;
-//   if (hasSymbols) return;
+}
 //   const isWord = checkWordNet(spelling);
 //   console.log(spelling, isWord);
 //   if (!isWord) return;
 //   const split = spelling.split(' ');
 //   const type = split.length > 1 ? 'expression' : 'word';
 //   const subtype = null;
 //   addWord(db, spelling, ipa, 'en-us', type, subtype);
 // }
 // function checkWordNet(word: string) {
 //   const query = wndb.query(`SELECT * FROM words WHERE word = $word`);
@ -137,6 +191,7 @@ function englishCards() {
    addCard(db, lesson_id, text);
  }
 }
-// englishIPA();
+// englishWordnet();
 // englishFreq();
-englishCards();
+// englishCards();
 englishKaggle();
--- a/server/server.ts
+++ b/server/server.ts
@ -2,6 +2,7 @@ import { Database } from "bun:sqlite";
 import {
  addUser,
  fetchCard,
  fetchExpressionsByCard,
  fetchLesson,
  fetchLessons,
  fetchResource,
@ -58,7 +59,12 @@ type LessonsType = Record<
  {
    id: number;
    text: string;
-    cards: Array<{ text: string; note: string | null; id: number }>;
+    cards: Array<{
      text: string;
      note: string | null;
      id: number;
      words: Array<{ spelling: string; ipa: string; category: string }>;
    }>;
  }
 >;
 type LessonsDBType = {
@ -85,17 +91,20 @@ function handleGetLessons(user: number, url: URL) {
  const page = params.get("page") || "0";
  const data: LessonsDBType[] = fetchLessons(db, 20, Number(page)) as any;
  console.log(data, "fetchlessons");
-  const lessons = data.reduce((acc: LessonsType, item: LessonsDBType) => {
+  console.log(data.length);
-    let cur = acc[item.id] || { id: item.id, text: item.ltext, cards: [] };
+  // const lessons = data.reduce((acc: LessonsType, item: LessonsDBType) => {
-    const cards = [
+  //   let cur = acc[item.id] || { id: item.id, text: item.ltext, cards: [] };
-      ...cur.cards,
+  //   const words = fetchExpressionsByCard(db, item.cid) as any[];
-      { text: item.ctext, note: item.cnote, id: item.cid },
+  //   console.log(words, item.cid);
-    ];
+  //   const cards = [
-    const def = { ...cur, cards };
+  //     ...cur.cards,
-    return { ...acc, [item.id]: def };
+  //     { text: item.ctext, note: item.cnote, id: item.cid, words },
-  }, {} as LessonsType);
+  //   ];
-  console.log(lessons, "lesons");
+  //   const def = { ...cur, cards };
-  return Response.json({ ok: lessons });
+  //   return { ...acc, [item.id]: def };
  // }, {} as LessonsType);
  // return Response.json({ ok: lessons });
  return Response.json({ ok: data });
 }
 async function handlePost(req: Request, user: number, url: URL) {
--- a/server/utils.ts
+++ b/server/utils.ts
@ -1,10 +1,10 @@
-export function wordFactorial(words: string[]): string[] {
+export function wordFactorial(words: string[]): Set<string> {
-  const combinations: string[] = [];
+  const combinations: Set<string> = new Set([]);
  for (let i = 0; i < words.length; i++) {
-    let inner = '';
+    let inner = "";
    for (let ii = i; ii < words.length; ii++) {
-      inner += (ii > i ? ' ' : '') + words[ii];
+      inner += (ii > i ? " " : "") + words[ii].toLowerCase();
-      combinations.push(inner);
+      combinations.add(inner);
    }
  }
  return combinations;