m

2024-10-23 23:54:41 +07:00 · 2024-10-23 23:54:41 +07:00 · d0bcc2a81c
commit d0bcc2a81c
parent 9bbb3b3cfa
5 changed files with 218 additions and 99 deletions
--- a/schema.sql
+++ b/schema.sql
@ -6,11 +6,12 @@ PRAGMA mmap_size = 30000000000;


 -- Words table
+-- TODO restore a separate words table?
 CREATE TABLE expressions(
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    spelling TEXT NOT NULL,
-    ipa      TEXT NOT NULL,
    language_id INTEGER NOT NULL,
+    ipa      TEXT,
    frequency INTEGER,
    type TEXT NOT NULL,
    subtype TEXT,
@ -95,6 +96,9 @@ INSERT INTO categories (name, part_of_speech_id) VALUES
 ('nominative', 5),
 ('accusative', 5),
 ('genitive', 5),
+('interrogative', 5),
+-- not really a pronoun but whatever
+('determiner', 5),
 -- adpositions
 ('preposition', 6),
 ('postposition', 6),
--- a/server/db.ts
+++ b/server/db.ts
@ -27,7 +27,7 @@ export function fetchFrequent(db: Database, count: number, page: number) {
      spelling,
      ipa,
      frequency,
-      GROUP_CONCAT(c.name, ',') AS category,
+      GROUP_CONCAT(c.name, ',') AS category
    FROM expressions e
    JOIN word_categories wc ON wc.word_id = e.id
    JOIN categories c ON c.id = wc.category_id
@ -37,37 +37,66 @@ export function fetchFrequent(db: Database, count: number, page: number) {
    `);
  return query.get({ count, offset });
 }
+
+/**
+ * Lists the expressions linked to one card through `cards_expressions`,
+ * ordered by `frequency` descending.
+ *
+ * @param db  database handle the query runs on
+ * @param cid id of the card (`cards_expressions.card_id`)
+ * @returns rows exposing `spelling`, `eid` and `ipa`
+ */
+export function fetchExpressionsByCard(db: Database, cid: number) {
+  const statement = db.query(`
+    SELECT 
+      e.spelling, e.id as eid, e.ipa
+    FROM cards_expressions ce
+    JOIN expressions e ON ce.expression_id = e.id
+    WHERE ce.card_id = $cid AND e.spelling IS NOT NULL
+    ORDER BY e.frequency DESC
+    `);
+  const bindings = { cid };
+  return statement.all(bindings);
+}
 export function fetchLessons(db: Database, count: number, page: number) {
+  // Pages are 1-based; any page below 1 is treated as the first page.
  const p = page < 1 ? 1 : page;
  const offset = (p - 1) * count;
-  const queryString = `
-    SELECT 
-      l.id, l.text as ltext, cards.text as ctext, cards.note as cnote, cards.id as cid,
-      e.spelling, e.ipa, e.frequency, e.id as eid,
-      GROUP_CONCAT(cg.name, ',') AS category
-    FROM expressions e 
-    JOIN cards_expressions ce ON e.id = ce.expression_id
-    JOIN cards ON cards.id = cl.card_id
-    JOIN cards_lessons cl ON cl.card_id = cards.id
-    JOIN lessons l ON l.id = cl.lesson_id
-    JOIN expressions e ON e.id = ce.expression_id
-    JOIN word_categories wc ON wc.word_id = e.id
-    JOIN categories cg ON cg.id = wc.category_id
-    LIMIT $count
-    OFFSET $offset
-    `;
  // const queryString = `
  //   SELECT
  //     l.id, l.text as ltext, cards.text as ctext, cards.note as cnote, cards.id as cid
  //   FROM cards_lessons cl
-  //   JOIN lessons l ON l.id = cl.lesson_id
  //   JOIN cards ON cards.id = cl.card_id
+  //   JOIN lessons l ON l.id = cl.lesson_id
  //   LIMIT $count
  //   OFFSET $offset
  //   `;
+  // One row per (lesson, card, expression), with that expression's category
+  // names joined by ", ". LIMIT/OFFSET apply to these joined rows, so `count`
+  // bounds the number of rows returned, not the number of lessons.
+  // The LEFT JOINs keep expressions that have no category.
+  const queryString = `
+    SELECT 
+        l.id AS lesson_id, 
+        l.text AS lesson_text, 
+        c.id AS card_id, 
+        c.text AS card_text, 
+        c.note AS card_note, 
+        e.id AS expression_id, 
+        e.spelling AS expression_spelling, 
+        e.ipa AS expression_ipa, 
+        e.type AS expression_type, 
+        e.subtype AS expression_subtype,
+        GROUP_CONCAT(cat.name, ', ') AS categories
+    FROM 
+        lessons l
+    JOIN 
+        cards_lessons cl ON l.id = cl.lesson_id
+    JOIN 
+        cards c ON c.id = cl.card_id
+    JOIN 
+        cards_expressions ce ON c.id = ce.card_id
+    JOIN 
+        expressions e ON e.id = ce.expression_id
+    LEFT JOIN 
+        word_categories wc ON wc.word_id = e.id
+    LEFT JOIN 
+        categories cat ON cat.id = wc.category_id
+    GROUP BY 
+        l.id, c.id, e.id
+    ORDER BY 
+        l.id ASC, c.id ASC, e.id ASC
+    LIMIT $count OFFSET $offset;
+    `;
  const query = db.query(queryString);
-  const res = query.all({ count, offset });
-  return res;
+  return query.all({ count, offset });
 }

 // SELECT l.id, l.text, cards.text, cards.note FROM cards_lessons cl LEFT JOIN lessons l ON l.id = cl.lesson_id LEFT JOIN cards ON cards.id = cl.card_id ORDER BY l.id ASC LIMIT 20 OFFSET 0;
@ -86,7 +115,6 @@ export function fetchLesson(db: Database, lesson: number) {
    JOIN categories cg ON cg.id = wc.category_id
    WHERE l.id = $lesson 
    `;
-  console.log(queryString);
  const query = db.query(queryString);
  return query.all({ lesson });
 }
@ -137,6 +165,7 @@ export function addCard(
    ))
    `);
  const wtr = db.transaction((pairs) => {
+    // console.log("adding to ce", { pairs, cid, text });
    for (const pair of pairs) wquery.run(pair);
  });
  const words = text
@ -145,7 +174,7 @@ export function addCard(
    .trim()
    .split(" ");
  const combinations = wordFactorial(words);
-  const richWords = combinations.map((spelling) => {
+  const richWords = Array.from(combinations).map((spelling) => {
    return { spelling, cid };
  });
  wtr(richWords);
@ -187,7 +216,11 @@ export function addWord(
  const res = query.run({ spelling, ipa, language, type, subtype });
  return res.lastInsertRowid;
 }
-export function addCat(db: Database, wordId: number | bigint, domain: string) {
+export function addCat(
+  db: Database,
+  wordId: number | bigint,
+  category: string,
+) {
  const queryString = `
  INSERT
  INTO word_categories(word_id, category_id)
@ -196,13 +229,23 @@ export function addCat(db: Database, wordId: number | bigint, domain: string) {
      WHERE name = $category
  ))
  `;
-  const category = domains[domain] || "unknown";
  const query = db.query(queryString);
  const res = query.run({ wordId, category });
  return res.lastInsertRowid;
 }

-const domains: Record<string, string> = {
+/**
+ * Maps the part-of-speech tags read from the words_pos dataset to the
+ * category names that `addCat` looks up in the `categories` table.
+ */
+export const poss: Record<string, string> = {
+  CC: "conjunction",
+  DT: "determiner",
+  IN: "preposition",
+  MD: "auxiliar", // NOTE(review): confirm this spelling matches a row in categories
+  PRP: "nominative", // TODO oi
+  PRP$: "genitive",
+  WDT: "determiner",
+  WP: "interrogative",
+  WP$: "interrogative",
+};
+export const domains: Record<string, string> = {
  "adj.all": "adjective",
  "adj.pert": "adjective",
  "adj.ppl": "adjective",
@ -261,5 +304,13 @@ export function addFrequency(
  `;
  const query = db.query(queryString);
  const res = query.run({ spelling, frequency });
-  console.log(res, "added frequency");
+}
+/**
+ * Sets the `ipa` column of every `expressions` row whose spelling equals
+ * `spelling`. The match is on spelling alone (no language filter), so every
+ * row sharing that spelling is updated.
+ *
+ * @param db       database handle the update runs on
+ * @param spelling spelling to match
+ * @param ipa      pronunciation to store
+ */
+export function addIPA(db: Database, spelling: string, ipa: string) {
+  const queryString = `
+  UPDATE expressions
+  SET ipa= $ipa
+  WHERE expressions.spelling = $spelling
+  `;
+  const query = db.query(queryString);
+  // The run result is not used; nothing is returned to the caller.
+  query.run({ spelling, ipa });
 }
--- a/server/seeding.ts
+++ b/server/seeding.ts
@ -1,5 +1,13 @@
 import { Database } from "bun:sqlite";
-import { addCard, addCat, addFrequency, addLesson, addWord } from "./db";
+import {
+  addCard,
+  addCat,
+  addFrequency,
+  addLesson,
+  addWord,
+  domains,
+  poss,
+} from "./db";

 // const db = new Database('../db/data.db');
 const db = new Database("../db/data.db", { strict: true });
@ -8,31 +16,11 @@ db.exec("PRAGMA journal_mode = WAL;");

 const SYMBOL_REGEX = new RegExp(/[\W\d]/);

-// async function englishIPA() {
-//   const file = Bun.file('ipa/en-us/ipadict.txt');
-//   const s = file.stream();
-//   const reader = s.getReader();
-//   const decoder = new TextDecoder();
-//   let leftover = '';
-//   while (true) {
-//     const { value, done } = await reader.read();
-//     if (done) break;
-//     const chunk = decoder.decode(value, { stream: true });
-//     const lines = (leftover + chunk).split('\n');
-
-//     // Process each line except the last (which might be incomplete)
-//     for (const line of lines.slice(0, -1)) saveLine(line);
-
-//     // Save the last incomplete line to process in the next iteration
-//     leftover = lines[lines.length - 1];
-//   }
-
-//   // Handle any remaining content after reading all chunks
-//   if (leftover) saveLine(leftover);
-// }
-
-async function englishFreq() {
-  const file = Bun.file("../datasets/unigram_freq.csv");
+async function handleFile(
+  filename: string,
+  func: (line: string, idx: number) => void,
+) {
+  const file = Bun.file(filename);
  const s = file.stream();
  const reader = s.getReader();
  const decoder = new TextDecoder();
@ -47,8 +35,7 @@ async function englishFreq() {
    // Process each line except the last (which might be incomplete)
    for (const line of lines.slice(0, -1)) {
      lineCount++;
-      const [spelling, _frequency] = line.split(",");
-      addFrequency(db, spelling, lineCount);
+      func(line, lineCount);
    }

    // Save the last incomplete line to process in the next iteration
@ -56,48 +43,115 @@ async function englishFreq() {
  }

  // Handle any remaining content after reading all chunks
-  if (leftover) addFrequency(db, leftover, lineCount + 1);
+  if (leftover) func(leftover, lineCount + 1);
+}
+
+/** Tells whether `pos` is one of the part-of-speech tags accepted here. */
+function goodPos(pos: string): boolean {
+  const accepted = new Set<string>([
+    "CC",
+    "DT",
+    "EX",
+    "IN",
+    "LS",
+    "MD",
+    "PDT",
+    "POS",
+    "PRP",
+    "PRP$",
+    "RP",
+    "TO",
+    "WDT",
+    "WP",
+    "WP$",
+  ]);
+  return accepted.has(pos);
+}
+/**
+ * Seeds expressions from ../datasets/words_pos.csv. Each line is split on
+ * commas; the second field is used as the spelling and the third as its tag.
+ * Lines whose tag fails `goodPos` are skipped; the rest are inserted as
+ * "en-us" words with an empty IPA and linked to the category mapped in `poss`.
+ *
+ * @returns a promise that settles once the whole file has been processed
+ */
+async function englishKaggle() {
+  // Await so that failures surface through this function's promise.
+  await handleFile("../datasets/words_pos.csv", (line) => {
+    const [_, spelling, pos] = line.split(",");
+    if (!goodPos(pos)) return;
+    const rowid = addWord(db, spelling, "", "en-us", "word", null);
+    // Tags accepted by goodPos but absent from poss fall back to "unknown".
+    const category = poss[pos] || "unknown";
+    addCat(db, rowid, category);
+  });
+}
+/**
+ * Seeds expressions from ipa/en-us/ipadict.txt. Each line is split on
+ * whitespace; the first token is the spelling and the second its IPA.
+ * Lines missing either token, or whose spelling matches SYMBOL_REGEX, are
+ * skipped.
+ *
+ * @returns a promise that settles once the whole file has been processed
+ */
+async function englishIPA() {
+  // Await so the returned promise only resolves after every line is handled.
+  await handleFile("ipa/en-us/ipadict.txt", (line) => {
+    const [spelling, ipa] = line.split(/\s+/);
+    if (!spelling || !ipa) return;
+    const hasSymbols = spelling.match(SYMBOL_REGEX);
+    if (hasSymbols) return;
+    // NOTE(review): spelling comes from a whitespace split, so it never
+    // contains a space and this always yields "word".
+    const split = spelling.split(" ");
+    const type = split.length > 1 ? "expression" : "word";
+    const subtype = null;
+    addWord(db, spelling, ipa, "en-us", type, subtype);
+  });
+}
+
+/**
+ * Assigns a frequency to each spelling listed in
+ * ../datasets/unigram_freq.csv. The value stored is the line counter that
+ * handleFile passes to the callback, not the second comma-separated field
+ * of the line.
+ *
+ * @returns a promise that settles once the whole file has been processed
+ */
+async function englishFreq() {
+  // Await so the returned promise only resolves after every line is handled.
+  await handleFile("../datasets/unigram_freq.csv", (line, idx) => {
+    const [spelling] = line.split(",");
+    addFrequency(db, spelling, idx);
+  });
 }
 // TODO no conjunctions or adpositions in Wordnet!!
-function englishIPA() {
+/**
+ * Seeds expressions from the WordNet database (`wndb`).
+ *
+ * The query keeps one pronunciation per word, preferring variety 'US', then
+ * a NULL variety, then 'GB', then anything else. Words matching SYMBOL_REGEX
+ * are skipped (see filterWord). Each remaining word is inserted as "en-us"
+ * and linked to the category mapped from its domain in `domains`, or
+ * "unknown" when there is no mapping.
+ *
+ * NOTE(review): the LEFT JOINs mean `ipa` and `domainname` can come back
+ * NULL even though the row type below declares them as string — confirm
+ * addWord accepts a null ipa.
+ */
+function englishWordnet() {
+  // LEFT JOIN lexes_pronunciations ukpr ON ukpr.wordid = words.wordid AND ukpr.variety = 'GB'
+  // LEFT JOIN pronunciations ukp ON ukp.pronunciationid = ukpr.pronunciationid
  const queryString = `
-    SELECT words.wordid, word, pronunciation, domainname FROM words
-    JOIN lexes_pronunciations lp ON lp.wordid = words.wordid 
-    JOIN pronunciations pr ON pr.pronunciationid = lp.pronunciationid 
-    JOIN senses ON senses.wordid = words.wordid
-    JOIN synsets ON synsets.synsetid = senses.synsetid
-    JOIN domains ON domains.domainid = synsets.domainid
+    WITH ranked_ipa  AS (
+      SELECT
+        lp.wordid,
+        pr.pronunciation,
+        lp.variety,
+        ROW_NUMBER() OVER (
+          PARTITION BY lp.wordid
+          ORDER BY
+              CASE
+                  WHEN lp.variety = 'US' THEN 1
+                  WHEN lp.variety IS NULL THEN 2
+                  WHEN lp.variety = 'GB' THEN 3
+                  ELSE 4
+              END
+        ) AS rank
+        FROM lexes_pronunciations lp
+        JOIN pronunciations pr ON pr.pronunciationid = lp.pronunciationid
+    )
+    SELECT words.wordid, word, rp.pronunciation as ipa, domainname 
+    FROM words
+    LEFT JOIN ranked_ipa rp ON rp.wordid = words.wordid AND rp.rank = 1
+    LEFT JOIN senses ON senses.wordid = words.wordid
+    LEFT JOIN synsets ON synsets.synsetid = senses.synsetid
+    LEFT JOIN domains ON domains.domainid = synsets.domainid
    GROUP BY words.wordid
  `;
  const query = wndb.query(queryString);
  const res: Array<{
    word: string;
-    pronunciation: string;
+    ipa: string;
    domainname: string;
  }> = query.all() as any;
+  console.log("res", res.length);
  for (const r of res) {
-    console.log("adding word", r);
+    console.log(r, "r");
    // if (r.word === 'abrasive') throw new Error('stop right here');
+    const ok = filterWord(r.word);
+    if (!ok) continue;
    const split = r.word.split(" ");
    const type = split.length > 1 ? "expression" : "word";
    const subtype = null;
-    const wordid = addWord(db, r.word, r.pronunciation, "en-us", type, subtype);
-    addCat(db, wordid, r.domainname);
+    const wordid = addWord(db, r.word, r.ipa, "en-us", type, subtype);
+    // Domains without a mapping fall back to the "unknown" category.
+    const category = domains[r.domainname] || "unknown";
+    addCat(db, wordid, category);
  }
 }
-// function saveLine(line: string) {
-//   const [spelling, ipa] = line.split(/\s+/);
-//   if (!spelling || !ipa) return;
-//   const hasSymbols = spelling.match(SYMBOL_REGEX);
-//   if (hasSymbols) return;
-//   const isWord = checkWordNet(spelling);
-//   console.log(spelling, isWord);
-//   if (!isWord) return;
-//   const split = spelling.split(' ');
-//   const type = split.length > 1 ? 'expression' : 'word';
-//   const subtype = null;
-//   addWord(db, spelling, ipa, 'en-us', type, subtype);
-// }
+/** Returns true when `s` contains nothing matched by SYMBOL_REGEX. */
+function filterWord(s: string) {
+  return s.match(SYMBOL_REGEX) === null;
+}

 // function checkWordNet(word: string) {
 //   const query = wndb.query(`SELECT * FROM words WHERE word = $word`);
@ -137,6 +191,7 @@ function englishCards() {
    addCard(db, lesson_id, text);
  }
 }
-// englishIPA();
+// englishWordnet();
 // englishFreq();
-englishCards();
+// englishCards();
+englishKaggle();
--- a/server/server.ts
+++ b/server/server.ts
@ -2,6 +2,7 @@ import { Database } from "bun:sqlite";
 import {
  addUser,
  fetchCard,
+  fetchExpressionsByCard,
  fetchLesson,
  fetchLessons,
  fetchResource,
@ -58,7 +59,12 @@ type LessonsType = Record<
  {
    id: number;
    text: string;
-    cards: Array<{ text: string; note: string | null; id: number }>;
+    cards: Array<{
+      text: string;
+      note: string | null;
+      id: number;
+      words: Array<{ spelling: string; ipa: string; category: string }>;
+    }>;
  }
 >;
 type LessonsDBType = {
@ -85,17 +91,20 @@ function handleGetLessons(user: number, url: URL) {
  const page = params.get("page") || "0";
  const data: LessonsDBType[] = fetchLessons(db, 20, Number(page)) as any;
  console.log(data, "fetchlessons");
-  const lessons = data.reduce((acc: LessonsType, item: LessonsDBType) => {
-    let cur = acc[item.id] || { id: item.id, text: item.ltext, cards: [] };
-    const cards = [
-      ...cur.cards,
-      { text: item.ctext, note: item.cnote, id: item.cid },
-    ];
-    const def = { ...cur, cards };
-    return { ...acc, [item.id]: def };
-  }, {} as LessonsType);
-  console.log(lessons, "lesons");
-  return Response.json({ ok: lessons });
+  console.log(data.length);
+  // const lessons = data.reduce((acc: LessonsType, item: LessonsDBType) => {
+  //   let cur = acc[item.id] || { id: item.id, text: item.ltext, cards: [] };
+  //   const words = fetchExpressionsByCard(db, item.cid) as any[];
+  //   console.log(words, item.cid);
+  //   const cards = [
+  //     ...cur.cards,
+  //     { text: item.ctext, note: item.cnote, id: item.cid, words },
+  //   ];
+  //   const def = { ...cur, cards };
+  //   return { ...acc, [item.id]: def };
+  // }, {} as LessonsType);
+  // return Response.json({ ok: lessons });
+  return Response.json({ ok: data });
 }

 async function handlePost(req: Request, user: number, url: URL) {
--- a/server/utils.ts
+++ b/server/utils.ts
@ -1,10 +1,10 @@
-export function wordFactorial(words: string[]): string[] {
-  const combinations: string[] = [];
+export function wordFactorial(words: string[]): Set<string> {
+  const combinations: Set<string> = new Set([]);
  for (let i = 0; i < words.length; i++) {
-    let inner = '';
+    let inner = "";
    for (let ii = i; ii < words.length; ii++) {
-      inner += (ii > i ? ' ' : '') + words[ii];
-      combinations.push(inner);
+      inner += (ii > i ? " " : "") + words[ii].toLowerCase();
+      combinations.add(inner);
    }
  }
  return combinations;