diff options
| author | polwex <polwex@sortug.com> | 2025-11-23 01:12:53 +0700 |
|---|---|---|
| committer | polwex <polwex@sortug.com> | 2025-11-23 01:12:53 +0700 |
| commit | cb1b56f5a0eddbf77446f415f2beda57c8305f85 (patch) | |
| tree | d333ca5c143063af8ee1b2f9e2d1d25f8ef2007c /packages/prosody-ui/src/logic | |
wut
Diffstat (limited to 'packages/prosody-ui/src/logic')
| -rw-r--r-- | packages/prosody-ui/src/logic/iso6393to1.ts | 186 | ||||
| -rw-r--r-- | packages/prosody-ui/src/logic/stanza.ts | 86 | ||||
| -rw-r--r-- | packages/prosody-ui/src/logic/types.ts | 48 | ||||
| -rw-r--r-- | packages/prosody-ui/src/logic/utils.ts | 66 | ||||
| -rw-r--r-- | packages/prosody-ui/src/logic/wiki.ts | 138 |
5 files changed, 524 insertions, 0 deletions
diff --git a/packages/prosody-ui/src/logic/iso6393to1.ts b/packages/prosody-ui/src/logic/iso6393to1.ts
new file mode 100644
index 0000000..4c4deed
--- /dev/null
+++ b/packages/prosody-ui/src/logic/iso6393to1.ts
@@ -0,0 +1,191 @@
+/**
+ * Maps three-letter language codes (keys) to their two-letter equivalents
+ * (values), e.g. `eng` -> `en`. Indexing with a code that is not listed
+ * yields `undefined` at runtime even though the value type is `string`.
+ */
+export const iso6393To1: Record<string, string> = {
+  aar: "aa",
+  abk: "ab",
+  afr: "af",
+  aka: "ak",
+  amh: "am",
+  ara: "ar",
+  arg: "an",
+  asm: "as",
+  ava: "av",
+  ave: "ae",
+  aym: "ay",
+  aze: "az",
+  bak: "ba",
+  bam: "bm",
+  bel: "be",
+  ben: "bn",
+  bis: "bi",
+  bod: "bo",
+  bos: "bs",
+  bre: "br",
+  bul: "bg",
+  cat: "ca",
+  ces: "cs",
+  cha: "ch",
+  che: "ce",
+  chu: "cu",
+  chv: "cv",
+  cor: "kw",
+  cos: "co",
+  cre: "cr",
+  cym: "cy",
+  dan: "da",
+  deu: "de",
+  div: "dv",
+  dzo: "dz",
+  ell: "el",
+  eng: "en",
+  epo: "eo",
+  est: "et",
+  eus: "eu",
+  ewe: "ee",
+  fao: "fo",
+  fas: "fa",
+  fij: "fj",
+  fin: "fi",
+  fra: "fr",
+  fry: "fy",
+  ful: "ff",
+  gla: "gd",
+  gle: "ga",
+  glg: "gl",
+  glv: "gv",
+  grn: "gn",
+  guj: "gu",
+  hat: "ht",
+  hau: "ha",
+  hbs: "sh",
+  heb: "he",
+  her: "hz",
+  hin: "hi",
+  hmo: "ho",
+  hrv: "hr",
+  hun: "hu",
+  hye: "hy",
+  ibo: "ig",
+  ido: "io",
+  iii: "ii",
+  iku: "iu",
+  ile: "ie",
+  ina: "ia",
+  ind: "id",
+  ipk: "ik",
+  isl: "is",
+  ita: "it",
+  jav: "jv",
+  jpn: "ja",
+  kal: "kl",
+  kan: "kn",
+  kas: "ks",
+  kat: "ka",
+  kau: "kr",
+  kaz: "kk",
+  khm: "km",
+  kik: "ki",
+  kin: "rw",
+  kir: "ky",
+  kom: "kv",
+  kon: "kg",
+  kor: "ko",
+  kua: "kj",
+  kur: "ku",
+  lao: "lo",
+  lat: "la",
+  lav: "lv",
+  lim: "li",
+  lin: "ln",
+  lit: "lt",
+  ltz: "lb",
+  lub: "lu",
+  lug: "lg",
+  mah: "mh",
+  mal: "ml",
+  mar: "mr",
+  mkd: "mk",
+  mlg: "mg",
+  mlt: "mt",
+  mon: "mn",
+  mri: "mi",
+  msa: "ms",
+  mya: "my",
+  nau: "na",
+  nav: "nv",
+  nbl: "nr",
+  nde: "nd",
+  ndo: "ng",
+  nep: "ne",
+  nld: "nl",
+  nno: "nn",
+  nob: "nb",
+  nor: "no",
+  nya: "ny",
+  oci: "oc",
+  oji: "oj",
+  ori: "or",
+  orm: "om",
+  oss: "os",
+  pan: "pa",
+  pli: "pi",
+  pol: "pl",
+  por: "pt",
+  pus: "ps",
+  que: "qu",
+  roh: "rm",
+  ron: "ro",
+  run: "rn",
+  rus: "ru",
+  sag: "sg",
+  san: "sa",
+  sin: "si",
+  slk: "sk",
+  slv: "sl",
+  sme: "se",
+  smo: "sm",
+  sna: "sn",
+  snd: "sd",
+  som: "so",
+  sot: "st",
+  spa: "es",
+  sqi: "sq",
+  srd: "sc",
+  srp: "sr",
+  ssw: "ss",
+  sun: "su",
+  swa: "sw",
+  swe: "sv",
+  tah: "ty",
+  tam: "ta",
+  tat: "tt",
+  tel: "te",
+  tgk: "tg",
+  tgl: "tl",
+  tha: "th",
+  tir: "ti",
+  ton: "to",
+  tsn: "tn",
+  tso: "ts",
+  tuk: "tk",
+  tur: "tr",
+  twi: "tw",
+  uig: "ug",
+  ukr: "uk",
+  urd: "ur",
+  uzb: "uz",
+  ven: "ve",
+  vie: "vi",
+  vol: "vo",
+  wln: "wa",
+  wol: "wo",
+  xho: "xh",
+  yid: "yi",
+  yor: "yo",
+  zha: "za",
+  zho: "zh",
+  zul: "zu",
+};
diff --git a/packages/prosody-ui/src/logic/stanza.ts b/packages/prosody-ui/src/logic/stanza.ts
new file mode 100644
index 0000000..9e59450
--- /dev/null
+++ b/packages/prosody-ui/src/logic/stanza.ts
@@ -0,0 +1,96 @@
+import type { AsyncRes, Result } from "sortug";
+
+/** Base URL of the service this module posts to. */
+const ENDPOINT = "http://localhost:8102";
+
+/**
+ * POSTs `payload` as JSON to `path` under ENDPOINT and wraps the decoded
+ * JSON reply in `{ ok }`. Network failures, non-2xx statuses and bodies
+ * that are not valid JSON are reported as `{ error }` instead of throwing.
+ */
+async function postJSON(path: string, payload: Record<string, string>) {
+  try {
+    const res = await fetch(ENDPOINT + path, {
+      headers: { "Content-type": "application/json" },
+      method: "POST",
+      body: JSON.stringify(payload),
+    });
+    // fetch() only rejects on network errors, so HTTP error statuses need
+    // an explicit check; otherwise an error body would come back as `ok`.
+    if (!res.ok) return { error: `HTTP ${res.status} ${res.statusText}` };
+    const j = await res.json();
+    return { ok: j };
+  } catch (e) {
+    return { error: `${e}` };
+  }
+}
+
+/**
+ * Sends `text` and `lang` to the `/segment` endpoint.
+ * @returns `{ ok }` holding the decoded JSON reply, or `{ error }` with a message.
+ */
+export async function segmenter(text: string, lang: string) {
+  return postJSON("/segment", { lang, string: text });
+}
+
+/**
+ * Sends `text` to the `/detect-lang` endpoint.
+ * @returns `{ ok }` holding the decoded JSON reply, or `{ error }` with a message.
+ */
+export async function idLang(text: string) {
+  return postJSON("/detect-lang", { string: text });
+}
+
+export type Sentence = {
+  text: string;
+  sentiment: number;
+  constituency: string;
+  dependencies: Dependency[];
+  entities: Entity[];
+  tokens: Token[];
+  words: Word[];
+};
+// NOTE(review): Dependency is already an array of triples, so
+// `Sentence.dependencies` is an array of arrays -- confirm against the API.
+export type Dependency = Array<[Word, string, Word]>;
+export type Word = {
+  id: number;
+  text: string;
+  lemma: string;
+  upos: string;
+  xpos: string;
+  feats: string;
+  head: number;
+  deprel: string;
+  start_char: number;
+  end_char: number;
+};
+export type Token = {
+  id: [number, number];
+  text: string;
+  misc: string;
+  words: Word[];
+  start_char: number;
+  end_char: number;
+  ner: string;
+};
+export type Entity = {
+  text: string;
+  misc: string;
+  start_char: number;
+  end_char: number;
+  type: string;
+};
+// "amod",
+// {
+//   "id": 1,
+//   "text": "Stony",
+//   "lemma": "Stony",
+//   "upos": "ADJ",
+//   "xpos": "NNP",
+//   "feats": "Degree=Pos",
+//   "head": 3,
+//   "deprel": "amod",
+//   "start_char": 0,
+//   "end_char": 5
+// }
diff --git a/packages/prosody-ui/src/logic/types.ts b/packages/prosody-ui/src/logic/types.ts
new file mode 100644
index 0000000..ac308cf
--- /dev/null
+++ b/packages/prosody-ui/src/logic/types.ts
@@ -0,0 +1,48 @@
+export type Cookie = {
+  domain: string;
+  path: string;
+  hostOnly: boolean;
+  httpOnly: boolean;
+  secure: boolean;
+  session: boolean;
+  sameSite: SameSite;
+  storeId: null;
+  name: string;
+  value: string;
+};
+export type CookiesMap = Record<string, CookieMap>;
+export type CookieMap = Record<string, Cookie>;
+export type KeyMap = Record<string, string>;
+type SameSite = null | "no_restriction"; // TODO
+
+export type APIRes = { API: { app: string; api_key: string } };
+export type CookieRes = { Cookie: { app: string; cookie: CookieMap } };
+export type CookiesRes = { cookies: CookiesMap; apiKeys: KeyMap };
+
+// words
+export type Meaning = {
+  pos: string; // part of speech;
+  meaning: string[];
+  etymology: string;
+  references?: any;
+};
+
+export type Prompts = {
+  translate: string;
+};
+export type AnalyzeRes = {
+  word: string;
+  syllables: string[];
+  ipa: string;
+  pos: POS;
+};
+type PosTuple = [string, POS];
+type POS = string;
+
+export type WordData = {
+  spelling: string;
+  lang: string;
+  ipa: string;
+  meanings: Meaning[];
+  references?: any;
+};
diff
--git a/packages/prosody-ui/src/logic/utils.ts b/packages/prosody-ui/src/logic/utils.ts
new file mode 100644
index 0000000..737a6ec
--- /dev/null
+++ b/packages/prosody-ui/src/logic/utils.ts
@@ -0,0 +1,81 @@
+import type { Result } from "sortug";
+
+/**
+ * Guesses the dominant writing system of `text` by counting, for each known
+ * script, how many characters fall inside that script's Unicode ranges.
+ *
+ * @param text Text to inspect.
+ * @returns `{ ok: name }` for the script with the highest count (the earliest
+ *   table entry wins ties), or `{ error: "Not detected" }` when nothing matches.
+ */
+export function detectScript(text: string): Result<string> {
+  const scripts = {
+    Latin: /[\u0000-\u007F\u00A0-\u00FF\u0100-\u017F\u0180-\u024F]/g,
+    Cyrillic: /[\u0400-\u04FF\u0500-\u052F\u2DE0-\u2DFF\uA640-\uA69F]/g,
+    Greek: /[\u0370-\u03FF\u1F00-\u1FFF]/g,
+    Hebrew: /[\u0590-\u05FF]/g,
+    Arabic: /[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF]/g,
+    Devanagari: /[\u0900-\u097F]/g, // Hindi, Sanskrit, etc.
+    Bengali: /[\u0980-\u09FF]/g,
+    Thai: /[\u0E00-\u0E7F]/g,
+    // Code points above U+FFFF need the `u` flag and `\u{...}` escapes. Without
+    // them `\u20000-\u2A6DF` parsed as U+2000, the range "0"-U+2A6D, "F", which
+    // made this entry match Latin, Cyrillic, Armenian, Georgian, etc. as well.
+    Chinese:
+      /[\u4E00-\u9FFF\u3400-\u4DBF\u{20000}-\u{2A6DF}\u{2A700}-\u{2B73F}\u{2B740}-\u{2B81F}]/gu,
+    Japanese: /[\u3040-\u309F\u30A0-\u30FF\uFF00-\uFFEF\u4E00-\u9FAF]/g, // Includes Hiragana, Katakana
+    Korean: /[\uAC00-\uD7AF\u1100-\u11FF\u3130-\u318F]/g, // Includes Hangul
+    Armenian: /[\u0530-\u058F]/g,
+    Georgian: /[\u10A0-\u10FF]/g,
+    Khmer: /[\u1780-\u17FF]/g, // Cambodian
+    Myanmar: /[\u1000-\u109F]/g, // Burmese
+    Tamil: /[\u0B80-\u0BFF]/g,
+    Telugu: /[\u0C00-\u0C7F]/g,
+    Amharic: /[\u1200-\u137F]/g, // Ethiopian
+  };
+
+  let maxCount = 0;
+  let dominantScript = "Unknown";
+  for (const [scriptName, regex] of Object.entries(scripts)) {
+    // A global regex makes match() return every hit (or null when none).
+    const count = text.match(regex)?.length ?? 0;
+    // Strict `>` keeps the first script on ties and ignores zero counts.
+    if (count > maxCount) {
+      maxCount = count;
+      dominantScript = scriptName;
+    }
+  }
+  if (dominantScript === "Unknown") return { error: "Not detected" };
+  return { ok: dominantScript };
+}
+
+/**
+ * Maps a script name produced by `detectScript` to a language code, for the
+ * scripts treated here as identifying a single language.
+ * @returns `{ ok: code }`, or `{ error: "too generic" }` for any other script.
+ */
+export function langFromScript(script: string): Result<string> {
+  if (script === "Thai") return { ok: "th" };
+  if (script === "Japanese") return { ok: "ja" };
+  if (script === "Chinese") return { ok: "zh" };
+  if (script === "Korean") return { ok: "ko" };
+  return { error: "too generic" };
+}
+
+/**
+ * Returns a script label for a language code, falling back to `detectScript`
+ * on `text` for languages without a fixed entry.
+ * @returns The label, or `""` when the language is unlisted and detection fails.
+ */
+export function scriptFromLang(lang: string, text: string): string {
+  if (lang === "th") return "Thai";
+  if (lang === "tha") return "Thai";
+  if (lang === "en") return "Engl";
+  if (lang === "es") return "Span";
+  if (lang === "cn") return "Hant";
+  if (lang === "zh") return "Hant";
+  if (lang === "ja") return "Japn";
+  const res = detectScript(text);
+  if ("ok" in res) return res.ok;
+  return "";
+}
diff --git a/packages/prosody-ui/src/logic/wiki.ts b/packages/prosody-ui/src/logic/wiki.ts
new file mode 100644
index 0000000..1325c0f
--- /dev/null
+++ b/packages/prosody-ui/src/logic/wiki.ts
@@ -0,0 +1,173 @@
+import type { AsyncRes, Result } from "sortug";
+import type { Meaning } from "./types";
+
+/**
+ * Builds the en.wiktionary.org `action=parse` API URL for `word`, requesting
+ * the page's templates and rendered text as JSON (`formatversion=2`).
+ */
+export function buildWiktionaryURL(word: string) {
+  const params = new URLSearchParams();
+  params.append("action", "parse");
+  params.append("page", word);
+  params.append("format", "json");
+  params.append("prop", "templates|text");
+  params.append("formatversion", "2");
+
+  const p = params.toString();
+  const url = `https://en.wiktionary.org/w/api.php?${p}`;
+  return url;
+}
+
+// export async function fetchWordInWiki(url: string) {
+//   const opts = { method: "GET", body: null, headers: {} };
+//   try {
+//     const res = await proxyCall(url, opts);
+//     console.log(res.headers.get("content-type"));
+//     const j = await res.json();
+//     return { ok: j };
+//   } catch (e) {
+//     return { error: `${e}` };
+//   }
+// }
+
+export type WikiRes = {
+  url: string;
+  meanings: Meaning[];
+  ipa: string[];
+};
+
+/** Lower-cased heading texts that open a block of definitions. */
+const poses = [
+  "noun",
+  "verb",
+  "adjective",
+  "adverb",
+  "conjunction",
+  "determiner",
+  "preposition",
+  "definitions",
+];
+
+/**
+ * Extracts meanings and IPA strings from the HTML of a Wiktionary page.
+ *
+ * Headings (`.mw-heading`) are scanned in document order: an "etymology"
+ * heading stores the etymology for the next entry, and a heading listed in
+ * `poses` completes an entry with the definitions that follow it. Scanning
+ * stops at the first "references" heading so only one language is read.
+ *
+ * @param html Rendered page HTML.
+ * @param url Source URL, copied into the result unchanged.
+ * @returns `{ ok: WikiRes }`, or `{ error }` if parsing throws.
+ */
+export function parseWiktionary(html: string, url: string): Result<WikiRes> {
+  try {
+    const doc = new DOMParser().parseFromString(html, "text/html");
+    const ms: Meaning[] = [];
+    let currentRound: Meaning = { pos: "", meaning: [], etymology: "" };
+    for (const h of Array.from(doc.querySelectorAll(".mw-heading"))) {
+      // A heading without a first child is skipped instead of throwing and
+      // discarding everything parsed so far.
+      const headingType = (h.firstChild as HTMLElement | null)?.innerText;
+      if (!headingType) continue;
+      const ht = headingType.toLowerCase();
+      if (ht.includes("etymology")) currentRound.etymology = fillEtym(h);
+      else if (poses.includes(ht)) {
+        currentRound.pos = ht;
+        currentRound = fillMeaning(h, currentRound);
+      }
+      if (currentRound.pos) {
+        ms.push({ ...currentRound });
+        currentRound = { pos: "", meaning: [], etymology: "" };
+      }
+      if (ht === "references") break; // stop here so only one language section is read
+    }
+    const ipaStrings = Array.from(
+      doc.querySelectorAll<HTMLElement>(".IPA"),
+      (el) => el.innerText,
+    );
+    return { ok: { meanings: ms, ipa: ipaStrings, url } };
+  } catch (e) {
+    return { error: `${e}` };
+  }
+}
+
+/**
+ * Walks the siblings after `el`, appending to `m.meaning` the inner HTML of
+ * each `<li>` (not marked `empty-elt`) of every `<ol>` met, and stops as soon
+ * as `m.meaning` is non-empty or the siblings run out. Mutates and returns `m`.
+ */
+function fillMeaning(el: Element, m: Meaning) {
+  for (let sib = el.nextElementSibling; sib; sib = sib.nextElementSibling) {
+    if (sib.tagName.toLowerCase() === "ol") {
+      for (const li of Array.from(sib.children)) {
+        if (li.tagName.toLowerCase() !== "li") continue;
+        if (li.className.includes("empty-elt")) continue;
+        m.meaning.push(li.innerHTML);
+      }
+    }
+    if (m.meaning.length > 0) break;
+  }
+  return m;
+}
+
+/**
+ * With the default empty `acc`, returns a newline plus the inner HTML of the
+ * first `<p>` sibling after `el`, or `""` when there is no such sibling.
+ */
+function fillEtym(el: Element, acc: string = ""): string {
+  for (let sib = el.nextElementSibling; sib; sib = sib.nextElementSibling) {
+    if (sib.tagName.toLowerCase() === "p") acc += `\n${sib.innerHTML}`;
+    if (acc) break;
+  }
+  return acc;
+}
+
+/**
+ * Variant of `parseWiktionary` that starts from every `<ol>` in the page and
+ * looks backwards for the heading naming its part of speech.
+ * @returns `{ ok: WikiRes }` with one meaning per `<ol>`, or `{ error }` if
+ *   parsing throws. `etymology` is always `""` in this variant.
+ */
+export function parseWiktionaryo(html: string, url: string): Result<WikiRes> {
+  try {
+    const doc = new DOMParser().parseFromString(html, "text/html");
+    const ms = Array.from(doc.querySelectorAll("ol"), (el) => {
+      const posr = findPos(el);
+      const pos = "ok" in posr ? posr.ok : "";
+      const meaning: string[] = [];
+      for (const li of Array.from(el.children)) {
+        if (li.tagName !== "LI") continue;
+        meaning.push((li as HTMLElement).innerText);
+      }
+      return { pos, meaning, etymology: "" };
+    });
+    const ipaStrings = Array.from(
+      doc.querySelectorAll<HTMLElement>(".IPA"),
+      (el) => el.innerText,
+    );
+    return { ok: { meanings: ms, ipa: ipaStrings, url } };
+  } catch (e) {
+    return { error: `${e}` };
+  }
+}
+
+/**
+ * Walks backwards through the siblings of `el` to the nearest `.mw-heading`
+ * containing an `<h4>` or `<h3>` and returns that heading's text.
+ * @returns `{ ok: text }`, or `{ error: "no pichai" }` when none is found.
+ */
+function findPos(el: Element): Result<string> {
+  for (
+    let prev = el.previousElementSibling;
+    prev;
+    prev = prev.previousElementSibling
+  ) {
+    if (!prev.classList.contains("mw-heading")) continue;
+    const h4 = prev.querySelector("h4");
+    const h3 = prev.querySelector("h3");
+    if (!h4 && !h3) continue;
+    // `|| ""` keeps the result a string even if the heading has no text.
+    return { ok: h4?.innerText || h3?.innerText || "" };
+  }
+  return { error: "no pichai" };
+}
|
