1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
|
import type { Result } from "@sortug/lib";
/**
 * Character classes used to count how many code points of `text` belong to
 * each supported script.
 *
 * Declaration order matters: `detectScript` breaks ties in favour of the
 * entry that appears first. For example, Han ideographs in U+4E00–U+9FAF are
 * counted for both `Chinese` and `Japanese`, so Han-only text resolves to
 * `Chinese`.
 *
 * Every pattern carries the `u` flag so that `\u{…}` escapes address
 * supplementary-plane code points. Without it, `\u20000-\u2A6DF` is parsed as
 * `\u2000`, the range `0`–`\u2A6D`, and a literal `F`, which makes the class
 * match almost everything between U+0030 and U+2B81.
 *
 * NOTE(review): the Latin class includes all of U+0000–U+007F, so ASCII
 * digits, whitespace and punctuation are counted as Latin. Kept as-is because
 * callers may rely on it — confirm whether that is intended.
 */
const SCRIPT_PATTERNS: Readonly<Record<string, RegExp>> = {
  Latin: /[\u0000-\u007F\u00A0-\u00FF\u0100-\u017F\u0180-\u024F]/gu,
  Cyrillic: /[\u0400-\u04FF\u0500-\u052F\u2DE0-\u2DFF\uA640-\uA69F]/gu,
  Greek: /[\u0370-\u03FF\u1F00-\u1FFF]/gu,
  Hebrew: /[\u0590-\u05FF]/gu,
  Arabic: /[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF]/gu,
  Devanagari: /[\u0900-\u097F]/gu, // Hindi, Sanskrit, etc.
  Bengali: /[\u0980-\u09FF]/gu,
  Thai: /[\u0E00-\u0E7F]/gu,
  Chinese:
    /[\u4E00-\u9FFF\u3400-\u4DBF\u{20000}-\u{2A6DF}\u{2A700}-\u{2B73F}\u{2B740}-\u{2B81F}]/gu,
  Japanese: /[\u3040-\u309F\u30A0-\u30FF\uFF00-\uFFEF\u4E00-\u9FAF]/gu, // Includes Hiragana, Katakana
  Korean: /[\uAC00-\uD7AF\u1100-\u11FF\u3130-\u318F]/gu, // Includes Hangul
  Armenian: /[\u0530-\u058F]/gu,
  Georgian: /[\u10A0-\u10FF]/gu,
  Khmer: /[\u1780-\u17FF]/gu, // Cambodian
  Myanmar: /[\u1000-\u109F]/gu, // Burmese
  Tamil: /[\u0B80-\u0BFF]/gu,
  Telugu: /[\u0C00-\u0C7F]/gu,
  Amharic: /[\u1200-\u137F]/gu, // Ethiopian
};

/**
 * Determines which supported script accounts for the most characters in
 * `text`.
 *
 * @param text - The string to inspect.
 * @returns `{ ok: name }` with the name of the script that matched the most
 *   characters (the earliest-declared script wins ties), or
 *   `{ error: "Not detected" }` when no pattern matched anything, e.g. for an
 *   empty string.
 */
export function detectScript(text: string): Result<string> {
  let dominantScript: string | undefined;
  let maxCount = 0;

  for (const [scriptName, pattern] of Object.entries(SCRIPT_PATTERNS)) {
    // `match` with a global regex returns every match, or null when none.
    const matches = text.match(pattern);
    const count = matches === null ? 0 : matches.length;
    // Strict `>` keeps the first script that reached the current maximum.
    if (count > maxCount) {
      maxCount = count;
      dominantScript = scriptName;
    }
  }

  if (dominantScript === undefined) return { error: "Not detected" };
  return { ok: dominantScript };
}
/**
 * Maps a script name to a language code for the scripts that this module
 * treats as identifying a single language.
 *
 * @param script - A script name such as those produced by `detectScript`.
 * @returns `{ ok: code }` for "Thai", "Japanese", "Chinese" and "Korean";
 *   `{ error: "too generic" }` for every other value.
 */
export function langFromScript(script: string): Result<string> {
  switch (script) {
    case "Thai":
      return { ok: "th" };
    case "Japanese":
      return { ok: "ja" };
    case "Chinese":
      return { ok: "zh" };
    case "Korean":
      return { ok: "ko" };
    default:
      // Any other script name is not mapped to a single language here.
      return { error: "too generic" };
  }
}
/**
 * Resolves the script label for a language code, falling back to detection
 * on the text when the code is not one of the hard-wired entries.
 *
 * @param lang - Language code; "th", "tha", "en", "es", "cn", "zh" and "ja"
 *   are mapped directly.
 * @param text - Sample text, only consulted when `lang` is not mapped.
 * @returns The mapped label, otherwise the script name reported by
 *   `detectScript(text)`, or an empty string when detection fails.
 */
export function scriptFromLang(lang: string, text: string): string {
  switch (lang) {
    case "th":
    case "tha":
      return "Thai";
    case "en":
      return "Engl";
    case "es":
      return "Span";
    case "cn":
    case "zh":
      return "Hant";
    case "ja":
      return "Japn";
  }

  // Unmapped language code: infer the script from the text itself.
  const detected = detectScript(text);
  return "ok" in detected ? detected.ok : "";
}
|