summaryrefslogtreecommitdiff
path: root/packages/langlib/src/unicode/index.ts
diff options
context:
space:
mode:
Diffstat (limited to 'packages/langlib/src/unicode/index.ts')
-rw-r--r--packages/langlib/src/unicode/index.ts1419
1 files changed, 1419 insertions, 0 deletions
diff --git a/packages/langlib/src/unicode/index.ts b/packages/langlib/src/unicode/index.ts
new file mode 100644
index 0000000..71736d4
--- /dev/null
+++ b/packages/langlib/src/unicode/index.ts
@@ -0,0 +1,1419 @@
+// Author: Amir Hossein Kargaran (TypeScript port)
+// Date: March, 2025
+
+import type { ISO_15924_CODE } from "../iso";
+
+// Description: This code detects/separates the script(s) (writing system(s)) of the given text.
+// TypeScript port of the original Python implementation.
+
// Types

/**
 * Diagnostic breakdown returned alongside a script prediction.
 * All three fields are `null` when no characters remain after filtering.
 */
interface ScriptDetails {
  /** Share of the filtered characters attributed to each script. */
  details: Map<ISO_15924_CODE, number> | null;
  /** `true` when the two best-scoring scripts have exactly the same score. */
  tie: boolean | null;
  /** Gap between the best and second-best score; `1` when only one script was seen. */
  interval: number | null;
}

/** Inclusive range of Unicode code points. */
type ScriptRange = [start: number, end: number];

/** Code point ranges per script; a script may be absent from the table. */
type ScriptRanges = Partial<Record<ISO_15924_CODE, ScriptRange[]>>;

/** Prediction result: winning script (or `null`), its score, and the breakdown. */
type ScoredScript = [
  script: ISO_15924_CODE | null,
  score: number,
  details: ScriptDetails,
];
+
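+// SCRIPT_RANGES maps each ISO 15924 script code to the list of Unicode code
+// point ranges assigned to that script. Every range is an inclusive
+// [start, end] pair of decimal code points. Besides real writing systems the
+// table contains the special codes Zyyy (Common), Zinh (Inherited) and Zzzz,
+// which here holds only U+FFFD (65533), the replacement character.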
+const SCRIPT_RANGES: ScriptRanges = {
+ Latn: [
+ [65, 90],
+ [97, 122],
+ [170, 170],
+ [186, 186],
+ [192, 214],
+ [216, 246],
+ [248, 696],
+ [736, 740],
+ [7424, 7461],
+ [7468, 7516],
+ [7522, 7525],
+ [7531, 7543],
+ [7545, 7614],
+ [7680, 7935],
+ [8305, 8305],
+ [8319, 8319],
+ [8336, 8348],
+ [8490, 8491],
+ [8498, 8498],
+ [8526, 8526],
+ [8544, 8584],
+ [11360, 11391],
+ [42786, 42887],
+ [42891, 42954],
+ [42960, 42961],
+ [42963, 42963],
+ [42965, 42969],
+ [42994, 43007],
+ [43824, 43866],
+ [43868, 43876],
+ [43878, 43881],
+ [64256, 64262],
+ [65313, 65338],
+ [65345, 65370],
+ [67456, 67461],
+ [67463, 67504],
+ [67506, 67514],
+ [122624, 122654],
+ [122661, 122666],
+ ], // Latin
+ Bopo: [
+ [746, 747],
+ [12549, 12591],
+ [12704, 12735],
+ ], // Bopomofo
+ Zinh: [
+ [768, 879],
+ [1157, 1158],
+ [2385, 2388],
+ [6832, 6862],
+ [7376, 7378],
+ [7380, 7392],
+ [7394, 7400],
+ [7405, 7405],
+ [7412, 7412],
+ [7416, 7417],
+ [7616, 7679],
+ [8204, 8205],
+ [8400, 8432],
+ [12330, 12333],
+ [12441, 12442],
+ [65024, 65039],
+ [65056, 65069],
+ [66045, 66045],
+ [66272, 66272],
+ [70459, 70459],
+ [118528, 118573],
+ [118576, 118598],
+ [119143, 119145],
+ [119163, 119170],
+ [119173, 119179],
+ [119210, 119213],
+ [917760, 917999],
+ ], // Inherited
+ Grek: [
+ [880, 883],
+ [885, 887],
+ [890, 893],
+ [895, 895],
+ [900, 900],
+ [902, 902],
+ [904, 906],
+ [908, 908],
+ [910, 929],
+ [931, 993],
+ [1008, 1023],
+ [7462, 7466],
+ [7517, 7521],
+ [7526, 7530],
+ [7615, 7615],
+ [7936, 7957],
+ [7960, 7965],
+ [7968, 8005],
+ [8008, 8013],
+ [8016, 8023],
+ [8025, 8025],
+ [8027, 8027],
+ [8029, 8029],
+ [8031, 8061],
+ [8064, 8116],
+ [8118, 8132],
+ [8134, 8147],
+ [8150, 8155],
+ [8157, 8175],
+ [8178, 8180],
+ [8182, 8190],
+ [8486, 8486],
+ [43877, 43877],
+ [65856, 65934],
+ [65952, 65952],
+ [119296, 119365],
+ ], // Greek
+ Copt: [
+ [994, 1007],
+ [11392, 11507],
+ [11513, 11519],
+ ], // Coptic
+ Cyrl: [
+ [1024, 1156],
+ [1159, 1327],
+ [7296, 7304],
+ [7467, 7467],
+ [7544, 7544],
+ [11744, 11775],
+ [42560, 42655],
+ [65070, 65071],
+ [122928, 122989],
+ [123023, 123023],
+ ], // Cyrillic
+ Armn: [
+ [1329, 1366],
+ [1369, 1418],
+ [1421, 1423],
+ [64275, 64279],
+ ], // Armenian
+ Hebr: [
+ [1425, 1479],
+ [1488, 1514],
+ [1519, 1524],
+ [64285, 64310],
+ [64312, 64316],
+ [64318, 64318],
+ [64320, 64321],
+ [64323, 64324],
+ [64326, 64335],
+ ], // Hebrew
+ Arab: [
+ [1536, 1540],
+ [1542, 1547],
+ [1549, 1562],
+ [1564, 1566],
+ [1568, 1599],
+ [1601, 1648],
+ [1649, 1756],
+ [1758, 1791],
+ [1872, 1919],
+ [2160, 2190],
+ [2192, 2193],
+ [2200, 2273],
+ [2275, 2303],
+ [64336, 64450],
+ [64467, 64829],
+ [64832, 64911],
+ [64914, 64967],
+ [64975, 64975],
+ [65008, 65023],
+ [65136, 65140],
+ [65142, 65276],
+ [69216, 69246],
+ [69373, 69375],
+ [126464, 126467],
+ [126469, 126495],
+ [126497, 126498],
+ [126500, 126500],
+ [126503, 126503],
+ [126505, 126514],
+ [126516, 126519],
+ [126521, 126521],
+ [126523, 126523],
+ [126530, 126530],
+ [126535, 126535],
+ [126537, 126537],
+ [126539, 126539],
+ [126541, 126543],
+ [126545, 126546],
+ [126548, 126548],
+ [126551, 126551],
+ [126553, 126553],
+ [126555, 126555],
+ [126557, 126557],
+ [126559, 126559],
+ [126561, 126562],
+ [126564, 126564],
+ [126567, 126570],
+ [126572, 126578],
+ [126580, 126583],
+ [126585, 126588],
+ [126590, 126590],
+ [126592, 126601],
+ [126603, 126619],
+ [126625, 126627],
+ [126629, 126633],
+ [126635, 126651],
+ [126704, 126705],
+ ], // Arabic
+ Syrc: [
+ [1792, 1805],
+ [1807, 1866],
+ [1869, 1871],
+ [2144, 2154],
+ ], // Syriac
+ Thaa: [[1920, 1969]], // Thaana
+ Nkoo: [
+ [1984, 2042],
+ [2045, 2047],
+ ], // Nko
+ Samr: [
+ [2048, 2093],
+ [2096, 2110],
+ ], // Samaritan
+ Mand: [
+ [2112, 2139],
+ [2142, 2142],
+ ], // Mandaic
+ Deva: [
+ [2304, 2384],
+ [2389, 2403],
+ [2406, 2431],
+ [43232, 43263],
+ [72448, 72457],
+ ], // Devanagari
+ Beng: [
+ [2432, 2435],
+ [2437, 2444],
+ [2447, 2448],
+ [2451, 2472],
+ [2474, 2480],
+ [2482, 2482],
+ [2486, 2489],
+ [2492, 2500],
+ [2503, 2504],
+ [2507, 2510],
+ [2519, 2519],
+ [2524, 2525],
+ [2527, 2531],
+ [2534, 2558],
+ ], // Bengali
+ Guru: [
+ [2561, 2563],
+ [2565, 2570],
+ [2575, 2576],
+ [2579, 2600],
+ [2602, 2608],
+ [2610, 2611],
+ [2613, 2614],
+ [2616, 2617],
+ [2620, 2620],
+ [2622, 2626],
+ [2631, 2632],
+ [2635, 2637],
+ [2641, 2641],
+ [2649, 2652],
+ [2654, 2654],
+ [2662, 2678],
+ ], // Gurmukhi
+ Gujr: [
+ [2689, 2691],
+ [2693, 2701],
+ [2703, 2705],
+ [2707, 2728],
+ [2730, 2736],
+ [2738, 2739],
+ [2741, 2745],
+ [2748, 2757],
+ [2759, 2761],
+ [2763, 2765],
+ [2768, 2768],
+ [2784, 2787],
+ [2790, 2801],
+ [2809, 2815],
+ ], // Gujarati
+ Orya: [
+ [2817, 2819],
+ [2821, 2828],
+ [2831, 2832],
+ [2835, 2856],
+ [2858, 2864],
+ [2866, 2867],
+ [2869, 2873],
+ [2876, 2884],
+ [2887, 2888],
+ [2891, 2893],
+ [2901, 2903],
+ [2908, 2909],
+ [2911, 2915],
+ [2918, 2935],
+ ], // Oriya
+ Taml: [
+ [2946, 2947],
+ [2949, 2954],
+ [2958, 2960],
+ [2962, 2965],
+ [2969, 2970],
+ [2972, 2972],
+ [2974, 2975],
+ [2979, 2980],
+ [2984, 2986],
+ [2990, 3001],
+ [3006, 3010],
+ [3014, 3016],
+ [3018, 3021],
+ [3024, 3024],
+ [3031, 3031],
+ [3046, 3066],
+ [73664, 73713],
+ [73727, 73727],
+ ], // Tamil
+ Telu: [
+ [3072, 3084],
+ [3086, 3088],
+ [3090, 3112],
+ [3114, 3129],
+ [3132, 3140],
+ [3142, 3144],
+ [3146, 3149],
+ [3157, 3158],
+ [3160, 3162],
+ [3165, 3165],
+ [3168, 3171],
+ [3174, 3183],
+ [3191, 3199],
+ ], // Telugu
+ Knda: [
+ [3200, 3212],
+ [3214, 3216],
+ [3218, 3240],
+ [3242, 3251],
+ [3253, 3257],
+ [3260, 3268],
+ [3270, 3272],
+ [3274, 3277],
+ [3285, 3286],
+ [3293, 3294],
+ [3296, 3299],
+ [3302, 3311],
+ [3313, 3315],
+ ], // Kannada
+ Mlym: [
+ [3328, 3340],
+ [3342, 3344],
+ [3346, 3396],
+ [3398, 3400],
+ [3402, 3407],
+ [3412, 3427],
+ [3430, 3455],
+ ], // Malayalam
+ Sinh: [
+ [3457, 3459],
+ [3461, 3478],
+ [3482, 3505],
+ [3507, 3515],
+ [3517, 3517],
+ [3520, 3526],
+ [3530, 3530],
+ [3535, 3540],
+ [3542, 3542],
+ [3544, 3551],
+ [3558, 3567],
+ [3570, 3572],
+ [70113, 70132],
+ ], // Sinhala
+ Thai: [
+ [3585, 3642],
+ [3648, 3675],
+ ], // Thai
+ Laoo: [
+ [3713, 3714],
+ [3716, 3716],
+ [3718, 3722],
+ [3724, 3747],
+ [3749, 3749],
+ [3751, 3773],
+ [3776, 3780],
+ [3782, 3782],
+ [3784, 3790],
+ [3792, 3801],
+ [3804, 3807],
+ ], // Lao
+ Tibt: [
+ [3840, 3911],
+ [3913, 3948],
+ [3953, 3991],
+ [3993, 4028],
+ [4030, 4044],
+ [4046, 4052],
+ [4057, 4058],
+ ], // Tibetan
+ Mymr: [
+ [4096, 4255],
+ [43488, 43518],
+ [43616, 43647],
+ ], // Myanmar
+ Geor: [
+ [4256, 4293],
+ [4295, 4295],
+ [4301, 4301],
+ [4304, 4346],
+ [4348, 4351],
+ [7312, 7354],
+ [7357, 7359],
+ [11520, 11557],
+ [11559, 11559],
+ [11565, 11565],
+ ], // Georgian
+ Hang: [
+ [4352, 4607],
+ [12334, 12335],
+ [12593, 12686],
+ [12800, 12830],
+ [12896, 12926],
+ [43360, 43388],
+ [44032, 55203],
+ [55216, 55238],
+ [55243, 55291],
+ [65440, 65470],
+ [65474, 65479],
+ [65482, 65487],
+ [65490, 65495],
+ [65498, 65500],
+ ], // Hangul
+ Ethi: [
+ [4608, 4680],
+ [4682, 4685],
+ [4688, 4694],
+ [4696, 4696],
+ [4698, 4701],
+ [4704, 4744],
+ [4746, 4749],
+ [4752, 4784],
+ [4786, 4789],
+ [4792, 4798],
+ [4800, 4800],
+ [4802, 4805],
+ [4808, 4822],
+ [4824, 4880],
+ [4882, 4885],
+ [4888, 4954],
+ [4957, 4988],
+ [4992, 5017],
+ [11648, 11670],
+ [11680, 11686],
+ [11688, 11694],
+ [11696, 11702],
+ [11704, 11710],
+ [11712, 11718],
+ [11720, 11726],
+ [11728, 11734],
+ [11736, 11742],
+ [43777, 43782],
+ [43785, 43790],
+ [43793, 43798],
+ [43808, 43814],
+ [43816, 43822],
+ [124896, 124902],
+ [124904, 124907],
+ [124909, 124910],
+ [124912, 124926],
+ ], // Ethiopic
+ Cher: [
+ [5024, 5109],
+ [5112, 5117],
+ [43888, 43967],
+ ], // Cherokee
+ Cans: [
+ [5120, 5759],
+ [6320, 6389],
+ [72368, 72383],
+ ], // Canadian_Aboriginal
+ Ogam: [[5760, 5788]], // Ogham
+ Runr: [
+ [5792, 5866],
+ [5870, 5880],
+ ], // Runic
+ Tglg: [
+ [5888, 5909],
+ [5919, 5919],
+ ], // Tagalog
+ Hano: [[5920, 5940]], // Hanunoo
+ Buhd: [[5952, 5971]], // Buhid
+ Tagb: [
+ [5984, 5996],
+ [5998, 6000],
+ [6002, 6003],
+ ], // Tagbanwa
+ Khmr: [
+ [6016, 6109],
+ [6112, 6121],
+ [6128, 6137],
+ [6624, 6655],
+ ], // Khmer
+ Mong: [
+ [6144, 6145],
+ [6148, 6148],
+ [6150, 6169],
+ [6176, 6264],
+ [6272, 6314],
+ [71264, 71276],
+ ], // Mongolian
+ Limb: [
+ [6400, 6430],
+ [6432, 6443],
+ [6448, 6459],
+ [6464, 6464],
+ [6468, 6479],
+ ], // Limbu
+ Tale: [
+ [6480, 6509],
+ [6512, 6516],
+ ], // Tai_Le
+ Talu: [
+ [6528, 6571],
+ [6576, 6601],
+ [6608, 6618],
+ [6622, 6623],
+ ], // New_Tai_Lue
+ Bugi: [
+ [6656, 6683],
+ [6686, 6687],
+ ], // Buginese
+ Lana: [
+ [6688, 6750],
+ [6752, 6780],
+ [6783, 6793],
+ [6800, 6809],
+ [6816, 6829],
+ ], // Tai_Tham
+ Bali: [
+ [6912, 6988],
+ [6992, 7038],
+ ], // Balinese
+ Sund: [
+ [7040, 7103],
+ [7360, 7367],
+ ], // Sundanese
+ Batk: [
+ [7104, 7155],
+ [7164, 7167],
+ ], // Batak
+ Lepc: [
+ [7168, 7223],
+ [7227, 7241],
+ [7245, 7247],
+ ], // Lepcha
+ Olck: [[7248, 7295]], // Ol_Chiki
+ Brai: [[10240, 10495]], // Braille
+ Glag: [
+ [11264, 11359],
+ [122880, 122886],
+ [122888, 122904],
+ [122907, 122913],
+ [122915, 122916],
+ [122918, 122922],
+ ], // Glagolitic
+ Tfng: [
+ [11568, 11623],
+ [11631, 11632],
+ [11647, 11647],
+ ], // Tifinagh
+ Hani: [
+ [11904, 11929],
+ [11931, 12019],
+ [12032, 12245],
+ [12293, 12293],
+ [12295, 12295],
+ [12321, 12329],
+ [12344, 12347],
+ [13312, 19903],
+ [19968, 40959],
+ [63744, 64109],
+ [64112, 64217],
+ [94178, 94179],
+ [94192, 94193],
+ [131072, 173791],
+ [173824, 177977],
+ [177984, 178205],
+ [178208, 183969],
+ [183984, 191456],
+ [194560, 195101],
+ [196608, 201546],
+ [201552, 205743],
+ ], // Han
+ Hira: [
+ [12353, 12438],
+ [12445, 12447],
+ [110593, 110879],
+ [110898, 110898],
+ [110928, 110930],
+ [127488, 127488],
+ ], // Hiragana
+ Kana: [
+ [12449, 12538],
+ [12541, 12543],
+ [12784, 12799],
+ [13008, 13054],
+ [13056, 13143],
+ [65382, 65391],
+ [65393, 65437],
+ [110576, 110579],
+ [110581, 110587],
+ [110589, 110590],
+ [110592, 110592],
+ [110880, 110882],
+ [110933, 110933],
+ [110948, 110951],
+ ], // Katakana
+ Yiii: [
+ [40960, 42124],
+ [42128, 42182],
+ ], // Yi
+ Lisu: [
+ [42192, 42239],
+ [73648, 73648],
+ ], // Lisu
+ Vaii: [[42240, 42539]], // Vai
+ Bamu: [
+ [42656, 42743],
+ [92160, 92728],
+ ], // Bamum
+ Sylo: [[43008, 43052]], // Syloti_Nagri
+ Phag: [[43072, 43127]], // Phags_Pa
+ Saur: [
+ [43136, 43205],
+ [43214, 43225],
+ ], // Saurashtra
+ Kali: [
+ [43264, 43309],
+ [43311, 43311],
+ ], // Kayah_Li
+ Rjng: [
+ [43312, 43347],
+ [43359, 43359],
+ ], // Rejang
+ Java: [
+ [43392, 43469],
+ [43472, 43481],
+ [43486, 43487],
+ ], // Javanese
+ Cham: [
+ [43520, 43574],
+ [43584, 43597],
+ [43600, 43609],
+ [43612, 43615],
+ ], // Cham
+ Tavt: [
+ [43648, 43714],
+ [43739, 43743],
+ ], // Tai_Viet
+ Mtei: [
+ [43744, 43766],
+ [43968, 44013],
+ [44016, 44025],
+ ], // Meetei_Mayek
+ Linb: [
+ [65536, 65547],
+ [65549, 65574],
+ [65576, 65594],
+ [65596, 65597],
+ [65599, 65613],
+ [65616, 65629],
+ [65664, 65786],
+ ], // Linear_B
+ Lyci: [[66176, 66204]], // Lycian
+ Cari: [[66208, 66256]], // Carian
+ Ital: [
+ [66304, 66339],
+ [66349, 66351],
+ ], // Old_Italic
+ Goth: [[66352, 66378]], // Gothic
+ Perm: [[66384, 66426]], // Old_Permic
+ Ugar: [
+ [66432, 66461],
+ [66463, 66463],
+ ], // Ugaritic
+ Xpeo: [
+ [66464, 66499],
+ [66504, 66517],
+ ], // Old_Persian
+ Dsrt: [[66560, 66639]], // Deseret
+ Shaw: [[66640, 66687]], // Shavian
+ Osma: [
+ [66688, 66717],
+ [66720, 66729],
+ ], // Osmanya
+ Osge: [
+ [66736, 66771],
+ [66776, 66811],
+ ], // Osage
+ Elba: [[66816, 66855]], // Elbasan
+ Aghb: [
+ [66864, 66915],
+ [66927, 66927],
+ ], // Caucasian_Albanian
+ Vith: [
+ [66928, 66938],
+ [66940, 66954],
+ [66956, 66962],
+ [66964, 66965],
+ [66967, 66977],
+ [66979, 66993],
+ [66995, 67001],
+ [67003, 67004],
+ ], // Vithkuqi
+ Lina: [
+ [67072, 67382],
+ [67392, 67413],
+ [67424, 67431],
+ ], // Linear_A
+ Cprt: [
+ [67584, 67589],
+ [67592, 67592],
+ [67594, 67637],
+ [67639, 67640],
+ [67644, 67644],
+ [67647, 67647],
+ ], // Cypriot
+ Armi: [
+ [67648, 67669],
+ [67671, 67679],
+ ], // Imperial_Aramaic
+ Palm: [[67680, 67711]], // Palmyrene
+ Nbat: [
+ [67712, 67742],
+ [67751, 67759],
+ ], // Nabataean
+ Hatr: [
+ [67808, 67826],
+ [67828, 67829],
+ [67835, 67839],
+ ], // Hatran
+ Phnx: [
+ [67840, 67867],
+ [67871, 67871],
+ ], // Phoenician
+ Lydi: [
+ [67872, 67897],
+ [67903, 67903],
+ ], // Lydian
+ Mero: [[67968, 67999]], // Meroitic_Hieroglyphs
+ Merc: [
+ [68000, 68023],
+ [68028, 68047],
+ [68050, 68095],
+ ], // Meroitic_Cursive
+ Khar: [
+ [68096, 68099],
+ [68101, 68102],
+ [68108, 68115],
+ [68117, 68119],
+ [68121, 68149],
+ [68152, 68154],
+ [68159, 68168],
+ [68176, 68184],
+ ], // Kharoshthi
+ Sarb: [[68192, 68223]], // Old_South_Arabian
+ Narb: [[68224, 68255]], // Old_North_Arabian
+ Mani: [
+ [68288, 68326],
+ [68331, 68342],
+ ], // Manichaean
+ Avst: [
+ [68352, 68405],
+ [68409, 68415],
+ ], // Avestan
+ Prti: [
+ [68416, 68437],
+ [68440, 68447],
+ ], // Inscriptional_Parthian
+ Phli: [
+ [68448, 68466],
+ [68472, 68479],
+ ], // Inscriptional_Pahlavi
+ Phlp: [
+ [68480, 68497],
+ [68505, 68508],
+ [68521, 68527],
+ ], // Psalter_Pahlavi
+ Orkh: [[68608, 68680]], // Old_Turkic
+ Hung: [
+ [68736, 68786],
+ [68800, 68850],
+ [68858, 68863],
+ ], // Old_Hungarian
+ Rohg: [
+ [68864, 68903],
+ [68912, 68921],
+ ], // Hanifi_Rohingya
+ Yezi: [
+ [69248, 69289],
+ [69291, 69293],
+ [69296, 69297],
+ ], // Yezidi
+ Sogo: [[69376, 69415]], // Old_Sogdian
+ Sogd: [[69424, 69465]], // Sogdian
+ Ougr: [[69488, 69513]], // Old_Uyghur
+ Chrs: [[69552, 69579]], // Chorasmian
+ Elym: [[69600, 69622]], // Elymaic
+ Brah: [
+ [69632, 69709],
+ [69714, 69749],
+ [69759, 69759],
+ ], // Brahmi
+ Kthi: [
+ [69760, 69826],
+ [69837, 69837],
+ ], // Kaithi
+ Sora: [
+ [69840, 69864],
+ [69872, 69881],
+ ], // Sora_Sompeng
+ Cakm: [
+ [69888, 69940],
+ [69942, 69959],
+ ], // Chakma
+ Mahj: [[69968, 70006]], // Mahajani
+ Shrd: [[70016, 70111]], // Sharada
+ Khoj: [
+ [70144, 70161],
+ [70163, 70209],
+ ], // Khojki
+ Mult: [
+ [70272, 70278],
+ [70280, 70280],
+ [70282, 70285],
+ [70287, 70301],
+ [70303, 70313],
+ ], // Multani
+ Sind: [
+ [70320, 70378],
+ [70384, 70393],
+ ], // Khudawadi
+ Gran: [
+ [70400, 70403],
+ [70405, 70412],
+ [70415, 70416],
+ [70419, 70440],
+ [70442, 70448],
+ [70450, 70451],
+ [70453, 70457],
+ [70460, 70468],
+ [70471, 70472],
+ [70475, 70477],
+ [70480, 70480],
+ [70487, 70487],
+ [70493, 70499],
+ [70502, 70508],
+ [70512, 70516],
+ ], // Grantha
+ Newa: [
+ [70656, 70747],
+ [70749, 70753],
+ ], // Newa
+ Tirh: [
+ [70784, 70855],
+ [70864, 70873],
+ ], // Tirhuta
+ Sidd: [
+ [71040, 71093],
+ [71096, 71133],
+ ], // Siddham
+ Modi: [
+ [71168, 71236],
+ [71248, 71257],
+ ], // Modi
+ Takr: [
+ [71296, 71353],
+ [71360, 71369],
+ ], // Takri
+ Ahom: [
+ [71424, 71450],
+ [71453, 71467],
+ [71472, 71494],
+ ], // Ahom
+ Dogr: [[71680, 71739]], // Dogra
+ Wara: [
+ [71840, 71922],
+ [71935, 71935],
+ ], // Warang_Citi
+ Diak: [
+ [71936, 71942],
+ [71945, 71945],
+ [71948, 71955],
+ [71957, 71958],
+ [71960, 71989],
+ [71991, 71992],
+ [71995, 72006],
+ [72016, 72025],
+ ], // Dives_Akuru
+ Nand: [
+ [72096, 72103],
+ [72106, 72151],
+ [72154, 72164],
+ ], // Nandinagari
+ Zanb: [[72192, 72263]], // Zanabazar_Square
+ Soyo: [[72272, 72354]], // Soyombo
+ Pauc: [[72384, 72440]], // Pau_Cin_Hau
+ Bhks: [
+ [72704, 72712],
+ [72714, 72758],
+ [72760, 72773],
+ [72784, 72812],
+ ], // Bhaiksuki
+ Marc: [
+ [72816, 72847],
+ [72850, 72871],
+ [72873, 72886],
+ ], // Marchen
+ Gonm: [
+ [72960, 72966],
+ [72968, 72969],
+ [72971, 73014],
+ [73018, 73018],
+ [73020, 73021],
+ [73023, 73031],
+ [73040, 73049],
+ ], // Masaram_Gondi
+ Gong: [
+ [73056, 73061],
+ [73063, 73064],
+ [73066, 73102],
+ [73104, 73105],
+ [73107, 73112],
+ [73120, 73129],
+ ], // Gunjala_Gondi
+ Maka: [[73440, 73464]], // Makasar
+ Kawi: [
+ [73472, 73488],
+ [73490, 73530],
+ [73534, 73561],
+ ], // Kawi
+ Xsux: [
+ [73728, 74649],
+ [74752, 74862],
+ [74864, 74868],
+ [74880, 75075],
+ ], // Cuneiform
+ Cpmn: [[77712, 77810]], // Cypro_Minoan
+ Egyp: [[77824, 78933]], // Egyptian_Hieroglyphs
+ Hluw: [[82944, 83526]], // Anatolian_Hieroglyphs
+ Mroo: [
+ [92736, 92766],
+ [92768, 92777],
+ [92782, 92783],
+ ], // Mro
+ Tnsa: [
+ [92784, 92862],
+ [92864, 92873],
+ ], // Tangsa
+ Bass: [
+ [92880, 92909],
+ [92912, 92917],
+ ], // Bassa_Vah
+ Hmng: [
+ [92928, 92997],
+ [93008, 93017],
+ [93019, 93025],
+ [93027, 93047],
+ [93053, 93071],
+ ], // Pahawh_Hmong
+ Medf: [[93760, 93850]], // Medefaidrin
+ Plrd: [
+ [93952, 94026],
+ [94031, 94087],
+ [94095, 94111],
+ ], // Miao
+ Tang: [
+ [94176, 94176],
+ [94208, 100343],
+ [100352, 101119],
+ [101632, 101640],
+ ], // Tangut
+ Nshu: [
+ [94177, 94177],
+ [110960, 111355],
+ ], // Nushu
+ Kits: [
+ [94180, 94180],
+ [101120, 101589],
+ ], // Khitan_Small_Script
+ Dupl: [
+ [113664, 113770],
+ [113776, 113788],
+ [113792, 113800],
+ [113808, 113817],
+ [113820, 113823],
+ ], // Duployan
+ Sgnw: [
+ [120832, 121483],
+ [121499, 121503],
+ [121505, 121519],
+ ], // SignWriting
+ Hmnp: [
+ [123136, 123180],
+ [123184, 123197],
+ [123200, 123209],
+ [123214, 123215],
+ ], // Nyiakeng_Puachue_Hmong
+ Toto: [[123536, 123566]], // Toto
+ Wcho: [
+ [123584, 123641],
+ [123647, 123647],
+ ], // Wancho
+ Nagm: [[124112, 124153]], // Nag_Mundari
+ Mend: [
+ [124928, 125124],
+ [125127, 125142],
+ ], // Mende_Kikakui
+ Adlm: [
+ [125184, 125259],
+ [125264, 125273],
+ [125278, 125279],
+ ], // Adlam
+ Zyyy: [
+ [0, 64],
+ [91, 96],
+ [123, 169],
+ [171, 185],
+ [187, 191],
+ [215, 215],
+ [247, 247],
+ [697, 735],
+ [741, 745],
+ [748, 767],
+ [884, 884],
+ [894, 894],
+ [901, 901],
+ [903, 903],
+ [1541, 1541],
+ [1548, 1548],
+ [1563, 1563],
+ [1567, 1567],
+ [1600, 1600],
+ [1757, 1757],
+ [2274, 2274],
+ [2404, 2405],
+ [3647, 3647],
+ [4053, 4056],
+ [4347, 4347],
+ [5867, 5869],
+ [5941, 5942],
+ [6146, 6147],
+ [6149, 6149],
+ [7379, 7379],
+ [7393, 7393],
+ [7401, 7404],
+ [7406, 7411],
+ [7413, 7415],
+ [7418, 7418],
+ [8192, 8203],
+ [8206, 8292],
+ [8294, 8304],
+ [8308, 8318],
+ [8320, 8334],
+ [8352, 8384],
+ [8448, 8485],
+ [8487, 8489],
+ [8492, 8497],
+ [8499, 8525],
+ [8527, 8543],
+ [8585, 8587],
+ [8592, 9254],
+ [9280, 9290],
+ [9312, 10239],
+ [10496, 11123],
+ [11126, 11157],
+ [11159, 11263],
+ [11776, 11869],
+ [12272, 12283],
+ [12288, 12292],
+ [12294, 12294],
+ [12296, 12320],
+ [12336, 12343],
+ [12348, 12351],
+ [12443, 12444],
+ [12448, 12448],
+ [12539, 12540],
+ [12688, 12703],
+ [12736, 12771],
+ [12832, 12895],
+ [12927, 13007],
+ [13055, 13055],
+ [13144, 13311],
+ [19904, 19967],
+ [42752, 42785],
+ [42888, 42890],
+ [43056, 43065],
+ [43310, 43310],
+ [43471, 43471],
+ [43867, 43867],
+ [43882, 43883],
+ [64830, 64831],
+ [65040, 65049],
+ [65072, 65106],
+ [65108, 65126],
+ [65128, 65131],
+ [65279, 65279],
+ [65281, 65312],
+ [65339, 65344],
+ [65371, 65381],
+ [65392, 65392],
+ [65438, 65439],
+ [65504, 65510],
+ [65512, 65518],
+ [65529, 65532],
+ [65792, 65794],
+ [65799, 65843],
+ [65847, 65855],
+ [65936, 65948],
+ [66000, 66044],
+ [66273, 66299],
+ [113824, 113827],
+ [118608, 118723],
+ [118784, 119029],
+ [119040, 119078],
+ [119081, 119142],
+ [119146, 119162],
+ [119171, 119172],
+ [119180, 119209],
+ [119214, 119274],
+ [119488, 119507],
+ [119520, 119539],
+ [119552, 119638],
+ [119648, 119672],
+ [119808, 119892],
+ [119894, 119964],
+ [119966, 119967],
+ [119970, 119970],
+ [119973, 119974],
+ [119977, 119980],
+ [119982, 119993],
+ [119995, 119995],
+ [119997, 120003],
+ [120005, 120069],
+ [120071, 120074],
+ [120077, 120084],
+ [120086, 120092],
+ [120094, 120121],
+ [120123, 120126],
+ [120128, 120132],
+ [120134, 120134],
+ [120138, 120144],
+ [120146, 120485],
+ [120488, 120779],
+ [120782, 120831],
+ [126065, 126132],
+ [126209, 126269],
+ [126976, 127019],
+ [127024, 127123],
+ [127136, 127150],
+ [127153, 127167],
+ [127169, 127183],
+ [127185, 127221],
+ [127232, 127405],
+ [127462, 127487],
+ [127489, 127490],
+ [127504, 127547],
+ [127552, 127560],
+ [127568, 127569],
+ [127584, 127589],
+ [127744, 128727],
+ [128732, 128748],
+ [128752, 128764],
+ [128768, 128886],
+ [128891, 128985],
+ [128992, 129003],
+ [129008, 129008],
+ [129024, 129035],
+ [129040, 129095],
+ [129104, 129113],
+ [129120, 129159],
+ [129168, 129197],
+ [129200, 129201],
+ [129280, 129619],
+ [129632, 129645],
+ [129648, 129660],
+ [129664, 129672],
+ [129680, 129725],
+ [129727, 129733],
+ [129742, 129755],
+ [129760, 129768],
+ [129776, 129784],
+ [129792, 129938],
+ [129940, 129994],
+ [130032, 130041],
+ [917505, 917505],
+ [917536, 917631],
+ ], // Common
+ Zzzz: [[65533, 65533]],
+};
+
/**
 * Builds a function that predicts the dominant script of a piece of text.
 *
 * The code point → script lookup table is built once per call to this
 * factory, so create one predictor and reuse it for many inputs.
 *
 * @param replaceWhitespace - When `true`, whitespace (JavaScript `\s`) is ignored.
 * @param replacePunctuation - When `true`, ASCII punctuation is ignored.
 * @param replaceDigits - When `true`, ASCII digits (`\d`) are ignored.
 * @returns A predictor returning `[script, score, details]`. `score` is the
 *   fraction of the remaining characters (counted as Unicode code points)
 *   that belong to `script`. When nothing remains after filtering the result
 *   is `[null, 0, { details: null, tie: null, interval: null }]`.
 *   `details.details` is ordered from highest to lowest score.
 */
export function getScriptPredictor(
  replaceWhitespace: boolean = true,
  replacePunctuation: boolean = true,
  replaceDigits: boolean = true,
): (sent: string) => ScoredScript {
  // Expand every range once so that a prediction costs a single Map lookup
  // per character. A Set is used because ranges of different scripts are
  // allowed to cover the same code point.
  const histMap = new Map<number, Set<ISO_15924_CODE>>();
  for (const [key, ranges] of Object.entries(SCRIPT_RANGES)) {
    const script = key as ISO_15924_CODE;
    for (const [start, end] of ranges) {
      for (let ordinal = start; ordinal <= end; ordinal++) {
        let owners = histMap.get(ordinal);
        if (owners === undefined) {
          owners = new Set();
          histMap.set(ordinal, owners);
        }
        owners.add(script);
      }
    }
  }

  // Code points that appear in no range are counted as "Zzzz". The fallback
  // set is shared instead of being allocated for every unknown character.
  const unknownScripts: ReadonlySet<ISO_15924_CODE> = new Set<ISO_15924_CODE>([
    "Zzzz",
  ]);

  const whitespacePattern = /\s/;
  // ASCII punctuation. "-" and "\" are escaped explicitly: unescaped, "\]" is
  // just an escaped "]" (so the backslash itself was never matched) and
  // ",-." only matched "-" by way of an accidental character range.
  const punctuationPattern = /[!"#$%&'()*+,\-./:;<=>?@[\\\]^_`{|}~]/;
  const digitPattern = /\d/;

  // Decides whether a single character is excluded from scoring.
  const shouldDrop = (char: string): boolean =>
    (replaceWhitespace && whitespacePattern.test(char)) ||
    (replacePunctuation && punctuationPattern.test(char)) ||
    (replaceDigits && digitPattern.test(char));

  return (sent: string): ScoredScript => {
    // Array.from iterates by code point, so a surrogate pair is one character
    // and `total` agrees with the per-character counts below. Dividing by the
    // UTF-16 length instead would halve the score of supplementary-plane text.
    const chars = Array.from(sent).filter((char) => !shouldDrop(char));
    const total = chars.length;

    if (total === 0) {
      return [null, 0, { details: null, tie: null, interval: null }];
    }

    // Count characters by script.
    const scriptCount = new Map<ISO_15924_CODE, number>();
    for (const char of chars) {
      // Every element produced by Array.from(string) is non-empty.
      const ordinal = char.codePointAt(0)!;
      const scripts = histMap.get(ordinal) ?? unknownScripts;
      for (const script of scripts) {
        scriptCount.set(script, (scriptCount.get(script) ?? 0) + 1);
      }
    }

    // Rank scripts by share. Array.prototype.sort is stable, so among equal
    // scores the script encountered first in the text stays in front.
    const ranked = Array.from(
      scriptCount,
      ([script, count]): [ISO_15924_CODE, number] => [script, count / total],
    ).sort((a, b) => b[1] - a[1]);

    // `total > 0` guarantees at least one ranked entry.
    const [maxScript, maxScore] = ranked[0]!;
    const runnerUp = ranked[1];
    // With a single script there is no runner-up; the interval is fixed at 1.
    const interval = runnerUp === undefined ? 1 : maxScore - runnerUp[1];

    return [
      maxScript,
      maxScore,
      {
        details: new Map(ranked),
        tie: interval === 0,
        interval,
      },
    ];
  };
}
+
/**
 * Splits a string into per-script substrings.
 *
 * Each character is appended to the bucket of every script whose ranges
 * contain its code point. The ASCII space is appended to every script that
 * has at least one range, so word boundaries survive inside each bucket.
 * Characters covered by no range are dropped.
 *
 * @param sent - Text to split.
 * @returns Script code → characters of that script in their original order.
 *   Buckets that are empty or contain only whitespace are omitted.
 */
export function separateScript(sent: string): Record<string, string> {
  const buckets: Record<string, string[]> = {};
  const scriptTable = Object.entries(SCRIPT_RANGES);

  for (const symbol of sent) {
    const cp = symbol.codePointAt(0)!;
    const isSpace = symbol === " ";

    for (const [name, spans] of scriptTable) {
      // One push per script at most, however many of its ranges match.
      const belongs = spans.some(
        ([lo, hi]) => isSpace || (lo <= cp && cp <= hi),
      );
      if (!belongs) {
        continue;
      }

      let bucket = buckets[name];
      if (!bucket) {
        bucket = [];
        buckets[name] = bucket;
      }
      bucket.push(symbol);
    }
  }

  // Join each bucket and keep only those holding something besides whitespace.
  const separated: Record<string, string> = {};
  for (const name of Object.keys(buckets)) {
    const text = buckets[name]!.join("");
    if (text.trim() !== "") {
      separated[name] = text;
    }
  }

  return separated;
}
+
+// Test functions
/**
 * Smoke test for the predictor returned by `getScriptPredictor()` with its
 * default settings. Mismatches are reported through `console.assert`, which
 * logs but does not throw.
 */
export function testPredictScript(): void {
  type Expectation = [string | null, number];

  // [input text, [expected script, expected score]]
  const cases: ReadonlyArray<[string, Expectation]> = [
    ["this is a latin script.", ["Latn", 1.0]],
    ["isso é escrita latina 1234", ["Latn", 1.0]],
    ["এটি বাংলা লিপি", ["Beng", 1.0]],
    ["นี่คืออักษรไทย", ["Thai", 1.0]],
    [
      "자미로콰이 Jamiroquai는 영국의 애시드 재즈 밴드이다 자미로콰이는 년대 초반 런던에서 활발하게 일어난 애시드 재즈",
      ["Hang", 0.8148148148148148],
    ],
    ["이어지는기사 에서그점 에관해알려줄것 입니다", ["Hang", 1.0]],
    ["12345", [null, 0]],
    [" ", [null, 0]],
    ["", [null, 0]],
  ];

  const predict = getScriptPredictor();
  const tolerance = 0.0001;

  cases.forEach(([text, wanted]) => {
    const [script, score] = predict(text);
    const scriptMatches = script === wanted[0];
    const scoreMatches = Math.abs(score - wanted[1]) < tolerance;

    console.assert(
      scriptMatches && scoreMatches,
      `Test failed for "${text}"\nExpected: ${wanted}\nGot: [${script}, ${score}]`,
    );
  });
}
+
/**
 * Smoke test for `separateScript` on a sentence mixing five scripts.
 *
 * For each expected script the space-separated tokens are compared as sorted
 * lists, so token order and surrounding spaces are irrelevant. Failures are
 * reported through `console.assert`, which logs but does not throw; a script
 * that is missing from the output is therefore reported and skipped instead
 * of crashing the remaining checks.
 */
export function testSeparateScript(): void {
  const sent = "Hello Salut سلام 你好 こんにちは שלום مرحبا";
  const detected = separateScript(sent);

  const groundTruth: Record<string, string> = {
    Latn: "Hello Salut ",
    Hebr: " שלום ",
    Arab: " سلام مرحبا",
    Hani: " 你好 ",
    Hira: " こんにちは ",
  };

  // Splits on spaces, drops empty tokens and returns them in sorted order.
  const sortedTokens = (text: string): string[] =>
    text
      .split(" ")
      .map((x) => x.trim())
      .filter((x) => x.length > 0)
      .sort();

  for (const [key, expectedText] of Object.entries(groundTruth)) {
    const detectedText = detected[key];

    console.assert(
      detectedText !== undefined,
      `Error: '${key}' script not found in detected scripts.`,
    );
    if (detectedText === undefined) {
      // Already reported above; nothing to compare for this script.
      continue;
    }

    console.assert(
      JSON.stringify(sortedTokens(detectedText)) ===
        JSON.stringify(sortedTokens(expectedText)),
      `Error: Tokens for key '${key}' do not match.`,
    );
  }
}