diff options
Diffstat (limited to 'packages/lang/src/unicode/index.ts')
| -rw-r--r-- | packages/lang/src/unicode/index.ts | 1424 |
1 files changed, 1424 insertions, 0 deletions
// Author: Amir Hossein Kargaran (TypeScript port)
// Date: March, 2025

import type { ISO_15924_CODE } from "../iso";

// Description: This code detects/separates the script(s) (writing system(s)) of the given text.
// TypeScript port of the original Python implementation.

// Types

/** Inclusive `[start, end]` pair of Unicode code points. */
type ScriptRange = [number, number];

/** Script code -> list of inclusive code point ranges assigned to that script. */
type ScriptRanges = Record<string, ScriptRange[]>;

/** `[winning script (null when nothing is left to score), its score, details]`. */
type ScoredScript = [ISO_15924_CODE | null, number, ScriptDetails];

interface ScriptDetails {
  /** Share of scored characters per script, highest first; null when nothing was scored. */
  details: Record<ISO_15924_CODE, number> | null;
  /** True when the two best scripts have exactly the same score; null when nothing was scored. */
  tie: boolean | null;
  /** Best score minus second-best score (1 when only one script occurs); null when nothing was scored. */
  interval: number | null;
}

/**
 * Unicode code point ranges per script, keyed by four-letter script code.
 * Both ends of every range are inclusive. The trailing comment on each entry
 * gives the script's long name.
 */
export const SCRIPT_RANGES: ScriptRanges = {
  Latn: [
    [65, 90],
    [97, 122],
    [170, 170],
    [186, 186],
    [192, 214],
    [216, 246],
    [248, 696],
    [736, 740],
    [7424, 7461],
    [7468, 7516],
    [7522, 7525],
    [7531, 7543],
    [7545, 7614],
    [7680, 7935],
    [8305, 8305],
    [8319, 8319],
    [8336, 8348],
    [8490, 8491],
    [8498, 8498],
    [8526, 8526],
    [8544, 8584],
    [11360, 11391],
    [42786, 42887],
    [42891, 42954],
    [42960, 42961],
    [42963, 42963],
    [42965, 42969],
    [42994, 43007],
    [43824, 43866],
    [43868, 43876],
    [43878, 43881],
    [64256, 64262],
    [65313, 65338],
    [65345, 65370],
    [67456, 67461],
    [67463, 67504],
    [67506, 67514],
    [122624, 122654],
    [122661, 122666],
  ], // Latin
  Bopo: [
    [746, 747],
    [12549, 12591],
    [12704, 12735],
  ], // Bopomofo
  Zinh: [
    [768, 879],
    [1157, 1158],
    [2385, 2388],
    [6832, 6862],
    [7376, 7378],
    [7380, 7392],
    [7394, 7400],
    [7405, 7405],
    [7412, 7412],
    [7416, 7417],
    [7616, 7679],
    [8204, 8205],
    [8400, 8432],
    [12330, 12333],
    [12441, 12442],
    [65024, 65039],
    [65056, 65069],
    [66045, 66045],
    [66272, 66272],
    [70459, 70459],
    [118528, 118573],
    [118576, 118598],
    [119143, 119145],
    [119163, 119170],
    [119173, 119179],
    [119210, 119213],
    [917760, 917999],
  ], // Inherited
  Grek: [
    [880, 883],
    [885, 887],
    [890, 893],
    [895, 895],
    [900, 900],
    [902, 902],
    [904, 906],
    [908, 908],
    [910, 929],
    [931, 993],
    [1008, 1023],
    [7462, 7466],
    [7517, 7521],
    [7526, 7530],
    [7615, 7615],
    [7936, 7957],
    [7960, 7965],
    [7968, 8005],
    [8008, 8013],
    [8016, 8023],
    [8025, 8025],
    [8027, 8027],
    [8029, 8029],
    [8031, 8061],
    [8064, 8116],
    [8118, 8132],
    [8134, 8147],
    [8150, 8155],
    [8157, 8175],
    [8178, 8180],
    [8182, 8190],
    [8486, 8486],
    [43877, 43877],
    [65856, 65934],
    [65952, 65952],
    [119296, 119365],
  ], // Greek
  Copt: [
    [994, 1007],
    [11392, 11507],
    [11513, 11519],
  ], // Coptic
  Cyrl: [
    [1024, 1156],
    [1159, 1327],
    [7296, 7304],
    [7467, 7467],
    [7544, 7544],
    [11744, 11775],
    [42560, 42655],
    [65070, 65071],
    [122928, 122989],
    [123023, 123023],
  ], // Cyrillic
  Armn: [
    [1329, 1366],
    [1369, 1418],
    [1421, 1423],
    [64275, 64279],
  ], // Armenian
  Hebr: [
    [1425, 1479],
    [1488, 1514],
    [1519, 1524],
    [64285, 64310],
    [64312, 64316],
    [64318, 64318],
    [64320, 64321],
    [64323, 64324],
    [64326, 64335],
  ], // Hebrew
  Arab: [
    [1536, 1540],
    [1542, 1547],
    [1549, 1562],
    [1564, 1566],
    [1568, 1599],
    [1601, 1648],
    [1649, 1756],
    [1758, 1791],
    [1872, 1919],
    [2160, 2190],
    [2192, 2193],
    [2200, 2273],
    [2275, 2303],
    [64336, 64450],
    [64467, 64829],
    [64832, 64911],
    [64914, 64967],
    [64975, 64975],
    [65008, 65023],
    [65136, 65140],
    [65142, 65276],
    [69216, 69246],
    [69373, 69375],
    [126464, 126467],
    [126469, 126495],
    [126497, 126498],
    [126500, 126500],
    [126503, 126503],
    [126505, 126514],
    [126516, 126519],
    [126521, 126521],
    [126523, 126523],
    [126530, 126530],
    [126535, 126535],
    [126537, 126537],
    [126539, 126539],
    [126541, 126543],
    [126545, 126546],
    [126548, 126548],
    [126551, 126551],
    [126553, 126553],
    [126555, 126555],
    [126557, 126557],
    [126559, 126559],
    [126561, 126562],
    [126564, 126564],
    [126567, 126570],
    [126572, 126578],
    [126580, 126583],
    [126585, 126588],
    [126590, 126590],
    [126592, 126601],
    [126603, 126619],
    [126625, 126627],
    [126629, 126633],
    [126635, 126651],
    [126704, 126705],
  ], // Arabic
  Syrc: [
    [1792, 1805],
    [1807, 1866],
    [1869, 1871],
    [2144, 2154],
  ], // Syriac
  Thaa: [[1920, 1969]], // Thaana
  Nkoo: [
    [1984, 2042],
    [2045, 2047],
  ], // Nko
  Samr: [
    [2048, 2093],
    [2096, 2110],
  ], // Samaritan
  Mand: [
    [2112, 2139],
    [2142, 2142],
  ], // Mandaic
  Deva: [
    [2304, 2384],
    [2389, 2403],
    [2406, 2431],
    [43232, 43263],
    [72448, 72457],
  ], // Devanagari
  Beng: [
    [2432, 2435],
    [2437, 2444],
    [2447, 2448],
    [2451, 2472],
    [2474, 2480],
    [2482, 2482],
    [2486, 2489],
    [2492, 2500],
    [2503, 2504],
    [2507, 2510],
    [2519, 2519],
    [2524, 2525],
    [2527, 2531],
    [2534, 2558],
  ], // Bengali
  Guru: [
    [2561, 2563],
    [2565, 2570],
    [2575, 2576],
    [2579, 2600],
    [2602, 2608],
    [2610, 2611],
    [2613, 2614],
    [2616, 2617],
    [2620, 2620],
    [2622, 2626],
    [2631, 2632],
    [2635, 2637],
    [2641, 2641],
    [2649, 2652],
    [2654, 2654],
    [2662, 2678],
  ], // Gurmukhi
  Gujr: [
    [2689, 2691],
    [2693, 2701],
    [2703, 2705],
    [2707, 2728],
    [2730, 2736],
    [2738, 2739],
    [2741, 2745],
    [2748, 2757],
    [2759, 2761],
    [2763, 2765],
    [2768, 2768],
    [2784, 2787],
    [2790, 2801],
    [2809, 2815],
  ], // Gujarati
  Orya: [
    [2817, 2819],
    [2821, 2828],
    [2831, 2832],
    [2835, 2856],
    [2858, 2864],
    [2866, 2867],
    [2869, 2873],
    [2876, 2884],
    [2887, 2888],
    [2891, 2893],
    [2901, 2903],
    [2908, 2909],
    [2911, 2915],
    [2918, 2935],
  ], // Oriya
  Taml: [
    [2946, 2947],
    [2949, 2954],
    [2958, 2960],
    [2962, 2965],
    [2969, 2970],
    [2972, 2972],
    [2974, 2975],
    [2979, 2980],
    [2984, 2986],
    [2990, 3001],
    [3006, 3010],
    [3014, 3016],
    [3018, 3021],
    [3024, 3024],
    [3031, 3031],
    [3046, 3066],
    [73664, 73713],
    [73727, 73727],
  ], // Tamil
  Telu: [
    [3072, 3084],
    [3086, 3088],
    [3090, 3112],
    [3114, 3129],
    [3132, 3140],
    [3142, 3144],
    [3146, 3149],
    [3157, 3158],
    [3160, 3162],
    [3165, 3165],
    [3168, 3171],
    [3174, 3183],
    [3191, 3199],
  ], // Telugu
  Knda: [
    [3200, 3212],
    [3214, 3216],
    [3218, 3240],
    [3242, 3251],
    [3253, 3257],
    [3260, 3268],
    [3270, 3272],
    [3274, 3277],
    [3285, 3286],
    [3293, 3294],
    [3296, 3299],
    [3302, 3311],
    [3313, 3315],
  ], // Kannada
  Mlym: [
    [3328, 3340],
    [3342, 3344],
    [3346, 3396],
    [3398, 3400],
    [3402, 3407],
    [3412, 3427],
    [3430, 3455],
  ], // Malayalam
  Sinh: [
    [3457, 3459],
    [3461, 3478],
    [3482, 3505],
    [3507, 3515],
    [3517, 3517],
    [3520, 3526],
    [3530, 3530],
    [3535, 3540],
    [3542, 3542],
    [3544, 3551],
    [3558, 3567],
    [3570, 3572],
    [70113, 70132],
  ], // Sinhala
  Thai: [
    [3585, 3642],
    [3648, 3675],
  ], // Thai
  Laoo: [
    [3713, 3714],
    [3716, 3716],
    [3718, 3722],
    [3724, 3747],
    [3749, 3749],
    [3751, 3773],
    [3776, 3780],
    [3782, 3782],
    [3784, 3790],
    [3792, 3801],
    [3804, 3807],
  ], // Lao
  Tibt: [
    [3840, 3911],
    [3913, 3948],
    [3953, 3991],
    [3993, 4028],
    [4030, 4044],
    [4046, 4052],
    [4057, 4058],
  ], // Tibetan
  Mymr: [
    [4096, 4255],
    [43488, 43518],
    [43616, 43647],
  ], // Myanmar
  Geor: [
    [4256, 4293],
    [4295, 4295],
    [4301, 4301],
    [4304, 4346],
    [4348, 4351],
    [7312, 7354],
    [7357, 7359],
    [11520, 11557],
    [11559, 11559],
    [11565, 11565],
  ], // Georgian
  Hang: [
    [4352, 4607],
    [12334, 12335],
    [12593, 12686],
    [12800, 12830],
    [12896, 12926],
    [43360, 43388],
    [44032, 55203],
    [55216, 55238],
    [55243, 55291],
    [65440, 65470],
    [65474, 65479],
    [65482, 65487],
    [65490, 65495],
    [65498, 65500],
  ], // Hangul
  Ethi: [
    [4608, 4680],
    [4682, 4685],
    [4688, 4694],
    [4696, 4696],
    [4698, 4701],
    [4704, 4744],
    [4746, 4749],
    [4752, 4784],
    [4786, 4789],
    [4792, 4798],
    [4800, 4800],
    [4802, 4805],
    [4808, 4822],
    [4824, 4880],
    [4882, 4885],
    [4888, 4954],
    [4957, 4988],
    [4992, 5017],
    [11648, 11670],
    [11680, 11686],
    [11688, 11694],
    [11696, 11702],
    [11704, 11710],
    [11712, 11718],
    [11720, 11726],
    [11728, 11734],
    [11736, 11742],
    [43777, 43782],
    [43785, 43790],
    [43793, 43798],
    [43808, 43814],
    [43816, 43822],
    [124896, 124902],
    [124904, 124907],
    [124909, 124910],
    [124912, 124926],
  ], // Ethiopic
  Cher: [
    [5024, 5109],
    [5112, 5117],
    [43888, 43967],
  ], // Cherokee
  Cans: [
    [5120, 5759],
    [6320, 6389],
    [72368, 72383],
  ], // Canadian_Aboriginal
  Ogam: [[5760, 5788]], // Ogham
  Runr: [
    [5792, 5866],
    [5870, 5880],
  ], // Runic
  Tglg: [
    [5888, 5909],
    [5919, 5919],
  ], // Tagalog
  Hano: [[5920, 5940]], // Hanunoo
  Buhd: [[5952, 5971]], // Buhid
  Tagb: [
    [5984, 5996],
    [5998, 6000],
    [6002, 6003],
  ], // Tagbanwa
  Khmr: [
    [6016, 6109],
    [6112, 6121],
    [6128, 6137],
    [6624, 6655],
  ], // Khmer
  Mong: [
    [6144, 6145],
    [6148, 6148],
    [6150, 6169],
    [6176, 6264],
    [6272, 6314],
    [71264, 71276],
  ], // Mongolian
  Limb: [
    [6400, 6430],
    [6432, 6443],
    [6448, 6459],
    [6464, 6464],
    [6468, 6479],
  ], // Limbu
  Tale: [
    [6480, 6509],
    [6512, 6516],
  ], // Tai_Le
  Talu: [
    [6528, 6571],
    [6576, 6601],
    [6608, 6618],
    [6622, 6623],
  ], // New_Tai_Lue
  Bugi: [
    [6656, 6683],
    [6686, 6687],
  ], // Buginese
  Lana: [
    [6688, 6750],
    [6752, 6780],
    [6783, 6793],
    [6800, 6809],
    [6816, 6829],
  ], // Tai_Tham
  Bali: [
    [6912, 6988],
    [6992, 7038],
  ], // Balinese
  Sund: [
    [7040, 7103],
    [7360, 7367],
  ], // Sundanese
  Batk: [
    [7104, 7155],
    [7164, 7167],
  ], // Batak
  Lepc: [
    [7168, 7223],
    [7227, 7241],
    [7245, 7247],
  ], // Lepcha
  Olck: [[7248, 7295]], // Ol_Chiki
  Brai: [[10240, 10495]], // Braille
  Glag: [
    [11264, 11359],
    [122880, 122886],
    [122888, 122904],
    [122907, 122913],
    [122915, 122916],
    [122918, 122922],
  ], // Glagolitic
  Tfng: [
    [11568, 11623],
    [11631, 11632],
    [11647, 11647],
  ], // Tifinagh
  Hani: [
    [11904, 11929],
    [11931, 12019],
    [12032, 12245],
    [12293, 12293],
    [12295, 12295],
    [12321, 12329],
    [12344, 12347],
    [13312, 19903],
    [19968, 40959],
    [63744, 64109],
    [64112, 64217],
    [94178, 94179],
    [94192, 94193],
    [131072, 173791],
    [173824, 177977],
    [177984, 178205],
    [178208, 183969],
    [183984, 191456],
    [194560, 195101],
    [196608, 201546],
    [201552, 205743],
  ], // Han
  Hira: [
    [12353, 12438],
    [12445, 12447],
    [110593, 110879],
    [110898, 110898],
    [110928, 110930],
    [127488, 127488],
  ], // Hiragana
  Kana: [
    [12449, 12538],
    [12541, 12543],
    [12784, 12799],
    [13008, 13054],
    [13056, 13143],
    [65382, 65391],
    [65393, 65437],
    [110576, 110579],
    [110581, 110587],
    [110589, 110590],
    [110592, 110592],
    [110880, 110882],
    [110933, 110933],
    [110948, 110951],
  ], // Katakana
  Yiii: [
    [40960, 42124],
    [42128, 42182],
  ], // Yi
  Lisu: [
    [42192, 42239],
    [73648, 73648],
  ], // Lisu
  Vaii: [[42240, 42539]], // Vai
  Bamu: [
    [42656, 42743],
    [92160, 92728],
  ], // Bamum
  Sylo: [[43008, 43052]], // Syloti_Nagri
  Phag: [[43072, 43127]], // Phags_Pa
  Saur: [
    [43136, 43205],
    [43214, 43225],
  ], // Saurashtra
  Kali: [
    [43264, 43309],
    [43311, 43311],
  ], // Kayah_Li
  Rjng: [
    [43312, 43347],
    [43359, 43359],
  ], // Rejang
  Java: [
    [43392, 43469],
    [43472, 43481],
    [43486, 43487],
  ], // Javanese
  Cham: [
    [43520, 43574],
    [43584, 43597],
    [43600, 43609],
    [43612, 43615],
  ], // Cham
  Tavt: [
    [43648, 43714],
    [43739, 43743],
  ], // Tai_Viet
  Mtei: [
    [43744, 43766],
    [43968, 44013],
    [44016, 44025],
  ], // Meetei_Mayek
  Linb: [
    [65536, 65547],
    [65549, 65574],
    [65576, 65594],
    [65596, 65597],
    [65599, 65613],
    [65616, 65629],
    [65664, 65786],
  ], // Linear_B
  Lyci: [[66176, 66204]], // Lycian
  Cari: [[66208, 66256]], // Carian
  Ital: [
    [66304, 66339],
    [66349, 66351],
  ], // Old_Italic
  Goth: [[66352, 66378]], // Gothic
  Perm: [[66384, 66426]], // Old_Permic
  Ugar: [
    [66432, 66461],
    [66463, 66463],
  ], // Ugaritic
  Xpeo: [
    [66464, 66499],
    [66504, 66517],
  ], // Old_Persian
  Dsrt: [[66560, 66639]], // Deseret
  Shaw: [[66640, 66687]], // Shavian
  Osma: [
    [66688, 66717],
    [66720, 66729],
  ], // Osmanya
  Osge: [
    [66736, 66771],
    [66776, 66811],
  ], // Osage
  Elba: [[66816, 66855]], // Elbasan
  Aghb: [
    [66864, 66915],
    [66927, 66927],
  ], // Caucasian_Albanian
  Vith: [
    [66928, 66938],
    [66940, 66954],
    [66956, 66962],
    [66964, 66965],
    [66967, 66977],
    [66979, 66993],
    [66995, 67001],
    [67003, 67004],
  ], // Vithkuqi
  Lina: [
    [67072, 67382],
    [67392, 67413],
    [67424, 67431],
  ], // Linear_A
  Cprt: [
    [67584, 67589],
    [67592, 67592],
    [67594, 67637],
    [67639, 67640],
    [67644, 67644],
    [67647, 67647],
  ], // Cypriot
  Armi: [
    [67648, 67669],
    [67671, 67679],
  ], // Imperial_Aramaic
  Palm: [[67680, 67711]], // Palmyrene
  Nbat: [
    [67712, 67742],
    [67751, 67759],
  ], // Nabataean
  Hatr: [
    [67808, 67826],
    [67828, 67829],
    [67835, 67839],
  ], // Hatran
  Phnx: [
    [67840, 67867],
    [67871, 67871],
  ], // Phoenician
  Lydi: [
    [67872, 67897],
    [67903, 67903],
  ], // Lydian
  Mero: [[67968, 67999]], // Meroitic_Hieroglyphs
  Merc: [
    [68000, 68023],
    [68028, 68047],
    [68050, 68095],
  ], // Meroitic_Cursive
  Khar: [
    [68096, 68099],
    [68101, 68102],
    [68108, 68115],
    [68117, 68119],
    [68121, 68149],
    [68152, 68154],
    [68159, 68168],
    [68176, 68184],
  ], // Kharoshthi
  Sarb: [[68192, 68223]], // Old_South_Arabian
  Narb: [[68224, 68255]], // Old_North_Arabian
  Mani: [
    [68288, 68326],
    [68331, 68342],
  ], // Manichaean
  Avst: [
    [68352, 68405],
    [68409, 68415],
  ], // Avestan
  Prti: [
    [68416, 68437],
    [68440, 68447],
  ], // Inscriptional_Parthian
  Phli: [
    [68448, 68466],
    [68472, 68479],
  ], // Inscriptional_Pahlavi
  Phlp: [
    [68480, 68497],
    [68505, 68508],
    [68521, 68527],
  ], // Psalter_Pahlavi
  Orkh: [[68608, 68680]], // Old_Turkic
  Hung: [
    [68736, 68786],
    [68800, 68850],
    [68858, 68863],
  ], // Old_Hungarian
  Rohg: [
    [68864, 68903],
    [68912, 68921],
  ], // Hanifi_Rohingya
  Yezi: [
    [69248, 69289],
    [69291, 69293],
    [69296, 69297],
  ], // Yezidi
  Sogo: [[69376, 69415]], // Old_Sogdian
  Sogd: [[69424, 69465]], // Sogdian
  Ougr: [[69488, 69513]], // Old_Uyghur
  Chrs: [[69552, 69579]], // Chorasmian
  Elym: [[69600, 69622]], // Elymaic
  Brah: [
    [69632, 69709],
    [69714, 69749],
    [69759, 69759],
  ], // Brahmi
  Kthi: [
    [69760, 69826],
    [69837, 69837],
  ], // Kaithi
  Sora: [
    [69840, 69864],
    [69872, 69881],
  ], // Sora_Sompeng
  Cakm: [
    [69888, 69940],
    [69942, 69959],
  ], // Chakma
  Mahj: [[69968, 70006]], // Mahajani
  Shrd: [[70016, 70111]], // Sharada
  Khoj: [
    [70144, 70161],
    [70163, 70209],
  ], // Khojki
  Mult: [
    [70272, 70278],
    [70280, 70280],
    [70282, 70285],
    [70287, 70301],
    [70303, 70313],
  ], // Multani
  Sind: [
    [70320, 70378],
    [70384, 70393],
  ], // Khudawadi
  Gran: [
    [70400, 70403],
    [70405, 70412],
    [70415, 70416],
    [70419, 70440],
    [70442, 70448],
    [70450, 70451],
    [70453, 70457],
    [70460, 70468],
    [70471, 70472],
    [70475, 70477],
    [70480, 70480],
    [70487, 70487],
    [70493, 70499],
    [70502, 70508],
    [70512, 70516],
  ], // Grantha
  Newa: [
    [70656, 70747],
    [70749, 70753],
  ], // Newa
  Tirh: [
    [70784, 70855],
    [70864, 70873],
  ], // Tirhuta
  Sidd: [
    [71040, 71093],
    [71096, 71133],
  ], // Siddham
  Modi: [
    [71168, 71236],
    [71248, 71257],
  ], // Modi
  Takr: [
    [71296, 71353],
    [71360, 71369],
  ], // Takri
  Ahom: [
    [71424, 71450],
    [71453, 71467],
    [71472, 71494],
  ], // Ahom
  Dogr: [[71680, 71739]], // Dogra
  Wara: [
    [71840, 71922],
    [71935, 71935],
  ], // Warang_Citi
  Diak: [
    [71936, 71942],
    [71945, 71945],
    [71948, 71955],
    [71957, 71958],
    [71960, 71989],
    [71991, 71992],
    [71995, 72006],
    [72016, 72025],
  ], // Dives_Akuru
  Nand: [
    [72096, 72103],
    [72106, 72151],
    [72154, 72164],
  ], // Nandinagari
  Zanb: [[72192, 72263]], // Zanabazar_Square
  Soyo: [[72272, 72354]], // Soyombo
  Pauc: [[72384, 72440]], // Pau_Cin_Hau
  Bhks: [
    [72704, 72712],
    [72714, 72758],
    [72760, 72773],
    [72784, 72812],
  ], // Bhaiksuki
  Marc: [
    [72816, 72847],
    [72850, 72871],
    [72873, 72886],
  ], // Marchen
  Gonm: [
    [72960, 72966],
    [72968, 72969],
    [72971, 73014],
    [73018, 73018],
    [73020, 73021],
    [73023, 73031],
    [73040, 73049],
  ], // Masaram_Gondi
  Gong: [
    [73056, 73061],
    [73063, 73064],
    [73066, 73102],
    [73104, 73105],
    [73107, 73112],
    [73120, 73129],
  ], // Gunjala_Gondi
  Maka: [[73440, 73464]], // Makasar
  Kawi: [
    [73472, 73488],
    [73490, 73530],
    [73534, 73561],
  ], // Kawi
  Xsux: [
    [73728, 74649],
    [74752, 74862],
    [74864, 74868],
    [74880, 75075],
  ], // Cuneiform
  Cpmn: [[77712, 77810]], // Cypro_Minoan
  Egyp: [[77824, 78933]], // Egyptian_Hieroglyphs
  Hluw: [[82944, 83526]], // Anatolian_Hieroglyphs
  Mroo: [
    [92736, 92766],
    [92768, 92777],
    [92782, 92783],
  ], // Mro
  Tnsa: [
    [92784, 92862],
    [92864, 92873],
  ], // Tangsa
  Bass: [
    [92880, 92909],
    [92912, 92917],
  ], // Bassa_Vah
  Hmng: [
    [92928, 92997],
    [93008, 93017],
    [93019, 93025],
    [93027, 93047],
    [93053, 93071],
  ], // Pahawh_Hmong
  Medf: [[93760, 93850]], // Medefaidrin
  Plrd: [
    [93952, 94026],
    [94031, 94087],
    [94095, 94111],
  ], // Miao
  Tang: [
    [94176, 94176],
    [94208, 100343],
    [100352, 101119],
    [101632, 101640],
  ], // Tangut
  Nshu: [
    [94177, 94177],
    [110960, 111355],
  ], // Nushu
  Kits: [
    [94180, 94180],
    [101120, 101589],
  ], // Khitan_Small_Script
  Dupl: [
    [113664, 113770],
    [113776, 113788],
    [113792, 113800],
    [113808, 113817],
    [113820, 113823],
  ], // Duployan
  Sgnw: [
    [120832, 121483],
    [121499, 121503],
    [121505, 121519],
  ], // SignWriting
  Hmnp: [
    [123136, 123180],
    [123184, 123197],
    [123200, 123209],
    [123214, 123215],
  ], // Nyiakeng_Puachue_Hmong
  Toto: [[123536, 123566]], // Toto
  Wcho: [
    [123584, 123641],
    [123647, 123647],
  ], // Wancho
  Nagm: [[124112, 124153]], // Nag_Mundari
  Mend: [
    [124928, 125124],
    [125127, 125142],
  ], // Mende_Kikakui
  Adlm: [
    [125184, 125259],
    [125264, 125273],
    [125278, 125279],
  ], // Adlam
  Zyyy: [
    [0, 64],
    [91, 96],
    [123, 169],
    [171, 185],
    [187, 191],
    [215, 215],
    [247, 247],
    [697, 735],
    [741, 745],
    [748, 767],
    [884, 884],
    [894, 894],
    [901, 901],
    [903, 903],
    [1541, 1541],
    [1548, 1548],
    [1563, 1563],
    [1567, 1567],
    [1600, 1600],
    [1757, 1757],
    [2274, 2274],
    [2404, 2405],
    [3647, 3647],
    [4053, 4056],
    [4347, 4347],
    [5867, 5869],
    [5941, 5942],
    [6146, 6147],
    [6149, 6149],
    [7379, 7379],
    [7393, 7393],
    [7401, 7404],
    [7406, 7411],
    [7413, 7415],
    [7418, 7418],
    [8192, 8203],
    [8206, 8292],
    [8294, 8304],
    [8308, 8318],
    [8320, 8334],
    [8352, 8384],
    [8448, 8485],
    [8487, 8489],
    [8492, 8497],
    [8499, 8525],
    [8527, 8543],
    [8585, 8587],
    [8592, 9254],
    [9280, 9290],
    [9312, 10239],
    [10496, 11123],
    [11126, 11157],
    [11159, 11263],
    [11776, 11869],
    [12272, 12283],
    [12288, 12292],
    [12294, 12294],
    [12296, 12320],
    [12336, 12343],
    [12348, 12351],
    [12443, 12444],
    [12448, 12448],
    [12539, 12540],
    [12688, 12703],
    [12736, 12771],
    [12832, 12895],
    [12927, 13007],
    [13055, 13055],
    [13144, 13311],
    [19904, 19967],
    [42752, 42785],
    [42888, 42890],
    [43056, 43065],
    [43310, 43310],
    [43471, 43471],
    [43867, 43867],
    [43882, 43883],
    [64830, 64831],
    [65040, 65049],
    [65072, 65106],
    [65108, 65126],
    [65128, 65131],
    [65279, 65279],
    [65281, 65312],
    [65339, 65344],
    [65371, 65381],
    [65392, 65392],
    [65438, 65439],
    [65504, 65510],
    [65512, 65518],
    [65529, 65532],
    [65792, 65794],
    [65799, 65843],
    [65847, 65855],
    [65936, 65948],
    [66000, 66044],
    [66273, 66299],
    [113824, 113827],
    [118608, 118723],
    [118784, 119029],
    [119040, 119078],
    [119081, 119142],
    [119146, 119162],
    [119171, 119172],
    [119180, 119209],
    [119214, 119274],
    [119488, 119507],
    [119520, 119539],
    [119552, 119638],
    [119648, 119672],
    [119808, 119892],
    [119894, 119964],
    [119966, 119967],
    [119970, 119970],
    [119973, 119974],
    [119977, 119980],
    [119982, 119993],
    [119995, 119995],
    [119997, 120003],
    [120005, 120069],
    [120071, 120074],
    [120077, 120084],
    [120086, 120092],
    [120094, 120121],
    [120123, 120126],
    [120128, 120132],
    [120134, 120134],
    [120138, 120144],
    [120146, 120485],
    [120488, 120779],
    [120782, 120831],
    [126065, 126132],
    [126209, 126269],
    [126976, 127019],
    [127024, 127123],
    [127136, 127150],
    [127153, 127167],
    [127169, 127183],
    [127185, 127221],
    [127232, 127405],
    [127462, 127487],
    [127489, 127490],
    [127504, 127547],
    [127552, 127560],
    [127568, 127569],
    [127584, 127589],
    [127744, 128727],
    [128732, 128748],
    [128752, 128764],
    [128768, 128886],
    [128891, 128985],
    [128992, 129003],
    [129008, 129008],
    [129024, 129035],
    [129040, 129095],
    [129104, 129113],
    [129120, 129159],
    [129168, 129197],
    [129200, 129201],
    [129280, 129619],
    [129632, 129645],
    [129648, 129660],
    [129664, 129672],
    [129680, 129725],
    [129727, 129733],
    [129742, 129755],
    [129760, 129768],
    [129776, 129784],
    [129792, 129938],
    [129940, 129994],
    [130032, 130041],
    [917505, 917505],
    [917536, 917631],
  ], // Common
  Zzzz: [[65533, 65533]],
};

/**
 * Builds a function that reports the dominant script of a piece of text.
 *
 * The returned predictor drops the characters selected by the three flags,
 * counts the remaining characters per script using the ranges in
 * `SCRIPT_RANGES` (snapshotted when this factory is called), and returns
 * `[script, score, details]`, where `score` is the winning script's share of
 * the scored characters. Characters that match no range are counted as
 * `"Zzzz"`. When nothing is left after filtering, the predictor returns
 * `[null, 0, { details: null, tie: null, interval: null }]`.
 *
 * All counting is done per Unicode code point, so characters outside the
 * Basic Multilingual Plane (stored as surrogate pairs) count once.
 *
 * @param replaceWhitespace - Ignore characters matching `\s` when scoring.
 * @param replacePunctuation - Ignore ASCII punctuation when scoring.
 * @param replaceDigits - Ignore characters matching `\d` when scoring.
 * @returns A predictor mapping a string to its scored script.
 */
export function getScriptPredictor(
  replaceWhitespace: boolean = true,
  replacePunctuation: boolean = true,
  replaceDigits: boolean = true,
): (sent: string) => ScoredScript {
  // Flatten the table once. Script membership is resolved lazily per distinct
  // code point and memoized, instead of eagerly materializing one Set for
  // every code point covered by the table.
  const flatRanges: [number, number, string][] = [];
  for (const [script, ranges] of Object.entries(SCRIPT_RANGES)) {
    for (const [start, end] of ranges) {
      flatRanges.push([start, end, script]);
    }
  }

  const scriptsByCodePoint = new Map<number, string[]>();

  // Returns every script whose ranges contain the code point, in table order,
  // or ["Zzzz"] when no range matches.
  const scriptsOf = (codePoint: number): string[] => {
    let scripts = scriptsByCodePoint.get(codePoint);
    if (scripts === undefined) {
      const matches = new Set<string>();
      for (const [start, end, script] of flatRanges) {
        if (start <= codePoint && codePoint <= end) {
          matches.add(script);
        }
      }
      scripts = matches.size > 0 ? [...matches] : ["Zzzz"];
      scriptsByCodePoint.set(codePoint, scripts);
    }
    return scripts;
  };

  // Helper function to check if a character is whitespace
  const isWhitespace = (char: string): boolean => /\s/.test(char);

  // Helper function to check if a character is punctuation
  const isPunctuation = (char: string): boolean =>
    /[!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~]/.test(char);

  // Helper function to check if a character is a digit
  const isDigit = (char: string): boolean => /\d/.test(char);

  return (sent: string): ScoredScript => {
    // Filter out characters based on settings. Iterating the string (rather
    // than split("")) yields whole code points, so surrogate pairs are never
    // torn apart and the score denominator matches what is counted below.
    const kept: string[] = [];
    for (const char of sent) {
      if (replaceWhitespace && isWhitespace(char)) continue;
      if (replacePunctuation && isPunctuation(char)) continue;
      if (replaceDigits && isDigit(char)) continue;
      kept.push(char);
    }

    const total = kept.length;
    if (total === 0) {
      return [null, 0, { details: null, tie: null, interval: null }];
    }

    // Count characters by script
    const scriptCount: Map<string, number> = new Map();
    for (const char of kept) {
      // `char` comes from string iteration and is never empty.
      const codePoint = char.codePointAt(0)!;
      for (const script of scriptsOf(codePoint)) {
        scriptCount.set(script, (scriptCount.get(script) || 0) + 1);
      }
    }

    // Rank scripts by share of scored characters, highest first. The sort is
    // stable, so scripts with equal scores keep first-seen order.
    const ranked: [string, number][] = [];
    for (const [script, count] of scriptCount) {
      ranked.push([script, count / total]);
    }
    ranked.sort((a, b) => b[1] - a[1]);

    const sortedDetails: Record<string, number> = Object.fromEntries(ranked);
    const [maxScript, maxScore] = ranked[0];

    // With a single script there is no runner-up: report a full interval.
    const interval = ranked.length > 1 ? maxScore - ranked[1][1] : 1;

    return [
      maxScript,
      maxScore,
      {
        details: sortedDetails,
        tie: interval === 0,
        interval: interval,
      },
    ];
  };
}

/**
 * Splits a string into one substring per script.
 *
 * Each character is appended to the entry of every script in `SCRIPT_RANGES`
 * whose ranges contain its code point. A plain space (`" "`) is appended to
 * every script's entry, so words of the same script stay separated. Entries
 * that end up empty or whitespace-only are dropped. Characters that match no
 * range are discarded.
 *
 * @param sent - Text to split.
 * @returns Map from script code to the characters of `sent` in that script
 *   (in original order, interleaved with the spaces of `sent`).
 */
export function separateScript(sent: string): Record<string, string> {
  const result: Record<string, string[]> = {};
  const scriptEntries = Object.entries(SCRIPT_RANGES);

  for (const char of sent) {
    // `char` comes from string iteration and is never empty.
    const codePoint = char.codePointAt(0)!;

    for (const [script, ranges] of scriptEntries) {
      // A space matches on the first range of every script, which is how it
      // ends up in every script's bucket.
      const belongs = ranges.some(
        ([start, end]) =>
          (start <= codePoint && codePoint <= end) || char === " ",
      );
      if (!belongs) continue;

      if (!result[script]) {
        result[script] = [];
      }
      result[script].push(char);
    }
  }

  // Filter out empty values and spaces, convert arrays to strings
  const filtered: Record<string, string> = {};
  for (const [key, value] of Object.entries(result)) {
    const joined = value.join("");
    if (joined && joined.trim()) {
      filtered[key] = joined;
    }
  }

  return filtered;
}

// Test functions

/**
 * Smoke test for the default predictor. Failures are reported through
 * `console.assert`, which logs but does not throw.
 */
export function testPredictScript(): void {
  const predictor = getScriptPredictor();

  const tests: [string, [string | null, number]][] = [
    ["this is a latin script.", ["Latn", 1.0]],
    ["isso é escrita latina 1234", ["Latn", 1.0]],
    ["এটি বাংলা লিপি", ["Beng", 1.0]],
    ["นี่คืออักษรไทย", ["Thai", 1.0]],
    [
      "자미로콰이 Jamiroquai는 영국의 애시드 재즈 밴드이다 자미로콰이는 년대 초반 런던에서 활발하게 일어난 애시드 재즈",
      ["Hang", 0.8148148148148148],
    ],
    ["이어지는기사 에서그점 에관해알려줄것 입니다", ["Hang", 1.0]],
    ["12345", [null, 0]],
    [" ", [null, 0]],
    ["", [null, 0]],
  ];

  for (const [input, expected] of tests) {
    const result = predictor(input);
    console.assert(
      result[0] === expected[0] && Math.abs(result[1] - expected[1]) < 0.0001,
      `Test failed for "${input}"\nExpected: ${expected}\nGot: [${result[0]}, ${result[1]}]`,
    );
  }
}

/**
 * Smoke test for `separateScript`: compares the space-separated tokens found
 * per script with the expected ones, ignoring token order. Failures are
 * reported through `console.assert`, which logs but does not throw.
 */
export function testSeparateScript(): void {
  const sent = "Hello Salut سلام 你好 こんにちは שלום مرحبا";
  const detected = separateScript(sent);

  const groundTruth: Record<string, string> = {
    Latn: "Hello Salut ",
    Hebr: " שלום ",
    Arab: " سلام مرحبا",
    Hani: " 你好 ",
    Hira: " こんにちは ",
  };

  // Splits on spaces, drops empty pieces and sorts so order does not matter.
  const toSortedTokens = (text: string): string[] =>
    text
      .split(" ")
      .map((x) => x.trim())
      .filter((x) => x.length > 0)
      .sort();

  for (const [key, expectedText] of Object.entries(groundTruth)) {
    const detectedText = detected[key];

    console.assert(
      detectedText !== undefined,
      `Error: '${key}' script not found in detected scripts.`,
    );
    // console.assert only logs, so skip the token comparison explicitly
    // instead of crashing on an undefined entry.
    if (detectedText === undefined) continue;

    console.assert(
      JSON.stringify(toSortedTokens(detectedText)) ===
        JSON.stringify(toSortedTokens(expectedText)),
      `Error: Tokens for key '${key}' do not match.`,
    );
  }
}
