// Author: Amir Hossein Kargaran (TypeScript port)
// Date: March, 2025

import type { ISO_15924_CODE } from "../iso";

// Description: This code detects/separates the script(s) (writing system(s)) of the given text.
// TypeScript port of the original Python implementation.

// Types

/** Inclusive `[start, end]` range of Unicode code points. */
type ScriptRange = [number, number];

/** Script code (four-letter key) -> list of inclusive code point ranges. */
type ScriptRanges = Record<string, ScriptRange[]>;

/** `[winning script or null, winning score, extra details]`. */
type ScoredScript = [ISO_15924_CODE | null, number, ScriptDetails];

interface ScriptDetails {
  /** Score of every script seen, ordered from highest to lowest; `null` when no character was scored. */
  details: Record<string, number> | null;
  /** `true` when the two best scripts have the same score; `null` when no character was scored. */
  tie: boolean | null;
  /** Best score minus second-best score (1 when only one script was seen); `null` when no character was scored. */
  interval: number | null;
}

/**
 * Unicode code point ranges for each script, keyed by four-letter script code.
 * Both ends of every range are inclusive.
 */
export const SCRIPT_RANGES: ScriptRanges = {
  Latn: [
    [65, 90], [97, 122], [170, 170], [186, 186], [192, 214], [216, 246],
    [248, 696], [736, 740], [7424, 7461], [7468, 7516], [7522, 7525],
    [7531, 7543], [7545, 7614], [7680, 7935], [8305, 8305], [8319, 8319],
    [8336, 8348], [8490, 8491], [8498, 8498], [8526, 8526], [8544, 8584],
    [11360, 11391], [42786, 42887], [42891, 42954], [42960, 42961],
    [42963, 42963], [42965, 42969], [42994, 43007], [43824, 43866],
    [43868, 43876], [43878, 43881], [64256, 64262], [65313, 65338],
    [65345, 65370], [67456, 67461], [67463, 67504], [67506, 67514],
    [122624, 122654], [122661, 122666],
  ], // Latin
  Bopo: [
    [746, 747], [12549, 12591], [12704, 12735],
  ], // Bopomofo
  Zinh: [
    [768, 879], [1157, 1158], [2385, 2388], [6832, 6862], [7376, 7378],
    [7380, 7392], [7394, 7400], [7405, 7405], [7412, 7412], [7416, 7417],
    [7616, 7679], [8204, 8205], [8400, 8432], [12330, 12333], [12441, 12442],
    [65024, 65039], [65056, 65069], [66045, 66045], [66272, 66272],
    [70459, 70459], [118528, 118573], [118576, 118598], [119143, 119145],
    [119163, 119170], [119173, 119179], [119210, 119213], [917760, 917999],
  ], // Inherited
  Grek: [
    [880, 883], [885, 887], [890, 893], [895, 895], [900, 900], [902, 902],
    [904, 906], [908, 908], [910, 929], [931, 993], [1008, 1023],
    [7462, 7466], [7517, 7521], [7526, 7530], [7615, 7615], [7936, 7957],
    [7960, 7965], [7968, 8005], [8008, 8013], [8016, 8023], [8025, 8025],
    [8027, 8027], [8029, 8029], [8031, 8061], [8064, 8116], [8118, 8132],
    [8134, 8147], [8150, 8155], [8157, 8175], [8178, 8180], [8182, 8190],
    [8486, 8486], [43877, 43877], [65856, 65934], [65952, 65952],
    [119296, 119365],
  ], // Greek
  Copt: [
    [994, 1007], [11392, 11507], [11513, 11519],
  ], // Coptic
  Cyrl: [
    [1024, 1156], [1159, 1327], [7296, 7304], [7467, 7467], [7544, 7544],
    [11744, 11775], [42560, 42655], [65070, 65071], [122928, 122989],
    [123023, 123023],
  ], // Cyrillic
  Armn: [
    [1329, 1366], [1369, 1418], [1421, 1423], [64275, 64279],
  ], // Armenian
  Hebr: [
    [1425, 1479], [1488, 1514], [1519, 1524], [64285, 64310], [64312, 64316],
    [64318, 64318], [64320, 64321], [64323, 64324], [64326, 64335],
  ], // Hebrew
  Arab: [
    [1536, 1540], [1542, 1547], [1549, 1562], [1564, 1566], [1568, 1599],
    [1601, 1648], [1649, 1756], [1758, 1791], [1872, 1919], [2160, 2190],
    [2192, 2193], [2200, 2273], [2275, 2303], [64336, 64450], [64467, 64829],
    [64832, 64911], [64914, 64967], [64975, 64975], [65008, 65023],
    [65136, 65140], [65142, 65276], [69216, 69246], [69373, 69375],
    [126464, 126467], [126469, 126495], [126497, 126498], [126500, 126500],
    [126503, 126503], [126505, 126514], [126516, 126519], [126521, 126521],
    [126523, 126523], [126530, 126530], [126535, 126535], [126537, 126537],
    [126539, 126539], [126541, 126543], [126545, 126546], [126548, 126548],
    [126551, 126551], [126553, 126553], [126555, 126555], [126557, 126557],
    [126559, 126559], [126561, 126562], [126564, 126564], [126567, 126570],
    [126572, 126578], [126580, 126583], [126585, 126588], [126590, 126590],
    [126592, 126601], [126603, 126619], [126625, 126627], [126629, 126633],
    [126635, 126651], [126704, 126705],
  ], // Arabic
  Syrc: [
    [1792, 1805], [1807, 1866], [1869, 1871], [2144, 2154],
  ], // Syriac
  Thaa: [[1920, 1969]], // Thaana
  Nkoo: [
    [1984, 2042], [2045, 2047],
  ], // Nko
  Samr: [
    [2048, 2093], [2096, 2110],
  ], // Samaritan
  Mand: [
    [2112, 2139], [2142, 2142],
  ], // Mandaic
  Deva: [
    [2304, 2384], [2389, 2403], [2406, 2431], [43232, 43263], [72448, 72457],
  ], // Devanagari
  Beng: [
    [2432, 2435], [2437, 2444], [2447, 2448], [2451, 2472], [2474, 2480],
    [2482, 2482], [2486, 2489], [2492, 2500], [2503, 2504], [2507, 2510],
    [2519, 2519], [2524, 2525], [2527, 2531], [2534, 2558],
  ], // Bengali
  Guru: [
    [2561, 2563], [2565, 2570], [2575, 2576], [2579, 2600], [2602, 2608],
    [2610, 2611], [2613, 2614], [2616, 2617], [2620, 2620], [2622, 2626],
    [2631, 2632], [2635, 2637], [2641, 2641], [2649, 2652], [2654, 2654],
    [2662, 2678],
  ], // Gurmukhi
  Gujr: [
    [2689, 2691], [2693, 2701], [2703, 2705], [2707, 2728], [2730, 2736],
    [2738, 2739], [2741, 2745], [2748, 2757], [2759, 2761], [2763, 2765],
    [2768, 2768], [2784, 2787], [2790, 2801], [2809, 2815],
  ], // Gujarati
  Orya: [
    [2817, 2819], [2821, 2828], [2831, 2832], [2835, 2856], [2858, 2864],
    [2866, 2867], [2869, 2873], [2876, 2884], [2887, 2888], [2891, 2893],
    [2901, 2903], [2908, 2909], [2911, 2915], [2918, 2935],
  ], // Oriya
  Taml: [
    [2946, 2947], [2949, 2954], [2958, 2960], [2962, 2965], [2969, 2970],
    [2972, 2972], [2974, 2975], [2979, 2980], [2984, 2986], [2990, 3001],
    [3006, 3010], [3014, 3016], [3018, 3021], [3024, 3024], [3031, 3031],
    [3046, 3066], [73664, 73713], [73727, 73727],
  ], // Tamil
  Telu: [
    [3072, 3084], [3086, 3088], [3090, 3112], [3114, 3129], [3132, 3140],
    [3142, 3144], [3146, 3149], [3157, 3158], [3160, 3162], [3165, 3165],
    [3168, 3171], [3174, 3183], [3191, 3199],
  ], // Telugu
  Knda: [
    [3200, 3212], [3214, 3216], [3218, 3240], [3242, 3251], [3253, 3257],
    [3260, 3268], [3270, 3272], [3274, 3277], [3285, 3286], [3293, 3294],
    [3296, 3299], [3302, 3311], [3313, 3315],
  ], // Kannada
  Mlym: [
    [3328, 3340], [3342, 3344], [3346, 3396], [3398, 3400], [3402, 3407],
    [3412, 3427], [3430, 3455],
  ], // Malayalam
  Sinh: [
    [3457, 3459], [3461, 3478], [3482, 3505], [3507, 3515], [3517, 3517],
    [3520, 3526], [3530, 3530], [3535, 3540], [3542, 3542], [3544, 3551],
    [3558, 3567], [3570, 3572], [70113, 70132],
  ], // Sinhala
  Thai: [
    [3585, 3642], [3648, 3675],
  ], // Thai
  Laoo: [
    [3713, 3714], [3716, 3716], [3718, 3722], [3724, 3747], [3749, 3749],
    [3751, 3773], [3776, 3780], [3782, 3782], [3784, 3790], [3792, 3801],
    [3804, 3807],
  ], // Lao
  Tibt: [
    [3840, 3911], [3913, 3948], [3953, 3991], [3993, 4028], [4030, 4044],
    [4046, 4052], [4057, 4058],
  ], // Tibetan
  Mymr: [
    [4096, 4255], [43488, 43518], [43616, 43647],
  ], // Myanmar
  Geor: [
    [4256, 4293], [4295, 4295], [4301, 4301], [4304, 4346], [4348, 4351],
    [7312, 7354], [7357, 7359], [11520, 11557], [11559, 11559],
    [11565, 11565],
  ], // Georgian
  Hang: [
    [4352, 4607], [12334, 12335], [12593, 12686], [12800, 12830],
    [12896, 12926], [43360, 43388], [44032, 55203], [55216, 55238],
    [55243, 55291], [65440, 65470], [65474, 65479], [65482, 65487],
    [65490, 65495], [65498, 65500],
  ], // Hangul
  Ethi: [
    [4608, 4680], [4682, 4685], [4688, 4694], [4696, 4696], [4698, 4701],
    [4704, 4744], [4746, 4749], [4752, 4784], [4786, 4789], [4792, 4798],
    [4800, 4800], [4802, 4805], [4808, 4822], [4824, 4880], [4882, 4885],
    [4888, 4954], [4957, 4988], [4992, 5017], [11648, 11670], [11680, 11686],
    [11688, 11694], [11696, 11702], [11704, 11710], [11712, 11718],
    [11720, 11726], [11728, 11734], [11736, 11742], [43777, 43782],
    [43785, 43790], [43793, 43798], [43808, 43814], [43816, 43822],
    [124896, 124902], [124904, 124907], [124909, 124910], [124912, 124926],
  ], // Ethiopic
  Cher: [
    [5024, 5109], [5112, 5117], [43888, 43967],
  ], // Cherokee
  Cans: [
    [5120, 5759], [6320, 6389], [72368, 72383],
  ], // Canadian_Aboriginal
  Ogam: [[5760, 5788]], // Ogham
  Runr: [
    [5792, 5866], [5870, 5880],
  ], // Runic
  Tglg: [
    [5888, 5909], [5919, 5919],
  ], // Tagalog
  Hano: [[5920, 5940]], // Hanunoo
  Buhd: [[5952, 5971]], // Buhid
  Tagb: [
    [5984, 5996], [5998, 6000], [6002, 6003],
  ], // Tagbanwa
  Khmr: [
    [6016, 6109], [6112, 6121], [6128, 6137], [6624, 6655],
  ], // Khmer
  Mong: [
    [6144, 6145], [6148, 6148], [6150, 6169], [6176, 6264], [6272, 6314],
    [71264, 71276],
  ], // Mongolian
  Limb: [
    [6400, 6430], [6432, 6443], [6448, 6459], [6464, 6464], [6468, 6479],
  ], // Limbu
  Tale: [
    [6480, 6509], [6512, 6516],
  ], // Tai_Le
  Talu: [
    [6528, 6571], [6576, 6601], [6608, 6618], [6622, 6623],
  ], // New_Tai_Lue
  Bugi: [
    [6656, 6683], [6686, 6687],
  ], // Buginese
  Lana: [
    [6688, 6750], [6752, 6780], [6783, 6793], [6800, 6809], [6816, 6829],
  ], // Tai_Tham
  Bali: [
    [6912, 6988], [6992, 7038],
  ], // Balinese
  Sund: [
    [7040, 7103], [7360, 7367],
  ], // Sundanese
  Batk: [
    [7104, 7155], [7164, 7167],
  ], // Batak
  Lepc: [
    [7168, 7223], [7227, 7241], [7245, 7247],
  ], // Lepcha
  Olck: [[7248, 7295]], // Ol_Chiki
  Brai: [[10240, 10495]], // Braille
  Glag: [
    [11264, 11359], [122880, 122886], [122888, 122904], [122907, 122913],
    [122915, 122916], [122918, 122922],
  ], // Glagolitic
  Tfng: [
    [11568, 11623], [11631, 11632], [11647, 11647],
  ], // Tifinagh
  Hani: [
    [11904, 11929], [11931, 12019], [12032, 12245], [12293, 12293],
    [12295, 12295], [12321, 12329], [12344, 12347], [13312, 19903],
    [19968, 40959], [63744, 64109], [64112, 64217], [94178, 94179],
    [94192, 94193], [131072, 173791], [173824, 177977], [177984, 178205],
    [178208, 183969], [183984, 191456], [194560, 195101], [196608, 201546],
    [201552, 205743],
  ], // Han
  Hira: [
    [12353, 12438], [12445, 12447], [110593, 110879], [110898, 110898],
    [110928, 110930], [127488, 127488],
  ], // Hiragana
  Kana: [
    [12449, 12538], [12541, 12543], [12784, 12799], [13008, 13054],
    [13056, 13143], [65382, 65391], [65393, 65437], [110576, 110579],
    [110581, 110587], [110589, 110590], [110592, 110592], [110880, 110882],
    [110933, 110933], [110948, 110951],
  ], // Katakana
  Yiii: [
    [40960, 42124], [42128, 42182],
  ], // Yi
  Lisu: [
    [42192, 42239], [73648, 73648],
  ], // Lisu
  Vaii: [[42240, 42539]], // Vai
  Bamu: [
    [42656, 42743], [92160, 92728],
  ], // Bamum
  Sylo: [[43008, 43052]], // Syloti_Nagri
  Phag: [[43072, 43127]], // Phags_Pa
  Saur: [
    [43136, 43205], [43214, 43225],
  ], // Saurashtra
  Kali: [
    [43264, 43309], [43311, 43311],
  ], // Kayah_Li
  Rjng: [
    [43312, 43347], [43359, 43359],
  ], // Rejang
  Java: [
    [43392, 43469], [43472, 43481], [43486, 43487],
  ], // Javanese
  Cham: [
    [43520, 43574], [43584, 43597], [43600, 43609], [43612, 43615],
  ], // Cham
  Tavt: [
    [43648, 43714], [43739, 43743],
  ], // Tai_Viet
  Mtei: [
    [43744, 43766], [43968, 44013], [44016, 44025],
  ], // Meetei_Mayek
  Linb: [
    [65536, 65547], [65549, 65574], [65576, 65594], [65596, 65597],
    [65599, 65613], [65616, 65629], [65664, 65786],
  ], // Linear_B
  Lyci: [[66176, 66204]], // Lycian
  Cari: [[66208, 66256]], // Carian
  Ital: [
    [66304, 66339], [66349, 66351],
  ], // Old_Italic
  Goth: [[66352, 66378]], // Gothic
  Perm: [[66384, 66426]], // Old_Permic
  Ugar: [
    [66432, 66461], [66463, 66463],
  ], // Ugaritic
  Xpeo: [
    [66464, 66499], [66504, 66517],
  ], // Old_Persian
  Dsrt: [[66560, 66639]], // Deseret
  Shaw: [[66640, 66687]], // Shavian
  Osma: [
    [66688, 66717], [66720, 66729],
  ], // Osmanya
  Osge: [
    [66736, 66771], [66776, 66811],
  ], // Osage
  Elba: [[66816, 66855]], // Elbasan
  Aghb: [
    [66864, 66915], [66927, 66927],
  ], // Caucasian_Albanian
  Vith: [
    [66928, 66938], [66940, 66954], [66956, 66962], [66964, 66965],
    [66967, 66977], [66979, 66993], [66995, 67001], [67003, 67004],
  ], // Vithkuqi
  Lina: [
    [67072, 67382], [67392, 67413], [67424, 67431],
  ], // Linear_A
  Cprt: [
    [67584, 67589], [67592, 67592], [67594, 67637], [67639, 67640],
    [67644, 67644], [67647, 67647],
  ], // Cypriot
  Armi: [
    [67648, 67669], [67671, 67679],
  ], // Imperial_Aramaic
  Palm: [[67680, 67711]], // Palmyrene
  Nbat: [
    [67712, 67742], [67751, 67759],
  ], // Nabataean
  Hatr: [
    [67808, 67826], [67828, 67829], [67835, 67839],
  ], // Hatran
  Phnx: [
    [67840, 67867], [67871, 67871],
  ], // Phoenician
  Lydi: [
    [67872, 67897], [67903, 67903],
  ], // Lydian
  Mero: [[67968, 67999]], // Meroitic_Hieroglyphs
  Merc: [
    [68000, 68023], [68028, 68047], [68050, 68095],
  ], // Meroitic_Cursive
  Khar: [
    [68096, 68099], [68101, 68102], [68108, 68115], [68117, 68119],
    [68121, 68149], [68152, 68154], [68159, 68168], [68176, 68184],
  ], // Kharoshthi
  Sarb: [[68192, 68223]], // Old_South_Arabian
  Narb: [[68224, 68255]], // Old_North_Arabian
  Mani: [
    [68288, 68326], [68331, 68342],
  ], // Manichaean
  Avst: [
    [68352, 68405], [68409, 68415],
  ], // Avestan
  Prti: [
    [68416, 68437], [68440, 68447],
  ], // Inscriptional_Parthian
  Phli: [
    [68448, 68466], [68472, 68479],
  ], // Inscriptional_Pahlavi
  Phlp: [
    [68480, 68497], [68505, 68508], [68521, 68527],
  ], // Psalter_Pahlavi
  Orkh: [[68608, 68680]], // Old_Turkic
  Hung: [
    [68736, 68786], [68800, 68850], [68858, 68863],
  ], // Old_Hungarian
  Rohg: [
    [68864, 68903], [68912, 68921],
  ], // Hanifi_Rohingya
  Yezi: [
    [69248, 69289], [69291, 69293], [69296, 69297],
  ], // Yezidi
  Sogo: [[69376, 69415]], // Old_Sogdian
  Sogd: [[69424, 69465]], // Sogdian
  Ougr: [[69488, 69513]], // Old_Uyghur
  Chrs: [[69552, 69579]], // Chorasmian
  Elym: [[69600, 69622]], // Elymaic
  Brah: [
    [69632, 69709], [69714, 69749], [69759, 69759],
  ], // Brahmi
  Kthi: [
    [69760, 69826], [69837, 69837],
  ], // Kaithi
  Sora: [
    [69840, 69864], [69872, 69881],
  ], // Sora_Sompeng
  Cakm: [
    [69888, 69940], [69942, 69959],
  ], // Chakma
  Mahj: [[69968, 70006]], // Mahajani
  Shrd: [[70016, 70111]], // Sharada
  Khoj: [
    [70144, 70161], [70163, 70209],
  ], // Khojki
  Mult: [
    [70272, 70278], [70280, 70280], [70282, 70285], [70287, 70301],
    [70303, 70313],
  ], // Multani
  Sind: [
    [70320, 70378], [70384, 70393],
  ], // Khudawadi
  Gran: [
    [70400, 70403], [70405, 70412], [70415, 70416], [70419, 70440],
    [70442, 70448], [70450, 70451], [70453, 70457], [70460, 70468],
    [70471, 70472], [70475, 70477], [70480, 70480], [70487, 70487],
    [70493, 70499], [70502, 70508], [70512, 70516],
  ], // Grantha
  Newa: [
    [70656, 70747], [70749, 70753],
  ], // Newa
  Tirh: [
    [70784, 70855], [70864, 70873],
  ], // Tirhuta
  Sidd: [
    [71040, 71093], [71096, 71133],
  ], // Siddham
  Modi: [
    [71168, 71236], [71248, 71257],
  ], // Modi
  Takr: [
    [71296, 71353], [71360, 71369],
  ], // Takri
  Ahom: [
    [71424, 71450], [71453, 71467], [71472, 71494],
  ], // Ahom
  Dogr: [[71680, 71739]], // Dogra
  Wara: [
    [71840, 71922], [71935, 71935],
  ], // Warang_Citi
  Diak: [
    [71936, 71942], [71945, 71945], [71948, 71955], [71957, 71958],
    [71960, 71989], [71991, 71992], [71995, 72006], [72016, 72025],
  ], // Dives_Akuru
  Nand: [
    [72096, 72103], [72106, 72151], [72154, 72164],
  ], // Nandinagari
  Zanb: [[72192, 72263]], // Zanabazar_Square
  Soyo: [[72272, 72354]], // Soyombo
  Pauc: [[72384, 72440]], // Pau_Cin_Hau
  Bhks: [
    [72704, 72712], [72714, 72758], [72760, 72773], [72784, 72812],
  ], // Bhaiksuki
  Marc: [
    [72816, 72847], [72850, 72871], [72873, 72886],
  ], // Marchen
  Gonm: [
    [72960, 72966], [72968, 72969], [72971, 73014], [73018, 73018],
    [73020, 73021], [73023, 73031], [73040, 73049],
  ], // Masaram_Gondi
  Gong: [
    [73056, 73061], [73063, 73064], [73066, 73102], [73104, 73105],
    [73107, 73112], [73120, 73129],
  ], // Gunjala_Gondi
  Maka: [[73440, 73464]], // Makasar
  Kawi: [
    [73472, 73488], [73490, 73530], [73534, 73561],
  ], // Kawi
  Xsux: [
    [73728, 74649], [74752, 74862], [74864, 74868], [74880, 75075],
  ], // Cuneiform
  Cpmn: [[77712, 77810]], // Cypro_Minoan
  Egyp: [[77824, 78933]], // Egyptian_Hieroglyphs
  Hluw: [[82944, 83526]], // Anatolian_Hieroglyphs
  Mroo: [
    [92736, 92766], [92768, 92777], [92782, 92783],
  ], // Mro
  Tnsa: [
    [92784, 92862], [92864, 92873],
  ], // Tangsa
  Bass: [
    [92880, 92909], [92912, 92917],
  ], // Bassa_Vah
  Hmng: [
    [92928, 92997], [93008, 93017], [93019, 93025], [93027, 93047],
    [93053, 93071],
  ], // Pahawh_Hmong
  Medf: [[93760, 93850]], // Medefaidrin
  Plrd: [
    [93952, 94026], [94031, 94087], [94095, 94111],
  ], // Miao
  Tang: [
    [94176, 94176], [94208, 100343], [100352, 101119], [101632, 101640],
  ], // Tangut
  Nshu: [
    [94177, 94177], [110960, 111355],
  ], // Nushu
  Kits: [
    [94180, 94180], [101120, 101589],
  ], // Khitan_Small_Script
  Dupl: [
    [113664, 113770], [113776, 113788], [113792, 113800], [113808, 113817],
    [113820, 113823],
  ], // Duployan
  Sgnw: [
    [120832, 121483], [121499, 121503], [121505, 121519],
  ], // SignWriting
  Hmnp: [
    [123136, 123180], [123184, 123197], [123200, 123209], [123214, 123215],
  ], // Nyiakeng_Puachue_Hmong
  Toto: [[123536, 123566]], // Toto
  Wcho: [
    [123584, 123641], [123647, 123647],
  ], // Wancho
  Nagm: [[124112, 124153]], // Nag_Mundari
  Mend: [
    [124928, 125124], [125127, 125142],
  ], // Mende_Kikakui
  Adlm: [
    [125184, 125259], [125264, 125273], [125278, 125279],
  ], // Adlam
  Zyyy: [
    [0, 64], [91, 96], [123, 169], [171, 185], [187, 191], [215, 215],
    [247, 247], [697, 735], [741, 745], [748, 767], [884, 884], [894, 894],
    [901, 901], [903, 903], [1541, 1541], [1548, 1548], [1563, 1563],
    [1567, 1567], [1600, 1600], [1757, 1757], [2274, 2274], [2404, 2405],
    [3647, 3647], [4053, 4056], [4347, 4347], [5867, 5869], [5941, 5942],
    [6146, 6147], [6149, 6149], [7379, 7379], [7393, 7393], [7401, 7404],
    [7406, 7411], [7413, 7415], [7418, 7418], [8192, 8203], [8206, 8292],
    [8294, 8304], [8308, 8318], [8320, 8334], [8352, 8384], [8448, 8485],
    [8487, 8489], [8492, 8497], [8499, 8525], [8527, 8543], [8585, 8587],
    [8592, 9254], [9280, 9290], [9312, 10239], [10496, 11123],
    [11126, 11157], [11159, 11263], [11776, 11869], [12272, 12283],
    [12288, 12292], [12294, 12294], [12296, 12320], [12336, 12343],
    [12348, 12351], [12443, 12444], [12448, 12448], [12539, 12540],
    [12688, 12703], [12736, 12771], [12832, 12895], [12927, 13007],
    [13055, 13055], [13144, 13311], [19904, 19967], [42752, 42785],
    [42888, 42890], [43056, 43065], [43310, 43310], [43471, 43471],
    [43867, 43867], [43882, 43883], [64830, 64831], [65040, 65049],
    [65072, 65106], [65108, 65126], [65128, 65131], [65279, 65279],
    [65281, 65312], [65339, 65344], [65371, 65381], [65392, 65392],
    [65438, 65439], [65504, 65510], [65512, 65518], [65529, 65532],
    [65792, 65794], [65799, 65843], [65847, 65855], [65936, 65948],
    [66000, 66044], [66273, 66299], [113824, 113827], [118608, 118723],
    [118784, 119029], [119040, 119078], [119081, 119142], [119146, 119162],
    [119171, 119172], [119180, 119209], [119214, 119274], [119488, 119507],
    [119520, 119539], [119552, 119638], [119648, 119672], [119808, 119892],
    [119894, 119964], [119966, 119967], [119970, 119970], [119973, 119974],
    [119977, 119980], [119982, 119993], [119995, 119995], [119997, 120003],
    [120005, 120069], [120071, 120074], [120077, 120084], [120086, 120092],
    [120094, 120121], [120123, 120126], [120128, 120132], [120134, 120134],
    [120138, 120144], [120146, 120485], [120488, 120779], [120782, 120831],
    [126065, 126132], [126209, 126269], [126976, 127019], [127024, 127123],
    [127136, 127150], [127153, 127167], [127169, 127183], [127185, 127221],
    [127232, 127405], [127462, 127487], [127489, 127490], [127504, 127547],
    [127552, 127560], [127568, 127569], [127584, 127589], [127744, 128727],
    [128732, 128748], [128752, 128764], [128768, 128886], [128891, 128985],
    [128992, 129003], [129008, 129008], [129024, 129035], [129040, 129095],
    [129104, 129113], [129120, 129159], [129168, 129197], [129200, 129201],
    [129280, 129619], [129632, 129645], [129648, 129660], [129664, 129672],
    [129680, 129725], [129727, 129733], [129742, 129755], [129760, 129768],
    [129776, 129784], [129792, 129938], [129940, 129994], [130032, 130041],
    [917505, 917505], [917536, 917631],
  ], // Common
  Zzzz: [[65533, 65533]], // also the fallback key for code points missing from every range
};

// Single-character classifiers used by the predictor. They are hoisted so the
// regular expressions are compiled once instead of once per predictor.
const WHITESPACE_PATTERN = /\s/;
// All 32 ASCII punctuation characters. The backslash is listed explicitly
// (`\\`): inside a character class `\]` alone only escapes the bracket, which
// would leave `\` as the one punctuation character that is never stripped.
const PUNCTUATION_PATTERN = /[!"#$%&'()*+,\-./:;<=>?@[\\\]^_`{|}~]/;
const DIGIT_PATTERN = /\d/;

/** Script assigned to code points that appear in no range of SCRIPT_RANGES. */
const UNKNOWN_SCRIPTS: ReadonlySet<string> = new Set(["Zzzz"]);

/** Expands SCRIPT_RANGES into a code point -> script codes lookup table. */
function buildCodePointIndex(): Map<number, Set<string>> {
  const index = new Map<number, Set<string>>();
  for (const [script, ranges] of Object.entries(SCRIPT_RANGES)) {
    for (const [start, end] of ranges) {
      for (let ordinal = start; ordinal <= end; ordinal++) {
        let scripts = index.get(ordinal);
        if (scripts === undefined) {
          scripts = new Set();
          index.set(ordinal, scripts);
        }
        scripts.add(script);
      }
    }
  }
  return index;
}

/**
 * Builds a function that predicts the dominant script of a string.
 *
 * The lookup table is built once per call to this function, so create the
 * predictor once and reuse it.
 *
 * @param replaceWhitespace - When true, whitespace characters are ignored.
 * @param replacePunctuation - When true, ASCII punctuation characters are ignored.
 * @param replaceDigits - When true, ASCII digits are ignored.
 * @returns A predictor yielding `[script, score, details]`. `score` is the
 *   fraction of the remaining code points that belong to `script`. When nothing
 *   remains after filtering the result is
 *   `[null, 0, { details: null, tie: null, interval: null }]`.
 */
export function getScriptPredictor(
  replaceWhitespace: boolean = true,
  replacePunctuation: boolean = true,
  replaceDigits: boolean = true,
): (sent: string) => ScoredScript {
  // Create a map of code points to script names
  const histMap = buildCodePointIndex();

  return (sent: string): ScoredScript => {
    // Count characters by script. Iterating the string yields whole code
    // points, so a character outside the BMP (a surrogate pair) is counted
    // once both in its script's tally and in the total used for normalising.
    const scriptCount = new Map<string, number>();
    let total = 0;
    for (const char of sent) {
      if (replaceWhitespace && WHITESPACE_PATTERN.test(char)) continue;
      if (replacePunctuation && PUNCTUATION_PATTERN.test(char)) continue;
      if (replaceDigits && DIGIT_PATTERN.test(char)) continue;

      const ordinal = char.codePointAt(0);
      if (ordinal === undefined) continue;

      total += 1;
      for (const script of histMap.get(ordinal) ?? UNKNOWN_SCRIPTS) {
        scriptCount.set(script, (scriptCount.get(script) ?? 0) + 1);
      }
    }

    if (total === 0) {
      return [null, 0, { details: null, tie: null, interval: null }];
    }

    // Highest score first. The sort is stable, so scripts with equal scores
    // keep the order in which they were first seen in the text.
    const sortedEntries = [...scriptCount]
      .map(([script, count]): [string, number] => [script, count / total])
      .sort((a, b) => b[1] - a[1]);
    const sortedDetails: Record<string, number> =
      Object.fromEntries(sortedEntries);

    const best = sortedEntries[0];
    if (best === undefined) {
      // Unreachable while total > 0; keeps the indexed access checked.
      return [null, 0, { details: null, tie: null, interval: null }];
    }
    const [maxScript, maxScore] = best;

    // Calculate interval and check for ties
    const runnerUp = sortedEntries[1];
    if (runnerUp === undefined) {
      return [
        maxScript,
        maxScore,
        { details: sortedDetails, tie: false, interval: 1 },
      ];
    }

    const interval = maxScore - runnerUp[1];
    return [
      maxScript,
      maxScore,
      { details: sortedDetails, tie: interval === 0, interval },
    ];
  };
}

/**
 * Splits a string into one substring per script.
 *
 * Every character is appended to the bucket of each script whose ranges
 * contain it. A plain space (" ") is appended to every script's bucket so that
 * word boundaries survive; buckets that end up holding only whitespace are
 * dropped. Characters that match no range are discarded.
 *
 * @param sent - Text to separate.
 * @returns Script code -> characters of `sent` in that script, in their
 *   original order.
 */
export function separateScript(sent: string): Record<string, string> {
  const scriptEntries = Object.entries(SCRIPT_RANGES);
  const buckets = new Map<string, string[]>();

  for (const char of sent) {
    const codePoint = char.codePointAt(0);
    if (codePoint === undefined) continue;
    const isSpace = char === " ";

    for (const [script, ranges] of scriptEntries) {
      const belongs = ranges.some(
        ([start, end]) => isSpace || (start <= codePoint && codePoint <= end),
      );
      if (!belongs) continue;

      const bucket = buckets.get(script);
      if (bucket === undefined) {
        buckets.set(script, [char]);
      } else {
        bucket.push(char);
      }
    }
  }

  // Filter out empty values and spaces, convert arrays to strings
  const separated: Record<string, string> = {};
  for (const [script, chars] of buckets) {
    const joined = chars.join("");
    if (joined.trim().length > 0) {
      separated[script] = joined;
    }
  }
  return separated;
}

// Test functions

/** Runs the predictor on sample sentences and reports mismatches through `console.assert`. */
export function testPredictScript(): void {
  const predictor = getScriptPredictor();

  const tests: [string, [string | null, number]][] = [
    ["this is a latin script.", ["Latn", 1.0]],
    ["isso é escrita latina 1234", ["Latn", 1.0]],
    ["এটি বাংলা লিপি", ["Beng", 1.0]],
    ["นี่คืออักษรไทย", ["Thai", 1.0]],
    [
      "자미로콰이 Jamiroquai는 영국의 애시드 재즈 밴드이다 자미로콰이는 년대 초반 런던에서 활발하게 일어난 애시드 재즈",
      ["Hang", 0.8148148148148148],
    ],
    ["이어지는기사 에서그점 에관해알려줄것 입니다", ["Hang", 1.0]],
    ["12345", [null, 0]],
    [" ", [null, 0]],
    ["", [null, 0]],
  ];

  for (const [input, expected] of tests) {
    const result = predictor(input);
    console.assert(
      result[0] === expected[0] && Math.abs(result[1] - expected[1]) < 0.0001,
      `Test failed for "${input}"\nExpected: ${expected}\nGot: [${result[0]}, ${result[1]}]`,
    );
  }
}

/** Runs `separateScript` on a mixed-script sentence and reports mismatches through `console.assert`. */
export function testSeparateScript(): void {
  const sent = "Hello Salut سلام 你好 こんにちは שלום مرحبا";
  const detected = separateScript(sent);

  const groundTruth: Record<string, string> = {
    Latn: "Hello Salut ",
    Hebr: " שלום ",
    Arab: " سلام مرحبا",
    Hani: " 你好 ",
    Hira: " こんにちは ",
  };

  // Space-separated, non-empty tokens in sorted order, so the comparison
  // ignores how many spaces ended up in each bucket.
  const sortedTokens = (text: string): string[] =>
    text
      .split(" ")
      .map((x) => x.trim())
      .filter((x) => x.length > 0)
      .sort();

  for (const [key, expectedText] of Object.entries(groundTruth)) {
    const detectedText = detected[key];
    console.assert(
      detectedText !== undefined,
      `Error: '${key}' script not found in detected scripts.`,
    );
    // Report the missing script and move on instead of throwing a TypeError
    // on `undefined.split`.
    if (detectedText === undefined) continue;

    console.assert(
      JSON.stringify(sortedTokens(detectedText)) ===
        JSON.stringify(sortedTokens(expectedText)),
      `Error: Tokens for key '${key}' do not match.`,
    );
  }
}