diff options
| author | polwex <polwex@sortug.com> | 2025-11-23 13:29:28 +0700 |
|---|---|---|
| committer | polwex <polwex@sortug.com> | 2025-11-23 13:29:28 +0700 |
| commit | ba2dbc660c229d3e86662d35513dfa7c904d9870 (patch) | |
| tree | afdc039ac31587be0a3d089d024222fb2023fbe9 /packages/lang/src/unicode/index.ts | |
| parent | cb1b56f5a0eddbf77446f415f2beda57c8305f85 (diff) | |
Diffstat (limited to 'packages/lang/src/unicode/index.ts')
| -rw-r--r-- | packages/lang/src/unicode/index.ts | 1424 |
1 files changed, 0 insertions, 1424 deletions
// Author: Amir Hossein Kargaran (TypeScript port)
// Date: March, 2025

import type { ISO_15924_CODE } from "../iso";

// Description: This code detects/separates the script(s) (writing system(s)) of the given text.
// TypeScript port of the original Python implementation.

// Types

/** Inclusive `[start, end]` range of Unicode code points. */
type ScriptRange = [number, number];

/** Script code -> list of inclusive code point ranges. */
type ScriptRanges = Record<string, ScriptRange[]>;

/** `[winning script or null, its score in 0..1, breakdown]`. */
type ScoredScript = [ISO_15924_CODE | null, number, ScriptDetails];

interface ScriptDetails {
  /** Score per detected script, highest first; `null` when nothing was left to score. */
  details: Record<ISO_15924_CODE, number> | null;
  /** `true` when the two best scripts have the same score; `null` when nothing was scored. */
  tie: boolean | null;
  /** Best score minus second-best score (1 when only one script was seen); `null` when nothing was scored. */
  interval: number | null;
}

// The SCRIPT_RANGES object contains Unicode ranges for different scripts
export const SCRIPT_RANGES: ScriptRanges = {
  Latn: [
    [65, 90], [97, 122], [170, 170], [186, 186], [192, 214], [216, 246],
    [248, 696], [736, 740], [7424, 7461], [7468, 7516], [7522, 7525],
    [7531, 7543], [7545, 7614], [7680, 7935], [8305, 8305], [8319, 8319],
    [8336, 8348], [8490, 8491], [8498, 8498], [8526, 8526], [8544, 8584],
    [11360, 11391], [42786, 42887], [42891, 42954], [42960, 42961],
    [42963, 42963], [42965, 42969], [42994, 43007], [43824, 43866],
    [43868, 43876], [43878, 43881], [64256, 64262], [65313, 65338],
    [65345, 65370], [67456, 67461], [67463, 67504], [67506, 67514],
    [122624, 122654], [122661, 122666],
  ], // Latin
  Bopo: [[746, 747], [12549, 12591], [12704, 12735]], // Bopomofo
  Zinh: [
    [768, 879], [1157, 1158], [2385, 2388], [6832, 6862], [7376, 7378],
    [7380, 7392], [7394, 7400], [7405, 7405], [7412, 7412], [7416, 7417],
    [7616, 7679], [8204, 8205], [8400, 8432], [12330, 12333], [12441, 12442],
    [65024, 65039], [65056, 65069], [66045, 66045], [66272, 66272],
    [70459, 70459], [118528, 118573], [118576, 118598], [119143, 119145],
    [119163, 119170], [119173, 119179], [119210, 119213], [917760, 917999],
  ], // Inherited
  Grek: [
    [880, 883], [885, 887], [890, 893], [895, 895], [900, 900], [902, 902],
    [904, 906], [908, 908], [910, 929], [931, 993], [1008, 1023],
    [7462, 7466], [7517, 7521], [7526, 7530], [7615, 7615], [7936, 7957],
    [7960, 7965], [7968, 8005], [8008, 8013], [8016, 8023], [8025, 8025],
    [8027, 8027], [8029, 8029], [8031, 8061], [8064, 8116], [8118, 8132],
    [8134, 8147], [8150, 8155], [8157, 8175], [8178, 8180], [8182, 8190],
    [8486, 8486], [43877, 43877], [65856, 65934], [65952, 65952],
    [119296, 119365],
  ], // Greek
  Copt: [[994, 1007], [11392, 11507], [11513, 11519]], // Coptic
  Cyrl: [
    [1024, 1156], [1159, 1327], [7296, 7304], [7467, 7467], [7544, 7544],
    [11744, 11775], [42560, 42655], [65070, 65071], [122928, 122989],
    [123023, 123023],
  ], // Cyrillic
  Armn: [[1329, 1366], [1369, 1418], [1421, 1423], [64275, 64279]], // Armenian
  Hebr: [
    [1425, 1479], [1488, 1514], [1519, 1524], [64285, 64310], [64312, 64316],
    [64318, 64318], [64320, 64321], [64323, 64324], [64326, 64335],
  ], // Hebrew
  Arab: [
    [1536, 1540], [1542, 1547], [1549, 1562], [1564, 1566], [1568, 1599],
    [1601, 1648], [1649, 1756], [1758, 1791], [1872, 1919], [2160, 2190],
    [2192, 2193], [2200, 2273], [2275, 2303], [64336, 64450], [64467, 64829],
    [64832, 64911], [64914, 64967], [64975, 64975], [65008, 65023],
    [65136, 65140], [65142, 65276], [69216, 69246], [69373, 69375],
    [126464, 126467], [126469, 126495], [126497, 126498], [126500, 126500],
    [126503, 126503], [126505, 126514], [126516, 126519], [126521, 126521],
    [126523, 126523], [126530, 126530], [126535, 126535], [126537, 126537],
    [126539, 126539], [126541, 126543], [126545, 126546], [126548, 126548],
    [126551, 126551], [126553, 126553], [126555, 126555], [126557, 126557],
    [126559, 126559], [126561, 126562], [126564, 126564], [126567, 126570],
    [126572, 126578], [126580, 126583], [126585, 126588], [126590, 126590],
    [126592, 126601], [126603, 126619], [126625, 126627], [126629, 126633],
    [126635, 126651], [126704, 126705],
  ], // Arabic
  Syrc: [[1792, 1805], [1807, 1866], [1869, 1871], [2144, 2154]], // Syriac
  Thaa: [[1920, 1969]], // Thaana
  Nkoo: [[1984, 2042], [2045, 2047]], // Nko
  Samr: [[2048, 2093], [2096, 2110]], // Samaritan
  Mand: [[2112, 2139], [2142, 2142]], // Mandaic
  Deva: [
    [2304, 2384], [2389, 2403], [2406, 2431], [43232, 43263], [72448, 72457],
  ], // Devanagari
  Beng: [
    [2432, 2435], [2437, 2444], [2447, 2448], [2451, 2472], [2474, 2480],
    [2482, 2482], [2486, 2489], [2492, 2500], [2503, 2504], [2507, 2510],
    [2519, 2519], [2524, 2525], [2527, 2531], [2534, 2558],
  ], // Bengali
  Guru: [
    [2561, 2563], [2565, 2570], [2575, 2576], [2579, 2600], [2602, 2608],
    [2610, 2611], [2613, 2614], [2616, 2617], [2620, 2620], [2622, 2626],
    [2631, 2632], [2635, 2637], [2641, 2641], [2649, 2652], [2654, 2654],
    [2662, 2678],
  ], // Gurmukhi
  Gujr: [
    [2689, 2691], [2693, 2701], [2703, 2705], [2707, 2728], [2730, 2736],
    [2738, 2739], [2741, 2745], [2748, 2757], [2759, 2761], [2763, 2765],
    [2768, 2768], [2784, 2787], [2790, 2801], [2809, 2815],
  ], // Gujarati
  Orya: [
    [2817, 2819], [2821, 2828], [2831, 2832], [2835, 2856], [2858, 2864],
    [2866, 2867], [2869, 2873], [2876, 2884], [2887, 2888], [2891, 2893],
    [2901, 2903], [2908, 2909], [2911, 2915], [2918, 2935],
  ], // Oriya
  Taml: [
    [2946, 2947], [2949, 2954], [2958, 2960], [2962, 2965], [2969, 2970],
    [2972, 2972], [2974, 2975], [2979, 2980], [2984, 2986], [2990, 3001],
    [3006, 3010], [3014, 3016], [3018, 3021], [3024, 3024], [3031, 3031],
    [3046, 3066], [73664, 73713], [73727, 73727],
  ], // Tamil
  Telu: [
    [3072, 3084], [3086, 3088], [3090, 3112], [3114, 3129], [3132, 3140],
    [3142, 3144], [3146, 3149], [3157, 3158], [3160, 3162], [3165, 3165],
    [3168, 3171], [3174, 3183], [3191, 3199],
  ], // Telugu
  Knda: [
    [3200, 3212], [3214, 3216], [3218, 3240], [3242, 3251], [3253, 3257],
    [3260, 3268], [3270, 3272], [3274, 3277], [3285, 3286], [3293, 3294],
    [3296, 3299], [3302, 3311], [3313, 3315],
  ], // Kannada
  Mlym: [
    [3328, 3340], [3342, 3344], [3346, 3396], [3398, 3400], [3402, 3407],
    [3412, 3427], [3430, 3455],
  ], // Malayalam
  Sinh: [
    [3457, 3459], [3461, 3478], [3482, 3505], [3507, 3515], [3517, 3517],
    [3520, 3526], [3530, 3530], [3535, 3540], [3542, 3542], [3544, 3551],
    [3558, 3567], [3570, 3572], [70113, 70132],
  ], // Sinhala
  Thai: [[3585, 3642], [3648, 3675]], // Thai
  Laoo: [
    [3713, 3714], [3716, 3716], [3718, 3722], [3724, 3747], [3749, 3749],
    [3751, 3773], [3776, 3780], [3782, 3782], [3784, 3790], [3792, 3801],
    [3804, 3807],
  ], // Lao
  Tibt: [
    [3840, 3911], [3913, 3948], [3953, 3991], [3993, 4028], [4030, 4044],
    [4046, 4052], [4057, 4058],
  ], // Tibetan
  Mymr: [[4096, 4255], [43488, 43518], [43616, 43647]], // Myanmar
  Geor: [
    [4256, 4293], [4295, 4295], [4301, 4301], [4304, 4346], [4348, 4351],
    [7312, 7354], [7357, 7359], [11520, 11557], [11559, 11559],
    [11565, 11565],
  ], // Georgian
  Hang: [
    [4352, 4607], [12334, 12335], [12593, 12686], [12800, 12830],
    [12896, 12926], [43360, 43388], [44032, 55203], [55216, 55238],
    [55243, 55291], [65440, 65470], [65474, 65479], [65482, 65487],
    [65490, 65495], [65498, 65500],
  ], // Hangul
  Ethi: [
    [4608, 4680], [4682, 4685], [4688, 4694], [4696, 4696], [4698, 4701],
    [4704, 4744], [4746, 4749], [4752, 4784], [4786, 4789], [4792, 4798],
    [4800, 4800], [4802, 4805], [4808, 4822], [4824, 4880], [4882, 4885],
    [4888, 4954], [4957, 4988], [4992, 5017], [11648, 11670], [11680, 11686],
    [11688, 11694], [11696, 11702], [11704, 11710], [11712, 11718],
    [11720, 11726], [11728, 11734], [11736, 11742], [43777, 43782],
    [43785, 43790], [43793, 43798], [43808, 43814], [43816, 43822],
    [124896, 124902], [124904, 124907], [124909, 124910], [124912, 124926],
  ], // Ethiopic
  Cher: [[5024, 5109], [5112, 5117], [43888, 43967]], // Cherokee
  Cans: [[5120, 5759], [6320, 6389], [72368, 72383]], // Canadian_Aboriginal
  Ogam: [[5760, 5788]], // Ogham
  Runr: [[5792, 5866], [5870, 5880]], // Runic
  Tglg: [[5888, 5909], [5919, 5919]], // Tagalog
  Hano: [[5920, 5940]], // Hanunoo
  Buhd: [[5952, 5971]], // Buhid
  Tagb: [[5984, 5996], [5998, 6000], [6002, 6003]], // Tagbanwa
  Khmr: [[6016, 6109], [6112, 6121], [6128, 6137], [6624, 6655]], // Khmer
  Mong: [
    [6144, 6145], [6148, 6148], [6150, 6169], [6176, 6264], [6272, 6314],
    [71264, 71276],
  ], // Mongolian
  Limb: [
    [6400, 6430], [6432, 6443], [6448, 6459], [6464, 6464], [6468, 6479],
  ], // Limbu
  Tale: [[6480, 6509], [6512, 6516]], // Tai_Le
  Talu: [[6528, 6571], [6576, 6601], [6608, 6618], [6622, 6623]], // New_Tai_Lue
  Bugi: [[6656, 6683], [6686, 6687]], // Buginese
  Lana: [
    [6688, 6750], [6752, 6780], [6783, 6793], [6800, 6809], [6816, 6829],
  ], // Tai_Tham
  Bali: [[6912, 6988], [6992, 7038]], // Balinese
  Sund: [[7040, 7103], [7360, 7367]], // Sundanese
  Batk: [[7104, 7155], [7164, 7167]], // Batak
  Lepc: [[7168, 7223], [7227, 7241], [7245, 7247]], // Lepcha
  Olck: [[7248, 7295]], // Ol_Chiki
  Brai: [[10240, 10495]], // Braille
  Glag: [
    [11264, 11359], [122880, 122886], [122888, 122904], [122907, 122913],
    [122915, 122916], [122918, 122922],
  ], // Glagolitic
  Tfng: [[11568, 11623], [11631, 11632], [11647, 11647]], // Tifinagh
  Hani: [
    [11904, 11929], [11931, 12019], [12032, 12245], [12293, 12293],
    [12295, 12295], [12321, 12329], [12344, 12347], [13312, 19903],
    [19968, 40959], [63744, 64109], [64112, 64217], [94178, 94179],
    [94192, 94193], [131072, 173791], [173824, 177977], [177984, 178205],
    [178208, 183969], [183984, 191456], [194560, 195101], [196608, 201546],
    [201552, 205743],
  ], // Han
  Hira: [
    [12353, 12438], [12445, 12447], [110593, 110879], [110898, 110898],
    [110928, 110930], [127488, 127488],
  ], // Hiragana
  Kana: [
    [12449, 12538], [12541, 12543], [12784, 12799], [13008, 13054],
    [13056, 13143], [65382, 65391], [65393, 65437], [110576, 110579],
    [110581, 110587], [110589, 110590], [110592, 110592], [110880, 110882],
    [110933, 110933], [110948, 110951],
  ], // Katakana
  Yiii: [[40960, 42124], [42128, 42182]], // Yi
  Lisu: [[42192, 42239], [73648, 73648]], // Lisu
  Vaii: [[42240, 42539]], // Vai
  Bamu: [[42656, 42743], [92160, 92728]], // Bamum
  Sylo: [[43008, 43052]], // Syloti_Nagri
  Phag: [[43072, 43127]], // Phags_Pa
  Saur: [[43136, 43205], [43214, 43225]], // Saurashtra
  Kali: [[43264, 43309], [43311, 43311]], // Kayah_Li
  Rjng: [[43312, 43347], [43359, 43359]], // Rejang
  Java: [[43392, 43469], [43472, 43481], [43486, 43487]], // Javanese
  Cham: [[43520, 43574], [43584, 43597], [43600, 43609], [43612, 43615]], // Cham
  Tavt: [[43648, 43714], [43739, 43743]], // Tai_Viet
  Mtei: [[43744, 43766], [43968, 44013], [44016, 44025]], // Meetei_Mayek
  Linb: [
    [65536, 65547], [65549, 65574], [65576, 65594], [65596, 65597],
    [65599, 65613], [65616, 65629], [65664, 65786],
  ], // Linear_B
  Lyci: [[66176, 66204]], // Lycian
  Cari: [[66208, 66256]], // Carian
  Ital: [[66304, 66339], [66349, 66351]], // Old_Italic
  Goth: [[66352, 66378]], // Gothic
  Perm: [[66384, 66426]], // Old_Permic
  Ugar: [[66432, 66461], [66463, 66463]], // Ugaritic
  Xpeo: [[66464, 66499], [66504, 66517]], // Old_Persian
  Dsrt: [[66560, 66639]], // Deseret
  Shaw: [[66640, 66687]], // Shavian
  Osma: [[66688, 66717], [66720, 66729]], // Osmanya
  Osge: [[66736, 66771], [66776, 66811]], // Osage
  Elba: [[66816, 66855]], // Elbasan
  Aghb: [[66864, 66915], [66927, 66927]], // Caucasian_Albanian
  Vith: [
    [66928, 66938], [66940, 66954], [66956, 66962], [66964, 66965],
    [66967, 66977], [66979, 66993], [66995, 67001], [67003, 67004],
  ], // Vithkuqi
  Lina: [[67072, 67382], [67392, 67413], [67424, 67431]], // Linear_A
  Cprt: [
    [67584, 67589], [67592, 67592], [67594, 67637], [67639, 67640],
    [67644, 67644], [67647, 67647],
  ], // Cypriot
  Armi: [[67648, 67669], [67671, 67679]], // Imperial_Aramaic
  Palm: [[67680, 67711]], // Palmyrene
  Nbat: [[67712, 67742], [67751, 67759]], // Nabataean
  Hatr: [[67808, 67826], [67828, 67829], [67835, 67839]], // Hatran
  Phnx: [[67840, 67867], [67871, 67871]], // Phoenician
  Lydi: [[67872, 67897], [67903, 67903]], // Lydian
  Mero: [[67968, 67999]], // Meroitic_Hieroglyphs
  Merc: [[68000, 68023], [68028, 68047], [68050, 68095]], // Meroitic_Cursive
  Khar: [
    [68096, 68099], [68101, 68102], [68108, 68115], [68117, 68119],
    [68121, 68149], [68152, 68154], [68159, 68168], [68176, 68184],
  ], // Kharoshthi
  Sarb: [[68192, 68223]], // Old_South_Arabian
  Narb: [[68224, 68255]], // Old_North_Arabian
  Mani: [[68288, 68326], [68331, 68342]], // Manichaean
  Avst: [[68352, 68405], [68409, 68415]], // Avestan
  Prti: [[68416, 68437], [68440, 68447]], // Inscriptional_Parthian
  Phli: [[68448, 68466], [68472, 68479]], // Inscriptional_Pahlavi
  Phlp: [[68480, 68497], [68505, 68508], [68521, 68527]], // Psalter_Pahlavi
  Orkh: [[68608, 68680]], // Old_Turkic
  Hung: [[68736, 68786], [68800, 68850], [68858, 68863]], // Old_Hungarian
  Rohg: [[68864, 68903], [68912, 68921]], // Hanifi_Rohingya
  Yezi: [[69248, 69289], [69291, 69293], [69296, 69297]], // Yezidi
  Sogo: [[69376, 69415]], // Old_Sogdian
  Sogd: [[69424, 69465]], // Sogdian
  Ougr: [[69488, 69513]], // Old_Uyghur
  Chrs: [[69552, 69579]], // Chorasmian
  Elym: [[69600, 69622]], // Elymaic
  Brah: [[69632, 69709], [69714, 69749], [69759, 69759]], // Brahmi
  Kthi: [[69760, 69826], [69837, 69837]], // Kaithi
  Sora: [[69840, 69864], [69872, 69881]], // Sora_Sompeng
  Cakm: [[69888, 69940], [69942, 69959]], // Chakma
  Mahj: [[69968, 70006]], // Mahajani
  Shrd: [[70016, 70111]], // Sharada
  Khoj: [[70144, 70161], [70163, 70209]], // Khojki
  Mult: [
    [70272, 70278], [70280, 70280], [70282, 70285], [70287, 70301],
    [70303, 70313],
  ], // Multani
  Sind: [[70320, 70378], [70384, 70393]], // Khudawadi
  Gran: [
    [70400, 70403], [70405, 70412], [70415, 70416], [70419, 70440],
    [70442, 70448], [70450, 70451], [70453, 70457], [70460, 70468],
    [70471, 70472], [70475, 70477], [70480, 70480], [70487, 70487],
    [70493, 70499], [70502, 70508], [70512, 70516],
  ], // Grantha
  Newa: [[70656, 70747], [70749, 70753]], // Newa
  Tirh: [[70784, 70855], [70864, 70873]], // Tirhuta
  Sidd: [[71040, 71093], [71096, 71133]], // Siddham
  Modi: [[71168, 71236], [71248, 71257]], // Modi
  Takr: [[71296, 71353], [71360, 71369]], // Takri
  Ahom: [[71424, 71450], [71453, 71467], [71472, 71494]], // Ahom
  Dogr: [[71680, 71739]], // Dogra
  Wara: [[71840, 71922], [71935, 71935]], // Warang_Citi
  Diak: [
    [71936, 71942], [71945, 71945], [71948, 71955], [71957, 71958],
    [71960, 71989], [71991, 71992], [71995, 72006], [72016, 72025],
  ], // Dives_Akuru
  Nand: [[72096, 72103], [72106, 72151], [72154, 72164]], // Nandinagari
  Zanb: [[72192, 72263]], // Zanabazar_Square
  Soyo: [[72272, 72354]], // Soyombo
  Pauc: [[72384, 72440]], // Pau_Cin_Hau
  Bhks: [[72704, 72712], [72714, 72758], [72760, 72773], [72784, 72812]], // Bhaiksuki
  Marc: [[72816, 72847], [72850, 72871], [72873, 72886]], // Marchen
  Gonm: [
    [72960, 72966], [72968, 72969], [72971, 73014], [73018, 73018],
    [73020, 73021], [73023, 73031], [73040, 73049],
  ], // Masaram_Gondi
  Gong: [
    [73056, 73061], [73063, 73064], [73066, 73102], [73104, 73105],
    [73107, 73112], [73120, 73129],
  ], // Gunjala_Gondi
  Maka: [[73440, 73464]], // Makasar
  Kawi: [[73472, 73488], [73490, 73530], [73534, 73561]], // Kawi
  Xsux: [[73728, 74649], [74752, 74862], [74864, 74868], [74880, 75075]], // Cuneiform
  Cpmn: [[77712, 77810]], // Cypro_Minoan
  Egyp: [[77824, 78933]], // Egyptian_Hieroglyphs
  Hluw: [[82944, 83526]], // Anatolian_Hieroglyphs
  Mroo: [[92736, 92766], [92768, 92777], [92782, 92783]], // Mro
  Tnsa: [[92784, 92862], [92864, 92873]], // Tangsa
  Bass: [[92880, 92909], [92912, 92917]], // Bassa_Vah
  Hmng: [
    [92928, 92997], [93008, 93017], [93019, 93025], [93027, 93047],
    [93053, 93071],
  ], // Pahawh_Hmong
  Medf: [[93760, 93850]], // Medefaidrin
  Plrd: [[93952, 94026], [94031, 94087], [94095, 94111]], // Miao
  Tang: [
    [94176, 94176], [94208, 100343], [100352, 101119], [101632, 101640],
  ], // Tangut
  Nshu: [[94177, 94177], [110960, 111355]], // Nushu
  Kits: [[94180, 94180], [101120, 101589]], // Khitan_Small_Script
  Dupl: [
    [113664, 113770], [113776, 113788], [113792, 113800], [113808, 113817],
    [113820, 113823],
  ], // Duployan
  Sgnw: [[120832, 121483], [121499, 121503], [121505, 121519]], // SignWriting
  Hmnp: [
    [123136, 123180], [123184, 123197], [123200, 123209], [123214, 123215],
  ], // Nyiakeng_Puachue_Hmong
  Toto: [[123536, 123566]], // Toto
  Wcho: [[123584, 123641], [123647, 123647]], // Wancho
  Nagm: [[124112, 124153]], // Nag_Mundari
  Mend: [[124928, 125124], [125127, 125142]], // Mende_Kikakui
  Adlm: [[125184, 125259], [125264, 125273], [125278, 125279]], // Adlam
  Zyyy: [
    [0, 64], [91, 96], [123, 169], [171, 185], [187, 191], [215, 215],
    [247, 247], [697, 735], [741, 745], [748, 767], [884, 884], [894, 894],
    [901, 901], [903, 903], [1541, 1541], [1548, 1548], [1563, 1563],
    [1567, 1567], [1600, 1600], [1757, 1757], [2274, 2274], [2404, 2405],
    [3647, 3647], [4053, 4056], [4347, 4347], [5867, 5869], [5941, 5942],
    [6146, 6147], [6149, 6149], [7379, 7379], [7393, 7393], [7401, 7404],
    [7406, 7411], [7413, 7415], [7418, 7418], [8192, 8203], [8206, 8292],
    [8294, 8304], [8308, 8318], [8320, 8334], [8352, 8384], [8448, 8485],
    [8487, 8489], [8492, 8497], [8499, 8525], [8527, 8543], [8585, 8587],
    [8592, 9254], [9280, 9290], [9312, 10239], [10496, 11123],
    [11126, 11157], [11159, 11263], [11776, 11869], [12272, 12283],
    [12288, 12292], [12294, 12294], [12296, 12320], [12336, 12343],
    [12348, 12351], [12443, 12444], [12448, 12448], [12539, 12540],
    [12688, 12703], [12736, 12771], [12832, 12895], [12927, 13007],
    [13055, 13055], [13144, 13311], [19904, 19967], [42752, 42785],
    [42888, 42890], [43056, 43065], [43310, 43310], [43471, 43471],
    [43867, 43867], [43882, 43883], [64830, 64831], [65040, 65049],
    [65072, 65106], [65108, 65126], [65128, 65131], [65279, 65279],
    [65281, 65312], [65339, 65344], [65371, 65381], [65392, 65392],
    [65438, 65439], [65504, 65510], [65512, 65518], [65529, 65532],
    [65792, 65794], [65799, 65843], [65847, 65855], [65936, 65948],
    [66000, 66044], [66273, 66299], [113824, 113827], [118608, 118723],
    [118784, 119029], [119040, 119078], [119081, 119142], [119146, 119162],
    [119171, 119172], [119180, 119209], [119214, 119274], [119488, 119507],
    [119520, 119539], [119552, 119638], [119648, 119672], [119808, 119892],
    [119894, 119964], [119966, 119967], [119970, 119970], [119973, 119974],
    [119977, 119980], [119982, 119993], [119995, 119995], [119997, 120003],
    [120005, 120069], [120071, 120074], [120077, 120084], [120086, 120092],
    [120094, 120121], [120123, 120126], [120128, 120132], [120134, 120134],
    [120138, 120144], [120146, 120485], [120488, 120779], [120782, 120831],
    [126065, 126132], [126209, 126269], [126976, 127019], [127024, 127123],
    [127136, 127150], [127153, 127167], [127169, 127183], [127185, 127221],
    [127232, 127405], [127462, 127487], [127489, 127490], [127504, 127547],
    [127552, 127560], [127568, 127569], [127584, 127589], [127744, 128727],
    [128732, 128748], [128752, 128764], [128768, 128886], [128891, 128985],
    [128992, 129003], [129008, 129008], [129024, 129035], [129040, 129095],
    [129104, 129113], [129120, 129159], [129168, 129197], [129200, 129201],
    [129280, 129619], [129632, 129645], [129648, 129660], [129664, 129672],
    [129680, 129725], [129727, 129733], [129742, 129755], [129760, 129768],
    [129776, 129784], [129792, 129938], [129940, 129994], [130032, 130041],
    [917505, 917505], [917536, 917631],
  ], // Common
  Zzzz: [[65533, 65533]],
};

/**
 * Builds a function that guesses the dominant script of a piece of text.
 *
 * The returned predictor drops the character classes selected by the flags,
 * counts the remaining characters per script using `SCRIPT_RANGES`, and
 * reports the script with the highest share. Characters are handled per
 * Unicode code point, so characters outside the Basic Multilingual Plane
 * count as one character each.
 *
 * @param replaceWhitespace - Ignore characters matching `\s` when scoring.
 * @param replacePunctuation - Ignore ASCII punctuation when scoring.
 * @param replaceDigits - Ignore characters matching `\d` when scoring.
 * @returns A predictor yielding `[script, score, details]`, or
 *   `[null, 0, { details: null, tie: null, interval: null }]` when no
 *   character survives the filters.
 */
export function getScriptPredictor(
  replaceWhitespace: boolean = true,
  replacePunctuation: boolean = true,
  replaceDigits: boolean = true,
): (sent: string) => ScoredScript {
  // Expand every range once so each lookup in the predictor is a Map hit.
  const histMap = new Map<number, Set<string>>();
  for (const [script, ranges] of Object.entries(SCRIPT_RANGES)) {
    for (const [start, end] of ranges) {
      for (let ordinal = start; ordinal <= end; ordinal++) {
        let owners = histMap.get(ordinal);
        if (owners === undefined) {
          owners = new Set();
          histMap.set(ordinal, owners);
        }
        owners.add(script);
      }
    }
  }

  // Compiled once per predictor instead of once per tested character.
  const whitespace = /\s/;
  // The hyphen is escaped so it is a literal, not the accidental range `,-.`.
  const punctuation = /[!"#$%&'()*+,\-./:;<=>?@[\]^_`{|}~]/;
  const digit = /\d/;
  // Fallback for code points that no range in SCRIPT_RANGES covers.
  const unknownScript: ReadonlySet<string> = new Set(["Zzzz"]);

  return (sent: string): ScoredScript => {
    // Array.from walks the string by code point; split("") would cut
    // surrogate pairs in half and inflate the character total.
    const kept = Array.from(sent).filter(
      (char) =>
        !(
          (replaceWhitespace && whitespace.test(char)) ||
          (replacePunctuation && punctuation.test(char)) ||
          (replaceDigits && digit.test(char))
        ),
    );

    const total = kept.length;
    if (total === 0) {
      return [null, 0, { details: null, tie: null, interval: null }];
    }

    // Count characters by script
    const scriptCount = new Map<string, number>();
    for (const char of kept) {
      const scripts = histMap.get(char.codePointAt(0) ?? -1) ?? unknownScript;
      for (const script of scripts) {
        scriptCount.set(script, (scriptCount.get(script) ?? 0) + 1);
      }
    }

    // Normalise the counts and remember the best script. The strict `>`
    // keeps the first script encountered as the winner when scores are equal.
    let maxScore = 0;
    let maxScript: string | null = null;
    const scores: [string, number][] = [];
    for (const [script, count] of scriptCount) {
      const score = count / total;
      scores.push([script, score]);
      if (score > maxScore) {
        maxScore = score;
        maxScript = script;
      }
    }

    // Highest score first; the sort is stable, so equal scores keep the
    // order in which their scripts first appeared.
    scores.sort((a, b) => b[1] - a[1]);
    const sortedDetails: Record<string, number> = Object.fromEntries(scores);

    // With a single script there is no runner-up, so the gap is the full 1.
    const runnerUp = scores[1];
    const interval = runnerUp === undefined ? 1 : maxScore - runnerUp[1];

    return [
      maxScript,
      maxScore,
      {
        details: sortedDetails,
        tie: interval === 0,
        interval,
      },
    ];
  };
}

/**
 * Splits text into one string per script.
 *
 * Every character is appended to each script whose ranges in `SCRIPT_RANGES`
 * contain it. A plain space (`" "`) is appended to every script so that word
 * boundaries survive in each output. Characters covered by no range are
 * dropped, and scripts whose collected text is only whitespace are omitted.
 *
 * @param sent - Text to split.
 * @returns Script code -> the characters of `sent` assigned to that script,
 *   in their original order.
 */
export function separateScript(sent: string): Record<string, string> {
  // Built once; the table does not change while the text is scanned.
  const scriptTable = Object.entries(SCRIPT_RANGES);
  const buckets: Record<string, string[]> = {};

  for (const char of sent) {
    const codePoint = char.codePointAt(0);
    if (codePoint === undefined) continue;
    const isSpace = char === " ";

    for (const [script, ranges] of scriptTable) {
      const belongs =
        isSpace ||
        ranges.some(([start, end]) => start <= codePoint && codePoint <= end);
      if (!belongs) continue;

      let bucket = buckets[script];
      if (bucket === undefined) {
        bucket = [];
        buckets[script] = bucket;
      }
      bucket.push(char);
    }
  }

  // Convert arrays to strings, skipping scripts that only collected whitespace.
  const filtered: Record<string, string> = {};
  for (const [script, chars] of Object.entries(buckets)) {
    const joined = chars.join("");
    if (joined.trim() !== "") {
      filtered[script] = joined;
    }
  }

  return filtered;
}

// Test functions

/**
 * Runs the default predictor over sample sentences and reports mismatches
 * through `console.assert`.
 */
export function testPredictScript(): void {
  const predictor = getScriptPredictor();

  const tests: [string, [string | null, number]][] = [
    ["this is a latin script.", ["Latn", 1.0]],
    ["isso é escrita latina 1234", ["Latn", 1.0]],
    ["এটি বাংলা লিপি", ["Beng", 1.0]],
    ["นี่คืออักษรไทย", ["Thai", 1.0]],
    [
      "자미로콰이 Jamiroquai는 영국의 애시드 재즈 밴드이다 자미로콰이는 년대 초반 런던에서 활발하게 일어난 애시드 재즈",
      ["Hang", 0.8148148148148148],
    ],
    ["이어지는기사 에서그점 에관해알려줄것 입니다", ["Hang", 1.0]],
    ["12345", [null, 0]],
    [" ", [null, 0]],
    ["", [null, 0]],
  ];

  for (const [input, expected] of tests) {
    const result = predictor(input);
    console.assert(
      result[0] === expected[0] && Math.abs(result[1] - expected[1]) < 0.0001,
      `Test failed for "${input}"\nExpected: ${expected}\nGot: [${result[0]}, ${result[1]}]`,
    );
  }
}

/**
 * Checks `separateScript` on a multi-script sentence, comparing the
 * space-separated tokens found for each script with the expected ones and
 * reporting mismatches through `console.assert`.
 */
export function testSeparateScript(): void {
  const sent = "Hello Salut سلام 你好 こんにちは שלום مرحبا";
  const detected = separateScript(sent);

  const groundTruth: Record<string, string> = {
    Latn: "Hello Salut ",
    Hebr: " שלום ",
    Arab: " سلام مرحبا",
    Hani: " 你好 ",
    Hira: " こんにちは ",
  };

  // Sorted, non-empty, space-separated tokens of a string.
  const tokenize = (text: string): string[] =>
    text
      .split(" ")
      .map((x) => x.trim())
      .filter((x) => x.length > 0)
      .sort();

  for (const [key, expectedText] of Object.entries(groundTruth)) {
    const detectedText = detected[key];
    console.assert(
      detectedText !== undefined,
      `Error: '${key}' script not found in detected scripts.`,
    );
    // console.assert only logs, so skip the token comparison explicitly
    // rather than dereferencing a missing entry.
    if (detectedText === undefined) continue;

    console.assert(
      JSON.stringify(tokenize(detectedText)) ===
        JSON.stringify(tokenize(expectedText)),
      `Error: Tokens for key '${key}' do not match.`,
    );
  }
}
