1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
|
/**
 * Shared accumulator for every code list gathered by this script.
 * Each list is filled with `{ code, name }` records by the fetch* functions.
 */
const ALL_LANGUAGE_CODES = {
  iso639_1: [],
  iso639_2_T: [], // terminology (T) codes
  iso639_2_B: [], // bibliographic (B) codes
  iso639_3: [],
  iso639_5: [],
  bcp47_language_subtags: [], // primary language subtags from the IANA registry
};

// Library of Congress listing used for both the ISO 639-2 and ISO 639-1 codes.
const LOC_ISO639_2_URL = 'https://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt';

// SIL publishes the ISO 639-3 table under a dated filename that changes with
// every release (e.g. iso-639-3_20240123.tab), so this URL pins one specific
// release and may need replacing. The current link is listed at
// https://iso639-3.sil.org/code_tables/download_tables ("Complete Code Tables"
// zip, main .tab file). A sturdier approach would download and unzip the
// latest archive instead of hard-coding a release.
const SIL_ISO639_3_URL = 'https://iso639-3.sil.org/sites/iso639-3/files/downloads/iso-639-3_20240701.tab';

// Tab-separated ISO 639-5 listing from the Library of Congress.
const LOC_ISO639_5_URL = 'https://www.loc.gov/standards/iso639-5/iso639-5.tsv';

// IANA Language Subtag Registry.
const IANA_LANGUAGE_SUBTAG_REGISTRY_URL = 'https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry';
/**
 * Downloads the Library of Congress ISO 639-2 listing and appends its codes to
 * ALL_LANGUAGE_CODES (iso639_2_B, iso639_2_T and iso639_1).
 *
 * Rows are pipe-delimited: alpha3-b|alpha3-t|alpha2|english_name|french_name.
 * Rows with fewer than four fields are skipped, and a list only receives an
 * entry when that row's code column is non-empty. The English name is used
 * as the entry name for all three lists.
 *
 * Errors (network failure, non-OK response) are caught and logged rather than
 * propagated to the caller.
 *
 * @returns {Promise<void>}
 */
async function fetchAndParseISO639_1_2() {
  try {
    console.log('Fetching ISO 639-1 & 639-2 codes from LOC...');
    const response = await fetch(LOC_ISO639_2_URL);
    if (!response.ok) {
      throw new Error(`Failed to fetch ISO 639-1/2: ${response.statusText}`);
    }
    const body = await response.text();

    for (const row of body.trim().split('\n')) {
      const fields = row.split('|');
      if (fields.length < 4) {
        continue;
      }

      const [bibliographic, terminology, alpha2, englishName] = fields
        .slice(0, 4)
        .map((field) => field.trim());

      if (bibliographic) {
        ALL_LANGUAGE_CODES.iso639_2_B.push({ code: bibliographic, name: englishName });
      }
      if (terminology) {
        ALL_LANGUAGE_CODES.iso639_2_T.push({ code: terminology, name: englishName });
      }
      if (alpha2) {
        ALL_LANGUAGE_CODES.iso639_1.push({ code: alpha2, name: englishName });
      }
    }

    console.log(`Fetched ${ALL_LANGUAGE_CODES.iso639_1.length} ISO 639-1 codes.`);
    console.log(`Fetched ${ALL_LANGUAGE_CODES.iso639_2_B.length} ISO 639-2/B codes.`);
    console.log(`Fetched ${ALL_LANGUAGE_CODES.iso639_2_T.length} ISO 639-2/T codes.`);
  } catch (error) {
    console.error('Error fetching ISO 639-1/2 codes:', error.message);
  }
}
/**
 * Downloads the SIL ISO 639-3 code table (tab-separated, with a header row)
 * and appends one `{ code, name }` entry per row to
 * ALL_LANGUAGE_CODES.iso639_3.
 *
 * Columns are located by header name ("Id", "Ref_Name", optional "Part1"),
 * not by position. Rows lacking a code or a name are skipped. When a row
 * carries a Part1 (ISO 639-1) code that is not yet in
 * ALL_LANGUAGE_CODES.iso639_1, it is added there under the row's name.
 * Finally ALL_LANGUAGE_CODES.iso639_1 is replaced by a de-duplicated copy
 * sorted by code.
 *
 * Errors (network failure, non-OK response, missing required columns) are
 * caught and logged rather than propagated to the caller.
 *
 * @returns {Promise<void>}
 */
async function fetchAndParseISO639_3() {
  try {
    console.log('Fetching ISO 639-3 codes from SIL...');
    const response = await fetch(SIL_ISO639_3_URL);
    if (!response.ok) throw new Error(`Failed to fetch ISO 639-3: ${response.statusText}`);
    const text = await response.text();

    // Accept LF and CRLF line endings, and trim header cells, so a stray '\r'
    // on the last column cannot defeat the name lookup below.
    const [headerLine = '', ...rows] = text.trim().split(/\r?\n/);
    const header = headerLine.split('\t').map((cell) => cell.trim());
    const idIndex = header.indexOf('Id');
    const refNameIndex = header.indexOf('Ref_Name');
    const part1Index = header.indexOf('Part1'); // optional: cross-reference to ISO 639-1
    if (idIndex === -1 || refNameIndex === -1) {
      throw new Error('ISO 639-3 header format mismatch. Expected "Id" and "Ref_Name" columns.');
    }

    // Set gives O(1) membership tests instead of re-scanning the array per row.
    const knownPart1Codes = new Set(ALL_LANGUAGE_CODES.iso639_1.map((entry) => entry.code));

    for (const row of rows) {
      const cells = row.split('\t');
      const code = cells[idIndex]?.trim();
      const name = cells[refNameIndex]?.trim();
      if (!code || !name) {
        continue;
      }
      ALL_LANGUAGE_CODES.iso639_3.push({ code, name });

      // Augment the ISO 639-1 list with alpha-2 codes it does not have yet.
      const part1Code = part1Index === -1 ? undefined : cells[part1Index]?.trim();
      if (part1Code && !knownPart1Codes.has(part1Code)) {
        knownPart1Codes.add(part1Code);
        ALL_LANGUAGE_CODES.iso639_1.push({ code: part1Code, name });
      }
    }
    console.log(`Fetched ${ALL_LANGUAGE_CODES.iso639_3.length} ISO 639-3 codes.`);

    // De-duplicate by code (the last entry for a code wins) and sort by code.
    const uniqueByCode = new Map(ALL_LANGUAGE_CODES.iso639_1.map((entry) => [entry.code, entry]));
    ALL_LANGUAGE_CODES.iso639_1 = [...uniqueByCode.values()].sort((a, b) => a.code.localeCompare(b.code));
    console.log(`Final unique ISO 639-1 count: ${ALL_LANGUAGE_CODES.iso639_1.length}.`);
  } catch (error) {
    console.error('Error fetching ISO 639-3 codes:', error.message);
    console.warn('Make sure the SIL_ISO639_3_URL is current or points to a valid .tab file.');
  }
}
/**
 * Downloads the Library of Congress ISO 639-5 listing (tab-separated, with a
 * header row) and appends `{ code, name }` entries to
 * ALL_LANGUAGE_CODES.iso639_5.
 *
 * The second column is taken as the code and the third as the English label.
 * Rows with fewer than three columns, or with an empty code or label, are
 * skipped.
 *
 * Errors (network failure, non-OK response) are caught and logged rather than
 * propagated to the caller.
 *
 * @returns {Promise<void>}
 */
async function fetchAndParseISO639_5() {
  try {
    console.log('Fetching ISO 639-5 codes from LOC...');
    const response = await fetch(LOC_ISO639_5_URL);
    if (!response.ok) {
      throw new Error(`Failed to fetch ISO 639-5: ${response.statusText}`);
    }
    const body = await response.text();

    // The first line is the column header (URI, Code, Label_en, ...); drop it.
    const [, ...rows] = body.trim().split('\n');

    for (const row of rows) {
      const columns = row.split('\t');
      if (columns.length < 3) {
        continue;
      }
      const code = columns[1].trim();
      const name = columns[2].trim();
      if (!code || !name) {
        continue;
      }
      ALL_LANGUAGE_CODES.iso639_5.push({ code, name });
    }

    console.log(`Fetched ${ALL_LANGUAGE_CODES.iso639_5.length} ISO 639-5 codes (language families/groups).`);
  } catch (error) {
    console.error('Error fetching ISO 639-5 codes:', error.message);
  }
}
/**
 * Downloads the IANA Language Subtag Registry and appends every record of
 * type "language" to ALL_LANGUAGE_CODES.bcp47_language_subtags as
 * `{ code: <Subtag>, name: <Description> }`.
 *
 * Records are separated by lines consisting of "%%". A field value may be
 * folded onto continuation lines that start with whitespace; those are joined
 * back onto the field (with a single space) before parsing, so long
 * descriptions are not cut off at the first physical line. When a record has
 * several Description fields, only the first one is used as the name.
 *
 * Errors (network failure, non-OK response) are caught and logged rather than
 * propagated to the caller.
 *
 * @returns {Promise<void>}
 */
async function fetchAndParseIANALanguageSubtags() {
  try {
    console.log('Fetching IANA Language Subtag Registry...');
    const response = await fetch(IANA_LANGUAGE_SUBTAG_REGISTRY_URL);
    if (!response.ok) throw new Error(`Failed to fetch IANA registry: ${response.statusText}`);
    const text = await response.text();

    // Split only on separator lines, so "%%" inside a field body is harmless.
    const records = text.split(/^%%[ \t\r]*$/m);

    for (const record of records) {
      // Unfold continuation lines: a line break followed by leading whitespace
      // continues the previous field.
      const lines = record.trim().replace(/\r?\n[ \t]+/g, ' ').split('\n');

      let type = '';
      let subtag = '';
      let description = '';
      for (const line of lines) {
        if (line.startsWith('Type:')) {
          type = line.substring(5).trim();
        } else if (line.startsWith('Subtag:')) {
          subtag = line.substring(7).trim();
        } else if (line.startsWith('Description:') && !description) {
          // Keep the first Description field only.
          description = line.substring(12).trim();
        }
      }

      if (type === 'language' && subtag && description) {
        ALL_LANGUAGE_CODES.bcp47_language_subtags.push({
          code: subtag,
          name: description,
        });
      }
    }
    console.log(`Fetched ${ALL_LANGUAGE_CODES.bcp47_language_subtags.length} primary language subtags from IANA.`);
  } catch (error) {
    console.error('Error fetching IANA Language Subtag Registry:', error.message);
  }
}
/**
 * Fetches every code list, writes the BCP 47 language subtags to "bcp.json"
 * via `Bun.write`, and prints a summary (total plus the first 50 entries) of
 * each list in ALL_LANGUAGE_CODES.
 *
 * The ISO 639-3 step reads and rewrites the ISO 639-1 list that the 639-1/2
 * step fills, so those two run one after the other. The ISO 639-5 and IANA
 * fetches are independent and run concurrently with that chain.
 *
 * @returns {Promise<void>} Rejects if writing "bcp.json" fails.
 */
async function main() {
  console.log('Starting to fetch all language codes...\n');

  // 639-3 must start only after 639-1/2 has finished. Otherwise the contents
  // of iso639_1 depend on which response arrives first, and the list can end
  // up with unsorted duplicates.
  const loadIso639Parts1To3 = async () => {
    await fetchAndParseISO639_1_2();
    await fetchAndParseISO639_3();
  };

  await Promise.all([
    loadIso639Parts1To3(),
    fetchAndParseISO639_5(),
    fetchAndParseIANALanguageSubtags(),
  ]);

  await Bun.write("bcp.json", JSON.stringify(ALL_LANGUAGE_CODES.bcp47_language_subtags));

  console.log('\n\n--- All Fetched Language Codes ---');
  // Print the total and the first few entries of each list.
  for (const [key, codes] of Object.entries(ALL_LANGUAGE_CODES)) {
    console.log(`\n--- ${key} (Total: ${codes.length}) ---`);
    codes.slice(0, 50).forEach(c => console.log(`${c.code}: ${c.name}`));
    if (codes.length > 50) console.log('... and more ...');
  }

  // ALL_LANGUAGE_CODES now holds every list; persist the whole object here
  // (for example as JSON) if more than bcp.json is needed.
  console.log('\nFetching complete.');
}
// Script entry point: run main() and print any rejection it lets escape.
main().catch(console.error);
|