/**
 * Downloads language-code lists from LOC (ISO 639-1/-2/-5), SIL (ISO 639-3)
 * and IANA (BCP 47 language subtags), collects them in ALL_LANGUAGE_CODES as
 * `{ code, name }` records, writes the BCP 47 list to `bcp.json` and prints a
 * preview of every list.
 */
const ALL_LANGUAGE_CODES = {
  iso639_1: [],
  iso639_2_T: [], // Terminology codes
  iso639_2_B: [], // Bibliographic codes
  iso639_3: [],
  iso639_5: [],
  bcp47_language_subtags: [], // Primary language subtags from IANA
};

const LOC_ISO639_2_URL = 'https://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt';

// For ISO 639-3, SIL provides dated files, so the filename changes with each
// update (e.g., iso-639-3_20240123.tab). To refresh it, go to
// https://iso639-3.sil.org/code_tables/download_tables, get the current link
// for the "Complete Code Tables" zip and extract the main .tab file.
// The link below pins one specific (potentially older) version of the main
// table, which includes the mappings to the other ISO 639 parts. A more robust
// solution would download and unzip the latest release.
const SIL_ISO639_3_URL = 'https://iso639-3.sil.org/sites/iso639-3/files/downloads/iso-639-3_20240701.tab'; // Example: replace with current
const LOC_ISO639_5_URL = 'https://www.loc.gov/standards/iso639-5/iso639-5.tsv'; // TSV format
const IANA_LANGUAGE_SUBTAG_REGISTRY_URL = 'https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry';

// Accept both LF and CRLF so a stray '\r' never ends up inside a field.
const LINE_BREAK = /\r?\n/;

// Number of entries of each list printed by main().
const PREVIEW_LIMIT = 50;

/**
 * Splits a downloaded document into lines, ignoring leading/trailing blank space.
 * @param {string} text - Raw document text.
 * @returns {string[]} The individual lines.
 */
function splitLines(text) {
  return text.trim().split(LINE_BREAK);
}

/**
 * Downloads a URL and returns its body as text.
 * @param {string} url - Resource to download.
 * @param {string} label - Human-readable source name used in the error message.
 * @returns {Promise<string>} The response body.
 * @throws {Error} When the response status is not OK.
 */
async function fetchText(url, label) {
  const response = await fetch(url);
  if (!response.ok) throw new Error(`Failed to fetch ${label}: ${response.statusText}`);
  return response.text();
}

/**
 * Parses the LOC ISO 639-2 list.
 * Line format: alpha3-b|alpha3-t|alpha2|english_name|french_name
 * @param {string} text - Raw file contents.
 * @returns {{iso639_1: object[], iso639_2_B: object[], iso639_2_T: object[]}}
 *   `{ code, name }` records grouped by code kind; a row contributes to a list
 *   only when the matching column is non-empty.
 */
function parseISO639_2(text) {
  const result = { iso639_1: [], iso639_2_B: [], iso639_2_T: [] };
  for (const line of splitLines(text)) {
    const parts = line.split('|');
    if (parts.length < 4) continue; // Not a data row.

    const [alpha3B, alpha3T, alpha2, name] = parts.slice(0, 4).map((part) => part.trim());
    if (alpha3B) result.iso639_2_B.push({ code: alpha3B, name });
    // NOTE(review): the terminology column looks to be filled only when it
    // differs from the bibliographic code, so this list may be much shorter
    // than the B list - confirm whether it should fall back to the B code.
    if (alpha3T) result.iso639_2_T.push({ code: alpha3T, name });
    if (alpha2) result.iso639_1.push({ code: alpha2, name });
  }
  return result;
}

/**
 * Parses the SIL ISO 639-3 tab-separated table.
 * Expected header fields:
 * Id (3-letter code) | Part2B | Part2T | Part1 | Scope | Language_Type | Ref_Name | Comment
 * @param {string} text - Raw file contents, header line included.
 * @returns {{code: string, name: string, part1Code: (string|undefined)}[]}
 *   One record per row that has both an Id and a Ref_Name.
 * @throws {Error} When the header lacks the "Id" or "Ref_Name" column.
 */
function parseISO639_3Table(text) {
  const lines = splitLines(text);
  const header = lines.shift().split('\t').map((cell) => cell.trim());
  const idIndex = header.indexOf('Id');
  const refNameIndex = header.indexOf('Ref_Name');
  const part1Index = header.indexOf('Part1'); // For cross-referencing ISO 639-1
  if (idIndex === -1 || refNameIndex === -1) {
    throw new Error('ISO 639-3 header format mismatch. Expected "Id" and "Ref_Name" columns.');
  }

  const rows = [];
  for (const line of lines) {
    const parts = line.split('\t');
    const code = parts[idIndex]?.trim();
    const name = parts[refNameIndex]?.trim();
    // A missing Part1 column gives index -1, which simply yields undefined here.
    const part1Code = parts[part1Index]?.trim();
    if (code && name) rows.push({ code, name, part1Code });
  }
  return rows;
}

/**
 * Parses the LOC ISO 639-5 TSV file (columns: URI | Code | Label_en | Label_fr ...).
 * @param {string} text - Raw file contents, header line included.
 * @returns {{code: string, name: string}[]} Rows with a non-empty code and English label.
 */
function parseISO639_5(text) {
  const lines = splitLines(text);
  lines.shift(); // Remove header line: URI Code Label_en

  const records = [];
  for (const line of lines) {
    const parts = line.split('\t');
    if (parts.length < 3) continue;
    const code = parts[1].trim();
    const name = parts[2].trim();
    if (code && name) records.push({ code, name });
  }
  return records;
}

/**
 * Parses the IANA Language Subtag Registry and keeps the `language` records.
 * @param {string} text - Raw registry contents; records are separated by "%%".
 * @returns {{code: string, name: string}[]} Subtag plus its first Description field.
 */
function parseIANALanguageSubtags(text) {
  const records = [];
  for (const entry of text.split('%%')) {
    let type = '';
    let subtag = '';
    let description = '';
    for (const line of entry.trim().split(LINE_BREAK)) {
      if (line.startsWith('Type:')) {
        type = line.substring(5).trim();
      } else if (line.startsWith('Subtag:')) {
        subtag = line.substring(7).trim();
      } else if (line.startsWith('Description:') && !description) {
        // A record can carry several descriptions; for simplicity keep the first.
        description = line.substring(12).trim();
      }
    }
    if (type === 'language' && subtag && description) {
      records.push({ code: subtag, name: description });
    }
  }
  return records;
}

/**
 * Loads ISO 639-1 and ISO 639-2 (B and T) codes from LOC into ALL_LANGUAGE_CODES.
 * Failures are logged, not rethrown, so the other sources can still be loaded.
 * @returns {Promise<void>}
 */
async function fetchAndParseISO639_1_2() {
  try {
    console.log('Fetching ISO 639-1 & 639-2 codes from LOC...');
    const parsed = parseISO639_2(await fetchText(LOC_ISO639_2_URL, 'ISO 639-1/2'));
    ALL_LANGUAGE_CODES.iso639_2_B.push(...parsed.iso639_2_B);
    ALL_LANGUAGE_CODES.iso639_2_T.push(...parsed.iso639_2_T);
    ALL_LANGUAGE_CODES.iso639_1.push(...parsed.iso639_1);
    console.log(`Fetched ${ALL_LANGUAGE_CODES.iso639_1.length} ISO 639-1 codes.`);
    console.log(`Fetched ${ALL_LANGUAGE_CODES.iso639_2_B.length} ISO 639-2/B codes.`);
    console.log(`Fetched ${ALL_LANGUAGE_CODES.iso639_2_T.length} ISO 639-2/T codes.`);
  } catch (error) {
    console.error('Error fetching ISO 639-1/2 codes:', error.message);
  }
}

/**
 * Loads ISO 639-3 codes from SIL into ALL_LANGUAGE_CODES and uses the table's
 * Part1 column to add any ISO 639-1 code not already present, then dedupes and
 * sorts the ISO 639-1 list. Must run after fetchAndParseISO639_1_2 so that the
 * LOC entries are already in place when the list is augmented and sorted.
 * Failures are logged, not rethrown.
 * @returns {Promise<void>}
 */
async function fetchAndParseISO639_3() {
  try {
    console.log('Fetching ISO 639-3 codes from SIL...');
    const rows = parseISO639_3Table(await fetchText(SIL_ISO639_3_URL, 'ISO 639-3'));

    // A Set keeps the "already known?" check O(1) per row instead of scanning
    // the ISO 639-1 array for each of the several thousand table rows.
    const knownPart1Codes = new Set(ALL_LANGUAGE_CODES.iso639_1.map((item) => item.code));
    for (const { code, name, part1Code } of rows) {
      ALL_LANGUAGE_CODES.iso639_3.push({ code, name });
      if (part1Code && !knownPart1Codes.has(part1Code)) {
        knownPart1Codes.add(part1Code);
        ALL_LANGUAGE_CODES.iso639_1.push({ code: part1Code, name });
      }
    }
    console.log(`Fetched ${ALL_LANGUAGE_CODES.iso639_3.length} ISO 639-3 codes.`);

    // Deduplicate (last entry per code wins) and sort ISO 639-1 after potential additions.
    const byCode = new Map(ALL_LANGUAGE_CODES.iso639_1.map((item) => [item.code, item]));
    ALL_LANGUAGE_CODES.iso639_1 = [...byCode.values()].sort((a, b) => a.code.localeCompare(b.code));
    console.log(`Final unique ISO 639-1 count: ${ALL_LANGUAGE_CODES.iso639_1.length}.`);
  } catch (error) {
    console.error('Error fetching ISO 639-3 codes:', error.message);
    console.warn('Make sure the SIL_ISO639_3_URL is current or points to a valid .tab file.');
  }
}

/**
 * Loads ISO 639-5 (language families/groups) codes from LOC into ALL_LANGUAGE_CODES.
 * Failures are logged, not rethrown.
 * @returns {Promise<void>}
 */
async function fetchAndParseISO639_5() {
  try {
    console.log('Fetching ISO 639-5 codes from LOC...');
    const records = parseISO639_5(await fetchText(LOC_ISO639_5_URL, 'ISO 639-5'));
    ALL_LANGUAGE_CODES.iso639_5.push(...records);
    console.log(`Fetched ${ALL_LANGUAGE_CODES.iso639_5.length} ISO 639-5 codes (language families/groups).`);
  } catch (error) {
    console.error('Error fetching ISO 639-5 codes:', error.message);
  }
}

/**
 * Loads the primary language subtags from the IANA registry into ALL_LANGUAGE_CODES.
 * Failures are logged, not rethrown.
 * @returns {Promise<void>}
 */
async function fetchAndParseIANALanguageSubtags() {
  try {
    console.log('Fetching IANA Language Subtag Registry...');
    const records = parseIANALanguageSubtags(
      await fetchText(IANA_LANGUAGE_SUBTAG_REGISTRY_URL, 'IANA registry'),
    );
    ALL_LANGUAGE_CODES.bcp47_language_subtags.push(...records);
    console.log(`Fetched ${ALL_LANGUAGE_CODES.bcp47_language_subtags.length} primary language subtags from IANA.`);
  } catch (error) {
    console.error('Error fetching IANA Language Subtag Registry:', error.message);
  }
}

/**
 * Script entry point: loads every source, writes the BCP 47 subtags to
 * `bcp.json` and prints the total plus the first entries of each list.
 * @returns {Promise<void>}
 */
async function main() {
  console.log('Starting to fetch all language codes...\n');

  // ISO 639-3 augments, dedupes and sorts the ISO 639-1 list, so it has to run
  // strictly after the LOC loader; starting both at once would let the LOC
  // results land after the dedupe/sort. The two other sources are independent
  // and still download in parallel with this chain.
  const loadISO639Parts1To3 = async () => {
    await fetchAndParseISO639_1_2();
    await fetchAndParseISO639_3();
  };
  await Promise.all([
    loadISO639Parts1To3(),
    fetchAndParseISO639_5(),
    fetchAndParseIANALanguageSubtags(),
  ]);

  await Bun.write("bcp.json", JSON.stringify(ALL_LANGUAGE_CODES.bcp47_language_subtags));

  console.log('\n\n--- All Fetched Language Codes ---');
  // Print the count and the first few entries of each list.
  for (const [key, codes] of Object.entries(ALL_LANGUAGE_CODES)) {
    console.log(`\n--- ${key} (Total: ${codes.length}) ---`);
    codes.slice(0, PREVIEW_LIMIT).forEach((c) => console.log(`${c.code}: ${c.name}`));
    if (codes.length > PREVIEW_LIMIT) console.log('... and more ...');
  }

  // ALL_LANGUAGE_CODES now holds every list and can be used as needed, e.g.
  // saved in full to a JSON file such as all_language_codes.json with
  // JSON.stringify(ALL_LANGUAGE_CODES, null, 2).
  console.log('\nFetching complete.');
}

main().catch(console.error);