/*
 * Language code fetcher (src/lib/db/codes.js).
 *
 * Provenance: recovered from a cgit v1.2.3 patch view of
 *   commit a3f24ea79b14394b24c4b60a010651eb29eeb872
 *   From: polwex, Date: Thu, 29 May 2025 12:10:22 +0700
 *   Subject: glorious new db
 *   (new file, mode 100644, index 0000000..bef3e1b, 203 insertions)
 *
 * Downloads ISO 639-1/-2/-3/-5 code tables and the IANA language subtag
 * registry, collects them into ALL_LANGUAGE_CODES as `{ code, name }`
 * records, writes the BCP 47 primary language subtags to `bcp.json`
 * (via Bun.write) and prints a short summary of every list.
 */

/** Accumulator for every fetched list; each entry is `{ code, name }`. */
const ALL_LANGUAGE_CODES = {
  iso639_1: [],
  iso639_2_T: [], // Terminology codes
  iso639_2_B: [], // Bibliographic codes
  iso639_3: [],
  iso639_5: [],
  bcp47_language_subtags: [], // Primary language subtags from IANA
};

const LOC_ISO639_2_URL = 'https://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt';
// For ISO 639-3, SIL publishes dated files, so the filename changes with each
// update (e.g. iso-639-3_20240123.tab). The current link can be found at
// https://iso639-3.sil.org/code_tables/download_tables ("Complete Code Tables"
// zip, then extract the main .tab file). The URL below pins one specific
// (potentially outdated) version of the main table, which includes the
// Part1/Part2 mappings. A more robust solution would download and unzip the
// latest release.
const SIL_ISO639_3_URL = 'https://iso639-3.sil.org/sites/iso639-3/files/downloads/iso-639-3_20240701.tab'; // Example: replace with current
const LOC_ISO639_5_URL = 'https://www.loc.gov/standards/iso639-5/iso639-5.tsv'; // TSV format
const IANA_LANGUAGE_SUBTAG_REGISTRY_URL = 'https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry';

/**
 * Downloads `url` and returns the response body as text.
 *
 * @param {string} url - Resource to download.
 * @param {string} label - Human-readable source name used in the error message.
 * @returns {Promise<string>} The response body.
 * @throws {Error} When the response status is not OK.
 */
async function fetchText(url, label) {
  const response = await fetch(url);
  if (!response.ok) {
    throw new Error(`Failed to fetch ${label}: ${response.statusText}`);
  }
  return response.text();
}

/**
 * Splits a text blob into lines, accepting both LF and CRLF line endings.
 *
 * @param {string} text - Raw file content.
 * @returns {string[]} The lines of the trimmed text.
 */
function splitLines(text) {
  return text.trim().split(/\r?\n/);
}

/**
 * Fetches the LOC ISO 639-2 table and appends its alpha-3 bibliographic,
 * alpha-3 terminology and alpha-2 codes to ALL_LANGUAGE_CODES.
 * Failures are logged, never thrown.
 *
 * @returns {Promise<void>}
 */
async function fetchAndParseISO639_1_2() {
  try {
    console.log('Fetching ISO 639-1 & 639-2 codes from LOC...');
    const text = await fetchText(LOC_ISO639_2_URL, 'ISO 639-1/2');

    for (const line of splitLines(text)) {
      // Format: alpha3-b|alpha3-t|alpha2|english_name|french_name
      const parts = line.split('|');
      if (parts.length < 4) {
        continue;
      }
      const [alpha3B, alpha3T, alpha2, englishName] = parts.map((part) => part.trim());

      if (alpha3B) { // Bibliographic code
        ALL_LANGUAGE_CODES.iso639_2_B.push({ code: alpha3B, name: englishName });
      }
      if (alpha3T) { // Terminology code
        ALL_LANGUAGE_CODES.iso639_2_T.push({ code: alpha3T, name: englishName });
      }
      if (alpha2) { // Alpha-2 code
        ALL_LANGUAGE_CODES.iso639_1.push({ code: alpha2, name: englishName });
      }
    }
    console.log(`Fetched ${ALL_LANGUAGE_CODES.iso639_1.length} ISO 639-1 codes.`);
    console.log(`Fetched ${ALL_LANGUAGE_CODES.iso639_2_B.length} ISO 639-2/B codes.`);
    console.log(`Fetched ${ALL_LANGUAGE_CODES.iso639_2_T.length} ISO 639-2/T codes.`);
  } catch (error) {
    console.error('Error fetching ISO 639-1/2 codes:', error.message);
  }
}

/**
 * Fetches the SIL ISO 639-3 table, appends every code to
 * ALL_LANGUAGE_CODES.iso639_3 and adds any ISO 639-1 code (Part1 column)
 * that is not yet present in ALL_LANGUAGE_CODES.iso639_1. Afterwards the
 * ISO 639-1 list is de-duplicated and sorted by code.
 *
 * Must run after fetchAndParseISO639_1_2() so entries already collected
 * from LOC are seen here and keep their names. Failures are logged, never
 * thrown.
 *
 * @returns {Promise<void>}
 */
async function fetchAndParseISO639_3() {
  try {
    console.log('Fetching ISO 639-3 codes from SIL...');
    const text = await fetchText(SIL_ISO639_3_URL, 'ISO 639-3');

    const lines = splitLines(text);
    // Expected header fields:
    // Id (3-letter code) | Part2B | Part2T | Part1 | Scope | Language_Type | Ref_Name | Comment
    const header = lines.shift().split('\t').map((cell) => cell.trim());
    const idIndex = header.indexOf('Id');
    const refNameIndex = header.indexOf('Ref_Name');
    const part1Index = header.indexOf('Part1'); // For cross-referencing ISO 639-1

    if (idIndex === -1 || refNameIndex === -1) {
      throw new Error('ISO 639-3 header format mismatch. Expected "Id" and "Ref_Name" columns.');
    }

    // Set lookup instead of scanning the ISO 639-1 array for every row.
    const knownPart1Codes = new Set(ALL_LANGUAGE_CODES.iso639_1.map((item) => item.code));

    for (const line of lines) {
      const parts = line.split('\t');
      const code = parts[idIndex]?.trim();
      const name = parts[refNameIndex]?.trim();
      const part1Code = part1Index === -1 ? undefined : parts[part1Index]?.trim();

      if (!code || !name) {
        continue;
      }
      ALL_LANGUAGE_CODES.iso639_3.push({ code, name });

      // Augment ISO 639-1 from this source: it may be more comprehensive
      // than LOC's table, where 639-1 only appears if a 639-2 code exists.
      if (part1Code && !knownPart1Codes.has(part1Code)) {
        knownPart1Codes.add(part1Code);
        ALL_LANGUAGE_CODES.iso639_1.push({ code: part1Code, name });
      }
    }
    console.log(`Fetched ${ALL_LANGUAGE_CODES.iso639_3.length} ISO 639-3 codes.`);

    // Deduplicate (last entry per code wins) and sort ISO 639-1 after
    // potential additions.
    const uniqueByCode = new Map();
    for (const item of ALL_LANGUAGE_CODES.iso639_1) {
      uniqueByCode.set(item.code, item);
    }
    ALL_LANGUAGE_CODES.iso639_1 = [...uniqueByCode.values()]
      .sort((a, b) => a.code.localeCompare(b.code));
    console.log(`Final unique ISO 639-1 count: ${ALL_LANGUAGE_CODES.iso639_1.length}.`);
  } catch (error) {
    console.error('Error fetching ISO 639-3 codes:', error.message);
    console.warn('Make sure the SIL_ISO639_3_URL is current or points to a valid .tab file.');
  }
}

/**
 * Fetches the LOC ISO 639-5 TSV (language families/groups) and appends its
 * codes to ALL_LANGUAGE_CODES.iso639_5. Failures are logged, never thrown.
 *
 * @returns {Promise<void>}
 */
async function fetchAndParseISO639_5() {
  try {
    console.log('Fetching ISO 639-5 codes from LOC...');
    const text = await fetchText(LOC_ISO639_5_URL, 'ISO 639-5');

    const lines = splitLines(text);
    lines.shift(); // Remove header line: URI Code Label_en

    for (const line of lines) {
      // URI | Code | Label_en | Label_fr ...
      const parts = line.split('\t');
      if (parts.length < 3) {
        continue;
      }
      const code = parts[1].trim();
      const name = parts[2].trim();
      if (code && name) {
        ALL_LANGUAGE_CODES.iso639_5.push({ code, name });
      }
    }
    console.log(`Fetched ${ALL_LANGUAGE_CODES.iso639_5.length} ISO 639-5 codes (language families/groups).`);
  } catch (error) {
    console.error('Error fetching ISO 639-5 codes:', error.message);
  }
}

/**
 * Fetches the IANA Language Subtag Registry and appends every record of
 * `Type: language` to ALL_LANGUAGE_CODES.bcp47_language_subtags, using the
 * first `Description:` line of the record as its name. Failures are logged,
 * never thrown.
 *
 * @returns {Promise<void>}
 */
async function fetchAndParseIANALanguageSubtags() {
  try {
    console.log('Fetching IANA Language Subtag Registry...');
    const text = await fetchText(IANA_LANGUAGE_SUBTAG_REGISTRY_URL, 'IANA registry');

    // Entries are separated by %%
    for (const entry of text.split('%%')) {
      let type = '';
      let subtag = '';
      let description = '';

      for (const line of splitLines(entry)) {
        if (line.startsWith('Type:')) {
          type = line.substring(5).trim();
        } else if (line.startsWith('Subtag:')) {
          subtag = line.substring(7).trim();
        } else if (line.startsWith('Description:') && !description) {
          // A record can carry several descriptions; for simplicity only
          // the first one is kept.
          description = line.substring(12).trim();
        }
      }

      if (type === 'language' && subtag && description) {
        ALL_LANGUAGE_CODES.bcp47_language_subtags.push({
          code: subtag,
          name: description,
        });
      }
    }
    console.log(`Fetched ${ALL_LANGUAGE_CODES.bcp47_language_subtags.length} primary language subtags from IANA.`);
  } catch (error) {
    console.error('Error fetching IANA Language Subtag Registry:', error.message);
  }
}

/**
 * Runs the LOC ISO 639-1/2 fetch and then the SIL ISO 639-3 fetch, strictly
 * in that order: the 639-3 step augments and de-duplicates the ISO 639-1
 * list, so it has to see the LOC entries first.
 *
 * @returns {Promise<void>}
 */
async function fetchISO639Parts1To3() {
  await fetchAndParseISO639_1_2();
  await fetchAndParseISO639_3();
}

/**
 * Script entry point: fetches every code list, writes the BCP 47 language
 * subtags to `bcp.json` and prints the totals plus the first 50 entries of
 * each list.
 *
 * @returns {Promise<void>}
 */
async function main() {
  console.log('Starting to fetch all language codes...\n');

  // Independent sources download in parallel; only the 639-1/2 -> 639-3
  // chain is sequential (see fetchISO639Parts1To3).
  await Promise.all([
    fetchISO639Parts1To3(),
    fetchAndParseISO639_5(),
    fetchAndParseIANALanguageSubtags(),
  ]);

  const subtags = ALL_LANGUAGE_CODES.bcp47_language_subtags;
  if (subtags.length > 0) {
    await Bun.write('bcp.json', JSON.stringify(subtags));
  } else {
    // The fetchers swallow their errors, so an empty list here means the
    // IANA download failed; do not clobber an existing bcp.json with "[]".
    console.warn('No IANA language subtags were fetched; leaving bcp.json untouched.');
  }

  console.log('\n\n--- All Fetched Language Codes ---');

  // Print counts and the first few entries of each list.
  for (const [key, codes] of Object.entries(ALL_LANGUAGE_CODES)) {
    console.log(`\n--- ${key} (Total: ${codes.length}) ---`);
    for (const entry of codes.slice(0, 50)) {
      console.log(`${entry.code}: ${entry.name}`);
    }
    if (codes.length > 50) {
      console.log('... and more ...');
    }
  }

  // ALL_LANGUAGE_CODES now holds every list; persist more of it here if needed.

  console.log('\nFetching complete.');
}

main().catch(console.error);