summary | refs | log | tree | commit | diff
path: root/src/lib/db/codes.js
diff options
context:
space:
mode:
Diffstat (limited to 'src/lib/db/codes.js')
-rw-r--r-- src/lib/db/codes.js 203
1 files changed, 203 insertions, 0 deletions
diff --git a/src/lib/db/codes.js b/src/lib/db/codes.js
new file mode 100644
index 0000000..bef3e1b
--- /dev/null
+++ b/src/lib/db/codes.js
@@ -0,0 +1,203 @@
+
+
+// Accumulator filled in by the fetchAndParse* loaders in this file.
+// Every list holds plain { code, name } objects.
+const ALL_LANGUAGE_CODES = {
+ iso639_1: [], // Two-letter codes; also augmented, de-duplicated and sorted by the ISO 639-3 loader
+ iso639_2_T: [], // Terminology codes
+ iso639_2_B: [], // Bibliographic codes
+ iso639_3: [],
+ iso639_5: [], // Language families/groups
+ bcp47_language_subtags: [], // Primary language subtags from IANA
+};
+
+// Library of Congress ISO 639-2 table; parsed in this file as pipe-delimited text.
+const LOC_ISO639_2_URL = 'https://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt';
+// SIL publishes the ISO 639-3 table as dated files, so the filename changes with
+// every release (e.g. iso-639-3_20240123.tab). The URL below is pinned to one
+// specific release and may go stale. To refresh it, open
+// https://iso639-3.sil.org/code_tables/download_tables, take the current
+// "Complete Code Tables" download and point this constant at its main .tab file.
+// A more robust approach would download and unzip the latest archive automatically.
+// The file is parsed here as tab-separated text with a header row.
+const SIL_ISO639_3_URL = 'https://iso639-3.sil.org/sites/iso639-3/files/downloads/iso-639-3_20240701.tab'; // Pinned release: replace with the current one
+const LOC_ISO639_5_URL = 'https://www.loc.gov/standards/iso639-5/iso639-5.tsv'; // TSV format
+// IANA registry; parsed in this file as records separated by "%%".
+const IANA_LANGUAGE_SUBTAG_REGISTRY_URL = 'https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry';
+
+async function fetchAndParseISO639_1_2() {
+ try {
+ console.log('Fetching ISO 639-1 & 639-2 codes from LOC...');
+ const response = await fetch(LOC_ISO639_2_URL);
+ if (!response.ok) throw new Error(`Failed to fetch ISO 639-1/2: ${response.statusText}`);
+ const text = await response.text();
+
+ const lines = text.trim().split('\n');
+ lines.forEach(line => {
+ // Format: alpha3-b|alpha3-t|alpha2|english_name|french_name
+ const parts = line.split('|');
+ if (parts.length >= 4) {
+ const alpha3_b = parts[0].trim();
+ const alpha3_t = parts[1].trim();
+ const alpha2 = parts[2].trim();
+ const englishName = parts[3].trim();
+
+ if (alpha3_b) { // Bibliographic code
+ ALL_LANGUAGE_CODES.iso639_2_B.push({ code: alpha3_b, name: englishName });
+ }
+ if (alpha3_t) { // Terminology code
+ ALL_LANGUAGE_CODES.iso639_2_T.push({ code: alpha3_t, name: englishName });
+ }
+ if (alpha2) { // Alpha-2 code
+ ALL_LANGUAGE_CODES.iso639_1.push({ code: alpha2, name: englishName });
+ }
+ }
+ });
+ console.log(`Fetched ${ALL_LANGUAGE_CODES.iso639_1.length} ISO 639-1 codes.`);
+ console.log(`Fetched ${ALL_LANGUAGE_CODES.iso639_2_B.length} ISO 639-2/B codes.`);
+ console.log(`Fetched ${ALL_LANGUAGE_CODES.iso639_2_T.length} ISO 639-2/T codes.`);
+ } catch (error) {
+ console.error('Error fetching ISO 639-1/2 codes:', error.message);
+ }
+}
+
+async function fetchAndParseISO639_3() {
+ try {
+ console.log('Fetching ISO 639-3 codes from SIL...');
+ const response = await fetch(SIL_ISO639_3_URL);
+ if (!response.ok) throw new Error(`Failed to fetch ISO 639-3: ${response.statusText}`);
+ const text = await response.text();
+
+ const lines = text.trim().split('\n');
+ const header = lines.shift().split('\t'); // Remove header line
+ // Expected header fields (order matters):
+ // Id (3-letter code) | Part2B | Part2T | Part1 | Scope | Language_Type | Ref_Name | Comment
+ const idIndex = header.indexOf('Id');
+ const refNameIndex = header.indexOf('Ref_Name');
+ const part1Index = header.indexOf('Part1'); // For cross-referencing ISO 639-1
+
+ if (idIndex === -1 || refNameIndex === -1) {
+ throw new Error('ISO 639-3 header format mismatch. Expected "Id" and "Ref_Name" columns.');
+ }
+
+ lines.forEach(line => {
+ const parts = line.split('\t');
+ const code = parts[idIndex]?.trim();
+ const name = parts[refNameIndex]?.trim();
+ const part1Code = parts[part1Index]?.trim();
+
+ if (code && name) {
+ ALL_LANGUAGE_CODES.iso639_3.push({ code, name });
+
+ // Also, let's try to get more complete ISO 639-1 from this source
+ // as it might be more comprehensive than LOC's where 639-1 is only if 639-2 exists
+ if (part1Code && !ALL_LANGUAGE_CODES.iso639_1.find(c => c.code === part1Code)) {
+ ALL_LANGUAGE_CODES.iso639_1.push({ code: part1Code, name });
+ }
+ }
+ });
+ console.log(`Fetched ${ALL_LANGUAGE_CODES.iso639_3.length} ISO 639-3 codes.`);
+ // Deduplicate and sort ISO 639-1 after potential additions
+ const uniqueIso639_1 = {};
+ ALL_LANGUAGE_CODES.iso639_1.forEach(item => uniqueIso639_1[item.code] = item);
+ ALL_LANGUAGE_CODES.iso639_1 = Object.values(uniqueIso639_1).sort((a, b) => a.code.localeCompare(b.code));
+ console.log(`Final unique ISO 639-1 count: ${ALL_LANGUAGE_CODES.iso639_1.length}.`);
+
+ } catch (error) {
+ console.error('Error fetching ISO 639-3 codes:', error.message);
+ console.warn('Make sure the SIL_ISO639_3_URL is current or points to a valid .tab file.');
+ }
+}
+
+async function fetchAndParseISO639_5() {
+ try {
+ console.log('Fetching ISO 639-5 codes from LOC...');
+ const response = await fetch(LOC_ISO639_5_URL);
+ if (!response.ok) throw new Error(`Failed to fetch ISO 639-5: ${response.statusText}`);
+ const text = await response.text();
+
+ const lines = text.trim().split('\n');
+ lines.shift(); // Remove header line: URI Code Label_en
+
+ lines.forEach(line => {
+ const parts = line.split('\t');
+ // URI | Code | Label_en | Label_fr ...
+ if (parts.length >= 3) {
+ const code = parts[1].trim();
+ const name = parts[2].trim();
+ if (code && name) {
+ ALL_LANGUAGE_CODES.iso639_5.push({ code, name });
+ }
+ }
+ });
+ console.log(`Fetched ${ALL_LANGUAGE_CODES.iso639_5.length} ISO 639-5 codes (language families/groups).`);
+ } catch (error) {
+ console.error('Error fetching ISO 639-5 codes:', error.message);
+ }
+}
+
+async function fetchAndParseIANALanguageSubtags() {
+ try {
+ console.log('Fetching IANA Language Subtag Registry...');
+ const response = await fetch(IANA_LANGUAGE_SUBTAG_REGISTRY_URL);
+ if (!response.ok) throw new Error(`Failed to fetch IANA registry: ${response.statusText}`);
+ const text = await response.text();
+
+ const entries = text.split('%%'); // Entries are separated by %%
+ entries.forEach(entry => {
+ const lines = entry.trim().split('\n');
+ let type = '';
+ let subtag = '';
+ let description = '';
+
+ lines.forEach(line => {
+ if (line.startsWith('Type:')) {
+ type = line.substring(5).trim();
+ } else if (line.startsWith('Subtag:')) {
+ subtag = line.substring(7).trim();
+ } else if (line.startsWith('Description:')) {
+ // Description can span multiple lines, but for simplicity, we take the first
+ if (!description) description = line.substring(12).trim();
+ }
+ });
+
+ if (type === 'language' && subtag && description) {
+ ALL_LANGUAGE_CODES.bcp47_language_subtags.push({
+ code: subtag,
+ name: description
+ });
+ }
+ });
+ console.log(`Fetched ${ALL_LANGUAGE_CODES.bcp47_language_subtags.length} primary language subtags from IANA.`);
+ } catch (error) {
+ console.error('Error fetching IANA Language Subtag Registry:', error.message);
+ }
+}
+
+
+async function main() {
+ console.log('Starting to fetch all language codes...\n');
+
+ await Promise.all([
+ fetchAndParseISO639_1_2(),
+ fetchAndParseISO639_3(), // Run this after 1_2 to potentially augment 639-1
+ fetchAndParseISO639_5(),
+ fetchAndParseIANALanguageSubtags()
+ ]);
+ await Bun.write("bcp.json", JSON.stringify(ALL_LANGUAGE_CODES.bcp47_language_subtags))
+
+ console.log('\n\n--- All Fetched Language Codes ---');
+
+ // Example: Print counts and first few of each
+ for (const [key, codes] of Object.entries(ALL_LANGUAGE_CODES)) {
+ console.log(`\n--- ${key} (Total: ${codes.length}) ---`);
+ codes.slice(0, 50).forEach(c => console.log(`${c.code}: ${c.name}`));
+ if (codes.length > 50) console.log('... and more ...');
+ }
+
+ // You can now use ALL_LANGUAGE_CODES object for your needs
+ // e.g., save to a JSON file
+ // import fs from 'fs';
+ // fs.writeFileSync('all_language_codes.json', JSON.stringify(ALL_LANGUAGE_CODES, null, 2));
+ // console.log('\n\nSaved all codes to all_language_codes.json');
+
+ console.log('\nFetching complete.');
+}
+
+main().catch(console.error);