diff options
author | polwex <polwex@sortug.com> | 2025-06-22 09:21:58 +0700 |
---|---|---|
committer | polwex <polwex@sortug.com> | 2025-06-22 09:21:58 +0700 |
commit | b43fe0d51da9a247bf94af27898d63f79d424073 (patch) | |
tree | 87cdba61deb79f73a829c96568b124adab66f1e1 /sorsyl/lib | |
parent | c9fbdb681b77698bdf8a503cb9d13b6f0b53fd93 (diff) |
getting there
Diffstat (limited to 'sorsyl/lib')
-rw-r--r-- | sorsyl/lib/dune | 2 | ||||
-rw-r--r-- | sorsyl/lib/feature.ml | 39 | ||||
-rw-r--r-- | sorsyl/lib/ipa_table.ml | 102 | ||||
-rw-r--r-- | sorsyl/lib/ipa_tableold.ml | 85 | ||||
-rw-r--r-- | sorsyl/lib/sonority.ml | 250 | ||||
-rw-r--r-- | sorsyl/lib/sonorityold.ml | 160 | ||||
-rw-r--r-- | sorsyl/lib/sonorityold.mli (renamed from sorsyl/lib/sonority.mli) | 0 |
7 files changed, 471 insertions, 167 deletions
diff --git a/sorsyl/lib/dune b/sorsyl/lib/dune index 148997f..bba2520 100644 --- a/sorsyl/lib/dune +++ b/sorsyl/lib/dune @@ -1,3 +1,3 @@ (library (name sorsyl) - (libraries csv base stdio)) + (libraries csv base stdio base_trie)) diff --git a/sorsyl/lib/feature.ml b/sorsyl/lib/feature.ml index 280977b..f67a300 100644 --- a/sorsyl/lib/feature.ml +++ b/sorsyl/lib/feature.ml @@ -44,7 +44,9 @@ let value_of_string = function | "0" -> Zero | s -> failwith (Printf.sprintf "Invalid feature value: %s" s) -let string_of_feature = function +let string_of_value = function Plus -> "+" | Minus -> "-" | Zero -> "0" + +let feature_of_string = function | "syl" -> Syllabic | "son" -> Sonorant | "cons" -> Consonantal @@ -71,6 +73,41 @@ let string_of_feature = function | "hireg" -> HighReg | _ -> failwith "not a valid feature" +let string_of_feature = function + | Syllabic -> "syl" + | Sonorant -> "son" + | Consonantal -> "cons" + | Continuant -> "cont" + | DelayedRelease -> "delrel" + | Lateral -> "lat" + | Nasal -> "nas" + | Strident -> "strid" + | Voiced -> "voi" + | SpreadGlottis -> "sg" + | ConstrictedGlottis -> "cg" + | Anterior -> "ant" + | Coronal -> "cor" + | Distributed -> "distr" + | Labial -> "lab" + | High -> "hi" + | Low -> "lo" + | Back -> "back" + | Rounded -> "round" + | Velaric -> "velaric" + | Tense -> "tense" + | Long -> "long" + | HighTone -> "hitone" + | HighReg -> "hireg" + +let string_of_segment segment = + Base.List.fold segment ~init:"" ~f:(fun acc (feature, value) -> + let item = + Printf.sprintf "%s:%s" + (string_of_feature feature) + (string_of_value value) + in + Printf.sprintf "%s\n%s" acc item) + (** Check if a segment has a specific feature with a given value *) let has_feature (value, feature_name) segment = List.exists (fun (v, f) -> v = value && f = feature_name) segment diff --git a/sorsyl/lib/ipa_table.ml b/sorsyl/lib/ipa_table.ml index bee027a..295100d 100644 --- a/sorsyl/lib/ipa_table.ml +++ b/sorsyl/lib/ipa_table.ml @@ -1,20 +1,21 @@ -(** Type representing a segment as a set of feature specifications *) -(* an association list I guess . Use List.assoc to handle*) +(** Functional IPA table implementation without global state *) -(** Decision tree for computing sonority values *) -type bool_tree = - | Leaf of int (** Terminal node with sonority value *) - | Node of { - test : Feature.segment -> bool; (** Test function *) - t_branch : bool_tree; (** Branch to follow if test is true *) - f_branch : bool_tree; (** Branch to follow if test is false *) - } +open Base +module StringTrie = Trie.Of_string type ipa_entry = { ipa : string; features : Feature.segment } (** Type representing a row from the IPA CSV file *) -(** Storage for loaded IPA data *) -let ipa_table : (string, Feature.segment) Hashtbl.t = Hashtbl.create 1000 +type ipa_table = (string, Feature.segment, String.comparator_witness) Map.t +(** Type representing the IPA table *) + +(* type t = (string, Feature.segment, String.comparator_witness) Map.t *) + +type t = { + table : ipa_table; + trie : + (string, Feature.segment, StringTrie.Keychain.keychain_description) Trie.t; +} (** Parse a single row from the CSV file *) let parse_row (row : string list) : ipa_entry option = @@ -50,14 +51,14 @@ let parse_row (row : string list) : ipa_entry option = ] in (* Skip the header row *) - if ipa = "ipa" then None + if String.equal ipa "ipa" then None else let rec build_features names values acc = match (names, values) with | [], [] -> Some (List.rev acc) | name :: ns, value :: vs -> let fval = Feature.value_of_string value in - let fname = Feature.string_of_feature name in + let fname = Feature.feature_of_string name in build_features ns vs ((fname, fval) :: acc) | _ -> None (* Mismatched lengths *) in @@ -65,21 +66,64 @@ let parse_row (row : string list) : ipa_entry option = | Some feature_list -> Some { ipa; features = feature_list } | None -> None) -(** Load IPA data from CSV file *) -let load_csv filename = - let ic = open_in filename in +(** Load IPA data from CSV file and return the table *) +let load_csv (data_dir : string) : t = + let filename = Stdlib.Filename.concat data_dir "ipa_all.csv" in + let ic = Stdio.In_channel.create filename in let csv = Csv.of_channel ic in - try - Csv.iter - ~f:(fun row -> - match parse_row row with - | Some entry -> Hashtbl.add ipa_table entry.ipa entry.features - | None -> ()) - csv; - close_in ic - with e -> - close_in ic; - raise e + + let result = + try + let entries = + Csv.fold_left csv ~init:[] ~f:(fun acc row -> + match parse_row row with Some entry -> entry :: acc | None -> acc) + in + let table = + List.fold entries + ~init:(Map.empty (module String)) + ~f:(fun acc entry -> Map.set acc ~key:entry.ipa ~data:entry.features) + in + let alist = Map.to_alist table in + let trie = Trie.of_alist_exn StringTrie.Keychain.keychainable alist in + Stdio.In_channel.close ic; + { table; trie } + with e -> + Stdio.In_channel.close ic; + raise e + in + result (** Look up features for an IPA segment *) -let lookup_segment ipa = Hashtbl.find_opt ipa_table ipa +let lookup_segment table (ipa : string) : Feature.segment option = + Map.find table ipa + +(** Get all segments in the table *) +let all_segments table : (string * Feature.segment) list = Map.to_alist table + +(** Check if a segment exists in the table *) +let mem table (ipa : string) : bool = Map.mem table ipa + +(** Get the number of segments in the table *) +let length table : int = Map.length table + +let fts ?(_normalize = true) table ipa = Map.find table ipa + +let longest_one_seg_prefix trie word = + let rec aux trie remaining = + if String.is_empty remaining then None + else + let res = Trie.find trie remaining in + match res with + | None -> aux trie (String.drop_suffix remaining 1) + | Some _data -> Some remaining + in + aux trie word + +let ipa_segs ?(_normalize = true) data word = + let rec aux acc remaining = + match longest_one_seg_prefix data.trie remaining with + | None -> acc + | Some seg -> + aux (seg :: acc) (String.drop_prefix remaining (String.length seg)) + in + List.rev (aux [] word) diff --git a/sorsyl/lib/ipa_tableold.ml b/sorsyl/lib/ipa_tableold.ml new file mode 100644 index 0000000..eb7d3fc --- /dev/null +++ b/sorsyl/lib/ipa_tableold.ml @@ -0,0 +1,85 @@ +(** Type representing a segment as a set of feature specifications *) +(* an association list I guess . Use List.assoc to handle*) + +(** Decision tree for computing sonority values *) +type bool_tree = + | Leaf of int (** Terminal node with sonority value *) + | Node of { + test : Feature.segment -> bool; (** Test function *) + t_branch : bool_tree; (** Branch to follow if test is true *) + f_branch : bool_tree; (** Branch to follow if test is false *) + } + +type ipa_entry = { ipa : string; features : Feature.segment } +(** Type representing a row from the IPA CSV file *) + +(** Storage for loaded IPA data *) +let ipa_table : (string, Feature.segment) Hashtbl.t = Hashtbl.create 1000 + +(** Parse a single row from the CSV file *) +let parse_row (row : string list) : ipa_entry option = + match row with + | [] -> None + | ipa :: features -> ( + let feature_names = + [ + "syl"; + "son"; + "cons"; + "cont"; + "delrel"; + "lat"; + "nas"; + "strid"; + "voi"; + "sg"; + "cg"; + "ant"; + "cor"; + "distr"; + "lab"; + "hi"; + "lo"; + "back"; + "round"; + "velaric"; + "tense"; + "long"; + "hitone"; + "hireg"; + ] + in + (* Skip the header row *) + if ipa = "ipa" then None + else + let rec build_features names values acc = + match (names, values) with + | [], [] -> Some (List.rev acc) + | name :: ns, value :: vs -> + let fval = Feature.value_of_string value in + let fname = Feature.feature_of_string name in + build_features ns vs ((fname, fval) :: acc) + | _ -> None (* Mismatched lengths *) + in + match build_features feature_names features [] with + | Some feature_list -> Some { ipa; features = feature_list } + | None -> None) + +(** Load IPA data from CSV file *) +let load_csv filename = + let ic = open_in filename in + let csv = Csv.of_channel ic in + try + Csv.iter + ~f:(fun row -> + match parse_row row with + | Some entry -> Hashtbl.add ipa_table entry.ipa entry.features + | None -> ()) + csv; + close_in ic + with e -> + close_in ic; + raise e + +(** Look up features for an IPA segment *) +let lookup_segment ipa = Hashtbl.find_opt ipa_table ipa diff --git a/sorsyl/lib/sonority.ml b/sorsyl/lib/sonority.ml index 90bfa55..c47d4a0 100644 --- a/sorsyl/lib/sonority.ml +++ b/sorsyl/lib/sonority.ml @@ -1,16 +1,4 @@ -(** Sonority module for determining the sonority of phonetic segments. - - This module provides functionality to determine the sonority of IPA - (International Phonetic Alphabet) segments on a scale of 1 to 9, where: - - 9: Low vowels (most sonorous) - - 8: High vowels - - 7: Glides/approximants - - 6: Liquids - - 5: Nasals - - 4: Voiced fricatives - - 3: Voiceless fricatives - - 2: Voiced stops - - 1: Voiceless stops (least sonorous) *) +(** Functional sonority module without global state *) (** Decision tree for computing sonority values *) type bool_tree = @@ -21,140 +9,130 @@ type bool_tree = f_branch : bool_tree; (** Branch to follow if test is false *) } -(** Main Sonority module functionality *) -module Sonority = struct - (** Initialize the module by loading IPA data *) - let init data_dir = - let csv_file = Filename.concat data_dir "ipa_all.csv" in - Ipa_table.load_csv csv_file - - (** Build the decision tree for sonority calculation *) - let build_tree () = - let open Feature in - let plusSyl = test (Syllabic, Plus) in - let minusHi = test (High, Minus) in - let minusCons = test (Consonantal, Minus) in - let plusSon = test (Sonorant, Plus) in - let minusNas = test (Nasal, Minus) in - let plusCont = test (Continuant, Plus) in - let plusVoi = test (Voiced, Plus) in - - (* Build the tree bottom-up *) - let minusHi_branch = - Node - { - test = minusHi; - t_branch = Leaf 9; - (* -hi vowels = low vowels *) - f_branch = Leaf 8; - (* +hi vowels = high vowels *) - } - in - - let plusVoi1_branch = - Node - { - test = plusVoi; - t_branch = Leaf 4; - (* +voi +cont = voiced fricatives *) - f_branch = Leaf 3; - (* -voi +cont = voiceless fricatives *) - } - in - - let plusVoi2_branch = - Node - { - test = plusVoi; - t_branch = Leaf 2; - (* +voi -cont = voiced stops *) - f_branch = Leaf 1; - (* -voi -cont = voiceless stops *) - } - in - - let plusCont_branch = - Node - { - test = plusCont; - t_branch = plusVoi1_branch; - (* +cont = fricatives *) - f_branch = plusVoi2_branch; - (* -cont = stops *) - } - in - - let minusNas_branch = - Node - { - test = minusNas; - t_branch = Leaf 6; - (* -nas +son = liquids *) - f_branch = Leaf 5; - (* +nas +son = nasals *) - } - in - - let plusSon_branch = - Node - { - test = plusSon; - t_branch = minusNas_branch; - (* +son = sonorants *) - f_branch = plusCont_branch; - (* -son = obstruents *) - } - in - - let minusCons_branch = - Node - { - test = minusCons; - t_branch = Leaf 7; - (* -cons = glides *) - f_branch = plusSon_branch; - (* +cons = true consonants *) - } - in +type t = { ipa_table : Ipa_table.t; decision_tree : bool_tree } +(** Type representing a sonority calculator *) + +(** Build the decision tree for sonority calculation *) +let build_tree () = + let open Feature in + let plusSyl = test (Syllabic, Plus) in + let minusHi = test (High, Minus) in + let minusCons = test (Consonantal, Minus) in + let plusSon = test (Sonorant, Plus) in + let minusNas = test (Nasal, Minus) in + let plusCont = test (Continuant, Plus) in + let plusVoi = test (Voiced, Plus) in + + (* Build the tree bottom-up, matching the Python original exactly *) + let minusHi_branch = + Node + { + test = minusHi; + t_branch = Leaf 9; + (* -hi vowels = low vowels *) + f_branch = Leaf 8; + (* +hi vowels = high vowels *) + } + in + let plusVoi1_branch = Node { - test = plusSyl; - t_branch = minusHi_branch; - (* +syl = vowels *) - f_branch = minusCons_branch; - (* -syl = non-vowels *) + test = plusVoi; + t_branch = Leaf 4; + (* +voi +cont = voiced fricatives *) + f_branch = Leaf 3; + (* -voi +cont = voiceless fricatives *) } + in - (** Evaluate the decision tree for a segment *) - let rec eval_tree tree segment = - match tree with - | Leaf value -> value - | Node { test; t_branch; f_branch } -> - if test segment then eval_tree t_branch segment - else eval_tree f_branch segment + let plusVoi2_branch = + Node + { + test = plusVoi; + t_branch = Leaf 2; + (* +voi -cont = voiced stops *) + f_branch = Leaf 1; + (* -voi -cont = voiceless stops *) + } + in - (** The main decision tree instance *) - let sonority_tree = lazy (build_tree ()) + let plusCont_branch = + Node + { + test = plusCont; + t_branch = plusVoi1_branch; + (* +cont = fricatives *) + f_branch = plusVoi2_branch; + (* -cont = stops *) + } + in - (** Get sonority value from feature specifications *) - let sonority_from_features segment = - eval_tree (Lazy.force sonority_tree) segment + let minusNas_branch = + Node + { + test = minusNas; + t_branch = Leaf 6; + (* -nas +son = liquids *) + f_branch = Leaf 5; + (* +nas +son = nasals *) + } + in + + let plusSon_branch = + Node + { + test = plusSon; + t_branch = minusNas_branch; + (* +son = sonorants *) + f_branch = plusCont_branch; + (* -son = obstruents *) + } + in - (** Get sonority value from an IPA character *) - let sonority ipa = - match Ipa_table.lookup_segment ipa with - | Some features -> sonority_from_features features - | None -> failwith (Printf.sprintf "Unknown IPA segment: %s" ipa) -end + let minusCons_branch = + Node + { + test = minusCons; + t_branch = Leaf 7; + (* -cons = glides *) + f_branch = plusSon_branch; + (* +cons = true consonants *) + } + in + + Node + { + test = plusSyl; + t_branch = minusHi_branch; + (* +syl = vowels *) + f_branch = minusCons_branch; + (* -syl = non-vowels *) + } -(** Public interface *) +(** Create a sonority calculator from data directory *) +let create (data_dir : string) : t = + let ipa_table = Ipa_table.load_csv data_dir in + let decision_tree = build_tree () in + { ipa_table; decision_tree } -(** Initialize the sonority module with the data directory *) -let init = Sonority.init +(** Traverse the decision tree to get sonority value *) +let rec traverse_tree (tree : bool_tree) (segment : Feature.segment) : int = + match tree with + | Leaf value -> value + | Node { test; t_branch; f_branch } -> + if test segment then traverse_tree t_branch segment + else traverse_tree f_branch segment (** Get the sonority value (1-9) for an IPA character *) -let sonority = Sonority.sonority +let sonority (calc : t) (ipa : string) : int = + match Ipa_table.lookup_segment calc.ipa_table.table ipa with + | Some features -> traverse_tree calc.decision_tree features + | None -> failwith (Printf.sprintf "Unknown IPA segment: %s" ipa) (** Get the sonority value from a feature specification *) -let sonority_from_features = Sonority.sonority_from_features +let sonority_from_features (calc : t) (segment : Feature.segment) : int = + traverse_tree calc.decision_tree segment + +(** Get the underlying IPA table *) +let get_ipa_table (calc : t) : Ipa_table.t = calc.ipa_table diff --git a/sorsyl/lib/sonorityold.ml b/sorsyl/lib/sonorityold.ml new file mode 100644 index 0000000..65dd9e5 --- /dev/null +++ b/sorsyl/lib/sonorityold.ml @@ -0,0 +1,160 @@ +(** Sonority module for determining the sonority of phonetic segments. + + This module provides functionality to determine the sonority of IPA + (International Phonetic Alphabet) segments on a scale of 1 to 9, where: + - 9: Low vowels (most sonorous) + - 8: High vowels + - 7: Glides/approximants + - 6: Liquids + - 5: Nasals + - 4: Voiced fricatives + - 3: Voiceless fricatives + - 2: Voiced stops + - 1: Voiceless stops (least sonorous) *) + +(** Decision tree for computing sonority values *) +type bool_tree = + | Leaf of int (** Terminal node with sonority value *) + | Node of { + test : Feature.segment -> bool; (** Test function *) + t_branch : bool_tree; (** Branch to follow if test is true *) + f_branch : bool_tree; (** Branch to follow if test is false *) + } + +(** Main Sonority module functionality *) +module Sonority = struct + (** Initialize the module by loading IPA data *) + let init data_dir = + let csv_file = Filename.concat data_dir "ipa_all.csv" in + Ipa_tableold.load_csv csv_file + + (** Build the decision tree for sonority calculation *) + let build_tree () = + let open Feature in + let plusSyl = test (Syllabic, Plus) in + let minusHi = test (High, Minus) in + let minusCons = test (Consonantal, Minus) in + let plusSon = test (Sonorant, Plus) in + let minusNas = test (Nasal, Minus) in + let plusCont = test (Continuant, Plus) in + let plusVoi = test (Voiced, Plus) in + + (* Build the tree bottom-up *) + let minusHi_branch = + Node + { + test = minusHi; + t_branch = Leaf 9; + (* -hi vowels = low vowels *) + f_branch = Leaf 8; + (* +hi vowels = high vowels *) + } + in + + let plusVoi1_branch = + Node + { + test = plusVoi; + t_branch = Leaf 4; + (* +voi +cont = voiced fricatives *) + f_branch = Leaf 3; + (* -voi +cont = voiceless fricatives *) + } + in + + let plusVoi2_branch = + Node + { + test = plusVoi; + t_branch = Leaf 2; + (* +voi -cont = voiced stops *) + f_branch = Leaf 1; + (* -voi -cont = voiceless stops *) + } + in + + let plusCont_branch = + Node + { + test = plusCont; + t_branch = plusVoi1_branch; + (* +cont = fricatives *) + f_branch = plusVoi2_branch; + (* -cont = stops *) + } + in + + let minusNas_branch = + Node + { + test = minusNas; + t_branch = Leaf 6; + (* -nas +son = liquids *) + f_branch = Leaf 5; + (* +nas +son = nasals *) + } + in + + let plusSon_branch = + Node + { + test = plusSon; + t_branch = minusNas_branch; + (* +son = sonorants *) + f_branch = plusCont_branch; + (* -son = obstruents *) + } + in + + let minusCons_branch = + Node + { + test = minusCons; + t_branch = Leaf 7; + (* -cons = glides *) + f_branch = plusSon_branch; + (* +cons = true consonants *) + } + in + + Node + { + test = plusSyl; + t_branch = minusHi_branch; + (* +syl = vowels *) + f_branch = minusCons_branch; + (* -syl = non-vowels *) + } + + (** Evaluate the decision tree for a segment *) + let rec eval_tree tree segment = + match tree with + | Leaf value -> value + | Node { test; t_branch; f_branch } -> + if test segment then eval_tree t_branch segment + else eval_tree f_branch segment + + (** The main decision tree instance *) + let sonority_tree = lazy (build_tree ()) + + (** Get sonority value from feature specifications *) + let sonority_from_features segment = + eval_tree (Lazy.force sonority_tree) segment + + (** Get sonority value from an IPA character *) + let sonority ipa = + match Ipa_tableold.lookup_segment ipa with + | Some features -> sonority_from_features features + | None -> failwith (Printf.sprintf "Unknown IPA segment: %s" ipa) +end + +(** Public interface *) + +(** Initialize the sonority module with the data directory *) +let init = Sonority.init + +(** Get the sonority value (1-9) for an IPA character *) +let sonority = Sonority.sonority + +(** Get the sonority value from a feature specification *) +let sonority_from_features = Sonority.sonority_from_features diff --git a/sorsyl/lib/sonority.mli b/sorsyl/lib/sonorityold.mli index 3e9166e..3e9166e 100644 --- a/sorsyl/lib/sonority.mli +++ b/sorsyl/lib/sonorityold.mli |