diff options
Diffstat (limited to 'sorsyl/lib/ipa_table.ml')
-rw-r--r-- | sorsyl/lib/ipa_table.ml | 102 |
1 files changed, 73 insertions, 29 deletions
diff --git a/sorsyl/lib/ipa_table.ml b/sorsyl/lib/ipa_table.ml index bee027a..295100d 100644 --- a/sorsyl/lib/ipa_table.ml +++ b/sorsyl/lib/ipa_table.ml @@ -1,20 +1,21 @@ -(** Type representing a segment as a set of feature specifications *) -(* an association list I guess . Use List.assoc to handle*) +(** Functional IPA table implementation without global state *) -(** Decision tree for computing sonority values *) -type bool_tree = - | Leaf of int (** Terminal node with sonority value *) - | Node of { - test : Feature.segment -> bool; (** Test function *) - t_branch : bool_tree; (** Branch to follow if test is true *) - f_branch : bool_tree; (** Branch to follow if test is false *) - } +open Base +module StringTrie = Trie.Of_string type ipa_entry = { ipa : string; features : Feature.segment } (** Type representing a row from the IPA CSV file *) -(** Storage for loaded IPA data *) -let ipa_table : (string, Feature.segment) Hashtbl.t = Hashtbl.create 1000 +type ipa_table = (string, Feature.segment, String.comparator_witness) Map.t +(** Type representing the IPA table *) + +(* type t = (string, Feature.segment, String.comparator_witness) Map.t *) + +type t = { + table : ipa_table; + trie : + (string, Feature.segment, StringTrie.Keychain.keychain_description) Trie.t; +} (** Parse a single row from the CSV file *) let parse_row (row : string list) : ipa_entry option = @@ -50,14 +51,14 @@ let parse_row (row : string list) : ipa_entry option = ] in (* Skip the header row *) - if ipa = "ipa" then None + if String.equal ipa "ipa" then None else let rec build_features names values acc = match (names, values) with | [], [] -> Some (List.rev acc) | name :: ns, value :: vs -> let fval = Feature.value_of_string value in - let fname = Feature.string_of_feature name in + let fname = Feature.feature_of_string name in build_features ns vs ((fname, fval) :: acc) | _ -> None (* Mismatched lengths *) in @@ -65,21 +66,64 @@ let parse_row (row : string list) : ipa_entry option = | Some feature_list -> Some { ipa; features = feature_list } | None -> None) -(** Load IPA data from CSV file *) -let load_csv filename = - let ic = open_in filename in +(** Load IPA data from CSV file and return the table *) +let load_csv (data_dir : string) : t = + let filename = Stdlib.Filename.concat data_dir "ipa_all.csv" in + let ic = Stdio.In_channel.create filename in let csv = Csv.of_channel ic in - try - Csv.iter - ~f:(fun row -> - match parse_row row with - | Some entry -> Hashtbl.add ipa_table entry.ipa entry.features - | None -> ()) - csv; - close_in ic - with e -> - close_in ic; - raise e + + let result = + try + let entries = + Csv.fold_left csv ~init:[] ~f:(fun acc row -> + match parse_row row with Some entry -> entry :: acc | None -> acc) + in + let table = + List.fold entries + ~init:(Map.empty (module String)) + ~f:(fun acc entry -> Map.set acc ~key:entry.ipa ~data:entry.features) + in + let alist = Map.to_alist table in + let trie = Trie.of_alist_exn StringTrie.Keychain.keychainable alist in + Stdio.In_channel.close ic; + { table; trie } + with e -> + Stdio.In_channel.close ic; + raise e + in + result (** Look up features for an IPA segment *) -let lookup_segment ipa = Hashtbl.find_opt ipa_table ipa +let lookup_segment table (ipa : string) : Feature.segment option = + Map.find table ipa + +(** Get all segments in the table *) +let all_segments table : (string * Feature.segment) list = Map.to_alist table + +(** Check if a segment exists in the table *) +let mem table (ipa : string) : bool = Map.mem table ipa + +(** Get the number of segments in the table *) +let length table : int = Map.length table + +let fts ?(_normalize = true) table ipa = Map.find table ipa + +let longest_one_seg_prefix trie word = + let rec aux trie remaining = + if String.is_empty remaining then None + else + let res = Trie.find trie remaining in + match res with + | None -> aux trie (String.drop_suffix remaining 1) + | Some _data -> Some remaining + in + aux trie word + +let ipa_segs ?(_normalize = true) data word = + let rec aux acc remaining = + match longest_one_seg_prefix data.trie remaining with + | None -> acc + | Some seg -> + aux (seg :: acc) (String.drop_prefix remaining (String.length seg)) + in + List.rev (aux [] word) |