blob: 3ecbda4f294df56f1d0f7e493a31266b8f81d95c (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
|
(** Functional IPA table implementation without global state *)
open Base
module StringTrie = Trie.Of_string
type ipa_entry = { ipa : string; features : Feature.segment }
(** Type representing a row from the IPA CSV file *)
type ipa_table = (string, Feature.segment, String.comparator_witness) Map.t
(** Type representing the IPA table *)
(* type t = (string, Feature.segment, String.comparator_witness) Map.t *)
type t = {
table : ipa_table;
trie :
(string, Feature.segment, StringTrie.Keychain.keychain_description) Trie.t;
}
(** Parse a single row from the CSV file *)
let parse_row (row : string list) : ipa_entry option =
match row with
| [] -> None
| ipa :: features -> (
let feature_names =
[
"syl";
"son";
"cons";
"cont";
"delrel";
"lat";
"nas";
"strid";
"voi";
"sg";
"cg";
"ant";
"cor";
"distr";
"lab";
"hi";
"lo";
"back";
"round";
"velaric";
"tense";
"long";
"hitone";
"hireg";
]
in
(* Skip the header row *)
if String.equal ipa "ipa" then None
else
let rec build_features names values acc =
match (names, values) with
| [], [] -> Some (List.rev acc)
| name :: ns, value :: vs ->
let fval = Feature.value_of_string value in
let fname = Feature.feature_of_string name in
build_features ns vs ((fname, fval) :: acc)
| _ -> None (* Mismatched lengths *)
in
match build_features feature_names features [] with
| Some feature_list -> Some { ipa; features = feature_list }
| None -> None)
(** Load IPA data from CSV file and return the table *)
let load_csv (data_dir : string) : t =
let filename = Stdlib.Filename.concat data_dir "ipa_all.csv" in
let ic = Stdio.In_channel.create filename in
let csv = Csv.of_channel ic in
let result =
try
let entries =
Csv.fold_left csv ~init:[] ~f:(fun acc row ->
match parse_row row with Some entry -> entry :: acc | None -> acc)
in
let table =
List.fold entries
~init:(Map.empty (module String))
~f:(fun acc entry -> Map.set acc ~key:entry.ipa ~data:entry.features)
in
let alist = Map.to_alist table in
let trie = Trie.of_alist_exn StringTrie.Keychain.keychainable alist in
Stdio.In_channel.close ic;
{ table; trie }
with e ->
Stdio.In_channel.close ic;
raise e
in
result
(** Look up features for an IPA segment *)
let lookup_segment table (ipa : string) : Feature.segment option =
Map.find table ipa
(** Get all segments in the table *)
let all_segments table : (string * Feature.segment) list = Map.to_alist table
(** Check if a segment exists in the table *)
let mem table (ipa : string) : bool = Map.mem table ipa
(** Get the number of segments in the table *)
let length table : int = Map.length table
let fts ?(_normalize = true) table ipa = Map.find table ipa
let longest_one_seg_prefix trie word =
let rec aux trie remaining =
if String.is_empty remaining then None
else
let res = Trie.find trie remaining in
match res with
| None -> aux trie (String.drop_suffix remaining 1)
| Some _data -> Some remaining
in
aux trie word
let ipa_segs ?(_normalize = true) data word =
let rec aux acc remaining =
if String.is_empty remaining then acc
else
match longest_one_seg_prefix data.trie remaining with
| None -> aux acc (String.drop_prefix remaining 1)
| Some seg ->
aux (seg :: acc) (String.drop_prefix remaining (String.length seg))
in
List.rev (aux [] word)
|