blob: 295100d00d06c06fbe71ac70651bff92d4fa5c0d (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
|
(** Functional IPA table implementation without global state *)
open Base
module StringTrie = Trie.Of_string
type ipa_entry = { ipa : string; features : Feature.segment }
(** Type representing a row from the IPA CSV file *)
type ipa_table = (string, Feature.segment, String.comparator_witness) Map.t
(** Type representing the IPA table *)
(* type t = (string, Feature.segment, String.comparator_witness) Map.t *)
type t = {
table : ipa_table;
trie :
(string, Feature.segment, StringTrie.Keychain.keychain_description) Trie.t;
}
(** Parse a single row from the CSV file *)
let parse_row (row : string list) : ipa_entry option =
match row with
| [] -> None
| ipa :: features -> (
let feature_names =
[
"syl";
"son";
"cons";
"cont";
"delrel";
"lat";
"nas";
"strid";
"voi";
"sg";
"cg";
"ant";
"cor";
"distr";
"lab";
"hi";
"lo";
"back";
"round";
"velaric";
"tense";
"long";
"hitone";
"hireg";
]
in
(* Skip the header row *)
if String.equal ipa "ipa" then None
else
let rec build_features names values acc =
match (names, values) with
| [], [] -> Some (List.rev acc)
| name :: ns, value :: vs ->
let fval = Feature.value_of_string value in
let fname = Feature.feature_of_string name in
build_features ns vs ((fname, fval) :: acc)
| _ -> None (* Mismatched lengths *)
in
match build_features feature_names features [] with
| Some feature_list -> Some { ipa; features = feature_list }
| None -> None)
(** Load IPA data from CSV file and return the table *)
let load_csv (data_dir : string) : t =
let filename = Stdlib.Filename.concat data_dir "ipa_all.csv" in
let ic = Stdio.In_channel.create filename in
let csv = Csv.of_channel ic in
let result =
try
let entries =
Csv.fold_left csv ~init:[] ~f:(fun acc row ->
match parse_row row with Some entry -> entry :: acc | None -> acc)
in
let table =
List.fold entries
~init:(Map.empty (module String))
~f:(fun acc entry -> Map.set acc ~key:entry.ipa ~data:entry.features)
in
let alist = Map.to_alist table in
let trie = Trie.of_alist_exn StringTrie.Keychain.keychainable alist in
Stdio.In_channel.close ic;
{ table; trie }
with e ->
Stdio.In_channel.close ic;
raise e
in
result
(** Look up features for an IPA segment *)
let lookup_segment table (ipa : string) : Feature.segment option =
Map.find table ipa
(** Get all segments in the table *)
let all_segments table : (string * Feature.segment) list = Map.to_alist table
(** Check if a segment exists in the table *)
let mem table (ipa : string) : bool = Map.mem table ipa
(** Get the number of segments in the table *)
let length table : int = Map.length table
let fts ?(_normalize = true) table ipa = Map.find table ipa
let longest_one_seg_prefix trie word =
let rec aux trie remaining =
if String.is_empty remaining then None
else
let res = Trie.find trie remaining in
match res with
| None -> aux trie (String.drop_suffix remaining 1)
| Some _data -> Some remaining
in
aux trie word
let ipa_segs ?(_normalize = true) data word =
let rec aux acc remaining =
match longest_one_seg_prefix data.trie remaining with
| None -> acc
| Some seg ->
aux (seg :: acc) (String.drop_prefix remaining (String.length seg))
in
List.rev (aux [] word)
|