summaryrefslogtreecommitdiff
path: root/sorsyl/lib/ipa_table.ml
blob: 3ecbda4f294df56f1d0f7e493a31266b8f81d95c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
(** Functional IPA table implementation without global state *)

open Base
module StringTrie = Trie.Of_string

type ipa_entry = { ipa : string; features : Feature.segment }
(** Type representing a row from the IPA CSV file *)

type ipa_table = (string, Feature.segment, String.comparator_witness) Map.t
(** Type representing the IPA table *)

(* type t = (string, Feature.segment, String.comparator_witness) Map.t *)

type t = {
  table : ipa_table;
  trie :
    (string, Feature.segment, StringTrie.Keychain.keychain_description) Trie.t;
}

(** Parse a single row from the CSV file *)
let parse_row (row : string list) : ipa_entry option =
  match row with
  | [] -> None
  | ipa :: features -> (
      let feature_names =
        [
          "syl";
          "son";
          "cons";
          "cont";
          "delrel";
          "lat";
          "nas";
          "strid";
          "voi";
          "sg";
          "cg";
          "ant";
          "cor";
          "distr";
          "lab";
          "hi";
          "lo";
          "back";
          "round";
          "velaric";
          "tense";
          "long";
          "hitone";
          "hireg";
        ]
      in
      (* Skip the header row *)
      if String.equal ipa "ipa" then None
      else
        let rec build_features names values acc =
          match (names, values) with
          | [], [] -> Some (List.rev acc)
          | name :: ns, value :: vs ->
              let fval = Feature.value_of_string value in
              let fname = Feature.feature_of_string name in
              build_features ns vs ((fname, fval) :: acc)
          | _ -> None (* Mismatched lengths *)
        in
        match build_features feature_names features [] with
        | Some feature_list -> Some { ipa; features = feature_list }
        | None -> None)

(** Load IPA data from CSV file and return the table *)
let load_csv (data_dir : string) : t =
  let filename = Stdlib.Filename.concat data_dir "ipa_all.csv" in
  let ic = Stdio.In_channel.create filename in
  let csv = Csv.of_channel ic in

  let result =
    try
      let entries =
        Csv.fold_left csv ~init:[] ~f:(fun acc row ->
            match parse_row row with Some entry -> entry :: acc | None -> acc)
      in
      let table =
        List.fold entries
          ~init:(Map.empty (module String))
          ~f:(fun acc entry -> Map.set acc ~key:entry.ipa ~data:entry.features)
      in
      let alist = Map.to_alist table in
      let trie = Trie.of_alist_exn StringTrie.Keychain.keychainable alist in
      Stdio.In_channel.close ic;
      { table; trie }
    with e ->
      Stdio.In_channel.close ic;
      raise e
  in
  result

(** Look up features for an IPA segment *)
let lookup_segment table (ipa : string) : Feature.segment option =
  Map.find table ipa

(** Get all segments in the table *)
let all_segments table : (string * Feature.segment) list = Map.to_alist table

(** Check if a segment exists in the table *)
let mem table (ipa : string) : bool = Map.mem table ipa

(** Get the number of segments in the table *)
let length table : int = Map.length table

let fts ?(_normalize = true) table ipa = Map.find table ipa

let longest_one_seg_prefix trie word =
  let rec aux trie remaining =
    if String.is_empty remaining then None
    else
      let res = Trie.find trie remaining in
      match res with
      | None -> aux trie (String.drop_suffix remaining 1)
      | Some _data -> Some remaining
  in
  aux trie word

let ipa_segs ?(_normalize = true) data word =
  let rec aux acc remaining =
    if String.is_empty remaining then acc
    else
      match longest_one_seg_prefix data.trie remaining with
      | None -> aux acc (String.drop_prefix remaining 1)
      | Some seg ->
          aux (seg :: acc) (String.drop_prefix remaining (String.length seg))
  in
  List.rev (aux [] word)