summaryrefslogtreecommitdiff
path: root/sorsyl/lib/ipa_table.ml
blob: 295100d00d06c06fbe71ac70651bff92d4fa5c0d (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
(** Functional IPA table implementation without global state *)

open Base
module StringTrie = Trie.Of_string

type ipa_entry = { ipa : string; features : Feature.segment }
(** Type representing a row from the IPA CSV file *)

type ipa_table = (string, Feature.segment, String.comparator_witness) Map.t
(** Type representing the IPA table *)

(* type t = (string, Feature.segment, String.comparator_witness) Map.t *)

type t = {
  table : ipa_table;
  trie :
    (string, Feature.segment, StringTrie.Keychain.keychain_description) Trie.t;
}

(** Parse a single row from the CSV file *)
let parse_row (row : string list) : ipa_entry option =
  match row with
  | [] -> None
  | ipa :: features -> (
      let feature_names =
        [
          "syl";
          "son";
          "cons";
          "cont";
          "delrel";
          "lat";
          "nas";
          "strid";
          "voi";
          "sg";
          "cg";
          "ant";
          "cor";
          "distr";
          "lab";
          "hi";
          "lo";
          "back";
          "round";
          "velaric";
          "tense";
          "long";
          "hitone";
          "hireg";
        ]
      in
      (* Skip the header row *)
      if String.equal ipa "ipa" then None
      else
        let rec build_features names values acc =
          match (names, values) with
          | [], [] -> Some (List.rev acc)
          | name :: ns, value :: vs ->
              let fval = Feature.value_of_string value in
              let fname = Feature.feature_of_string name in
              build_features ns vs ((fname, fval) :: acc)
          | _ -> None (* Mismatched lengths *)
        in
        match build_features feature_names features [] with
        | Some feature_list -> Some { ipa; features = feature_list }
        | None -> None)

(** Load IPA data from CSV file and return the table *)
let load_csv (data_dir : string) : t =
  let filename = Stdlib.Filename.concat data_dir "ipa_all.csv" in
  let ic = Stdio.In_channel.create filename in
  let csv = Csv.of_channel ic in

  let result =
    try
      let entries =
        Csv.fold_left csv ~init:[] ~f:(fun acc row ->
            match parse_row row with Some entry -> entry :: acc | None -> acc)
      in
      let table =
        List.fold entries
          ~init:(Map.empty (module String))
          ~f:(fun acc entry -> Map.set acc ~key:entry.ipa ~data:entry.features)
      in
      let alist = Map.to_alist table in
      let trie = Trie.of_alist_exn StringTrie.Keychain.keychainable alist in
      Stdio.In_channel.close ic;
      { table; trie }
    with e ->
      Stdio.In_channel.close ic;
      raise e
  in
  result

(** Look up features for an IPA segment *)
let lookup_segment table (ipa : string) : Feature.segment option =
  Map.find table ipa

(** Get all segments in the table *)
let all_segments table : (string * Feature.segment) list = Map.to_alist table

(** Check if a segment exists in the table *)
let mem table (ipa : string) : bool = Map.mem table ipa

(** Get the number of segments in the table *)
let length table : int = Map.length table

let fts ?(_normalize = true) table ipa = Map.find table ipa

let longest_one_seg_prefix trie word =
  let rec aux trie remaining =
    if String.is_empty remaining then None
    else
      let res = Trie.find trie remaining in
      match res with
      | None -> aux trie (String.drop_suffix remaining 1)
      | Some _data -> Some remaining
  in
  aux trie word

let ipa_segs ?(_normalize = true) data word =
  let rec aux acc remaining =
    match longest_one_seg_prefix data.trie remaining with
    | None -> acc
    | Some seg ->
        aux (seg :: acc) (String.drop_prefix remaining (String.length seg))
  in
  List.rev (aux [] word)