diff options
| field | value | date |
|---|---|---|
| author | polwex <polwex@sortug.com> | 2025-06-22 09:21:58 +0700 |
| committer | polwex <polwex@sortug.com> | 2025-06-22 09:21:58 +0700 |
| commit | b43fe0d51da9a247bf94af27898d63f79d424073 (patch) | |
| tree | 87cdba61deb79f73a829c96568b124adab66f1e1 | |
| parent | c9fbdb681b77698bdf8a503cb9d13b6f0b53fd93 (diff) | |

getting there

| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | sorsyl/dune-project | 2 |
| -rw-r--r-- | sorsyl/lib/dune | 2 |
| -rw-r--r-- | sorsyl/lib/feature.ml | 39 |
| -rw-r--r-- | sorsyl/lib/ipa_table.ml | 102 |
| -rw-r--r-- | sorsyl/lib/ipa_tableold.ml | 85 |
| -rw-r--r-- | sorsyl/lib/sonority.ml | 250 |
| -rw-r--r-- | sorsyl/lib/sonorityold.ml | 160 |
| -rw-r--r-- | sorsyl/lib/sonorityold.mli (renamed from sorsyl/lib/sonority.mli) | 0 |
| -rw-r--r-- | sorsyl/sorsyl.opam | 3 |
| -rw-r--r-- | sorsyl/test/dune | 8 |
| -rw-r--r-- | sorsyl/test/test_sonority.ml | 60 |
| -rw-r--r-- | sorsyl/test/test_sonorityold.ml | 133 |
| -rw-r--r-- | sorsyl/test/test_table.ml | 208 |

13 files changed, 863 insertions, 189 deletions
diff --git a/sorsyl/dune-project b/sorsyl/dune-project index fe23f5d..6651317 100644 --- a/sorsyl/dune-project +++ b/sorsyl/dune-project @@ -19,7 +19,7 @@ (name sorsyl) (synopsis "A short synopsis") (description "A longer description") - (depends ocaml csv) + (depends ocaml base base_trie csv uunf) (tags ("add topics" "to describe" your project))) diff --git a/sorsyl/lib/dune b/sorsyl/lib/dune index 148997f..bba2520 100644 --- a/sorsyl/lib/dune +++ b/sorsyl/lib/dune @@ -1,3 +1,3 @@ (library (name sorsyl) - (libraries csv base stdio)) + (libraries csv base stdio base_trie)) diff --git a/sorsyl/lib/feature.ml b/sorsyl/lib/feature.ml index 280977b..f67a300 100644 --- a/sorsyl/lib/feature.ml +++ b/sorsyl/lib/feature.ml @@ -44,7 +44,9 @@ let value_of_string = function | "0" -> Zero | s -> failwith (Printf.sprintf "Invalid feature value: %s" s) -let string_of_feature = function +let string_of_value = function Plus -> "+" | Minus -> "-" | Zero -> "0" + +let feature_of_string = function | "syl" -> Syllabic | "son" -> Sonorant | "cons" -> Consonantal @@ -71,6 +73,41 @@ let string_of_feature = function | "hireg" -> HighReg | _ -> failwith "not a valid feature" +let string_of_feature = function + | Syllabic -> "syl" + | Sonorant -> "son" + | Consonantal -> "cons" + | Continuant -> "cont" + | DelayedRelease -> "delrel" + | Lateral -> "lat" + | Nasal -> "nas" + | Strident -> "strid" + | Voiced -> "voi" + | SpreadGlottis -> "sg" + | ConstrictedGlottis -> "cg" + | Anterior -> "ant" + | Coronal -> "cor" + | Distributed -> "distr" + | Labial -> "lab" + | High -> "hi" + | Low -> "lo" + | Back -> "back" + | Rounded -> "round" + | Velaric -> "velaric" + | Tense -> "tense" + | Long -> "long" + | HighTone -> "hitone" + | HighReg -> "hireg" + +let string_of_segment segment = + Base.List.fold segment ~init:"" ~f:(fun acc (feature, value) -> + let item = + Printf.sprintf "%s:%s" + (string_of_feature feature) + (string_of_value value) + in + Printf.sprintf "%s\n%s" acc item) + (** Check if 
a segment has a specific feature with a given value *) let has_feature (value, feature_name) segment = List.exists (fun (v, f) -> v = value && f = feature_name) segment diff --git a/sorsyl/lib/ipa_table.ml b/sorsyl/lib/ipa_table.ml index bee027a..295100d 100644 --- a/sorsyl/lib/ipa_table.ml +++ b/sorsyl/lib/ipa_table.ml @@ -1,20 +1,21 @@ -(** Type representing a segment as a set of feature specifications *) -(* an association list I guess . Use List.assoc to handle*) +(** Functional IPA table implementation without global state *) -(** Decision tree for computing sonority values *) -type bool_tree = - | Leaf of int (** Terminal node with sonority value *) - | Node of { - test : Feature.segment -> bool; (** Test function *) - t_branch : bool_tree; (** Branch to follow if test is true *) - f_branch : bool_tree; (** Branch to follow if test is false *) - } +open Base +module StringTrie = Trie.Of_string type ipa_entry = { ipa : string; features : Feature.segment } (** Type representing a row from the IPA CSV file *) -(** Storage for loaded IPA data *) -let ipa_table : (string, Feature.segment) Hashtbl.t = Hashtbl.create 1000 +type ipa_table = (string, Feature.segment, String.comparator_witness) Map.t +(** Type representing the IPA table *) + +(* type t = (string, Feature.segment, String.comparator_witness) Map.t *) + +type t = { + table : ipa_table; + trie : + (string, Feature.segment, StringTrie.Keychain.keychain_description) Trie.t; +} (** Parse a single row from the CSV file *) let parse_row (row : string list) : ipa_entry option = @@ -50,14 +51,14 @@ let parse_row (row : string list) : ipa_entry option = ] in (* Skip the header row *) - if ipa = "ipa" then None + if String.equal ipa "ipa" then None else let rec build_features names values acc = match (names, values) with | [], [] -> Some (List.rev acc) | name :: ns, value :: vs -> let fval = Feature.value_of_string value in - let fname = Feature.string_of_feature name in + let fname = Feature.feature_of_string name 
in build_features ns vs ((fname, fval) :: acc) | _ -> None (* Mismatched lengths *) in @@ -65,21 +66,64 @@ let parse_row (row : string list) : ipa_entry option = | Some feature_list -> Some { ipa; features = feature_list } | None -> None) -(** Load IPA data from CSV file *) -let load_csv filename = - let ic = open_in filename in +(** Load IPA data from CSV file and return the table *) +let load_csv (data_dir : string) : t = + let filename = Stdlib.Filename.concat data_dir "ipa_all.csv" in + let ic = Stdio.In_channel.create filename in let csv = Csv.of_channel ic in - try - Csv.iter - ~f:(fun row -> - match parse_row row with - | Some entry -> Hashtbl.add ipa_table entry.ipa entry.features - | None -> ()) - csv; - close_in ic - with e -> - close_in ic; - raise e + + let result = + try + let entries = + Csv.fold_left csv ~init:[] ~f:(fun acc row -> + match parse_row row with Some entry -> entry :: acc | None -> acc) + in + let table = + List.fold entries + ~init:(Map.empty (module String)) + ~f:(fun acc entry -> Map.set acc ~key:entry.ipa ~data:entry.features) + in + let alist = Map.to_alist table in + let trie = Trie.of_alist_exn StringTrie.Keychain.keychainable alist in + Stdio.In_channel.close ic; + { table; trie } + with e -> + Stdio.In_channel.close ic; + raise e + in + result (** Look up features for an IPA segment *) -let lookup_segment ipa = Hashtbl.find_opt ipa_table ipa +let lookup_segment table (ipa : string) : Feature.segment option = + Map.find table ipa + +(** Get all segments in the table *) +let all_segments table : (string * Feature.segment) list = Map.to_alist table + +(** Check if a segment exists in the table *) +let mem table (ipa : string) : bool = Map.mem table ipa + +(** Get the number of segments in the table *) +let length table : int = Map.length table + +let fts ?(_normalize = true) table ipa = Map.find table ipa + +let longest_one_seg_prefix trie word = + let rec aux trie remaining = + if String.is_empty remaining then None + else + let 
res = Trie.find trie remaining in + match res with + | None -> aux trie (String.drop_suffix remaining 1) + | Some _data -> Some remaining + in + aux trie word + +let ipa_segs ?(_normalize = true) data word = + let rec aux acc remaining = + match longest_one_seg_prefix data.trie remaining with + | None -> acc + | Some seg -> + aux (seg :: acc) (String.drop_prefix remaining (String.length seg)) + in + List.rev (aux [] word) diff --git a/sorsyl/lib/ipa_tableold.ml b/sorsyl/lib/ipa_tableold.ml new file mode 100644 index 0000000..eb7d3fc --- /dev/null +++ b/sorsyl/lib/ipa_tableold.ml @@ -0,0 +1,85 @@ +(** Type representing a segment as a set of feature specifications *) +(* an association list I guess . Use List.assoc to handle*) + +(** Decision tree for computing sonority values *) +type bool_tree = + | Leaf of int (** Terminal node with sonority value *) + | Node of { + test : Feature.segment -> bool; (** Test function *) + t_branch : bool_tree; (** Branch to follow if test is true *) + f_branch : bool_tree; (** Branch to follow if test is false *) + } + +type ipa_entry = { ipa : string; features : Feature.segment } +(** Type representing a row from the IPA CSV file *) + +(** Storage for loaded IPA data *) +let ipa_table : (string, Feature.segment) Hashtbl.t = Hashtbl.create 1000 + +(** Parse a single row from the CSV file *) +let parse_row (row : string list) : ipa_entry option = + match row with + | [] -> None + | ipa :: features -> ( + let feature_names = + [ + "syl"; + "son"; + "cons"; + "cont"; + "delrel"; + "lat"; + "nas"; + "strid"; + "voi"; + "sg"; + "cg"; + "ant"; + "cor"; + "distr"; + "lab"; + "hi"; + "lo"; + "back"; + "round"; + "velaric"; + "tense"; + "long"; + "hitone"; + "hireg"; + ] + in + (* Skip the header row *) + if ipa = "ipa" then None + else + let rec build_features names values acc = + match (names, values) with + | [], [] -> Some (List.rev acc) + | name :: ns, value :: vs -> + let fval = Feature.value_of_string value in + let fname = 
Feature.feature_of_string name in + build_features ns vs ((fname, fval) :: acc) + | _ -> None (* Mismatched lengths *) + in + match build_features feature_names features [] with + | Some feature_list -> Some { ipa; features = feature_list } + | None -> None) + +(** Load IPA data from CSV file *) +let load_csv filename = + let ic = open_in filename in + let csv = Csv.of_channel ic in + try + Csv.iter + ~f:(fun row -> + match parse_row row with + | Some entry -> Hashtbl.add ipa_table entry.ipa entry.features + | None -> ()) + csv; + close_in ic + with e -> + close_in ic; + raise e + +(** Look up features for an IPA segment *) +let lookup_segment ipa = Hashtbl.find_opt ipa_table ipa diff --git a/sorsyl/lib/sonority.ml b/sorsyl/lib/sonority.ml index 90bfa55..c47d4a0 100644 --- a/sorsyl/lib/sonority.ml +++ b/sorsyl/lib/sonority.ml @@ -1,16 +1,4 @@ -(** Sonority module for determining the sonority of phonetic segments. - - This module provides functionality to determine the sonority of IPA - (International Phonetic Alphabet) segments on a scale of 1 to 9, where: - - 9: Low vowels (most sonorous) - - 8: High vowels - - 7: Glides/approximants - - 6: Liquids - - 5: Nasals - - 4: Voiced fricatives - - 3: Voiceless fricatives - - 2: Voiced stops - - 1: Voiceless stops (least sonorous) *) +(** Functional sonority module without global state *) (** Decision tree for computing sonority values *) type bool_tree = @@ -21,140 +9,130 @@ type bool_tree = f_branch : bool_tree; (** Branch to follow if test is false *) } -(** Main Sonority module functionality *) -module Sonority = struct - (** Initialize the module by loading IPA data *) - let init data_dir = - let csv_file = Filename.concat data_dir "ipa_all.csv" in - Ipa_table.load_csv csv_file - - (** Build the decision tree for sonority calculation *) - let build_tree () = - let open Feature in - let plusSyl = test (Syllabic, Plus) in - let minusHi = test (High, Minus) in - let minusCons = test (Consonantal, Minus) in - let plusSon 
= test (Sonorant, Plus) in - let minusNas = test (Nasal, Minus) in - let plusCont = test (Continuant, Plus) in - let plusVoi = test (Voiced, Plus) in - - (* Build the tree bottom-up *) - let minusHi_branch = - Node - { - test = minusHi; - t_branch = Leaf 9; - (* -hi vowels = low vowels *) - f_branch = Leaf 8; - (* +hi vowels = high vowels *) - } - in - - let plusVoi1_branch = - Node - { - test = plusVoi; - t_branch = Leaf 4; - (* +voi +cont = voiced fricatives *) - f_branch = Leaf 3; - (* -voi +cont = voiceless fricatives *) - } - in - - let plusVoi2_branch = - Node - { - test = plusVoi; - t_branch = Leaf 2; - (* +voi -cont = voiced stops *) - f_branch = Leaf 1; - (* -voi -cont = voiceless stops *) - } - in - - let plusCont_branch = - Node - { - test = plusCont; - t_branch = plusVoi1_branch; - (* +cont = fricatives *) - f_branch = plusVoi2_branch; - (* -cont = stops *) - } - in - - let minusNas_branch = - Node - { - test = minusNas; - t_branch = Leaf 6; - (* -nas +son = liquids *) - f_branch = Leaf 5; - (* +nas +son = nasals *) - } - in - - let plusSon_branch = - Node - { - test = plusSon; - t_branch = minusNas_branch; - (* +son = sonorants *) - f_branch = plusCont_branch; - (* -son = obstruents *) - } - in - - let minusCons_branch = - Node - { - test = minusCons; - t_branch = Leaf 7; - (* -cons = glides *) - f_branch = plusSon_branch; - (* +cons = true consonants *) - } - in +type t = { ipa_table : Ipa_table.t; decision_tree : bool_tree } +(** Type representing a sonority calculator *) + +(** Build the decision tree for sonority calculation *) +let build_tree () = + let open Feature in + let plusSyl = test (Syllabic, Plus) in + let minusHi = test (High, Minus) in + let minusCons = test (Consonantal, Minus) in + let plusSon = test (Sonorant, Plus) in + let minusNas = test (Nasal, Minus) in + let plusCont = test (Continuant, Plus) in + let plusVoi = test (Voiced, Plus) in + + (* Build the tree bottom-up, matching the Python original exactly *) + let minusHi_branch = 
+ Node + { + test = minusHi; + t_branch = Leaf 9; + (* -hi vowels = low vowels *) + f_branch = Leaf 8; + (* +hi vowels = high vowels *) + } + in + let plusVoi1_branch = Node { - test = plusSyl; - t_branch = minusHi_branch; - (* +syl = vowels *) - f_branch = minusCons_branch; - (* -syl = non-vowels *) + test = plusVoi; + t_branch = Leaf 4; + (* +voi +cont = voiced fricatives *) + f_branch = Leaf 3; + (* -voi +cont = voiceless fricatives *) } + in - (** Evaluate the decision tree for a segment *) - let rec eval_tree tree segment = - match tree with - | Leaf value -> value - | Node { test; t_branch; f_branch } -> - if test segment then eval_tree t_branch segment - else eval_tree f_branch segment + let plusVoi2_branch = + Node + { + test = plusVoi; + t_branch = Leaf 2; + (* +voi -cont = voiced stops *) + f_branch = Leaf 1; + (* -voi -cont = voiceless stops *) + } + in - (** The main decision tree instance *) - let sonority_tree = lazy (build_tree ()) + let plusCont_branch = + Node + { + test = plusCont; + t_branch = plusVoi1_branch; + (* +cont = fricatives *) + f_branch = plusVoi2_branch; + (* -cont = stops *) + } + in - (** Get sonority value from feature specifications *) - let sonority_from_features segment = - eval_tree (Lazy.force sonority_tree) segment + let minusNas_branch = + Node + { + test = minusNas; + t_branch = Leaf 6; + (* -nas +son = liquids *) + f_branch = Leaf 5; + (* +nas +son = nasals *) + } + in + + let plusSon_branch = + Node + { + test = plusSon; + t_branch = minusNas_branch; + (* +son = sonorants *) + f_branch = plusCont_branch; + (* -son = obstruents *) + } + in - (** Get sonority value from an IPA character *) - let sonority ipa = - match Ipa_table.lookup_segment ipa with - | Some features -> sonority_from_features features - | None -> failwith (Printf.sprintf "Unknown IPA segment: %s" ipa) -end + let minusCons_branch = + Node + { + test = minusCons; + t_branch = Leaf 7; + (* -cons = glides *) + f_branch = plusSon_branch; + (* +cons = true 
consonants *) + } + in + + Node + { + test = plusSyl; + t_branch = minusHi_branch; + (* +syl = vowels *) + f_branch = minusCons_branch; + (* -syl = non-vowels *) + } -(** Public interface *) +(** Create a sonority calculator from data directory *) +let create (data_dir : string) : t = + let ipa_table = Ipa_table.load_csv data_dir in + let decision_tree = build_tree () in + { ipa_table; decision_tree } -(** Initialize the sonority module with the data directory *) -let init = Sonority.init +(** Traverse the decision tree to get sonority value *) +let rec traverse_tree (tree : bool_tree) (segment : Feature.segment) : int = + match tree with + | Leaf value -> value + | Node { test; t_branch; f_branch } -> + if test segment then traverse_tree t_branch segment + else traverse_tree f_branch segment (** Get the sonority value (1-9) for an IPA character *) -let sonority = Sonority.sonority +let sonority (calc : t) (ipa : string) : int = + match Ipa_table.lookup_segment calc.ipa_table.table ipa with + | Some features -> traverse_tree calc.decision_tree features + | None -> failwith (Printf.sprintf "Unknown IPA segment: %s" ipa) (** Get the sonority value from a feature specification *) -let sonority_from_features = Sonority.sonority_from_features +let sonority_from_features (calc : t) (segment : Feature.segment) : int = + traverse_tree calc.decision_tree segment + +(** Get the underlying IPA table *) +let get_ipa_table (calc : t) : Ipa_table.t = calc.ipa_table diff --git a/sorsyl/lib/sonorityold.ml b/sorsyl/lib/sonorityold.ml new file mode 100644 index 0000000..65dd9e5 --- /dev/null +++ b/sorsyl/lib/sonorityold.ml @@ -0,0 +1,160 @@ +(** Sonority module for determining the sonority of phonetic segments. 
+ + This module provides functionality to determine the sonority of IPA + (International Phonetic Alphabet) segments on a scale of 1 to 9, where: + - 9: Low vowels (most sonorous) + - 8: High vowels + - 7: Glides/approximants + - 6: Liquids + - 5: Nasals + - 4: Voiced fricatives + - 3: Voiceless fricatives + - 2: Voiced stops + - 1: Voiceless stops (least sonorous) *) + +(** Decision tree for computing sonority values *) +type bool_tree = + | Leaf of int (** Terminal node with sonority value *) + | Node of { + test : Feature.segment -> bool; (** Test function *) + t_branch : bool_tree; (** Branch to follow if test is true *) + f_branch : bool_tree; (** Branch to follow if test is false *) + } + +(** Main Sonority module functionality *) +module Sonority = struct + (** Initialize the module by loading IPA data *) + let init data_dir = + let csv_file = Filename.concat data_dir "ipa_all.csv" in + Ipa_tableold.load_csv csv_file + + (** Build the decision tree for sonority calculation *) + let build_tree () = + let open Feature in + let plusSyl = test (Syllabic, Plus) in + let minusHi = test (High, Minus) in + let minusCons = test (Consonantal, Minus) in + let plusSon = test (Sonorant, Plus) in + let minusNas = test (Nasal, Minus) in + let plusCont = test (Continuant, Plus) in + let plusVoi = test (Voiced, Plus) in + + (* Build the tree bottom-up *) + let minusHi_branch = + Node + { + test = minusHi; + t_branch = Leaf 9; + (* -hi vowels = low vowels *) + f_branch = Leaf 8; + (* +hi vowels = high vowels *) + } + in + + let plusVoi1_branch = + Node + { + test = plusVoi; + t_branch = Leaf 4; + (* +voi +cont = voiced fricatives *) + f_branch = Leaf 3; + (* -voi +cont = voiceless fricatives *) + } + in + + let plusVoi2_branch = + Node + { + test = plusVoi; + t_branch = Leaf 2; + (* +voi -cont = voiced stops *) + f_branch = Leaf 1; + (* -voi -cont = voiceless stops *) + } + in + + let plusCont_branch = + Node + { + test = plusCont; + t_branch = plusVoi1_branch; + (* +cont = 
fricatives *) + f_branch = plusVoi2_branch; + (* -cont = stops *) + } + in + + let minusNas_branch = + Node + { + test = minusNas; + t_branch = Leaf 6; + (* -nas +son = liquids *) + f_branch = Leaf 5; + (* +nas +son = nasals *) + } + in + + let plusSon_branch = + Node + { + test = plusSon; + t_branch = minusNas_branch; + (* +son = sonorants *) + f_branch = plusCont_branch; + (* -son = obstruents *) + } + in + + let minusCons_branch = + Node + { + test = minusCons; + t_branch = Leaf 7; + (* -cons = glides *) + f_branch = plusSon_branch; + (* +cons = true consonants *) + } + in + + Node + { + test = plusSyl; + t_branch = minusHi_branch; + (* +syl = vowels *) + f_branch = minusCons_branch; + (* -syl = non-vowels *) + } + + (** Evaluate the decision tree for a segment *) + let rec eval_tree tree segment = + match tree with + | Leaf value -> value + | Node { test; t_branch; f_branch } -> + if test segment then eval_tree t_branch segment + else eval_tree f_branch segment + + (** The main decision tree instance *) + let sonority_tree = lazy (build_tree ()) + + (** Get sonority value from feature specifications *) + let sonority_from_features segment = + eval_tree (Lazy.force sonority_tree) segment + + (** Get sonority value from an IPA character *) + let sonority ipa = + match Ipa_tableold.lookup_segment ipa with + | Some features -> sonority_from_features features + | None -> failwith (Printf.sprintf "Unknown IPA segment: %s" ipa) +end + +(** Public interface *) + +(** Initialize the sonority module with the data directory *) +let init = Sonority.init + +(** Get the sonority value (1-9) for an IPA character *) +let sonority = Sonority.sonority + +(** Get the sonority value from a feature specification *) +let sonority_from_features = Sonority.sonority_from_features diff --git a/sorsyl/lib/sonority.mli b/sorsyl/lib/sonorityold.mli index 3e9166e..3e9166e 100644 --- a/sorsyl/lib/sonority.mli +++ b/sorsyl/lib/sonorityold.mli diff --git a/sorsyl/sorsyl.opam 
b/sorsyl/sorsyl.opam index 7c561b3..72b5389 100644 --- a/sorsyl/sorsyl.opam +++ b/sorsyl/sorsyl.opam @@ -12,7 +12,10 @@ bug-reports: "https://github.com/username/reponame/issues" depends: [ "dune" {>= "3.19"} "ocaml" + "base" + "base_trie" "csv" + "uunf" "odoc" {with-doc} ] build: [ diff --git a/sorsyl/test/dune b/sorsyl/test/dune index 701e92d..e79f200 100644 --- a/sorsyl/test/dune +++ b/sorsyl/test/dune @@ -1,3 +1,11 @@ +; (test +; (name test_sonorityold) +; (libraries sorsyl)) + (test (name test_sonority) (libraries sorsyl)) + +(test + (name test_table) + (libraries sorsyl)) diff --git a/sorsyl/test/test_sonority.ml b/sorsyl/test/test_sonority.ml index 70845a6..c24f4b4 100644 --- a/sorsyl/test/test_sonority.ml +++ b/sorsyl/test/test_sonority.ml @@ -1,11 +1,9 @@ -(** Tests for the Sonority module *) +(** Tests for the functional Sonority module *) open Sorsyl -(** Test fixture - initialize the module once *) -let () = - (* Initialize with the data directory *) - (* When run with dune test, the working directory is _build/default/test *) +(** Test fixture - create the sonority calculator once *) +let sonority_calc = let data_dir = if Sys.file_exists "./data" then "./data" else if Sys.file_exists "../data" then "../data" @@ -14,13 +12,13 @@ let () = Printf.eprintf "Current directory: %s\n" (Sys.getcwd ()); failwith "Cannot find data directory") in - Sonority.init data_dir + Sonority.create data_dir (** Test sonority value 9 - Low vowels *) let test_sonority_nine () = let segments = [ "a"; "ɑ"; "æ"; "ɒ"; "e"; "o̥" ] in let expected = [ 9; 9; 9; 9; 9; 9 ] in - let results = List.map Sonority.sonority segments in + let results = List.map (Sonority.sonority sonority_calc) segments in assert (results = expected); Printf.printf "test_sonority_nine: PASSED\n" @@ -28,7 +26,7 @@ let test_sonority_nine () = let test_sonority_eight () = let segments = [ "i"; "y"; "ɨ"; "ʉ"; "ɯ"; "u" ] in let expected = [ 8; 8; 8; 8; 8; 8 ] in - let results = List.map Sonority.sonority segments 
in + let results = List.map (Sonority.sonority sonority_calc) segments in assert (results = expected); Printf.printf "test_sonority_eight: PASSED\n" @@ -36,7 +34,7 @@ let test_sonority_eight () = let test_sonority_seven () = let segments = [ "j"; "w"; "ʋ"; "ɰ"; "ɹ"; "e̯" ] in let expected = [ 7; 7; 7; 7; 7; 7 ] in - let results = List.map Sonority.sonority segments in + let results = List.map (Sonority.sonority sonority_calc) segments in assert (results = expected); Printf.printf "test_sonority_seven: PASSED\n" @@ -44,7 +42,7 @@ let test_sonority_seven () = let test_sonority_six () = let segments = [ "l"; "ɭ"; "r"; "ɾ" ] in let expected = [ 6; 6; 6; 6 ] in - let results = List.map Sonority.sonority segments in + let results = List.map (Sonority.sonority sonority_calc) segments in assert (results = expected); Printf.printf "test_sonority_six: PASSED\n" @@ -52,7 +50,7 @@ let test_sonority_six () = let test_sonority_five () = let segments = [ "n"; "m"; "ŋ"; "ɴ" ] in let expected = [ 5; 5; 5; 5 ] in - let results = List.map Sonority.sonority segments in + let results = List.map (Sonority.sonority sonority_calc) segments in assert (results = expected); Printf.printf "test_sonority_five: PASSED\n" @@ -60,15 +58,18 @@ let test_sonority_five () = let test_sonority_four () = let segments = [ "v"; "z"; "ʒ"; "ɣ" ] in let expected = [ 4; 4; 4; 4 ] in - let results = List.map Sonority.sonority segments in + let results = List.map (Sonority.sonority sonority_calc) segments in + let results_string = + List.fold_left (fun acc item -> Printf.sprintf "%s-%d" acc item) "" results + in assert (results = expected); - Printf.printf "test_sonority_four: PASSED\n" + Printf.printf "test_sonority_four: %s\nPASSED\n" results_string (** Test sonority value 3 - Voiceless fricatives *) let test_sonority_three () = let segments = [ "f"; "s"; "x"; "ħ"; "ʃ" ] in let expected = [ 3; 3; 3; 3; 3 ] in - let results = List.map Sonority.sonority segments in + let results = List.map (Sonority.sonority 
sonority_calc) segments in assert (results = expected); Printf.printf "test_sonority_three: PASSED\n" @@ -76,7 +77,7 @@ let test_sonority_three () = let test_sonority_two () = let segments = [ "b"; "ɡ"; "d"; "ɢ" ] in let expected = [ 2; 2; 2; 2 ] in - let results = List.map Sonority.sonority segments in + let results = List.map (Sonority.sonority sonority_calc) segments in assert (results = expected); Printf.printf "test_sonority_two: PASSED\n" @@ -84,14 +85,14 @@ let test_sonority_two () = let test_sonority_one () = let segments = [ "p"; "k"; "c"; "q" ] in let expected = [ 1; 1; 1; 1 ] in - let results = List.map Sonority.sonority segments in + let results = List.map (Sonority.sonority sonority_calc) segments in assert (results = expected); Printf.printf "test_sonority_one: PASSED\n" (** Test unknown segment handling *) let test_unknown_segment () = try - let _ = Sonority.sonority "🦆" in + let _ = Sonority.sonority sonority_calc "🦆" in assert false (* Should not reach here *) with | Failure msg when String.sub msg 0 20 = "Unknown IPA segment:" -> @@ -110,14 +111,30 @@ let test_sonority_from_features () = (Feature.Voiced, Feature.Minus); ] in - let result = Sonority.sonority_from_features segment in + let result = Sonority.sonority_from_features sonority_calc segment in assert (result = 1); Printf.printf "test_sonority_from_features: PASSED\n" +(** Test that we can create multiple calculators (no global state) *) +let test_multiple_calculators () = + let data_dir = + if Sys.file_exists "./data" then "./data" + else if Sys.file_exists "../data" then "../data" + else if Sys.file_exists "../../../data" then "../../../data" + else failwith "Cannot find data directory" + in + let calc1 = Sonority.create data_dir in + let calc2 = Sonority.create data_dir in + + (* Both should work independently *) + assert (Sonority.sonority calc1 "a" = 9); + assert (Sonority.sonority calc2 "a" = 9); + Printf.printf "test_multiple_calculators: PASSED\n" + (** Run all tests *) let () = 
Printf.printf "Running Sonority module tests...\n"; - Printf.printf "================================\n"; + Printf.printf "===================================\n"; test_sonority_nine (); test_sonority_eight (); test_sonority_seven (); @@ -129,5 +146,6 @@ let () = test_sonority_one (); test_unknown_segment (); test_sonority_from_features (); - Printf.printf "================================\n"; - Printf.printf "All tests passed!\n" + test_multiple_calculators (); + Printf.printf "===================================\n"; + Printf.printf "All Sonority tests passed!\n" diff --git a/sorsyl/test/test_sonorityold.ml b/sorsyl/test/test_sonorityold.ml new file mode 100644 index 0000000..70845a6 --- /dev/null +++ b/sorsyl/test/test_sonorityold.ml @@ -0,0 +1,133 @@ +(** Tests for the Sonority module *) + +open Sorsyl + +(** Test fixture - initialize the module once *) +let () = + (* Initialize with the data directory *) + (* When run with dune test, the working directory is _build/default/test *) + let data_dir = + if Sys.file_exists "./data" then "./data" + else if Sys.file_exists "../data" then "../data" + else if Sys.file_exists "../../../data" then "../../../data" + else ( + Printf.eprintf "Current directory: %s\n" (Sys.getcwd ()); + failwith "Cannot find data directory") + in + Sonority.init data_dir + +(** Test sonority value 9 - Low vowels *) +let test_sonority_nine () = + let segments = [ "a"; "ɑ"; "æ"; "ɒ"; "e"; "o̥" ] in + let expected = [ 9; 9; 9; 9; 9; 9 ] in + let results = List.map Sonority.sonority segments in + assert (results = expected); + Printf.printf "test_sonority_nine: PASSED\n" + +(** Test sonority value 8 - High vowels *) +let test_sonority_eight () = + let segments = [ "i"; "y"; "ɨ"; "ʉ"; "ɯ"; "u" ] in + let expected = [ 8; 8; 8; 8; 8; 8 ] in + let results = List.map Sonority.sonority segments in + assert (results = expected); + Printf.printf "test_sonority_eight: PASSED\n" + +(** Test sonority value 7 - Glides/approximants *) +let test_sonority_seven 
() = + let segments = [ "j"; "w"; "ʋ"; "ɰ"; "ɹ"; "e̯" ] in + let expected = [ 7; 7; 7; 7; 7; 7 ] in + let results = List.map Sonority.sonority segments in + assert (results = expected); + Printf.printf "test_sonority_seven: PASSED\n" + +(** Test sonority value 6 - Liquids *) +let test_sonority_six () = + let segments = [ "l"; "ɭ"; "r"; "ɾ" ] in + let expected = [ 6; 6; 6; 6 ] in + let results = List.map Sonority.sonority segments in + assert (results = expected); + Printf.printf "test_sonority_six: PASSED\n" + +(** Test sonority value 5 - Nasals *) +let test_sonority_five () = + let segments = [ "n"; "m"; "ŋ"; "ɴ" ] in + let expected = [ 5; 5; 5; 5 ] in + let results = List.map Sonority.sonority segments in + assert (results = expected); + Printf.printf "test_sonority_five: PASSED\n" + +(** Test sonority value 4 - Voiced fricatives *) +let test_sonority_four () = + let segments = [ "v"; "z"; "ʒ"; "ɣ" ] in + let expected = [ 4; 4; 4; 4 ] in + let results = List.map Sonority.sonority segments in + assert (results = expected); + Printf.printf "test_sonority_four: PASSED\n" + +(** Test sonority value 3 - Voiceless fricatives *) +let test_sonority_three () = + let segments = [ "f"; "s"; "x"; "ħ"; "ʃ" ] in + let expected = [ 3; 3; 3; 3; 3 ] in + let results = List.map Sonority.sonority segments in + assert (results = expected); + Printf.printf "test_sonority_three: PASSED\n" + +(** Test sonority value 2 - Voiced stops *) +let test_sonority_two () = + let segments = [ "b"; "ɡ"; "d"; "ɢ" ] in + let expected = [ 2; 2; 2; 2 ] in + let results = List.map Sonority.sonority segments in + assert (results = expected); + Printf.printf "test_sonority_two: PASSED\n" + +(** Test sonority value 1 - Voiceless stops *) +let test_sonority_one () = + let segments = [ "p"; "k"; "c"; "q" ] in + let expected = [ 1; 1; 1; 1 ] in + let results = List.map Sonority.sonority segments in + assert (results = expected); + Printf.printf "test_sonority_one: PASSED\n" + +(** Test unknown segment 
handling *)
let test_unknown_segment () =
  (* [Sonority.sonority] is expected to raise [Failure] whose message starts
     with this prefix when given a string that is not an IPA segment. *)
  let prefix = "Unknown IPA segment:" in
  let prefix_len = String.length prefix in
  match Sonority.sonority "🦆" with
  | _ -> assert false (* the lookup must not succeed *)
  | exception Failure msg ->
      (* Check the length before slicing: [String.sub] raises
         [Invalid_argument] when the message is shorter than the prefix,
         which would mask the message that was actually produced. Any other
         exception is left to propagate unchanged so its origin stays
         visible. *)
      assert (
        String.length msg >= prefix_len
        && String.sub msg 0 prefix_len = prefix);
      Printf.printf "test_unknown_segment: PASSED\n"

(** Test feature-based sonority calculation: a feature bundle describing a
    voiceless stop must be assigned sonority 1 by
    [Sonority.sonority_from_features]. *)
let test_sonority_from_features () =
  (* Test a simple voiceless stop: -syl, +cons, -son, -cont, -voi *)
  let segment =
    [
      (Feature.Syllabic, Feature.Minus);
      (Feature.Consonantal, Feature.Plus);
      (Feature.Sonorant, Feature.Minus);
      (Feature.Continuant, Feature.Minus);
      (Feature.Voiced, Feature.Minus);
    ]
  in
  let result = Sonority.sonority_from_features segment in
  assert (result = 1);
  Printf.printf "test_sonority_from_features: PASSED\n"

(** Run all tests *)
let () =
  Printf.printf "Running Sonority module tests...\n";
  Printf.printf "================================\n";
  test_sonority_nine ();
  test_sonority_eight ();
  test_sonority_seven ();
  test_sonority_six ();
  test_sonority_five ();
  test_sonority_four ();
  test_sonority_three ();
  test_sonority_two ();
  test_sonority_one ();
  test_unknown_segment ();
  test_sonority_from_features ();
  Printf.printf "================================\n";
  Printf.printf "All tests passed!\n"
diff --git a/sorsyl/test/test_table.ml b/sorsyl/test/test_table.ml
new file mode 100644
index 0000000..89cb4a2
--- /dev/null
+++ b/sorsyl/test/test_table.ml
@@ -0,0 +1,208 @@
open Sorsyl

(* let es = *)
(* String.split_on_char ' ' *)
(* "la ˌinteɾnˌaθjonˌaliθaθjˈon del kˌoɾaθˈon i el ðˌikθjonˈaɾjo" *)

(* let de = *)
(* [ *)
(* "kɔmˈpjuːtɐ"; *)
(* "teleˈfoːn"; *)
(* "fɑˈmiːliːjə"; *)
(* "ˈʔɑːpɔtekə"; *)
(* "ˈʃoːkolɑdə"; *)
(* "ˈtoːmɑtən"; *)
(* "ˈbananə"; *)
(* "ˈpoːlitsae"; *)
(* "ˈmuːzɔøm"; *)
(* "ˈbʏçɐ̯ae"; *)
(* "mediˈt͡siːn"; *)
(* "ˈpɾoːfɛszoɾ"; *)
(* "ʔeleˈfɑnt"; *)
(* "dokumɛnt"; *)
(* "ˈʔɪntɛɾnət"; *)
(* "ˈʔʊnʔivɛɾziˈtɛːt"; *)
(* "ˈkaɾtɔffɛln"; *)
(* "ˈmatemɑtik"; *)
(* "gəˈbʊɾtstɑk"; *)
(* "ˈvœʁtɐˌbuːx"; *)
(* "ˈbɪbli̯oːtək"; *)
(* "demɔkɾɑˈtiːjə"; *)
(* "fotɔgɾɑˈfiːjə"; *)
(* "tɛçnoloˈgiːjə"; *)
(* "bioloˈgiːjə"; *)
(* "psʏçoloˈgiːjə"; *)
(* "filozɔfiːjə"; *)
(* "ˈʃoːkolɑdə"; *)
(* "ˈmaɾmelɑdə"; *)
(* "ˈzɛkɾetɛɾɪn"; *)
(* "ʔɛntˈʃʊldɪgʊŋ"; *)
(* "ˈkɾaŋkɛnvɛɾzɪçɐ̯ʊŋ"; *)
(* "gəˈbʊɾtstagspaɾty"; *)
(* "kɔmmunikɑˈtsĭoːn"; *)
(* "ʔɔɾgɑnizɑˈtsĭoːn"; *)
(* "bʏɾgɛɾˈməɪstɐ̯"; *)
(* "zeɛnsvʏɾdɪkˈkəiːt"; *)
(* "ˈmiːneɾalvaszɐ̯"; *)
(* "ˈtsuːzammenaɾˈbəiːt"; *)
(* "mœglɪçˈkəiːtən"; *)
(* "gəlegɛnˈəiːtən"; *)
(* "naxˈmɪtˌtɑːk"; *)
(* "ʔɪnfɔɾmɑˈtsĭoːn"; *)
(* "televiˈzĭoːn"; *)
(* "gəʃvɪndɪkkaetsbɛgɾentsʊŋ"; *)
(* "ˈkɾaŋkɛnaozaofɛntalt"; *)
(* "ʔaɾbaetslozɪkˈkəiːt"; *)
(* "fɛːɐ̯ˈʔantvɔɾtlɪçˈkəiːt"; *)
(* "zeɛnsvʏɾdɪkˈkəiːtən"; *)
(* "ˈzɛlpstvɛɾstɛntlɪç"; *)
(* "ˈvaenaxtsgɛʃɛŋkə"; *)
(* "gəˈbʊɾtstaksgɛʃɛŋk"; *)
(* "tuɾɪstenɪnfɔɾmɑˈtsĭoːn"; *)
(* "ˈʔʊnˌʔiːvɛɾzitɛtspɾofɛszoɾ"; *)
(* "ˈleːbɛnsmɪtɛlgɛʃɛft"; *)
(* "ˈfɑɾɾatvɛɾlae"; *)
(* "ˈbʊndɛstakzapgeɔɾdnətɐ̯"; *)
(* "ˈʃtɾaeçɔltsʃɛçtɛlçən"; *)
(* "ˈfɾɔøntʃaftsbetsiːʊŋən"; *)
(* "ˈɾɛçtsʃʊtsvɛɾzɪçɐ̯ʊŋ"; *)
(* "nɑɾʊŋsmɪtelʊnvɛɾtɾɛglɪçˈkəiːt"; *)
(* ] *)

(* let en1 = *)
(* [ *)
(* "ˈæpəɫ"; *)
(* "ˈðɪs"; *)
(* "ˈɪs"; *)
(* "ˈeɪ"; *)
(* "ɫɪŋˈɡwɪstɪks"; *)
(* "kəˈtæstɹəfi"; *)
(* "wɪˈθaʊt"; *)
(* "ˈpɹɛsədənt"; *)
(* "ˈtu"; *)
(* "ədˈmɪt"; *)
(* "ˈænd"; *)
(* "ˈɪts"; *)
(* "ˌɪnstəˈɡeɪʃən"; *)
(* "ˈʃʊd"; *)
(* "ˈbi"; *)
(* "ˈpənɪʃt"; *)
(* ] *)

(** English IPA transcriptions fed to [Ipa_table.ipa_segs] by [test_segs]. *)
let en2 =
  [
    "d͡ʒɹ̩mən";
    "ˈpɹɛzənt";
    "ˈɑɹtɪkəɫ";
    "pɹəˈvaɪdz";
    "ˌɹiəˈnæɫəsəs";
    "kˈɔːɹɑːnəl";
    "kɝˈoʊnəɫ";
    "ˈɑptɪks";
    "ˈɛksəɫəns";
    "əbˈstɹuənts";
    "ˈdʒɝmən";
    "ˈɛŋɡɫɪʃ";
    "ˈkɑmənɫi";
    "əˈsumd";
    "ˈdʒɪps";
    "ˈpɫæstɝ";
    "ˈɛŋɡɫɪʃ";
    (* "ˈɫæps"; *)
    "ˈɑɹɡjud";
    "bɪˈɫoʊ";
    "ˌɛkstɹəsɪˈɫæbɪk";
    "ˈkɑnsənənts";
    "ˌdɛɹəˈveɪʃənəɫ";
    "ˈsteɪdʒ";
    "ˌɛkstɹəsɪɫəˈbɪsɪti";
    "ˈaɪðɝ";
    "ˈɫæŋɡwɪdʒɪz";
    "ɪɡˈzɪsts";
    "ˈɛvədəns";
    "ˈkɑmənɫi";
    "pɹɪˈzɛntɪd";
    "səˈpɔɹt";
    "ˌɛkstɹəsɪˈɫæbɪk";
    "ˈkɑnsənənts";
    "kəmˈpætəbəɫ";
    "ˈfʊɫi";
    "ˈsɝfəs";
    "ˌɑptəˈmæɫəti";
    "ˌθiɝˈɛtɪk";
    "ˈtɹitmənt";
    "kənˈstɹeɪnts";
    "ɹɪˈfɝɪŋ";
    "ˈfʊɫi";
    "səˈɫæbəˌfaɪd";
    "ˈaʊtˌpʊt";
    "ˌɹɛpɹəzɛnˈteɪʃənz";
    "pɹəˈpoʊzd";
  ]

(* let zh = [ "/t͡ɕi⁵¹ ti⁵¹ pʰi⁵¹/" ] *)

(* let ws = *)
(* [ *)
(* ( "/nuˌmɑ.noʊ.ʌl.tɹə.maɪ.kɹoʊˈskɑ.pɪkˌsɪ.lɪ.koʊ.vɑl.keɪ.noʊ.koʊ.niˈoʊ.sɪs/", *)
(* "/əˈbæn.dn̩.əd.li/", *)
(* " /əˈbæn.dn̩.əd.li/", *)
(* "/əˈbæn.dn̩.mn̩t/", *)
(* "/-ˌbiːə-/", *)
(* "/əˈbluː.ʃn̩/xx" ); *)
(* ] *)

(** IPA data loaded with [Ipa_table.load_csv] from the first existing
    candidate data directory.

    Note that this is a value, not a function: the directory probe, the
    "Getting data" message and the CSV load all run once, when this module is
    initialised.

    @raise Failure if none of the candidate directories exists. *)
let get_data =
  Printf.printf "Getting data\n";
  (* Probed in order; the first directory that exists wins. *)
  let candidates = [ "./data"; "../data"; "../../../data" ] in
  let data_dir =
    match List.find_opt Sys.file_exists candidates with
    | Some dir -> dir
    | None ->
        Printf.eprintf "Current directory: %s\n" (Sys.getcwd ());
        failwith "Cannot find data directory"
  in
  Ipa_table.load_csv data_dir

(** Look up the features of the segment ["s"] in [table] and print them.

    A missing entry is reported on stderr instead of passing silently; it is
    deliberately not turned into a hard failure. *)
let test_fts table =
  let segment = "s" in
  match Ipa_table.fts table segment with
  | None -> Printf.eprintf "test_fts: no feature entry found for %S\n" segment
  | Some seg -> Printf.printf "fts\n %s\n" (Feature.string_of_segment seg)
(* let expected = None in *)
(* let results = Ipa_table.fts *)
(* assert (result = expected); *)
(* Printf.printf "test_fts: PASSED\n" *)

(** Segment every word of [en2] with [Ipa_table.ipa_segs] and print each word
    followed by its segments, every segment prefixed with ["-"].

    This is a smoke test: it only checks that segmentation runs.
    TODO: assert against expected segmentations once they are defined. *)
let test_segs data =
  Base.List.iter en2 ~f:(fun word ->
      (* Segment before printing, so the output order matches the order in
         which work is done. *)
      let segs = Ipa_table.ipa_segs data word in
      Printf.printf "%s\n" word;
      Base.List.iter segs ~f:(fun seg -> Printf.printf "-%s" seg);
      Printf.printf "\n");
  Printf.printf "test_segs: PASSED\n"

(** Run all tests *)
let () =
  let data = get_data in
  Printf.printf "Running IPA Table module tests...\n";
  Printf.printf "===================================\n";
  test_fts data.table;
  test_segs data;
  Printf.printf "===================================\n";
  Printf.printf "All IPA Table tests passed!\n"
|