summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorpolwex <polwex@sortug.com>2025-06-22 09:21:58 +0700
committerpolwex <polwex@sortug.com>2025-06-22 09:21:58 +0700
commitb43fe0d51da9a247bf94af27898d63f79d424073 (patch)
tree87cdba61deb79f73a829c96568b124adab66f1e1
parentc9fbdb681b77698bdf8a503cb9d13b6f0b53fd93 (diff)
getting there
-rw-r--r--sorsyl/dune-project2
-rw-r--r--sorsyl/lib/dune2
-rw-r--r--sorsyl/lib/feature.ml39
-rw-r--r--sorsyl/lib/ipa_table.ml102
-rw-r--r--sorsyl/lib/ipa_tableold.ml85
-rw-r--r--sorsyl/lib/sonority.ml250
-rw-r--r--sorsyl/lib/sonorityold.ml160
-rw-r--r--sorsyl/lib/sonorityold.mli (renamed from sorsyl/lib/sonority.mli)0
-rw-r--r--sorsyl/sorsyl.opam3
-rw-r--r--sorsyl/test/dune8
-rw-r--r--sorsyl/test/test_sonority.ml60
-rw-r--r--sorsyl/test/test_sonorityold.ml133
-rw-r--r--sorsyl/test/test_table.ml208
13 files changed, 863 insertions, 189 deletions
diff --git a/sorsyl/dune-project b/sorsyl/dune-project
index fe23f5d..6651317 100644
--- a/sorsyl/dune-project
+++ b/sorsyl/dune-project
@@ -19,7 +19,7 @@
(name sorsyl)
(synopsis "A short synopsis")
(description "A longer description")
- (depends ocaml csv)
+ (depends ocaml base base_trie csv uunf)
(tags
("add topics" "to describe" your project)))
diff --git a/sorsyl/lib/dune b/sorsyl/lib/dune
index 148997f..bba2520 100644
--- a/sorsyl/lib/dune
+++ b/sorsyl/lib/dune
@@ -1,3 +1,3 @@
(library
(name sorsyl)
- (libraries csv base stdio))
+ (libraries csv base stdio base_trie))
diff --git a/sorsyl/lib/feature.ml b/sorsyl/lib/feature.ml
index 280977b..f67a300 100644
--- a/sorsyl/lib/feature.ml
+++ b/sorsyl/lib/feature.ml
@@ -44,7 +44,9 @@ let value_of_string = function
| "0" -> Zero
| s -> failwith (Printf.sprintf "Invalid feature value: %s" s)
-let string_of_feature = function
+let string_of_value = function Plus -> "+" | Minus -> "-" | Zero -> "0"
+
+let feature_of_string = function
| "syl" -> Syllabic
| "son" -> Sonorant
| "cons" -> Consonantal
@@ -71,6 +73,41 @@ let string_of_feature = function
| "hireg" -> HighReg
| _ -> failwith "not a valid feature"
+(** Map a feature constructor to its short textual name (e.g. [Syllabic]
+    gives ["syl"]). Every constructor is listed explicitly, with no
+    catch-all case. The names are the same strings that the CSV loader
+    hands to [feature_of_string]. *)
+let string_of_feature = function
+ | Syllabic -> "syl"
+ | Sonorant -> "son"
+ | Consonantal -> "cons"
+ | Continuant -> "cont"
+ | DelayedRelease -> "delrel"
+ | Lateral -> "lat"
+ | Nasal -> "nas"
+ | Strident -> "strid"
+ | Voiced -> "voi"
+ | SpreadGlottis -> "sg"
+ | ConstrictedGlottis -> "cg"
+ | Anterior -> "ant"
+ | Coronal -> "cor"
+ | Distributed -> "distr"
+ | Labial -> "lab"
+ | High -> "hi"
+ | Low -> "lo"
+ | Back -> "back"
+ | Rounded -> "round"
+ | Velaric -> "velaric"
+ | Tense -> "tense"
+ | Long -> "long"
+ | HighTone -> "hitone"
+ | HighReg -> "hireg"
+
+(** [string_of_segment segment] renders every [(feature, value)] pair of
+    [segment] as ["feature:value"], each one preceded by a newline, in list
+    order. The empty segment yields [""]; any other result therefore starts
+    with a newline. *)
+let string_of_segment segment =
+  (* Accumulate in a buffer: rebuilding the whole string with [sprintf] for
+     every pair copies the text gathered so far each time (quadratic). *)
+  let buf = Buffer.create 256 in
+  Base.List.iter segment ~f:(fun (feature, value) ->
+      Printf.bprintf buf "\n%s:%s"
+        (string_of_feature feature)
+        (string_of_value value));
+  Buffer.contents buf
+
(** Check if a segment has a specific feature with a given value *)
let has_feature (value, feature_name) segment =
List.exists (fun (v, f) -> v = value && f = feature_name) segment
diff --git a/sorsyl/lib/ipa_table.ml b/sorsyl/lib/ipa_table.ml
index bee027a..295100d 100644
--- a/sorsyl/lib/ipa_table.ml
+++ b/sorsyl/lib/ipa_table.ml
@@ -1,20 +1,21 @@
-(** Type representing a segment as a set of feature specifications *)
-(* an association list I guess . Use List.assoc to handle*)
+(** Functional IPA table implementation without global state *)
-(** Decision tree for computing sonority values *)
-type bool_tree =
- | Leaf of int (** Terminal node with sonority value *)
- | Node of {
- test : Feature.segment -> bool; (** Test function *)
- t_branch : bool_tree; (** Branch to follow if test is true *)
- f_branch : bool_tree; (** Branch to follow if test is false *)
- }
+open Base
+module StringTrie = Trie.Of_string
type ipa_entry = { ipa : string; features : Feature.segment }
(** Type representing a row from the IPA CSV file *)
-(** Storage for loaded IPA data *)
-let ipa_table : (string, Feature.segment) Hashtbl.t = Hashtbl.create 1000
+type ipa_table = (string, Feature.segment, String.comparator_witness) Map.t
+(** Type representing the IPA table *)
+
+(* type t = (string, Feature.segment, String.comparator_witness) Map.t *)
+
+type t = {
+ table : ipa_table;
+ trie :
+ (string, Feature.segment, StringTrie.Keychain.keychain_description) Trie.t;
+}
(** Parse a single row from the CSV file *)
let parse_row (row : string list) : ipa_entry option =
@@ -50,14 +51,14 @@ let parse_row (row : string list) : ipa_entry option =
]
in
(* Skip the header row *)
- if ipa = "ipa" then None
+ if String.equal ipa "ipa" then None
else
let rec build_features names values acc =
match (names, values) with
| [], [] -> Some (List.rev acc)
| name :: ns, value :: vs ->
let fval = Feature.value_of_string value in
- let fname = Feature.string_of_feature name in
+ let fname = Feature.feature_of_string name in
build_features ns vs ((fname, fval) :: acc)
| _ -> None (* Mismatched lengths *)
in
@@ -65,21 +66,64 @@ let parse_row (row : string list) : ipa_entry option =
| Some feature_list -> Some { ipa; features = feature_list }
| None -> None)
-(** Load IPA data from CSV file *)
-let load_csv filename =
- let ic = open_in filename in
+(** Load IPA data from CSV file and return the table *)
+let load_csv (data_dir : string) : t =
+ let filename = Stdlib.Filename.concat data_dir "ipa_all.csv" in
+ let ic = Stdio.In_channel.create filename in
let csv = Csv.of_channel ic in
- try
- Csv.iter
- ~f:(fun row ->
- match parse_row row with
- | Some entry -> Hashtbl.add ipa_table entry.ipa entry.features
- | None -> ())
- csv;
- close_in ic
- with e ->
- close_in ic;
- raise e
+
+ let result =
+ try
+ let entries =
+ Csv.fold_left csv ~init:[] ~f:(fun acc row ->
+ match parse_row row with Some entry -> entry :: acc | None -> acc)
+ in
+ let table =
+ List.fold entries
+ ~init:(Map.empty (module String))
+ ~f:(fun acc entry -> Map.set acc ~key:entry.ipa ~data:entry.features)
+ in
+ let alist = Map.to_alist table in
+ let trie = Trie.of_alist_exn StringTrie.Keychain.keychainable alist in
+ Stdio.In_channel.close ic;
+ { table; trie }
+ with e ->
+ Stdio.In_channel.close ic;
+ raise e
+ in
+ result
(** Look up features for an IPA segment *)
-let lookup_segment ipa = Hashtbl.find_opt ipa_table ipa
+let lookup_segment table (ipa : string) : Feature.segment option =
+ (* [table] is the map from IPA string to features (the [table] field of a
+    loaded [t]), not the [t] record itself. *)
+ Map.find table ipa
+
+(** Get all segments in the table as [(ipa, features)] pairs *)
+let all_segments table : (string * Feature.segment) list = Map.to_alist table
+
+(** Check if a segment exists in the table *)
+let mem table (ipa : string) : bool = Map.mem table ipa
+
+(** Get the number of segments in the table *)
+let length table : int = Map.length table
+
+(** [fts table ipa] returns the features stored for [ipa] in [table], or
+    [None] when [ipa] is absent. It performs the same [Map.find] as
+    [lookup_segment]. The optional [_normalize] flag is accepted but not
+    used. *)
+let fts ?(_normalize = true) table ipa = Map.find table ipa
+
+(** [longest_one_seg_prefix trie word] returns the longest prefix of [word]
+    that is bound in [trie], or [None] when no non-empty prefix is bound.
+    Candidates are tried from the whole word downwards, one byte shorter at
+    each step. *)
+let longest_one_seg_prefix trie word =
+  let rec shrink candidate =
+    if String.is_empty candidate then None
+    else if Option.is_some (Trie.find trie candidate) then Some candidate
+    else shrink (String.drop_suffix candidate 1)
+  in
+  shrink word
+
+(** [ipa_segs data word] splits [word] into the segments bound in
+    [data.trie], scanning left to right and always taking the longest
+    matching prefix. Bytes that do not start any known segment are skipped
+    rather than ending the scan, so an unrecognised character does not
+    discard the remainder of [word]. The result is in word order. The
+    optional [_normalize] flag is accepted but not used. *)
+let ipa_segs ?(_normalize = true) data word =
+  let rec aux acc remaining =
+    match longest_one_seg_prefix data.trie remaining with
+    | Some seg ->
+        aux (seg :: acc) (String.drop_prefix remaining (String.length seg))
+    | None ->
+        if String.is_empty remaining then acc
+        else
+          (* Nothing matches at this position: step over one byte and keep
+             scanning. The leftover bytes of a multi-byte character are
+             skipped the same way on the following iterations. *)
+          aux acc (String.drop_prefix remaining 1)
+  in
+  List.rev (aux [] word)
diff --git a/sorsyl/lib/ipa_tableold.ml b/sorsyl/lib/ipa_tableold.ml
new file mode 100644
index 0000000..eb7d3fc
--- /dev/null
+++ b/sorsyl/lib/ipa_tableold.ml
@@ -0,0 +1,85 @@
+(** Type representing a segment as a set of feature specifications *)
+(* an association list I guess . Use List.assoc to handle*)
+
+(** Decision tree for computing sonority values *)
+type bool_tree =
+ | Leaf of int (** Terminal node with sonority value *)
+ | Node of {
+ test : Feature.segment -> bool; (** Test function *)
+ t_branch : bool_tree; (** Branch to follow if test is true *)
+ f_branch : bool_tree; (** Branch to follow if test is false *)
+ }
+
+type ipa_entry = { ipa : string; features : Feature.segment }
+(** Type representing a row from the IPA CSV file *)
+
+(** Storage for loaded IPA data *)
+let ipa_table : (string, Feature.segment) Hashtbl.t = Hashtbl.create 1000
+
+(** Parse a single row from the CSV file *)
+let parse_row (row : string list) : ipa_entry option =
+ match row with
+ | [] -> None
+ | ipa :: features -> (
+ let feature_names =
+ [
+ "syl";
+ "son";
+ "cons";
+ "cont";
+ "delrel";
+ "lat";
+ "nas";
+ "strid";
+ "voi";
+ "sg";
+ "cg";
+ "ant";
+ "cor";
+ "distr";
+ "lab";
+ "hi";
+ "lo";
+ "back";
+ "round";
+ "velaric";
+ "tense";
+ "long";
+ "hitone";
+ "hireg";
+ ]
+ in
+ (* Skip the header row *)
+ if ipa = "ipa" then None
+ else
+ let rec build_features names values acc =
+ match (names, values) with
+ | [], [] -> Some (List.rev acc)
+ | name :: ns, value :: vs ->
+ let fval = Feature.value_of_string value in
+ let fname = Feature.feature_of_string name in
+ build_features ns vs ((fname, fval) :: acc)
+ | _ -> None (* Mismatched lengths *)
+ in
+ match build_features feature_names features [] with
+ | Some feature_list -> Some { ipa; features = feature_list }
+ | None -> None)
+
+(** [load_csv filename] reads the CSV file [filename] and stores every row
+    accepted by [parse_row] in the global [ipa_table], keyed by its IPA
+    string. The channel is closed whether or not reading succeeds; an
+    exception raised while reading or parsing is propagated to the caller. *)
+let load_csv filename =
+  let ic = open_in filename in
+  Fun.protect
+    ~finally:(fun () -> close_in_noerr ic)
+    (fun () ->
+      (* [replace] keeps a single binding per segment, so loading the same
+         data again does not pile up shadowed entries in the table. *)
+      let store row =
+        match parse_row row with
+        | Some entry -> Hashtbl.replace ipa_table entry.ipa entry.features
+        | None -> ()
+      in
+      Csv.iter ~f:store (Csv.of_channel ic))
+
+(** Look up features for an IPA segment *)
+let lookup_segment ipa = Hashtbl.find_opt ipa_table ipa
diff --git a/sorsyl/lib/sonority.ml b/sorsyl/lib/sonority.ml
index 90bfa55..c47d4a0 100644
--- a/sorsyl/lib/sonority.ml
+++ b/sorsyl/lib/sonority.ml
@@ -1,16 +1,4 @@
-(** Sonority module for determining the sonority of phonetic segments.
-
- This module provides functionality to determine the sonority of IPA
- (International Phonetic Alphabet) segments on a scale of 1 to 9, where:
- - 9: Low vowels (most sonorous)
- - 8: High vowels
- - 7: Glides/approximants
- - 6: Liquids
- - 5: Nasals
- - 4: Voiced fricatives
- - 3: Voiceless fricatives
- - 2: Voiced stops
- - 1: Voiceless stops (least sonorous) *)
+(** Functional sonority module without global state *)
(** Decision tree for computing sonority values *)
type bool_tree =
@@ -21,140 +9,130 @@ type bool_tree =
f_branch : bool_tree; (** Branch to follow if test is false *)
}
-(** Main Sonority module functionality *)
-module Sonority = struct
- (** Initialize the module by loading IPA data *)
- let init data_dir =
- let csv_file = Filename.concat data_dir "ipa_all.csv" in
- Ipa_table.load_csv csv_file
-
- (** Build the decision tree for sonority calculation *)
- let build_tree () =
- let open Feature in
- let plusSyl = test (Syllabic, Plus) in
- let minusHi = test (High, Minus) in
- let minusCons = test (Consonantal, Minus) in
- let plusSon = test (Sonorant, Plus) in
- let minusNas = test (Nasal, Minus) in
- let plusCont = test (Continuant, Plus) in
- let plusVoi = test (Voiced, Plus) in
-
- (* Build the tree bottom-up *)
- let minusHi_branch =
- Node
- {
- test = minusHi;
- t_branch = Leaf 9;
- (* -hi vowels = low vowels *)
- f_branch = Leaf 8;
- (* +hi vowels = high vowels *)
- }
- in
-
- let plusVoi1_branch =
- Node
- {
- test = plusVoi;
- t_branch = Leaf 4;
- (* +voi +cont = voiced fricatives *)
- f_branch = Leaf 3;
- (* -voi +cont = voiceless fricatives *)
- }
- in
-
- let plusVoi2_branch =
- Node
- {
- test = plusVoi;
- t_branch = Leaf 2;
- (* +voi -cont = voiced stops *)
- f_branch = Leaf 1;
- (* -voi -cont = voiceless stops *)
- }
- in
-
- let plusCont_branch =
- Node
- {
- test = plusCont;
- t_branch = plusVoi1_branch;
- (* +cont = fricatives *)
- f_branch = plusVoi2_branch;
- (* -cont = stops *)
- }
- in
-
- let minusNas_branch =
- Node
- {
- test = minusNas;
- t_branch = Leaf 6;
- (* -nas +son = liquids *)
- f_branch = Leaf 5;
- (* +nas +son = nasals *)
- }
- in
-
- let plusSon_branch =
- Node
- {
- test = plusSon;
- t_branch = minusNas_branch;
- (* +son = sonorants *)
- f_branch = plusCont_branch;
- (* -son = obstruents *)
- }
- in
-
- let minusCons_branch =
- Node
- {
- test = minusCons;
- t_branch = Leaf 7;
- (* -cons = glides *)
- f_branch = plusSon_branch;
- (* +cons = true consonants *)
- }
- in
+type t = { ipa_table : Ipa_table.t; decision_tree : bool_tree }
+(** Type representing a sonority calculator *)
+
+(** Build the decision tree for sonority calculation *)
+let build_tree () =
+ let open Feature in
+ let plusSyl = test (Syllabic, Plus) in
+ let minusHi = test (High, Minus) in
+ let minusCons = test (Consonantal, Minus) in
+ let plusSon = test (Sonorant, Plus) in
+ let minusNas = test (Nasal, Minus) in
+ let plusCont = test (Continuant, Plus) in
+ let plusVoi = test (Voiced, Plus) in
+
+ (* Build the tree bottom-up, matching the Python original exactly *)
+ let minusHi_branch =
+ Node
+ {
+ test = minusHi;
+ t_branch = Leaf 9;
+ (* -hi vowels = low vowels *)
+ f_branch = Leaf 8;
+ (* +hi vowels = high vowels *)
+ }
+ in
+ let plusVoi1_branch =
Node
{
- test = plusSyl;
- t_branch = minusHi_branch;
- (* +syl = vowels *)
- f_branch = minusCons_branch;
- (* -syl = non-vowels *)
+ test = plusVoi;
+ t_branch = Leaf 4;
+ (* +voi +cont = voiced fricatives *)
+ f_branch = Leaf 3;
+ (* -voi +cont = voiceless fricatives *)
}
+ in
- (** Evaluate the decision tree for a segment *)
- let rec eval_tree tree segment =
- match tree with
- | Leaf value -> value
- | Node { test; t_branch; f_branch } ->
- if test segment then eval_tree t_branch segment
- else eval_tree f_branch segment
+ let plusVoi2_branch =
+ Node
+ {
+ test = plusVoi;
+ t_branch = Leaf 2;
+ (* +voi -cont = voiced stops *)
+ f_branch = Leaf 1;
+ (* -voi -cont = voiceless stops *)
+ }
+ in
- (** The main decision tree instance *)
- let sonority_tree = lazy (build_tree ())
+ let plusCont_branch =
+ Node
+ {
+ test = plusCont;
+ t_branch = plusVoi1_branch;
+ (* +cont = fricatives *)
+ f_branch = plusVoi2_branch;
+ (* -cont = stops *)
+ }
+ in
- (** Get sonority value from feature specifications *)
- let sonority_from_features segment =
- eval_tree (Lazy.force sonority_tree) segment
+ let minusNas_branch =
+ Node
+ {
+ test = minusNas;
+ t_branch = Leaf 6;
+ (* -nas +son = liquids *)
+ f_branch = Leaf 5;
+ (* +nas +son = nasals *)
+ }
+ in
+
+ let plusSon_branch =
+ Node
+ {
+ test = plusSon;
+ t_branch = minusNas_branch;
+ (* +son = sonorants *)
+ f_branch = plusCont_branch;
+ (* -son = obstruents *)
+ }
+ in
- (** Get sonority value from an IPA character *)
- let sonority ipa =
- match Ipa_table.lookup_segment ipa with
- | Some features -> sonority_from_features features
- | None -> failwith (Printf.sprintf "Unknown IPA segment: %s" ipa)
-end
+ let minusCons_branch =
+ Node
+ {
+ test = minusCons;
+ t_branch = Leaf 7;
+ (* -cons = glides *)
+ f_branch = plusSon_branch;
+ (* +cons = true consonants *)
+ }
+ in
+
+ Node
+ {
+ test = plusSyl;
+ t_branch = minusHi_branch;
+ (* +syl = vowels *)
+ f_branch = minusCons_branch;
+ (* -syl = non-vowels *)
+ }
-(** Public interface *)
+(** Create a sonority calculator from data directory *)
+let create (data_dir : string) : t =
+ let ipa_table = Ipa_table.load_csv data_dir in
+ let decision_tree = build_tree () in
+ { ipa_table; decision_tree }
-(** Initialize the sonority module with the data directory *)
-let init = Sonority.init
+(** Walk [tree] for [segment]: at each [Node] run its [test] on the segment
+    and descend into [t_branch] when it holds, [f_branch] otherwise. The
+    integer stored in the [Leaf] that is reached is the result. *)
+let rec traverse_tree (tree : bool_tree) (segment : Feature.segment) : int =
+  match tree with
+  | Node node ->
+      let next = if node.test segment then node.t_branch else node.f_branch in
+      traverse_tree next segment
+  | Leaf sonority_value -> sonority_value
(** Get the sonority value (1-9) for an IPA character *)
-let sonority = Sonority.sonority
+let sonority (calc : t) (ipa : string) : int =
+  (* Resolve the segment in the calculator's own table, then classify it
+     with the calculator's decision tree. An unknown segment raises
+     [Failure]. *)
+  let features = Ipa_table.lookup_segment calc.ipa_table.table ipa in
+  match features with
+  | None -> failwith (Printf.sprintf "Unknown IPA segment: %s" ipa)
+  | Some segment -> traverse_tree calc.decision_tree segment
(** Get the sonority value from a feature specification *)
-let sonority_from_features = Sonority.sonority_from_features
+let sonority_from_features (calc : t) (segment : Feature.segment) : int =
+ traverse_tree calc.decision_tree segment
+
+(** Get the underlying IPA table *)
+let get_ipa_table (calc : t) : Ipa_table.t = calc.ipa_table
diff --git a/sorsyl/lib/sonorityold.ml b/sorsyl/lib/sonorityold.ml
new file mode 100644
index 0000000..65dd9e5
--- /dev/null
+++ b/sorsyl/lib/sonorityold.ml
@@ -0,0 +1,160 @@
+(** Sonority module for determining the sonority of phonetic segments.
+
+ This module provides functionality to determine the sonority of IPA
+ (International Phonetic Alphabet) segments on a scale of 1 to 9, where:
+ - 9: Low vowels (most sonorous)
+ - 8: High vowels
+ - 7: Glides/approximants
+ - 6: Liquids
+ - 5: Nasals
+ - 4: Voiced fricatives
+ - 3: Voiceless fricatives
+ - 2: Voiced stops
+ - 1: Voiceless stops (least sonorous) *)
+
+(** Decision tree for computing sonority values *)
+type bool_tree =
+ | Leaf of int (** Terminal node with sonority value *)
+ | Node of {
+ test : Feature.segment -> bool; (** Test function *)
+ t_branch : bool_tree; (** Branch to follow if test is true *)
+ f_branch : bool_tree; (** Branch to follow if test is false *)
+ }
+
+(** Main Sonority module functionality *)
+module Sonority = struct
+ (** Initialize the module by loading IPA data *)
+ let init data_dir =
+ let csv_file = Filename.concat data_dir "ipa_all.csv" in
+ Ipa_tableold.load_csv csv_file
+
+ (** Build the decision tree for sonority calculation *)
+ let build_tree () =
+ let open Feature in
+ let plusSyl = test (Syllabic, Plus) in
+ let minusHi = test (High, Minus) in
+ let minusCons = test (Consonantal, Minus) in
+ let plusSon = test (Sonorant, Plus) in
+ let minusNas = test (Nasal, Minus) in
+ let plusCont = test (Continuant, Plus) in
+ let plusVoi = test (Voiced, Plus) in
+
+ (* Build the tree bottom-up *)
+ let minusHi_branch =
+ Node
+ {
+ test = minusHi;
+ t_branch = Leaf 9;
+ (* -hi vowels = low vowels *)
+ f_branch = Leaf 8;
+ (* +hi vowels = high vowels *)
+ }
+ in
+
+ let plusVoi1_branch =
+ Node
+ {
+ test = plusVoi;
+ t_branch = Leaf 4;
+ (* +voi +cont = voiced fricatives *)
+ f_branch = Leaf 3;
+ (* -voi +cont = voiceless fricatives *)
+ }
+ in
+
+ let plusVoi2_branch =
+ Node
+ {
+ test = plusVoi;
+ t_branch = Leaf 2;
+ (* +voi -cont = voiced stops *)
+ f_branch = Leaf 1;
+ (* -voi -cont = voiceless stops *)
+ }
+ in
+
+ let plusCont_branch =
+ Node
+ {
+ test = plusCont;
+ t_branch = plusVoi1_branch;
+ (* +cont = fricatives *)
+ f_branch = plusVoi2_branch;
+ (* -cont = stops *)
+ }
+ in
+
+ let minusNas_branch =
+ Node
+ {
+ test = minusNas;
+ t_branch = Leaf 6;
+ (* -nas +son = liquids *)
+ f_branch = Leaf 5;
+ (* +nas +son = nasals *)
+ }
+ in
+
+ let plusSon_branch =
+ Node
+ {
+ test = plusSon;
+ t_branch = minusNas_branch;
+ (* +son = sonorants *)
+ f_branch = plusCont_branch;
+ (* -son = obstruents *)
+ }
+ in
+
+ let minusCons_branch =
+ Node
+ {
+ test = minusCons;
+ t_branch = Leaf 7;
+ (* -cons = glides *)
+ f_branch = plusSon_branch;
+ (* +cons = true consonants *)
+ }
+ in
+
+ Node
+ {
+ test = plusSyl;
+ t_branch = minusHi_branch;
+ (* +syl = vowels *)
+ f_branch = minusCons_branch;
+ (* -syl = non-vowels *)
+ }
+
+ (** Evaluate the decision tree for a segment *)
+ let rec eval_tree tree segment =
+ match tree with
+ | Leaf value -> value
+ | Node { test; t_branch; f_branch } ->
+ if test segment then eval_tree t_branch segment
+ else eval_tree f_branch segment
+
+ (** The main decision tree instance *)
+ let sonority_tree = lazy (build_tree ())
+
+ (** Get sonority value from feature specifications *)
+ let sonority_from_features segment =
+ eval_tree (Lazy.force sonority_tree) segment
+
+ (** Get sonority value from an IPA character *)
+ let sonority ipa =
+ match Ipa_tableold.lookup_segment ipa with
+ | Some features -> sonority_from_features features
+ | None -> failwith (Printf.sprintf "Unknown IPA segment: %s" ipa)
+end
+
+(** Public interface *)
+
+(** Initialize the sonority module with the data directory *)
+let init = Sonority.init
+
+(** Get the sonority value (1-9) for an IPA character *)
+let sonority = Sonority.sonority
+
+(** Get the sonority value from a feature specification *)
+let sonority_from_features = Sonority.sonority_from_features
diff --git a/sorsyl/lib/sonority.mli b/sorsyl/lib/sonorityold.mli
index 3e9166e..3e9166e 100644
--- a/sorsyl/lib/sonority.mli
+++ b/sorsyl/lib/sonorityold.mli
diff --git a/sorsyl/sorsyl.opam b/sorsyl/sorsyl.opam
index 7c561b3..72b5389 100644
--- a/sorsyl/sorsyl.opam
+++ b/sorsyl/sorsyl.opam
@@ -12,7 +12,10 @@ bug-reports: "https://github.com/username/reponame/issues"
depends: [
"dune" {>= "3.19"}
"ocaml"
+ "base"
+ "base_trie"
"csv"
+ "uunf"
"odoc" {with-doc}
]
build: [
diff --git a/sorsyl/test/dune b/sorsyl/test/dune
index 701e92d..e79f200 100644
--- a/sorsyl/test/dune
+++ b/sorsyl/test/dune
@@ -1,3 +1,11 @@
+; (test
+; (name test_sonorityold)
+; (libraries sorsyl))
+
(test
(name test_sonority)
(libraries sorsyl))
+
+(test
+ (name test_table)
+ (libraries sorsyl))
diff --git a/sorsyl/test/test_sonority.ml b/sorsyl/test/test_sonority.ml
index 70845a6..c24f4b4 100644
--- a/sorsyl/test/test_sonority.ml
+++ b/sorsyl/test/test_sonority.ml
@@ -1,11 +1,9 @@
-(** Tests for the Sonority module *)
+(** Tests for the functional Sonority module *)
open Sorsyl
-(** Test fixture - initialize the module once *)
-let () =
- (* Initialize with the data directory *)
- (* When run with dune test, the working directory is _build/default/test *)
+(** Test fixture - create the sonority calculator once *)
+let sonority_calc =
let data_dir =
if Sys.file_exists "./data" then "./data"
else if Sys.file_exists "../data" then "../data"
@@ -14,13 +12,13 @@ let () =
Printf.eprintf "Current directory: %s\n" (Sys.getcwd ());
failwith "Cannot find data directory")
in
- Sonority.init data_dir
+ Sonority.create data_dir
(** Test sonority value 9 - Low vowels *)
let test_sonority_nine () =
let segments = [ "a"; "ɑ"; "æ"; "ɒ"; "e"; "o̥" ] in
let expected = [ 9; 9; 9; 9; 9; 9 ] in
- let results = List.map Sonority.sonority segments in
+ let results = List.map (Sonority.sonority sonority_calc) segments in
assert (results = expected);
Printf.printf "test_sonority_nine: PASSED\n"
@@ -28,7 +26,7 @@ let test_sonority_nine () =
let test_sonority_eight () =
let segments = [ "i"; "y"; "ɨ"; "ʉ"; "ɯ"; "u" ] in
let expected = [ 8; 8; 8; 8; 8; 8 ] in
- let results = List.map Sonority.sonority segments in
+ let results = List.map (Sonority.sonority sonority_calc) segments in
assert (results = expected);
Printf.printf "test_sonority_eight: PASSED\n"
@@ -36,7 +34,7 @@ let test_sonority_eight () =
let test_sonority_seven () =
let segments = [ "j"; "w"; "ʋ"; "ɰ"; "ɹ"; "e̯" ] in
let expected = [ 7; 7; 7; 7; 7; 7 ] in
- let results = List.map Sonority.sonority segments in
+ let results = List.map (Sonority.sonority sonority_calc) segments in
assert (results = expected);
Printf.printf "test_sonority_seven: PASSED\n"
@@ -44,7 +42,7 @@ let test_sonority_seven () =
let test_sonority_six () =
let segments = [ "l"; "ɭ"; "r"; "ɾ" ] in
let expected = [ 6; 6; 6; 6 ] in
- let results = List.map Sonority.sonority segments in
+ let results = List.map (Sonority.sonority sonority_calc) segments in
assert (results = expected);
Printf.printf "test_sonority_six: PASSED\n"
@@ -52,7 +50,7 @@ let test_sonority_six () =
let test_sonority_five () =
let segments = [ "n"; "m"; "ŋ"; "ɴ" ] in
let expected = [ 5; 5; 5; 5 ] in
- let results = List.map Sonority.sonority segments in
+ let results = List.map (Sonority.sonority sonority_calc) segments in
assert (results = expected);
Printf.printf "test_sonority_five: PASSED\n"
@@ -60,15 +58,18 @@ let test_sonority_five () =
let test_sonority_four () =
let segments = [ "v"; "z"; "ʒ"; "ɣ" ] in
let expected = [ 4; 4; 4; 4 ] in
- let results = List.map Sonority.sonority segments in
+ let results = List.map (Sonority.sonority sonority_calc) segments in
+ let results_string =
+ List.fold_left (fun acc item -> Printf.sprintf "%s-%d" acc item) "" results
+ in
assert (results = expected);
- Printf.printf "test_sonority_four: PASSED\n"
+ Printf.printf "test_sonority_four: %s\nPASSED\n" results_string
(** Test sonority value 3 - Voiceless fricatives *)
let test_sonority_three () =
let segments = [ "f"; "s"; "x"; "ħ"; "ʃ" ] in
let expected = [ 3; 3; 3; 3; 3 ] in
- let results = List.map Sonority.sonority segments in
+ let results = List.map (Sonority.sonority sonority_calc) segments in
assert (results = expected);
Printf.printf "test_sonority_three: PASSED\n"
@@ -76,7 +77,7 @@ let test_sonority_three () =
let test_sonority_two () =
let segments = [ "b"; "ɡ"; "d"; "ɢ" ] in
let expected = [ 2; 2; 2; 2 ] in
- let results = List.map Sonority.sonority segments in
+ let results = List.map (Sonority.sonority sonority_calc) segments in
assert (results = expected);
Printf.printf "test_sonority_two: PASSED\n"
@@ -84,14 +85,14 @@ let test_sonority_two () =
let test_sonority_one () =
let segments = [ "p"; "k"; "c"; "q" ] in
let expected = [ 1; 1; 1; 1 ] in
- let results = List.map Sonority.sonority segments in
+ let results = List.map (Sonority.sonority sonority_calc) segments in
assert (results = expected);
Printf.printf "test_sonority_one: PASSED\n"
(** Test unknown segment handling *)
let test_unknown_segment () =
try
- let _ = Sonority.sonority "🦆" in
+ let _ = Sonority.sonority sonority_calc "🦆" in
assert false (* Should not reach here *)
with
| Failure msg when String.sub msg 0 20 = "Unknown IPA segment:" ->
@@ -110,14 +111,30 @@ let test_sonority_from_features () =
(Feature.Voiced, Feature.Minus);
]
in
- let result = Sonority.sonority_from_features segment in
+ let result = Sonority.sonority_from_features sonority_calc segment in
assert (result = 1);
Printf.printf "test_sonority_from_features: PASSED\n"
+(** Two calculators built from the same data directory must each answer
+    queries on their own (the module keeps no global state). *)
+let test_multiple_calculators () =
+  (* Probe the candidate locations in order and keep the first that exists. *)
+  let candidates = [ "./data"; "../data"; "../../../data" ] in
+  let data_dir =
+    match List.find_opt Sys.file_exists candidates with
+    | Some dir -> dir
+    | None -> failwith "Cannot find data directory"
+  in
+  let first = Sonority.create data_dir in
+  let second = Sonority.create data_dir in
+  (* Query each instance separately, first then second. *)
+  List.iter
+    (fun calc -> assert (Sonority.sonority calc "a" = 9))
+    [ first; second ];
+  Printf.printf "test_multiple_calculators: PASSED\n"
+
(** Run all tests *)
let () =
Printf.printf "Running Sonority module tests...\n";
- Printf.printf "================================\n";
+ Printf.printf "===================================\n";
test_sonority_nine ();
test_sonority_eight ();
test_sonority_seven ();
@@ -129,5 +146,6 @@ let () =
test_sonority_one ();
test_unknown_segment ();
test_sonority_from_features ();
- Printf.printf "================================\n";
- Printf.printf "All tests passed!\n"
+ test_multiple_calculators ();
+ Printf.printf "===================================\n";
+ Printf.printf "All Sonority tests passed!\n"
diff --git a/sorsyl/test/test_sonorityold.ml b/sorsyl/test/test_sonorityold.ml
new file mode 100644
index 0000000..70845a6
--- /dev/null
+++ b/sorsyl/test/test_sonorityold.ml
@@ -0,0 +1,133 @@
+(** Tests for the Sonority module *)
+
+open Sorsyl
+
+(** Test fixture - initialize the module once *)
+let () =
+ (* Initialize with the data directory *)
+ (* When run with dune test, the working directory is _build/default/test *)
+ let data_dir =
+ if Sys.file_exists "./data" then "./data"
+ else if Sys.file_exists "../data" then "../data"
+ else if Sys.file_exists "../../../data" then "../../../data"
+ else (
+ Printf.eprintf "Current directory: %s\n" (Sys.getcwd ());
+ failwith "Cannot find data directory")
+ in
+ Sonority.init data_dir
+
+(** Test sonority value 9 - Low vowels *)
+let test_sonority_nine () =
+ let segments = [ "a"; "ɑ"; "æ"; "ɒ"; "e"; "o̥" ] in
+ let expected = [ 9; 9; 9; 9; 9; 9 ] in
+ let results = List.map Sonority.sonority segments in
+ assert (results = expected);
+ Printf.printf "test_sonority_nine: PASSED\n"
+
+(** Test sonority value 8 - High vowels *)
+let test_sonority_eight () =
+ let segments = [ "i"; "y"; "ɨ"; "ʉ"; "ɯ"; "u" ] in
+ let expected = [ 8; 8; 8; 8; 8; 8 ] in
+ let results = List.map Sonority.sonority segments in
+ assert (results = expected);
+ Printf.printf "test_sonority_eight: PASSED\n"
+
+(** Test sonority value 7 - Glides/approximants *)
+let test_sonority_seven () =
+ let segments = [ "j"; "w"; "ʋ"; "ɰ"; "ɹ"; "e̯" ] in
+ let expected = [ 7; 7; 7; 7; 7; 7 ] in
+ let results = List.map Sonority.sonority segments in
+ assert (results = expected);
+ Printf.printf "test_sonority_seven: PASSED\n"
+
+(** Test sonority value 6 - Liquids *)
+let test_sonority_six () =
+ let segments = [ "l"; "ɭ"; "r"; "ɾ" ] in
+ let expected = [ 6; 6; 6; 6 ] in
+ let results = List.map Sonority.sonority segments in
+ assert (results = expected);
+ Printf.printf "test_sonority_six: PASSED\n"
+
+(** Test sonority value 5 - Nasals *)
+let test_sonority_five () =
+ let segments = [ "n"; "m"; "ŋ"; "ɴ" ] in
+ let expected = [ 5; 5; 5; 5 ] in
+ let results = List.map Sonority.sonority segments in
+ assert (results = expected);
+ Printf.printf "test_sonority_five: PASSED\n"
+
+(** Test sonority value 4 - Voiced fricatives *)
+let test_sonority_four () =
+ let segments = [ "v"; "z"; "ʒ"; "ɣ" ] in
+ let expected = [ 4; 4; 4; 4 ] in
+ let results = List.map Sonority.sonority segments in
+ assert (results = expected);
+ Printf.printf "test_sonority_four: PASSED\n"
+
+(** Test sonority value 3 - Voiceless fricatives *)
+let test_sonority_three () =
+ let segments = [ "f"; "s"; "x"; "ħ"; "ʃ" ] in
+ let expected = [ 3; 3; 3; 3; 3 ] in
+ let results = List.map Sonority.sonority segments in
+ assert (results = expected);
+ Printf.printf "test_sonority_three: PASSED\n"
+
+(** Test sonority value 2 - Voiced stops *)
+let test_sonority_two () =
+ let segments = [ "b"; "ɡ"; "d"; "ɢ" ] in
+ let expected = [ 2; 2; 2; 2 ] in
+ let results = List.map Sonority.sonority segments in
+ assert (results = expected);
+ Printf.printf "test_sonority_two: PASSED\n"
+
+(** Test sonority value 1 - Voiceless stops *)
+let test_sonority_one () =
+ let segments = [ "p"; "k"; "c"; "q" ] in
+ let expected = [ 1; 1; 1; 1 ] in
+ let results = List.map Sonority.sonority segments in
+ assert (results = expected);
+ Printf.printf "test_sonority_one: PASSED\n"
+
+(** Test unknown segment handling *)
+let test_unknown_segment () =
+ try
+ let _ = Sonority.sonority "🦆" in
+ assert false (* Should not reach here *)
+ with
+ | Failure msg when String.sub msg 0 20 = "Unknown IPA segment:" ->
+ Printf.printf "test_unknown_segment: PASSED\n"
+ | _ -> assert false
+
+(** Test feature-based sonority calculation *)
+let test_sonority_from_features () =
+ (* Test a simple voiceless stop: -syl, +cons, -son, -cont, -voi *)
+ let segment =
+ [
+ (Feature.Syllabic, Feature.Minus);
+ (Feature.Consonantal, Feature.Plus);
+ (Feature.Sonorant, Feature.Minus);
+ (Feature.Continuant, Feature.Minus);
+ (Feature.Voiced, Feature.Minus);
+ ]
+ in
+ let result = Sonority.sonority_from_features segment in
+ assert (result = 1);
+ Printf.printf "test_sonority_from_features: PASSED\n"
+
+(** Run all tests *)
+let () =
+ Printf.printf "Running Sonority module tests...\n";
+ Printf.printf "================================\n";
+ test_sonority_nine ();
+ test_sonority_eight ();
+ test_sonority_seven ();
+ test_sonority_six ();
+ test_sonority_five ();
+ test_sonority_four ();
+ test_sonority_three ();
+ test_sonority_two ();
+ test_sonority_one ();
+ test_unknown_segment ();
+ test_sonority_from_features ();
+ Printf.printf "================================\n";
+ Printf.printf "All tests passed!\n"
diff --git a/sorsyl/test/test_table.ml b/sorsyl/test/test_table.ml
new file mode 100644
index 0000000..89cb4a2
--- /dev/null
+++ b/sorsyl/test/test_table.ml
@@ -0,0 +1,208 @@
+open Sorsyl
+
+(* let es = *)
+(* String.split_on_char ' ' *)
+(* "la ˌinteɾnˌaθjonˌaliθaθjˈon del kˌoɾaθˈon i el ðˌikθjonˈaɾjo" *)
+
+(* let de = *)
+(* [ *)
+(* "kɔmˈpjuːtɐ"; *)
+(* "teleˈfoːn"; *)
+(* "fɑˈmiːliːjə"; *)
+(* "ˈʔɑːpɔtekə"; *)
+(* "ˈʃoːkolɑdə"; *)
+(* "ˈtoːmɑtən"; *)
+(* "ˈbananə"; *)
+(* "ˈpoːlitsae"; *)
+(* "ˈmuːzɔøm"; *)
+(* "ˈbʏçɐ̯ae"; *)
+(* "mediˈt͡siːn"; *)
+(* "ˈpɾoːfɛszoɾ"; *)
+(* "ʔeleˈfɑnt"; *)
+(* "dokumɛnt"; *)
+(* "ˈʔɪntɛɾnət"; *)
+(* "ˈʔʊnʔivɛɾziˈtɛːt"; *)
+(* "ˈkaɾtɔffɛln"; *)
+(* "ˈmatemɑtik"; *)
+(* "gəˈbʊɾtstɑk"; *)
+(* "ˈvœʁtɐˌbuːx"; *)
+(* "ˈbɪbli̯oːtək"; *)
+(* "demɔkɾɑˈtiːjə"; *)
+(* "fotɔgɾɑˈfiːjə"; *)
+(* "tɛçnoloˈgiːjə"; *)
+(* "bioloˈgiːjə"; *)
+(* "psʏçoloˈgiːjə"; *)
+(* "filozɔfiːjə"; *)
+(* "ˈʃoːkolɑdə"; *)
+(* "ˈmaɾmelɑdə"; *)
+(* "ˈzɛkɾetɛɾɪn"; *)
+(* "ʔɛntˈʃʊldɪgʊŋ"; *)
+(* "ˈkɾaŋkɛnvɛɾzɪçɐ̯ʊŋ"; *)
+(* "gəˈbʊɾtstagspaɾty"; *)
+(* "kɔmmunikɑˈtsĭoːn"; *)
+(* "ʔɔɾgɑnizɑˈtsĭoːn"; *)
+(* "bʏɾgɛɾˈməɪstɐ̯"; *)
+(* "zeɛnsvʏɾdɪkˈkəiːt"; *)
+(* "ˈmiːneɾalvaszɐ̯"; *)
+(* "ˈtsuːzammenaɾˈbəiːt"; *)
+(* "mœglɪçˈkəiːtən"; *)
+(* "gəlegɛnˈəiːtən"; *)
+(* "naxˈmɪtˌtɑːk"; *)
+(* "ʔɪnfɔɾmɑˈtsĭoːn"; *)
+(* "televiˈzĭoːn"; *)
+(* "gəʃvɪndɪkkaetsbɛgɾentsʊŋ"; *)
+(* "ˈkɾaŋkɛnaozaofɛntalt"; *)
+(* "ʔaɾbaetslozɪkˈkəiːt"; *)
+(* "fɛːɐ̯ˈʔantvɔɾtlɪçˈkəiːt"; *)
+(* "zeɛnsvʏɾdɪkˈkəiːtən"; *)
+(* "ˈzɛlpstvɛɾstɛntlɪç"; *)
+(* "ˈvaenaxtsgɛʃɛŋkə"; *)
+(* "gəˈbʊɾtstaksgɛʃɛŋk"; *)
+(* "tuɾɪstenɪnfɔɾmɑˈtsĭoːn"; *)
+(* "ˈʔʊnˌʔiːvɛɾzitɛtspɾofɛszoɾ"; *)
+(* "ˈleːbɛnsmɪtɛlgɛʃɛft"; *)
+(* "ˈfɑɾɾatvɛɾlae"; *)
+(* "ˈbʊndɛstakzapgeɔɾdnətɐ̯"; *)
+(* "ˈʃtɾaeçɔltsʃɛçtɛlçən"; *)
+(* "ˈfɾɔøntʃaftsbetsiːʊŋən"; *)
+(* "ˈɾɛçtsʃʊtsvɛɾzɪçɐ̯ʊŋ"; *)
+(* "nɑɾʊŋsmɪtelʊnvɛɾtɾɛglɪçˈkəiːt"; *)
+(* ] *)
+
+(* let en1 = *)
+(* [ *)
+(* "ˈæpəɫ"; *)
+(* "ˈðɪs"; *)
+(* "ˈɪs"; *)
+(* "ˈeɪ"; *)
+(* "ɫɪŋˈɡwɪstɪks"; *)
+(* "kəˈtæstɹəfi"; *)
+(* "wɪˈθaʊt"; *)
+(* "ˈpɹɛsədənt"; *)
+(* "ˈtu"; *)
+(* "ədˈmɪt"; *)
+(* "ˈænd"; *)
+(* "ˈɪts"; *)
+(* "ˌɪnstəˈɡeɪʃən"; *)
+(* "ˈʃʊd"; *)
+(* "ˈbi"; *)
+(* "ˈpənɪʃt"; *)
+(* ] *)
+
+(** IPA-transcribed words, most carrying stress marks, that [test_segs] feeds
+    to [Ipa_table.ipa_segs]. Presumably English, judging by the name (TODO
+    confirm). Some entries are repeated and one is commented out. *)
+let en2 =
+ [
+ "d͡ʒɹ̩mən";
+ "ˈpɹɛzənt";
+ "ˈɑɹtɪkəɫ";
+ "pɹəˈvaɪdz";
+ "ˌɹiəˈnæɫəsəs";
+ "kˈɔːɹɑːnəl";
+ "kɝˈoʊnəɫ";
+ "ˈɑptɪks";
+ "ˈɛksəɫəns";
+ "əbˈstɹuənts";
+ "ˈdʒɝmən";
+ "ˈɛŋɡɫɪʃ";
+ "ˈkɑmənɫi";
+ "əˈsumd";
+ "ˈdʒɪps";
+ "ˈpɫæstɝ";
+ "ˈɛŋɡɫɪʃ";
+ (* "ˈɫæps"; *)
+ "ˈɑɹɡjud";
+ "bɪˈɫoʊ";
+ "ˌɛkstɹəsɪˈɫæbɪk";
+ "ˈkɑnsənənts";
+ "ˌdɛɹəˈveɪʃənəɫ";
+ "ˈsteɪdʒ";
+ "ˌɛkstɹəsɪɫəˈbɪsɪti";
+ "ˈaɪðɝ";
+ "ˈɫæŋɡwɪdʒɪz";
+ "ɪɡˈzɪsts";
+ "ˈɛvədəns";
+ "ˈkɑmənɫi";
+ "pɹɪˈzɛntɪd";
+ "səˈpɔɹt";
+ "ˌɛkstɹəsɪˈɫæbɪk";
+ "ˈkɑnsənənts";
+ "kəmˈpætəbəɫ";
+ "ˈfʊɫi";
+ "ˈsɝfəs";
+ "ˌɑptəˈmæɫəti";
+ "ˌθiɝˈɛtɪk";
+ "ˈtɹitmənt";
+ "kənˈstɹeɪnts";
+ "ɹɪˈfɝɪŋ";
+ "ˈfʊɫi";
+ "səˈɫæbəˌfaɪd";
+ "ˈaʊtˌpʊt";
+ "ˌɹɛpɹəzɛnˈteɪʃənz";
+ "pɹəˈpoʊzd";
+ ]
+
+(* let zh = [ "/t͡ɕi⁵¹ ti⁵¹ pʰi⁵¹/" ] *)
+
+(* let ws = *)
+(* [ *)
+(* ( "/nuˌmɑ.noʊ.ʌl.tɹə.maɪ.kɹoʊˈskɑ.pɪkˌsɪ.lɪ.koʊ.vɑl.keɪ.noʊ.koʊ.niˈoʊ.sɪs/", *)
+(* "/əˈbæn.dn̩.əd.li/", *)
+(* " /əˈbæn.dn̩.əd.li/", *)
+(* "/əˈbæn.dn̩.mn̩t/", *)
+(* "/-ˌbiːə-/", *)
+(* "/əˈbluː.ʃn̩/xx" ); *)
+(* ] *)
+
+(** Result of [Ipa_table.load_csv] on the first existing directory among
+    ["./data"], ["../data"] and ["../../../data"].
+
+    This is a plain value, not a function: the directory probe and the load
+    run once, when this module is initialised.
+    @raise Failure if none of the candidate directories exists (the current
+    working directory is printed to stderr first). *)
+let get_data =
+  print_string "Getting data\n";
+  let candidates = [ "./data"; "../data"; "../../../data" ] in
+  match List.find_opt Sys.file_exists candidates with
+  | Some data_dir -> Ipa_table.load_csv data_dir
+  | None ->
+      Printf.eprintf "Current directory: %s\n" (Sys.getcwd ());
+      failwith "Cannot find data directory"
+
+(** Smoke check for [Ipa_table.fts]: looks up ["s"] in [table] and prints the
+    resulting segment via [Feature.string_of_segment]. Prints nothing when the
+    lookup returns [None]; nothing is asserted. *)
+let test_fts table =
+  let show seg = Printf.printf "fts\n %s\n" (Feature.string_of_segment seg) in
+  match Ipa_table.fts table "s" with
+  | Some seg -> show seg
+  | None -> ()
+(* let expected = None in *)
+(* let results = Ipa_table.fts *)
+(* assert (result = expected); *)
+(* Printf.printf "test_fts: PASSED\n" *)
+
+(** Smoke test for [Ipa_table.ipa_segs].
+
+    For every word in [en2], segments the word with [data] and prints the word
+    on one line followed by its segments on the next, each segment prefixed
+    with ["-"].
+
+    TODO: no expected segmentation is asserted yet, so this only checks that
+    segmentation runs without raising. *)
+let test_segs data =
+  Base.List.iter en2 ~f:(fun word ->
+      (* Segment before printing so a failure is not preceded by a
+         half-printed record for this word. *)
+      let segs = Ipa_table.ipa_segs data word in
+      Printf.printf "%s\n" word;
+      (* Print segments directly rather than building the line by repeated
+         string concatenation. *)
+      Base.List.iter segs ~f:(fun seg -> Printf.printf "-%s" seg);
+      Printf.printf "\n");
+  (* The original reported "test_fts" here, a copy-paste slip. *)
+  Printf.printf "test_segs: PASSED\n"
+
+(** Entry point: takes the loaded IPA data from [get_data], then runs the
+    table tests between two banner lines. *)
+let () =
+  let data = get_data in
+  let rule = "===================================\n" in
+  print_string "Running IPA Table module tests...\n";
+  print_string rule;
+  test_fts data.table;
+  test_segs data;
+  print_string rule;
+  print_string "All IPA Table tests passed!\n"