summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--  sorsyl/lib/syllabifier.ml        187
-rw-r--r--  sorsyl/lib/syllabifier.mli        24
-rw-r--r--  sorsyl/lib/syllable.ml            97
-rw-r--r--  sorsyl/lib/syllable.mli           54
-rw-r--r--  sorsyl/test/dune                   4
-rw-r--r--  sorsyl/test/test_syllabifier.ml   49
6 files changed, 415 insertions, 0 deletions
diff --git a/sorsyl/lib/syllabifier.ml b/sorsyl/lib/syllabifier.ml
new file mode 100644
index 0000000..af4ce17
--- /dev/null
+++ b/sorsyl/lib/syllabifier.ml
@@ -0,0 +1,187 @@
+(** Syllabifier module for segmenting words into syllables using sonority *)
+
+open Base
+
+(** Result of syllabification, as built by [syllabify] *)
+type syllabified = {
+ word : string; (** Orthographic form, copied from the [~word] argument *)
+ ipa : string; (** IPA transcription, copied from the [~ipa] argument *)
+ lang : string; (** Language code, copied from the [~lang] argument *)
+ clean_ipa : string; (** Concatenation of the segments returned by [Ipa_table.ipa_segs] *)
+ syllables : Syllable.t list; (** Syllables in word order *)
+}
+
+(** Accumulator threaded through the per-segment fold in [syllabify] *)
+type state = {
+ sonority : Sonority.t; (** Sonority lookup used for the diphthong test *)
+ ipa_table : Ipa_table.t; (** Table obtained from [Sonority.get_ipa_table] *)
+ segments : string list; (** All segments of the word, in order *)
+ stress_idx : int; (** Position of the stress mark as computed by [syllabify], or [-1] when absent *)
+ syllables : Syllable.t list; (** Finalized syllables, most recent first *)
+ current_syllable : Syllable.t; (** Syllable still being assembled *)
+}
+
+(** Decide whether [segment] is a tone marker.
+
+    A segment with no feature entry in [ipa_table] is never a tone.
+    Otherwise it is a tone unless both [HighTone] and [HighReg] are
+    zero-valued; the mid-tone letter "˧" is always accepted once it has a
+    feature entry. *)
+let is_tone ipa_table segment =
+  let tonal features =
+    let toneless =
+      Feature.has_feature (Feature.HighTone, Feature.Zero) features
+      && Feature.has_feature (Feature.HighReg, Feature.Zero) features
+    in
+    (not toneless) || String.equal segment "˧"
+  in
+  Option.value_map (Ipa_table.fts ipa_table segment) ~default:false ~f:tonal
+
+(** Decide whether [segment] can serve as a syllable nucleus, i.e. whether
+    its feature entry in [ipa_table] carries [+Syllabic].  Segments without
+    a feature entry are not nuclei. *)
+let is_nucleus ipa_table segment =
+  let syllabic = (Feature.Syllabic, Feature.Plus) in
+  Ipa_table.fts ipa_table segment
+  |> Option.exists ~f:(fun features -> Feature.has_feature syllabic features)
+
+(** Vowel test; vowels are identified with nuclei, so this simply defers to
+    [is_nucleus]. *)
+let is_vowel ipa_table segment = is_nucleus ipa_table segment
+
+(** [has_vowel_remaining ipa_table segments idx] is [true] when some segment
+    at position [idx] or later is a vowel.
+
+    The list is walked once: the first [idx] segments are dropped and the
+    rest scanned, instead of re-measuring and re-indexing the list on every
+    step.  An [idx] at or past the end yields [false]. *)
+let has_vowel_remaining ipa_table segments idx =
+  List.drop segments idx |> List.exists ~f:(is_vowel ipa_table)
+
+(** Sonority of a nucleus that may span several characters.
+
+    The whole [nucleus] is looked up first.  If that fails, the nucleus is
+    walked one UTF-8 encoded character at a time and the sonority of the last
+    character that has one is returned ([0] if none does).  Walking by
+    character rather than by byte matters because most IPA vowels occupy
+    more than one byte, so a single byte of theirs can never be looked up. *)
+let get_nucleus_sonority sonority nucleus =
+  (* NOTE(review): which exception Sonority.sonority raises for an unknown
+     segment is not visible here, so the original catch-all is kept. *)
+  let lookup seg =
+    (try Some (Sonority.sonority sonority seg) with _ -> None)
+  in
+  match lookup nucleus with
+  | Some value -> value
+  | None ->
+    let total = String.length nucleus in
+    (* Byte length of the character starting at [pos], derived from its lead
+       byte and clamped so a truncated sequence cannot run off the end.
+       Stray continuation bytes are treated as one-byte characters. *)
+    let char_len pos =
+      let lead = Char.to_int nucleus.[pos] in
+      let expected =
+        if lead < 0xC0 then 1
+        else if lead < 0xE0 then 2
+        else if lead < 0xF0 then 3
+        else 4
+      in
+      Int.min expected (total - pos)
+    in
+    let rec scan pos best =
+      if pos >= total then best
+      else
+        let len = char_len pos in
+        let found = lookup (String.sub nucleus ~pos ~len) in
+        scan (pos + len) (Option.value found ~default:best)
+    in
+    scan 0 0
+
+(** [next_has_features ipa_table segments idx features] is [true] when the
+    segment right after position [idx] exists, has a feature entry in
+    [ipa_table], and carries every [(feature, value)] pair in [features].
+
+    The following segment is fetched with the option-returning [List.nth],
+    so the list is traversed once and an out-of-range position gives
+    [false] rather than an exception. *)
+let next_has_features ipa_table segments idx features =
+  let carries_all seg_features =
+    List.for_all features ~f:(fun (feat, value) ->
+      Feature.has_feature (feat, value) seg_features)
+  in
+  List.nth segments (idx + 1)
+  |> Option.bind ~f:(fun next -> Ipa_table.fts ipa_table next)
+  |> Option.exists ~f:carries_all
+
+(** Close the syllable under construction at position [idx], push it onto
+    the finished list, and continue with [syllable], whose [start_idx] is
+    set to [idx]. *)
+let new_syllable state syllable idx =
+  let { current_syllable; stress_idx; syllables = finished; _ } = state in
+  let closed = Syllable.finalize current_syllable ~end_idx:idx ~stress_idx in
+  { state with
+    syllables = closed :: finished;
+    current_syllable = { syllable with start_idx = idx };
+  }
+
+(** Fold step: place [segment] (found at position [idx]) into the syllable
+    structure held by [state] and return the updated state.
+
+    Decision order:
+    - a tone is appended to the current syllable's tone;
+    - a nucleus fills an empty nucleus, extends it when sonority strictly
+      falls (diphthong), and otherwise opens a new syllable;
+    - a consonant goes to the onset while there is no nucleus yet, to the
+      coda when it is the last segment, opens a new syllable when the next
+      segment is syllabic, goes to the coda when no vowel remains or the
+      next segment is not a non-nasal sonorant, and otherwise opens a new
+      syllable.
+
+    The look-ahead scans over [state.segments] are linear, so they are only
+    evaluated on the consonant branches that actually consult them. *)
+let process_segment state segment idx =
+  let table = state.ipa_table.table in
+  let current = state.current_syllable in
+  let continue_with syllable = { state with current_syllable = syllable } in
+  let next_is features = next_has_features table state.segments idx features in
+  if is_tone table segment then
+    continue_with (Syllable.append_tone current segment)
+  else if is_nucleus table segment then (
+    if String.is_empty current.nucleus then
+      (* First vowel of the syllable *)
+      continue_with { current with nucleus = segment }
+    else
+      (* A nucleus is already present: diphthong or new syllable? *)
+      let nucleus_sonority =
+        get_nucleus_sonority state.sonority current.nucleus
+      in
+      (* A segment without a sonority value counts as 0 *)
+      let segment_sonority =
+        (try Sonority.sonority state.sonority segment with _ -> 0)
+      in
+      if nucleus_sonority > segment_sonority then
+        continue_with (Syllable.append_nucleus current segment)
+      else new_syllable state (Syllable.create ~nucleus:segment ()) idx)
+  else if String.is_empty current.nucleus then
+    (* Consonant before any nucleus *)
+    continue_with (Syllable.append_onset current segment)
+  else if idx = List.length state.segments - 1 then
+    (* Word-final consonant *)
+    continue_with (Syllable.append_coda current segment)
+  else if next_is [ (Feature.Syllabic, Feature.Plus) ] then
+    (* Consonant directly before a vowel becomes that vowel's onset *)
+    new_syllable state (Syllable.create ~onset:segment ()) idx
+  else if
+    (not (has_vowel_remaining table state.segments idx))
+    || not
+         (next_is
+            [ (Feature.Sonorant, Feature.Plus); (Feature.Nasal, Feature.Minus) ])
+  then continue_with (Syllable.append_coda current segment)
+  else
+    (* Followed by a non-nasal sonorant with a vowel still to come *)
+    new_syllable state (Syllable.create ~onset:segment ()) idx
+
+(** Syllabify a word given its IPA transcription.
+
+    [ipa] is segmented with [Ipa_table.ipa_segs], the segments are folded
+    through [process_segment], and the syllable still open at the end is
+    closed at the end of the segment list.  The result therefore always
+    contains at least one syllable, even for an empty transcription.
+
+    Stress: the first primary stress mark in [ipa] is located by byte
+    offset, and that offset is converted into a segment index by counting
+    the segments of the text before the mark.  Syllable boundaries are
+    segment indexes, so comparing them with a raw byte offset would be
+    meaningless.  A syllable is stressed when that index falls in the
+    half-open range from [start_idx] up to but excluding [end_idx], so a
+    mark sitting exactly on a boundary selects only the syllable that
+    starts there.
+
+    NOTE(review): this assumes segmenting a prefix of [ipa] yields a prefix
+    of the full segmentation - confirm against [Ipa_table.ipa_segs]. *)
+let syllabify ~sonority ~ipa ~word ~lang =
+  let ipa_table = Sonority.get_ipa_table sonority in
+  (* Tone normalization is not implemented yet: the transcription is
+     segmented exactly as given. *)
+  let segments = Ipa_table.ipa_segs ipa_table ipa in
+  let clean_ipa = String.concat segments ~sep:"" in
+  let stress_idx =
+    match String.substr_index ipa ~pattern:"ˈ" with
+    | None -> -1
+    | Some byte_pos ->
+      List.length (Ipa_table.ipa_segs ipa_table (String.prefix ipa byte_pos))
+  in
+  let init_state = {
+    sonority;
+    ipa_table;
+    segments;
+    stress_idx;
+    syllables = [];
+    current_syllable = Syllable.empty;
+  } in
+  let final_state =
+    List.foldi segments ~init:init_state ~f:(fun idx state segment ->
+      process_segment state segment idx)
+  in
+  let last_syl =
+    Syllable.finalize final_state.current_syllable
+      ~end_idx:(List.length segments)
+      ~stress_idx
+  in
+  let mark_stress (syl : Syllable.t) =
+    { syl with
+      stressed =
+        stress_idx >= 0 && syl.start_idx <= stress_idx && stress_idx < syl.end_idx;
+    }
+  in
+  (* The accumulator is newest-first; rev_map restores word order. *)
+  let syllables =
+    List.rev_map (last_syl :: final_state.syllables) ~f:mark_stress
+  in
+  { word; ipa; lang; clean_ipa; syllables }
\ No newline at end of file
diff --git a/sorsyl/lib/syllabifier.mli b/sorsyl/lib/syllabifier.mli
new file mode 100644
index 0000000..e40feeb
--- /dev/null
+++ b/sorsyl/lib/syllabifier.mli
@@ -0,0 +1,24 @@
+(** Syllabifier module for segmenting words into syllables using sonority *)
+
+(** Result of syllabification *)
+type syllabified = {
+ word : string; (** Original orthographic word, as passed to [syllabify] *)
+ ipa : string; (** Original IPA transcription, as passed to [syllabify] *)
+ lang : string; (** Language code, as passed to [syllabify] *)
+ clean_ipa : string; (** The IPA segments joined back into one string; presumably lacks whatever the segmenter drops - TODO confirm *)
+ syllables : Syllable.t list; (** Syllables in word order; never empty *)
+}
+
+(** Syllabify a word given its IPA transcription.
+
+ [word] and [lang] are only copied into the result. The first primary
+ stress mark found in [ipa], if any, is used to flag stressed syllables.
+ The returned syllable list always holds at least one syllable, even when
+ [ipa] yields no segments.
+
+ @param sonority The sonority calculator
+ @param ipa The IPA transcription of the word
+ @param word The orthographic form of the word
+ @param lang The language code
+ @return The syllabified result
+*)
+val syllabify :
+ sonority:Sonority.t ->
+ ipa:string ->
+ word:string ->
+ lang:string ->
+ syllabified \ No newline at end of file
diff --git a/sorsyl/lib/syllable.ml b/sorsyl/lib/syllable.ml
new file mode 100644
index 0000000..ef72736
--- /dev/null
+++ b/sorsyl/lib/syllable.ml
@@ -0,0 +1,97 @@
+(** Module for representing syllables and their components *)
+
+open Base
+
+(** Type representing a syllable with its phonological components.
+ The string fields are built up by concatenation through the [append_*]
+ functions; [all] joins onset, medial, nucleus, coda and tone in that
+ order ([spelling] is not part of it). *)
+type t = {
+ onset : string; (** Initial consonants *)
+ medial : string; (** Medial consonants (between onset and nucleus) *)
+ nucleus : string; (** Vowel core of the syllable *)
+ coda : string; (** Final consonants *)
+ tone : string; (** Tonal information *)
+ spelling : string; (** Orthographic representation *)
+ start_idx : int; (** Start position in the word *)
+ end_idx : int; (** End position in the word; set by [finalize] *)
+ stressed : bool; (** Whether this syllable is stressed; set by [finalize] *)
+}
+
+(** The empty syllable: every string field is [""], both indexes are [0],
+ and it is not stressed. This is a constant, not a constructor. *)
+let empty = {
+ onset = "";
+ medial = "";
+ nucleus = "";
+ coda = "";
+ tone = "";
+ spelling = "";
+ start_idx = 0;
+ end_idx = 0;
+ stressed = false;
+}
+
+(** Build a syllable from whichever components are supplied.  Omitted string
+    components default to [""], omitted indexes to [0], and [stressed] to
+    [false].  The trailing [unit] lets the optional arguments be erased. *)
+let create
+    ?(onset = "")
+    ?(medial = "")
+    ?(nucleus = "")
+    ?(coda = "")
+    ?(tone = "")
+    ?(spelling = "")
+    ?(start_idx = 0)
+    ?(end_idx = 0)
+    ?(stressed = false)
+    () =
+  {
+    onset = onset;
+    medial = medial;
+    nucleus = nucleus;
+    coda = coda;
+    tone = tone;
+    spelling = spelling;
+    start_idx = start_idx;
+    end_idx = end_idx;
+    stressed = stressed;
+  }
+
+(** The whole syllable as one string: onset, medial, nucleus, coda and tone
+    joined in that order. *)
+let all { onset; medial; nucleus; coda; tone; _ } =
+  String.concat [ onset; medial; nucleus; coda; tone ]
+
+(** The rhyme of the syllable: medial, nucleus and coda joined in that
+    order. *)
+let rhyme { medial; nucleus; coda; _ } =
+  String.concat [ medial; nucleus; coda ]
+
+(** A syllable is long when its nucleus contains the IPA length mark. *)
+let is_long { nucleus; _ } = String.is_substring nucleus ~substring:"ː"
+
+(** Close a syllable: record [end_idx] and decide whether it is stressed.
+
+    [end_idx] is exclusive - the syllabifier passes the same index as the
+    [start_idx] of the syllable that follows - so the stress test uses the
+    half-open range from [t.start_idx] up to but excluding [end_idx].
+    Treating the end as inclusive would flag both neighbours whenever the
+    stress position coincides with a boundary.
+
+    @param end_idx index one past the last position of the syllable
+    @param stress_idx position of the stress mark; negative means none *)
+let finalize t ~end_idx ~stress_idx =
+  let stressed =
+    stress_idx >= 0 && t.start_idx <= stress_idx && stress_idx < end_idx
+  in
+  { t with end_idx; stressed }
+
+(** Render a syllable as a small ASCII text box, one row per component
+    (whole syllable, onset, medial, nucleus, coda, tone) plus a final
+    [* stressed] row for stressed syllables only.
+
+    Rows are padded or truncated to 10 characters.  Width is measured in
+    UTF-8 encoded characters, not bytes, so rows holding IPA symbols line up
+    with the border and truncation never splits a character.  Combining
+    marks still count as one column each, so heavily marked text may look
+    slightly wide. *)
+let pretty_print t =
+  let box_width = 10 in
+  (* Every byte except a UTF-8 continuation byte (0x80-0xBF) starts a
+     character. *)
+  let starts_char c =
+    let b = Char.to_int c in
+    b < 0x80 || b >= 0xC0
+  in
+  let char_count s = String.count s ~f:starts_char in
+  (* Byte offset just past the first [box_width] characters of [s]. *)
+  let cut_offset s =
+    let rec go pos seen =
+      if pos >= String.length s then pos
+      else if not (starts_char s.[pos]) then go (pos + 1) seen
+      else if seen >= box_width then pos
+      else go (pos + 1) (seen + 1)
+    in
+    go 0 0
+  in
+  let pad s =
+    let width = char_count s in
+    if width >= box_width then String.prefix s (cut_offset s)
+    else s ^ String.make (box_width - width) ' '
+  in
+  let border = "+" ^ String.make box_width '-' ^ "+" in
+  let make_row label content = Printf.sprintf "|%s|" (pad (label ^ content)) in
+  (* Unstressed syllables get no row at all rather than a blank line. *)
+  let stress_row = if t.stressed then [ make_row "* " "stressed" ] else [] in
+  String.concat ~sep:"\n"
+    ([
+       border;
+       make_row "σ: " (all t);
+       make_row "O: " t.onset;
+       make_row "M: " t.medial;
+       make_row "N: " t.nucleus;
+       make_row "C: " t.coda;
+       make_row "T: " t.tone;
+     ]
+     @ stress_row
+     @ [ border ])
+
+(** Debug rendering in record-like syntax, with the string components
+    escaped as OCaml literals.  [spelling] and the indexes are omitted. *)
+let to_string { onset; medial; nucleus; coda; tone; stressed; _ } =
+  Printf.sprintf
+    "{onset=%S; medial=%S; nucleus=%S; coda=%S; tone=%S; stressed=%b}"
+    onset medial nucleus coda tone stressed
+
+(** Return [t] with [s] added to the end of its onset *)
+let append_onset t s =
+  let onset = String.concat [ t.onset; s ] in
+  { t with onset }
+
+(** Return [t] with [s] added to the end of its nucleus *)
+let append_nucleus t s =
+  let nucleus = String.concat [ t.nucleus; s ] in
+  { t with nucleus }
+
+(** Return [t] with [s] added to the end of its coda *)
+let append_coda t s =
+  let coda = String.concat [ t.coda; s ] in
+  { t with coda }
+
+(** Return [t] with [s] added to the end of its tone *)
+let append_tone t s =
+  let tone = String.concat [ t.tone; s ] in
+  { t with tone }
\ No newline at end of file
diff --git a/sorsyl/lib/syllable.mli b/sorsyl/lib/syllable.mli
new file mode 100644
index 0000000..e16364f
--- /dev/null
+++ b/sorsyl/lib/syllable.mli
@@ -0,0 +1,54 @@
+(** Module for representing syllables and their components *)
+
+(** Type representing a syllable with its phonological components.
+ The record is exposed concretely, so callers may read and build it
+ directly as well as through [create] and the [append_*] functions. *)
+type t = {
+ onset : string; (** Initial consonants *)
+ medial : string; (** Medial consonants (between onset and nucleus) *)
+ nucleus : string; (** Vowel core of the syllable *)
+ coda : string; (** Final consonants *)
+ tone : string; (** Tonal information *)
+ spelling : string; (** Orthographic representation *)
+ start_idx : int; (** Start position in the word *)
+ end_idx : int; (** End position in the word; set by [finalize] *)
+ stressed : bool; (** Whether this syllable is stressed; set by [finalize] *)
+}
+
+(** Empty syllable constant: all strings empty, indexes [0], unstressed *)
+val empty : t
+
+(** Create a syllable with specified components.
+ Omitted strings default to [""], omitted indexes to [0], and [stressed]
+ to [false]. *)
+val create :
+ ?onset:string ->
+ ?medial:string ->
+ ?nucleus:string ->
+ ?coda:string ->
+ ?tone:string ->
+ ?spelling:string ->
+ ?start_idx:int ->
+ ?end_idx:int ->
+ ?stressed:bool ->
+ unit -> t
+
+(** Get the complete syllable string
+ (onset + medial + nucleus + coda + tone) *)
+val all : t -> string
+
+(** Get the rhyme (medial + nucleus + coda) *)
+val rhyme : t -> string
+
+(** Check if the nucleus contains the length marker [ː] *)
+val is_long : t -> bool
+
+(** Finalize a syllable: store [end_idx] and recompute [stressed] from where
+ [stress_idx] falls relative to the syllable's range. A negative
+ [stress_idx] means there is no stress mark, so the result is unstressed. *)
+val finalize : t -> end_idx:int -> stress_idx:int -> t
+
+(** Pretty print a syllable as a multi-line text box drawn with the ASCII
+ characters [+], [-] and [|], one row per component *)
+val pretty_print : t -> string
+
+(** Convert to a single-line, record-like string representation
+ ([spelling] and the indexes are not included) *)
+val to_string : t -> string
+
+(** Append operations: each returns a copy of the syllable with the given
+ string concatenated onto the end of the named component *)
+val append_onset : t -> string -> t
+val append_nucleus : t -> string -> t
+val append_coda : t -> string -> t
+val append_tone : t -> string -> t \ No newline at end of file
diff --git a/sorsyl/test/dune b/sorsyl/test/dune
index e79f200..98e22a7 100644
--- a/sorsyl/test/dune
+++ b/sorsyl/test/dune
@@ -9,3 +9,7 @@
(test
(name test_table)
(libraries sorsyl))
+
+; Smoke test for the syllabifier: prints the syllabification of a few
+; sample words; it makes no assertions of its own.
+(test
+ (name test_syllabifier)
+ (libraries sorsyl))
diff --git a/sorsyl/test/test_syllabifier.ml b/sorsyl/test/test_syllabifier.ml
new file mode 100644
index 0000000..08f3d0d
--- /dev/null
+++ b/sorsyl/test/test_syllabifier.ml
@@ -0,0 +1,49 @@
+open Printf
+open Sorsyl
+
+(* Smoke test: syllabify a handful of sample words and print the outcome.
+   Nothing is asserted; the run fails only if the data directory is missing
+   or the library raises. *)
+let test_syllabifier () =
+  (* The data directory is searched for relative to the working directory,
+     nearest candidate first. *)
+  let data_dir =
+    match
+      List.find_opt Sys.file_exists [ "./data"; "../data"; "../../../data" ]
+    with
+    | Some dir -> dir
+    | None ->
+      (Printf.eprintf "Current directory: %s\n" (Sys.getcwd ());
+       failwith "Cannot find data directory")
+  in
+  let sonority = Sonority.create data_dir in
+
+  (* Print every component of one syllable, numbered from 1 *)
+  let describe i (syl : Syllable.t) =
+    printf "\nSyllable %d:\n" (i + 1);
+    printf " All: %s\n" (Syllable.all syl);
+    printf " Onset: %s\n" syl.onset;
+    printf " Nucleus: %s\n" syl.nucleus;
+    printf " Coda: %s\n" syl.coda;
+    printf " Tone: %s\n" syl.tone;
+    printf " Stressed: %b\n" syl.stressed
+  in
+
+  (* Syllabify one word and dump the result *)
+  let test_word word ipa lang =
+    printf "\nTesting: %s [%s] (%s)\n" word ipa lang;
+    let result = Syllabifier.syllabify ~sonority ~ipa ~word ~lang in
+    printf "Clean IPA: %s\n" result.clean_ipa;
+    printf "Syllables: %d\n" (List.length result.syllables);
+    Base.List.iteri result.syllables ~f:describe
+  in
+
+  (* English samples first, then one tonal sample (if tones are supported) *)
+  let samples =
+    [
+      ("hello", "hɛˈloʊ", "en");
+      ("computer", "kəmˈpjuːtər", "en");
+      ("language", "ˈlæŋɡwɪdʒ", "en");
+      ("syllable", "ˈsɪləbəl", "en");
+      ("ma", "ma˧", "zh");
+    ]
+  in
+  List.iter (fun (word, ipa, lang) -> test_word word ipa lang) samples;
+
+  (* Pretty-printing demo *)
+  printf "\n\nPretty print example:\n";
+  let result =
+    Syllabifier.syllabify ~sonority ~ipa:"ˈsɪləbəl" ~word:"syllable" ~lang:"en"
+  in
+  Base.List.iter result.syllables ~f:(fun syl ->
+    printf "%s\n" (Syllable.pretty_print syl))
+
+let () = test_syllabifier ()