summaryrefslogtreecommitdiff
path: root/sorsyl/lib/syllabifier.ml
diff options
context:
space:
mode:
Diffstat (limited to 'sorsyl/lib/syllabifier.ml')
-rw-r--r--sorsyl/lib/syllabifier.ml187
1 files changed, 187 insertions, 0 deletions
diff --git a/sorsyl/lib/syllabifier.ml b/sorsyl/lib/syllabifier.ml
new file mode 100644
index 0000000..af4ce17
--- /dev/null
+++ b/sorsyl/lib/syllabifier.ml
@@ -0,0 +1,187 @@
+(** Syllabifier module for segmenting words into syllables using sonority *)
+
+open Base
+
+(** Result of syllabification, as built by [syllabify]. *)
+type syllabified = {
+ word : string; (** the [~word] argument of [syllabify], stored unchanged *)
+ ipa : string; (** the [~ipa] argument of [syllabify], stored unchanged *)
+ lang : string; (** the [~lang] argument of [syllabify], stored unchanged *)
+ clean_ipa : string; (** concatenation, with no separator, of the segments [Ipa_table.ipa_segs] extracted from [ipa] *)
+ syllables : Syllable.t list; (** finalized syllables, in the order they were produced (first syllable first) *)
+}
+
+(** State threaded through [process_segment] while folding over the segments
+    of one word. *)
+type state = {
+ sonority : Sonority.t; (** sonority data queried through [Sonority.sonority] *)
+ ipa_table : Ipa_table.t; (** table obtained from [Sonority.get_ipa_table] *)
+ segments : string list; (** every segment of the word being processed *)
+ stress_idx : int; (** position of "ˈ" in the raw IPA string, or [-1] when there is none *)
+ syllables : Syllable.t list; (** syllables finalized so far, most recent first *)
+ current_syllable : Syllable.t; (** syllable still being built *)
+}
+
+(** [is_tone ipa_table segment] tells whether [segment] is a tone marker.
+
+    A segment with no entry in [ipa_table] is never a tone. A segment with an
+    entry is a tone unless its features carry both [(HighTone, Zero)] and
+    [(HighReg, Zero)]; the letter "˧" is accepted even in that case. *)
+let is_tone ipa_table segment =
+  let is_zero feature fts = Feature.has_feature (feature, Feature.Zero) fts in
+  match Ipa_table.fts ipa_table segment with
+  | None -> false
+  | Some fts ->
+    (not (is_zero Feature.HighTone fts))
+    || (not (is_zero Feature.HighReg fts))
+    || String.equal segment "˧"
+
+(** [is_nucleus ipa_table segment] is [true] when [segment] has an entry in
+    [ipa_table] whose features include [(Syllabic, Plus)], and [false]
+    otherwise (including when the segment is unknown to the table). *)
+let is_nucleus ipa_table segment =
+  Ipa_table.fts ipa_table segment
+  |> Option.exists ~f:(fun fts ->
+         Feature.has_feature (Feature.Syllabic, Feature.Plus) fts)
+
+(** [is_vowel ipa_table segment] is the same test as [is_nucleus]: vowels are
+    identified by the syllabic feature. *)
+let is_vowel ipa_table segment = is_nucleus ipa_table segment
+
+(** [has_vowel_remaining ipa_table segments idx] is [true] when at least one
+    segment at position [idx] or later satisfies [is_vowel].
+
+    The list is walked once: the first [idx] segments are skipped and the
+    rest scanned, instead of calling [List.length] and [List.nth_exn] on
+    every step. An [idx] at or past the end of the list yields [false]; a
+    non-positive [idx] scans the whole list. *)
+let has_vowel_remaining ipa_table segments idx =
+  List.drop segments idx |> List.exists ~f:(is_vowel ipa_table)
+
+(** [get_nucleus_sonority sonority nucleus] returns the sonority of [nucleus].
+
+    The whole string is looked up first. If that lookup raises (as happens
+    for a nucleus made of several segments), the nucleus is split back into
+    IPA segments and the sonority of the last segment that can be looked up
+    is returned, or [0] when none can.
+
+    The fallback splits with [Ipa_table.ipa_segs] rather than byte by byte:
+    OCaml strings are byte sequences and most IPA letters take several bytes
+    in UTF-8, so a per-byte lookup can never match them. *)
+let get_nucleus_sonority sonority nucleus =
+  (* NOTE(review): the catch-all handlers are kept from the original because
+     the exception raised by [Sonority.sonority] for an unknown segment is
+     not visible from this file; narrow them once it is confirmed. *)
+  try Sonority.sonority sonority nucleus
+  with _ ->
+    let ipa_table = Sonority.get_ipa_table sonority in
+    Ipa_table.ipa_segs ipa_table nucleus
+    |> List.fold ~init:0 ~f:(fun last seg ->
+           try Sonority.sonority sonority seg with _ -> last)
+
+(** [next_has_features ipa_table segments idx features] is [true] when the
+    segment at position [idx + 1] has an entry in [ipa_table] and that entry
+    satisfies every [(feature, value)] pair of [features].
+
+    It is [false] when there is no segment at [idx + 1] or when that segment
+    is unknown to the table. The lookup uses the option-returning
+    [List.nth], so the list is traversed once and no index can raise. *)
+let next_has_features ipa_table segments idx features =
+  match List.nth segments (idx + 1) with
+  | None -> false
+  | Some next -> (
+    match Ipa_table.fts ipa_table next with
+    | None -> false
+    | Some seg_features ->
+      List.for_all features ~f:(fun (feat, value) ->
+          Feature.has_feature (feat, value) seg_features))
+
+(** [new_syllable state syllable idx] closes the syllable in progress and
+    makes [syllable] the new one.
+
+    The current syllable is finalized with [~end_idx:idx] and the state's
+    [stress_idx], then pushed on the front of [state.syllables]; [syllable]
+    becomes the current syllable with its [start_idx] set to [idx]. *)
+let new_syllable state syllable idx =
+  let closed =
+    Syllable.finalize state.current_syllable ~end_idx:idx
+      ~stress_idx:state.stress_idx
+  in
+  let completed = closed :: state.syllables in
+  { state with
+    syllables = completed;
+    current_syllable = { syllable with start_idx = idx } }
+
+(** [process_segment state segment idx] folds the segment found at position
+    [idx] into [state] and returns the updated state.
+
+    - A tone is appended to the current syllable's tones.
+    - A nucleus (syllabic segment) becomes the nucleus of the current
+      syllable if it has none. Otherwise it extends the nucleus as a
+      diphthong when its sonority is strictly lower than that of the
+      existing nucleus, and opens a new syllable when it is not.
+    - Any other segment (a consonant) goes to the onset while the current
+      syllable has no nucleus. After a nucleus it goes to the coda when it is
+      the last segment, opens a new syllable when the next segment is
+      syllabic, goes to the coda when the next segment is not a non-nasal
+      sonorant or when no vowel remains, and opens a new syllable otherwise.
+
+    The "last segment" and "no vowel remains" tests each walk the segment
+    list, so they are evaluated only in the branch that needs them rather
+    than once for every segment. *)
+let process_segment state segment idx =
+  let table = state.ipa_table.table in
+  let current = state.current_syllable in
+  let with_current syllable = { state with current_syllable = syllable } in
+  let start_new syllable = new_syllable state syllable idx in
+  let next_has features =
+    next_has_features table state.segments idx features
+  in
+  if is_tone table segment then
+    with_current (Syllable.append_tone current segment)
+  else if is_nucleus table segment then begin
+    if String.is_empty current.nucleus then
+      (* First vowel of the syllable. *)
+      with_current { current with nucleus = segment }
+    else
+      let nucleus_sonority =
+        get_nucleus_sonority state.sonority current.nucleus
+      in
+      (* A segment whose sonority lookup raises is treated as sonority 0. *)
+      let segment_sonority =
+        try Sonority.sonority state.sonority segment with _ -> 0
+      in
+      if nucleus_sonority > segment_sonority then
+        (* Falling sonority: second half of a diphthong. *)
+        with_current (Syllable.append_nucleus current segment)
+      else start_new (Syllable.create ~nucleus:segment ())
+  end
+  else if String.is_empty current.nucleus then
+    with_current (Syllable.append_onset current segment)
+  else if idx = List.length state.segments - 1 then
+    (* Word-final consonant. *)
+    with_current (Syllable.append_coda current segment)
+  else if next_has [ (Feature.Syllabic, Feature.Plus) ] then
+    (* Consonant directly before a vowel: onset of the next syllable. *)
+    start_new (Syllable.create ~onset:segment ())
+  else if
+    (not
+       (next_has
+          [ (Feature.Sonorant, Feature.Plus); (Feature.Nasal, Feature.Minus) ]))
+    || not (has_vowel_remaining table state.segments idx)
+  then with_current (Syllable.append_coda current segment)
+  else start_new (Syllable.create ~onset:segment ())
+
+(** [syllabify ~sonority ~ipa ~word ~lang] splits the IPA transcription [ipa]
+    into syllables.
+
+    [ipa] is segmented with the IPA table attached to [sonority], every
+    segment is passed through [process_segment], and the syllable still open
+    at the end is finalized. [word], [ipa] and [lang] are copied into the
+    result unchanged; [clean_ipa] is the segments joined back together. *)
+let syllabify ~sonority ~ipa ~word ~lang =
+  let ipa_table = Sonority.get_ipa_table sonority in
+  (* Position of the primary-stress mark, or -1 when there is none.
+     NOTE(review): this is a byte offset into the raw [ipa] string, while the
+     [~end_idx] values given to [Syllable.finalize] are segment indices;
+     confirm that [Syllable.finalize] expects this. *)
+  let stress_idx =
+    String.substr_index ipa ~pattern:"ˈ" |> Option.value ~default:(-1)
+  in
+  (* Tone normalization is not implemented yet: [ipa] is segmented as is. *)
+  let segments = Ipa_table.ipa_segs ipa_table ipa in
+  let start =
+    { sonority;
+      ipa_table;
+      segments;
+      stress_idx;
+      syllables = [];
+      current_syllable = Syllable.empty }
+  in
+  let finished =
+    List.foldi segments ~init:start ~f:(fun idx acc segment ->
+        process_segment acc segment idx)
+  in
+  (* The syllable in progress after the last segment still has to be closed. *)
+  let trailing =
+    Syllable.finalize finished.current_syllable
+      ~end_idx:(List.length segments)
+      ~stress_idx:finished.stress_idx
+  in
+  { word;
+    ipa;
+    lang;
+    clean_ipa = String.concat segments ~sep:"";
+    (* [finished.syllables] is most-recent-first; restore reading order. *)
+    syllables = List.rev_append finished.syllables [ trailing ] }
\ No newline at end of file