(** Syllabifier module for segmenting words into syllables using sonority *)

open Base

(** Result of syllabification *)
type syllabified = {
  word : string;  (** The [~word] argument, passed through unchanged. *)
  ipa : string;  (** The [~ipa] argument, passed through unchanged. *)
  lang : string;  (** The [~lang] argument, passed through unchanged. *)
  clean_ipa : string;
      (** Concatenation of the segments returned by [Ipa_table.ipa_segs]. *)
  syllables : Syllable.t list;  (** Finalized syllables, in word order. *)
}

(** State during syllabification *)
type state = {
  sonority : Sonority.t;
  ipa_table : Ipa_table.t;
  segments : string list;  (** All segments of the word being processed. *)
  stress_idx : int;
      (** Segment index of the primary stress, or [-1] when there is none. *)
  syllables : Syllable.t list;
      (** Finalized syllables, most recent first (reversed at the end). *)
  current_syllable : Syllable.t;  (** The syllable still being built. *)
}

(** Check if a segment is a tone marker.

    A segment known to the table counts as a tone unless it is zero-valued
    for both [HighTone] and [HighReg]; the mid-tone letter is always accepted.
    Segments unknown to the table are never tones. *)
let is_tone ipa_table segment =
  match Ipa_table.fts ipa_table segment with
  | None -> false
  | Some features ->
    let toneless =
      Feature.has_feature (Feature.HighTone, Feature.Zero) features
      && Feature.has_feature (Feature.HighReg, Feature.Zero) features
    in
    (not toneless) || String.equal segment "˧"

(** Check if a segment is a nucleus (syllabic), i.e. it is known to the table
    and carries [Syllabic = Plus]. *)
let is_nucleus ipa_table segment =
  match Ipa_table.fts ipa_table segment with
  | None -> false
  | Some features ->
    Feature.has_feature (Feature.Syllabic, Feature.Plus) features

(** Check if a segment is a vowel (alias of [is_nucleus]). *)
let is_vowel = is_nucleus

(** Check if there is a vowel among [segments] at position [idx] or later.

    Walks the list once instead of indexing it element by element. *)
let has_vowel_remaining ipa_table segments idx =
  List.drop segments idx |> List.exists ~f:(is_vowel ipa_table)

(** Split a UTF-8 string into one substring per code point.

    The width of each code point is read from its lead byte. Malformed input
    is tolerated: a stray continuation byte is emitted on its own and a
    truncated sequence is clipped to the end of the string. *)
let utf8_code_points s =
  let len = String.length s in
  let width_at pos =
    let lead = Char.to_int (String.get s pos) in
    let width =
      if lead < 0xC0 then 1 (* ASCII, or a stray continuation byte *)
      else if lead < 0xE0 then 2
      else if lead < 0xF0 then 3
      else 4
    in
    Int.min width (len - pos)
  in
  let rec split pos acc =
    if pos >= len then List.rev acc
    else
      let width = width_at pos in
      split (pos + width) (String.sub s ~pos ~len:width :: acc)
  in
  split 0 []

(** Sonority of [segment], or [None] when the lookup raises.

    NOTE(review): every exception is treated as an unknown segment because the
    exception raised by [Sonority.sonority] is not visible from this file;
    narrow this once it is known. *)
let sonority_opt sonority segment =
  Option.try_with (fun () -> Sonority.sonority sonority segment)

(** Get sonority value for a nucleus, handling multi-character nuclei.

    The whole nucleus is looked up first. If that fails, each code point is
    looked up in turn and the value of the last one that resolves is returned
    ([0] when none does). The nucleus is decoded as UTF-8 so that multi-byte
    IPA letters are looked up whole rather than byte by byte. *)
let get_nucleus_sonority sonority nucleus =
  match sonority_opt sonority nucleus with
  | Some value -> value
  | None ->
    List.fold (utf8_code_points nucleus) ~init:0 ~f:(fun acc code_point ->
      Option.value (sonority_opt sonority code_point) ~default:acc)

(** Check if the segment following position [idx] has all of [features].

    Returns [false] when there is no following segment or when it is unknown
    to the table. *)
let next_has_features ipa_table segments idx features =
  let next_features =
    Option.bind
      (List.nth segments (idx + 1))
      ~f:(fun next -> Ipa_table.fts ipa_table next)
  in
  match next_features with
  | None -> false
  | Some seg_features ->
    List.for_all features ~f:(fun (feat, value) ->
      Feature.has_feature (feat, value) seg_features)

(** Finalize the current syllable at [idx], push it onto the finished list and
    make [syllable] (with its start set to [idx]) the new current syllable. *)
let new_syllable state syllable idx =
  let finalized =
    Syllable.finalize state.current_syllable ~end_idx:idx
      ~stress_idx:state.stress_idx
  in
  {
    state with
    syllables = finalized :: state.syllables;
    current_syllable = { syllable with start_idx = idx };
  }

(** Process a single segment at position [idx] and return the updated state.

    Decision order:
    - a tone is attached to the current syllable;
    - a vowel becomes the nucleus, extends it as a diphthong when its sonority
      is strictly lower than that of the existing nucleus, or otherwise opens
      a new syllable;
    - a consonant goes to the onset while there is no nucleus yet, to the coda
      when it is the last segment, opens a new syllable when a vowel follows,
      and otherwise is a coda unless more vowels remain and the next segment
      is a non-nasal sonorant.

    The positional checks are only evaluated in the consonant branches that
    need them, so tones and vowels do not pay for list scans. *)
let process_segment state segment idx =
  let table = state.ipa_table.table in
  let current = state.current_syllable in
  let with_current syllable = { state with current_syllable = syllable } in
  let open_with_onset () =
    new_syllable state (Syllable.create ~onset:segment ()) idx
  in
  if is_tone table segment then
    with_current (Syllable.append_tone current segment)
  else if is_nucleus table segment then (
    if String.is_empty current.nucleus then
      (* First vowel in syllable *)
      with_current { current with nucleus = segment }
    else
      (* Already have a nucleus - check for diphthong *)
      let nucleus_sonority =
        get_nucleus_sonority state.sonority current.nucleus
      in
      let segment_sonority =
        Option.value (sonority_opt state.sonority segment) ~default:0
      in
      if nucleus_sonority > segment_sonority then
        (* Decreasing sonority - add to current nucleus as diphthong *)
        with_current (Syllable.append_nucleus current segment)
      else new_syllable state (Syllable.create ~nucleus:segment ()) idx)
  else if String.is_empty current.nucleus then
    (* No nucleus yet - add to onset *)
    with_current (Syllable.append_onset current segment)
  else if idx = List.length state.segments - 1 then
    (* Last segment - add to coda *)
    with_current (Syllable.append_coda current segment)
  else if
    next_has_features table state.segments idx
      [ (Feature.Syllabic, Feature.Plus) ]
  then
    (* Next is vowel - start new syllable *)
    open_with_onset ()
  else if
    (not (has_vowel_remaining table state.segments idx))
    || not
         (next_has_features table state.segments idx
            [
              (Feature.Sonorant, Feature.Plus); (Feature.Nasal, Feature.Minus);
            ])
  then
    (* No vowel left, or next is not a non-nasal sonorant - add to coda *)
    with_current (Syllable.append_coda current segment)
  else
    (* Start new syllable *)
    open_with_onset ()

(** Syllabify a word given its IPA transcription.

    @param sonority sonority scale; also supplies the IPA table
    @param ipa IPA transcription of the word
    @param word orthographic form, copied into the result
    @param lang language tag, copied into the result
    @return the inputs together with the cleaned IPA and the syllable list *)
let syllabify ~sonority ~ipa ~word ~lang =
  let ipa_table = Sonority.get_ipa_table sonority in
  (* Find stress marker position. Syllable boundaries are segment indices, so
     the stress position is expressed the same way: the number of segments
     that precede the primary stress mark. A raw byte offset would drift as
     soon as a multi-byte IPA character comes before the mark.
     NOTE(review): assumes [Syllable.finalize] compares [stress_idx] against
     those segment indices - confirm against its implementation. *)
  let stress_idx =
    match String.substr_index ipa ~pattern:"ˈ" with
    | None -> -1
    | Some byte_pos ->
      String.prefix ipa byte_pos
      |> Ipa_table.ipa_segs ipa_table
      |> List.length
  in
  (* Normalize tones and segment the IPA *)
  let normalized =
    (* For now, just use the IPA as-is since we don't have tone normalization yet *)
    ipa
  in
  let segments = Ipa_table.ipa_segs ipa_table normalized in
  let clean_ipa = String.concat segments ~sep:"" in
  (* Initial state *)
  let init_state =
    {
      sonority;
      ipa_table;
      segments;
      stress_idx;
      syllables = [];
      current_syllable = Syllable.empty;
    }
  in
  (* Process each segment *)
  let final_state =
    List.foldi segments ~init:init_state ~f:(fun idx state segment ->
      process_segment state segment idx)
  in
  (* Finalize last syllable *)
  let last_syl =
    Syllable.finalize final_state.current_syllable
      ~end_idx:(List.length segments) ~stress_idx:final_state.stress_idx
  in
  (* Finished syllables were accumulated most recent first. *)
  let all_syllables = List.rev (last_syl :: final_state.syllables) in
  { word; ipa; lang; clean_ipa; syllables = all_syllables }