diff options
Diffstat (limited to 'sorsyl/lib/syllabifier.ml')
-rw-r--r-- | sorsyl/lib/syllabifier.ml | 187 |
1 files changed, 187 insertions, 0 deletions
(** Syllabifier module for segmenting words into syllables using sonority *)

open Base

(** Result of syllabification *)
type syllabified = {
  word : string;  (** The [word] argument of [syllabify], passed through. *)
  ipa : string;  (** The [ipa] argument of [syllabify], passed through. *)
  lang : string;  (** The [lang] argument of [syllabify], passed through. *)
  clean_ipa : string;
      (** The segments produced by [Ipa_table.ipa_segs], concatenated with no
          separator. *)
  syllables : Syllable.t list;  (** Finalized syllables, in word order. *)
}

(** State during syllabification *)
type state = {
  sonority : Sonority.t;  (** Sonority model used to rank segments. *)
  ipa_table : Ipa_table.t;
      (** Table obtained from [Sonority.get_ipa_table sonority]. *)
  segments : string list;  (** All segments of the word being processed. *)
  stress_idx : int;
      (** Position of the first stress mark in the raw IPA string, or [-1]
          when the string has none. *)
  syllables : Syllable.t list;
      (** Syllables finalized so far, most recent first. *)
  current_syllable : Syllable.t;  (** Syllable currently being filled. *)
}

(** [is_tone ipa_table segment] is [true] when [segment] is a tone marker.

    A segment known to the table counts as a tone unless both its [HighTone]
    and [HighReg] features are [Zero]. The mid-level tone letter is accepted
    explicitly (presumably because it is zero on both features; confirm
    against the feature table). Segments unknown to the table are never
    tones. *)
let is_tone ipa_table segment =
  match Ipa_table.fts ipa_table segment with
  | None -> false
  | Some features ->
    let toneless =
      Feature.has_feature (Feature.HighTone, Feature.Zero) features
      && Feature.has_feature (Feature.HighReg, Feature.Zero) features
    in
    (not toneless) || String.equal segment "˧"

(** [is_nucleus ipa_table segment] is [true] when [segment] is known to the
    table and carries [Syllabic = Plus]. *)
let is_nucleus ipa_table segment =
  match Ipa_table.fts ipa_table segment with
  | None -> false
  | Some features ->
    Feature.has_feature (Feature.Syllabic, Feature.Plus) features

(** Check if a segment is a vowel. Alias of [is_nucleus]. *)
let is_vowel = is_nucleus

(** [has_vowel_remaining ipa_table segments idx] is [true] when some segment
    at position [idx] or later is a vowel. The segment at [idx] itself is
    included in the search.

    Runs in a single pass over the tail of [segments]. A non-positive [idx]
    searches the whole list. *)
let has_vowel_remaining ipa_table segments idx =
  List.drop segments idx |> List.exists ~f:(is_vowel ipa_table)

(** [get_nucleus_sonority sonority nucleus] returns the sonority of a
    syllable nucleus.

    The nucleus is first looked up as a whole. When that lookup fails (for
    example for a diphthong built by appending several vowel segments), the
    nucleus is split back into IPA segments and the sonority of the last
    segment that can be looked up is returned. The result is [0] when no
    segment can be looked up.

    The split uses [Ipa_table.ipa_segs] rather than iterating over the bytes
    of the string: most IPA symbols are multi-byte in UTF-8, so a byte-wise
    walk would never find them. *)
let get_nucleus_sonority sonority nucleus =
  (* The exception raised by [Sonority.sonority] for an unknown segment is
     not visible from this file, so any failure is treated as "unknown". *)
  let lookup seg =
    Option.try_with (fun () -> Sonority.sonority sonority seg)
  in
  match lookup nucleus with
  | Some value -> value
  | None ->
    Ipa_table.ipa_segs (Sonority.get_ipa_table sonority) nucleus
    |> List.fold ~init:0 ~f:(fun acc seg ->
         Option.value (lookup seg) ~default:acc)

(** [next_has_features ipa_table segments idx features] is [true] when the
    segment at position [idx + 1] exists, is known to the table, and matches
    every [(feature, value)] pair in [features]. It is [false] when there is
    no following segment or the table does not know it. *)
let next_has_features ipa_table segments idx features =
  match List.nth segments (idx + 1) with
  | None -> false
  | Some next ->
    (match Ipa_table.fts ipa_table next with
     | None -> false
     | Some seg_features ->
       List.for_all features ~f:(fun (feat, value) ->
         Feature.has_feature (feat, value) seg_features))

(** [new_syllable state syllable idx] closes the syllable being built and
    starts a new one.

    The current syllable is finalized with [end_idx = idx] and pushed onto
    [state.syllables]; [syllable], with its [start_idx] set to [idx], becomes
    the new current syllable. *)
let new_syllable state syllable idx =
  let finalized =
    Syllable.finalize state.current_syllable ~end_idx:idx
      ~stress_idx:state.stress_idx
  in
  { state with
    syllables = finalized :: state.syllables;
    current_syllable = { syllable with start_idx = idx };
  }

(** [process_segment state segment idx] folds one segment into [state].

    [idx] is the position of [segment] within [state.segments]. Decisions, in
    order:
    - a tone marker is appended to the tones of the current syllable;
    - a vowel becomes the nucleus if there is none yet; otherwise it extends
      the nucleus as a diphthong when its sonority is strictly lower than the
      sonority of the nucleus, and opens a new syllable when it is not;
    - a consonant before any nucleus goes to the onset;
    - a consonant after the nucleus goes to the coda when it is the last
      segment, opens a new syllable when the next segment is syllabic, goes
      to the coda when no vowel remains or the next segment is not a
      non-nasal sonorant, and opens a new syllable otherwise. *)
let process_segment state segment idx =
  let table = state.ipa_table.table in
  let current : Syllable.t = state.current_syllable in
  let with_current current_syllable = { state with current_syllable } in
  let start_with syllable = new_syllable state syllable idx in
  let next_has = next_has_features table state.segments idx in
  if is_tone table segment then
    with_current (Syllable.append_tone current segment)
  else if is_nucleus table segment then (
    if String.is_empty current.nucleus then
      (* First vowel in syllable *)
      with_current { current with nucleus = segment }
    else
      (* Already have a nucleus - check for diphthong *)
      let nucleus_sonority =
        get_nucleus_sonority state.sonority current.nucleus
      in
      let segment_sonority =
        Option.try_with (fun () -> Sonority.sonority state.sonority segment)
        |> Option.value ~default:0
      in
      if nucleus_sonority > segment_sonority then
        (* Decreasing sonority - add to current nucleus as diphthong *)
        with_current (Syllable.append_nucleus current segment)
      else start_with (Syllable.create ~nucleus:segment ()))
  else if String.is_empty current.nucleus then
    (* Consonant with no nucleus yet - add to onset *)
    with_current (Syllable.append_onset current segment)
  else if idx = List.length state.segments - 1 then
    (* Last segment - add to coda *)
    with_current (Syllable.append_coda current segment)
  else if next_has [ (Feature.Syllabic, Feature.Plus) ] then
    (* Next is vowel - this consonant is the onset of a new syllable *)
    start_with (Syllable.create ~onset:segment ())
  else if
    (* The remaining-vowel scan is only needed on this path, so it is
       evaluated here rather than once per segment. *)
    (not (has_vowel_remaining table state.segments idx))
    || not
         (next_has
            [ (Feature.Sonorant, Feature.Plus); (Feature.Nasal, Feature.Minus) ])
  then with_current (Syllable.append_coda current segment)
  else start_with (Syllable.create ~onset:segment ())

(** [syllabify ~sonority ~ipa ~word ~lang] splits the IPA transcription [ipa]
    into syllables.

    @param sonority sonority model; its IPA table is used for segmentation
    and feature lookups
    @param ipa IPA transcription to syllabify
    @param word orthographic form, copied to the result unchanged
    @param lang language tag, copied to the result unchanged
    @return a record holding the inputs, the concatenated segments
    ([clean_ipa]) and the syllables in word order. The syllable still being
    built at the end of the input is always finalized and included, so the
    list has at least one element even when there are no segments. *)
let syllabify ~sonority ~ipa ~word ~lang =
  let ipa_table = Sonority.get_ipa_table sonority in
  (* Position of the first primary stress mark, or -1 when there is none.
     NOTE(review): [String.substr_index] yields a byte offset into [ipa],
     whereas the start/end indices handed to [Syllable.finalize] are segment
     positions. Confirm that [Syllable.finalize] expects this unit. *)
  let stress_idx =
    String.substr_index ipa ~pattern:"ˈ" |> Option.value ~default:(-1)
  in
  (* Tone normalization is not implemented yet; the transcription is
     segmented as-is. *)
  let segments = Ipa_table.ipa_segs ipa_table ipa in
  let clean_ipa = String.concat segments ~sep:"" in
  let init_state =
    {
      sonority;
      ipa_table;
      segments;
      stress_idx;
      syllables = [];
      current_syllable = Syllable.empty;
    }
  in
  let final_state =
    List.foldi segments ~init:init_state ~f:(fun idx state segment ->
      process_segment state segment idx)
  in
  (* The syllable under construction when the input runs out has not been
     pushed yet; close it at the end of the segment list. *)
  let last_syl =
    Syllable.finalize final_state.current_syllable
      ~end_idx:(List.length segments)
      ~stress_idx:final_state.stress_idx
  in
  (* Syllables were accumulated most recent first. *)
  let all_syllables = List.rev (last_syl :: final_state.syllables) in
  { word; ipa; lang; clean_ipa; syllables = all_syllables }
\ No newline at end of file |