diff options
author | polwex <polwex@sortug.com> | 2025-06-22 13:46:57 +0700 |
---|---|---|
committer | polwex <polwex@sortug.com> | 2025-06-22 13:46:57 +0700 |
commit | 5dd49048bb65de3d572d43ba2f1b01435c71a35a (patch) | |
tree | 7f8e629ae511c3947a80f99906542f3fd2de0a9f | |
parent | 5f495c1d4ee624f9d24f03e50700e7d9a9305b73 (diff) |
-rw-r--r-- | sorsyl/lib/syllabifier.ml | 187 | ||||
-rw-r--r-- | sorsyl/lib/syllabifier.mli | 24 | ||||
-rw-r--r-- | sorsyl/lib/syllable.ml | 97 | ||||
-rw-r--r-- | sorsyl/lib/syllable.mli | 54 | ||||
-rw-r--r-- | sorsyl/test/dune | 4 | ||||
-rw-r--r-- | sorsyl/test/test_syllabifier.ml | 49 |
6 files changed, 415 insertions, 0 deletions
(* File: sorsyl/lib/syllabifier.ml *)

(** Syllabifier module for segmenting words into syllables using sonority *)

open Base

(** Result of syllabification *)
type syllabified = {
  word : string;
  ipa : string;
  lang : string;
  clean_ipa : string;
  syllables : Syllable.t list;
}

(** State threaded through the fold over the word's segments *)
type state = {
  sonority : Sonority.t;
  ipa_table : Ipa_table.t;
  segments : string list;  (** All segments of the word, in order *)
  segment_count : int;  (** [List.length segments], computed once *)
  stress_idx : int;  (** Segment index right after the stress marker, or [-1] *)
  syllables : Syllable.t list;  (** Closed syllables, most recent first *)
  current_syllable : Syllable.t;  (** Syllable still being built *)
}

(** [is_tone ipa_table segment] is [true] when [segment] has features in the
    table and either is not marked [HighTone = Zero] and [HighReg = Zero], or
    is the literal mid-tone letter ["˧"]. Segments without features are never
    tones. *)
let is_tone ipa_table segment =
  match Ipa_table.fts ipa_table segment with
  | None -> false
  | Some features ->
    let toneless =
      Feature.has_feature (Feature.HighTone, Feature.Zero) features
      && Feature.has_feature (Feature.HighReg, Feature.Zero) features
    in
    (not toneless) || String.equal segment "˧"

(** [is_nucleus ipa_table segment] is [true] when [segment] has features in
    the table and is marked [Syllabic = Plus]. *)
let is_nucleus ipa_table segment =
  match Ipa_table.fts ipa_table segment with
  | None -> false
  | Some features -> Feature.has_feature (Feature.Syllabic, Feature.Plus) features

(** Check if a segment is a vowel (same test as {!is_nucleus}) *)
let is_vowel = is_nucleus

(** [has_vowel_remaining ipa_table segments idx] is [true] when some segment
    at position [idx] or later is a vowel. *)
let has_vowel_remaining ipa_table segments idx =
  List.drop segments idx |> List.exists ~f:(is_vowel ipa_table)

(** [sonority_opt sonority segment] is the sonority of [segment], or [None]
    when the lookup raises. Which exception [Sonority.sonority] raises is not
    visible from this module, so every exception is treated as "unknown
    segment", exactly like the [try ... with _] blocks this replaces. *)
let sonority_opt sonority segment =
  Option.try_with (fun () -> Sonority.sonority sonority segment)

(** [utf8_chars s] splits [s] into its UTF-8 encoded characters, each returned
    as its own string. A new character starts at every byte that is not a
    continuation byte ([10xxxxxx]). *)
let utf8_chars s =
  let is_continuation c = Char.to_int c land 0xC0 = 0x80 in
  let total = String.length s in
  let slice start stop = String.sub s ~pos:start ~len:(stop - start) in
  let rec scan start pos acc =
    if pos >= total then
      List.rev (if pos > start then slice start pos :: acc else acc)
    else if pos > start && not (is_continuation s.[pos]) then
      scan pos (pos + 1) (slice start pos :: acc)
    else scan start (pos + 1) acc
  in
  scan 0 0 []

(** [get_nucleus_sonority sonority nucleus] is the sonority of [nucleus] when
    it is known as a whole. Otherwise (multi-character nuclei) it is the
    sonority of the last character of [nucleus] that has one, or [0] when none
    does.

    The fallback walks UTF-8 characters rather than bytes: most IPA letters
    are multi-byte, and looking up their individual bytes can never succeed. *)
let get_nucleus_sonority sonority nucleus =
  match sonority_opt sonority nucleus with
  | Some value -> value
  | None ->
    utf8_chars nucleus
    |> List.fold ~init:0 ~f:(fun last ch ->
      Option.value (sonority_opt sonority ch) ~default:last)

(** [next_has_features ipa_table segments idx features] is [true] when the
    segment at [idx + 1] exists, has features in the table, and carries every
    [(feature, value)] pair of [features]. *)
let next_has_features ipa_table segments idx features =
  List.nth segments (idx + 1)
  |> Option.bind ~f:(Ipa_table.fts ipa_table)
  |> Option.value_map ~default:false ~f:(fun seg_features ->
    List.for_all features ~f:(fun (feat, value) ->
      Feature.has_feature (feat, value) seg_features))

(** [close_syllable syllable ~end_idx ~stress_idx] finalizes [syllable] and
    sets its [stressed] flag.

    Stress membership is decided here with a half-open range
    ([start_idx <= stress_idx < end_idx]) so that the boundary index shared by
    two neighbouring syllables can only ever stress the later one, regardless
    of how [Syllable.finalize] treats its [end_idx]. A negative [stress_idx]
    means "no stress marker". *)
let close_syllable syllable ~end_idx ~stress_idx =
  let closed = Syllable.finalize syllable ~end_idx ~stress_idx in
  let stressed =
    stress_idx >= 0
    && stress_idx >= syllable.Syllable.start_idx
    && stress_idx < end_idx
  in
  { closed with Syllable.stressed = stressed }

(** [new_syllable state syllable idx] closes the current syllable at [idx],
    pushes it onto the finished list, and makes [syllable] (starting at [idx])
    the current one. *)
let new_syllable state syllable idx =
  let finalized =
    close_syllable state.current_syllable ~end_idx:idx
      ~stress_idx:state.stress_idx
  in
  { state with
    syllables = finalized :: state.syllables;
    current_syllable = { syllable with Syllable.start_idx = idx };
  }

(** [process_segment state segment idx] folds one segment into [state].

    - Tones are appended to the current syllable's tone.
    - A vowel becomes the nucleus if there is none yet; otherwise it joins the
      nucleus as a diphthong when sonority is strictly falling, and opens a
      new syllable when it is not.
    - A consonant goes to the onset while there is no nucleus. After a
      nucleus it goes to the coda if it is the last segment, opens a new
      syllable if the next segment is syllabic, stays in the coda if no vowel
      follows or the next segment is not a non-nasal sonorant, and opens a new
      syllable otherwise. *)
let process_segment state segment idx =
  let table = state.ipa_table.Ipa_table.table in
  let current = state.current_syllable in
  let keep syllable = { state with current_syllable = syllable } in
  let next_is features = next_has_features table state.segments idx features in
  if is_tone table segment then keep (Syllable.append_tone current segment)
  else if is_nucleus table segment then
    if String.is_empty current.Syllable.nucleus then
      (* First vowel in syllable *)
      keep { current with Syllable.nucleus = segment }
    else begin
      (* Already have a nucleus - check for diphthong *)
      let nucleus_sonority =
        get_nucleus_sonority state.sonority current.Syllable.nucleus
      in
      let segment_sonority =
        Option.value (sonority_opt state.sonority segment) ~default:0
      in
      if nucleus_sonority > segment_sonority then
        keep (Syllable.append_nucleus current segment)
      else new_syllable state (Syllable.create ~nucleus:segment ()) idx
    end
  else if String.is_empty current.Syllable.nucleus then
    (* No nucleus yet - add to onset *)
    keep (Syllable.append_onset current segment)
  else if idx = state.segment_count - 1 then
    (* Last segment - add to coda *)
    keep (Syllable.append_coda current segment)
  else if next_is [ (Feature.Syllabic, Feature.Plus) ] then
    (* Next is vowel - this consonant is the onset of a new syllable *)
    new_syllable state (Syllable.create ~onset:segment ()) idx
  else if
    (not (has_vowel_remaining table state.segments idx))
    || not
         (next_is
            [ (Feature.Sonorant, Feature.Plus); (Feature.Nasal, Feature.Minus) ])
  then keep (Syllable.append_coda current segment)
  else new_syllable state (Syllable.create ~onset:segment ()) idx

(** [syllabify ~sonority ~ipa ~word ~lang] segments [ipa] with the IPA table
    obtained from [sonority] and groups the segments into syllables.

    [word], [ipa] and [lang] are copied into the result unchanged; [clean_ipa]
    is the concatenation of the segments. The syllable containing the first
    segment after the primary stress marker ["ˈ"] is flagged as stressed; no
    syllable is flagged when [ipa] has no marker. *)
let syllabify ~sonority ~ipa ~word ~lang =
  let ipa_table = Sonority.get_ipa_table sonority in
  (* Tone normalization is not implemented yet: the IPA is used as-is. *)
  let normalized = ipa in
  let segments = Ipa_table.ipa_segs ipa_table normalized in
  let clean_ipa = String.concat segments ~sep:"" in
  (* [String.substr_index] yields a byte offset into the raw string, while
     syllables are delimited by segment indices. Convert by counting the
     segments that precede the marker.
     NOTE(review): assumes segmenting the text before the marker gives the
     same segments as that stretch of the full string — confirm against
     [Ipa_table.ipa_segs]. *)
  let stress_idx =
    match String.substr_index ipa ~pattern:"ˈ" with
    | None -> -1
    | Some byte_pos ->
      List.length (Ipa_table.ipa_segs ipa_table (String.prefix ipa byte_pos))
  in
  let init_state =
    {
      sonority;
      ipa_table;
      segments;
      segment_count = List.length segments;
      stress_idx;
      syllables = [];
      current_syllable = Syllable.empty;
    }
  in
  let final_state =
    List.foldi segments ~init:init_state ~f:(fun idx state segment ->
      process_segment state segment idx)
  in
  (* The syllable still open after the fold ends at the end of the word. *)
  let last_syl =
    close_syllable final_state.current_syllable
      ~end_idx:final_state.segment_count ~stress_idx:final_state.stress_idx
  in
  let all_syllables = List.rev (last_syl :: final_state.syllables) in
  { word; ipa; lang; clean_ipa; syllables = all_syllables }
(* File: sorsyl/lib/syllabifier.mli *)

(** Syllabifier module for segmenting words into syllables using sonority *)

(** Result of syllabification *)
type syllabified = {
  word : string;  (** Original orthographic word, as passed to {!syllabify} *)
  ipa : string;  (** Original IPA transcription, as passed to {!syllabify} *)
  lang : string;  (** Language code, as passed to {!syllabify} *)
  clean_ipa : string;
      (** Concatenation of the segments the IPA table extracted from [ipa] *)
  syllables : Syllable.t list;  (** Syllables in word order *)
}

(** Syllabify a word given its IPA transcription.

    The transcription is split into segments using the IPA table obtained from
    [sonority]; the segments are then grouped into syllables. A primary stress
    marker ["ˈ"] in [ipa], when present, determines which syllable is flagged
    as stressed.

    @param sonority The sonority calculator
    @param ipa The IPA transcription of the word
    @param word The orthographic form of the word
    @param lang The language code
    @return The syllabified result
*)
val syllabify :
  sonority:Sonority.t ->
  ipa:string ->
  word:string ->
  lang:string ->
  syllabified
(* File: sorsyl/lib/syllable.ml *)

(** Module for representing syllables and their components *)

open Base

(** Type representing a syllable with its phonological components *)
type t = {
  onset : string;  (** Initial consonants *)
  medial : string;  (** Medial consonants (between onset and nucleus) *)
  nucleus : string;  (** Vowel core of the syllable *)
  coda : string;  (** Final consonants *)
  tone : string;  (** Tonal information *)
  spelling : string;  (** Orthographic representation *)
  start_idx : int;  (** Start position in the word *)
  end_idx : int;  (** End position in the word *)
  stressed : bool;  (** Whether this syllable is stressed *)
}

(** Syllable with every component empty, both indices [0], and no stress *)
let empty =
  {
    onset = "";
    medial = "";
    nucleus = "";
    coda = "";
    tone = "";
    spelling = "";
    start_idx = 0;
    end_idx = 0;
    stressed = false;
  }

(** Create a syllable with specified components; omitted ones default to the
    values found in {!empty}. *)
let create ?(onset = "") ?(medial = "") ?(nucleus = "") ?(coda = "")
    ?(tone = "") ?(spelling = "") ?(start_idx = 0) ?(end_idx = 0)
    ?(stressed = false) () =
  { onset; medial; nucleus; coda; tone; spelling; start_idx; end_idx; stressed }

(** Get the complete syllable string (onset, medial, nucleus, coda, tone) *)
let all t = t.onset ^ t.medial ^ t.nucleus ^ t.coda ^ t.tone

(** Get the rhyme (medial + nucleus + coda) *)
let rhyme t = t.medial ^ t.nucleus ^ t.coda

(** Check if the nucleus contains the length marker ["ː"] *)
let is_long t = String.is_substring t.nucleus ~substring:"ː"

(** [finalize t ~end_idx ~stress_idx] records [end_idx] and decides whether
    the syllable is stressed.

    The syllable is stressed when [stress_idx] is non-negative and lies in the
    half-open range [t.start_idx <= stress_idx < end_idx]. The upper bound is
    exclusive because a syllable's [end_idx] is also the [start_idx] of the
    syllable that follows it; an inclusive bound would flag both neighbours
    whenever the stress index sits on their shared boundary. *)
let finalize t ~end_idx ~stress_idx =
  let stressed =
    stress_idx >= 0 && stress_idx >= t.start_idx && stress_idx < end_idx
  in
  { t with end_idx; stressed }

(* A byte begins a UTF-8 character unless it is a continuation byte
   (10xxxxxx). *)
let starts_utf8_char c = Char.to_int c land 0xC0 <> 0x80

(* Number of UTF-8 characters in [s]. *)
let utf8_length s = String.count s ~f:starts_utf8_char

(* Longest prefix of [s] made of at most [n] whole UTF-8 characters. *)
let utf8_prefix s n =
  let total = String.length s in
  let rec cut pos seen =
    if pos >= total then s
    else if starts_utf8_char s.[pos] then
      if seen = n then String.prefix s pos else cut (pos + 1) (seen + 1)
    else cut (pos + 1) seen
  in
  cut 0 0

(** Pretty print a syllable as an ASCII box, one labelled row per component.

    Rows are padded or truncated to a fixed width counted in UTF-8 characters
    rather than bytes, so IPA letters neither skew the right border nor get
    cut in half. Combining marks still count as one character each, so rows
    containing them can look narrower on screen. The "stressed" row is present
    only for stressed syllables. *)
let pretty_print t =
  let box_width = 10 in
  let pad s =
    let len = utf8_length s in
    if len >= box_width then utf8_prefix s box_width
    else s ^ String.make (box_width - len) ' '
  in
  let border = "+" ^ String.make box_width '-' ^ "+" in
  let make_row label content = Printf.sprintf "|%s|" (pad (label ^ content)) in
  (* Built as a list so that an unstressed syllable adds no row at all,
     instead of an empty string that would show up as a blank line. *)
  let stress_row = if t.stressed then [ make_row "* " "stressed" ] else [] in
  String.concat ~sep:"\n"
    ([
       border;
       make_row "σ: " (all t);
       make_row "O: " t.onset;
       make_row "M: " t.medial;
       make_row "N: " t.nucleus;
       make_row "C: " t.coda;
       make_row "T: " t.tone;
     ]
     @ stress_row
     @ [ border ])

(** Convert to string representation (indices and spelling are not included) *)
let to_string t =
  Printf.sprintf
    "{onset=%S; medial=%S; nucleus=%S; coda=%S; tone=%S; stressed=%b}" t.onset
    t.medial t.nucleus t.coda t.tone t.stressed

(** Append to onset *)
let append_onset t s = { t with onset = t.onset ^ s }

(** Append to nucleus *)
let append_nucleus t s = { t with nucleus = t.nucleus ^ s }

(** Append to coda *)
let append_coda t s = { t with coda = t.coda ^ s }

(** Append to tone *)
let append_tone t s = { t with tone = t.tone ^ s }
(* File: sorsyl/lib/syllable.mli *)

(** Module for representing syllables and their components *)

(** Type representing a syllable with its phonological components *)
type t = {
  onset : string;  (** Initial consonants *)
  medial : string;  (** Medial consonants (between onset and nucleus) *)
  nucleus : string;  (** Vowel core of the syllable *)
  coda : string;  (** Final consonants *)
  tone : string;  (** Tonal information *)
  spelling : string;  (** Orthographic representation *)
  start_idx : int;  (** Start position in the word *)
  end_idx : int;  (** End position in the word *)
  stressed : bool;  (** Whether this syllable is stressed *)
}

(** Empty syllable constant: all components are [""], both indices are [0]
    and [stressed] is [false] *)
val empty : t

(** Create a syllable with specified components. Omitted string components
    default to [""], omitted indices to [0], and [stressed] to [false]. *)
val create :
  ?onset:string ->
  ?medial:string ->
  ?nucleus:string ->
  ?coda:string ->
  ?tone:string ->
  ?spelling:string ->
  ?start_idx:int ->
  ?end_idx:int ->
  ?stressed:bool ->
  unit -> t

(** Get the complete syllable string: onset, medial, nucleus, coda and tone
    concatenated in that order *)
val all : t -> string

(** Get the rhyme (medial + nucleus + coda) *)
val rhyme : t -> string

(** Check if the nucleus contains the length marker ["ː"] *)
val is_long : t -> bool

(** [finalize t ~end_idx ~stress_idx] returns [t] with [end_idx] recorded and
    [stressed] recomputed: the syllable is stressed when [stress_idx] is
    non-negative and falls within the span from [t.start_idx] to [end_idx].
    A negative [stress_idx] never stresses the syllable. *)
val finalize : t -> end_idx:int -> stress_idx:int -> t

(** Pretty print a syllable as a multi-line ASCII box (drawn with [+], [-]
    and [|]) holding one labelled row per component *)
val pretty_print : t -> string

(** Convert to a single-line string listing onset, medial, nucleus, coda,
    tone and stress *)
val to_string : t -> string

(** Append operations: each returns a copy of the syllable with the given
    string appended to the named component *)
val append_onset : t -> string -> t
val append_nucleus : t -> string -> t
val append_coda : t -> string -> t
val append_tone : t -> string -> t
(* File: sorsyl/test/dune — stanzas visible in this patch (the second one is
   the addition that registers the test below):

   (test
    (name test_table)
    (libraries sorsyl))

   (test
    (name test_syllabifier)
    (libraries sorsyl))
*)

(* File: sorsyl/test/test_syllabifier.ml *)

open Printf
open Sorsyl

(* Candidate locations of the data directory, tried in this order. *)
let data_dir_candidates = [ "./data"; "../data"; "../../../data" ]

(* First candidate that exists on disk; reports the working directory and
   fails when none does. *)
let locate_data_dir () =
  match List.find_opt Sys.file_exists data_dir_candidates with
  | Some dir -> dir
  | None ->
    Printf.eprintf "Current directory: %s\n" (Sys.getcwd ());
    failwith "Cannot find data directory"

(* Print the components of the [position]-th (0-based) syllable. *)
let report_syllable position (syl : Syllable.t) =
  printf "\nSyllable %d:\n" (position + 1);
  printf " All: %s\n" (Syllable.all syl);
  printf " Onset: %s\n" syl.onset;
  printf " Nucleus: %s\n" syl.nucleus;
  printf " Coda: %s\n" syl.coda;
  printf " Tone: %s\n" syl.tone;
  printf " Stressed: %b\n" syl.stressed

(* Smoke test: syllabifies a handful of words and prints the outcome. Nothing
   is asserted; the run only fails if syllabification itself raises or the
   data directory is missing. *)
let test_syllabifier () =
  (* Create sonority calculator *)
  let sonority = Sonority.create (locate_data_dir ()) in

  (* Syllabify one word and dump every syllable *)
  let test_word word ipa lang =
    printf "\nTesting: %s [%s] (%s)\n" word ipa lang;
    let { Syllabifier.clean_ipa; syllables; _ } =
      Syllabifier.syllabify ~sonority ~ipa ~word ~lang
    in
    printf "Clean IPA: %s\n" clean_ipa;
    printf "Syllables: %d\n" (List.length syllables);
    Base.List.iteri syllables ~f:report_syllable
  in

  (* Test cases *)
  test_word "hello" "hɛˈloʊ" "en";
  test_word "computer" "kəmˈpjuːtər" "en";
  test_word "language" "ˈlæŋɡwɪdʒ" "en";
  test_word "syllable" "ˈsɪləbəl" "en";

  (* Test with tones (if supported) *)
  test_word "ma" "ma˧" "zh";

  (* Test pretty printing *)
  printf "\n\nPretty print example:\n";
  let { Syllabifier.syllables; _ } =
    Syllabifier.syllabify ~sonority ~ipa:"ˈsɪləbəl" ~word:"syllable" ~lang:"en"
  in
  Base.List.iter syllables ~f:(fun syl ->
    printf "%s\n" (Syllable.pretty_print syl))

let () = test_syllabifier ()