blob: af4ce17da53a9b24cb1e215fc0bc785bae4675e9 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
|
(** Syllabifier module for segmenting words into syllables using sonority *)
open Base
(** Result of syllabification: the record built and returned by [syllabify]
    for a single word. *)
type syllabified = {
(* The [~word] argument given to [syllabify], stored unchanged. *)
word : string;
(* The [~ipa] transcription given to [syllabify], stored unchanged. *)
ipa : string;
(* The [~lang] argument given to [syllabify], stored unchanged. *)
lang : string;
(* Concatenation (no separator) of the segments that [Ipa_table.ipa_segs]
   extracted from the transcription. *)
clean_ipa : string;
(* Finalized syllables in the order they were produced while scanning the
   segments from first to last. *)
syllables : Syllable.t list;
}
(** State during syllabification: the accumulator threaded through the fold
    over segments in [syllabify] and updated by [process_segment]. *)
type state = {
(* Sonority scale queried with [Sonority.sonority] when comparing vowels. *)
sonority : Sonority.t;
(* IPA table; its [table] field is handed to the feature-lookup helpers. *)
ipa_table : Ipa_table.t;
(* Every segment of the word, kept so that the following segments can be
   inspected while processing the current one. *)
segments : string list;
(* Position of the stress mark, or [-1] when the transcription has none;
   forwarded to [Syllable.finalize]. *)
stress_idx : int;
(* Syllables finalized so far, most recent first (new ones are consed on
   the front and the list is reversed at the end of [syllabify]). *)
syllables : Syllable.t list;
(* The syllable currently being assembled. *)
current_syllable : Syllable.t;
}
(** [is_tone ipa_table segment] tells whether [segment] is a tone marker.

    A segment without a feature entry in [ipa_table] is never a tone. A
    segment with an entry is a tone when its [HighTone] feature is not
    [Zero], or its [HighReg] feature is not [Zero], or it is the literal
    mid-tone letter "˧". *)
let is_tone ipa_table segment =
  Ipa_table.fts ipa_table segment
  |> Option.value_map ~default:false ~f:(fun fts ->
       (not (Feature.has_feature (Feature.HighTone, Feature.Zero) fts))
       || (not (Feature.has_feature (Feature.HighReg, Feature.Zero) fts))
       || String.equal segment "˧")
(** [is_nucleus ipa_table segment] is [true] when [segment] has a feature
    entry in [ipa_table] and that entry carries [Syllabic = Plus]; segments
    without an entry are not nuclei. *)
let is_nucleus ipa_table segment =
  Option.exists (Ipa_table.fts ipa_table segment) ~f:(fun fts ->
    Feature.has_feature (Feature.Syllabic, Feature.Plus) fts)
(** [is_vowel ipa_table segment] is the same test as [is_nucleus]: a vowel
    is any segment marked [Syllabic = Plus]. *)
let is_vowel ipa_table segment = is_nucleus ipa_table segment
(** [has_vowel_remaining ipa_table segments idx] is [true] when at least one
    segment at position [idx] or later in [segments] is a vowel according
    to [is_vowel].

    The list is walked once: the first [idx] elements are skipped and the
    rest is searched, instead of re-measuring and re-indexing the list for
    every position. An [idx] past the end yields [false]; a non-positive
    [idx] searches the whole list. *)
let has_vowel_remaining ipa_table segments idx =
  List.drop segments idx
  |> List.exists ~f:(fun seg -> is_vowel ipa_table seg)
(** [get_nucleus_sonority sonority nucleus] returns the sonority value used
    for [nucleus] when deciding whether a following vowel extends it.

    - If [Sonority.sonority] knows [nucleus] as a whole, that value is
      returned.
    - Otherwise [nucleus] is split into IPA segments and the value of the
      last segment that resolves is returned.
    - If nothing resolves, the result is [0].

    The fallback used to fold over the bytes of the string, which cannot
    resolve any non-ASCII IPA letter (each is several bytes in UTF-8), so a
    nucleus such as "aɪ" reported the sonority of "a" rather than of its
    last vowel. Segmenting with the IPA table keeps multi-byte letters
    intact.

    NOTE(review): assumes [Ipa_table.ipa_segs] splits a nucleus back into
    the segments that were appended to it — confirm against
    [Syllable.append_nucleus]. *)
let get_nucleus_sonority sonority nucleus =
  (* Any failure of the lookup is treated as "unknown", as before. *)
  let lookup seg =
    Option.try_with (fun () -> Sonority.sonority sonority seg)
  in
  match lookup nucleus with
  | Some value -> value
  | None ->
    (* Keep the function total even if segmentation itself fails. *)
    let parts =
      Option.try_with (fun () ->
        Ipa_table.ipa_segs (Sonority.get_ipa_table sonority) nucleus)
      |> Option.value ~default:[]
    in
    (* Search from the end: the last resolvable segment wins. *)
    List.rev parts |> List.find_map ~f:lookup |> Option.value ~default:0
(** [next_has_features ipa_table segments idx features] is [true] when the
    segment right after position [idx] exists, has a feature entry in
    [ipa_table], and that entry satisfies every [(feature, value)] pair in
    [features].

    It is [false] when there is no following segment or the segment has no
    feature entry. The lookup uses the option-returning [List.nth], so the
    list is traversed once and an out-of-range index never raises. *)
let next_has_features ipa_table segments idx features =
  List.nth segments (idx + 1)
  |> Option.bind ~f:(fun next -> Ipa_table.fts ipa_table next)
  |> Option.value_map ~default:false ~f:(fun seg_features ->
       List.for_all features ~f:(fun (feat, value) ->
         Feature.has_feature (feat, value) seg_features))
(** [new_syllable state syllable idx] closes the syllable under construction
    and starts [syllable] in its place.

    The current syllable is finalized with [idx] as its end index and the
    state's stress index, then pushed onto the front of [state.syllables];
    [syllable] becomes the current syllable with [start_idx] set to [idx]. *)
let new_syllable state syllable idx =
  let { current_syllable; stress_idx; syllables = closed_so_far; _ } = state in
  let closed = Syllable.finalize current_syllable ~end_idx:idx ~stress_idx in
  { state with
    current_syllable = { syllable with start_idx = idx };
    syllables = closed :: closed_so_far;
  }
(** [process_segment state segment idx] folds one segment, found at position
    [idx] of [state.segments], into the syllabification state.

    Decisions are tried in this order:
    - a tone marker is appended to the current syllable's tone;
    - a vowel becomes the nucleus if the syllable has none; otherwise it
      joins the nucleus when the existing nucleus is strictly more sonorous
      (falling diphthong), and opens a new syllable if not;
    - a consonant goes to the onset while there is no nucleus; after a
      nucleus it goes to the coda when it is the last segment, opens a new
      syllable when the next segment is syllabic, goes to the coda when no
      vowel follows or the next segment is not a non-nasal sonorant, and
      opens a new syllable otherwise.

    The "last segment" and "vowel remaining" checks are evaluated only in
    the consonant branches that need them rather than for every segment;
    both scan the segment list. *)
let process_segment state segment idx =
  let table = state.ipa_table.table in
  let current = state.current_syllable in
  let continue_with syl = { state with current_syllable = syl } in
  let next_has feats = next_has_features table state.segments idx feats in
  if is_tone table segment then
    continue_with (Syllable.append_tone current segment)
  else if is_nucleus table segment then begin
    if String.is_empty current.nucleus then
      (* First vowel of the syllable. *)
      continue_with { current with nucleus = segment }
    else begin
      let nucleus_sonority =
        get_nucleus_sonority state.sonority current.nucleus
      in
      (* A vowel the scale does not know counts as sonority 0. *)
      let segment_sonority =
        Option.try_with (fun () -> Sonority.sonority state.sonority segment)
        |> Option.value ~default:0
      in
      if nucleus_sonority > segment_sonority then
        (* Falling sonority: extend the nucleus as a diphthong. *)
        continue_with (Syllable.append_nucleus current segment)
      else
        new_syllable state (Syllable.create ~nucleus:segment ()) idx
    end
  end
  else if String.is_empty current.nucleus then
    (* Consonant before any nucleus: onset. *)
    continue_with (Syllable.append_onset current segment)
  else if idx = List.length state.segments - 1 then
    (* Word-final consonant: coda. *)
    continue_with (Syllable.append_coda current segment)
  else if next_has [ (Feature.Syllabic, Feature.Plus) ] then
    (* A vowel follows: this consonant is the onset of a new syllable. *)
    new_syllable state (Syllable.create ~onset:segment ()) idx
  else if
    (not (has_vowel_remaining table state.segments idx))
    || not
         (next_has
            [ (Feature.Sonorant, Feature.Plus); (Feature.Nasal, Feature.Minus) ])
  then
    (* No later vowel, or the next segment is not a non-nasal sonorant. *)
    continue_with (Syllable.append_coda current segment)
  else
    new_syllable state (Syllable.create ~onset:segment ()) idx
(** [syllabify ~sonority ~ipa ~word ~lang] splits the IPA transcription
    [ipa] of [word] into syllables.

    The transcription is segmented with the IPA table attached to
    [sonority], each segment is folded through [process_segment], and the
    syllable still open at the end is finalized. [word], [ipa] and [lang]
    are copied into the result unchanged; [clean_ipa] is the segments
    joined back together.

    NOTE(review): the stress position is the byte offset of "ˈ" in [ipa]
    (or [-1] when absent), while syllable start/end indices count
    segments — confirm that [Syllable.finalize] expects this. *)
let syllabify ~sonority ~ipa ~word ~lang =
  let ipa_table = Sonority.get_ipa_table sonority in
  let stress_idx =
    String.substr_index ipa ~pattern:"ˈ" |> Option.value ~default:(-1)
  in
  (* Tone normalization is not implemented yet: the transcription is
     segmented exactly as given. *)
  let segments = Ipa_table.ipa_segs ipa_table ipa in
  let clean_ipa = String.concat segments ~sep:"" in
  let start =
    { sonority;
      ipa_table;
      segments;
      stress_idx;
      syllables = [];
      current_syllable = Syllable.empty;
    }
  in
  let finished =
    List.foldi segments ~init:start ~f:(fun pos acc seg ->
      process_segment acc seg pos)
  in
  (* The syllable still under construction has to be closed explicitly. *)
  let trailing =
    Syllable.finalize finished.current_syllable
      ~end_idx:(List.length segments)
      ~stress_idx:finished.stress_idx
  in
  let syllables = List.rev (trailing :: finished.syllables) in
  { word; ipa; lang; clean_ipa; syllables }
|