1 /*
2  * Copyright 2024 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 use std::cmp;
18 
19 /// An implementation of hyphenation for Android.
20 ///
21 /// The hyphenation in Android is done with two steps: first performs the Knuth-Liang hyphenation
22 /// algorithm for populating all possible hyphenation points. Then, resolve hyphenation type from
23 /// the scripts and locales.
24 ///
25 /// The Knuth-Liang hyphenation works like as follows:
26 /// The Knuth-Liang hyphenation uses two dictionary: pattern dictionary and exception files. The
27 /// files end with ".hyp.txt" are exception files and the files end with ".pat.txt" are pattern
28 /// files. If the word is in exception file, the hyphenation is performed as it is specified in the
29 /// exception file.
30 ///
31 /// Then, if the word is not in the exception file, the Knuth-Liang hyphenation is performed with
32 /// hyphenation pattern dictionary. The hyphenation pattern dictionary is a list of sub-word with
33 /// hyphenation level as number. The level values are assigned between each letters including before
34 /// the first letter and last letter. If the value is odd number, then the position is a hyphenation
35 /// point. If the value is even number, then the position is not a hyphenation point. In the pattern
36 /// file, the 0 is dropped, so the meaning of "re4ti4z" is level values "0040040" for sub-word
37 /// "retiz". The hyphenation is performed by iterating all patterns and assigning level values to
38 /// the possible break points. If the break point can be assigned from multiple patterns, the
39 /// maximum value is used. If none of the pattern matches the break point, the level is zero,
40 /// therefore do not break. And finally the odd numbered positions are the break points.
41 ///
42 /// Here is an example how the "hyphenation" is broken into "hy-phen-ation".
43 /// The relevant patterns in the pattern dictionary are
44 /// - hy3ph
45 /// - he2n
46 /// - hena4
47 /// - hen5at
48 /// - 1na
49 /// - n2at
50 /// - 1tio
51 /// - 2io
52 /// - o2n
53 ///
54 /// Then when these patterns are applied to the word "hyphenation", it becomes like
55 ///
56 ///   h y p h e n a t i o n
57 ///  0 0 3 0 0              : hy3ph
58 ///        0 0 2 0          : he2n
59 ///        0 0 0 0 4        : hena4
60 ///        0 0 0 5 0 0      : hen5at
61 ///            1 0 0        : 1na
62 ///            0 2 0 0      : n2at
63 ///                1 0 0 0  : 1tio
64 ///                  2 0 0  : 2io
65 ///                    0 2 0: o2n
66 /// ---------------------------------
67 ///  0 0 3 0 0 2 5 4 2 0 2 0: max
68 ///
69 /// Then, the odd-numbered break points are hyphenation allowed break points, so the result is
70 /// "hy-phen-ation".
71 ///
72 /// In the Android implementation, the hyphenation pattern file is preprocessed to Trie in build
73 /// time. For the detailed file format, see comments of HyphenationData struct.
74 ///
75 /// Once the all possible hyphenation break points are collected, the decide the hyphenation break
76 /// type is determined based on the script and locale. For example, in case of Arabic, the letter
77 /// form should not be changed by hyphenation, so ZWJ can be inserted before and after hyphen
78 /// letter.
79 
80 const CHAR_SOFT_HYPHEN: u16 = 0x00AD;
81 const CHAR_MIDDLE_DOT: u16 = 0x00B7;
82 const CHAR_HYPHEN_MINUS: u16 = 0x002D;
83 const CHAR_HYPHEN: u16 = 0x2010;
84 
85 // The following U_JT_* constants must be same to the ones defined in
86 // frameworks/minikin/lib/minikin/ffi/IciBridge.h
87 // TODO: Replace with ICU4X once it becomes available in Android.
88 const U_JT_NON_JOINING: u8 = 0;
89 const U_JT_DUAL_JOINING: u8 = 1;
90 const U_JT_RIGHT_JOINING: u8 = 2;
91 const U_JT_LEFT_JOINING: u8 = 3;
92 const U_JT_JOIN_CAUSING: u8 = 4;
93 const U_JT_TRANSPARENT: u8 = 5;
94 
95 // The following USCRIPT_* constants must be same to the ones defined in
96 // frameworks/minikin/lib/minikin/ffi/IciBridge.h
97 // TODO: Replace with ICU4X once it becomes available in Android.
98 const USCRIPT_LATIN: u8 = 0;
99 const USCRIPT_ARABIC: u8 = 1;
100 const USCRIPT_KANNADA: u8 = 2;
101 const USCRIPT_MALAYALAM: u8 = 3;
102 const USCRIPT_TAMIL: u8 = 4;
103 const USCRIPT_TELUGU: u8 = 5;
104 const USCRIPT_ARMENIAN: u8 = 6;
105 const USCRIPT_CANADIAN_ABORIGINAL: u8 = 7;
106 
107 use crate::ffi::getJoiningType;
108 use crate::ffi::getScript;
109 
110 /// Hyphenation types
111 /// The following values must be equal to the ones in
112 /// frameworks/minikin/include/minikin/Hyphenator.h
113 #[repr(u8)]
114 #[derive(PartialEq, Copy, Clone)]
115 pub enum HyphenationType {
116     /// Do not break.
117     DontBreak = 0,
118     /// Break the line and insert a normal hyphen.
119     BreakAndInsertHyphen = 1,
120     /// Break the line and insert an Armenian hyphen (U+058A).
121     BreakAndInsertArmenianHyphen = 2,
122     /// Break the line and insert a Canadian Syllabics hyphen (U+1400).
123     BreakAndInsertUcasHyphen = 4,
124     /// Break the line, but don't insert a hyphen. Used for cases when there is already a hyphen
125     /// present or the script does not use a hyphen (e.g. in Malayalam).
126     BreakAndDontInsertHyphen = 5,
127     /// Break and replace the last code unit with hyphen. Used for Catalan "l·l" which hyphenates
128     /// as "l-/l".
129     BreakAndReplaceWithHyphen = 6,
130     /// Break the line, and repeat the hyphen (which is the last character) at the beginning of the
131     /// next line. Used in Polish (where "czerwono-niebieska" should hyphenate as
132     /// "czerwono-/-niebieska") and Slovenian.
133     BreakAndInsertHyphenAtNextLine = 7,
134     /// Break the line, insert a ZWJ and hyphen at the first line, and a ZWJ at the second line.
135     /// This is used in Arabic script, mostly for writing systems of Central Asia. It's our default
136     /// behavior when a soft hyphen is used in Arabic script.
137     BreakAndInsertHyphenAndZwj = 8,
138 }
139 
140 /// Hyphenation locale
141 #[repr(u8)]
142 #[derive(PartialEq, Copy, Clone)]
143 pub enum HyphenationLocale {
144     /// Other locale
145     Other = 0,
146     /// Catalan
147     Catalan = 1,
148     /// Polish
149     Polish = 2,
150     /// Slovenian
151     Slovenian = 3,
152 }
153 
154 const MAX_HYPHEN_SIZE: u32 = 64;
155 
156 struct HyphenationData<'a> {
157     bytes: &'a [u8],
158 }
159 
160 /// The Hyphenation pattern file is encoded into binary format during build time.
161 /// The hyphenation pattern file is encoded into three objects: AlphabetTable, Trie, Patterns.
162 ///
163 /// First, to avoid high value of utf16 char values in Trie object, char values are mapped to
164 /// internal alphabet codes. The AlphabetTable0 and AndroidTable1 has a map from utf16 char values
165 /// to internal u16 alphabet codes. The AlphabetTable0 is used if the min and max used code points
166 /// has less than 1024, i.e. max_codepoint - min_codepoint < 1024. The AlphabetTable1 is used
167 /// otherwise.
168 ///
169 /// Then, the pattern file is encoded with Trie and Pattern object with using internal
170 /// alphabet code. For example, in case of the entry "ef5i5nite", the hyphenation score "00550000"
171 /// is stored in the Pattern object and the subword "efinite" is stored in the Trie object.
172 ///
173 /// The Trie object is encoded as one dimensional u32 arrays. Each u32 integer contains packed
174 /// index to the Pattern object, index to the next node entry and alphabet code.
175 /// Trie Entry:
176 ///    0                   1                   2                   3
177 ///    0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1  (bits)
178 ///   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
179 ///   |   index to pattern data   |  index to the next node |   code  |
180 ///   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
181 ///   Note: the layout is as an example of pattern_shift = 19, link_shift = 5.
182 ///
183 /// The Pattern object is encoded into two data: entry list and data payload. The entry is a packed
184 /// u32 integer that contains length of the pattern, an amount of shift of the pattern index and
185 /// an offset from the payload head.
186 ///
187 /// Pattern Entry:
188 ///    0                   1                   2                   3
189 ///    0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1  (bits)
190 ///   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
191 ///   |len of pat | pat shift |      offset to the pattern data       |
192 ///   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
193 ///
194 /// The pattern data and related information can be obtained as follows:
195 ///
196 ///   // Pattern information
197 ///   let entry = pattern[16 + pattern_index * 4]  // read u32 as little endian.
198 ///   let pattern_length = entry >> 26
199 ///   let pattern_shift = (entry > 20) & 0x3f
200 ///
201 ///   // Pattern value retrieval: i-th offset in the word.
202 ///   let pattern_offset = pattern[8] // read u32 as little endian.
203 ///   let pattern_value = pattern[pattern_offset + (entry & 0xfffff) + i]
204 impl<'a> HyphenationData<'a> {
new(bytes: &'a [u8]) -> Self205     pub const fn new(bytes: &'a [u8]) -> Self {
206         HyphenationData { bytes }
207     }
208 
read_u32(&self, offset: u32) -> u32209     pub fn read_u32(&self, offset: u32) -> u32 {
210         let usize_offset = offset as usize;
211         self.bytes
212             .get(usize_offset..usize_offset + 4)
213             .map(|x: &[u8]| u32::from_le_bytes(x.try_into().unwrap()))
214             .unwrap()
215     }
216 }
217 
218 /// Header struct of the hyphenation pattern file.
219 /// The object layout follows:
220 ///    0   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F    (bytes)
221 ///   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
222 ///   |      magic    |     version   |alphabet offset|  trie offset  |
223 ///   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
224 ///   |pattern offset |   file size   |
225 ///   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
226 pub struct Header<'a> {
227     data: HyphenationData<'a>,
228 }
229 
230 /// Alphabet Table version 0 struct of the hyphenation pattern file.
231 /// The object layout follows:
232 ///    0   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F    (bytes)
233 ///   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
234 ///   |     version   | min codepoint | max codepoint |    payload
235 ///   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
236 pub struct AlphabetTable0<'a> {
237     data: HyphenationData<'a>,
238     min_codepoint: u32,
239     max_codepoint: u32,
240 }
241 
242 /// Alphabet Table version 1 struct of the hyphenation pattern file.
243 /// The object layout follows:
244 ///    0   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F    (bytes)
245 ///   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
246 ///   |     version   | num of entries|         payload
247 ///   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
248 pub struct AlphabetTable1<'a> {
249     data: HyphenationData<'a>,
250     num_entries: u32,
251 }
252 
253 /// An entry of alphabet table version 1 struct of the hyphenation pattern file.
254 /// The entry is packed u32 value: the high 21 bits are code point and low 11 bits
255 /// are alphabet code value.
256 ///    0                   1                   2                   3
257 ///    0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1  (bits)
258 ///   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
259 ///   |                code point               |     code value      |
260 ///   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
261 pub struct AlphabetTable1Entry {
262     entry: u32,
263 }
264 
265 /// Trie struct of the hyphenation pattern file.
266 /// The object layout follows:
267 ///    0   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F    (bytes)
268 ///   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
269 ///   |     version   |   char mask   |  link shift   |   link mask   |
270 ///   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
271 ///   | pattern shift |  num entries  |         payload
272 ///   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
273 pub struct Trie<'a> {
274     data: HyphenationData<'a>,
275 }
276 
277 /// Pattern struct of the hyphenation pattern file.
278 /// The object layout follows:
279 ///    0   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F    (bytes)
280 ///   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
281 ///   |     version   | num entries   | pattern offset|  pattern size |
282 ///   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
283 ///   | payload
284 ///   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
285 pub struct Pattern<'a> {
286     data: HyphenationData<'a>,
287     pattern_offset: u32,
288 }
289 
290 /// An entry of pattern struct of the hyphenation pattern file.
291 /// The entry is packed u32 value: the highest 6 bits are for length, next 6 bits are amount of
292 /// shift, and lowest 20 bits are offset of the first value from the pattern offset value.
293 ///    0                   1                   2                   3
294 ///    0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1  (bits)
295 ///   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
296 ///   |   length  |   shift   |     offset of the first value         |
297 ///   +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
298 pub struct PatternEntry<'a> {
299     data: HyphenationData<'a>,
300     pattern_offset: u32,
301     entry: u32,
302 }
303 
304 impl<'a> Header<'a> {
305     /// Construct a reader of the Header struct from the byte array.
new(bytes: &'a [u8]) -> Self306     pub const fn new(bytes: &'a [u8]) -> Self {
307         Header { data: HyphenationData::new(bytes) }
308     }
309 
310     /// Returns the reader of the alphabet code.
alphabet_table(&self) -> Option<Box<dyn AlphabetLookup + 'a>>311     pub fn alphabet_table(&self) -> Option<Box<dyn AlphabetLookup + 'a>> {
312         let offset = self.data.read_u32(8);
313         let version = self.data.read_u32(offset);
314         return match version {
315             0 => Some(Box::new(AlphabetTable0::new(self.read_offset_and_slice(8)))),
316             1 => Some(Box::new(AlphabetTable1::new(self.read_offset_and_slice(8)))),
317             _ => None,
318         };
319     }
320 
321     /// Returns the reader of the trie struct.
trie_table(&self) -> Trie<'a>322     pub fn trie_table(&self) -> Trie<'a> {
323         Trie::new(self.read_offset_and_slice(12))
324     }
325 
326     /// Returns the reader of the pattern struct.
pattern_table(&self) -> Pattern<'a>327     pub fn pattern_table(&self) -> Pattern<'a> {
328         Pattern::new(self.read_offset_and_slice(16))
329     }
330 
read_offset_and_slice(&self, offset: u32) -> &'a [u8]331     fn read_offset_and_slice(&self, offset: u32) -> &'a [u8] {
332         let offset = self.data.read_u32(offset) as usize;
333         self.data.bytes.get(offset..).unwrap()
334     }
335 }
336 
337 pub trait AlphabetLookup {
338     /// Get the alphabet code for the code point.
get_at(&self, c: u32) -> Option<u16>339     fn get_at(&self, c: u32) -> Option<u16>;
340 
341     /// Lookup the internal alphabet codes from UTF-16 character codes.
lookup( &self, alpha_codes: &mut [u16; MAX_HYPHEN_SIZE as usize], word: &[u16], ) -> HyphenationType342     fn lookup(
343         &self,
344         alpha_codes: &mut [u16; MAX_HYPHEN_SIZE as usize],
345         word: &[u16],
346     ) -> HyphenationType {
347         let mut result = HyphenationType::BreakAndInsertHyphen;
348         alpha_codes[0] = 0; // word start
349         for i in 0..word.len() {
350             let c = word[i] as u32;
351             if let Some(code) = self.get_at(c) {
352                 alpha_codes[i + 1] = code;
353             } else {
354                 return HyphenationType::DontBreak;
355             }
356             if result == HyphenationType::BreakAndInsertHyphen {
357                 result = Hyphenator::hyphenation_type_based_on_script(c);
358             }
359         }
360         alpha_codes[word.len() + 1] = 0; // word termination
361         result
362     }
363 }
364 
365 /// Map from utf16 code unit to the internal alphabet code.
366 impl<'a> AlphabetTable0<'a> {
367     /// Construct a reader of the Alphabet Table version 0 struct from the byte array.
new(bytes: &'a [u8]) -> Self368     pub fn new(bytes: &'a [u8]) -> Self {
369         let data = HyphenationData::new(bytes);
370         let min_codepoint = data.read_u32(4);
371         let max_codepoint = data.read_u32(8);
372         AlphabetTable0 { data, min_codepoint, max_codepoint }
373     }
374 }
375 
376 impl<'a> AlphabetLookup for AlphabetTable0<'a> {
377     /// Returns an entry of the specified offset.
get_at(&self, offset: u32) -> Option<u16>378     fn get_at(&self, offset: u32) -> Option<u16> {
379         if offset < self.min_codepoint || offset >= self.max_codepoint {
380             None
381         } else {
382             let code = self.data.bytes[(offset - self.min_codepoint) as usize + 12] as u16;
383             if code == 0 {
384                 None
385             } else {
386                 Some(code)
387             }
388         }
389     }
390 }
391 
392 /// Map from utf16 code unit to the internal alphabet code.
393 impl<'a> AlphabetTable1<'a> {
394     /// Construct a reader of the Alphabet Table version 1 struct from the byte array.
new(bytes: &'a [u8]) -> Self395     pub fn new(bytes: &'a [u8]) -> Self {
396         let data = HyphenationData::new(bytes);
397         let num_entries = data.read_u32(4);
398         AlphabetTable1 { data, num_entries }
399     }
400 
lower_bounds(&self, value: u32) -> Option<u32>401     fn lower_bounds(&self, value: u32) -> Option<u32> {
402         let mut b = 0;
403         let mut e = self.num_entries;
404         while b != e {
405             let m = b + (e - b) / 2;
406             let c = self.data.read_u32(8 + m * 4);
407             if c >= value {
408                 e = m;
409             } else {
410                 b = m + 1;
411             }
412         }
413         if b == self.num_entries {
414             None
415         } else {
416             Some(b)
417         }
418     }
419 }
420 
421 impl<'a> AlphabetLookup for AlphabetTable1<'a> {
get_at(&self, c: u32) -> Option<u16>422     fn get_at(&self, c: u32) -> Option<u16> {
423         if let Some(r) = self.lower_bounds(c << 11) {
424             let entry = AlphabetTable1Entry::new(self.data.read_u32(8 + r * 4));
425             if entry.codepoint() == c {
426                 Some(entry.value())
427             } else {
428                 None
429             }
430         } else {
431             None
432         }
433     }
434 }
435 
436 /// A packed u32 entry of the AlphabetTable1.
437 impl AlphabetTable1Entry {
new(entry_value: u32) -> Self438     pub const fn new(entry_value: u32) -> Self {
439         AlphabetTable1Entry { entry: entry_value }
440     }
441 
442     /// Unpack code point from entry value.
codepoint(&self) -> u32443     pub fn codepoint(&self) -> u32 {
444         self.entry >> 11
445     }
446 
447     /// Unpack value from entry value.
value(&self) -> u16448     pub fn value(&self) -> u16 {
449         (self.entry & 0x7ff).try_into().unwrap()
450     }
451 }
452 
453 /// A Trie object.
454 /// See the function comment of HyphenationData for the details.
455 impl<'a> Trie<'a> {
456     /// Construct a reader of the Trie struct from the byte array.
new(bytes: &'a [u8]) -> Self457     pub const fn new(bytes: &'a [u8]) -> Self {
458         Trie { data: HyphenationData::new(bytes) }
459     }
460 
461     /// Returns an entry of at the offset.
462     /// The entry of the next alphabet code is
463     ///
464     /// let entry = trie.get_at(node + alphabet_codes[char])
get_at(&self, offset: u32) -> u32465     pub fn get_at(&self, offset: u32) -> u32 {
466         self.data.read_u32(24 + offset * 4)
467     }
468 
469     /// Returns the bit mask for the character code point of the node.
470     /// You can get node's character code point by
471     ///
472     /// let node_character = entry & char_mask.
char_mask(&self) -> u32473     pub fn char_mask(&self) -> u32 {
474         self.data.read_u32(4)
475     }
476 
477     /// Returns the amount of shift of the node index.
478     /// You can get node number as following
479     ///
480     /// let next_node = (entry & link_mask) >> link_shift
link_shift(&self) -> u32481     pub fn link_shift(&self) -> u32 {
482         self.data.read_u32(8)
483     }
484 
485     /// Returns the mask for the node index.
486     /// You can get node number as following
487     ///
488     /// let next_node = (entry & link_mask) >> link_shift
link_mask(&self) -> u32489     pub fn link_mask(&self) -> u32 {
490         self.data.read_u32(12)
491     }
492 
493     /// Returns the amount of shift of the pattern index.
494     /// You can get pattern index as following
495     ///
496     /// let pattern_index = entry >> pattern_shift
pattern_shift(&self) -> u32497     pub fn pattern_shift(&self) -> u32 {
498         self.data.read_u32(16)
499     }
500 }
501 
502 /// A Pattern object.
503 /// See the function comment of HyphenationData for the details.
504 impl<'a> Pattern<'a> {
505     /// Construct a reader of the Pattern struct from the byte array.
new(bytes: &'a [u8]) -> Self506     pub fn new(bytes: &'a [u8]) -> Self {
507         let data = HyphenationData::new(bytes);
508         let pattern_offset = data.read_u32(8);
509         Pattern { data, pattern_offset }
510     }
511 
512     /// Returns a packed u32 entry at the given offset.
entry_at(&self, offset: u32) -> PatternEntry<'a>513     pub fn entry_at(&self, offset: u32) -> PatternEntry<'a> {
514         let entry = self.data.read_u32(16 + offset * 4);
515         PatternEntry::new(self.data.bytes, self.pattern_offset, entry)
516     }
517 }
518 
519 /// An entry of the pattern object.
520 impl<'a> PatternEntry<'a> {
521     /// Construct a reader of the Pattern struct from the byte array.
new(bytes: &'a [u8], pattern_offset: u32, entry: u32) -> Self522     pub const fn new(bytes: &'a [u8], pattern_offset: u32, entry: u32) -> Self {
523         PatternEntry { data: HyphenationData::new(bytes), pattern_offset, entry }
524     }
525 
526     /// Unpack length of the pattern from the packed entry value.
len(&self) -> u32527     pub fn len(&self) -> u32 {
528         self.entry >> 26
529     }
530 
531     /// Unpack an amount of shift of the pattern data from the packed entry value.
shift(&self) -> u32532     pub fn shift(&self) -> u32 {
533         (self.entry >> 20) & 0x3f
534     }
535 
536     /// Returns a hyphenation score value at the offset in word with the entry.
value_at(&self, offset: u32) -> u8537     pub fn value_at(&self, offset: u32) -> u8 {
538         self.data.bytes[(self.pattern_offset + (self.entry & 0xfffff) + offset) as usize]
539     }
540 }
541 
542 /// Performs hyphenation
543 pub struct Hyphenator {
544     data: &'static [u8],
545     min_prefix: u32,
546     min_suffix: u32,
547     locale: HyphenationLocale,
548 }
549 
550 impl Hyphenator {
551     /// Create a new hyphenator instance
new(data: &'static [u8], min_prefix: u32, min_suffix: u32, locale: &str) -> Self552     pub fn new(data: &'static [u8], min_prefix: u32, min_suffix: u32, locale: &str) -> Self {
553         logger::init(
554             logger::Config::default()
555                 .with_tag_on_device("Minikin")
556                 .with_max_level(log::LevelFilter::Trace),
557         );
558         Self {
559             data,
560             min_prefix,
561             min_suffix,
562             locale: if locale == "pl" {
563                 HyphenationLocale::Polish
564             } else if locale == "ca" {
565                 HyphenationLocale::Catalan
566             } else if locale == "sl" {
567                 HyphenationLocale::Slovenian
568             } else {
569                 HyphenationLocale::Other
570             },
571         }
572     }
573 
574     /// Performs a hyphenation
hyphenate(&self, word: &[u16], out: &mut [u8])575     pub fn hyphenate(&self, word: &[u16], out: &mut [u8]) {
576         let len: u32 = word.len().try_into().unwrap();
577         let padded_len = len + 2;
578         if !self.data.is_empty()
579             && len >= self.min_prefix + self.min_suffix
580             && padded_len <= MAX_HYPHEN_SIZE
581         {
582             let header = Header::new(self.data);
583             let mut alpha_codes: [u16; MAX_HYPHEN_SIZE as usize] = [0; MAX_HYPHEN_SIZE as usize];
584             let hyphen_value = if let Some(alphabet) = header.alphabet_table() {
585                 alphabet.lookup(&mut alpha_codes, word)
586             } else {
587                 HyphenationType::DontBreak
588             };
589 
590             if hyphen_value != HyphenationType::DontBreak {
591                 self.hyphenate_from_codes(alpha_codes, padded_len, hyphen_value, out);
592                 return;
593             }
594             // TODO: try NFC normalization
595             // TODO: handle non-BMP Unicode (requires remapping of offsets)
596         }
597         // Note that we will always get here if the word contains a hyphen or a soft hyphen, because
598         // the alphabet is not expected to contain a hyphen or a soft hyphen character, so
599         // alphabetLookup would return DONT_BREAK.
600         self.hyphenate_with_no_pattern(word, out);
601     }
602 
603     /// This function determines whether a character is like U+2010 HYPHEN in line breaking and
604     /// usage: a character immediately after which line breaks are allowed, but words containing
605     /// it should not be automatically hyphenated using patterns. This is a curated set, created by
606     /// manually inspecting all the characters that have the Unicode line breaking property of BA or
607     /// HY and seeing which ones are hyphens.
is_line_breaking_hyphen(c: u16) -> bool608     fn is_line_breaking_hyphen(c: u16) -> bool {
609         c == 0x002D ||  // HYPHEN-MINUS
610             c == 0x058A ||  // ARMENIAN HYPHEN
611             c == 0x05BE ||  // HEBREW PUNCTUATION MAQAF
612             c == 0x1400 ||  // CANADIAN SYLLABICS HYPHEN
613             c == 0x2010 ||  // HYPHEN
614             c == 0x2013 ||  // EN DASH
615             c == 0x2027 ||  // HYPHENATION POINT
616             c == 0x2E17 ||  // DOUBLE OBLIQUE HYPHEN
617             c == 0x2E40 // DOUBLE HYPHEN
618     }
619 
620     /// Resolves the hyphenation type for Arabic text.
621     /// In case of Arabic text, the letter form should not be changed by hyphenation.
622     /// So, if the hyphenation is in the middle of the joining context, insert ZWJ for keeping the
623     /// form from the original text.
get_hyph_type_for_arabic(word: &[u16], location: u32) -> HyphenationType624     fn get_hyph_type_for_arabic(word: &[u16], location: u32) -> HyphenationType {
625         let mut i = location;
626         let mut join_type: u8 = U_JT_NON_JOINING;
627         while i < word.len().try_into().unwrap() {
628             join_type = getJoiningType(word[i as usize].into());
629             if join_type != U_JT_TRANSPARENT {
630                 break;
631             }
632             i += 1;
633         }
634         if join_type == U_JT_DUAL_JOINING
635             || join_type == U_JT_RIGHT_JOINING
636             || join_type == U_JT_JOIN_CAUSING
637         {
638             // The next character is of the type that may join the last character. See if the last
639             // character is also of the right type.
640             join_type = U_JT_NON_JOINING;
641             if i >= 2 {
642                 i = location - 2; // skip the soft hyphen
643                 loop {
644                     join_type = getJoiningType(word[i as usize].into());
645                     if join_type != U_JT_TRANSPARENT {
646                         break;
647                     }
648                     if i == 0 {
649                         break;
650                     }
651                     i -= 1;
652                 }
653             }
654             if join_type == U_JT_DUAL_JOINING
655                 || join_type == U_JT_LEFT_JOINING
656                 || join_type == U_JT_JOIN_CAUSING
657             {
658                 return HyphenationType::BreakAndInsertHyphenAndZwj;
659             }
660         }
661         HyphenationType::BreakAndInsertHyphen
662     }
663 
664     /// Performs the hyphenation without pattern files.
hyphenate_with_no_pattern(&self, word: &[u16], out: &mut [u8])665     fn hyphenate_with_no_pattern(&self, word: &[u16], out: &mut [u8]) {
666         let word_len: u32 = word.len().try_into().unwrap();
667         out[0] = HyphenationType::DontBreak as u8;
668         for i in 1..word_len {
669             let prev_char = word[i as usize - 1];
670             if i > 1 && Self::is_line_breaking_hyphen(prev_char) {
671                 if (prev_char == CHAR_HYPHEN_MINUS || prev_char == CHAR_HYPHEN)
672                     && (self.locale == HyphenationLocale::Polish
673                         || self.locale == HyphenationLocale::Slovenian)
674                     && getScript(word[i as usize].into()) == USCRIPT_LATIN
675                 {
676                     // In Polish and Slovenian, hyphens get repeated at the next line. To be safe,
677                     // we will do this only if the next character is Latin.
678                     out[i as usize] = HyphenationType::BreakAndInsertHyphenAtNextLine as u8;
679                 } else {
680                     out[i as usize] = HyphenationType::BreakAndDontInsertHyphen as u8;
681                 }
682             } else if i > 1 && prev_char == CHAR_SOFT_HYPHEN {
683                 // Break after soft hyphens, but only if they don't start the word (a soft hyphen
684                 // starting the word doesn't give any useful break opportunities). The type of the
685                 // break is based on the script of the character we break on.
686                 if getScript(word[i as usize].into()) == USCRIPT_ARABIC {
687                     // For Arabic, we need to look and see if the characters around the soft hyphen
688                     // actually join. If they don't, we'll just insert a normal hyphen.
689                     out[i as usize] = Self::get_hyph_type_for_arabic(word, i) as u8;
690                 } else {
691                     out[i as usize] =
692                         Self::hyphenation_type_based_on_script(word[i as usize] as u32) as u8;
693                 }
694             } else if prev_char == CHAR_MIDDLE_DOT
695                 && self.min_prefix < i
696                 && i <= word_len - self.min_suffix
697                 && ((word[i as usize - 2] == 'l' as u16 && word[i as usize] == 'l' as u16)
698                     || (word[i as usize - 2] == 'L' as u16 && word[i as usize] == 'L' as u16))
699                 && self.locale == HyphenationLocale::Catalan
700             {
701                 // In Catalan, "l·l" should break as "l-" on the first line
702                 // and "l" on the next line.
703                 out[i as usize] = HyphenationType::BreakAndReplaceWithHyphen as u8;
704             } else {
705                 out[i as usize] = HyphenationType::DontBreak as u8;
706             }
707         }
708     }
709 
710     /// Performs the hyphenation with pattern file.
hyphenate_from_codes( &self, codes: [u16; MAX_HYPHEN_SIZE as usize], len: u32, hyphen_value: HyphenationType, out: &mut [u8], )711     fn hyphenate_from_codes(
712         &self,
713         codes: [u16; MAX_HYPHEN_SIZE as usize],
714         len: u32,
715         hyphen_value: HyphenationType,
716         out: &mut [u8],
717     ) {
718         let header = Header::new(self.data);
719         let trie = header.trie_table();
720         let pattern = header.pattern_table();
721         let char_mask = trie.char_mask();
722         let link_shift = trie.link_shift();
723         let link_mask = trie.link_mask();
724         let pattern_shift = trie.pattern_shift();
725         let max_offset = len - self.min_suffix - 1;
726 
727         for i in 0..(len - 1) {
728             let mut node: u32 = 0; // index into Trie table
729             for j in i..len {
730                 let c: u32 = codes[j as usize].into();
731                 let entry = trie.get_at(node + c);
732                 if (entry & char_mask) == c {
733                     node = (entry & link_mask) >> link_shift;
734                 } else {
735                     break;
736                 }
737                 let pat_ix = trie.get_at(node) >> pattern_shift;
738                 // pat_ix contains a 3-tuple of length, shift (number of trailing zeros), and an
739                 // offset into the buf pool. This is the pattern for the substring (i..j) we just
740                 // matched, which we combine (via point-wise max) into the buffer vector.
741                 if pat_ix != 0 {
742                     let pat_entry = pattern.entry_at(pat_ix);
743                     let pat_len = pat_entry.len();
744                     let pat_shift = pat_entry.shift();
745                     let offset = j + 1 - (pat_len + pat_shift);
746                     // offset is the index within buffer that lines up with the start of pat_buf
747                     let start = if self.min_prefix < offset { 0 } else { self.min_prefix - offset };
748                     if offset > max_offset {
749                         continue;
750                     }
751                     let end = cmp::min(pat_len, max_offset - offset);
752                     for k in start..end {
753                         out[(offset + k) as usize] =
754                             cmp::max(out[(offset + k) as usize], pat_entry.value_at(k));
755                     }
756                 }
757             }
758         }
759 
760         // Since the above calculation does not modify values outside
761         // [mMinPrefix, len - mMinSuffix], they are left as 0 = DONT_BREAK.
762         for r in out.iter_mut().take(max_offset as usize).skip(self.min_prefix as usize) {
763             *r = if *r & 1 != 0 { hyphen_value as u8 } else { HyphenationType::DontBreak as u8 };
764         }
765     }
766 
hyphenation_type_based_on_script(code_point: u32) -> HyphenationType767     fn hyphenation_type_based_on_script(code_point: u32) -> HyphenationType {
768         let script = getScript(code_point);
769         if script == USCRIPT_KANNADA
770             || script == USCRIPT_MALAYALAM
771             || script == USCRIPT_TAMIL
772             || script == USCRIPT_TELUGU
773         {
774             HyphenationType::BreakAndDontInsertHyphen
775         } else if script == USCRIPT_ARMENIAN {
776             HyphenationType::BreakAndInsertArmenianHyphen
777         } else if script == USCRIPT_CANADIAN_ABORIGINAL {
778             HyphenationType::BreakAndInsertUcasHyphen
779         } else {
780             HyphenationType::BreakAndInsertHyphen
781         }
782     }
783 }
784