1 /*
2  * Copyright (C) 2015 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 /**
18  * An implementation of Liang's hyphenation algorithm.
19  */
20 
21 #ifndef MINIKIN_HYPHENATOR_H
22 #define MINIKIN_HYPHENATOR_H
23 
24 #include <string>
25 #include <vector>
26 
27 #include "minikin/Characters.h"
28 #include "minikin/U16StringPiece.h"
29 
30 namespace minikin {
31 
class Hyphenator;

// Registers `hyphenator` under the locale string `localeStr`.
// The registry does not take ownership of the hyphenator. Ownership is not a concern in practice:
// in Android, the Hyphenator is allocated in Zygote and never gets released.
void addHyphenator(const std::string& localeStr, const Hyphenator* hyphenator);

// Registers an alias from `fromLocaleStr` to `toLocaleStr`.
// NOTE(review): the exact lookup semantics are defined in the implementation file, not here.
void addHyphenatorAlias(const std::string& fromLocaleStr, const std::string& toLocaleStr);
39 
// Describes what happens at a potential break position inside a word.
enum class HyphenationType : uint8_t {
    // Note: code elsewhere implicitly assumes that DONT_BREAK is 0; do not renumber it.

    // No break is allowed at this position.
    DONT_BREAK = 0,
    // Break the line and insert a normal hyphen.
    BREAK_AND_INSERT_HYPHEN = 1,
    // Break the line and insert an Armenian hyphen (U+058A).
    BREAK_AND_INSERT_ARMENIAN_HYPHEN = 2,
    // Break the line and insert a maqaf (Hebrew hyphen, U+05BE).
    BREAK_AND_INSERT_MAQAF = 3,
    // Break the line and insert a Canadian Syllabics hyphen (U+1400).
    BREAK_AND_INSERT_UCAS_HYPHEN = 4,
    // Break the line without inserting a hyphen. Used when a hyphen is already present or when
    // the script does not use a hyphen (e.g. Malayalam).
    BREAK_AND_DONT_INSERT_HYPHEN = 5,
    // Break the line and replace the last code unit with a hyphen. Used for Catalan "l·l",
    // which hyphenates as "l-/l".
    BREAK_AND_REPLACE_WITH_HYPHEN = 6,
    // Break the line and repeat the hyphen (which is the last character) at the beginning of
    // the next line. Used in Polish (where "czerwono-niebieska" should hyphenate as
    // "czerwono-/-niebieska") and Slovenian.
    BREAK_AND_INSERT_HYPHEN_AT_NEXT_LINE = 7,
    // Break the line, insert a ZWJ and a hyphen at the end of the first line, and a ZWJ at the
    // start of the second line. Used in Arabic script, mostly for writing systems of Central
    // Asia. This is the default behavior when a soft hyphen is used in Arabic script.
    BREAK_AND_INSERT_HYPHEN_AND_ZWJ = 8
};
68 
// A hyphen edit represents an edit to the string when a word is hyphenated.
// The most common hyphen edit is adding a "-" at the end of a syllable, but nonstandard
// hyphenation allows for more choices.
// There are two kinds of edit: one applied at the end of a line (EndHyphenEdit) and one applied
// at the beginning of a line (StartHyphenEdit). Both are packed into a single HyphenEdit byte.
enum class EndHyphenEdit : uint8_t {
    // Note that everything inserting characters must have a value greater than or equal to
    // INSERT_HYPHEN; isInsertion(EndHyphenEdit) relies on this ordering.
    NO_EDIT = 0b000,
    REPLACE_WITH_HYPHEN = 0b001,

    INSERT_HYPHEN = 0b010,
    INSERT_ARMENIAN_HYPHEN = 0b011,
    INSERT_MAQAF = 0b100,
    INSERT_UCAS_HYPHEN = 0b101,
    INSERT_ZWJ_AND_HYPHEN = 0b110,
};

enum class StartHyphenEdit : uint8_t {
    NO_EDIT = 0b00,

    INSERT_HYPHEN = 0b01,
    INSERT_ZWJ = 0b10,
};

// Packed form of a (StartHyphenEdit, EndHyphenEdit) pair:
//   bits 0-2: EndHyphenEdit, bits 3-4: StartHyphenEdit.
typedef uint8_t HyphenEdit;
constexpr uint8_t START_BITS_SHIFT = 3;
// The following two masks must keep in sync with the definitions in the Java code at:
// frameworks/base/graphics/java/android/graphics/Paint.java
constexpr uint8_t MASK_END_OF_LINE = 0b00111;
constexpr uint8_t MASK_START_OF_LINE = 0b11000;

// Guard the bit layout: the two fields must not overlap, and the end field must occupy exactly
// the bits below START_BITS_SHIFT.
static_assert((MASK_END_OF_LINE & MASK_START_OF_LINE) == 0,
              "start and end hyphen edit masks must not overlap");
static_assert(MASK_END_OF_LINE == (1 << START_BITS_SHIFT) - 1,
              "MASK_END_OF_LINE must cover exactly the bits below START_BITS_SHIFT");

// Packs a start-of-line edit and an end-of-line edit into a single HyphenEdit byte.
inline HyphenEdit packHyphenEdit(StartHyphenEdit start, EndHyphenEdit end) {
    // The operands are promoted to int by the shift; cast back explicitly instead of relying on
    // an implicit narrowing conversion in the return statement.
    return static_cast<HyphenEdit>(static_cast<uint8_t>(start) << START_BITS_SHIFT |
                                   static_cast<uint8_t>(end));
}

// Extracts the end-of-line edit from a packed HyphenEdit.
inline EndHyphenEdit endHyphenEdit(HyphenEdit hyphenEdit) {
    return static_cast<EndHyphenEdit>(hyphenEdit & MASK_END_OF_LINE);
}

// Extracts the start-of-line edit from a packed HyphenEdit.
// Bits outside MASK_START_OF_LINE are ignored, mirroring endHyphenEdit(), so stray high bits in
// the packed byte cannot produce a value that matches no StartHyphenEdit enumerator.
inline StartHyphenEdit startHyphenEdit(HyphenEdit hyphenEdit) {
    return static_cast<StartHyphenEdit>((hyphenEdit & MASK_START_OF_LINE) >> START_BITS_SHIFT);
}

// Returns true if the end-of-line edit replaces the last code unit rather than inserting.
inline bool isReplacement(EndHyphenEdit hyph) {
    return hyph == EndHyphenEdit::REPLACE_WITH_HYPHEN;
}

// Returns true if the start-of-line edit inserts any character.
inline bool isInsertion(StartHyphenEdit hyph) {
    return hyph != StartHyphenEdit::NO_EDIT;
}

// Returns true if the end-of-line edit inserts any character. Relies on every inserting
// enumerator having a value greater than or equal to INSERT_HYPHEN.
inline bool isInsertion(EndHyphenEdit hyph) {
    return static_cast<uint8_t>(hyph) >= static_cast<uint8_t>(EndHyphenEdit::INSERT_HYPHEN);
}
123 
// Yields the element count of a built-in array. The parameter is a reference to an array, so
// the count is deduced from the argument's type and a plain pointer does not bind.
template <typename Element, size_t Count>
constexpr size_t ARRAYSIZE(Element const (&)[Count]) {
    return Count;
}
128 constexpr uint16_t HYPHEN_STR_ZWJ[] = {CHAR_ZWJ};
129 constexpr uint16_t HYPHEN_STR_HYPHEN[] = {CHAR_HYPHEN};
130 constexpr uint16_t HYPHEN_STR_ARMENIAN_HYPHEN[] = {CHAR_ARMENIAN_HYPHEN};
131 constexpr uint16_t HYPHEN_STR_MAQAF[] = {CHAR_MAQAF};
132 constexpr uint16_t HYPHEN_STR_UCAS_HYPHEN[] = {CHAR_UCAS_HYPHEN};
133 constexpr uint16_t HYPHEN_STR_ZWJ_AND_HYPHEN[] = {CHAR_ZWJ, CHAR_HYPHEN};
134 constexpr std::pair<const uint16_t*, size_t> EMPTY_HYPHEN_STR(nullptr, 0);
135 #define MAKE_HYPHEN_STR(chars) std::make_pair((chars), ARRAYSIZE(chars))
136 
getHyphenString(StartHyphenEdit hyph)137 inline std::pair<const uint16_t*, size_t> getHyphenString(StartHyphenEdit hyph) {
138     if (hyph == StartHyphenEdit::INSERT_ZWJ) {
139         return MAKE_HYPHEN_STR(HYPHEN_STR_ZWJ);
140     } else if (hyph == StartHyphenEdit::INSERT_HYPHEN) {
141         return MAKE_HYPHEN_STR(HYPHEN_STR_HYPHEN);
142     } else {
143         return EMPTY_HYPHEN_STR;
144     }
145 }
146 
getHyphenString(EndHyphenEdit hyph)147 inline std::pair<const uint16_t*, size_t> getHyphenString(EndHyphenEdit hyph) {
148     switch (hyph) {
149         case EndHyphenEdit::REPLACE_WITH_HYPHEN:  // fall through
150         case EndHyphenEdit::INSERT_HYPHEN:
151             return MAKE_HYPHEN_STR(HYPHEN_STR_HYPHEN);
152         case EndHyphenEdit::INSERT_ARMENIAN_HYPHEN:
153             return MAKE_HYPHEN_STR(HYPHEN_STR_ARMENIAN_HYPHEN);
154         case EndHyphenEdit::INSERT_MAQAF:
155             return MAKE_HYPHEN_STR(HYPHEN_STR_MAQAF);
156         case EndHyphenEdit::INSERT_UCAS_HYPHEN:
157             return MAKE_HYPHEN_STR(HYPHEN_STR_UCAS_HYPHEN);
158         case EndHyphenEdit::INSERT_ZWJ_AND_HYPHEN:
159             return MAKE_HYPHEN_STR(HYPHEN_STR_ZWJ_AND_HYPHEN);
160         case EndHyphenEdit::NO_EDIT:
161         default:
162             return EMPTY_HYPHEN_STR;
163     }
164 }
165 #undef MAKE_HYPHEN_STR
166 
167 EndHyphenEdit editForThisLine(HyphenationType type);
168 StartHyphenEdit editForNextLine(HyphenationType type);
169 
170 // hyb file header; implementation details are in the .cpp file
171 struct Header;
172 
173 class Hyphenator {
174 public:
175     // Compute the hyphenation of a word, storing the hyphenation in result vector. Each entry in
176     // the vector is a "hyphenation type" for a potential hyphenation that can be applied at the
177     // corresponding code unit offset in the word.
178     //
179     // out must have at least the length of the word capacity.
180     //
181     // Example: word is "hyphen", result is the following, corresponding to "hy-phen":
182     // [DONT_BREAK, DONT_BREAK, BREAK_AND_INSERT_HYPHEN, DONT_BREAK, DONT_BREAK, DONT_BREAK]
183     virtual void hyphenate(const U16StringPiece& word, HyphenationType* out) const = 0;
184 
185     // Compute the hyphenation of a word.
186     //
187     // out will be resized to word length.
hyphenate(const U16StringPiece & word,std::vector<HyphenationType> * out)188     void hyphenate(const U16StringPiece& word, std::vector<HyphenationType>* out) const {
189         out->resize(word.size());
190         return hyphenate(word, out->data());
191     }
192 
193     // Returns true if the codepoint is like U+2010 HYPHEN in line breaking and usage: a character
194     // immediately after which line breaks are allowed, but words containing it should not be
195     // automatically hyphenated.
196     static bool isLineBreakingHyphen(uint32_t cp);
197 
198     // pattern data is in binary format, as described in doc/hyb_file_format.md. Note:
199     // the caller is responsible for ensuring that the lifetime of the pattern data is
200     // at least as long as the Hyphenator object.
201 
202     // This class doesn't copy or take ownership of patternData. Caller must keep the data valid
203     // until this instance is deleted.
204     // Note: nullptr is valid input, in which case the hyphenator only processes soft hyphens.
205     static Hyphenator* loadBinary(const uint8_t* patternData, size_t dataSize, size_t minPrefix,
206                                   size_t minSuffix, const std::string& locale);
207 
208     // This is test only function for loading Rust implementation.
209     static Hyphenator* loadBinaryForRust(const uint8_t* patternData, size_t dataSize,
210                                          size_t minPrefix, size_t minSuffix,
211                                          const std::string& locale);
~Hyphenator()212     virtual ~Hyphenator() {}
213 
214 protected:
215     enum class HyphenationLocale : uint8_t {
216         OTHER = 0,
217         CATALAN = 1,
218         POLISH = 2,
219         SLOVENIAN = 3,
220     };
221 };
222 
223 class HyphenatorCXX : public Hyphenator {
224 public:
225     // Compute the hyphenation of a word, storing the hyphenation in result vector. Each entry in
226     // the vector is a "hyphenation type" for a potential hyphenation that can be applied at the
227     // corresponding code unit offset in the word.
228     //
229     // out must have at least the length of the word capacity.
230     //
231     // Example: word is "hyphen", result is the following, corresponding to "hy-phen":
232     // [DONT_BREAK, DONT_BREAK, BREAK_AND_INSERT_HYPHEN, DONT_BREAK, DONT_BREAK, DONT_BREAK]
233     virtual void hyphenate(const U16StringPiece& word, HyphenationType* out) const override;
234 
235     // Compute the hyphenation of a word.
236     //
237     // out will be resized to word length.
hyphenate(const U16StringPiece & word,std::vector<HyphenationType> * out)238     void hyphenate(const U16StringPiece& word, std::vector<HyphenationType>* out) const {
239         out->resize(word.size());
240         return hyphenate(word, out->data());
241     }
242 
243     // This class doesn't copy or take ownership of patternData. Caller must keep the data valid
244     // until this instance is deleted.
245     // Note: nullptr is valid input, in which case the hyphenator only processes soft hyphens.
246     static Hyphenator* loadBinary(const uint8_t* patternData, size_t dataSize, size_t minPrefix,
247                                   size_t minSuffix, const std::string& locale);
248 
249 private:
250     // Use Hyphenator::loadBinary instead.
251     HyphenatorCXX(const uint8_t* patternData, size_t minPrefix, size_t minSuffix,
252                   HyphenationLocale hyphenLocale);
253 
254     // apply various hyphenation rules including hard and soft hyphens, ignoring patterns
255     void hyphenateWithNoPatterns(const U16StringPiece& word, HyphenationType* out) const;
256 
257     // Try looking up word in alphabet table, return DONT_BREAK if any code units fail to map.
258     // Otherwise, returns BREAK_AND_INSERT_HYPHEN, BREAK_AND_INSERT_ARMENIAN_HYPHEN, or
259     // BREAK_AND_DONT_INSERT_HYPHEN based on the the script of the characters seen.
260     // Note that this method writes len+2 entries into alpha_codes (including start and stop)
261     HyphenationType alphabetLookup(uint16_t* alpha_codes, const U16StringPiece& word) const;
262 
263     // calculate hyphenation from patterns, assuming alphabet lookup has already been done
264     void hyphenateFromCodes(const uint16_t* codes, size_t len, HyphenationType hyphenValue,
265                             HyphenationType* out) const;
266 
267     // See also LONGEST_HYPHENATED_WORD in LineBreaker.cpp. Here the constant is used so
268     // that temporary buffers can be stack-allocated without waste, which is a slightly
269     // different use case. It measures UTF-16 code units.
270     static const size_t MAX_HYPHENATED_SIZE = 64;
271 
272     const uint8_t* mPatternData;
273     const size_t mMinPrefix, mMinSuffix;
274     const HyphenationLocale mHyphenationLocale;
275 
276     // accessors for binary data
getHeader()277     const Header* getHeader() const { return reinterpret_cast<const Header*>(mPatternData); }
278 };
279 
280 }  // namespace minikin
281 
282 #endif  // MINIKIN_HYPHENATOR_H
283