1 /*
2  * Copyright (C) 2017 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef MINIKIN_LINE_BREAKER_UTIL_H
18 #define MINIKIN_LINE_BREAKER_UTIL_H
19 
20 #include <vector>
21 
22 #include "HyphenatorMap.h"
23 #include "LayoutUtils.h"
24 #include "Locale.h"
25 #include "LocaleListCache.h"
26 #include "MinikinInternal.h"
27 #include "WordBreaker.h"
28 #include "minikin/Hyphenator.h"
29 #include "minikin/LineBreakStyle.h"
30 #include "minikin/MeasuredText.h"
31 #include "minikin/U16StringPiece.h"
32 
33 namespace minikin {
34 
// Line count used by the LineBreakWordStyle::Auto heuristics.
// NOTE(review): the consumers of this constant live outside this header; confirm the exact
// comparison they perform before changing the value.
// Declared `inline` so that every translation unit including this header shares one definition.
inline constexpr uint32_t LBW_AUTO_HEURISTICS_LINE_COUNT = 5;
36 
// ParaWidth holds a width accumulated from the beginning of the paragraph.
// A 32-bit float can lose accuracy on very large paragraphs; it is kept because float is also
// used extensively on the Java side. The alias exists so the underlying type can be swapped
// easily if the performance/accuracy tradeoff changes.
using ParaWidth = float;
42 
43 // Hyphenates a string potentially containing non-breaking spaces.
44 std::vector<HyphenationType> hyphenate(const U16StringPiece& string, const Hyphenator& hypenator);
45 
// Tells whether the given code unit is a space that disappears at the end of a line.
// The accepted characters are the Unicode set
// [[:General_Category=Space_Separator:]-[:Line_Break=Glue:]], plus '\n'.
// Every member of that set is in the BMP, so testing a single UTF-16 code unit is sufficient.
inline bool isLineEndSpace(uint16_t c) {
    switch (c) {
        case '\n':    // LINE FEED
        case ' ':     // SPACE
        case 0x1680:  // OGHAM SPACE MARK
        case 0x205F:  // MEDIUM MATHEMATICAL SPACE
        case 0x3000:  // IDEOGRAPHIC SPACE
            return true;
        case 0x2007:  // FIGURE SPACE: excluded from the otherwise contiguous block below.
            return false;
        default:
            // EN QUAD, EM QUAD, EN SPACE, EM SPACE, THREE-PER-EM SPACE, FOUR-PER-EM SPACE,
            // SIX-PER-EM SPACE, PUNCTUATION SPACE, THIN SPACE, HAIR SPACE
            return 0x2000 <= c && c <= 0x200A;
    }
}
59 
// Returns `range` with the trailing line end spaces (see isLineEndSpace) removed.
// The start of the range is never moved; if every character in the range is a line end space,
// the result is the empty range located at the original start.
inline Range trimTrailingLineEndSpaces(const U16StringPiece& textBuf, const Range& range) {
    const uint32_t length = range.getLength();

    // Count how many characters, walking backward from the end, are line end spaces.
    uint32_t trailingSpaces = 0;
    while (trailingSpaces < length &&
           isLineEndSpace(textBuf[range.getEnd() - trailingSpaces - 1])) {
        ++trailingSpaces;
    }

    if (trailingSpaces == length) {
        return Range(range.getStart(), range.getStart());
    }
    return Range(range.getStart(), range.getEnd() - trailingSpaces);
}
68 
getEffectiveLocale(uint32_t localeListId)69 inline Locale getEffectiveLocale(uint32_t localeListId) {
70     const LocaleList& localeList = LocaleListCache::getById(localeListId);
71     return localeList.empty() ? Locale() : localeList[0];
72 }
73 
74 // Retrieves hyphenation break points from a word.
populateHyphenationPoints(const U16StringPiece & textBuf,const Run & run,const Hyphenator & hyphenator,const Range & contextRange,const Range & hyphenationTargetRange,const std::vector<float> & charWidths,bool ignoreKerning,std::vector<HyphenBreak> * out,LayoutPieces * pieces)75 inline void populateHyphenationPoints(
76         const U16StringPiece& textBuf,         // A text buffer.
77         const Run& run,                        // A run of this region.
78         const Hyphenator& hyphenator,          // A hyphenator to be used for hyphenation.
79         const Range& contextRange,             // A context range for measuring hyphenated piece.
80         const Range& hyphenationTargetRange,   // An actual range for the hyphenation target.
81         const std::vector<float>& charWidths,  // Char width used for hyphen piece estimation.
82         bool ignoreKerning,                    // True use full shaping for hyphenation piece.
83         std::vector<HyphenBreak>* out,         // An output to be appended.
84         LayoutPieces* pieces) {                // An output of layout pieces. Maybe null.
85     if (!run.getRange().contains(contextRange) || !contextRange.contains(hyphenationTargetRange)) {
86         return;
87     }
88 
89     const std::vector<HyphenationType> hyphenResult =
90             hyphenate(textBuf.substr(hyphenationTargetRange), hyphenator);
91     for (uint32_t i = hyphenationTargetRange.getStart(); i < hyphenationTargetRange.getEnd(); ++i) {
92         const HyphenationType hyph = hyphenResult[hyphenationTargetRange.toRangeOffset(i)];
93         if (hyph == HyphenationType::DONT_BREAK) {
94             continue;  // Not a hyphenation point.
95         }
96 
97         if (!ignoreKerning) {
98             auto hyphenPart = contextRange.split(i);
99             U16StringPiece firstText = textBuf.substr(hyphenPart.first);
100             U16StringPiece secondText = textBuf.substr(hyphenPart.second);
101             const float first =
102                     run.measureHyphenPiece(firstText, Range(0, firstText.size()),
103                                            StartHyphenEdit::NO_EDIT /* start hyphen edit */,
104                                            editForThisLine(hyph) /* end hyphen edit */, pieces);
105             const float second =
106                     run.measureHyphenPiece(secondText, Range(0, secondText.size()),
107                                            editForNextLine(hyph) /* start hyphen edit */,
108                                            EndHyphenEdit::NO_EDIT /* end hyphen edit */, pieces);
109 
110             out->emplace_back(i, hyph, first, second);
111         } else {
112             float first = 0;
113             float second = 0;
114             for (uint32_t j = contextRange.getStart(); j < i; ++j) {
115                 first += charWidths[j];
116             }
117             for (uint32_t j = i; j < contextRange.getEnd(); ++j) {
118                 second += charWidths[j];
119             }
120 
121             EndHyphenEdit endEdit = editForThisLine(hyph);
122             StartHyphenEdit startEdit = editForNextLine(hyph);
123 
124             if (endEdit != EndHyphenEdit::NO_EDIT) {
125                 auto [str, strSize] = getHyphenString(endEdit);
126                 first += run.measureText(U16StringPiece(str, strSize));
127             }
128 
129             if (startEdit != StartHyphenEdit::NO_EDIT) {
130                 auto [str, strSize] = getHyphenString(startEdit);
131                 second += run.measureText(U16StringPiece(str, strSize));
132             }
133 
134             out->emplace_back(i, hyph, first, second);
135         }
136     }
137 }
138 
139 // Class for tracking the word breaker transition point.
140 class WordBreakerTransitionTracker {
141 public:
142     // Update the word breaker transition information. This function return true if the word
143     // breaker need to be updated.
update(const Run & run)144     bool update(const Run& run) {
145         const uint32_t newLocaleListId = run.getLocaleListId();
146         const LineBreakStyle newLineBreakStyle = run.lineBreakStyle();
147         const LineBreakWordStyle newLineBreakWordStyle = run.lineBreakWordStyle();
148         const bool isUpdate = localeListId != newLocaleListId ||
149                               lineBreakStyle != newLineBreakStyle ||
150                               lineBreakWordStyle != newLineBreakWordStyle;
151 
152         localeListId = newLocaleListId;
153         lineBreakStyle = newLineBreakStyle;
154         lineBreakWordStyle = newLineBreakWordStyle;
155 
156         return isUpdate;
157     }
158 
getCurrentLocaleList()159     const LocaleList& getCurrentLocaleList() const {
160         return LocaleListCache::getById(localeListId);
161     }
162 
getCurrentLineBreakStyle()163     LineBreakStyle getCurrentLineBreakStyle() const { return lineBreakStyle; }
164 
getCurrentLineBreakWordStyle()165     LineBreakWordStyle getCurrentLineBreakWordStyle() const { return lineBreakWordStyle; }
166 
167 private:
168     uint32_t localeListId = LocaleListCache::kInvalidListId;
169     LineBreakStyle lineBreakStyle = LineBreakStyle::None;
170     LineBreakWordStyle lineBreakWordStyle = LineBreakWordStyle::None;
171 };
172 
resolveWordStyleAuto(LineBreakWordStyle lbWordStyle,const LocaleList & localeList,bool forceWordStyleAutoToPhrase)173 inline std::pair<LineBreakWordStyle, bool> resolveWordStyleAuto(LineBreakWordStyle lbWordStyle,
174                                                                 const LocaleList& localeList,
175                                                                 bool forceWordStyleAutoToPhrase) {
176     if (lbWordStyle == LineBreakWordStyle::Auto) {
177         if (forceWordStyleAutoToPhrase) {
178             return std::make_pair(LineBreakWordStyle::Phrase, false);
179         } else if (localeList.hasKorean()) {
180             return std::make_pair(LineBreakWordStyle::Phrase, false);
181         } else if (localeList.hasJapanese()) {
182             return std::make_pair(LineBreakWordStyle::None, true);
183         } else {
184             return std::make_pair(LineBreakWordStyle::None, false);
185         }
186     } else {
187         return std::make_pair(lbWordStyle, false);
188     }
189 }
190 
191 // Processes and retrieve informations from characters in the paragraph.
192 struct CharProcessor {
193     // The number of spaces.
194     uint32_t rawSpaceCount = 0;
195 
196     // The number of spaces minus trailing spaces.
197     uint32_t effectiveSpaceCount = 0;
198 
199     // The sum of character width from the paragraph start.
200     ParaWidth sumOfCharWidths = 0.0;
201 
202     // The sum of character width from the paragraph start minus trailing line end spaces.
203     // This means that the line width from the paragraph start if we decided break now.
204     ParaWidth effectiveWidth = 0.0;
205 
206     // The total amount of character widths at the previous word break point.
207     ParaWidth sumOfCharWidthsAtPrevWordBreak = 0.0;
208 
209     // The next word break offset.
210     uint32_t nextWordBreak = 0;
211 
212     // The previous word break offset.
213     uint32_t prevWordBreak = 0;
214 
215     // The width of a space. May be 0 if there are no spaces.
216     // Note: if there are multiple different widths for spaces (for example, because of mixing of
217     // fonts), it's only guaranteed to pick one.
218     float spaceWidth = 0.0f;
219 
220     // The current hyphenator.
221     const Hyphenator* hyphenator = nullptr;
222 
223     bool retryWithPhraseWordBreak = false;
224 
225     float maxCharWidth = 0.0f;
226 
227     // Retrieve the current word range.
wordRangeCharProcessor228     inline Range wordRange() const { return breaker.wordRange(); }
229 
230     // Retrieve the current context range.
contextRangeCharProcessor231     inline Range contextRange() const { return Range(prevWordBreak, nextWordBreak); }
232 
233     // Returns the width from the last word break point.
widthFromLastWordBreakCharProcessor234     inline ParaWidth widthFromLastWordBreak() const {
235         return effectiveWidth - sumOfCharWidthsAtPrevWordBreak;
236     }
237 
238     // Returns the break penalty for the current word break point.
wordBreakPenaltyCharProcessor239     inline int wordBreakPenalty() const { return breaker.breakBadness(); }
240 
CharProcessorCharProcessor241     CharProcessor(const U16StringPiece& text) { breaker.setText(text.data(), text.size()); }
242 
243     // The user of CharProcessor must call updateLocaleIfNecessary with valid locale at least one
244     // time before feeding characters.
updateLocaleIfNecessaryCharProcessor245     void updateLocaleIfNecessary(const Run& run, bool forceWordStyleAutoToPhrase) {
246         if (wbTracker.update(run)) {
247             const LocaleList& localeList = wbTracker.getCurrentLocaleList();
248             const Locale locale = localeList.empty() ? Locale() : localeList[0];
249 
250             LineBreakWordStyle lbWordStyle = wbTracker.getCurrentLineBreakWordStyle();
251             std::tie(lbWordStyle, retryWithPhraseWordBreak) =
252                     resolveWordStyleAuto(lbWordStyle, localeList, forceWordStyleAutoToPhrase);
253             nextWordBreak = breaker.followingWithLocale(locale, run.lineBreakStyle(), lbWordStyle,
254                                                         run.getRange().getStart());
255             hyphenator = HyphenatorMap::lookup(locale);
256         }
257     }
258 
259     // Process one character.
feedCharCharProcessor260     void feedChar(uint32_t idx, uint16_t c, float w, bool canBreakHere) {
261         if (idx == nextWordBreak) {
262             if (canBreakHere) {
263                 prevWordBreak = nextWordBreak;
264                 sumOfCharWidthsAtPrevWordBreak = sumOfCharWidths;
265             }
266             nextWordBreak = breaker.next();
267         }
268         if (isWordSpace(c)) {
269             rawSpaceCount += 1;
270             spaceWidth = w;
271         }
272         sumOfCharWidths += w;
273         maxCharWidth = std::max(maxCharWidth, w);
274         if (isLineEndSpace(c)) {
275             // If we break a line on a line-ending space, that space goes away. So postBreak
276             // and postSpaceCount, which keep the width and number of spaces if we decide to
277             // break at this point, don't need to get adjusted.
278         } else {
279             effectiveSpaceCount = rawSpaceCount;
280             effectiveWidth = sumOfCharWidths;
281         }
282     }
283 
284 private:
285     WordBreakerTransitionTracker wbTracker;
286     WordBreaker breaker;
287 };
288 }  // namespace minikin
289 
290 #endif  // MINIKIN_LINE_BREAKER_UTIL_H
291