1 /*
2  * Copyright (C) 2015 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "WordBreaker.h"
18 
19 #include <list>
20 #include <map>
21 
22 #include <unicode/ubrk.h>
23 #include <unicode/uchar.h>
24 #include <unicode/utf16.h>
25 
26 #include "minikin/Emoji.h"
27 #include "minikin/Hyphenator.h"
28 
29 #include "Locale.h"
30 #include "MinikinInternal.h"
31 
32 namespace minikin {
33 
34 namespace {
createNewIterator(const Locale & locale,LineBreakStyle lbStyle,LineBreakWordStyle lbWordStyle)35 static std::unique_ptr<BreakIterator> createNewIterator(const Locale& locale,
36                                                         LineBreakStyle lbStyle,
37                                                         LineBreakWordStyle lbWordStyle) {
38     MINIKIN_ASSERT(lbStyle != LineBreakStyle::Auto,
39                    "LineBreakStyle::Auto must be resolved beforehand.");
40     MINIKIN_ASSERT(lbWordStyle != LineBreakWordStyle::Auto,
41                    "LineBreakWordStyle::Auto must be resolved beforehand.");
42 
43     // TODO: handle failure status
44     if (lbStyle == LineBreakStyle::NoBreak) {
45         return std::make_unique<NoBreakBreakIterator>();
46     } else {
47         UErrorCode status = U_ZERO_ERROR;
48         char localeID[ULOC_FULLNAME_CAPACITY] = {};
49         uloc_forLanguageTag(locale.getStringWithLineBreakOption(lbStyle, lbWordStyle).c_str(),
50                             localeID, ULOC_FULLNAME_CAPACITY, nullptr, &status);
51         IcuUbrkUniquePtr icuBrkPtr(
52                 ubrk_open(UBreakIteratorType::UBRK_LINE, localeID, nullptr, 0, &status));
53         return std::make_unique<ICUBreakIterator>(std::move(icuBrkPtr));
54     }
55 }
56 }  // namespace
57 
setText(UText * text,size_t)58 void ICUBreakIterator::setText(UText* text, size_t) {
59     UErrorCode status = U_ZERO_ERROR;
60     ubrk_setUText(mBreaker.get(), text, &status);
61 }
62 
isBoundary(int32_t i)63 bool ICUBreakIterator::isBoundary(int32_t i) {
64     return ubrk_isBoundary(mBreaker.get(), i);
65 }
66 
following(size_t i)67 int32_t ICUBreakIterator::following(size_t i) {
68     return ubrk_following(mBreaker.get(), i);
69 }
70 
next()71 int32_t ICUBreakIterator::next() {
72     return ubrk_next(mBreaker.get());
73 }
74 
acquire(const Locale & locale,LineBreakStyle lbStyle,LineBreakWordStyle lbWordStyle)75 ICULineBreakerPool::Slot ICULineBreakerPoolImpl::acquire(const Locale& locale,
76                                                          LineBreakStyle lbStyle,
77                                                          LineBreakWordStyle lbWordStyle) {
78     if (lbStyle == LineBreakStyle::Auto) {
79         lbStyle = locale.supportsScript('J', 'p', 'a', 'n') ? LineBreakStyle::Strict
80                                                             : LineBreakStyle::None;
81     }
82 
83     const uint64_t id = locale.getIdentifier();
84     std::lock_guard<std::mutex> lock(mMutex);
85     for (auto i = mPool.begin(); i != mPool.end(); i++) {
86         if (i->localeId == id && i->lbStyle == lbStyle && i->lbWordStyle == lbWordStyle) {
87             Slot slot = std::move(*i);
88             mPool.erase(i);
89             return slot;
90         }
91     }
92 
93     // Not found in pool. Create new one.
94     return {id, lbStyle, lbWordStyle, createNewIterator(locale, lbStyle, lbWordStyle)};
95 }
96 
release(ICULineBreakerPool::Slot && slot)97 void ICULineBreakerPoolImpl::release(ICULineBreakerPool::Slot&& slot) {
98     if (slot.breaker.get() == nullptr) {
99         return;  // Already released slot. Do nothing.
100     }
101     std::lock_guard<std::mutex> lock(mMutex);
102     if (mPool.size() >= MAX_POOL_SIZE) {
103         // Pool is full. Move to local variable, so that the given slot will be released when the
104         // variable leaves the scope.
105         Slot localSlot = std::move(slot);
106         return;
107     }
108     mPool.push_front(std::move(slot));
109 }
110 
WordBreaker()111 WordBreaker::WordBreaker()
112         : mPool(&ICULineBreakerPoolImpl::getInstance()), mUText(nullptr, &utext_close) {}
113 
WordBreaker(ICULineBreakerPool * pool)114 WordBreaker::WordBreaker(ICULineBreakerPool* pool) : mPool(pool), mUText(nullptr, &utext_close) {}
115 
followingWithLocale(const Locale & locale,LineBreakStyle lbStyle,LineBreakWordStyle lbWordStyle,size_t from)116 ssize_t WordBreaker::followingWithLocale(const Locale& locale, LineBreakStyle lbStyle,
117                                          LineBreakWordStyle lbWordStyle, size_t from) {
118     if (!mUText) {
119         return mCurrent;
120     }
121     mIcuBreaker = mPool->acquire(locale, lbStyle, lbWordStyle);
122     MINIKIN_ASSERT(mText != nullptr, "setText must be called first");
123     // TODO: handle failure status
124     mIcuBreaker.breaker->setText(mUText.get(), mTextSize);
125     if (mInEmailOrUrl) {
126         // Note:
127         // Don't reset mCurrent, mLast, or mScanOffset for keeping email/URL context.
128         // The email/URL detection doesn't support following() functionality, so that we can't
129         // restart from the specific position. This means following() can not be supported in
130         // general, but keeping old email/URL context works for LineBreaker since it just wants to
131         // re-calculate the next break point with the new locale.
132     } else {
133         mCurrent = mLast = mScanOffset = from;
134         next();
135     }
136     return mCurrent;
137 }
138 
setText(const uint16_t * data,size_t size)139 void WordBreaker::setText(const uint16_t* data, size_t size) {
140     mText = data;
141     mTextSize = size;
142     mLast = 0;
143     mCurrent = 0;
144     mScanOffset = 0;
145     mInEmailOrUrl = false;
146     UErrorCode status = U_ZERO_ERROR;
147     mUText.reset(utext_openUChars(nullptr, reinterpret_cast<const UChar*>(data), size, &status));
148 }
149 
current() const150 ssize_t WordBreaker::current() const {
151     return mCurrent;
152 }
153 
154 /**
155  * Determine whether a line break at position i within the buffer buf is valid. This
156  * represents customization beyond the ICU behavior, because plain ICU provides some
157  * line break opportunities that we don't want.
158  **/
isValidBreak(const uint16_t * buf,size_t bufEnd,int32_t i)159 static bool isValidBreak(const uint16_t* buf, size_t bufEnd, int32_t i) {
160     const size_t position = static_cast<size_t>(i);
161     if (i == UBRK_DONE || position == bufEnd) {
162         // If the iterator reaches the end, treat as break.
163         return true;
164     }
165     uint32_t codePoint;
166     size_t prev_offset = position;
167     U16_PREV(buf, 0, prev_offset, codePoint);
168     // Do not break on hard or soft hyphens. These are handled by automatic hyphenation.
169     if (Hyphenator::isLineBreakingHyphen(codePoint) || codePoint == CHAR_SOFT_HYPHEN) {
170         return false;
171     }
172     // For Myanmar kinzi sequences, created by <consonant, ASAT, VIRAMA, consonant>. This is to go
173     // around a bug in ICU line breaking: http://bugs.icu-project.org/trac/ticket/12561. To avoid
174     // too much looking around in the strings, we simply avoid breaking after any Myanmar virama,
175     // where no line break could be imagined, since the Myanmar virama is a pure stacker.
176     if (codePoint == 0x1039) {  // MYANMAR SIGN VIRAMA
177         return false;
178     }
179 
180     uint32_t next_codepoint;
181     size_t next_offset = position;
182     U16_NEXT(buf, next_offset, bufEnd, next_codepoint);
183 
184     // Rule LB8 for Emoji ZWJ sequences. We need to do this ourselves since we may have fresher
185     // emoji data than ICU does.
186     if (codePoint == CHAR_ZWJ && isEmoji(next_codepoint)) {
187         return false;
188     }
189 
190     // Rule LB30b. We need to this ourselves since we may have fresher emoji data than ICU does.
191     if (isEmojiModifier(next_codepoint)) {
192         if (codePoint == 0xFE0F && prev_offset > 0) {
193             // skip over emoji variation selector
194             U16_PREV(buf, 0, prev_offset, codePoint);
195         }
196         if (isEmojiBase(codePoint)) {
197             return false;
198         }
199     }
200     return true;
201 }
202 
203 // Customized iteratorNext that takes care of both resets and our modifications
204 // to ICU's behavior.
iteratorNext()205 int32_t WordBreaker::iteratorNext() {
206     int32_t result = mIcuBreaker.breaker->following(mCurrent);
207     while (!isValidBreak(mText, mTextSize, result)) {
208         result = mIcuBreaker.breaker->next();
209     }
210     return result;
211 }
212 
// Chicago Manual of Style recommends breaking after these characters in URLs and email
// addresses: colon, equals sign and ampersand.
static bool breakAfter(uint16_t c) {
    switch (c) {
        case ':':
        case '=':
        case '&':
            return true;
        default:
            return false;
    }
}
217 
// Chicago Manual of Style recommends breaking before these characters in URLs and email
// addresses: tilde, period, comma, hyphen, underscore, question mark, number sign, percent
// sign, equals sign and ampersand.
static bool breakBefore(uint16_t c) {
    switch (c) {
        case '~':
        case '.':
        case ',':
        case '-':
        case '_':
        case '?':
        case '#':
        case '%':
        case '=':
        case '&':
            return true;
        default:
            return false;
    }
}
223 
// States of the email/URL scanner.
//
// The numeric values matter: the scanner moves from SAW_COLON to SAW_COLON_SLASH and on to
// SAW_COLON_SLASH_SLASH by adding one to the current state, so these three must stay
// consecutive and in this order.
enum ScanState {
    START = 0,                  // nothing of interest seen yet
    SAW_AT = 1,                 // saw '@'
    SAW_COLON = 2,              // saw ':'
    SAW_COLON_SLASH = 3,        // saw ":/"
    SAW_COLON_SLASH_SLASH = 4,  // saw "://"
};
231 
detectEmailOrUrl()232 void WordBreaker::detectEmailOrUrl() {
233     // scan forward from current ICU position for email address or URL
234     if (mLast >= mScanOffset) {
235         ScanState state = START;
236         size_t i;
237         for (i = mLast; i < mTextSize; i++) {
238             uint16_t c = mText[i];
239             // scan only ASCII characters, stop at space
240             if (!(' ' < c && c <= 0x007E)) {
241                 break;
242             }
243             if (state == START && c == '@') {
244                 state = SAW_AT;
245             } else if (state == START && c == ':') {
246                 state = SAW_COLON;
247             } else if (state == SAW_COLON || state == SAW_COLON_SLASH) {
248                 if (c == '/') {
249                     state = static_cast<ScanState>((int)state + 1);  // next state adds a slash
250                 } else {
251                     state = START;
252                 }
253             }
254         }
255         if (state == SAW_AT || state == SAW_COLON_SLASH_SLASH) {
256             if (!mIcuBreaker.breaker->isBoundary(i)) {
257                 // If there are combining marks or such at the end of the URL or the email address,
258                 // consider them a part of the URL or the email, and skip to the next actual
259                 // boundary.
260                 i = mIcuBreaker.breaker->following(i);
261             }
262             mInEmailOrUrl = true;
263         } else {
264             mInEmailOrUrl = false;
265         }
266         mScanOffset = i;
267     }
268 }
269 
findNextBreakInEmailOrUrl()270 ssize_t WordBreaker::findNextBreakInEmailOrUrl() {
271     // special rules for email addresses and URL's as per Chicago Manual of Style (16th ed.)
272     uint16_t lastChar = mText[mLast];
273     ssize_t i;
274     for (i = mLast + 1; i < mScanOffset; i++) {
275         if (breakAfter(lastChar)) {
276             break;
277         }
278         // break after double slash
279         if (lastChar == '/' && i >= mLast + 2 && mText[i - 2] == '/') {
280             break;
281         }
282         const uint16_t thisChar = mText[i];
283         // never break after hyphen
284         if (lastChar != '-') {
285             if (breakBefore(thisChar)) {
286                 break;
287             }
288             // break before single slash
289             if (thisChar == '/' && lastChar != '/' &&
290                 !(i + 1 < mScanOffset && mText[i + 1] == '/')) {
291                 break;
292             }
293         }
294         lastChar = thisChar;
295     }
296     return i;
297 }
298 
next()299 ssize_t WordBreaker::next() {
300     mLast = mCurrent;
301 
302     detectEmailOrUrl();
303     if (mInEmailOrUrl) {
304         mCurrent = findNextBreakInEmailOrUrl();
305     } else {  // Business as usual
306         mCurrent = (ssize_t)iteratorNext();
307     }
308     return mCurrent;
309 }
310 
wordStart() const311 ssize_t WordBreaker::wordStart() const {
312     if (mInEmailOrUrl) {
313         return mLast;
314     }
315     ssize_t result = mLast;
316     while (result < mCurrent) {
317         UChar32 c;
318         ssize_t ix = result;
319         U16_NEXT(mText, ix, mCurrent, c);
320         const int32_t lb = u_getIntPropertyValue(c, UCHAR_LINE_BREAK);
321         // strip leading punctuation, defined as OP and QU line breaking classes,
322         // see UAX #14
323         if (!(lb == U_LB_OPEN_PUNCTUATION || lb == U_LB_QUOTATION)) {
324             break;
325         }
326         result = ix;
327     }
328     return result;
329 }
330 
wordEnd() const331 ssize_t WordBreaker::wordEnd() const {
332     if (mInEmailOrUrl) {
333         return mLast;
334     }
335     ssize_t result = mCurrent;
336     while (result > mLast) {
337         UChar32 c;
338         ssize_t ix = result;
339         U16_PREV(mText, mLast, ix, c);
340         const int32_t gc_mask = U_GET_GC_MASK(c);
341         // strip trailing spaces, punctuation and control characters
342         if ((gc_mask & (U_GC_ZS_MASK | U_GC_P_MASK | U_GC_CC_MASK)) == 0) {
343             break;
344         }
345         result = ix;
346     }
347     return result;
348 }
349 
breakBadness() const350 int WordBreaker::breakBadness() const {
351     return (mInEmailOrUrl && mCurrent < mScanOffset) ? 1 : 0;
352 }
353 
finish()354 void WordBreaker::finish() {
355     mText = nullptr;
356     mUText.reset();
357     mPool->release(std::move(mIcuBreaker));
358 }
359 
360 }  // namespace minikin
361