1 /*
2  * Copyright (C) 2015 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 /**
18  * A wrapper around ICU's line break iterator, that gives customized line
19  * break opportunities, as well as identifying words for the purpose of
20  * hyphenation.
21  */
22 
23 #ifndef MINIKIN_WORD_BREAKER_H
24 #define MINIKIN_WORD_BREAKER_H
25 
26 #include <unicode/ubrk.h>
27 
28 #include <list>
29 #include <memory>
30 #include <mutex>
31 
32 #include "Locale.h"
33 #include "minikin/IcuUtils.h"
34 #include "minikin/LineBreakStyle.h"
35 #include "minikin/Macros.h"
36 #include "minikin/Range.h"
37 
38 namespace minikin {
39 
40 class BreakIterator {
41 public:
BreakIterator()42     BreakIterator() {}
~BreakIterator()43     virtual ~BreakIterator() {}
44     virtual void setText(UText* text, size_t size) = 0;
45     virtual bool isBoundary(int32_t i) = 0;
46     virtual int32_t following(size_t i) = 0;
47     virtual int32_t next() = 0;
48 };
49 
50 // A class interface for providing pooling implementation of ICU's line breaker.
51 // The implementation can be customized for testing purposes.
52 class ICULineBreakerPool {
53 public:
54     struct Slot {
SlotSlot55         Slot() : localeId(0), breaker(nullptr) {}
SlotSlot56         Slot(uint64_t localeId, LineBreakStyle lbStyle, LineBreakWordStyle lbWordStyle,
57              std::unique_ptr<BreakIterator>&& breaker)
58                 : localeId(localeId),
59                   lbStyle(lbStyle),
60                   lbWordStyle(lbWordStyle),
61                   breaker(std::move(breaker)) {}
62 
63         Slot(Slot&& other) = default;
64         Slot& operator=(Slot&& other) = default;
65 
66         // Forbid copy and assignment.
67         Slot(const Slot&) = delete;
68         Slot& operator=(const Slot&) = delete;
69 
70         uint64_t localeId;
71         LineBreakStyle lbStyle;
72         LineBreakWordStyle lbWordStyle;
73         std::unique_ptr<BreakIterator> breaker;
74     };
~ICULineBreakerPool()75     virtual ~ICULineBreakerPool() {}
76     virtual Slot acquire(const Locale& locale, LineBreakStyle lbStyle,
77                          LineBreakWordStyle lbWordStyle) = 0;
78     virtual void release(Slot&& slot) = 0;
79 };
80 
81 // An singleton implementation of the ICU line breaker pool.
82 // Since creating ICU line breaker instance takes some time. Pool it for later use.
83 class ICULineBreakerPoolImpl : public ICULineBreakerPool {
84 public:
85     Slot acquire(const Locale& locale, LineBreakStyle lbStyle,
86                  LineBreakWordStyle lbWordStyle) override;
87     void release(Slot&& slot) override;
88 
getInstance()89     static ICULineBreakerPoolImpl& getInstance() {
90         static ICULineBreakerPoolImpl pool;
91         return pool;
92     }
93 
94 protected:
95     // protected for testing purposes.
96     static constexpr size_t MAX_POOL_SIZE = 4;
ICULineBreakerPoolImpl()97     ICULineBreakerPoolImpl(){};  // singleton.
getPoolSize()98     size_t getPoolSize() const {
99         std::lock_guard<std::mutex> lock(mMutex);
100         return mPool.size();
101     }
102 
103 private:
104     std::list<Slot> mPool GUARDED_BY(mMutex);
105     mutable std::mutex mMutex;
106 };
107 
108 class ICUBreakIterator : public BreakIterator {
109 public:
ICUBreakIterator(IcuUbrkUniquePtr && breaker)110     ICUBreakIterator(IcuUbrkUniquePtr&& breaker) : mBreaker(std::move(breaker)) {}
~ICUBreakIterator()111     virtual ~ICUBreakIterator() {}
112     virtual void setText(UText* text, size_t size);
113     virtual bool isBoundary(int32_t i);
114     virtual int32_t following(size_t i);
115     virtual int32_t next();
116 
117 private:
118     IcuUbrkUniquePtr mBreaker;
119 };
120 
121 class NoBreakBreakIterator : public BreakIterator {
122 public:
NoBreakBreakIterator()123     NoBreakBreakIterator() {}
~NoBreakBreakIterator()124     virtual ~NoBreakBreakIterator() {}
125 
setText(UText *,size_t size)126     virtual void setText(UText*, size_t size) { mSize = size; }
isBoundary(int32_t i)127     virtual bool isBoundary(int32_t i) { return i == 0 || i == static_cast<int32_t>(mSize); }
following(size_t)128     virtual int32_t following(size_t) { return mSize; }
next()129     virtual int32_t next() { return mSize; }
130 
131 private:
132     size_t mSize = 0;
133 };
134 
135 class WordBreaker {
136 public:
~WordBreaker()137     virtual ~WordBreaker() { finish(); }
138 
139     WordBreaker();
140 
141     void setText(const uint16_t* data, size_t size);
142 
143     // Advance iterator to next word break with current locale. Return offset, or -1 if EOT
144     ssize_t next();
145 
146     // Advance iterator to the break just after "from" with using the new provided locale.
147     // Return offset, or -1 if EOT
148     ssize_t followingWithLocale(const Locale& locale, LineBreakStyle lbStyle,
149                                 LineBreakWordStyle lbWordStyle, size_t from);
150 
151     // Current offset of iterator, equal to 0 at BOT or last return from next()
152     ssize_t current() const;
153 
154     // After calling next(), wordStart() and wordEnd() are offsets defining the previous
155     // word. If wordEnd <= wordStart, it's not a word for the purpose of hyphenation.
156     ssize_t wordStart() const;
157 
158     ssize_t wordEnd() const;
159 
160     // Returns the range from wordStart() to wordEnd().
161     // If wordEnd() <= wordStart(), returns empty range.
wordRange()162     inline Range wordRange() const {
163         const uint32_t start = wordStart();
164         const uint32_t end = wordEnd();
165         return start < end ? Range(start, end) : Range(end, end);
166     }
167 
168     int breakBadness() const;
169 
170     void finish();
171 
172 protected:
173     // protected virtual for testing purpose.
174     // Caller must release the pool.
175     WordBreaker(ICULineBreakerPool* pool);
176 
177 private:
178     int32_t iteratorNext();
179     void detectEmailOrUrl();
180     ssize_t findNextBreakInEmailOrUrl();
181 
182     // Doesn't take ownership. Must not be nullptr. Must be set in constructor.
183     ICULineBreakerPool* mPool;
184 
185     ICULineBreakerPool::Slot mIcuBreaker;
186 
187     std::unique_ptr<UText, decltype(&utext_close)> mUText;
188     const uint16_t* mText = nullptr;
189     size_t mTextSize;
190     ssize_t mLast;
191     ssize_t mCurrent;
192 
193     // state for the email address / url detector
194     ssize_t mScanOffset;
195     bool mInEmailOrUrl;
196 };
197 
198 }  // namespace minikin
199 
200 #endif  // MINIKIN_WORD_BREAKER_H
201