// Origin: libs/minikin/WordBreaker.h (blob c4af6356c95389b9c297f408623ebdad48b1ca5b)
/*
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
 * A wrapper around ICU's line break iterator, that gives customized line
 * break opportunities, as well as identifying words for the purpose of
 * hyphenation.
 */

#ifndef MINIKIN_WORD_BREAKER_H
#define MINIKIN_WORD_BREAKER_H

#include <unicode/ubrk.h>

#include <list>
#include <memory>
#include <mutex>

#include "Locale.h"
#include "minikin/IcuUtils.h"
#include "minikin/LineBreakStyle.h"
#include "minikin/Macros.h"
#include "minikin/Range.h"

namespace minikin {

// Abstract interface for a pool of ICU line breakers.
// Breakers are handed out by acquire() and given back through release(). The interface is
// virtual so that the implementation can be replaced for testing purposes.
class ICULineBreakerPool {
public:
    // Move-only record that bundles an ICU break iterator with the locale id and the line
    // break styles supplied when the slot was constructed.
    struct Slot {
        // Creates an empty slot: locale id 0, value-initialized (zero) styles, no breaker.
        // The styles are initialized explicitly so that an empty slot never carries
        // indeterminate enum values that a later read or comparison could trip over.
        Slot() : localeId(0), lbStyle(), lbWordStyle(), breaker(nullptr) {}

        // Creates a populated slot, taking ownership of |breaker|.
        Slot(uint64_t localeId, LineBreakStyle lbStyle, LineBreakWordStyle lbWordStyle,
             IcuUbrkUniquePtr&& breaker)
                : localeId(localeId),
                  lbStyle(lbStyle),
                  lbWordStyle(lbWordStyle),
                  breaker(std::move(breaker)) {}

        // Slots can be moved ...
        Slot(Slot&& other) = default;
        Slot& operator=(Slot&& other) = default;

        // ... but never copied.
        Slot(const Slot&) = delete;
        Slot& operator=(const Slot&) = delete;

        uint64_t localeId;
        LineBreakStyle lbStyle;
        LineBreakWordStyle lbWordStyle;
        IcuUbrkUniquePtr breaker;
    };

    virtual ~ICULineBreakerPool() = default;

    // Hands out a slot for the given locale and line break styles.
    virtual Slot acquire(const Locale& locale, LineBreakStyle lbStyle,
                         LineBreakWordStyle lbWordStyle) = 0;

    // Gives a slot back to the pool. The slot is passed by rvalue reference, so the caller
    // should treat it as moved-from afterwards.
    virtual void release(Slot&& slot) = 0;
};

// A singleton implementation of the ICU line breaker pool.
// Creating an ICU line breaker instance takes some time, so instances are pooled for later
// use.
class ICULineBreakerPoolImpl : public ICULineBreakerPool {
public:
    // Returns the shared pool instance, which is constructed on the first call.
    static ICULineBreakerPoolImpl& getInstance() {
        static ICULineBreakerPoolImpl sInstance;
        return sInstance;
    }

    Slot acquire(const Locale& locale, LineBreakStyle lbStyle,
                 LineBreakWordStyle lbWordStyle) override;
    void release(Slot&& slot) override;

protected:
    // The members below are protected rather than private for testing purposes.

    // Pool size limit. NOTE(review): how it is enforced lives in the out-of-line
    // acquire()/release() definitions, which are not part of this header.
    static constexpr size_t MAX_POOL_SIZE = 4;

    // Not publicly constructible: use getInstance().
    ICULineBreakerPoolImpl() {}

    // Returns the number of slots currently stored in the pool, read while holding mMutex.
    size_t getPoolSize() const {
        std::lock_guard<std::mutex> guard(mMutex);
        const size_t pooledCount = mPool.size();
        return pooledCount;
    }

private:
    std::list<Slot> mPool GUARDED_BY(mMutex);
    // Mutable so that const accessors such as getPoolSize() can take the lock.
    mutable std::mutex mMutex;
};

// Iterates over the break opportunities of a text buffer using a pooled ICU line breaker,
// and exposes the bounds of the word preceding each break for hyphenation purposes.
class WordBreaker {
public:
    // Calls finish() when the object is destroyed.
    virtual ~WordBreaker() { finish(); }

    WordBreaker();

    // Sets the text to iterate over: |size| 16-bit code units starting at |data|.
    // NOTE(review): the buffer is presumably borrowed rather than copied (only a raw pointer
    // is stored in mText) - confirm against the implementation.
    void setText(const uint16_t* data, size_t size);

    // Advance iterator to next word break with current locale. Return offset, or -1 if EOT
    ssize_t next();

    // Advance iterator to the break just after "from", using the newly provided locale and
    // line break styles. Return offset, or -1 if EOT
    ssize_t followingWithLocale(const Locale& locale, LineBreakStyle lbStyle,
                                LineBreakWordStyle lbWordStyle, size_t from);

    // Current offset of iterator, equal to 0 at BOT or last return from next()
    ssize_t current() const;

    // After calling next(), wordStart() and wordEnd() are offsets defining the previous
    // word. If wordEnd <= wordStart, it's not a word for the purpose of hyphenation.
    ssize_t wordStart() const;

    ssize_t wordEnd() const;

    // Returns the range from wordStart() to wordEnd().
    // If wordEnd() <= wordStart(), returns the empty range [wordEnd(), wordEnd()).
    inline Range wordRange() const {
        // Range is built from 32-bit offsets; the narrowing from ssize_t is spelled out here
        // instead of being left to an implicit conversion.
        const uint32_t start = static_cast<uint32_t>(wordStart());
        const uint32_t end = static_cast<uint32_t>(wordEnd());
        return start < end ? Range(start, end) : Range(end, end);
    }

    int breakBadness() const;

    void finish();

protected:
    // Protected for testing purposes: lets a subclass inject its own pool.
    // The pool is not owned by this object; the caller is responsible for releasing it.
    WordBreaker(ICULineBreakerPool* pool);

private:
    int32_t iteratorNext();
    void detectEmailOrUrl();
    ssize_t findNextBreakInEmailOrUrl();

    // Doesn't take ownership. Must not be nullptr. Must be set in constructor.
    ICULineBreakerPool* mPool;

    ICULineBreakerPool::Slot mIcuBreaker;

    // Has no default state (custom function-pointer deleter), so it has to be initialized by
    // the constructors.
    std::unique_ptr<UText, decltype(&utext_close)> mUText;

    // The scalar state below carries default member initializers so that it never holds
    // indeterminate values, regardless of what the out-of-line constructors initialize.
    const uint16_t* mText = nullptr;
    size_t mTextSize = 0;
    ssize_t mLast = 0;
    ssize_t mCurrent = 0;

    // state for the email address / url detector
    ssize_t mScanOffset = 0;
    bool mInEmailOrUrl = false;
};

}  // namespace minikin

#endif  // MINIKIN_WORD_BREAKER_H