aboutsummaryrefslogtreecommitdiff
path: root/icing/tokenization/icu/icu-language-segmenter.cc
diff options
context:
space:
mode:
Diffstat (limited to 'icing/tokenization/icu/icu-language-segmenter.cc')
-rw-r--r--icing/tokenization/icu/icu-language-segmenter.cc78
1 files changed, 65 insertions, 13 deletions
diff --git a/icing/tokenization/icu/icu-language-segmenter.cc b/icing/tokenization/icu/icu-language-segmenter.cc
index fd790cf..59bcc18 100644
--- a/icing/tokenization/icu/icu-language-segmenter.cc
+++ b/icing/tokenization/icu/icu-language-segmenter.cc
@@ -24,6 +24,7 @@
#include "icing/text_classifier/lib3/utils/base/status.h"
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/absl_ports/canonical_errors.h"
+#include "icing/absl_ports/mutex.h"
#include "icing/legacy/core/icing-string-util.h"
#include "icing/util/character-iterator.h"
#include "icing/util/i18n-utils.h"
@@ -48,9 +49,11 @@ class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
// INTERNAL_ERROR if unable to create
static libtextclassifier3::StatusOr<
std::unique_ptr<LanguageSegmenter::Iterator>>
- Create(std::string_view text, std::string_view locale) {
+ Create(const IcuLanguageSegmenter* creator, UBreakIterator* break_iterator,
+ std::string_view text, std::string_view locale) {
std::unique_ptr<IcuLanguageSegmenterIterator> iterator(
- new IcuLanguageSegmenterIterator(text, locale));
+ new IcuLanguageSegmenterIterator(creator, break_iterator, text,
+ locale));
if (iterator->Initialize()) {
return iterator;
}
@@ -58,8 +61,8 @@ class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
}
// Releases the UText and gives the break iterator back to creator_.
~IcuLanguageSegmenterIterator() {
- ubrk_close(break_iterator_);
utext_close(u_text_);  // Closes the UText opened in Initialize().
+ creator_.ReturnBreakIterator(break_iterator_);  // Not owned here: the creator caches or closes it.
}
// Advances to the next term. Returns false if it has reached the end.
@@ -244,9 +247,12 @@ class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
}
private:
- explicit IcuLanguageSegmenterIterator(std::string_view text,
+ explicit IcuLanguageSegmenterIterator(const IcuLanguageSegmenter* creator,
+ UBreakIterator* break_iterator,
+ std::string_view text,
std::string_view locale)
- : break_iterator_(nullptr),
+ : creator_(*creator),
+ break_iterator_(break_iterator),
text_(text),
locale_(locale),
u_text_(nullptr),
@@ -256,13 +262,14 @@ class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
// Opens a UText over text_ and attaches it to break_iterator_.
// Returns true on success. Returns false if break_iterator_ is null, if
// utext_openUTF8() yields no UText, or if status reports a failure after
// ubrk_setUText().
bool Initialize() {
+ if (break_iterator_ == nullptr) {
+ return false;
+ }
UErrorCode status = U_ZERO_ERROR;
u_text_ = utext_openUTF8(nullptr, text_.data(), text_.length(), &status);
if (u_text_ == nullptr) {
return false;
}
- break_iterator_ = ubrk_open(UBRK_WORD, locale_.data(), /*text=*/nullptr,
- /*textLength=*/0, &status);
ubrk_setUText(break_iterator_, u_text_, &status);  // Points the supplied iterator at this text.
return !U_FAILURE(status);
}
@@ -290,9 +297,11 @@ class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
term_start_index_ = 0;
}
+ const IcuLanguageSegmenter& creator_; // Does not own.
+
// The underlying class that does the segmentation. ubrk_close() is not called
// here; the destructor hands it back via creator_.ReturnBreakIterator().
- UBreakIterator* break_iterator_;
+ UBreakIterator* break_iterator_; // Does not own
// Text to be segmented
std::string_view text_;
@@ -321,19 +330,62 @@ class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
int term_end_index_exclusive_;
};
-IcuLanguageSegmenter::IcuLanguageSegmenter(std::string locale)
- : locale_(std::move(locale)) {}
+// Factory for IcuLanguageSegmenter. Opens one ICU word-break iterator for
+// `locale` up front and passes it to the constructor together with the locale.
+//
+// Returns:
+//   The new segmenter on success
+//   ABORTED if ICU reports a failure or returns no iterator
+/* static */ libtextclassifier3::StatusOr<std::unique_ptr<IcuLanguageSegmenter>>
+IcuLanguageSegmenter::Create(std::string&& locale) {
+  UErrorCode icu_status = U_ZERO_ERROR;
+  UBreakIterator* initial_iterator =
+      ubrk_open(UBRK_WORD, locale.c_str(), /*text=*/nullptr,
+                /*textLength=*/0, &icu_status);
+  const bool opened = initial_iterator != nullptr && !U_FAILURE(icu_status);
+  if (!opened) {
+    return absl_ports::AbortedError(
+        "Unable to create ICU break_iterator for language segmentation");
+  }
+  // locale is moved from only here, after ubrk_open() has read it above.
+  std::unique_ptr<IcuLanguageSegmenter> segmenter(
+      new IcuLanguageSegmenter(std::move(locale), initial_iterator));
+  return segmenter;
+}
+
+// Hands out a word-break iterator for locale_. The cached iterator is reused
+// when one is available (the cache slot is emptied); otherwise a new one is
+// opened. Returns nullptr if ICU reports a failure while opening.
+// IcuLanguageSegmenterIterator gives the iterator back through
+// ReturnBreakIterator() in its destructor.
+UBreakIterator* IcuLanguageSegmenter::ProduceBreakIterator() const {
+  {
+    // Only the cache slot is touched while mutex_ is held.
+    absl_ports::unique_lock l(&mutex_);
+    if (cached_break_iterator_ != nullptr) {
+      UBreakIterator* reused = cached_break_iterator_;
+      cached_break_iterator_ = nullptr;
+      return reused;
+    }
+  }
+  // Nothing cached: open a fresh iterator after the lock has been released.
+  UErrorCode icu_status = U_ZERO_ERROR;
+  UBreakIterator* fresh =
+      ubrk_open(UBRK_WORD, locale_.c_str(), /*text=*/nullptr,
+                /*textLength=*/0, &icu_status);
+  return U_FAILURE(icu_status) ? nullptr : fresh;
+}
+
+// Takes back an iterator previously handed out by this segmenter. If the
+// cache slot is empty the iterator is stored there for reuse; otherwise it is
+// closed with ubrk_close().
+void IcuLanguageSegmenter::ReturnBreakIterator(UBreakIterator* itr) const {
+  bool stored_in_cache = false;
+  {
+    absl_ports::unique_lock l(&mutex_);
+    if (cached_break_iterator_ == nullptr) {
+      cached_break_iterator_ = itr;
+      stored_in_cache = true;
+    }
+  }
+  if (!stored_in_cache) {
+    // The slot was already occupied; close the surplus iterator after the
+    // lock has been released.
+    ubrk_close(itr);
+  }
+}
// Builds an iterator over `text`, lending it a break iterator obtained from
// ProduceBreakIterator(). The AccessType argument is unnamed and unused here.
// The iterator stores a reference to this segmenter and calls
// ReturnBreakIterator() on it when destroyed, so the segmenter needs to
// outlive the returned iterator.
libtextclassifier3::StatusOr<std::unique_ptr<LanguageSegmenter::Iterator>>
IcuLanguageSegmenter::Segment(const std::string_view text,
LanguageSegmenter::AccessType) const {
- return IcuLanguageSegmenterIterator::Create(text, locale_);
+ return IcuLanguageSegmenterIterator::Create(this, ProduceBreakIterator(),  // May be nullptr; Initialize() then returns false.
+ text, locale_);
}
libtextclassifier3::StatusOr<std::vector<std::string_view>>
IcuLanguageSegmenter::GetAllTerms(const std::string_view text) const {
- ICING_ASSIGN_OR_RETURN(std::unique_ptr<LanguageSegmenter::Iterator> iterator,
- IcuLanguageSegmenterIterator::Create(text, locale_));
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<LanguageSegmenter::Iterator> iterator,
+ Segment(text, LanguageSegmenter::AccessType::kForwardIterator));
std::vector<std::string_view> terms;
while (iterator->Advance()) {
terms.push_back(iterator->GetTerm());