diff options
author | Tim Barron <tjbarron@google.com> | 2020-06-05 13:55:31 -0700 |
---|---|---|
committer | Tim Barron <tjbarron@google.com> | 2020-06-05 14:04:31 -0700 |
commit | a4a63ec8e7e70912ef04019e7dc9f3c3ecf2eabf (patch) | |
tree | 090955adb6f2abfc09f5275d6bab35a2c0d74198 /java | |
parent | 79321d1f286ac650cc99fcf795a67c5dde8c0597 (diff) | |
download | icing-a4a63ec8e7e70912ef04019e7dc9f3c3ecf2eabf.tar.gz |
Copy over changes made to Google3 codebase in Icing.
Change-Id: Ia36edb0a1b085e249dabfc220a5b72418063604f
Diffstat (limited to 'java')
-rw-r--r-- | java/src/com/google/android/icing/BreakIteratorBatcher.java | 79 |
1 files changed, 79 insertions, 0 deletions
package com.google.android.icing;

import com.google.android.apps.common.proguard.UsedByNative;
import java.text.BreakIterator;
import java.util.Locale;

/**
 * A simple wrapper around BreakIterator that allows batching of multiple BreakIterator#next calls
 * to reduce the number of necessary reverse jni calls.
 *
 * <p>Example: The text "我每天走路去上班。" has a length of 9 code units (18 bytes) in UTF-16 and a
 * length of 27 bytes in UTF-8. The text should be broken up into the following six terms when
 * properly segmented: "我", "每天", "走路", "去", "上班", "。"
 *
 * <pre>{@code
 * BreakIteratorBatcher brkItrBatcher = new BreakIteratorBatcher(Locale.US);
 * brkItrBatcher.setText("我每天走路去上班。");
 * int[] utf16Boundaries = brkItrBatcher.next(5);
 * assertThat(utf16Boundaries).asList().containsExactly(1, 3, 5, 6, 8);
 * utf16Boundaries = brkItrBatcher.next(5);
 * assertThat(utf16Boundaries).asList().containsExactly(9);
 * }</pre>
 */
@UsedByNative("jni-cache.cc")
public class BreakIteratorBatcher {

  /** Word-boundary iterator that every method of this class delegates to. */
  private final BreakIterator iterator;

  /**
   * Creates a batcher backed by a word-instance {@link BreakIterator} for the given locale.
   *
   * @param locale locale handed to {@link BreakIterator#getWordInstance(Locale)}
   */
  @UsedByNative("jni-cache.cc")
  public BreakIteratorBatcher(Locale locale) {
    this.iterator = BreakIterator.getWordInstance(locale);
  }

  /* Direct calls to BreakIterator */

  /**
   * Sets the text to be segmented. Delegates to {@link BreakIterator#setText(String)}.
   *
   * @param text the text to scan for boundaries
   */
  @UsedByNative("jni-cache.cc")
  public void setText(String text) {
    iterator.setText(text);
  }

  /**
   * Delegates to {@link BreakIterator#first()}.
   *
   * @return the value returned by {@link BreakIterator#first()}
   */
  @UsedByNative("jni-cache.cc")
  public int first() {
    return iterator.first();
  }

  /**
   * Delegates to {@link BreakIterator#preceding(int)}.
   *
   * @param utf16Offset offset into the current text, forwarded unchanged
   * @return the value returned by {@link BreakIterator#preceding(int)}
   */
  @UsedByNative("jni-cache.cc")
  public int preceding(int utf16Offset) {
    return iterator.preceding(utf16Offset);
  }

  /**
   * Delegates to {@link BreakIterator#following(int)}.
   *
   * @param utf16Offset offset into the current text, forwarded unchanged
   * @return the value returned by {@link BreakIterator#following(int)}
   */
  @UsedByNative("jni-cache.cc")
  public int following(int utf16Offset) {
    return iterator.following(utf16Offset);
  }

  /**
   * Batched version of next. Returns an array of ints of up to size batchSize, reflecting the
   * return values of batchSize successful calls to BreakIterator.next. If the BreakIterator reaches
   * the end of the text (returns BreakIterator#DONE), then only the results of the previous calls
   * in that batch will be returned.
   *
   * @param batchSize maximum number of boundaries to collect; {@code 0} yields an empty array
   *     without advancing the iterator
   * @return the boundaries collected in this batch, in iteration order; the array length is the
   *     number of successful calls and is therefore at most {@code batchSize}
   * @throws IllegalArgumentException if {@code batchSize} is negative
   */
  @UsedByNative("jni-cache.cc")
  public int[] next(int batchSize) {
    if (batchSize < 0) {
      throw new IllegalArgumentException("batchSize must be non-negative: " + batchSize);
    }
    // Collect straight into a primitive buffer: this method exists to cut per-call overhead, so
    // boxing every boundary into an Integer only to unbox it again would be wasted work.
    int[] boundaries = new int[batchSize];
    int count = 0;
    while (count < batchSize) {
      int boundary = iterator.next();
      if (boundary == BreakIterator.DONE) {
        break;
      }
      boundaries[count++] = boundary;
    }
    if (count == batchSize) {
      // The batch was filled completely; the buffer already has the exact length.
      return boundaries;
    }
    // The end of the text was reached early: trim so that the array length equals the number of
    // boundaries actually found.
    int[] trimmed = new int[count];
    System.arraycopy(boundaries, 0, trimmed, 0, count);
    return trimmed;
  }
}