aboutsummaryrefslogtreecommitdiff
path: root/java
diff options
context:
space:
mode:
authorTim Barron <tjbarron@google.com>2020-06-05 13:55:31 -0700
committerTim Barron <tjbarron@google.com>2020-06-05 14:04:31 -0700
commita4a63ec8e7e70912ef04019e7dc9f3c3ecf2eabf (patch)
tree090955adb6f2abfc09f5275d6bab35a2c0d74198 /java
parent79321d1f286ac650cc99fcf795a67c5dde8c0597 (diff)
downloadicing-a4a63ec8e7e70912ef04019e7dc9f3c3ecf2eabf.tar.gz
Copy over changes made to Google3 codebase in Icing.
Change-Id: Ia36edb0a1b085e249dabfc220a5b72418063604f
Diffstat (limited to 'java')
-rw-r--r--java/src/com/google/android/icing/BreakIteratorBatcher.java79
1 files changed, 79 insertions, 0 deletions
diff --git a/java/src/com/google/android/icing/BreakIteratorBatcher.java b/java/src/com/google/android/icing/BreakIteratorBatcher.java
new file mode 100644
index 0000000..e3f325b
--- /dev/null
+++ b/java/src/com/google/android/icing/BreakIteratorBatcher.java
@@ -0,0 +1,79 @@
+package com.google.android.icing;
+
+import com.google.android.apps.common.proguard.UsedByNative;
+import java.text.BreakIterator;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Locale;
+
+/**
+ * A simple wrapper around {@link BreakIterator} that allows batching of multiple
+ * {@link BreakIterator#next()} calls to reduce the number of necessary reverse JNI calls.
+ *
+ * <p>All offsets accepted and returned by this class are indices into the Java {@code String}
+ * passed to {@link #setText(String)}, i.e. they count UTF-16 code units, not bytes.
+ *
+ * <p>Example: The text "我每天走路去上班。" has a length of 9 code units in UTF-16 and a length of 27
+ * bytes in UTF-8. The text should be broken up into the following six terms when properly
+ * segmented: "我", "每天", "走路", "去", "上班", "。"
+ *
+ * <pre>{@code
+ * BreakIteratorBatcher brkItrBatcher = new BreakIteratorBatcher(Locale.US);
+ * brkItrBatcher.setText("我每天走路去上班。");
+ * int[] utf16Boundaries = brkItrBatcher.next(5);
+ * assertThat(utf16Boundaries).asList().containsExactly(1, 3, 5, 6, 8);
+ * utf16Boundaries = brkItrBatcher.next(5);
+ * assertThat(utf16Boundaries).asList().containsExactly(9);
+ * }</pre>
+ */
+@UsedByNative("jni-cache.cc")
+public class BreakIteratorBatcher {
+
+  /** Word-boundary iterator that every method of this class delegates to. */
+  private final BreakIterator iterator;
+
+  /**
+   * Creates a batcher backed by a word-boundary {@link BreakIterator} for the given locale.
+   *
+   * @param locale the locale whose word-break rules should be used
+   */
+  @UsedByNative("jni-cache.cc")
+  public BreakIteratorBatcher(Locale locale) {
+    this.iterator = BreakIterator.getWordInstance(locale);
+  }
+
+  /* Direct calls to BreakIterator */
+
+  /**
+   * Sets the text to be segmented. Delegates to {@link BreakIterator#setText(String)}.
+   *
+   * @param text the text to scan for word boundaries
+   */
+  @UsedByNative("jni-cache.cc")
+  public void setText(String text) {
+    iterator.setText(text);
+  }
+
+  /**
+   * Moves to the first boundary of the current text. Delegates to {@link BreakIterator#first()}.
+   *
+   * @return the UTF-16 offset of the first boundary
+   */
+  @UsedByNative("jni-cache.cc")
+  public int first() {
+    return iterator.first();
+  }
+
+  /**
+   * Moves to the last boundary before the given offset. Delegates to
+   * {@link BreakIterator#preceding(int)}.
+   *
+   * @param utf16Offset the UTF-16 offset to begin scanning backwards from
+   * @return whatever {@link BreakIterator#preceding(int)} returns for {@code utf16Offset}
+   */
+  @UsedByNative("jni-cache.cc")
+  public int preceding(int utf16Offset) {
+    return iterator.preceding(utf16Offset);
+  }
+
+  /**
+   * Moves to the first boundary after the given offset. Delegates to
+   * {@link BreakIterator#following(int)}.
+   *
+   * @param utf16Offset the UTF-16 offset to begin scanning forwards from
+   * @return whatever {@link BreakIterator#following(int)} returns for {@code utf16Offset}
+   */
+  @UsedByNative("jni-cache.cc")
+  public int following(int utf16Offset) {
+    return iterator.following(utf16Offset);
+  }
+
+  /**
+   * Batched version of next. Returns an array of ints of up to size batchSize, reflecting the
+   * return values of batchSize successful calls to {@link BreakIterator#next()}. If the
+   * BreakIterator reaches the end of the text (returns {@link BreakIterator#DONE}), then only the
+   * results of the previous calls in that batch will be returned.
+   *
+   * @param batchSize the maximum number of boundaries to collect; must not be negative
+   * @return the boundaries found, in iteration order; empty if {@code batchSize} is zero or the
+   *     first call to {@link BreakIterator#next()} returned {@link BreakIterator#DONE}
+   * @throws IllegalArgumentException if {@code batchSize} is negative
+   */
+  @UsedByNative("jni-cache.cc")
+  public int[] next(int batchSize) {
+    if (batchSize < 0) {
+      throw new IllegalArgumentException("batchSize must be non-negative: " + batchSize);
+    }
+
+    // Collect straight into a primitive array so no boundary is boxed into an Integer.
+    int[] boundaries = new int[batchSize];
+    int count = 0;
+    while (count < batchSize) {
+      int boundary = iterator.next();
+      if (boundary == BreakIterator.DONE) {
+        break;
+      }
+      boundaries[count++] = boundary;
+    }
+
+    if (count == batchSize) {
+      // The batch was filled completely; the buffer already has the exact length.
+      return boundaries;
+    }
+
+    // The end of the text was reached early: hand back only the slots that were filled.
+    int[] trimmed = new int[count];
+    System.arraycopy(boundaries, 0, trimmed, 0, count);
+    return trimmed;
+  }
+}