aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--common/rbbi.cpp31
-rw-r--r--common/ubrk.cpp12
-rw-r--r--common/unicode/brkiter.h29
-rw-r--r--common/unicode/rbbi.h27
-rw-r--r--common/unicode/ubrk.h31
-rw-r--r--test/cintltst/cbiapts.c50
-rw-r--r--test/intltest/rbbiapts.cpp49
-rw-r--r--test/intltest/rbbiapts.h2
8 files changed, 229 insertions, 2 deletions
diff --git a/common/rbbi.cpp b/common/rbbi.cpp
index 7196f049..ddee7b4a 100644
--- a/common/rbbi.cpp
+++ b/common/rbbi.cpp
@@ -486,6 +486,37 @@ RuleBasedBreakIterator::setText(const UnicodeString& newText) {
}
+/**
+ * Provide a new UText for the input text. Must reference text with contents identical
+ * to the original.
+ * Intended for use with text data originating in Java (garbage collected) environments
+ * where the data may be moved in memory at arbitrary times.
+ *
+ * @param input  UText giving access to the relocated text. Must not be NULL.
+ * @param status If already a failure on entry, nothing is done. Set to
+ *               U_ILLEGAL_ARGUMENT_ERROR when input is NULL or when the previous
+ *               position cannot be restored in the new text; otherwise carries
+ *               whatever error utext_clone() reports.
+ * @return *this
+ */
+RuleBasedBreakIterator &RuleBasedBreakIterator::refreshInputText(UText *input, UErrorCode &status) {
+ if (U_FAILURE(status)) {
+ return *this;
+ }
+ if (input == NULL) {
+ status = U_ILLEGAL_ARGUMENT_ERROR;
+ return *this;
+ }
+ // Remember the current position so it can be re-established in the new text.
+ int64_t pos = utext_getNativeIndex(fText);
+ // Shallow read-only clone of the new UText into the existing input UText.
+ // The result is held in a temporary: utext_clone() may return NULL on failure,
+ // and assigning that directly would discard the iterator's only reference to fText.
+ UText *refreshed = utext_clone(fText, input, FALSE, TRUE, &status);
+ if (U_FAILURE(status)) {
+ return *this;
+ }
+ fText = refreshed;
+ utext_setNativeIndex(fText, pos);
+ if (utext_getNativeIndex(fText) != pos) {
+ // Sanity check. The new input utext is supposed to have the exact same
+ // contents as the old. If we can't set to the same position, it doesn't.
+ // The contents underlying the old utext might be invalid at this point,
+ // so it's not safe to check directly.
+ status = U_ILLEGAL_ARGUMENT_ERROR;
+ }
+ return *this;
+}
+
/**
* Sets the current iteration position to the beginning of the text.
diff --git a/common/ubrk.cpp b/common/ubrk.cpp
index 141913f1..70d7e48f 100644
--- a/common/ubrk.cpp
+++ b/common/ubrk.cpp
@@ -1,6 +1,6 @@
/*
********************************************************************************
-* Copyright (C) 1996-2008, International Business Machines
+* Copyright (C) 1996-2011, International Business Machines
* Corporation and others. All Rights Reserved.
********************************************************************************
*/
@@ -300,4 +300,14 @@ ubrk_getLocaleByType(const UBreakIterator *bi,
}
+/*
+ * C API entry point for BreakIterator::refreshInputText().
+ * The linkage and calling-convention macros match the declaration in
+ * unicode/ubrk.h (U_DRAFT void U_EXPORT2) so that declaration and definition agree.
+ * status is dereferenced unconditionally and therefore must not be NULL.
+ */
+U_CAPI void U_EXPORT2
+ubrk_refreshUText(UBreakIterator *bi,
+ UText *text,
+ UErrorCode *status)
+{
+ // A UBreakIterator handle is the C++ BreakIterator object viewed through the C API.
+ BreakIterator *bii = reinterpret_cast<BreakIterator *>(bi);
+ bii->refreshInputText(text, *status);
+}
+
+
+
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
diff --git a/common/unicode/brkiter.h b/common/unicode/brkiter.h
index bdd3cc70..6cae690e 100644
--- a/common/unicode/brkiter.h
+++ b/common/unicode/brkiter.h
@@ -1,6 +1,6 @@
/*
********************************************************************************
-* Copyright (C) 1997-2010, International Business Machines
+* Copyright (C) 1997-2011, International Business Machines
* Corporation and others. All Rights Reserved.
********************************************************************************
*
@@ -514,6 +514,33 @@ public:
*/
const char *getLocaleID(ULocDataLocaleType type, UErrorCode& status) const;
+ /**
+ * Set the subject text string upon which the break iterator is operating
+ * without changing any other aspect of the matching state.
+ * The new and previous text strings must have the same content.
+ *
+ * This function is intended for use in environments where ICU is operating on
+ * strings that may move around in memory. It provides a mechanism for notifying
+ * ICU that the string has been relocated, and providing a new UText to access the
+ * string in its new position.
+ *
+ * Note that the break iterator implementation never copies the underlying text
+ * of a string being processed, but always operates directly on the original text
+ * provided by the user. Refreshing simply drops the references to the old text
+ * and replaces them with references to the new.
+ *
+ * Caution: this function is normally used only by very specialized,
+ * system-level code. One example use case is with garbage collection that moves
+ * the text in memory.
+ *
+ * This function is pure virtual: every concrete BreakIterator subclass must
+ * provide its own implementation.
+ *
+ * @param input The new (moved) text string.
+ * @param status Receives errors detected by this function.
+ * @return *this
+ *
+ * @draft ICU 5.0
+ */
+ virtual BreakIterator &refreshInputText(UText *input, UErrorCode &status) = 0;
+
private:
static BreakIterator* buildInstance(const Locale& loc, const char *type, int32_t kind, UErrorCode& status);
static BreakIterator* createInstance(const Locale& loc, int32_t kind, UErrorCode& status);
diff --git a/common/unicode/rbbi.h b/common/unicode/rbbi.h
index f93b5776..529a5897 100644
--- a/common/unicode/rbbi.h
+++ b/common/unicode/rbbi.h
@@ -633,6 +633,33 @@ public:
*/
virtual const uint8_t *getBinaryRules(uint32_t &length);
+ /**
+ * Set the subject text string upon which the break iterator is operating
+ * without changing any other aspect of the matching state.
+ * The new and previous text strings must have the same content.
+ *
+ * This function is intended for use in environments where ICU is operating on
+ * strings that may move around in memory. It provides a mechanism for notifying
+ * ICU that the string has been relocated, and providing a new UText to access the
+ * string in its new position.
+ *
+ * Note that the break iterator implementation never copies the underlying text
+ * of a string being processed, but always operates directly on the original text
+ * provided by the user. Refreshing simply drops the references to the old text
+ * and replaces them with references to the new.
+ *
+ * Caution: this function is normally used only by very specialized,
+ * system-level code. One example use case is with garbage collection that moves
+ * the text in memory.
+ *
+ * @param input The new (moved) text string. Must not be NULL.
+ * @param status Receives errors detected by this function. If it already
+ * indicates a failure on entry, the function does nothing.
+ * U_ILLEGAL_ARGUMENT_ERROR is reported when input is NULL, or
+ * when the current iteration position cannot be re-established
+ * in the new text (which means its contents differ from the old).
+ * @return *this
+ *
+ * @draft ICU 5.0
+ */
+ virtual RuleBasedBreakIterator &refreshInputText(UText *input, UErrorCode &status);
+
protected:
//=======================================================================
diff --git a/common/unicode/ubrk.h b/common/unicode/ubrk.h
index 96dd2af0..c596861c 100644
--- a/common/unicode/ubrk.h
+++ b/common/unicode/ubrk.h
@@ -496,6 +496,37 @@ U_STABLE const char* U_EXPORT2
ubrk_getLocaleByType(const UBreakIterator *bi, ULocDataLocaleType type, UErrorCode* status);
+/**
+ * Set the subject text string upon which the break iterator is operating
+ * without changing any other aspect of the state.
+ * The new and previous text strings must have the same content.
+ *
+ * This function is intended for use in environments where ICU is operating on
+ * strings that may move around in memory. It provides a mechanism for notifying
+ * ICU that the string has been relocated, and providing a new UText to access the
+ * string in its new position.
+ *
+ * Note that the break iterator never copies the underlying text
+ * of a string being processed, but always operates directly on the original text
+ * provided by the user. Refreshing simply drops the references to the old text
+ * and replaces them with references to the new.
+ *
+ * Caution: this function is normally used only by very specialized
+ * system-level code. One example use case is with garbage collection
+ * that moves the text in memory.
+ *
+ * @param bi The break iterator.
+ * @param text The new (moved) text string.
+ * @param status Receives errors detected by this function.
+ * Must not be NULL; the implementation dereferences it
+ * without checking.
+ *
+ * @draft ICU 5.0
+ */
+U_DRAFT void U_EXPORT2
+ubrk_refreshUText(UBreakIterator *bi,
+ UText *text,
+ UErrorCode *status);
+
+
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
#endif
diff --git a/test/cintltst/cbiapts.c b/test/cintltst/cbiapts.c
index b3c1f7af..1dab12f0 100644
--- a/test/cintltst/cbiapts.c
+++ b/test/cintltst/cbiapts.c
@@ -44,6 +44,7 @@ static void TestBreakIteratorRuleError(void);
static void TestBreakIteratorStatusVec(void);
static void TestBreakIteratorUText(void);
static void TestBreakIteratorTailoring(void);
+static void TestBreakIteratorRefresh(void);
void addBrkIterAPITest(TestNode** root);
@@ -58,6 +59,7 @@ void addBrkIterAPITest(TestNode** root)
addTest(root, &TestBreakIteratorRuleError, "tstxtbd/cbiapts/TestBreakIteratorRuleError");
addTest(root, &TestBreakIteratorStatusVec, "tstxtbd/cbiapts/TestBreakIteratorStatusVec");
addTest(root, &TestBreakIteratorTailoring, "tstxtbd/cbiapts/TestBreakIteratorTailoring");
+ addTest(root, &TestBreakIteratorRefresh, "tstxtbd/cbiapts/TestBreakIteratorRefresh");
}
#define CLONETEST_ITERATOR_COUNT 2
@@ -823,4 +825,52 @@ static void TestBreakIteratorTailoring(void) {
}
}
+
+static void TestBreakIteratorRefresh(void) {
+ /*
+ * RefreshInput changes out the input of a Break Iterator without
+ * changing anything else in the iterator's state. Used with Java JNI,
+ * when Java moves the underlying string storage. This test
+ * runs a ubrk_next() repeatedly, moving the text in the middle of the sequence.
+ * The right set of boundaries should still be found.
+ */
+ UChar testStr[] = {0x20, 0x41, 0x20, 0x42, 0x20, 0x43, 0x20, 0x44, 0x0}; /* = " A B C D" */
+ UChar movedStr[] = {0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0};
+ UErrorCode status = U_ZERO_ERROR;
+ UBreakIterator *bi;
+ UText ut1 = UTEXT_INITIALIZER;
+ UText ut2 = UTEXT_INITIALIZER;
+
+ bi = ubrk_open(UBRK_LINE, "en_US", NULL, 0, &status);
+ TEST_ASSERT_SUCCESS(status);
+ if (U_FAILURE(status)) {
+ /* The failure has been reported above. Without a usable iterator the
+ * remaining calls would operate on an invalid handle, so stop here.
+ * Nothing else has been opened yet, so there is nothing to close. */
+ return;
+ }
+
+ utext_openUChars(&ut1, testStr, -1, &status);
+ TEST_ASSERT_SUCCESS(status);
+ ubrk_setUText(bi, &ut1, &status);
+ TEST_ASSERT_SUCCESS(status);
+
+ /* Line boundaries will occur before each letter in the original string */
+ TEST_ASSERT(1 == ubrk_next(bi));
+ TEST_ASSERT(3 == ubrk_next(bi));
+
+ /* Move the string, kill the original string. */
+ u_strcpy(movedStr, testStr);
+ u_memset(testStr, 0x20, u_strlen(testStr));
+ utext_openUChars(&ut2, movedStr, -1, &status);
+ TEST_ASSERT_SUCCESS(status);
+ ubrk_refreshUText(bi, &ut2, &status);
+ TEST_ASSERT_SUCCESS(status);
+
+ /* Find the following matches, now working in the moved string. */
+ TEST_ASSERT(5 == ubrk_next(bi));
+ TEST_ASSERT(7 == ubrk_next(bi));
+ TEST_ASSERT(8 == ubrk_next(bi));
+ TEST_ASSERT(UBRK_DONE == ubrk_next(bi));
+ TEST_ASSERT_SUCCESS(status);
+
+ ubrk_close(bi);
+ utext_close(&ut1);
+ utext_close(&ut2);
+}
+
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
diff --git a/test/intltest/rbbiapts.cpp b/test/intltest/rbbiapts.cpp
index 89afdb6d..cb40076b 100644
--- a/test/intltest/rbbiapts.cpp
+++ b/test/intltest/rbbiapts.cpp
@@ -1122,6 +1122,54 @@ void RBBIAPITest::TestCreateFromRBBIData() {
}
}
+
+void RBBIAPITest::TestRefreshInputText() {
+ /*
+ * RefreshInput changes out the input of a Break Iterator without
+ * changing anything else in the iterator's state. Used with Java JNI,
+ * when Java moves the underlying string storage. This test
+ * runs BreakIterator::next() repeatedly, moving the text in the middle of the sequence.
+ * The right set of boundaries should still be found.
+ */
+ UChar testStr[] = {0x20, 0x41, 0x20, 0x42, 0x20, 0x43, 0x20, 0x44, 0x0}; /* = " A B C D" */
+ UChar movedStr[] = {0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0};
+ UErrorCode status = U_ZERO_ERROR;
+ UText ut1 = UTEXT_INITIALIZER;
+ UText ut2 = UTEXT_INITIALIZER;
+ RuleBasedBreakIterator *bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status);
+ TEST_ASSERT_SUCCESS(status);
+ if (U_FAILURE(status)) {
+ // The failure has been reported above. bi cannot be used, so stop before
+ // it is dereferenced. delete is a no-op if no iterator was created.
+ delete bi;
+ return;
+ }
+
+ utext_openUChars(&ut1, testStr, -1, &status);
+ TEST_ASSERT_SUCCESS(status);
+ bi->setText(&ut1, status);
+ TEST_ASSERT_SUCCESS(status);
+
+ /* Line boundaries will occur before each letter in the original string */
+ TEST_ASSERT(1 == bi->next());
+ TEST_ASSERT(3 == bi->next());
+
+ /* Move the string, kill the original string. */
+ u_strcpy(movedStr, testStr);
+ u_memset(testStr, 0x20, u_strlen(testStr));
+ utext_openUChars(&ut2, movedStr, -1, &status);
+ TEST_ASSERT_SUCCESS(status);
+ RuleBasedBreakIterator *returnedBI = &bi->refreshInputText(&ut2, status);
+ TEST_ASSERT_SUCCESS(status);
+ TEST_ASSERT(bi == returnedBI);
+
+ /* Find the following matches, now working in the moved string. */
+ TEST_ASSERT(5 == bi->next());
+ TEST_ASSERT(7 == bi->next());
+ TEST_ASSERT(8 == bi->next());
+ TEST_ASSERT(UBRK_DONE == bi->next());
+
+ delete bi;
+ utext_close(&ut1);
+ utext_close(&ut2);
+
+}
+
+
//---------------------------------------------
// runIndexedTest
//---------------------------------------------
@@ -1153,6 +1201,7 @@ void RBBIAPITest::runIndexedTest( int32_t index, UBool exec, const char* &name,
#else
case 9: case 10: case 11: case 12: case 13: name = "skip"; break;
#endif
+ case 14: name = "TestRefreshInputText"; if (exec) TestRefreshInputText(); break;
default: name = ""; break; // needed to end loop
}
diff --git a/test/intltest/rbbiapts.h b/test/intltest/rbbiapts.h
index 0ce64ac3..d9a25aa7 100644
--- a/test/intltest/rbbiapts.h
+++ b/test/intltest/rbbiapts.h
@@ -86,6 +86,8 @@ public:
void TestRegistration();
+ /** Checks that refreshInputText() lets iteration continue correctly after the text has been moved. */
+ void TestRefreshInputText();
+
/**
*Internal subroutines
**/