diff options
author | allenwtsu <allenwtsu@google.com> | 2022-12-20 16:34:42 +0000 |
---|---|---|
committer | Allen Su <allenwtsu@google.com> | 2023-01-09 03:22:04 +0000 |
commit | faa5d0ea7c4d8bfa89d6a9211c99c6632c045f01 (patch) | |
tree | aba27a199aa1eb0d55dda7d5be3c448056f85860 /icu4j | |
parent | 658a789ac2ed5839dfda8485a81045dac3cb6ef8 (diff) | |
download | icu-faa5d0ea7c4d8bfa89d6a9211c99c6632c045f01.tar.gz |
ICU-22100 Incorporate BudouX into ICU (Java)
Cherry-pick from https://github.com/unicode-org/icu/pull/2214
Bug: 219529457
Test: atest CtsIcuTestCases
Change-Id: Id67d4ab3114854cf3b557624ee61e7e7b6420298
Diffstat (limited to 'icu4j')
7 files changed, 523 insertions, 7 deletions
diff --git a/icu4j/build.xml b/icu4j/build.xml index 566d10f6f..b9027dbc7 100644 --- a/icu4j/build.xml +++ b/icu4j/build.xml @@ -338,11 +338,13 @@ <!--set the property - if it was set before it won't override--> <property name="user-jvm-options" value=""/> <property name="internal-jvm-options" value=""/> + <property name="com.ibm.icu.impl.breakiter.useMLPhraseBreaking" value=""/> <delete dir="${junit.out.dir}/@{test-name}"/> <mkdir dir="${junit.out.dir}/@{test-name}"/> <junit fork="yes" forkmode="once" printsummary="yes" haltonfailure="no" failureproperty="@{failure-status}" tempdir="${junit.out.dir}"> + <sysproperty key="com.ibm.icu.impl.breakiter.useMLPhraseBreaking" value="${com.ibm.icu.impl.breakiter.useMLPhraseBreaking}" /> <jvmarg value="-Xss4m"/> <jvmarg value="-ea"/> <jvmarg value="-Djava.awt.headless=true"/> diff --git a/icu4j/main/classes/core/src/com/ibm/icu/ICUConfig.properties b/icu4j/main/classes/core/src/com/ibm/icu/ICUConfig.properties index bc56ef6cf..e64b2f3ec 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/ICUConfig.properties +++ b/icu4j/main/classes/core/src/com/ibm/icu/ICUConfig.properties @@ -63,3 +63,10 @@ com.ibm.icu.impl.ICUResourceBundle.skipRuntimeLocaleResourceScan = false # LocaleDisplayNames implementation class # @internal # com.ibm.icu.text.LocaleDisplayNames.impl = com.ibm.icu.impl.LocaleDisplayNamesImpl + +# +# [Internal Use Only] +# Enable ML phrase breaking +# Android patch, http://b/219529457, for ML-based phrase line breaking +# @internal +com.ibm.icu.impl.breakiter.useMLPhraseBreaking = true diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/ICUConfig.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/ICUConfig.java index 61018c356..6ca912cb2 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/ICUConfig.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/ICUConfig.java @@ -76,7 +76,7 @@ public class ICUConfig { val = System.getProperty(name); } - if (val == null) { + if (val == null || val.equals("")) { val = CONFIG_PROPS.getProperty(name, def); } return val; diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/CjkBreakEngine.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/CjkBreakEngine.java index ee66c46da..cf8da008d 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/CjkBreakEngine.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/CjkBreakEngine.java @@ -18,6 +18,7 @@ import java.text.CharacterIterator; import java.util.HashSet; import com.ibm.icu.impl.Assert; +import com.ibm.icu.impl.ICUConfig; import com.ibm.icu.impl.ICUData; import com.ibm.icu.text.Normalizer; import com.ibm.icu.text.UnicodeSet; @@ -31,6 +32,8 @@ public class CjkBreakEngine extends DictionaryBreakEngine { private UnicodeSet fClosePunctuationSet; private DictionaryMatcher fDictionary = null; private HashSet<String> fSkipSet; + private MlBreakEngine fMlBreakEngine; + private boolean isCj = false; public CjkBreakEngine(boolean korean) throws IOException { fHangulWordSet = new UnicodeSet("[\\uac00-\\ud7a3]"); @@ -47,9 +50,16 @@ public class CjkBreakEngine extends DictionaryBreakEngine { if (korean) { setCharacters(fHangulWordSet); } else { //Chinese and Japanese + isCj = true; UnicodeSet cjSet = new UnicodeSet("[[:Han:][:Hiragana:][:Katakana:]\\u30fc\\uff70\\uff9e\\uff9f]"); setCharacters(cjSet); - initializeJapanesePhraseParamater(); + if (Boolean.parseBoolean( + ICUConfig.get("com.ibm.icu.impl.breakiter.useMLPhraseBreaking", "false"))) { + fMlBreakEngine = new MlBreakEngine(fDigitOrOpenPunctuationOrAlphabetSet, + fClosePunctuationSet); + } else { + initializeJapanesePhraseParamater(); + } } } @@ -151,6 +161,15 @@ public class CjkBreakEngine extends DictionaryBreakEngine { charPositions[numCodePts] = index; } } + // Use ML phrase breaking + if (Boolean.parseBoolean( + ICUConfig.get("com.ibm.icu.impl.breakiter.useMLPhraseBreaking", "false"))) { + // PhraseBreaking is supported in ja and ko; MlBreakEngine only supports ja. + if (isPhraseBreaking && isCj) { + return fMlBreakEngine.divideUpRange(inText, startPos, endPos, text, + numCodePts, charPositions, foundBreaks); + } + } // From here on out, do the algorithm. Note that our indices // refer to indices within the normalized string. @@ -276,10 +295,11 @@ public class CjkBreakEngine extends DictionaryBreakEngine { // In phrase breaking, there has to be a breakpoint between Cj character and close // punctuation. // E.g.[携帯電話]正しい選択 -> [携帯▁電話]▁正しい▁選択 -> breakpoint between ] and 正 + inText.setIndex(pos); if (pos > previous) { if (pos != startPos || (isPhraseBreaking && pos > 0 - && fClosePunctuationSet.contains(inText.setIndex(pos - 1)))) { + && fClosePunctuationSet.contains(previous32(inText)))) { foundBreaks.push(charPositions[t_boundary[i]] + startPos); correctedNumBreaks++; } @@ -294,7 +314,9 @@ public class CjkBreakEngine extends DictionaryBreakEngine { // E.g. 乗車率90%程度だろうか -> 乗車▁率▁90%▁程度だろうか -> breakpoint between 率 and 9 // E.g. しかもロゴがUnicode! -> しかも▁ロゴが▁Unicode!-> breakpoint between が and U if (isPhraseBreaking) { - if (!fDigitOrOpenPunctuationOrAlphabetSet.contains(inText.setIndex(endPos))) { + inText.setIndex(endPos); + int current = current32(inText); + if (current != DONE32 && !fDigitOrOpenPunctuationOrAlphabetSet.contains(current)) { foundBreaks.pop(); correctedNumBreaks--; } diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/MlBreakEngine.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/MlBreakEngine.java new file mode 100644 index 000000000..ceeb4879a --- /dev/null +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/breakiter/MlBreakEngine.java @@ -0,0 +1,436 @@ +// © 2022 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html + +package com.ibm.icu.impl.breakiter; + +import static com.ibm.icu.impl.CharacterIteration.DONE32; +import static com.ibm.icu.impl.CharacterIteration.current32; +import static com.ibm.icu.impl.CharacterIteration.next32; +import static com.ibm.icu.impl.CharacterIteration.previous32; + +import com.ibm.icu.impl.Assert; +import com.ibm.icu.impl.ICUData; +import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.util.UResourceBundle; +import com.ibm.icu.util.UResourceBundleIterator; + +import java.lang.System; +import java.text.CharacterIterator; +import java.util.ArrayList; +import java.util.HashMap; + +public class MlBreakEngine { + + private static final int INVALID = '|'; + private static final String INVALID_STRING = "|"; + private static final int MAX_FEATURE = 26; + private UnicodeSet fDigitOrOpenPunctuationOrAlphabetSet; + private UnicodeSet fClosePunctuationSet; + private HashMap<String, Integer> fModel; + + private int fNegativeSum; + + static class Element { + private int character; + private String ublock; + + /** + * Default constructor. + */ + public Element() { + character = 0; + ublock = null; + } + + /** + * Set the character and its unicode block. + * + * @param ch A unicode character. + * @param str The unicode block of the character. + */ + public void setCharAndUblock(int ch, String str) { + Assert.assrt(str.length() <= 3); + this.character = ch; + ublock = str; + } + + /** + * Get the unicode character. + * + * @return The unicode character. + */ + public int getCharacter() { + return character; + } + + /** + * Get the unicode character's unicode block. + * + * @return The unicode block. + */ + public String getUblock() { + return ublock; + } + } + + private static boolean isValid(Element element) { + String ublock = element.getUblock(); + return ublock.length() != 1 || (int) ublock.charAt(0) != INVALID; + } + + /** + * Constructor for Chinese and Japanese phrase breaking. + * + * @param digitOrOpenPunctuationOrAlphabetSet An unicode set with the digit and open punctuation + * and alphabet. + * @param closePunctuationSet An unicode set with the close punctuation. + */ + public MlBreakEngine(UnicodeSet digitOrOpenPunctuationOrAlphabetSet, + UnicodeSet closePunctuationSet) { + fDigitOrOpenPunctuationOrAlphabetSet = digitOrOpenPunctuationOrAlphabetSet; + fClosePunctuationSet = closePunctuationSet; + fModel = new HashMap<String, Integer>(); + fNegativeSum = 0; + loadMLModel(); + } + + /** + * Divide up a range of characters handled by this break engine. + * + * @param inText A input text. + * @param startPos The start index of the input text. + * @param endPos The end index of the input text. + * @param inString A input string normalized from inText from startPos to endPos + * @param numCodePts The number of code points of inString + * @param charPositions A map that transforms inString's code point index to code unit index. + * @param foundBreaks A list to store the breakpoint. + * @return The number of breakpoints + */ + public int divideUpRange(CharacterIterator inText, int startPos, int endPos, + CharacterIterator inString, int numCodePts, int[] charPositions, + DictionaryBreakEngine.DequeI foundBreaks) { + if (startPos >= endPos) { + return 0; + } + ArrayList<Integer> boundary = new ArrayList<Integer>(numCodePts); + int ch; + String ublock; + // The ML model groups six char to evaluate if the 4th char is a breakpoint. + // Like a sliding window, the elementList removes the first char and appends the new char + // from inString in each iteration so that its size always remains at six. + Element elementList[] = new Element[6]; + initElementList(inString, elementList, numCodePts); + + // Add a break for the start. + boundary.add(0, 0); + for (int i = 1; i < numCodePts; i++) { + evaluateBreakpoint(elementList, i, boundary); + if (i + 1 > numCodePts) { + break; + } + shiftLeftOne(elementList); + + ch = (i + 3) < numCodePts ? next32(inString) : INVALID; + ublock = (ch != INVALID) ? getUnicodeBlock(ch) : INVALID_STRING; + elementList[5].setCharAndUblock(ch, ublock); + } + + // Add a break for the end if there is not one there already. + if (boundary.get(boundary.size() - 1) != numCodePts) { + boundary.add(numCodePts); + } + + int correctedNumBreaks = 0; + int previous = -1; + int numBreaks = boundary.size(); + for (int i = 0; i < numBreaks; i++) { + int pos = charPositions[boundary.get(i)] + startPos; + // In phrase breaking, there has to be a breakpoint between Cj character and close + // punctuation. + // E.g.[携帯電話]正しい選択 -> [携帯▁電話]▁正しい▁選択 -> breakpoint between ] and 正 + inText.setIndex(pos); + if (pos > previous) { + if (pos != startPos + || (pos > 0 + && fClosePunctuationSet.contains(previous32(inText)))) { + foundBreaks.push(pos); + correctedNumBreaks++; + } + } + previous = pos; + } + + if (!foundBreaks.isEmpty() && foundBreaks.peek() == endPos) { + // In phrase breaking, there has to be a breakpoint between Cj character and + // the number/open punctuation. + // E.g. る文字「そうだ、京都」->る▁文字▁「そうだ、▁京都」-> breakpoint between 字 and「 + // E.g. 乗車率90%程度だろうか -> 乗車▁率▁90%▁程度だろうか -> breakpoint between 率 and 9 + // E.g. しかもロゴがUnicode! -> しかも▁ロゴが▁Unicode!-> breakpoint between が and U + inText.setIndex(endPos); + int current = current32(inText); + if (current != DONE32 && !fDigitOrOpenPunctuationOrAlphabetSet.contains(current)) { + foundBreaks.pop(); + correctedNumBreaks--; + } + + } + if (!foundBreaks.isEmpty()) { + inText.setIndex(foundBreaks.peek()); + } + return correctedNumBreaks; + } + + private void shiftLeftOne(Element[] elementList) { + int length = elementList.length; + for (int i = 1; i < length; i++) { + elementList[i - 1].character = elementList[i].character; + elementList[i - 1].ublock = elementList[i].ublock; + } + } + + /** + * Evaluate whether the index is a potential breakpoint. + * + * @param elementList A list including six elements for the breakpoint evaluation. + * @param index The breakpoint index to be evaluated. + * @param boundary An list including the index of the breakpoint. + */ + private void evaluateBreakpoint(Element[] elementList, int index, ArrayList<Integer> boundary) { + String[] featureList = new String[MAX_FEATURE]; + final int w1 = elementList[0].getCharacter(); + final int w2 = elementList[1].getCharacter(); + final int w3 = elementList[2].getCharacter(); + final int w4 = elementList[3].getCharacter(); + final int w5 = elementList[4].getCharacter(); + final int w6 = elementList[5].getCharacter(); + + StringBuilder sb = new StringBuilder(); + int idx = 0; + if (w1 != INVALID) { + featureList[idx++] = sb.append("UW1:").appendCodePoint(w1).toString(); + } + if (w2 != INVALID) { + sb.setLength(0); + featureList[idx++] = sb.append("UW2:").appendCodePoint(w2).toString(); + } + if (w3 != INVALID) { + sb.setLength(0); + featureList[idx++] = sb.append("UW3:").appendCodePoint(w3).toString(); + } + if (w4 != INVALID) { + sb.setLength(0); + featureList[idx++] = sb.append("UW4:").appendCodePoint(w4).toString(); + } + if (w5 != INVALID) { + sb.setLength(0); + featureList[idx++] = sb.append("UW5:").appendCodePoint(w5).toString(); + } + if (w6 != INVALID) { + sb.setLength(0); + featureList[idx++] = sb.append("UW6:").appendCodePoint(w6).toString(); + } + if (w2 != INVALID && w3 != INVALID) { + sb.setLength(0); + featureList[idx++] = sb.append("BW1:").appendCodePoint(w2).appendCodePoint( + w3).toString(); + } + if (w3 != INVALID && w4 != INVALID) { + sb.setLength(0); + featureList[idx++] = sb.append("BW2:").appendCodePoint(w3).appendCodePoint( + w4).toString(); + } + if (w4 != INVALID && w5 != INVALID) { + sb.setLength(0); + featureList[idx++] = sb.append("BW3:").appendCodePoint(w4).appendCodePoint( + w5).toString(); + } + if (w1 != INVALID && w2 != INVALID && w3 != INVALID) { + sb.setLength(0); + featureList[idx++] = sb.append("TW1:").appendCodePoint(w1).appendCodePoint( + w2).appendCodePoint(w3).toString(); + } + if (w2 != INVALID && w3 != INVALID && w4 != INVALID) { + sb.setLength(0); + featureList[idx++] = sb.append("TW2:").appendCodePoint(w2).appendCodePoint( + w3).appendCodePoint(w4).toString(); + } + if (w3 != INVALID && w4 != INVALID && w5 != INVALID) { + sb.setLength(0); + featureList[idx++] = sb.append("TW3:").appendCodePoint(w3).appendCodePoint( + w4).appendCodePoint(w5).toString(); + } + if (w4 != INVALID && w5 != INVALID && w6 != INVALID) { + sb.setLength(0); + featureList[idx++] = sb.append("TW4:").appendCodePoint(w4).appendCodePoint( + w5).appendCodePoint(w6).toString(); + } + if (isValid(elementList[0])) { + sb.setLength(0); + featureList[idx++] = sb.append("UB1:").append(elementList[0].getUblock()).toString(); + } + if (isValid(elementList[1])) { + sb.setLength(0); + featureList[idx++] = sb.append("UB2:").append(elementList[1].getUblock()).toString(); + } + if (isValid(elementList[2])) { + sb.setLength(0); + featureList[idx++] = sb.append("UB3:").append(elementList[2].getUblock()).toString(); + } + if (isValid(elementList[3])) { + sb.setLength(0); + featureList[idx++] = sb.append("UB4:").append(elementList[3].getUblock()).toString(); + } + if (isValid(elementList[4])) { + sb.setLength(0); + featureList[idx++] = sb.append("UB5:").append(elementList[4].getUblock()).toString(); + } + if (isValid(elementList[5])) { + sb.setLength(0); + featureList[idx++] = sb.append("UB6:").append(elementList[5].getUblock()).toString(); + } + if (isValid(elementList[1]) && isValid(elementList[2])) { + sb.setLength(0); + featureList[idx++] = sb.append("BB1:"). + append(elementList[1].getUblock()). + append(elementList[2].getUblock()).toString(); + } + if (isValid(elementList[2]) && isValid(elementList[3])) { + sb.setLength(0); + featureList[idx++] = sb.append("BB2:"). + append(elementList[2].getUblock()). + append(elementList[3].getUblock()).toString(); + } + if (isValid(elementList[3]) && isValid(elementList[4])) { + sb.setLength(0); + featureList[idx++] = sb.append("BB3:"). + append(elementList[3].getUblock()). + append(elementList[4].getUblock()).toString(); + } + if (isValid(elementList[0]) && isValid(elementList[1]) && isValid(elementList[2])) { + sb.setLength(0); + featureList[idx++] = sb.append("TB1:"). + append(elementList[0].getUblock()). + append(elementList[1].getUblock()). + append(elementList[2].getUblock()).toString(); + } + if (isValid(elementList[1]) && isValid(elementList[2]) && isValid(elementList[3])) { + sb.setLength(0); + featureList[idx++] = sb.append("TB2:"). + append(elementList[1].getUblock()). + append(elementList[2].getUblock()). + append(elementList[3].getUblock()).toString(); + } + if (isValid(elementList[2]) && isValid(elementList[3]) && isValid(elementList[4])) { + sb.setLength(0); + featureList[idx++] = sb.append("TB3:"). + append(elementList[2].getUblock()). + append(elementList[3].getUblock()). + append(elementList[4].getUblock()).toString(); + } + if (isValid(elementList[3]) && isValid(elementList[4]) && isValid(elementList[5])) { + sb.setLength(0); + featureList[idx++] = sb.append("TB4:"). + append(elementList[3].getUblock()). + append(elementList[4].getUblock()). + append(elementList[5].getUblock()).toString(); + } + int score = fNegativeSum; + for (int j = 0; j < idx; j++) { + if (fModel.containsKey(featureList[j])) { + score += (2 * fModel.get(featureList[j])); + } + } + if (score > 0) { + boundary.add(index); + } + } + + /** + * Initialize the element list from the input string. + * + * @param inString A input string to be segmented. + * @param elementList A list to store the first six characters and their unicode block codes. + * @param numCodePts The number of code points of input string + * @return The number of the code units of the first six characters in inString. + */ + private int initElementList(CharacterIterator inString, Element[] elementList, + int numCodePts) { + int index = 0; + inString.setIndex(index); + int w1, w2, w3, w4, w5, w6; + w1 = w2 = w3 = w4 = w5 = w6 = INVALID; + if (numCodePts > 0) { + w3 = current32(inString); + index += Character.charCount(w3); + } + if (numCodePts > 1) { + w4 = next32(inString); + index += Character.charCount(w3); + } + if (numCodePts > 2) { + w5 = next32(inString); + index += Character.charCount(w5); + } + if (numCodePts > 3) { + w6 = next32(inString); + index += Character.charCount(w6); + } + + final String b1 = INVALID_STRING; + final String b2 = b1; + final String b3 = getUnicodeBlock(w3); + final String b4 = getUnicodeBlock(w4); + final String b5 = getUnicodeBlock(w5); + final String b6 = getUnicodeBlock(w6); + + elementList[0] = new Element(); + elementList[0].setCharAndUblock(w1, b1); + elementList[1] = new Element(); + elementList[1].setCharAndUblock(w2, b2); + elementList[2] = new Element(); + elementList[2].setCharAndUblock(w3, b3); + elementList[3] = new Element(); + elementList[3].setCharAndUblock(w4, b4); + elementList[4] = new Element(); + elementList[4].setCharAndUblock(w5, b5); + elementList[5] = new Element(); + elementList[5].setCharAndUblock(w6, b6); + + return index; + } + + /** + * Get the character's unicode block code defined in UBlockCode. + * + * @param ch A char. + * @return The unicode block code which is 3 digits with '0' added in the beginning if the code + * is less than 3 digits. + */ + private String getUnicodeBlock(int ch) { + int blockId = UCharacter.UnicodeBlock.of(ch).getID(); + if (blockId == UCharacter.UnicodeBlock.NO_BLOCK.getID() + || blockId == UCharacter.UnicodeBlock.INVALID_CODE_ID) { + return INVALID_STRING; + } else { + return String.format("%03d", blockId); + } + } + + /** + * Load the machine learning's model file. + */ + private void loadMLModel() { + int index = 0; + UResourceBundle rb = UResourceBundle.getBundleInstance(ICUData.ICU_BRKITR_BASE_NAME, + "jaml"); + UResourceBundle keyBundle = rb.get("modelKeys"); + UResourceBundle valueBundle = rb.get("modelValues"); + int[] value = valueBundle.getIntVector(); + UResourceBundleIterator iterator = keyBundle.getIterator(); + while (iterator.hasNext()) { + fNegativeSum -= value[index]; + fModel.put(iterator.nextString(), value[index++]); + } + } +} diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestExtended.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestExtended.java index 4bea6cc0e..427955bfa 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestExtended.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITestExtended.java @@ -20,6 +20,7 @@ import org.junit.runners.JUnit4; import com.ibm.icu.dev.test.TestFmwk; import com.ibm.icu.dev.test.TestUtil; +import com.ibm.icu.impl.ICUConfig; import com.ibm.icu.impl.Utility; import com.ibm.icu.lang.UCharacter; import com.ibm.icu.text.BreakIterator; @@ -124,6 +125,7 @@ public void TestExtended() { int rulesFirstLine = 0; // Line number of the start of current <rules> block int len = testString.length(); + boolean skipTest = false; for (charIdx = 0; charIdx < len; ) { int c = testString.codePointAt(charIdx); @@ -157,6 +159,7 @@ public void TestExtended() { break; } if (testString.startsWith("<word>", charIdx-1)) { + skipTest = false; tp.bi = BreakIterator.getWordInstance(tp.currentLocale); charIdx += 5; break; @@ -167,22 +170,46 @@ public void TestExtended() { break; } if (testString.startsWith("<line>", charIdx-1)) { + skipTest = false; tp.bi = BreakIterator.getLineInstance(tp.currentLocale); + if (Boolean.parseBoolean( + ICUConfig.get("com.ibm.icu.impl.breakiter.useMLPhraseBreaking", "false"))) { + if (tp.currentLocale.getName().equals("ja@lw=phrase")) { + // skip <line> test cases of JP's phrase breaking when ML is enabled. + skipTest = true; + } + } charIdx += 5; break; } + if (testString.startsWith("<lineML>", charIdx-1)) { + skipTest = false; + tp.bi = BreakIterator.getLineInstance(tp.currentLocale); + if (!Boolean.parseBoolean( + ICUConfig.get("com.ibm.icu.impl.breakiter.useMLPhraseBreaking", "false"))) { + if (tp.currentLocale.getName().equals("ja@lw=phrase")) { + // skip <lineML> test cases of JP's phrase breaking when ML is disabled. + skipTest = true; + } + } + charIdx += 7; + break; + } if (testString.startsWith("<sent>", charIdx-1)) { + skipTest = false; tp.bi = BreakIterator.getSentenceInstance(tp.currentLocale); charIdx += 5; break; } if (testString.startsWith("<title>", charIdx-1)) { + skipTest = false; tp.bi = BreakIterator.getTitleInstance(tp.currentLocale); charIdx += 6; break; } if (testString.startsWith("<rules>", charIdx-1) || testString.startsWith("<badrules>", charIdx-1)) { + skipTest = false; charIdx = testString.indexOf('>', charIdx) + 1; parseState = PARSE_RULES; rules.setLength(0); @@ -272,7 +299,9 @@ public void TestExtended() { charIdx += 6; // RUN THE TEST! - executeTest(tp); + if (!skipTest) { + executeTest(tp); + } break; } diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt index 72bd15803..40c6745dd 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt @@ -1913,6 +1913,26 @@ Bangkok)•</data> <data>•\U0001F469\u200D\U0001F680•\U0001F469\U0001F3FD\u200D\U0001F680\u0020•</data> <locale ja@lw=phrase> +#phrase breaking test cases for the ML solution +<lineML> +#9月に東京から友達が遊びに来た -> 9月に•東京から•友達が•遊びに•来た• +<data>•\uff19\u6708\u306b•\u6771\u4eac\u304b\u3089•\u53cb\u9054\u304c•\u904a\u3073\u306b•\u6765\u305f•</data> +#る文字「そうだ、京都」-> る•文字•「そうだ、•京都」• +<data>•\u308b•\u6587\u5b57•\u300c\u305d\u3046\u3060\u3001•\u4eac\u90fd\u300d•</data> +#Kana supplement: 𛁈(U+1B048) -> \uD82C\uDC48, 𛀸(U+1B038) -> \uD82C\uDC38, 𛀙(U+1B019)-> \uD82C\uDC19</data> +#𛁈る𛀸(しるこ)、あ𛀙よろし(あかよろし) -> 𛁈る𛀸•(しるこ)、•あ𛀙よろし•(あかよろし) +<data>•\U0001B048\u308B\U0001B038•\uFF08\u3057\u308B\u3053\uFF09\u3001•\u3042\U0001B019\u3088\u308D\u3057•\uFF08\u3042\u304B\u3088\u308D\u3057\uFF09•</data> +#中国の携帯は約500元から5000元です -> 中国の▁携帯は▁約▁500元から▁5000元です +<data>•\u4E2D\u56FD\u306E•\u643A\u5E2F\u306F•\u7D04•\uFF15\uFF10\uFF10\u5143\u304B\u3089•\uFF15\uFF10\uFF10\uFF10\u5143\u3067\u3059•</data> +#しかもロゴがUnicode!! -> しかも▁ロゴが▁Unicode!! +<data>•\u3057\u304B\u3082•\u30ED\u30B4\u304C•\uFF35\uFF4E\uFF49\uFF43\uFF4F\uFF44\uFF45\uFF01\uFF01•</data> +#バッテリーを長持ちさせ、充電を最適化します -> バッテリーを▁長持ちさせ、▁充電を▁最適化します +<data>•\u30D0\u30C3\u30C6\u30EA\u30FC\u3092•\u9577\u6301\u3061\u3055\u305B\u3001•\u5145\u96FB\u3092•\u6700\u9069\u5316\u3057\u307E\u3059•</data> +#データのコピー、スマートフォンでのお支払いなど -> データの▁コピー、▁スマートフォンでの▁お支払いなど +<data>•\u30C7\u30FC\u30BF\u306E•\u30B3\u30D4\u30FC\u3001•\u30B9\u30DE\u30FC\u30C8\u30D5\u30A9\u30F3\u3067\u306E•\u304A\u652F\u6255\u3044\u306A\u3069•</data> + +<locale ja@lw=phrase> +#phrase breaking test cases for the dictionary based solution <line> #[京都観光]時雨殿に行った。-> [京都•観光]•時雨•殿に•行った。• <data>•\uff3b\u4eac\u90fd•\u89b3\u5149\uff3d•\u6642\u96e8•\u6bbf\u306b•\u884c\u3063\u305f\u3002•</data> @@ -2005,8 +2025,8 @@ Bangkok)•</data> #大韓民國은 民主共和國이다 #<data>•大韓民國은 •民主•共和國이다•</data> # All the tests for ja@lw=phrase should also work in Korean. -#[京都観光]時雨殿に行った。-> [京都•観光]•時雨•殿に•行った。• -<data>•\uff3b\u4eac\u90fd•\u89b3\u5149\uff3d•\u6642\u96e8•\u6bbf\u306b•\u884c\u3063\u305f\u3002•</data> +#る文字「そうだ、京都」-> る•文字•「そうだ、•京都」• +<data>•\u308b•\u6587\u5b57•\u300c\u305d\u3046\u3060\u3001•\u4eac\u90fd\u300d•</data> #9月に東京から友達が遊びに来た -> 9月に•東京から•友達が•遊びに•来た• <data>•\uff19\u6708\u306b•\u6771\u4eac\u304b\u3089•\u53cb\u9054\u304c•\u904a\u3073\u306b•\u6765\u305f•</data> |