aboutsummaryrefslogtreecommitdiff
path: root/i18n
diff options
context:
space:
mode:
author Jean-Baptiste Queru <jbq@google.com> 2009-07-17 17:40:43 -0700
committer Jean-Baptiste Queru <jbq@google.com> 2009-07-17 17:40:43 -0700
commit c0f3e2506e4cc62ff8c220fe72849728e9d6cecf (patch)
tree 778334c2c002f3c4b016a8d4de106cdb8dc959d8 /i18n
parent 6b13cbaafaffaeeaf0477e95816759728fcdb763 (diff)
download icu4c-c0f3e2506e4cc62ff8c220fe72849728e9d6cecf.tar.gz
import cl @40073
Diffstat (limited to 'i18n')
-rw-r--r-- i18n/regexcmp.cpp 993
-rw-r--r-- i18n/regexcmp.h 40
-rw-r--r-- i18n/regexcst.h 380
-rw-r--r-- i18n/regexcst.pl 8
-rw-r--r-- i18n/regexcst.txt 279
-rw-r--r-- i18n/regeximp.h 28
-rw-r--r-- i18n/regexst.cpp 61
-rw-r--r-- i18n/repattrn.cpp 59
-rw-r--r-- i18n/unicode/regex.h 10
-rw-r--r-- i18n/unicode/uregex.h 12
10 files changed, 1428 insertions, 442 deletions
diff --git a/i18n/regexcmp.cpp b/i18n/regexcmp.cpp
index 860333c0..7d0343e4 100644
--- a/i18n/regexcmp.cpp
+++ b/i18n/regexcmp.cpp
@@ -27,6 +27,7 @@
#include "uvectr32.h"
#include "uassert.h"
#include "ucln_in.h"
+#include "uinvchar.h"
#include "regeximp.h"
#include "regexcst.h" // Contains state table for the regex pattern parser.
@@ -38,7 +39,16 @@
U_NAMESPACE_BEGIN
-
+// TODO: remove
+#if 0
+#include <stdio.h>
+static void printstring(const UnicodeString &s) {
+ for (int i=0; i<s.length(); i++) {
+ printf("%c", s[i]);
+ }
+ printf("\n");
+}
+#endif
@@ -47,7 +57,8 @@ U_NAMESPACE_BEGIN
// Constructor.
//
//------------------------------------------------------------------------------
-RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) : fParenStack(status)
+RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) :
+ fParenStack(status), fSetStack(status), fSetOpStack(status)
{
fStatus = &status;
@@ -71,6 +82,8 @@ RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) : fParenStack(
}
}
+static const UChar chAmp = 0x26; // '&'
+static const UChar chDash = 0x2d; // '-'
//------------------------------------------------------------------------------
@@ -145,7 +158,7 @@ void RegexCompile::compile(
// the search will stop there, if not before.
//
tableEl = &gRuleParseStateTable[state];
- REGEX_SCAN_DEBUG_PRINTF(("char, line, col = (\'%c\', %d, %d) state=%s ",
+ REGEX_SCAN_DEBUG_PRINTF(("char, line, col = (\'%c\', %d, %d) state=%s ",
fC.fChar, fLineNum, fCharNum, RegexStateNames[state]));
for (;;) { // loop through table rows belonging to this state, looking for one
@@ -226,7 +239,7 @@ void RegexCompile::compile(
// state stack underflow
// This will occur if the user pattern has mis-matched parentheses,
// with extra close parens.
- //
+ //
fStackPtr++;
error(U_REGEX_MISMATCHED_PAREN);
}
@@ -234,6 +247,16 @@ void RegexCompile::compile(
}
+ if (U_FAILURE(*fStatus)) {
+ // Bail out if the pattern had errors.
+ // Set stack cleanup: a successful compile would have left it empty,
+ // but errors can leave temporary sets hanging around.
+ while (!fSetStack.empty()) {
+ delete (UnicodeSet *)fSetStack.pop();
+ }
+ return;
+ }
+
//
// The pattern has now been read and processed, and the compiled code generated.
//
@@ -288,8 +311,8 @@ void RegexCompile::compile(
//
// Optimization passes
- //
- matchStartType();
+ //
+ matchStartType();
OptDotStar();
stripNOPs();
@@ -465,7 +488,7 @@ UBool RegexCompile::doParseActions(int32_t action)
case doOpenAtomicParen:
// Open Atomic Paren. (?>
// Compile to a
- // - NOP, which later may be replaced if the parenthesized group
+ // - NOP, which later may be replaced if the parenthesized group
// has a quantifier, followed by
// - STO_SP save state stack position, so it can be restored at the ")"
// - NOP, which may later be replaced by a save-state if there
@@ -500,11 +523,11 @@ UBool RegexCompile::doParseActions(int32_t action)
// 3. NOP may be replaced if there is are '|' ops in the block.
// 4. code for parenthesized stuff.
// 5. ENDLA
- //
+ //
// Two data slots are reserved, for saving the stack ptr and the input position.
{
int32_t dataLoc = fRXPat->fDataSize;
- fRXPat->fDataSize += 2;
+ fRXPat->fDataSize += 2;
int32_t op = URX_BUILD(URX_LA_START, dataLoc);
fRXPat->fCompiledPat->addElement(op, *fStatus);
@@ -513,7 +536,7 @@ UBool RegexCompile::doParseActions(int32_t action)
fRXPat->fCompiledPat->addElement(op, *fStatus);
// On the Parentheses stack, start a new frame and add the postions
- // of the NOPs.
+ // of the NOPs.
fParenStack.push(fModeFlags, *fStatus); // Match mode state
fParenStack.push(lookAhead, *fStatus); // Frame type.
fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location
@@ -534,7 +557,7 @@ UBool RegexCompile::doParseActions(int32_t action)
// 7. ...
{
int32_t dataLoc = fRXPat->fDataSize;
- fRXPat->fDataSize += 2;
+ fRXPat->fDataSize += 2;
int32_t op = URX_BUILD(URX_LA_START, dataLoc);
fRXPat->fCompiledPat->addElement(op, *fStatus);
@@ -545,12 +568,12 @@ UBool RegexCompile::doParseActions(int32_t action)
fRXPat->fCompiledPat->addElement(op, *fStatus);
// On the Parentheses stack, start a new frame and add the postions
- // of the StateSave and NOP.
+ // of the StateSave and NOP.
fParenStack.push(fModeFlags, *fStatus); // Match mode state
fParenStack.push( negLookAhead, *fStatus); // Frame type
fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The STATE_SAVE location
fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP location
-
+
// Instructions #5 and #6 will be added when the ')' is encountered.
}
break;
@@ -574,34 +597,34 @@ UBool RegexCompile::doParseActions(int32_t action)
// 0: Stack ptr on entry
// 1: Input Index on entry
// 2: Start index of match current match attempt.
- // 3: Original Input String len.
+ // 3: Original Input String len.
// Allocate data space
int32_t dataLoc = fRXPat->fDataSize;
- fRXPat->fDataSize += 4;
-
+ fRXPat->fDataSize += 4;
+
// Emit URX_LB_START
int32_t op = URX_BUILD(URX_LB_START, dataLoc);
fRXPat->fCompiledPat->addElement(op, *fStatus);
-
+
// Emit URX_LB_CONT
op = URX_BUILD(URX_LB_CONT, dataLoc);
fRXPat->fCompiledPat->addElement(op, *fStatus);
fRXPat->fCompiledPat->addElement(0, *fStatus); // MinMatchLength. To be filled later.
fRXPat->fCompiledPat->addElement(0, *fStatus); // MaxMatchLength. To be filled later.
-
+
// Emit the NOP
op = URX_BUILD(URX_NOP, 0);
fRXPat->fCompiledPat->addElement(op, *fStatus);
fRXPat->fCompiledPat->addElement(op, *fStatus);
-
+
// On the Parentheses stack, start a new frame and add the postions
- // of the URX_LB_CONT and the NOP.
+ // of the URX_LB_CONT and the NOP.
fParenStack.push(fModeFlags, *fStatus); // Match mode state
fParenStack.push(lookBehind, *fStatus); // Frame type
fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location
fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The 2nd NOP location
-
+
// The final two instructions will be added when the ')' is encountered.
}
@@ -627,35 +650,35 @@ UBool RegexCompile::doParseActions(int32_t action)
// 0: Stack ptr on entry
// 1: Input Index on entry
// 2: Start index of match current match attempt.
- // 3: Original Input String len.
+ // 3: Original Input String len.
// Allocate data space
int32_t dataLoc = fRXPat->fDataSize;
- fRXPat->fDataSize += 4;
-
+ fRXPat->fDataSize += 4;
+
// Emit URX_LB_START
int32_t op = URX_BUILD(URX_LB_START, dataLoc);
fRXPat->fCompiledPat->addElement(op, *fStatus);
-
+
// Emit URX_LBN_CONT
op = URX_BUILD(URX_LBN_CONT, dataLoc);
fRXPat->fCompiledPat->addElement(op, *fStatus);
fRXPat->fCompiledPat->addElement(0, *fStatus); // MinMatchLength. To be filled later.
fRXPat->fCompiledPat->addElement(0, *fStatus); // MaxMatchLength. To be filled later.
fRXPat->fCompiledPat->addElement(0, *fStatus); // Continue Loc. To be filled later.
-
+
// Emit the NOP
op = URX_BUILD(URX_NOP, 0);
fRXPat->fCompiledPat->addElement(op, *fStatus);
fRXPat->fCompiledPat->addElement(op, *fStatus);
-
+
// On the Parentheses stack, start a new frame and add the postions
- // of the URX_LB_CONT and the NOP.
+ // of the URX_LB_CONT and the NOP.
fParenStack.push(fModeFlags, *fStatus); // Match mode state
fParenStack.push(lookBehindN, *fStatus); // Frame type
fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location
fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The 2nd NOP location
-
+
// The final two instructions will be added when the ')' is encountered.
}
break;
@@ -834,7 +857,7 @@ UBool RegexCompile::doParseActions(int32_t action)
// 2. LOOP_C stack location
// ...
//
- // Or if this is a .*
+ // Or if this is a .*
// 1. LOOP_DOT_I (. matches all mode flag)
// 2. LOOP_C stack location
//
@@ -855,7 +878,7 @@ UBool RegexCompile::doParseActions(int32_t action)
int32_t repeatedOp = fRXPat->fCompiledPat->elementAti(topLoc);
if (URX_TYPE(repeatedOp) == URX_SETREF) {
- // Emit optimized code for a [char set]*
+ // Emit optimized code for a [char set]*
int32_t loopOpI = URX_BUILD(URX_LOOP_SR_I, URX_VAL(repeatedOp));
fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
dataLoc = fRXPat->fFrameSize;
@@ -899,7 +922,7 @@ UBool RegexCompile::doParseActions(int32_t action)
fRXPat->fCompiledPat->setElementAt(op, saveStateLoc+1);
jmpOp = URX_BUILD(URX_JMP_SAV_X, saveStateLoc+2);
}
-
+
// Locate the position in the compiled pattern where the match will continue
// after completing the *. (4 or 5 in the comment above)
int32_t continueLoc = fRXPat->fCompiledPat->size()+1;
@@ -982,8 +1005,8 @@ UBool RegexCompile::doParseActions(int32_t action)
// Finished scanning a Possessive {lower,upper}+ interval. Generate the code for it.
{
// Remember the loc for the top of the block being looped over.
- // (Can not reserve a slot in the compiled pattern at this time, becuase
- // compileInterval needs to reserve also, and blockTopLoc can only reserve
+ // (Can not reserve a slot in the compiled pattern at this time, because
+ // compileInterval needs to reserve also, and blockTopLoc can only reserve
// once per block.)
int32_t topLoc = blockTopLoc(FALSE);
@@ -1022,11 +1045,22 @@ UBool RegexCompile::doParseActions(int32_t action)
break;
case doLiteralChar:
- // We've just scanned a "normal" character from the pattern,
+ // We've just scanned a "normal" character from the pattern,
literalChar(fC.fChar);
break;
+ case doEscapedLiteralChar:
+ // We've just scanned a backslash-escaped character with no
+ // special meaning. It represents itself.
+ if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 &&
+ ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) || // in [A-Z]
+ (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) { // in [a-z]
+ error(U_REGEX_BAD_ESCAPE_SEQUENCE);
+ }
+ literalChar(fC.fChar);
+ break;
+
case doDotAny:
// scanned a ".", match any single character.
@@ -1041,7 +1075,7 @@ UBool RegexCompile::doParseActions(int32_t action)
}
break;
- case doCaret:
+ case doCaret:
{
int32_t op = (fModeFlags & UREGEX_MULTILINE)? URX_CARET_M : URX_CARET;
fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus);
@@ -1049,7 +1083,7 @@ UBool RegexCompile::doParseActions(int32_t action)
break;
- case doDollar:
+ case doDollar:
{
int32_t op = (fModeFlags & UREGEX_MULTILINE)? URX_DOLLAR_M : URX_DOLLAR;
fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus);
@@ -1144,18 +1178,13 @@ UBool RegexCompile::doParseActions(int32_t action)
}
break;
-
- case doScanUnicodeSet:
+ case doNamedChar:
{
- UnicodeSet *theSet = scanSet();
- compileSet(theSet);
+ UChar32 c = scanNamedChar();
+ literalChar(c);
}
break;
-
- case doEnterQuoteMode:
- // Just scanned a \Q. Put character scanner into quote mode.
- fQuoteMode = TRUE;
- break;
+
case doBackRef:
// BackReference. Somewhat unusual in that the front-end can not completely parse
@@ -1182,7 +1211,7 @@ UBool RegexCompile::doParseActions(int32_t action)
}
// Scan of the back reference in the source regexp is complete. Now generate
- // the compiled code for it.
+ // the compiled code for it.
// Because capture groups can be forward-referenced by back-references,
// we fill the operand with the capture group number. At the end
// of compilation, it will be changed to the variable's location.
@@ -1222,7 +1251,7 @@ UBool RegexCompile::doParseActions(int32_t action)
// Emit the STATE_SAVE
op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+2);
fRXPat->fCompiledPat->addElement(op, *fStatus);
-
+
// Emit the JMP
op = URX_BUILD(URX_JMP, topLoc+1);
fRXPat->fCompiledPat->addElement(op, *fStatus);
@@ -1259,7 +1288,7 @@ UBool RegexCompile::doParseActions(int32_t action)
op = URX_BUILD(URX_STATE_SAVE, L7);
fRXPat->fCompiledPat->setElementAt(op, topLoc+1);
- // Append the JMP operation.
+ // Append the JMP operation.
op = URX_BUILD(URX_JMP, topLoc+1);
fRXPat->fCompiledPat->addElement(op, *fStatus);
@@ -1335,8 +1364,8 @@ UBool RegexCompile::doParseActions(int32_t action)
fModeFlags = fNewModeFlags;
// Prevent any string from spanning across the change of match mode.
- // Otherwise the pattern "abc(?i)def" would make a single string of "abcdef"
- fixLiterals();
+ // Otherwise the pattern "abc(?i)def" would make a single string of "abcdef"
+ fixLiterals();
break;
@@ -1379,6 +1408,272 @@ UBool RegexCompile::doParseActions(int32_t action)
break;
+ case doSetAddAmp:
+ {
+ UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
+ set->add(chAmp);
+ }
+ break;
+
+ case doSetAddDash:
+ {
+ UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
+ set->add(chDash);
+ }
+ break;
+
+ case doSetBackslash_s:
+ {
+ UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
+ set->addAll(*RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]);
+ break;
+ }
+
+ case doSetBackslash_S:
+ {
+ UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
+ UnicodeSet SSet(*RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]);
+ SSet.complement();
+ set->addAll(SSet);
+ break;
+ }
+
+ case doSetBackslash_d:
+ {
+ UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
+ UnicodeSet digits(UnicodeString("\\p{Nd}"), *fStatus); // TODO - make a static set,
+ set->addAll(digits);
+ break;
+ }
+
+ case doSetBackslash_D:
+ {
+ UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
+ UnicodeSet digits(UnicodeString("\\P{Nd}"), *fStatus); // TODO - make a static set,
+ set->addAll(digits);
+ break;
+ }
+
+ case doSetBackslash_w:
+ {
+ UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
+ set->addAll(*RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]);
+ break;
+ }
+
+ case doSetBackslash_W:
+ {
+ UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
+ UnicodeSet SSet(*RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]);
+ SSet.complement();
+ set->addAll(SSet);
+ break;
+ }
+
+ case doSetBegin:
+ fSetStack.push(new UnicodeSet(), *fStatus);
+ fSetOpStack.push(setStart, *fStatus);
+ if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
+ fSetOpStack.push(setCaseClose, *fStatus);
+ }
+ break;
+
+ case doSetBeginDifference1:
+ // We have scanned something like [[abc]-[
+ // Set up a new UnicodeSet for the set beginning with the just-scanned '['
+ // Push a Difference operator, which will cause the new set to be subtracted from what
+ // went before once it is created.
+ setPushOp(setDifference1);
+ fSetOpStack.push(setStart, *fStatus);
+ if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
+ fSetOpStack.push(setCaseClose, *fStatus);
+ }
+ break;
+
+ case doSetBeginIntersection1:
+ // We have scanned something like [[abc]&[
+ // Need both the '&' operator and the open '[' operator.
+ setPushOp(setIntersection1);
+ fSetOpStack.push(setStart, *fStatus);
+ if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
+ fSetOpStack.push(setCaseClose, *fStatus);
+ }
+ break;
+
+ case doSetBeginUnion:
+ // We have scanned something like [[abc][
+ // Need to handle the union operation explicitly [[abc] | [
+ setPushOp(setUnion);
+ fSetOpStack.push(setStart, *fStatus);
+ if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
+ fSetOpStack.push(setCaseClose, *fStatus);
+ }
+ break;
+
+ case doSetDifference2:
+ // We have scanned something like [abc--
+ // Consider this to unambiguously be a set difference operator.
+ setPushOp(setDifference2);
+ break;
+
+ case doSetEnd:
+ // Have encountered the ']' that closes a set.
+ // Force the evaluation of any pending operations within this set,
+ // leave the completed set on the top of the set stack.
+ {
+ setEval(setEnd);
+ int32_t setOp = fSetOpStack.popi();
+ U_ASSERT(setOp==setStart);
+ break;
+ }
+
+ case doSetFinish:
+ {
+ // Finished a complete set expression, including all nested sets.
+ // The close bracket has already triggered clearing out pending set operators,
+ // the operator stack should be empty and the operand stack should have just
+ // one entry, the result set.
+ U_ASSERT(fSetOpStack.empty());
+ UnicodeSet *theSet = (UnicodeSet *)fSetStack.pop();
+ U_ASSERT(fSetStack.empty());
+ compileSet(theSet);
+ break;
+ }
+
+ case doSetIntersection2:
+ // Have scanned something like [abc&&
+ setPushOp(setIntersection2);
+ break;
+
+ case doSetLiteral:
+ // Union the just-scanned literal character into the set being built.
+ // This operation is the highest precedence set operation, so we can always do
+ // it immediately, without waiting to see what follows. It is necessary to perform
+ // any pending '-' or '&' operation first, because these have the same precedence
+ // as union-ing in a literal.
+ {
+ setEval(setUnion);
+ UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
+ s->add(fC.fChar);
+ fLastSetLiteral = fC.fChar;
+ break;
+ }
+
+ case doSetLiteralEscaped:
+ // A back-slash escaped literal character was encountered.
+ // Processing is the same as with setLiteral, above, with the addition of
+ // the optional check for errors on escaped ASCII letters.
+ {
+ if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 &&
+ ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) || // in [A-Z]
+ (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) { // in [a-z]
+ error(U_REGEX_BAD_ESCAPE_SEQUENCE);
+ }
+ setEval(setUnion);
+ UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
+ s->add(fC.fChar);
+ fLastSetLiteral = fC.fChar;
+ break;
+ }
+
+ case doSetNamedChar:
+ // Scanning a \N{UNICODE CHARACTER NAME}
+ // Aside from the source of the character, the processing is identical to doSetLiteral,
+ // above.
+ {
+ UChar32 c = scanNamedChar();
+ setEval(setUnion);
+ UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
+ s->add(c);
+ fLastSetLiteral = c;
+ break;
+ }
+
+ case doSetNamedRange:
+ // We have scanned literal-\N{CHAR NAME}. Add the range to the set.
+ // The left character is already in the set, and is saved in fLastSetLiteral.
+ // The right side needs to be picked up, the scan is at the 'N'.
+ // Lower Limit > Upper limit being an error matches both Java
+ // and ICU UnicodeSet behavior.
+ {
+ UChar32 c = scanNamedChar();
+ if (U_SUCCESS(*fStatus) && fLastSetLiteral > c) {
+ error(U_REGEX_INVALID_RANGE);
+ }
+ UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
+ s->add(fLastSetLiteral, c);
+ fLastSetLiteral = c;
+ break;
+ }
+
+
+ case doSetNegate:
+ // Scanned a '^' at the start of a set.
+ // Push the negation operator onto the set op stack.
+ // A twist for case-insensitive matching:
+ // the case closure operation must happen _before_ negation.
+ // But the case closure operation will already be on the stack if it's required.
+ // This requires checking for case closure, and swapping the stack order
+ // if it is present.
+ {
+ int32_t tosOp = fSetOpStack.peeki();
+ if (tosOp == setCaseClose) {
+ fSetOpStack.popi();
+ fSetOpStack.push(setNegation, *fStatus);
+ fSetOpStack.push(setCaseClose, *fStatus);
+ } else {
+ fSetOpStack.push(setNegation, *fStatus);
+ }
+ }
+ break;
+
+ case doSetNoCloseError:
+ error(U_REGEX_MISSING_CLOSE_BRACKET);
+ break;
+
+ case doSetOpError:
+ error(U_REGEX_RULE_SYNTAX); // TODO: -- or && at the end of a set. Illegal.
+ break;
+
+ case doSetPosixProp:
+ {
+ UnicodeSet *s = scanPosixProp();
+ if (s != NULL) {
+ UnicodeSet *tos = (UnicodeSet *)fSetStack.peek();
+ tos->addAll(*s);
+ delete s;
+ } // else error. scanPosixProp() reported the error status already.
+ }
+ break;
+
+ case doSetProp:
+ // Scanned a \p \P within [brackets].
+ {
+ UnicodeSet *s = scanProp();
+ if (s != NULL) {
+ UnicodeSet *tos = (UnicodeSet *)fSetStack.peek();
+ tos->addAll(*s);
+ delete s;
+ } // else error. scanProp() reported the error status already.
+ }
+ break;
+
+
+ case doSetRange:
+ // We have scanned literal-literal. Add the range to the set.
+ // The left character is already in the set, and is saved in fLastSetLiteral.
+ // The right side is the current character.
+ // Lower Limit > Upper limit being an error matches both Java
+ // and ICU UnicodeSet behavior.
+ {
+ if (fLastSetLiteral > fC.fChar) {
+ error(U_REGEX_INVALID_RANGE);
+ }
+ UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
+ s->add(fLastSetLiteral, fC.fChar);
+ break;
+ }
+
default:
U_ASSERT(FALSE);
@@ -1431,7 +1726,7 @@ void RegexCompile::literalChar(UChar32 c) {
fRXPat->fLiteralText.append(c);
return;
}
-
+
// We are adding onto an existing string
fRXPat->fLiteralText.append(c);
@@ -1439,7 +1734,7 @@ void RegexCompile::literalChar(UChar32 c) {
opType = URX_TYPE(op);
U_ASSERT(opType == URX_ONECHAR || opType == URX_ONECHAR_I || opType == URX_STRING_LEN);
- // If the most recently emitted op is a URX_ONECHAR,
+ // If the most recently emitted op is a URX_ONECHAR,
if (opType == URX_ONECHAR || opType == URX_ONECHAR_I) {
if (U16_IS_TRAIL(c) && U16_IS_LEAD(URX_VAL(op))) {
// The most recently emitted op is a ONECHAR that was the first half
@@ -1451,7 +1746,7 @@ void RegexCompile::literalChar(UChar32 c) {
fRXPat->fCompiledPat->setElementAt(op, patternLoc);
return;
}
-
+
// The most recently emitted op is a ONECHAR.
// We've now received another adjacent char. Change the ONECHAR op
// to a string op.
@@ -1465,7 +1760,7 @@ void RegexCompile::literalChar(UChar32 c) {
op = URX_BUILD(URX_STRING_LEN, 0);
fRXPat->fCompiledPat->addElement(op, *fStatus);
}
-
+
// The pattern contains a URX_SRING / URX_STRING_LEN. Update the
// string length to reflect the new char we just added to the string.
stringLen = fRXPat->fLiteralText.length() - fStringOpStart;
@@ -1523,7 +1818,7 @@ void RegexCompile::fixLiterals(UBool split) {
UChar32 nextToLastChar;
int32_t stringLen;
- fStringOpStart = -1;
+ fStringOpStart = -1;
if (!split) {
return;
}
@@ -1533,7 +1828,7 @@ void RegexCompile::fixLiterals(UBool split) {
// separate the last char from the rest of the string.
// If the last operation from the compiled pattern is not a string,
- // nothing needs to be done
+ // nothing needs to be done
op = fRXPat->fCompiledPat->lastElementi();
opType = URX_TYPE(op);
if (opType != URX_STRING_LEN) {
@@ -1651,7 +1946,7 @@ void RegexCompile::insertOp(int32_t where) {
//
// parameter reserveLoc : TRUE - ensure that there is space to add an opcode
// at the returned location.
-// FALSE - just return the address,
+// FALSE - just return the address,
// do not reserve a location there.
//
//------------------------------------------------------------------------------
@@ -1725,10 +2020,10 @@ void RegexCompile::handleCloseParen() {
// At the close of any parenthesized block, restore the match mode flags to
// the value they had at the open paren. Saved value is
- // at the top of the paren stack.
+ // at the top of the paren stack.
fModeFlags = fParenStack.popi();
U_ASSERT(fModeFlags < 0);
-
+
// DO any additional fixups, depending on the specific kind of
// parentesized grouping this is
@@ -1798,7 +2093,7 @@ void RegexCompile::handleCloseParen() {
case lookBehind:
{
// See comment at doOpenLookBehind.
-
+
// Append the URX_LB_END and URX_LA_END to the compiled pattern.
int32_t startOp = fRXPat->fCompiledPat->elementAti(fMatchOpenParen-4);
U_ASSERT(URX_TYPE(startOp) == URX_LB_START);
@@ -1833,7 +2128,7 @@ void RegexCompile::handleCloseParen() {
case lookBehindN:
{
// See comment at doOpenLookBehindNeg.
-
+
// Append the URX_LBN_END to the compiled pattern.
int32_t startOp = fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5);
U_ASSERT(URX_TYPE(startOp) == URX_LB_START);
@@ -1890,24 +2185,23 @@ void RegexCompile::compileSet(UnicodeSet *theSet)
if (theSet == NULL) {
return;
}
+ // Remove any strings from the set.
+ // There shouldn't be any, but just in case.
+ // (Case Closure can add them; if we had a simple case closure available that
+ // ignored strings, that would be better.)
+ theSet->removeAllStrings();
int32_t setSize = theSet->size();
UChar32 firstSetChar = theSet->charAt(0);
- if (firstSetChar == -1) {
- // Sets that contain only strings, but no individual chars,
- // will end up here.
- error(U_REGEX_SET_CONTAINS_STRING);
- setSize = 0;
- }
switch (setSize) {
- case 0:
+ case 0:
{
- // Set of no elements. Always fails to match.
+ // Set of no elements. Always fails to match.
fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKTRACK, 0), *fStatus);
delete theSet;
}
break;
-
+
case 1:
{
// The set contains only a single code point. Put it into
@@ -1917,8 +2211,8 @@ void RegexCompile::compileSet(UnicodeSet *theSet)
delete theSet;
}
break;
-
- default:
+
+ default:
{
// The set contains two or more chars. (the normal case)
// Put it into the compiled pattern as a set.
@@ -1944,9 +2238,9 @@ void RegexCompile::compileSet(UnicodeSet *theSet)
// 2 min count
// 3 max count (-1 for unbounded)
// 4 ... block to be iterated over
-// 5 CTR_LOOP
-//
-// In
+// 5 CTR_LOOP
+//
+// In
//------------------------------------------------------------------------------
void RegexCompile::compileInterval(int32_t InitOp, int32_t LoopOp)
{
@@ -2020,9 +2314,9 @@ UBool RegexCompile::compileInlineInterval() {
//
int32_t op = fRXPat->fCompiledPat->elementAti(topOfBlock);
- // Compute the pattern location where the inline sequence
+ // Compute the pattern location where the inline sequence
// will end, and set up the state save op that will be needed.
- //
+ //
int32_t endOfSequenceLoc = fRXPat->fCompiledPat->size()-1
+ fIntervalUpper + (fIntervalUpper-fIntervalLow);
int32_t saveOp = URX_BUILD(URX_STATE_SAVE, endOfSequenceLoc);
@@ -2127,7 +2421,7 @@ void RegexCompile::matchStartType() {
case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match.
case URX_LD_SP:
break;
-
+
case URX_CARET:
if (atStart) {
fRXPat->fStartType = START_START;
@@ -2139,7 +2433,7 @@ void RegexCompile::matchStartType() {
fRXPat->fStartType = START_LINE;
}
break;
-
+
case URX_ONECHAR:
if (currentLen == 0) {
// This character could appear at the start of a match.
@@ -2150,9 +2444,9 @@ void RegexCompile::matchStartType() {
currentLen++;
atStart = FALSE;
break;
-
- case URX_SETREF:
+
+ case URX_SETREF:
if (currentLen == 0) {
int32_t sn = URX_VAL(op);
U_ASSERT(sn > 0 && sn < fRXPat->fSets->size());
@@ -2189,7 +2483,7 @@ void RegexCompile::matchStartType() {
break;
- case URX_STATIC_SETREF:
+ case URX_STATIC_SETREF:
if (currentLen == 0) {
int32_t sn = URX_VAL(op);
U_ASSERT(sn>0 && sn<URX_LAST_SET);
@@ -2203,7 +2497,7 @@ void RegexCompile::matchStartType() {
- case URX_STAT_SETREF_N:
+ case URX_STAT_SETREF_N:
if (currentLen == 0) {
int32_t sn = URX_VAL(op);
const UnicodeSet *s = fRXPat->fStaticSets[sn];
@@ -2221,7 +2515,7 @@ void RegexCompile::matchStartType() {
case URX_BACKSLASH_D:
// Digit Char
if (currentLen == 0) {
- UnicodeSet s;
+ UnicodeSet s;
s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus);
if (URX_VAL(op) != 0) {
s.complement();
@@ -2282,7 +2576,7 @@ void RegexCompile::matchStartType() {
// Loop of some kind. Can safely ignore, the worst that will happen
// is that we understate the true minimum length
currentLen = forwardedLength.elementAti(loc+1);
-
+
} else {
// Forward jump. Propagate the current min length to the target loc of the jump.
U_ASSERT(jmpDest <= end+1);
@@ -2318,11 +2612,11 @@ void RegexCompile::matchStartType() {
if (currentLen < forwardedLength.elementAti(jmpDest)) {
forwardedLength.setElementAt(currentLen, jmpDest);
}
- }
+ }
}
atStart = FALSE;
break;
-
+
@@ -2346,7 +2640,7 @@ void RegexCompile::matchStartType() {
fRXPat->fInitialStringIdx = stringStartIdx;
fRXPat->fInitialStringLen = stringLen;
}
-
+
currentLen += stringLen;
atStart = FALSE;
}
@@ -2382,10 +2676,10 @@ void RegexCompile::matchStartType() {
{
// Loop Init Ops. These don't change the min length, but they are 4 word ops
// so location must be updated accordingly.
- // Loop Init Ops.
+ // Loop Init Ops.
// If the min loop count == 0
// move loc forwards to the end of the loop, skipping over the body.
- // If the min count is > 0,
+ // If the min count is > 0,
// continue normal processing of the body of the loop.
int32_t loopEndLoc = fRXPat->fCompiledPat->elementAti(loc+1);
loopEndLoc = URX_VAL(loopEndLoc);
@@ -2398,7 +2692,7 @@ void RegexCompile::matchStartType() {
if (forwardedLength.elementAti(loopEndLoc) > currentLen) {
forwardedLength.setElementAt(currentLen, loopEndLoc);
}
- }
+ }
loc+=3; // Skips over operands of CTR_INIT
}
atStart = FALSE;
@@ -2407,17 +2701,17 @@ void RegexCompile::matchStartType() {
case URX_CTR_LOOP:
case URX_CTR_LOOP_NG:
- // Loop ops.
+ // Loop ops.
// The jump is conditional, backwards only.
atStart = FALSE;
break;
-
+
case URX_LOOP_C:
// More loop ops. These state-save to themselves.
// don't change the minimum match
atStart = FALSE;
break;
-
+
case URX_LA_START:
case URX_LB_START:
@@ -2447,25 +2741,25 @@ void RegexCompile::matchStartType() {
}
}
}
- U_ASSERT(loc <= end);
+ U_ASSERT(loc <= end);
}
}
break;
-
+
case URX_LA_END:
case URX_LB_CONT:
case URX_LB_END:
case URX_LBN_CONT:
case URX_LBN_END:
- U_ASSERT(FALSE); // Shouldn't get here. These ops should be
+ U_ASSERT(FALSE); // Shouldn't get here. These ops should be
// consumed by the scan in URX_LA_START and LB_START
break;
-
+
default:
U_ASSERT(FALSE);
}
-
+
}
@@ -2524,7 +2818,7 @@ void RegexCompile::matchStartType() {
//------------------------------------------------------------------------------
//
// minMatchLength Calculate the length of the shortest string that could
-// match the specified pattern.
+// match the specified pattern.
// Length is in 16 bit code units, not code points.
//
// The calculated length may not be exact. The returned
@@ -2603,10 +2897,10 @@ int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) {
case URX_JMP_SAV:
case URX_JMP_SAV_X:
break;
-
+
// Ops that match a minimum of one character (one or two 16 bit code units.)
- //
+ //
case URX_ONECHAR:
case URX_STATIC_SETREF:
case URX_STAT_SETREF_N:
@@ -2661,10 +2955,10 @@ int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) {
if (currentLen < forwardedLength.elementAti(jmpDest)) {
forwardedLength.setElementAt(currentLen, jmpDest);
}
- }
+ }
}
break;
-
+
case URX_STRING:
case URX_STRING_I:
@@ -2679,10 +2973,10 @@ int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) {
case URX_CTR_INIT:
case URX_CTR_INIT_NG:
{
- // Loop Init Ops.
+ // Loop Init Ops.
// If the min loop count == 0
// move loc forwards to the end of the loop, skipping over the body.
- // If the min count is > 0,
+ // If the min count is > 0,
// continue normal processing of the body of the loop.
int32_t loopEndLoc = fRXPat->fCompiledPat->elementAti(loc+1);
loopEndLoc = URX_VAL(loopEndLoc);
@@ -2698,17 +2992,17 @@ int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) {
case URX_CTR_LOOP:
case URX_CTR_LOOP_NG:
- // Loop ops.
+ // Loop ops.
// The jump is conditional, backwards only.
break;
-
+
case URX_LOOP_SR_I:
case URX_LOOP_DOT_I:
case URX_LOOP_C:
// More loop ops. These state-save to themselves.
// don't change the minimum match - could match nothing at all.
break;
-
+
case URX_LA_START:
case URX_LB_START:
@@ -2740,12 +3034,12 @@ int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) {
}
}
}
-
- U_ASSERT(loc <= end);
+
+ U_ASSERT(loc <= end);
}
}
break;
-
+
case URX_LA_END:
case URX_LB_CONT:
case URX_LB_END:
@@ -2754,11 +3048,11 @@ int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) {
// Only come here if the matching URX_LA_START or URX_LB_START was not in the
// range being sized, which happens when measuring size of look-behind blocks.
break;
-
+
default:
U_ASSERT(FALSE);
}
-
+
}
// We have finished walking through the ops. Check whether some forward jump
@@ -2767,7 +3061,7 @@ int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) {
currentLen = forwardedLength.elementAti(end+1);
U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
}
-
+
return currentLen;
}
@@ -2776,7 +3070,7 @@ int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) {
//------------------------------------------------------------------------------
//
// maxMatchLength Calculate the length of the longest string that could
-// match the specified pattern.
+// match the specified pattern.
// Length is in 16 bit code units, not code points.
//
// The calculated length may not be exact. The returned
@@ -2843,7 +3137,7 @@ int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) {
case URX_LBN_CONT:
case URX_LBN_END:
break;
-
+
// Ops that increase that cause an unbounded increase in the length
// of a matched string, or that increase it a hard to characterize way.
@@ -2858,13 +3152,13 @@ int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) {
// Ops that match a max of one character (possibly two 16 bit code units.)
- //
+ //
case URX_STATIC_SETREF:
case URX_STAT_SETREF_N:
case URX_SETREF:
case URX_BACKSLASH_D:
case URX_ONECHAR_I:
- case URX_DOTANY_ALL:
+ case URX_DOTANY_ALL:
case URX_DOTANY:
currentLen+=2;
break;
@@ -2878,7 +3172,7 @@ int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) {
}
break;
- // Jumps.
+ // Jumps.
//
case URX_JMP:
case URX_JMPX:
@@ -2922,7 +3216,7 @@ int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) {
}
}
break;
-
+
@@ -2948,8 +3242,8 @@ int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) {
// INT32_MAX length will stop the per-instruction loop.
currentLen = INT32_MAX;
break;
-
-
+
+
case URX_LA_START:
case URX_LA_END:
@@ -2957,16 +3251,16 @@ int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) {
// it were normal pattern. Gives a too-long match length,
// but good enough for now.
break;
-
+
// End of look-ahead ops should always be consumed by the processing at
// the URX_LA_START op.
// U_ASSERT(FALSE);
// break;
-
+
case URX_LB_START:
{
// Look-behind. Scan forward until the matching look-around end,
- // without processing the look-behind block.
+ // without processing the look-behind block.
int32_t depth = 0;
for (;;) {
loc++;
@@ -2980,7 +3274,7 @@ int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) {
}
depth--;
}
- U_ASSERT(loc < end);
+ U_ASSERT(loc < end);
}
}
break;
@@ -2989,16 +3283,16 @@ int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) {
U_ASSERT(FALSE);
}
-
+
if (currentLen == INT32_MAX) {
// The maximum length is unbounded.
// Stop further processing of the pattern.
break;
}
-
+
}
return currentLen;
-
+
}
@@ -3154,9 +3448,9 @@ void RegexCompile::OptDotStar() {
U_ASSERT(jmpLoc>0);
op = fRXPat->fCompiledPat->elementAti(jmpLoc);
opType = URX_TYPE(op);
- switch(opType) {
+ switch(opType) {
+
-
case URX_END:
case URX_NOP:
case URX_END_CAPTURE:
@@ -3234,18 +3528,24 @@ void RegexCompile::error(UErrorCode e) {
// (Think EBCDIC).
//
static const UChar chCR = 0x0d; // New lines, for terminating comments.
-static const UChar chLF = 0x0a;
-static const UChar chNEL = 0x85; // NEL newline variant
-static const UChar chLS = 0x2028; // Unicode Line Separator
+static const UChar chLF = 0x0a; // Line Feed
static const UChar chPound = 0x23; // '#', introduces a comment.
+static const UChar chDigit0 = 0x30; // '0'
+static const UChar chDigit7 = 0x37; // '7'
+static const UChar chColon = 0x3A; // ':'
static const UChar chE = 0x45; // 'E'
-static const UChar chUpperN = 0x4E;
-static const UChar chLowerP = 0x70;
-static const UChar chUpperP = 0x50;
+static const UChar chQ = 0x51; // 'Q'
+static const UChar chN = 0x4E; // 'N'
+static const UChar chP = 0x50; // 'P'
static const UChar chBackSlash = 0x5c; // '\' introduces a char escape
-static const UChar chLBracket = 0x5b;
-static const UChar chRBracket = 0x5d;
-static const UChar chRBrace = 0x7d;
+static const UChar chLBracket = 0x5b; // '['
+static const UChar chRBracket = 0x5d; // ']'
+static const UChar chUp = 0x5e; // '^'
+static const UChar chLowerP = 0x70;
+static const UChar chLBrace = 0x7b; // '{'
+static const UChar chRBrace = 0x7d; // '}'
+static const UChar chNEL = 0x85; // NEL newline variant
+static const UChar chLS = 0x2028; // Unicode Line Separator
//------------------------------------------------------------------------------
@@ -3278,10 +3578,6 @@ UChar32 RegexCompile::nextCharLL() {
// reset the column to 0.
fLineNum++;
fCharNum=0;
- if (fQuoteMode) {
- error(U_REGEX_RULE_SYNTAX);
- fQuoteMode = FALSE;
- }
}
else {
// Character is not starting a new line. Except in the case of a
@@ -3343,7 +3639,7 @@ void RegexCompile::nextChar(RegexPatternChar &c) {
if (fModeFlags & UREGEX_COMMENTS) {
//
// We are in free-spacing and comments mode.
- // Scan through any white space and comments, until we
+ // Scan through any white space and comments, until we
// reach a significant character or the end of inut.
for (;;) {
if (c.fChar == (UChar32)-1) {
@@ -3362,6 +3658,7 @@ void RegexCompile::nextChar(RegexPatternChar &c) {
}
}
}
+ // TODO: check what Java & Perl do with non-ASCII white spaces.
if (uprv_isRuleWhiteSpace(c.fChar) == FALSE) {
break;
}
@@ -3372,9 +3669,9 @@ void RegexCompile::nextChar(RegexPatternChar &c) {
//
// check for backslash escaped characters.
//
- int32_t startX = fNextIndex; // start and end positions of the
- int32_t endX = fNextIndex; // sequence following the '\'
if (c.fChar == chBackSlash) {
+ int32_t startX = fNextIndex; // start and end positions of the
+ int32_t endX = fNextIndex; // sequence following the '\'
if (RegexStaticSets::gStaticSets->fUnescapeCharSet.contains(peekCharLL())) {
//
// A '\' sequence that is handled by ICU's standard unescapeAt function.
@@ -3390,11 +3687,39 @@ void RegexCompile::nextChar(RegexPatternChar &c) {
fCharNum += endX - startX;
fNextIndex = endX;
}
+ else if (peekCharLL() == chDigit0) {
+ // Octal Escape, using Java Regexp Conventions
+ // which are \0 followed by 1-3 octal digits.
+ // Different from ICU Unescape handling of Octal, which does not
+ // require the leading 0.
+ c.fChar = 0;
+ nextCharLL(); // Consume the initial 0.
+ int index;
+ for (index=0; index<3; index++) {
+ int32_t ch = peekCharLL();
+ if (ch<chDigit0 || ch>chDigit7) {
+ break;
+ }
+ nextCharLL();
+ c.fChar <<= 3;
+ c.fChar += ch&7;
+ }
+ if (c.fChar>255) {
+ error(U_REGEX_OCTAL_TOO_BIG);
+ }
+ c.fQuoted = TRUE;
+ }
+ else if (peekCharLL() == chQ) {
+ // "\Q" enter quote mode, which will continue until "\E"
+ fQuoteMode = TRUE;
+ nextCharLL(); // discard the 'Q'.
+ nextChar(c); // recurse to get the real next char.
+ }
else
{
// We are in a '\' escape that will be handled by the state table scanner.
// Just return the backslash, but remember that the following char is to
- // be taken literally. TODO: this is awkward, think about alternatives.
+ // be taken literally.
fInBackslashQuote = TRUE;
}
}
@@ -3412,59 +3737,60 @@ void RegexCompile::nextChar(RegexPatternChar &c) {
//------------------------------------------------------------------------------
//
-// scanSet Construct a UnicodeSet from the text at the current scan
-// position. Advance the scan position to the first character
-// after the set.
+// scanNamedChar
+ // Get a UChar32 from a \N{UNICODE CHARACTER NAME} in the pattern.
+//
+// The scan position will be at the 'N'. On return
+// the scan position should be just after the '}'
//
-// The scan position is normally under the control of the state machine
-// that controls pattern parsing. UnicodeSets, however, are parsed by
-// the UnicodeSet constructor, not by the Regex pattern parser.
+// Return the UChar32
//
//------------------------------------------------------------------------------
-UnicodeSet *RegexCompile::scanSet() {
+UChar32 RegexCompile::scanNamedChar() {
UnicodeSet *uset = NULL;
- ParsePosition pos;
- int i;
if (U_FAILURE(*fStatus)) {
- return NULL;
- }
-
- pos.setIndex(fScanIndex);
- UErrorCode localStatus = U_ZERO_ERROR;
- uint32_t usetFlags = 0;
- if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
- usetFlags |= USET_CASE_INSENSITIVE;
- }
- if (fModeFlags & UREGEX_COMMENTS) {
- usetFlags |= USET_IGNORE_SPACE;
+ return 0;
}
- uset = new UnicodeSet(fRXPat->fPattern, pos,
- usetFlags, NULL, localStatus);
- if (U_FAILURE(localStatus)) {
- // TODO: Get more accurate position of the error from UnicodeSet's return info.
- // UnicodeSet appears to not be reporting correctly at this time.
- REGEX_SCAN_DEBUG_PRINTF(("UnicodeSet parse postion.ErrorIndex = %d\n", pos.getIndex()));
- error(localStatus);
- delete uset;
- return NULL;
+ nextChar(fC);
+ if (fC.fChar != chLBrace) {
+ error(U_REGEX_PROPERTY_SYNTAX);
+ return 0;
}
-
- // Advance the current scan postion over the UnicodeSet.
- // Don't just set fScanIndex because the line/char positions maintained
- // for error reporting would be thrown off.
- i = pos.getIndex();
+
+ UnicodeString charName;
for (;;) {
- if (fNextIndex >= i) {
+ nextChar(fC);
+ if (fC.fChar == chRBrace) {
break;
}
- nextCharLL();
+ if (fC.fChar == -1) {
+ error(U_REGEX_PROPERTY_SYNTAX);
+ return 0;
+ }
+ charName.append(fC.fChar);
}
+
+ char name[100];
+ if (!uprv_isInvariantUString(charName.getBuffer(), charName.length()) ||
+ charName.length()>=sizeof(name)) {
+ // All Unicode character names have only invariant characters.
+ // The API to get a character, given a name, accepts only char *, forcing us to convert,
+ // which requires this error check
+ error(U_REGEX_PROPERTY_SYNTAX);
+ return 0;
+ }
+ charName.extract(0, charName.length(), name, sizeof(name), US_INV);
- return uset;
-}
+ UChar32 theChar = u_charFromName(U_UNICODE_CHAR_NAME, name, fStatus);
+ if (U_FAILURE(*fStatus)) {
+ error(U_REGEX_PROPERTY_SYNTAX);
+ }
+ nextChar(fC); // Continue overall regex pattern processing with char after the '}'
+ return theChar;
+}
//------------------------------------------------------------------------------
//
@@ -3484,44 +3810,297 @@ UnicodeSet *RegexCompile::scanProp() {
if (U_FAILURE(*fStatus)) {
return NULL;
}
+ U_ASSERT(fC.fChar == chLowerP || fC.fChar == chP);
+ UBool negated = (fC.fChar == chP);
- U_ASSERT(fC.fChar == chLowerP || fC.fChar == chUpperP || fC.fChar == chUpperN);
-
- // enclose the \p{property} from the regex pattern source in [brackets]
- UnicodeString setPattern;
- setPattern.append(chLBracket);
- setPattern.append(chBackSlash);
+ UnicodeString propertyName;
+ nextChar(fC);
+ if (fC.fChar != chLBrace) {
+ error(U_REGEX_PROPERTY_SYNTAX);
+ return NULL;
+ }
for (;;) {
- setPattern.append(fC.fChar);
+ nextChar(fC);
if (fC.fChar == chRBrace) {
break;
}
- nextChar(fC);
if (fC.fChar == -1) {
// Hit the end of the input string without finding the closing '}'
error(U_REGEX_PROPERTY_SYNTAX);
return NULL;
}
+ propertyName.append(fC.fChar);
}
- setPattern.append(chRBracket);
+ uset = createSetForProperty(propertyName, negated);
+ nextChar(fC); // Move input scan to position following the closing '}'
+ return uset;
+}
- uint32_t usetFlags = 0;
+//------------------------------------------------------------------------------
+//
+// scanPosixProp Construct a UnicodeSet from the text at the current scan
+// position, which is expected to be of the form [:property expression:]
+//
+// The scan position will be at the opening ':'. On return
+// the scan position must be on the closing ']'
+//
+// Return a UnicodeSet constructed from the pattern,
+// or NULL if this is not a valid POSIX-style set expression.
+// If not a property expression, restore the initial scan position
+// (to the opening ':')
+//
+// Note: the opening '[:' is not sufficient to guarantee that
+// this is a [:property:] expression.
+// [:'+=,] is a perfectly good ordinary set expression that
+// happens to include ':' as one of its characters.
+//
+//------------------------------------------------------------------------------
+UnicodeSet *RegexCompile::scanPosixProp() {
+ UnicodeSet *uset = NULL;
+
+ if (U_FAILURE(*fStatus)) {
+ return NULL;
+ }
+
+ U_ASSERT(fC.fChar == chColon);
+
+ // Save the scanner state.
+ // TODO: move this into the scanner, with the state encapsulated in some way
+ int32_t savedScanIndex = fScanIndex;
+ int32_t savedNextIndex = fNextIndex;
+ UBool savedQuoteMode = fQuoteMode;
+ UBool savedInBackslashQuote = fInBackslashQuote;
+ UBool savedEOLComments = fEOLComments;
+ int32_t savedLineNum = fLineNum;
+ int32_t savedCharNum = fCharNum;
+ UChar32 savedLastChar = fLastChar;
+ UChar32 savedPeekChar = fPeekChar;
+ RegexPatternChar savedfC = fC;
+
+ // Scan for a closing ]. A little tricky because there are some perverse
+ // edge cases possible. "[:abc\Qdef;] \E]" is a valid non-property expression,
+ // ending on the second closing ].
+
+ UnicodeString propName;
+ UBool negated = FALSE;
+
+ // Check for and consume the '^' in a negated POSIX property, e.g. [:^Letter:]
+ nextChar(fC);
+ if (fC.fChar == chUp) {
+ negated = TRUE;
+ nextChar(fC);
+ }
+
+ // Scan for the closing ":]", collecting the property name along the way.
+ UBool sawPropSetTerminator = FALSE;
+ for (;;) {
+ propName.append(fC.fChar);
+ nextChar(fC);
+ if (fC.fQuoted || fC.fChar == -1) {
+ // Escaped characters or end of input - either says this isn't a [:Property:]
+ break;
+ }
+ if (fC.fChar == chColon) {
+ nextChar(fC);
+ if (fC.fChar == chRBracket) {
+ sawPropSetTerminator = TRUE;
+ }
+ break;
+ }
+ }
+
+ if (sawPropSetTerminator) {
+ uset = createSetForProperty(propName, negated);
+ }
+ else
+ {
+ // No closing ":]".
+ // Restore the original scan position.
+ // The main scanner will retry the input as a normal set expression,
+ // not a [:Property:] expression.
+ fScanIndex = savedScanIndex;
+ fNextIndex = savedNextIndex;
+ fQuoteMode = savedQuoteMode;
+ fInBackslashQuote = savedInBackslashQuote;
+ fEOLComments = savedEOLComments;
+ fLineNum = savedLineNum;
+ fCharNum = savedCharNum;
+ fLastChar = savedLastChar;
+ fPeekChar = savedPeekChar;
+ fC = savedfC;
+ }
+ return uset;
+}
+
+//
+// Create a Unicode Set from a Unicode Property expression.
+// This is common code underlying both \p{...} and [:...:] expressions.
+// Includes trying the Java "properties" that aren't supported as
+// normal ICU UnicodeSet properties
+//
+static const UChar posSetPrefix[] = {0x5b, 0x5c, 0x70, 0x7b, 00}; // "[\p{"
+static const UChar negSetPrefix[] = {0x5b, 0x5c, 0x50, 0x7b, 00}; // "[\P{"
+UnicodeSet *RegexCompile::createSetForProperty(const UnicodeString &propName, UBool negated) {
+ UnicodeString setExpr;
+ UnicodeSet *set;
+ uint32_t usetFlags = 0;
+
+ if (U_FAILURE(*fStatus)) {
+ return NULL;
+ }
+
+ //
+ // First try the property as we received it
+ //
+ if (negated) {
+ setExpr.append(negSetPrefix, -1);
+ } else {
+ setExpr.append(posSetPrefix, -1);
+ }
+ setExpr.append(propName);
+ setExpr.append(chRBrace);
+ setExpr.append(chRBracket);
if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
usetFlags |= USET_CASE_INSENSITIVE;
}
- if (fModeFlags & UREGEX_COMMENTS) {
- usetFlags |= USET_IGNORE_SPACE;
+ set = new UnicodeSet(setExpr, usetFlags, NULL, *fStatus);
+ if (U_SUCCESS(*fStatus)) {
+ return set;
+ }
+
+ //
+ // The property as it was didn't work.
+ // See if it looks like a Java "InBlockName", which
+ // we will recast as "Block=BlockName"
+ //
+ static const UChar IN[] = {0x49, 0x6E, 0}; // "In"
+ static const UChar BLOCK[] = {0x42, 0x6C, 0x6f, 0x63, 0x6b, 0x3d, 00}; // "Block="
+ if (propName.startsWith(IN, 2) && propName.length()>=3) {
+ setExpr.truncate(4); // Leaves "[\p{", or "[\P{"
+ setExpr.append(BLOCK, -1);
+ setExpr.append(UnicodeString(propName, 2)); // Property with the leading "In" removed.
+ setExpr.append(chRBrace);
+ setExpr.append(chRBracket);
+ *fStatus = U_ZERO_ERROR;
+ set = new UnicodeSet(setExpr, usetFlags, NULL, *fStatus);
+ if (U_SUCCESS(*fStatus)) {
+ return set;
+ }
}
+
+ //
+ // Try the various Java specific properties.
+ // These all begin with "java"
+ //
+ #define IDENTIFIER_IGNORABLE "[\\u0000-\\u0008\\u000e-\\u001b\\u007f-\\u009f\\p{Cf}]"
+ static const char *javaProps[][2] = {
+ {"javaDefined", "\\P{Cn}"},
+ {"javaDigit", "\\p{Nd}"},
+ {"javaIdentifierIgnorable", IDENTIFIER_IGNORABLE},
+ {"javaISOControl", "[\\u0000-\\u001f\\u007f-\\u009f]"},
+ {"javaJavaIdentifierPart", "[[\\p{L}\\p{Sc}\\p{Pc}\\p{Nd}\\p{Nl}\\p{Mc}\\p{Mn}]" IDENTIFIER_IGNORABLE "]"},
+ {"javaJavaIdentifierStart", "[\\p{L}\\p{Nl}\\p{Sc}\\p{Pc}]"},
+ {"javaLetter", "\\p{L}"},
+ {"javaLetterOrDigit", "[\\p{L}\\p{Nd}]"},
+ {"javaLowerCase", "\\p{Ll}"},
+ {"javaMirrored", "\\p{Bidi_Mirrored}"},
+ {"javaSpaceChar", "\\p{Z}"},
+ {"javaSupplementaryCodePoint", "[\\U00010000-\\U0010ffff]"},
+ {"javaTitleCase", "\\p{Lt}"},
+ {"javaUnicodeIdentifierStart", "[\\p{L}\\p{Nl}]"},
+ {"javaUnicodeIdentifierPart", "[[\\p{L}\\p{Pc}\\p{Nd}\\p{Nl}\\p{Mc}\\p{Mn}]" IDENTIFIER_IGNORABLE "]"},
+ {"javaUpperCase", "[\\p{Lu}]"},
+ {"javaValidCodePoint", "[\\u0000-\\U0010ffff]"},
+ {"javaWhitespace", "[[\\p{Z}-[\\u00a0\\u2007\\u202f]]\\u0009-\\u000d\\u001c-\\u001f]"},
+ {NULL, NULL}
+ };
+
- // Build the UnicodeSet from the set pattern we just built up in a string.
- uset = new UnicodeSet(setPattern, usetFlags, NULL, *fStatus);
- if (U_FAILURE(*fStatus)) {
- delete uset;
- uset = NULL;
+ UnicodeString Java("java", -1, UnicodeString::kInvariant);
+ if (propName.startsWith(Java)) {
+ int i;
+ setExpr.remove();
+ for (i=0; javaProps[i][0] != NULL; i++) {
+ if (propName.compare(UnicodeString(javaProps[i][0], -1, UnicodeString::kInvariant))==0) {
+ setExpr = UnicodeString(javaProps[i][1]); // Default code page conversion here.
+ break; // Somewhat Inefficient.
+ }
+ }
+ if (setExpr.length()>0) {
+ *fStatus = U_ZERO_ERROR;
+ set = new UnicodeSet(setExpr, usetFlags, NULL, *fStatus);
+ if (U_SUCCESS(*fStatus)) {
+ if (negated) {
+ set->complement();
+ }
+ return set;
+ }
+ }
}
+ error(*fStatus);
+ return NULL;
+}
- nextChar(fC); // Continue overall regex pattern processing with char after the '}'
- return uset;
+
+
+//
+// setEval Part of the evaluation of [set expressions].
+// Perform any pending (stacked) operations with precedence
+// equal to or greater than that of the next operator encountered
+// in the expression.
+//
+void RegexCompile::setEval(int32_t nextOp) {
+ UnicodeSet *rightOperand = NULL;
+ UnicodeSet *leftOperand = NULL;
+ for (;;) {
+ U_ASSERT(fSetOpStack.empty()==FALSE);
+ int32_t pendingSetOperation = fSetOpStack.peeki();
+ if ((pendingSetOperation&0xffff0000) < (nextOp&0xffff0000)) {
+ break;
+ }
+ fSetOpStack.popi();
+ U_ASSERT(fSetStack.empty() == FALSE);
+ rightOperand = (UnicodeSet *)fSetStack.peek();
+ switch (pendingSetOperation) {
+ case setNegation:
+ rightOperand->complement();
+ break;
+ case setCaseClose:
+ // TODO: need a simple close function.
+ rightOperand->closeOver(USET_CASE_INSENSITIVE);
+ rightOperand->removeAllStrings();
+ break;
+ case setDifference1:
+ case setDifference2:
+ fSetStack.pop();
+ leftOperand = (UnicodeSet *)fSetStack.peek();
+ leftOperand->removeAll(*rightOperand);
+ delete rightOperand;
+ break;
+ case setIntersection1:
+ case setIntersection2:
+ fSetStack.pop();
+ leftOperand = (UnicodeSet *)fSetStack.peek();
+ leftOperand->retainAll(*rightOperand);
+ delete rightOperand;
+ break;
+ case setUnion:
+ fSetStack.pop();
+ leftOperand = (UnicodeSet *)fSetStack.peek();
+ leftOperand->addAll(*rightOperand);
+ delete rightOperand;
+ break;
+ default:
+ U_ASSERT(FALSE);
+ break;
+ }
+ }
+ }
+
+void RegexCompile::setPushOp(int32_t op) {
+ setEval(op);
+ fSetOpStack.push(op, *fStatus);
+ fSetStack.push(new UnicodeSet(), *fStatus);
}
U_NAMESPACE_END
diff --git a/i18n/regexcmp.h b/i18n/regexcmp.h
index ac81684a..a0248a3f 100644
--- a/i18n/regexcmp.h
+++ b/i18n/regexcmp.h
@@ -51,7 +51,7 @@ public:
};
RegexCompile(RegexPattern *rp, UErrorCode &e);
-
+
void compile(const UnicodeString &pat, UParseError &pp, UErrorCode &e);
@@ -68,7 +68,7 @@ public:
// determines the code to be generated when the matching close ) is encountered.
enum EParenClass {
plain = -1, // No special handling
- capturing = -2,
+ capturing = -2,
atomic = -3,
lookAhead = -4,
negLookAhead = -5,
@@ -85,8 +85,8 @@ private:
UChar32 nextCharLL();
UChar32 peekCharLL();
- UnicodeSet *scanSet();
UnicodeSet *scanProp();
+ UnicodeSet *scanPosixProp();
void handleCloseParen();
int32_t blockTopLoc(UBool reserve); // Locate a position in the compiled pattern
// at the top of the just completed block
@@ -111,6 +111,11 @@ private:
void stripNOPs();
void OptDotStar();
+ void setEval(int32_t op);
+ void setPushOp(int32_t op);
+ UChar32 scanNamedChar();
+ UnicodeSet *createSetForProperty(const UnicodeString &propName, UBool negated);
+
UErrorCode *fStatus;
RegexPattern *fRXPat;
@@ -125,7 +130,7 @@ private:
// is the first character not yet scanned.
UBool fQuoteMode; // Scan is in a \Q...\E quoted region
UBool fInBackslashQuote; // Scan is between a '\' and the following char.
- UBool fEOLComments; // When scan is just after '(?', inhibit #... to
+ UBool fEOLComments; // When scan is just after '(?', inhibit #... to
// end of line comments, in favor of (?#...) comments.
int32_t fLineNum; // Line number in input file.
int32_t fCharNum; // Char position within the line.
@@ -167,7 +172,7 @@ private:
UVector32 fParenStack; // parentheses stack. Each frame consists of
// the positions of compiled pattern operations
- // needing fixup, followed by negative value. The
+ // needing fixup, followed by negative value. The
// first entry in each frame is the position of the
// spot reserved for use when a quantifier
// needs to add a SAVE at the start of a (block)
@@ -194,8 +199,33 @@ private:
int32_t fNameStartPos; // Starting position of a \N{NAME} name in a
// pattern, valid while remainder of name is
// scanned.
+
+ UStack fSetStack; // Stack of UnicodeSets, used while evaluating
+ // (at compile time) set expressions within
+ // the pattern.
+ UStack fSetOpStack; // Stack of pending set operators (&&, --, union)
+
+ UChar32 fLastSetLiteral; // The last single code point added to a set.
+ // needed when "-y" is scanned, and we need
+ // to turn "x-y" into a range.
+
};
+// Constant values to be pushed onto fSetOpStack while scanning & evaluating [set expressions]
+// The high 16 bits are the operator precedence, and the low 16 are a code for the operation itself.
+
+enum SetOperations {
+ setStart = 0 << 16 | 1,
+ setEnd = 1 << 16 | 2,
+ setNegation = 2 << 16 | 3,
+ setCaseClose = 2 << 16 | 9,
+ setDifference2 = 3 << 16 | 4, // '--' set difference operator
+ setIntersection2 = 3 << 16 | 5, // '&&' set intersection operator
+ setUnion = 4 << 16 | 6, // implicit union of adjacent items
+ setDifference1 = 4 << 16 | 7, // '-', single dash difference op, for compatibility with old UnicodeSet.
+ setIntersection1 = 4 << 16 | 8 // '&', single amp intersection op, for compatibility with old UnicodeSet.
+ };
+
U_NAMESPACE_END
#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
#endif // RBBISCAN_H
diff --git a/i18n/regexcst.h b/i18n/regexcst.h
index 6cca8453..5d319371 100644
--- a/i18n/regexcst.h
+++ b/i18n/regexcst.h
@@ -5,7 +5,7 @@
// It is generated by the Perl script "regexcst.pl" from
// the rule parser state definitions file "regexcst.txt".
//
-// Copyright (C) 2002-2003 International Business Machines Corporation
+// Copyright (C) 2002-2007 International Business Machines Corporation
// and others. All rights reserved.
//
//---------------------------------------------------------------------------------
@@ -17,74 +17,100 @@ U_NAMESPACE_BEGIN
// Character classes for regex pattern scanning.
//
static const uint8_t kRuleSet_digit_char = 128;
- static const uint8_t kRuleSet_white_space = 129;
- static const uint8_t kRuleSet_rule_char = 130;
+ static const uint8_t kRuleSet_rule_char = 129;
enum Regex_PatternParseAction {
- doPossessivePlus,
- doCloseParen,
+ doLiteralChar,
+ doSetEnd,
+ doBackslashA,
+ doSetBeginUnion,
+ doNOP,
+ doSetBackslash_w,
+ doSetRange,
+ doBackslashG,
+ doPerlInline,
+ doSetAddDash,
+ doIntevalLowerDigit,
doProperty,
- doBeginMatchMode,
- doOrOperator,
+ doBackslashX,
+ doOpenAtomicParen,
+ doSetLiteralEscaped,
+ doPatFinish,
+ doSetBackslash_D,
+ doSetDifference2,
+ doNamedChar,
+ doNGPlus,
+ doOpenLookBehindNeg,
+ doIntervalError,
+ doIntervalSame,
+ doBackRef,
+ doPlus,
doOpenCaptureParen,
- doBadOpenParenType,
- doRuleError,
- doIntevalLowerDigit,
- doBackslashs,
- doNGOpt,
- doBackslashw,
doMismatchedParenErr,
+ doBeginMatchMode,
+ doEscapeError,
+ doOpenNonCaptureParen,
+ doDollar,
+ doSetProp,
+ doIntervalUpperDigit,
+ doSetBegin,
+ doBackslashs,
doOpenLookBehind,
- doBackslashz,
- doIntervalError,
- doStar,
+ doSetMatchMode,
+ doOrOperator,
doCaret,
- doEnterQuoteMode,
- doNGStar,
+ doMatchModeParen,
+ doStar,
+ doOpt,
doMatchMode,
- doIntervalUpperDigit,
+ doSuppressComments,
+ doPossessiveInterval,
doOpenLookAheadNeg,
- doPlus,
- doOpenNonCaptureParen,
- doBackslashA,
+ doBackslashW,
+ doCloseParen,
+ doSetOpError,
+ doIntervalInit,
+ doSetFinish,
+ doSetIntersection2,
+ doNGStar,
+ doEnterQuoteMode,
+ doSetAddAmp,
doBackslashB,
- doNGPlus,
- doSetMatchMode,
- doPatFinish,
+ doBackslashw,
+ doPossessiveOpt,
+ doSetNegate,
+ doRuleError,
+ doBackslashb,
+ doConditionalExpr,
+ doPossessivePlus,
+ doBadOpenParenType,
+ doNGInterval,
+ doSetLiteral,
+ doSetNamedChar,
+ doBackslashd,
+ doSetBeginDifference1,
doBackslashD,
- doPossessiveInterval,
- doEscapeError,
- doBackslashG,
- doSuppressComments,
- doMatchModeParen,
- doOpt,
+ doExit,
+ doSetBackslash_S,
doInterval,
- doLiteralChar,
- doIntervalInit,
- doOpenAtomicParen,
+ doSetNoCloseError,
+ doNGOpt,
+ doSetPosixProp,
doBackslashS,
- doOpenLookAhead,
- doBackRef,
- doDollar,
- doDotAny,
- doBackslashW,
- doBackslashX,
- doScanUnicodeSet,
doBackslashZ,
- doPerlInline,
- doPossessiveOpt,
- doNOP,
- doConditionalExpr,
- doExit,
- doNGInterval,
- doPatStart,
+ doSetBeginIntersection1,
+ doSetBackslash_W,
+ doSetBackslash_d,
+ doOpenLookAhead,
doBadModeFlag,
- doBackslashb,
+ doPatStart,
+ doSetNamedRange,
doPossessiveStar,
- doBackslashd,
- doIntervalSame,
- doOpenLookBehindNeg,
+ doEscapedLiteralChar,
+ doSetBackslash_s,
+ doBackslashz,
+ doDotAny,
rbbiLastAction};
//-------------------------------------------------------------------------------
@@ -106,17 +132,17 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
{doNOP, 0, 0, 0, TRUE}
, {doPatStart, 255, 2,0, FALSE} // 1 start
, {doLiteralChar, 254, 14,0, TRUE} // 2 term
- , {doLiteralChar, 130, 14,0, TRUE} // 3
- , {doScanUnicodeSet, 91 /* [ */, 14,0, TRUE} // 4
+ , {doLiteralChar, 129, 14,0, TRUE} // 3
+ , {doSetBegin, 91 /* [ */, 100, 178, TRUE} // 4
, {doNOP, 40 /* ( */, 27,0, TRUE} // 5
, {doDotAny, 46 /* . */, 14,0, TRUE} // 6
- , {doCaret, 94 /* ^ */, 2,0, TRUE} // 7
- , {doDollar, 36 /* $ */, 2,0, TRUE} // 8
- , {doNOP, 92 /* \ */, 81,0, TRUE} // 9
+ , {doCaret, 94 /* ^ */, 14,0, TRUE} // 7
+ , {doDollar, 36 /* $ */, 14,0, TRUE} // 8
+ , {doNOP, 92 /* \ */, 80,0, TRUE} // 9
, {doOrOperator, 124 /* | */, 2,0, TRUE} // 10
, {doCloseParen, 41 /* ) */, 255,0, TRUE} // 11
, {doPatFinish, 253, 2,0, FALSE} // 12
- , {doRuleError, 255, 101,0, FALSE} // 13
+ , {doRuleError, 255, 179,0, FALSE} // 13
, {doNOP, 42 /* * */, 59,0, TRUE} // 14 expr-quant
, {doNOP, 43 /* + */, 62,0, TRUE} // 15
, {doNOP, 63 /* ? */, 65,0, TRUE} // 16
@@ -144,14 +170,14 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
, {doBeginMatchMode, 119 /* w */, 50,0, FALSE} // 38
, {doBeginMatchMode, 120 /* x */, 50,0, FALSE} // 39
, {doBeginMatchMode, 45 /* - */, 50,0, FALSE} // 40
- , {doConditionalExpr, 40 /* ( */, 101,0, TRUE} // 41
- , {doPerlInline, 123 /* { */, 101,0, TRUE} // 42
- , {doBadOpenParenType, 255, 101,0, FALSE} // 43
+ , {doConditionalExpr, 40 /* ( */, 179,0, TRUE} // 41
+ , {doPerlInline, 123 /* { */, 179,0, TRUE} // 42
+ , {doBadOpenParenType, 255, 179,0, FALSE} // 43
, {doOpenLookBehind, 61 /* = */, 2, 20, TRUE} // 44 open-paren-lookbehind
, {doOpenLookBehindNeg, 33 /* ! */, 2, 20, TRUE} // 45
- , {doBadOpenParenType, 255, 101,0, FALSE} // 46
+ , {doBadOpenParenType, 255, 179,0, FALSE} // 46
, {doNOP, 41 /* ) */, 255,0, TRUE} // 47 paren-comment
- , {doMismatchedParenErr, 253, 101,0, FALSE} // 48
+ , {doMismatchedParenErr, 253, 179,0, FALSE} // 48
, {doNOP, 255, 47,0, TRUE} // 49
, {doMatchMode, 105 /* i */, 50,0, TRUE} // 50 paren-flag
, {doMatchMode, 109 /* m */, 50,0, TRUE} // 51
@@ -161,7 +187,7 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
, {doMatchMode, 45 /* - */, 50,0, TRUE} // 55
, {doSetMatchMode, 41 /* ) */, 2,0, TRUE} // 56
, {doMatchModeParen, 58 /* : */, 2, 14, TRUE} // 57
- , {doBadModeFlag, 255, 101,0, FALSE} // 58
+ , {doBadModeFlag, 255, 179,0, FALSE} // 58
, {doNGStar, 63 /* ? */, 20,0, TRUE} // 59 quant-star
, {doPossessiveStar, 43 /* + */, 20,0, TRUE} // 60
, {doStar, 255, 20,0, FALSE} // 61
@@ -171,40 +197,118 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
, {doNGOpt, 63 /* ? */, 20,0, TRUE} // 65 quant-opt
, {doPossessiveOpt, 43 /* + */, 20,0, TRUE} // 66
, {doOpt, 255, 20,0, FALSE} // 67
- , {doNOP, 129, 68,0, TRUE} // 68 interval-open
- , {doNOP, 128, 71,0, FALSE} // 69
- , {doIntervalError, 255, 101,0, FALSE} // 70
- , {doIntevalLowerDigit, 128, 71,0, TRUE} // 71 interval-lower
- , {doNOP, 44 /* , */, 75,0, TRUE} // 72
- , {doIntervalSame, 125 /* } */, 78,0, TRUE} // 73
- , {doIntervalError, 255, 101,0, FALSE} // 74
- , {doIntervalUpperDigit, 128, 75,0, TRUE} // 75 interval-upper
- , {doNOP, 125 /* } */, 78,0, TRUE} // 76
- , {doIntervalError, 255, 101,0, FALSE} // 77
- , {doNGInterval, 63 /* ? */, 20,0, TRUE} // 78 interval-type
- , {doPossessiveInterval, 43 /* + */, 20,0, TRUE} // 79
- , {doInterval, 255, 20,0, FALSE} // 80
- , {doBackslashA, 65 /* A */, 2,0, TRUE} // 81 backslash
- , {doBackslashB, 66 /* B */, 2,0, TRUE} // 82
- , {doBackslashb, 98 /* b */, 2,0, TRUE} // 83
- , {doBackslashd, 100 /* d */, 14,0, TRUE} // 84
- , {doBackslashD, 68 /* D */, 14,0, TRUE} // 85
- , {doBackslashG, 71 /* G */, 2,0, TRUE} // 86
- , {doProperty, 78 /* N */, 14,0, FALSE} // 87
- , {doProperty, 112 /* p */, 14,0, FALSE} // 88
- , {doProperty, 80 /* P */, 14,0, FALSE} // 89
- , {doEnterQuoteMode, 81 /* Q */, 2,0, TRUE} // 90
- , {doBackslashS, 83 /* S */, 14,0, TRUE} // 91
- , {doBackslashs, 115 /* s */, 14,0, TRUE} // 92
- , {doBackslashW, 87 /* W */, 14,0, TRUE} // 93
- , {doBackslashw, 119 /* w */, 14,0, TRUE} // 94
- , {doBackslashX, 88 /* X */, 14,0, TRUE} // 95
- , {doBackslashZ, 90 /* Z */, 2,0, TRUE} // 96
- , {doBackslashz, 122 /* z */, 2,0, TRUE} // 97
- , {doBackRef, 128, 14,0, TRUE} // 98
- , {doEscapeError, 253, 101,0, FALSE} // 99
- , {doLiteralChar, 255, 14,0, TRUE} // 100
- , {doExit, 255, 101,0, TRUE} // 101 errorDeath
+ , {doNOP, 128, 70,0, FALSE} // 68 interval-open
+ , {doIntervalError, 255, 179,0, FALSE} // 69
+ , {doIntevalLowerDigit, 128, 70,0, TRUE} // 70 interval-lower
+ , {doNOP, 44 /* , */, 74,0, TRUE} // 71
+ , {doIntervalSame, 125 /* } */, 77,0, TRUE} // 72
+ , {doIntervalError, 255, 179,0, FALSE} // 73
+ , {doIntervalUpperDigit, 128, 74,0, TRUE} // 74 interval-upper
+ , {doNOP, 125 /* } */, 77,0, TRUE} // 75
+ , {doIntervalError, 255, 179,0, FALSE} // 76
+ , {doNGInterval, 63 /* ? */, 20,0, TRUE} // 77 interval-type
+ , {doPossessiveInterval, 43 /* + */, 20,0, TRUE} // 78
+ , {doInterval, 255, 20,0, FALSE} // 79
+ , {doBackslashA, 65 /* A */, 2,0, TRUE} // 80 backslash
+ , {doBackslashB, 66 /* B */, 2,0, TRUE} // 81
+ , {doBackslashb, 98 /* b */, 2,0, TRUE} // 82
+ , {doBackslashd, 100 /* d */, 14,0, TRUE} // 83
+ , {doBackslashD, 68 /* D */, 14,0, TRUE} // 84
+ , {doBackslashG, 71 /* G */, 2,0, TRUE} // 85
+ , {doNamedChar, 78 /* N */, 14,0, FALSE} // 86
+ , {doProperty, 112 /* p */, 14,0, FALSE} // 87
+ , {doProperty, 80 /* P */, 14,0, FALSE} // 88
+ , {doEnterQuoteMode, 81 /* Q */, 2,0, TRUE} // 89
+ , {doBackslashS, 83 /* S */, 14,0, TRUE} // 90
+ , {doBackslashs, 115 /* s */, 14,0, TRUE} // 91
+ , {doBackslashW, 87 /* W */, 14,0, TRUE} // 92
+ , {doBackslashw, 119 /* w */, 14,0, TRUE} // 93
+ , {doBackslashX, 88 /* X */, 14,0, TRUE} // 94
+ , {doBackslashZ, 90 /* Z */, 2,0, TRUE} // 95
+ , {doBackslashz, 122 /* z */, 2,0, TRUE} // 96
+ , {doBackRef, 128, 14,0, TRUE} // 97
+ , {doEscapeError, 253, 179,0, FALSE} // 98
+ , {doEscapedLiteralChar, 255, 14,0, TRUE} // 99
+ , {doSetNegate, 94 /* ^ */, 103,0, TRUE} // 100 set-open
+ , {doSetPosixProp, 58 /* : */, 105,0, FALSE} // 101
+ , {doNOP, 255, 103,0, FALSE} // 102
+ , {doSetLiteral, 93 /* ] */, 118,0, TRUE} // 103 set-open2
+ , {doNOP, 255, 108,0, FALSE} // 104
+ , {doSetEnd, 93 /* ] */, 255,0, TRUE} // 105 set-posix
+ , {doNOP, 58 /* : */, 108,0, FALSE} // 106
+ , {doRuleError, 255, 179,0, FALSE} // 107
+ , {doSetEnd, 93 /* ] */, 255,0, TRUE} // 108 set-start
+ , {doSetBeginUnion, 91 /* [ */, 100, 125, TRUE} // 109
+ , {doNOP, 92 /* \ */, 168,0, TRUE} // 110
+ , {doNOP, 45 /* - */, 114,0, TRUE} // 111
+ , {doNOP, 38 /* & */, 116,0, TRUE} // 112
+ , {doSetLiteral, 255, 118,0, TRUE} // 113
+ , {doRuleError, 45 /* - */, 179,0, FALSE} // 114 set-start-dash
+ , {doSetAddDash, 255, 118,0, FALSE} // 115
+ , {doRuleError, 38 /* & */, 179,0, FALSE} // 116 set-start-amp
+ , {doSetAddAmp, 255, 118,0, FALSE} // 117
+ , {doSetEnd, 93 /* ] */, 255,0, TRUE} // 118 set-after-lit
+ , {doSetBeginUnion, 91 /* [ */, 100, 125, TRUE} // 119
+ , {doNOP, 45 /* - */, 155,0, TRUE} // 120
+ , {doNOP, 38 /* & */, 146,0, TRUE} // 121
+ , {doNOP, 92 /* \ */, 168,0, TRUE} // 122
+ , {doSetNoCloseError, 253, 179,0, FALSE} // 123
+ , {doSetLiteral, 255, 118,0, TRUE} // 124
+ , {doSetEnd, 93 /* ] */, 255,0, TRUE} // 125 set-after-set
+ , {doSetBeginUnion, 91 /* [ */, 100, 125, TRUE} // 126
+ , {doNOP, 45 /* - */, 148,0, TRUE} // 127
+ , {doNOP, 38 /* & */, 143,0, TRUE} // 128
+ , {doNOP, 92 /* \ */, 168,0, TRUE} // 129
+ , {doSetNoCloseError, 253, 179,0, FALSE} // 130
+ , {doSetLiteral, 255, 118,0, TRUE} // 131
+ , {doSetEnd, 93 /* ] */, 255,0, TRUE} // 132 set-after-range
+ , {doSetBeginUnion, 91 /* [ */, 100, 125, TRUE} // 133
+ , {doNOP, 45 /* - */, 151,0, TRUE} // 134
+ , {doNOP, 38 /* & */, 153,0, TRUE} // 135
+ , {doNOP, 92 /* \ */, 168,0, TRUE} // 136
+ , {doSetNoCloseError, 253, 179,0, FALSE} // 137
+ , {doSetLiteral, 255, 118,0, TRUE} // 138
+ , {doSetBeginUnion, 91 /* [ */, 100, 125, TRUE} // 139 set-after-op
+ , {doSetOpError, 93 /* ] */, 179,0, FALSE} // 140
+ , {doNOP, 92 /* \ */, 168,0, TRUE} // 141
+ , {doSetLiteral, 255, 118,0, TRUE} // 142
+ , {doSetBeginIntersection1, 91 /* [ */, 100, 125, TRUE} // 143 set-set-amp
+ , {doSetIntersection2, 38 /* & */, 139,0, TRUE} // 144
+ , {doSetAddAmp, 255, 118,0, FALSE} // 145
+ , {doSetIntersection2, 38 /* & */, 139,0, TRUE} // 146 set-lit-amp
+ , {doSetAddAmp, 255, 118,0, FALSE} // 147
+ , {doSetBeginDifference1, 91 /* [ */, 100, 125, TRUE} // 148 set-set-dash
+ , {doSetDifference2, 45 /* - */, 139,0, TRUE} // 149
+ , {doSetAddDash, 255, 118,0, FALSE} // 150
+ , {doSetDifference2, 45 /* - */, 139,0, TRUE} // 151 set-range-dash
+ , {doSetAddDash, 255, 118,0, FALSE} // 152
+ , {doSetIntersection2, 38 /* & */, 139,0, TRUE} // 153 set-range-amp
+ , {doSetAddAmp, 255, 118,0, FALSE} // 154
+ , {doSetDifference2, 45 /* - */, 139,0, TRUE} // 155 set-lit-dash
+ , {doSetAddDash, 91 /* [ */, 118,0, FALSE} // 156
+ , {doSetAddDash, 93 /* ] */, 118,0, FALSE} // 157
+ , {doNOP, 92 /* \ */, 160,0, TRUE} // 158
+ , {doSetRange, 255, 132,0, TRUE} // 159
+ , {doSetAddDash, 115 /* s */, 168,0, FALSE} // 160 set-lit-dash-escape
+ , {doSetAddDash, 83 /* S */, 168,0, FALSE} // 161
+ , {doSetAddDash, 119 /* w */, 168,0, FALSE} // 162
+ , {doSetAddDash, 87 /* W */, 168,0, FALSE} // 163
+ , {doSetAddDash, 100 /* d */, 168,0, FALSE} // 164
+ , {doSetAddDash, 68 /* D */, 168,0, FALSE} // 165
+ , {doSetNamedRange, 78 /* N */, 132,0, FALSE} // 166
+ , {doSetRange, 255, 132,0, TRUE} // 167
+ , {doSetProp, 112 /* p */, 125,0, FALSE} // 168 set-escape
+ , {doSetProp, 80 /* P */, 125,0, FALSE} // 169
+ , {doSetNamedChar, 78 /* N */, 118,0, FALSE} // 170
+ , {doSetBackslash_s, 115 /* s */, 132,0, TRUE} // 171
+ , {doSetBackslash_S, 83 /* S */, 132,0, TRUE} // 172
+ , {doSetBackslash_w, 119 /* w */, 132,0, TRUE} // 173
+ , {doSetBackslash_W, 87 /* W */, 132,0, TRUE} // 174
+ , {doSetBackslash_d, 100 /* d */, 132,0, TRUE} // 175
+ , {doSetBackslash_D, 68 /* D */, 132,0, TRUE} // 176
+ , {doSetLiteralEscaped, 255, 118,0, TRUE} // 177
+ , {doSetFinish, 255, 14,0, FALSE} // 178 set-finish
+ , {doExit, 255, 179,0, TRUE} // 179 errorDeath
};
static const char * const RegexStateNames[] = { 0,
"start",
@@ -276,7 +380,6 @@ static const char * const RegexStateNames[] = { 0,
0,
"interval-open",
0,
- 0,
"interval-lower",
0,
0,
@@ -307,6 +410,85 @@ static const char * const RegexStateNames[] = { 0,
0,
0,
0,
+ "set-open",
+ 0,
+ 0,
+ "set-open2",
+ 0,
+ "set-posix",
+ 0,
+ 0,
+ "set-start",
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ "set-start-dash",
+ 0,
+ "set-start-amp",
+ 0,
+ "set-after-lit",
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ "set-after-set",
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ "set-after-range",
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ "set-after-op",
+ 0,
+ 0,
+ 0,
+ "set-set-amp",
+ 0,
+ 0,
+ "set-lit-amp",
+ 0,
+ "set-set-dash",
+ 0,
+ 0,
+ "set-range-dash",
+ 0,
+ "set-range-amp",
+ 0,
+ "set-lit-dash",
+ 0,
+ 0,
+ 0,
+ 0,
+ "set-lit-dash-escape",
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ "set-escape",
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ "set-finish",
"errorDeath",
0};
diff --git a/i18n/regexcst.pl b/i18n/regexcst.pl
index b525cf88..f1dc06af 100644
--- a/i18n/regexcst.pl
+++ b/i18n/regexcst.pl
@@ -1,7 +1,7 @@
#!/usr/bin/perl
# ********************************************************************
# * COPYRIGHT:
-# * Copyright (c) 2002-2003, International Business Machines Corporation and
+# * Copyright (c) 2002-2007, International Business Machines Corporation and
# * others. All Rights Reserved.
# ********************************************************************
#
@@ -22,10 +22,6 @@
# for the Rule Based Break Iterator Rule Parser. Perhaps they could be
# merged?
#
-#*********************************************************************
-# Copyright (C) 2002 International Business Machines Corporation *
-# and others. All rights reserved. *
-#*********************************************************************
$num_states = 1; # Always the state number for the line being compiled.
@@ -210,7 +206,7 @@ print "// This file contains the state table for the ICU Regular Expression P
print "// It is generated by the Perl script \"regexcst.pl\" from\n";
print "// the rule parser state definitions file \"regexcst.txt\".\n";
print "//\n";
-print "// Copyright (C) 2002-2003 International Business Machines Corporation \n";
+print "// Copyright (C) 2002-2007 International Business Machines Corporation \n";
print "// and others. All rights reserved. \n";
print "//\n";
print "//---------------------------------------------------------------------------------\n";
diff --git a/i18n/regexcst.txt b/i18n/regexcst.txt
index fec788c1..888a0c42 100644
--- a/i18n/regexcst.txt
+++ b/i18n/regexcst.txt
@@ -1,7 +1,7 @@
#*****************************************************************************
#
-# Copyright (C) 2002-2003, International Business Machines Corporation and others.
+# Copyright (C) 2002-2007, International Business Machines Corporation and others.
# All Rights Reserved.
#
#*****************************************************************************
@@ -25,8 +25,8 @@
#
#
#StateName:
-# input-char n next-state ^push-state action
-# input-char n next-state ^push-state action
+# input-char n next-state ^push-state action
+# input-char n next-state ^push-state action
# | | | | |
# | | | | |--- action to be performed by state machine
# | | | | See function RBBIRuleScanner::doParseActions()
@@ -46,7 +46,7 @@
# matches, perform the actions and go to the state specified on this line.
# The input character is tested sequentially, in the order written. The characters and
# character classes tested for do not need to be mutually exclusive. The first match wins.
-#
+#
@@ -56,27 +56,27 @@
#
start:
default term doPatStart
-
-
-
+
+
+
#
# term. At a position where we can accept the start of most items in a pattern.
#
term:
quoted n expr-quant doLiteralChar
rule_char n expr-quant doLiteralChar
- '[' n expr-quant doScanUnicodeSet
- '(' n open-paren
+ '[' n set-open ^set-finish doSetBegin
+ '(' n open-paren
'.' n expr-quant doDotAny
- '^' n term doCaret
- '$' n term doDollar
+ '^' n expr-quant doCaret
+ '$' n expr-quant doDollar
'\' n backslash
'|' n term doOrOperator
')' n pop doCloseParen
eof term doPatFinish
default errorDeath doRuleError
-
+
#
@@ -84,14 +84,14 @@ term:
# trailing quantifier - *, +, ?, *?, etc.
#
expr-quant:
- '*' n quant-star
- '+' n quant-plus
- '?' n quant-opt
+ '*' n quant-star
+ '+' n quant-plus
+ '?' n quant-opt
'{' n interval-open doIntervalInit
'(' n open-paren-quant
- default expr-cont
-
-
+ default expr-cont
+
+
#
# expr-cont Expression, continuation. At a point where additional terms are
# allowed, but not required. No Quantifiers
@@ -99,8 +99,8 @@ expr-quant:
expr-cont:
'|' n term doOrOperator
')' n pop doCloseParen
- default term
-
+ default term
+
#
# open-paren-quant Special case handling for comments appearing before a quantifier,
@@ -111,12 +111,12 @@ expr-cont:
open-paren-quant:
'?' n open-paren-quant2 doSuppressComments
default open-paren
-
+
open-paren-quant2:
'#' n paren-comment ^expr-quant
default open-paren-extended
-
-
+
+
#
# open-paren We've got an open paren. We need to scan further to
# determine what kind of group it is - plain (, (?:, (?>, or whatever.
@@ -124,7 +124,7 @@ open-paren-quant2:
open-paren:
'?' n open-paren-extended doSuppressComments
default term ^expr-quant doOpenCaptureParen
-
+
open-paren-extended:
':' n term ^expr-quant doOpenNonCaptureParen # (?:
'>' n term ^expr-quant doOpenAtomicParen # (?>
@@ -141,16 +141,15 @@ open-paren-extended:
'(' n errorDeath doConditionalExpr
'{' n errorDeath doPerlInline
default errorDeath doBadOpenParenType
-
+
open-paren-lookbehind:
'=' n term ^expr-cont doOpenLookBehind # (?<=
'!' n term ^expr-cont doOpenLookBehindNeg # (?<!
default errorDeath doBadOpenParenType
-
+
#
# paren-comment We've got a (?# ... ) style comment. Eat pattern text till we get to the ')'
-# TODO: should parens nest here? Check what perl does.
#
paren-comment:
')' n pop
@@ -158,8 +157,8 @@ paren-comment:
default n paren-comment
#
-# paren-flag Scanned a (?ismx-ismx flag setting
-#
+# paren-flag Scanned a (?ismx-ismx flag setting
+#
paren-flag:
'i' n paren-flag doMatchMode
'm' n paren-flag doMatchMode
@@ -170,8 +169,8 @@ paren-flag:
')' n term doSetMatchMode
':' n term ^expr-quant doMatchModeParen
default errorDeath doBadModeFlag
-
-
+
+
#
# quant-star Scanning a '*' quantifier. Need to look ahead to decide
# between plain '*', '*?', '*+'
@@ -204,13 +203,12 @@ quant-opt:
#
# Interval scanning a '{', the opening delimiter for an interval specification
-# {number} or {min, max} or {min, }
+# {number} or {min, max} or {min,}
#
interval-open:
- white_space n interval-open # TODO: is white space allowed here in non-free mode?
- digit_char interval-lower
+ digit_char interval-lower
default errorDeath doIntervalError
-
+
interval-lower:
digit_char n interval-lower doIntevalLowerDigit
',' n interval-upper
@@ -221,13 +219,13 @@ interval-upper:
digit_char n interval-upper doIntervalUpperDigit
'}' n interval-type
default errorDeath doIntervalError
-
+
interval-type:
'?' n expr-cont doNGInterval # {n,m}?
'+' n expr-cont doPossessiveInterval # {n,m}+
default expr-cont doInterval # {m,n}
-
-
+
+
#
# backslash # Backslash. Figure out which of the \thingies we have encountered.
# The low level next-char function will have preprocessed
@@ -239,7 +237,7 @@ backslash:
'd' n expr-quant doBackslashd
'D' n expr-quant doBackslashD
'G' n term doBackslashG
- 'N' expr-quant doProperty # \N{NAME} named char
+ 'N' expr-quant doNamedChar # \N{NAME} named char
'p' expr-quant doProperty # \p{Lu} style property
'P' expr-quant doProperty
'Q' n term doEnterQuoteMode
@@ -250,11 +248,210 @@ backslash:
'X' n expr-quant doBackslashX
'Z' n term doBackslashZ
'z' n term doBackslashz
- digit_char n expr-quant doBackRef # Will scan multiple digits
+ digit_char n expr-quant doBackRef # Will scan multiple digits
eof errorDeath doEscapeError
- default n expr-quant doLiteralChar # Escaped literal char.
+ default n expr-quant doEscapedLiteralChar
+
+
+#
+# [set expression] parsing,
+# All states involved in parsing set expressions have names beginning with "set-"
+#
+
+set-open:
+ '^' n set-open2 doSetNegate
+ ':' set-posix doSetPosixProp
+ default set-open2
+
+set-open2:
+ ']' n set-after-lit doSetLiteral
+ default set-start
+
+# set-posix:
+# scanned a '[:' If it really is a [:property:], doSetPosixProp will have
+# moved the scan to the closing ']'. If it wasn't a property
+# expression, the scan will still be at the opening ':', which should
+# be interpreted as a normal set expression.
+set-posix:
+ ']' n pop doSetEnd
+ ':' set-start
+ default errorDeath doRuleError # should not be possible.
+
+#
+# set-start after the [ and special case leading characters (^ and/or ]) but before
+# everything else. A '-' is literal at this point.
+#
+set-start:
+ ']' n pop doSetEnd
+ '[' n set-open ^set-after-set doSetBeginUnion
+ '\' n set-escape
+ '-' n set-start-dash
+ '&' n set-start-amp
+ default n set-after-lit doSetLiteral
+
+# set-start-dash Turn "[--" into a syntax error.
+# "[-x" is good, - and x are literals.
+#
+set-start-dash:
+ '-' errorDeath doRuleError
+ default set-after-lit doSetAddDash
+
+# set-start-amp Turn "[&&" into a syntax error.
+# "[&x" is good, & and x are literals.
+#
+set-start-amp:
+ '&' errorDeath doRuleError
+ default set-after-lit doSetAddAmp
+
+#
+# set-after-lit The last thing scanned was a literal character within a set.
+# Can be followed by anything. Single '-' or '&' are
+# literals in this context, not operators.
+set-after-lit:
+ ']' n pop doSetEnd
+ '[' n set-open ^set-after-set doSetBeginUnion
+ '-' n set-lit-dash
+ '&' n set-lit-amp
+ '\' n set-escape
+ eof errorDeath doSetNoCloseError
+ default n set-after-lit doSetLiteral
+
+set-after-set:
+ ']' n pop doSetEnd
+ '[' n set-open ^set-after-set doSetBeginUnion
+ '-' n set-set-dash
+ '&' n set-set-amp
+ '\' n set-escape
+ eof errorDeath doSetNoCloseError
+ default n set-after-lit doSetLiteral
+
+set-after-range:
+ ']' n pop doSetEnd
+ '[' n set-open ^set-after-set doSetBeginUnion
+ '-' n set-range-dash
+ '&' n set-range-amp
+ '\' n set-escape
+ eof errorDeath doSetNoCloseError
+ default n set-after-lit doSetLiteral
+
+# set-after-op
+# After a -- or &&
+# It is an error to close a set at this point.
+#
+set-after-op:
+ '[' n set-open ^set-after-set doSetBeginUnion
+ ']' errorDeath doSetOpError
+ '\' n set-escape
+ default n set-after-lit doSetLiteral
+
+#
+# set-set-amp
+# Have scanned [[set]&
+# Could be a '&' intersection operator, if a set follows.
+# Could be the start of a '&&' operator.
+# Otherwise it is a literal.
+set-set-amp:
+ '[' n set-open ^set-after-set doSetBeginIntersection1
+ '&' n set-after-op doSetIntersection2
+ default set-after-lit doSetAddAmp
+
+
+# set-lit-amp Have scanned "[literals&"
+# Could be a start of "&&" operator or a literal
+# In [abc&[def]], the '&' is a literal
+#
+set-lit-amp:
+ '&' n set-after-op doSetIntersection2
+ default set-after-lit doSetAddAmp
+
+
+#
+# set-set-dash
+# Have scanned [set]-
+# Could be a '-' difference operator, if a [set] follows.
+# Could be the start of a '--' operator.
+# Otherwise it is a literal.
+set-set-dash:
+ '[' n set-open ^set-after-set doSetBeginDifference1
+ '-' n set-after-op doSetDifference2
+ default set-after-lit doSetAddDash
+
+
+#
+# set-range-dash
+# scanned a-b- or \w-
+# any set or range like item where the trailing single '-' should
+# be literal, not a set difference operation.
+# A trailing "--" is still a difference operator.
+set-range-dash:
+ '-' n set-after-op doSetDifference2
+ default set-after-lit doSetAddDash
+
+
+set-range-amp:
+ '&' n set-after-op doSetIntersection2
+ default set-after-lit doSetAddAmp
+
+
+# set-lit-dash
+# Have scanned "[literals-" Could be a range or a -- operator or a literal
+# In [abc-[def]], the '-' is a literal (confirmed with a Java test)
+# [abc-\p{xx} the '-' is a literal
+# [abc-] the '-' is a literal
+# [ab-xy] the '-' is a range
+#
+set-lit-dash:
+ '-' n set-after-op doSetDifference2
+ '[' set-after-lit doSetAddDash
+ ']' set-after-lit doSetAddDash
+ '\' n set-lit-dash-escape
+ default n set-after-range doSetRange
+
+# set-lit-dash-escape
+#
+# scanned "[literal-\"
+# Could be a range, if the \ introduces an escaped literal char or a named char.
+# Could be a literal '-', if the '\' introduces a set-like construct e.g. \s or \p{...}
+#
+set-lit-dash-escape:
+ 's' set-escape doSetAddDash
+ 'S' set-escape doSetAddDash
+ 'w' set-escape doSetAddDash
+ 'W' set-escape doSetAddDash
+ 'd' set-escape doSetAddDash
+ 'D' set-escape doSetAddDash
+ 'N' set-after-range doSetNamedRange
+ default n set-after-range doSetRange
+
+
+#
+# set-escape
+# Common \ escape processing
+#
+set-escape:
+ 'p' set-after-set doSetProp
+ 'P' set-after-set doSetProp
+ 'N' set-after-lit doSetNamedChar
+ 's' n set-after-range doSetBackslash_s
+ 'S' n set-after-range doSetBackslash_S
+ 'w' n set-after-range doSetBackslash_w
+ 'W' n set-after-range doSetBackslash_W
+ 'd' n set-after-range doSetBackslash_d
+ 'D' n set-after-range doSetBackslash_D
+ default n set-after-lit doSetLiteralEscaped
+
+#
+# set-finish
+# Have just encountered the final ']' that completes a [set], and
+# arrived here via a pop. From here, we exit the set parsing world, and go
+# back to generic regular expression parsing.
+#
+set-finish:
+ default expr-quant doSetFinish
+
+
#
# errorDeath. This state is specified as the next state whenever a syntax error
# in the source rules is detected. Barring bugs, the state machine will never
diff --git a/i18n/regeximp.h b/i18n/regeximp.h
index fbf70067..6944c08a 100644
--- a/i18n/regeximp.h
+++ b/i18n/regeximp.h
@@ -1,6 +1,6 @@
-//
-// Copyright (C) 2002-2005 International Business Machines Corporation
-// and others. All rights reserved.
+//
+// Copyright (C) 2002-2005 International Business Machines Corporation
+// and others. All rights reserved.
//
// file: regeximp.h
//
@@ -66,16 +66,16 @@ enum {
URX_NOP = 7,
URX_START_CAPTURE = 8, // Value field is capture group number.
URX_END_CAPTURE = 9, // Value field is capture group number
- URX_STATIC_SETREF = 10, // Value field is index of set in array of sets.
+ URX_STATIC_SETREF = 10, // Value field is index of set in array of sets.
URX_SETREF = 11, // Value field is index of set in array of sets.
- URX_DOTANY = 12,
+ URX_DOTANY = 12,
URX_JMP = 13, // Value field is destination position in
// the pattern.
URX_FAIL = 14, // Stop match operation, No match.
URX_JMP_SAV = 15, // Operand: JMP destination location
URX_BACKSLASH_B = 16, // Value field: 0: \b 1: \B
- URX_BACKSLASH_G = 17,
+ URX_BACKSLASH_G = 17,
URX_JMP_SAV_X = 18, // Conditional JMP_SAV,
// Used in (x)+, breaks loop on zero length match.
// Operand: Jmp destination.
@@ -91,7 +91,7 @@ enum {
URX_CTR_INIT_NG = 26, // 3 kinds, normal, non-greedy, and possessive.
// These are 4 word opcodes. See description.
// First Operand: Data loc of counter variable
- // 2nd Operand: Pat loc of the URX_CTR_LOOPx
+ // 2nd Operand: Pat loc of the URX_CTR_LOOPx
// at the end of the loop.
// 3rd Operand: Minimum count.
// 4th Operand: Max count, -1 for unbounded.
@@ -118,7 +118,7 @@ enum {
// within the matcher stack frame.
URX_JMPX = 36, // Conditional JMP.
// First Operand: JMP target location.
- // Second Operand: Data location containing an
+ // Second Operand: Data location containing an
// input position. If current input position ==
// saved input position, FAIL rather than taking
// the JMP
@@ -157,7 +157,7 @@ enum {
URX_LBN_END = 48, // Negative LookBehind end
// Parameter is the data location.
// Check that the match ended at the right spot.
- URX_STAT_SETREF_N = 49, // Reference to a prebuilt set (e.g. \w), negated
+ URX_STAT_SETREF_N = 49, // Reference to a prebuilt set (e.g. \w), negated
// Operand is index of set in array of sets.
URX_LOOP_SR_I = 50, // Init a [set]* loop.
// Operand is the sets index in array of user sets.
@@ -171,7 +171,7 @@ enum {
URX_BACKSLASH_BU = 53 // \b or \B in UREGEX_UWORD mode, using Unicode style
// word boundaries.
-};
+};
// Keep this list of opcode names in sync with the above enum
// Used for debug printing only.
@@ -236,14 +236,14 @@ enum {
// Convenience macros for assembling and disassembling a compiled operation.
//
#define URX_BUILD(type, val) (int32_t)((type << 24) | (val))
-#define URX_TYPE(x) ((uint32_t)(x) >> 24)
+#define URX_TYPE(x) ((uint32_t)(x) >> 24)
#define URX_VAL(x) ((x) & 0xffffff)
-
+
//
// Access to Unicode Sets composite character properties
// The sets are accessed by the match engine for things like \w (word boundary)
-//
+//
enum {
URX_ISWORD_SET = 1,
URX_ISALNUM_SET = 2,
@@ -297,7 +297,7 @@ enum StartOfMatch {
(v)==START_LINE? "START_LINE" : \
(v)==START_STRING? "START_STRING" : \
"ILLEGAL")
-
+
//
// 8 bit set, to fast-path latin-1 set membership tests.
diff --git a/i18n/regexst.cpp b/i18n/regexst.cpp
index 41014365..d624766b 100644
--- a/i18n/regexst.cpp
+++ b/i18n/regexst.cpp
@@ -59,9 +59,6 @@ static const UChar gRuleSet_rule_char_pattern[] = {
static const UChar gRuleSet_digit_char_pattern[] = {
// [ 0 - 9 ]
0x5b, 0x30, 0x2d, 0x39, 0x5d, 0};
-//static const UnicodeSet *gRuleDigits = NULL;
-
-
//
// Here are the backslash escape characters that ICU's unescape() function
@@ -73,23 +70,13 @@ static const UChar gUnescapeCharPattern[] = {
//
-// White space characters that may appear within a pattern in free-form mode
-//
-static const UChar gRuleWhiteSpacePattern[] = {
- /* "[[:Cf:][:WSpace:]]" */
- 91, 91, 58, 67, 102, 58, 93, 91, 58, 87,
- 83, 112, 97, 99, 101, 58, 93, 93, 0 };
-
-
-
-//
// Unicode Set Definitions for Regular Expression \w
//
static const UChar gIsWordPattern[] = {
// [ \ p { A l p h a b e t i c }
0x5b, 0x5c, 0x70, 0x7b, 0x61, 0x6c, 0x70, 0x68, 0x61, 0x62, 0x65, 0x74, 0x69, 0x63, 0x7d,
// \ p { M } Mark
- 0x5c, 0x70, 0x7b, 0x4d, 0x7d,
+ 0x5c, 0x70, 0x7b, 0x4d, 0x7d,
// \ p { N d } Digit_Numeric
0x5c, 0x70, 0x7b, 0x4e, 0x64, 0x7d,
// \ p { P c } ] Connector_Punctuation
@@ -108,8 +95,8 @@ static const UChar gIsSpacePattern[] = {
// UnicodeSets used in implementation of Grapheme Cluster detection, \X
//
static const UChar gGC_ControlPattern[] = {
-// [ [ : Z l : ] [ : Z p : ]
- 0x5b, 0x5b, 0x3a, 0x5A, 0x6c, 0x3a, 0x5d, 0x5b, 0x3a, 0x5A, 0x70, 0x3a, 0x5d,
+// [ [ : Z l : ] [ : Z p : ]
+ 0x5b, 0x5b, 0x3a, 0x5A, 0x6c, 0x3a, 0x5d, 0x5b, 0x3a, 0x5A, 0x70, 0x3a, 0x5d,
// [ : C c : ] [ : C f : ] -
0x5b, 0x3a, 0x43, 0x63, 0x3a, 0x5d, 0x5b, 0x3a, 0x43, 0x66, 0x3a, 0x5d, 0x2d,
// [ : G r a p h e m e _
@@ -124,34 +111,35 @@ static const UChar gGC_ExtendPattern[] = {
0x45, 0x78, 0x74, 0x65, 0x6e, 0x64, 0x7d, 0x5d, 0};
static const UChar gGC_LPattern[] = {
-// [ \ p { H a n g u l _ S y l
+// [ \ p { H a n g u l _ S y l
0x5b, 0x5c, 0x70, 0x7b, 0x48, 0x61, 0x6e, 0x67, 0x75, 0x6c, 0x5f, 0x53, 0x79, 0x6c,
// l a b l e _ T y p e = L } ]
- 0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x4c, 0x7d, 0x5d, 0};
+ 0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x4c, 0x7d, 0x5d, 0};
static const UChar gGC_VPattern[] = {
-// [ \ p { H a n g u l _ S y l
+// [ \ p { H a n g u l _ S y l
0x5b, 0x5c, 0x70, 0x7b, 0x48, 0x61, 0x6e, 0x67, 0x75, 0x6c, 0x5f, 0x53, 0x79, 0x6c,
// l a b l e _ T y p e = V } ]
- 0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x56, 0x7d, 0x5d, 0};
+ 0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x56, 0x7d, 0x5d, 0};
static const UChar gGC_TPattern[] = {
-// [ \ p { H a n g u l _ S y l
+// [ \ p { H a n g u l _ S y l
0x5b, 0x5c, 0x70, 0x7b, 0x48, 0x61, 0x6e, 0x67, 0x75, 0x6c, 0x5f, 0x53, 0x79, 0x6c,
// l a b l e _ T y p e = T } ]
- 0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x54, 0x7d, 0x5d, 0};
+ 0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x54, 0x7d, 0x5d, 0};
static const UChar gGC_LVPattern[] = {
-// [ \ p { H a n g u l _ S y l
+// [ \ p { H a n g u l _ S y l
0x5b, 0x5c, 0x70, 0x7b, 0x48, 0x61, 0x6e, 0x67, 0x75, 0x6c, 0x5f, 0x53, 0x79, 0x6c,
// l a b l e _ T y p e = L V } ]
- 0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x4c, 0x56, 0x7d, 0x5d, 0};
+ 0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x4c, 0x56, 0x7d, 0x5d, 0};
static const UChar gGC_LVTPattern[] = {
-// [ \ p { H a n g u l _ S y l
+// [ \ p { H a n g u l _ S y l
0x5b, 0x5c, 0x70, 0x7b, 0x48, 0x61, 0x6e, 0x67, 0x75, 0x6c, 0x5f, 0x53, 0x79, 0x6c,
// l a b l e _ T y p e = L V T } ]
- 0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x4c, 0x56, 0x54, 0x7d, 0x5d, 0};
+ 0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x4c, 0x56, 0x54, 0x7d, 0x5d, 0};
+
RegexStaticSets *RegexStaticSets::gStaticSets = NULL;
@@ -160,7 +148,7 @@ RegexStaticSets::RegexStaticSets(UErrorCode *status)
fUnescapeCharSet(UnicodeString(TRUE, gUnescapeCharPattern, -1), *status),
fRuleDigitsAlias(NULL)
{
- // First zero out everything
+ // First zero out everything
int i;
for (i=0; i<URX_LAST_SET; i++) {
fPropSets[i] = NULL;
@@ -171,7 +159,7 @@ fRuleDigitsAlias(NULL)
// Then init the sets to their correct values.
fPropSets[URX_ISWORD_SET] = new UnicodeSet(UnicodeString(TRUE, gIsWordPattern, -1), *status);
- fPropSets[URX_ISSPACE_SET] = new UnicodeSet(UnicodeString(TRUE, gIsSpacePattern, -1), *status);
+ fPropSets[URX_ISSPACE_SET] = new UnicodeSet(UnicodeString(TRUE, gIsSpacePattern, -1), *status);
fPropSets[URX_GC_EXTEND] = new UnicodeSet(UnicodeString(TRUE, gGC_ExtendPattern, -1), *status);
fPropSets[URX_GC_CONTROL] = new UnicodeSet(UnicodeString(TRUE, gGC_ControlPattern, -1), *status);
fPropSets[URX_GC_L] = new UnicodeSet(UnicodeString(TRUE, gGC_LPattern, -1), *status);
@@ -184,14 +172,14 @@ fRuleDigitsAlias(NULL)
// The rest of the initialization needs them, so we cannot proceed.
return;
}
-
-
+
+
//
// The following sets are dynamically constructed, because their
// initialization strings would be unreasonable.
//
-
-
+
+
//
// "Normal" is the set of characters that don't need special handling
// when finding grapheme cluster boundaries.
@@ -202,7 +190,7 @@ fRuleDigitsAlias(NULL)
fPropSets[URX_GC_NORMAL]->removeAll(*fPropSets[URX_GC_L]);
fPropSets[URX_GC_NORMAL]->removeAll(*fPropSets[URX_GC_V]);
fPropSets[URX_GC_NORMAL]->removeAll(*fPropSets[URX_GC_T]);
-
+
// Initialize the 8-bit fast bit sets from the parallel full
// UnicodeSets.
for (i=0; i<URX_LAST_SET; i++) {
@@ -213,9 +201,8 @@ fRuleDigitsAlias(NULL)
}
// Sets used while parsing rules, but not referenced from the parse state table
- fRuleSets[kRuleSet_rule_char-128] = new UnicodeSet(UnicodeString(TRUE, gRuleSet_rule_char_pattern, -1), *status);
- fRuleSets[kRuleSet_white_space-128] = new UnicodeSet(UnicodeString(TRUE, gRuleWhiteSpacePattern, -1), *status);
- fRuleSets[kRuleSet_digit_char-128] = new UnicodeSet(UnicodeString(TRUE, gRuleSet_digit_char_pattern, -1), *status);
+ fRuleSets[kRuleSet_rule_char-128] = new UnicodeSet(UnicodeString(TRUE, gRuleSet_rule_char_pattern, -1), *status);
+ fRuleSets[kRuleSet_digit_char-128] = new UnicodeSet(UnicodeString(TRUE, gRuleSet_digit_char_pattern, -1), *status);
fRuleDigitsAlias = fRuleSets[kRuleSet_digit_char-128];
for (i=0; i<(int32_t)(sizeof(fRuleSets)/sizeof(fRuleSets[0])); i++) {
if (fRuleSets[i]) {
@@ -281,7 +268,7 @@ void RegexStaticSets::initGlobals(UErrorCode *status) {
ucln_i18n_registerCleanup(UCLN_I18N_REGEX, regex_cleanup);
}
}
-
+
U_NAMESPACE_END
#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
diff --git a/i18n/repattrn.cpp b/i18n/repattrn.cpp
index bcd11078..8cf55d7b 100644
--- a/i18n/repattrn.cpp
+++ b/i18n/repattrn.cpp
@@ -1,5 +1,5 @@
//
-// file: repattrn.cpp
+// file: repattrn.cpp
//
/*
***************************************************************************
@@ -46,7 +46,7 @@ RegexPattern::RegexPattern() {
//
//--------------------------------------------------------------------------
RegexPattern::RegexPattern(const RegexPattern &other) : UObject(other) {
- init();
+ init();
*this = other;
}
@@ -78,9 +78,9 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
fFrameSize = other.fFrameSize;
fDataSize = other.fDataSize;
fMaxCaptureDigits = other.fMaxCaptureDigits;
- fStaticSets = other.fStaticSets;
+ fStaticSets = other.fStaticSets;
fStaticSets8 = other.fStaticSets8;
-
+
fStartType = other.fStartType;
fInitialStringIdx = other.fInitialStringIdx;
fInitialStringLen = other.fInitialStringLen;
@@ -92,9 +92,9 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
fCompiledPat->assign(*other.fCompiledPat, fDeferredStatus);
fGroupMap->assign(*other.fGroupMap, fDeferredStatus);
- // Copy the Unicode Sets.
+ // Copy the Unicode Sets.
// Could be made more efficient if the sets were reference counted and shared,
- // but I doubt that pattern copying will be particularly common.
+ // but I doubt that pattern copying will be particularly common.
// Note: init() already added an empty element zero to fSets
int32_t i;
int32_t numSets = other.fSets->size();
@@ -135,7 +135,7 @@ void RegexPattern::init() {
fFrameSize = 0;
fDataSize = 0;
fGroupMap = NULL;
- fMaxCaptureDigits = 1;
+ fMaxCaptureDigits = 1;
fStaticSets = NULL;
fStaticSets8 = NULL;
fStartType = START_NO_INFO;
@@ -144,7 +144,7 @@ void RegexPattern::init() {
fInitialChars = NULL;
fInitialChar = 0;
fInitialChars8 = NULL;
-
+
fCompiledPat = new UVector32(fDeferredStatus);
fGroupMap = new UVector32(fDeferredStatus);
fSets = new UVector(fDeferredStatus);
@@ -166,7 +166,7 @@ void RegexPattern::init() {
//--------------------------------------------------------------------------
//
-// zap Delete everything owned by this RegexPattern.
+// zap Delete everything owned by this RegexPattern.
//
//--------------------------------------------------------------------------
void RegexPattern::zap() {
@@ -208,7 +208,7 @@ RegexPattern::~RegexPattern() {
// Clone
//
//--------------------------------------------------------------------------
-RegexPattern *RegexPattern::clone() const {
+RegexPattern *RegexPattern::clone() const {
RegexPattern *copy = new RegexPattern(*this);
return copy;
}
@@ -229,7 +229,7 @@ UBool RegexPattern::operator ==(const RegexPattern &other) const {
//---------------------------------------------------------------------
//
-// compile
+// compile
//
//---------------------------------------------------------------------
RegexPattern * U_EXPORT2
@@ -244,7 +244,7 @@ RegexPattern::compile(const UnicodeString &regex,
}
const uint32_t allFlags = UREGEX_CANON_EQ | UREGEX_CASE_INSENSITIVE | UREGEX_COMMENTS |
- UREGEX_DOTALL | UREGEX_MULTILINE | UREGEX_UWORD;
+ UREGEX_DOTALL | UREGEX_MULTILINE | UREGEX_UWORD | UREGEX_ERROR_ON_UNKNOWN_ESCAPES;
if ((flags & ~allFlags) != 0) {
status = U_REGEX_INVALID_FLAG;
@@ -269,19 +269,24 @@ RegexPattern::compile(const UnicodeString &regex,
RegexCompile compiler(This, status);
compiler.compile(regex, pe, status);
+
+ if (U_FAILURE(status)) {
+ delete This;
+ This = NULL;
+ }
return This;
}
-
+
//
// compile with default flags.
//
RegexPattern * U_EXPORT2
RegexPattern::compile(const UnicodeString &regex,
UParseError &pe,
- UErrorCode &err)
+ UErrorCode &err)
{
- return compile(regex, 0, pe, err);
+ return compile(regex, 0, pe, err);
}
@@ -292,10 +297,10 @@ RegexPattern::compile(const UnicodeString &regex,
RegexPattern * U_EXPORT2
RegexPattern::compile( const UnicodeString &regex,
uint32_t flags,
- UErrorCode &err)
+ UErrorCode &err)
{
UParseError pe;
- return compile(regex, flags, pe, err);
+ return compile(regex, flags, pe, err);
}
@@ -326,7 +331,7 @@ RegexMatcher *RegexPattern::matcher(const UnicodeString &input,
#if 0
RegexMatcher *RegexPattern::matcher(const UChar * /*input*/,
- UErrorCode &status) const
+ UErrorCode &status) const
{
/* This should never get called. The API with UnicodeString should be called instead. */
if (U_SUCCESS(status)) {
@@ -352,7 +357,7 @@ RegexMatcher *RegexPattern::matcher(UErrorCode &status) const {
return NULL;
}
- retMatcher = new RegexMatcher(this);
+ retMatcher = new RegexMatcher(this);
if (retMatcher == NULL) {
status = U_MEMORY_ALLOCATION_ERROR;
return NULL;
@@ -440,7 +445,7 @@ void RegexPattern::dumpOp(int32_t index) const {
if (pinnedType >= sizeof(opNames)/sizeof(char *)) {
pinnedType = 0;
}
-
+
REGEX_DUMP_DEBUG_PRINTF(("%4d %08x %-15s ", index, op, opNames[pinnedType]));
switch (type) {
case URX_NOP:
@@ -458,7 +463,7 @@ void RegexPattern::dumpOp(int32_t index) const {
case URX_CARET_M:
// Types with no operand field of interest.
break;
-
+
case URX_RESERVED_OP:
case URX_START_CAPTURE:
case URX_END_CAPTURE:
@@ -494,12 +499,12 @@ void RegexPattern::dumpOp(int32_t index) const {
// types with an integer operand field.
REGEX_DUMP_DEBUG_PRINTF(("%d", val));
break;
-
+
case URX_ONECHAR:
case URX_ONECHAR_I:
REGEX_DUMP_DEBUG_PRINTF(("%c", val<256?val:'?'));
break;
-
+
case URX_STRING:
case URX_STRING_I:
{
@@ -543,7 +548,7 @@ void RegexPattern::dumpOp(int32_t index) const {
}
break;
-
+
default:
REGEX_DUMP_DEBUG_PRINTF(("??????"));
break;
@@ -554,7 +559,7 @@ void RegexPattern::dumpOp(int32_t index) const {
#if defined(REGEX_DEBUG)
-U_CAPI void U_EXPORT2
+U_CAPI void U_EXPORT2
RegexPatternDump(const RegexPattern *This) {
int index;
int i;
@@ -565,7 +570,7 @@ RegexPatternDump(const RegexPattern *This) {
}
REGEX_DUMP_DEBUG_PRINTF(("\n"));
REGEX_DUMP_DEBUG_PRINTF((" Min Match Length: %d\n", This->fMinMatchLen));
- REGEX_DUMP_DEBUG_PRINTF((" Match Start Type: %s\n", START_OF_MATCH_STR(This->fStartType)));
+ REGEX_DUMP_DEBUG_PRINTF((" Match Start Type: %s\n", START_OF_MATCH_STR(This->fStartType)));
if (This->fStartType == START_STRING) {
REGEX_DUMP_DEBUG_PRINTF((" Initial match sting: \""));
for (i=This->fInitialStringIdx; i<This->fInitialStringIdx+This->fInitialStringLen; i++) {
@@ -580,7 +585,7 @@ RegexPatternDump(const RegexPattern *This) {
REGEX_DUMP_DEBUG_PRINTF((" Match First Chars : "));
for (i=0; i<numSetChars; i++) {
UChar32 c = This->fInitialChars->charAt(i);
- if (0x20<c && c <0x7e) {
+ if (0x20<c && c <0x7e) {
REGEX_DUMP_DEBUG_PRINTF(("%c ", c));
} else {
REGEX_DUMP_DEBUG_PRINTF(("%#x ", c));
diff --git a/i18n/unicode/regex.h b/i18n/unicode/regex.h
index ce24ef5f..27f4b404 100644
--- a/i18n/unicode/regex.h
+++ b/i18n/unicode/regex.h
@@ -16,7 +16,7 @@
#ifndef REGEX_H
#define REGEX_H
-//#define REGEX_DEBUG
+// #define REGEX_DEBUG
/**
* \file
@@ -36,7 +36,7 @@
* operations, for search and replace operations, and for obtaining detailed
* information about bounds of a match. </p>
*
- * <p>Note that by constructing <code>RegexMatcher</code> objects directly from regular
+ * <p>Note that by constructing <code>RegexMatcher</code> objects directly from regular
* expression pattern strings application code can be simplified and the explicit
* need for <code>RegexPattern</code> objects can usually be eliminated.
* </p>
@@ -480,7 +480,7 @@ public:
* critical that the string not be altered or deleted before use by the regular
* expression operations is complete.
*
- * @param regexp The Regular Expression to be compiled.
+ * @param regexp The Regular Expression to be compiled.
* @param input The string to match. The matcher retains a reference to the
* caller's string; no copy is made.
* @param flags Regular expression options, such as case insensitive matching.
@@ -709,13 +709,13 @@ public:
/**
* Resets this matcher with a new input string. This allows instances of RegexMatcher
* to be reused, which is more efficient than creating a new RegexMatcher for
- * each input string to be processed.
+ * each input string to be processed.
* @param input The new string on which subsequent pattern matches will operate.
* The matcher retains a reference to the caller's string, and operates
* directly on that. Ownership of the string remains with the caller.
* Because no copy of the string is made, it is essential that the
* caller not delete the string until after regexp operations on it
- * are done.
+ * are done.
* @return this RegexMatcher.
* @stable ICU 2.4
*/
diff --git a/i18n/unicode/uregex.h b/i18n/unicode/uregex.h
index 862cf344..3b0e7018 100644
--- a/i18n/unicode/uregex.h
+++ b/i18n/unicode/uregex.h
@@ -73,7 +73,17 @@ typedef enum URegexpFlag{
* http://unicode.org/reports/tr29/#Word_Boundaries
* @stable ICU 2.8
*/
- UREGEX_UWORD = 256
+ UREGEX_UWORD = 256,
+
+ /** Error on Unrecognized backslash escapes.
+ * If set, fail with an error on patterns that contain
+ * backslash-escaped ASCII letters without a known special
+ * meaning. If this flag is not set, these
+ * escaped letters represent themselves.
+ * @draft ICU 4.0
+ */
+ UREGEX_ERROR_ON_UNKNOWN_ESCAPES = 512
+
} URegexpFlag;
/**