diff options
author | Jean-Baptiste Queru <jbq@google.com> | 2009-07-17 17:40:43 -0700 |
---|---|---|
committer | Jean-Baptiste Queru <jbq@google.com> | 2009-07-17 17:40:43 -0700 |
commit | c0f3e2506e4cc62ff8c220fe72849728e9d6cecf (patch) | |
tree | 778334c2c002f3c4b016a8d4de106cdb8dc959d8 /i18n | |
parent | 6b13cbaafaffaeeaf0477e95816759728fcdb763 (diff) | |
download | icu4c-c0f3e2506e4cc62ff8c220fe72849728e9d6cecf.tar.gz |
import cl @40073
Diffstat (limited to 'i18n')
-rw-r--r-- | i18n/regexcmp.cpp | 993 | ||||
-rw-r--r-- | i18n/regexcmp.h | 40 | ||||
-rw-r--r-- | i18n/regexcst.h | 380 | ||||
-rw-r--r-- | i18n/regexcst.pl | 8 | ||||
-rw-r--r-- | i18n/regexcst.txt | 279 | ||||
-rw-r--r-- | i18n/regeximp.h | 28 | ||||
-rw-r--r-- | i18n/regexst.cpp | 61 | ||||
-rw-r--r-- | i18n/repattrn.cpp | 59 | ||||
-rw-r--r-- | i18n/unicode/regex.h | 10 | ||||
-rw-r--r-- | i18n/unicode/uregex.h | 12 |
10 files changed, 1428 insertions, 442 deletions
diff --git a/i18n/regexcmp.cpp b/i18n/regexcmp.cpp index 860333c0..7d0343e4 100644 --- a/i18n/regexcmp.cpp +++ b/i18n/regexcmp.cpp @@ -27,6 +27,7 @@ #include "uvectr32.h" #include "uassert.h" #include "ucln_in.h" +#include "uinvchar.h" #include "regeximp.h" #include "regexcst.h" // Contains state table for the regex pattern parser. @@ -38,7 +39,16 @@ U_NAMESPACE_BEGIN - +// TODO: remove +#if 0 +#include <stdio.h> +static void printstring(const UnicodeString &s) { + for (int i=0; i<s.length(); i++) { + printf("%c", s[i]); + } + printf("\n"); +} +#endif @@ -47,7 +57,8 @@ U_NAMESPACE_BEGIN // Constructor. // //------------------------------------------------------------------------------ -RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) : fParenStack(status) +RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) : + fParenStack(status), fSetStack(status), fSetOpStack(status) { fStatus = &status; @@ -71,6 +82,8 @@ RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) : fParenStack( } } +static const UChar chAmp = 0x26; // '&' +static const UChar chDash = 0x2d; // '-' //------------------------------------------------------------------------------ @@ -145,7 +158,7 @@ void RegexCompile::compile( // the search will stop there, if not before. // tableEl = &gRuleParseStateTable[state]; - REGEX_SCAN_DEBUG_PRINTF(("char, line, col = (\'%c\', %d, %d) state=%s ", + REGEX_SCAN_DEBUG_PRINTF(("char, line, col = (\'%c\', %d, %d) state=%s ", fC.fChar, fLineNum, fCharNum, RegexStateNames[state])); for (;;) { // loop through table rows belonging to this state, looking for one @@ -226,7 +239,7 @@ void RegexCompile::compile( // state stack underflow // This will occur if the user pattern has mis-matched parentheses, // with extra close parens. - // + // fStackPtr++; error(U_REGEX_MISMATCHED_PAREN); } @@ -234,6 +247,16 @@ void RegexCompile::compile( } + if (U_FAILURE(*fStatus)) { + // Bail out if the pattern had errors. 
+ // Set stack cleanup: a successful compile would have left it empty, + // but errors can leave temporary sets hanging around. + while (!fSetStack.empty()) { + delete (UnicodeSet *)fSetStack.pop(); + } + return; + } + // // The pattern has now been read and processed, and the compiled code generated. // @@ -288,8 +311,8 @@ void RegexCompile::compile( // // Optimization passes - // - matchStartType(); + // + matchStartType(); OptDotStar(); stripNOPs(); @@ -465,7 +488,7 @@ UBool RegexCompile::doParseActions(int32_t action) case doOpenAtomicParen: // Open Atomic Paren. (?> // Compile to a - // - NOP, which later may be replaced if the parenthesized group + // - NOP, which later may be replaced if the parenthesized group // has a quantifier, followed by // - STO_SP save state stack position, so it can be restored at the ")" // - NOP, which may later be replaced by a save-state if there @@ -500,11 +523,11 @@ UBool RegexCompile::doParseActions(int32_t action) // 3. NOP may be replaced if there is are '|' ops in the block. // 4. code for parenthesized stuff. // 5. ENDLA - // + // // Two data slots are reserved, for saving the stack ptr and the input position. { int32_t dataLoc = fRXPat->fDataSize; - fRXPat->fDataSize += 2; + fRXPat->fDataSize += 2; int32_t op = URX_BUILD(URX_LA_START, dataLoc); fRXPat->fCompiledPat->addElement(op, *fStatus); @@ -513,7 +536,7 @@ UBool RegexCompile::doParseActions(int32_t action) fRXPat->fCompiledPat->addElement(op, *fStatus); // On the Parentheses stack, start a new frame and add the postions - // of the NOPs. + // of the NOPs. fParenStack.push(fModeFlags, *fStatus); // Match mode state fParenStack.push(lookAhead, *fStatus); // Frame type. fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location @@ -534,7 +557,7 @@ UBool RegexCompile::doParseActions(int32_t action) // 7. ... 
{ int32_t dataLoc = fRXPat->fDataSize; - fRXPat->fDataSize += 2; + fRXPat->fDataSize += 2; int32_t op = URX_BUILD(URX_LA_START, dataLoc); fRXPat->fCompiledPat->addElement(op, *fStatus); @@ -545,12 +568,12 @@ UBool RegexCompile::doParseActions(int32_t action) fRXPat->fCompiledPat->addElement(op, *fStatus); // On the Parentheses stack, start a new frame and add the postions - // of the StateSave and NOP. + // of the StateSave and NOP. fParenStack.push(fModeFlags, *fStatus); // Match mode state fParenStack.push( negLookAhead, *fStatus); // Frame type fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The STATE_SAVE location fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP location - + // Instructions #5 and #6 will be added when the ')' is encountered. } break; @@ -574,34 +597,34 @@ UBool RegexCompile::doParseActions(int32_t action) // 0: Stack ptr on entry // 1: Input Index on entry // 2: Start index of match current match attempt. - // 3: Original Input String len. + // 3: Original Input String len. // Allocate data space int32_t dataLoc = fRXPat->fDataSize; - fRXPat->fDataSize += 4; - + fRXPat->fDataSize += 4; + // Emit URX_LB_START int32_t op = URX_BUILD(URX_LB_START, dataLoc); fRXPat->fCompiledPat->addElement(op, *fStatus); - + // Emit URX_LB_CONT op = URX_BUILD(URX_LB_CONT, dataLoc); fRXPat->fCompiledPat->addElement(op, *fStatus); fRXPat->fCompiledPat->addElement(0, *fStatus); // MinMatchLength. To be filled later. fRXPat->fCompiledPat->addElement(0, *fStatus); // MaxMatchLength. To be filled later. - + // Emit the NOP op = URX_BUILD(URX_NOP, 0); fRXPat->fCompiledPat->addElement(op, *fStatus); fRXPat->fCompiledPat->addElement(op, *fStatus); - + // On the Parentheses stack, start a new frame and add the postions - // of the URX_LB_CONT and the NOP. + // of the URX_LB_CONT and the NOP. 
fParenStack.push(fModeFlags, *fStatus); // Match mode state fParenStack.push(lookBehind, *fStatus); // Frame type fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The 2nd NOP location - + // The final two instructions will be added when the ')' is encountered. } @@ -627,35 +650,35 @@ UBool RegexCompile::doParseActions(int32_t action) // 0: Stack ptr on entry // 1: Input Index on entry // 2: Start index of match current match attempt. - // 3: Original Input String len. + // 3: Original Input String len. // Allocate data space int32_t dataLoc = fRXPat->fDataSize; - fRXPat->fDataSize += 4; - + fRXPat->fDataSize += 4; + // Emit URX_LB_START int32_t op = URX_BUILD(URX_LB_START, dataLoc); fRXPat->fCompiledPat->addElement(op, *fStatus); - + // Emit URX_LBN_CONT op = URX_BUILD(URX_LBN_CONT, dataLoc); fRXPat->fCompiledPat->addElement(op, *fStatus); fRXPat->fCompiledPat->addElement(0, *fStatus); // MinMatchLength. To be filled later. fRXPat->fCompiledPat->addElement(0, *fStatus); // MaxMatchLength. To be filled later. fRXPat->fCompiledPat->addElement(0, *fStatus); // Continue Loc. To be filled later. - + // Emit the NOP op = URX_BUILD(URX_NOP, 0); fRXPat->fCompiledPat->addElement(op, *fStatus); fRXPat->fCompiledPat->addElement(op, *fStatus); - + // On the Parentheses stack, start a new frame and add the postions - // of the URX_LB_CONT and the NOP. + // of the URX_LB_CONT and the NOP. fParenStack.push(fModeFlags, *fStatus); // Match mode state fParenStack.push(lookBehindN, *fStatus); // Frame type fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The 2nd NOP location - + // The final two instructions will be added when the ')' is encountered. } break; @@ -834,7 +857,7 @@ UBool RegexCompile::doParseActions(int32_t action) // 2. LOOP_C stack location // ... 
// - // Or if this is a .* + // Or if this is a .* // 1. LOOP_DOT_I (. matches all mode flag) // 2. LOOP_C stack location // @@ -855,7 +878,7 @@ UBool RegexCompile::doParseActions(int32_t action) int32_t repeatedOp = fRXPat->fCompiledPat->elementAti(topLoc); if (URX_TYPE(repeatedOp) == URX_SETREF) { - // Emit optimized code for a [char set]* + // Emit optimized code for a [char set]* int32_t loopOpI = URX_BUILD(URX_LOOP_SR_I, URX_VAL(repeatedOp)); fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc); dataLoc = fRXPat->fFrameSize; @@ -899,7 +922,7 @@ UBool RegexCompile::doParseActions(int32_t action) fRXPat->fCompiledPat->setElementAt(op, saveStateLoc+1); jmpOp = URX_BUILD(URX_JMP_SAV_X, saveStateLoc+2); } - + // Locate the position in the compiled pattern where the match will continue // after completing the *. (4 or 5 in the comment above) int32_t continueLoc = fRXPat->fCompiledPat->size()+1; @@ -982,8 +1005,8 @@ UBool RegexCompile::doParseActions(int32_t action) // Finished scanning a Possessive {lower,upper}+ interval. Generate the code for it. { // Remember the loc for the top of the block being looped over. - // (Can not reserve a slot in the compiled pattern at this time, becuase - // compileInterval needs to reserve also, and blockTopLoc can only reserve + // (Can not reserve a slot in the compiled pattern at this time, becuase + // compileInterval needs to reserve also, and blockTopLoc can only reserve // once per block.) int32_t topLoc = blockTopLoc(FALSE); @@ -1022,11 +1045,22 @@ UBool RegexCompile::doParseActions(int32_t action) break; case doLiteralChar: - // We've just scanned a "normal" character from the pattern, + // We've just scanned a "normal" character from the pattern, literalChar(fC.fChar); break; + case doEscapedLiteralChar: + // We've just scanned an backslashed escaped character with no + // special meaning. It represents itself. 
+ if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 && + ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) || // in [A-Z] + (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) { // in [a-z] + error(U_REGEX_BAD_ESCAPE_SEQUENCE); + } + literalChar(fC.fChar); + break; + case doDotAny: // scanned a ".", match any single character. @@ -1041,7 +1075,7 @@ UBool RegexCompile::doParseActions(int32_t action) } break; - case doCaret: + case doCaret: { int32_t op = (fModeFlags & UREGEX_MULTILINE)? URX_CARET_M : URX_CARET; fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus); @@ -1049,7 +1083,7 @@ UBool RegexCompile::doParseActions(int32_t action) break; - case doDollar: + case doDollar: { int32_t op = (fModeFlags & UREGEX_MULTILINE)? URX_DOLLAR_M : URX_DOLLAR; fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus); @@ -1144,18 +1178,13 @@ UBool RegexCompile::doParseActions(int32_t action) } break; - - case doScanUnicodeSet: + case doNamedChar: { - UnicodeSet *theSet = scanSet(); - compileSet(theSet); + UChar32 c = scanNamedChar(); + literalChar(c); } break; - - case doEnterQuoteMode: - // Just scanned a \Q. Put character scanner into quote mode. - fQuoteMode = TRUE; - break; + case doBackRef: // BackReference. Somewhat unusual in that the front-end can not completely parse @@ -1182,7 +1211,7 @@ UBool RegexCompile::doParseActions(int32_t action) } // Scan of the back reference in the source regexp is complete. Now generate - // the compiled code for it. + // the compiled code for it. // Because capture groups can be forward-referenced by back-references, // we fill the operand with the capture group number. At the end // of compilation, it will be changed to the variable's location. 
@@ -1222,7 +1251,7 @@ UBool RegexCompile::doParseActions(int32_t action) // Emit the STATE_SAVE op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+2); fRXPat->fCompiledPat->addElement(op, *fStatus); - + // Emit the JMP op = URX_BUILD(URX_JMP, topLoc+1); fRXPat->fCompiledPat->addElement(op, *fStatus); @@ -1259,7 +1288,7 @@ UBool RegexCompile::doParseActions(int32_t action) op = URX_BUILD(URX_STATE_SAVE, L7); fRXPat->fCompiledPat->setElementAt(op, topLoc+1); - // Append the JMP operation. + // Append the JMP operation. op = URX_BUILD(URX_JMP, topLoc+1); fRXPat->fCompiledPat->addElement(op, *fStatus); @@ -1335,8 +1364,8 @@ UBool RegexCompile::doParseActions(int32_t action) fModeFlags = fNewModeFlags; // Prevent any string from spanning across the change of match mode. - // Otherwise the pattern "abc(?i)def" would make a single string of "abcdef" - fixLiterals(); + // Otherwise the pattern "abc(?i)def" would make a single string of "abcdef" + fixLiterals(); break; @@ -1379,6 +1408,272 @@ UBool RegexCompile::doParseActions(int32_t action) break; + case doSetAddAmp: + { + UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); + set->add(chAmp); + } + break; + + case doSetAddDash: + { + UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); + set->add(chDash); + } + break; + + case doSetBackslash_s: + { + UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); + set->addAll(*RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]); + break; + } + + case doSetBackslash_S: + { + UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); + UnicodeSet SSet(*RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]); + SSet.complement(); + set->addAll(SSet); + break; + } + + case doSetBackslash_d: + { + UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); + UnicodeSet digits(UnicodeString("\\p{Nd}"), *fStatus); // TODO - make a static set, + set->addAll(digits); + break; + } + + case doSetBackslash_D: + { + UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); + UnicodeSet 
digits(UnicodeString("\\P{Nd}"), *fStatus); // TODO - make a static set, + set->addAll(digits); + break; + } + + case doSetBackslash_w: + { + UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); + set->addAll(*RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]); + break; + } + + case doSetBackslash_W: + { + UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); + UnicodeSet SSet(*RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]); + SSet.complement(); + set->addAll(SSet); + break; + } + + case doSetBegin: + fSetStack.push(new UnicodeSet(), *fStatus); + fSetOpStack.push(setStart, *fStatus); + if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) { + fSetOpStack.push(setCaseClose, *fStatus); + } + break; + + case doSetBeginDifference1: + // We have scanned something like [[abc]-[ + // Set up a new UnicodeSet for the set beginning with the just-scanned '[' + // Push a Difference operator, which will cause the new set to be subtracted from what + // went before once it is created. + setPushOp(setDifference1); + fSetOpStack.push(setStart, *fStatus); + if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) { + fSetOpStack.push(setCaseClose, *fStatus); + } + break; + + case doSetBeginIntersection1: + // We have scanned something like [[abc]&[ + // Need both the '&' operator and the open '[' operator. + setPushOp(setIntersection1); + fSetOpStack.push(setStart, *fStatus); + if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) { + fSetOpStack.push(setCaseClose, *fStatus); + } + break; + + case doSetBeginUnion: + // We have scanned something like [[abc][ + // Need to handle the union operation explicitly [[abc] | [ + setPushOp(setUnion); + fSetOpStack.push(setStart, *fStatus); + if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) { + fSetOpStack.push(setCaseClose, *fStatus); + } + break; + + case doSetDifference2: + // We have scanned something like [abc-- + // Consider this to unambiguously be a set difference operator. 
+ setPushOp(setDifference2); + break; + + case doSetEnd: + // Have encountered the ']' that closes a set. + // Force the evaluation of any pending operations within this set, + // leave the completed set on the top of the set stack. + { + setEval(setEnd); + int32_t setOp = fSetOpStack.popi(); + U_ASSERT(setOp==setStart); + break; + } + + case doSetFinish: + { + // Finished a complete set expression, including all nested sets. + // The close bracket has already triggered clearing out pending set operators, + // the operator stack should be empty and the operand stack should have just + // one entry, the result set. + U_ASSERT(fSetOpStack.empty()); + UnicodeSet *theSet = (UnicodeSet *)fSetStack.pop(); + U_ASSERT(fSetStack.empty()); + compileSet(theSet); + break; + } + + case doSetIntersection2: + // Have scanned something like [abc&& + setPushOp(setIntersection2); + break; + + case doSetLiteral: + // Union the just-scanned literal character into the set being built. + // This operation is the highest precedence set operation, so we can always do + // it immediately, without waiting to see what follows. It is necessary to perform + // any pending '-' or '&' operation first, because these have the same precedence + // as union-ing in a literal' + { + setEval(setUnion); + UnicodeSet *s = (UnicodeSet *)fSetStack.peek(); + s->add(fC.fChar); + fLastSetLiteral = fC.fChar; + break; + } + + case doSetLiteralEscaped: + // A back-slash escaped literal character was encountered. + // Processing is the same as with setLiteral, above, with the addition of + // the optional check for errors on escaped ASCII letters. 
+ { + if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 && + ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) || // in [A-Z] + (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) { // in [a-z] + error(U_REGEX_BAD_ESCAPE_SEQUENCE); + } + setEval(setUnion); + UnicodeSet *s = (UnicodeSet *)fSetStack.peek(); + s->add(fC.fChar); + fLastSetLiteral = fC.fChar; + break; + } + + case doSetNamedChar: + // Scanning a \N{UNICODE CHARACTER NAME} + // Aside from the source of the character, the processing is identical to doSetLiteral, + // above. + { + UChar32 c = scanNamedChar(); + setEval(setUnion); + UnicodeSet *s = (UnicodeSet *)fSetStack.peek(); + s->add(c); + fLastSetLiteral = c; + break; + } + + case doSetNamedRange: + // We have scanned literal-\N{CHAR NAME}. Add the range to the set. + // The left character is already in the set, and is saved in fLastSetLiteral. + // The right side needs to be picked up, the scan is at the 'N'. + // Lower Limit > Upper limit being an error matches both Java + // and ICU UnicodeSet behavior. + { + UChar32 c = scanNamedChar(); + if (U_SUCCESS(*fStatus) && fLastSetLiteral > c) { + error(U_REGEX_INVALID_RANGE); + } + UnicodeSet *s = (UnicodeSet *)fSetStack.peek(); + s->add(fLastSetLiteral, c); + fLastSetLiteral = c; + break; + } + + + case doSetNegate: + // Scanned a '^' at the start of a set. + // Push the negation operator onto the set op stack. + // A twist for case-insensitive matching: + // the case closure operation must happen _before_ negation. + // But the case closure operation will already be on the stack if it's required. + // This requires checking for case closure, and swapping the stack order + // if it is present. 
+ { + int32_t tosOp = fSetOpStack.peeki(); + if (tosOp == setCaseClose) { + fSetOpStack.popi(); + fSetOpStack.push(setNegation, *fStatus); + fSetOpStack.push(setCaseClose, *fStatus); + } else { + fSetOpStack.push(setNegation, *fStatus); + } + } + break; + + case doSetNoCloseError: + error(U_REGEX_MISSING_CLOSE_BRACKET); + break; + + case doSetOpError: + error(U_REGEX_RULE_SYNTAX); // TODO: -- or && at the end of a set. Illegal. + break; + + case doSetPosixProp: + { + UnicodeSet *s = scanPosixProp(); + if (s != NULL) { + UnicodeSet *tos = (UnicodeSet *)fSetStack.peek(); + tos->addAll(*s); + delete s; + } // else error. scanProp() reported the error status already. + } + break; + + case doSetProp: + // Scanned a \p \P within [brackets]. + { + UnicodeSet *s = scanProp(); + if (s != NULL) { + UnicodeSet *tos = (UnicodeSet *)fSetStack.peek(); + tos->addAll(*s); + delete s; + } // else error. scanProp() reported the error status already. + } + break; + + + case doSetRange: + // We have scanned literal-literal. Add the range to the set. + // The left character is already in the set, and is saved in fLastSetLiteral. + // The right side is the current character. + // Lower Limit > Upper limit being an error matches both Java + // and ICU UnicodeSet behavior. 
+ { + if (fLastSetLiteral > fC.fChar) { + error(U_REGEX_INVALID_RANGE); + } + UnicodeSet *s = (UnicodeSet *)fSetStack.peek(); + s->add(fLastSetLiteral, fC.fChar); + break; + } + default: U_ASSERT(FALSE); @@ -1431,7 +1726,7 @@ void RegexCompile::literalChar(UChar32 c) { fRXPat->fLiteralText.append(c); return; } - + // We are adding onto an existing string fRXPat->fLiteralText.append(c); @@ -1439,7 +1734,7 @@ void RegexCompile::literalChar(UChar32 c) { opType = URX_TYPE(op); U_ASSERT(opType == URX_ONECHAR || opType == URX_ONECHAR_I || opType == URX_STRING_LEN); - // If the most recently emitted op is a URX_ONECHAR, + // If the most recently emitted op is a URX_ONECHAR, if (opType == URX_ONECHAR || opType == URX_ONECHAR_I) { if (U16_IS_TRAIL(c) && U16_IS_LEAD(URX_VAL(op))) { // The most recently emitted op is a ONECHAR that was the first half @@ -1451,7 +1746,7 @@ void RegexCompile::literalChar(UChar32 c) { fRXPat->fCompiledPat->setElementAt(op, patternLoc); return; } - + // The most recently emitted op is a ONECHAR. // We've now received another adjacent char. Change the ONECHAR op // to a string op. @@ -1465,7 +1760,7 @@ void RegexCompile::literalChar(UChar32 c) { op = URX_BUILD(URX_STRING_LEN, 0); fRXPat->fCompiledPat->addElement(op, *fStatus); } - + // The pattern contains a URX_SRING / URX_STRING_LEN. Update the // string length to reflect the new char we just added to the string. stringLen = fRXPat->fLiteralText.length() - fStringOpStart; @@ -1523,7 +1818,7 @@ void RegexCompile::fixLiterals(UBool split) { UChar32 nextToLastChar; int32_t stringLen; - fStringOpStart = -1; + fStringOpStart = -1; if (!split) { return; } @@ -1533,7 +1828,7 @@ void RegexCompile::fixLiterals(UBool split) { // separate the last char from the rest of the string. 
// If the last operation from the compiled pattern is not a string, - // nothing needs to be done + // nothing needs to be done op = fRXPat->fCompiledPat->lastElementi(); opType = URX_TYPE(op); if (opType != URX_STRING_LEN) { @@ -1651,7 +1946,7 @@ void RegexCompile::insertOp(int32_t where) { // // parameter reserveLoc : TRUE - ensure that there is space to add an opcode // at the returned location. -// FALSE - just return the address, +// FALSE - just return the address, // do not reserve a location there. // //------------------------------------------------------------------------------ @@ -1725,10 +2020,10 @@ void RegexCompile::handleCloseParen() { // At the close of any parenthesized block, restore the match mode flags to // the value they had at the open paren. Saved value is - // at the top of the paren stack. + // at the top of the paren stack. fModeFlags = fParenStack.popi(); U_ASSERT(fModeFlags < 0); - + // DO any additional fixups, depending on the specific kind of // parentesized grouping this is @@ -1798,7 +2093,7 @@ void RegexCompile::handleCloseParen() { case lookBehind: { // See comment at doOpenLookBehind. - + // Append the URX_LB_END and URX_LA_END to the compiled pattern. int32_t startOp = fRXPat->fCompiledPat->elementAti(fMatchOpenParen-4); U_ASSERT(URX_TYPE(startOp) == URX_LB_START); @@ -1833,7 +2128,7 @@ void RegexCompile::handleCloseParen() { case lookBehindN: { // See comment at doOpenLookBehindNeg. - + // Append the URX_LBN_END to the compiled pattern. int32_t startOp = fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5); U_ASSERT(URX_TYPE(startOp) == URX_LB_START); @@ -1890,24 +2185,23 @@ void RegexCompile::compileSet(UnicodeSet *theSet) if (theSet == NULL) { return; } + // Remove any strings from the set. + // There shoudn't be any, but just in case. + // (Case Closure can add them; if we had a simple case closure avaialble that + // ignored strings, that would be better.) 
+ theSet->removeAllStrings(); int32_t setSize = theSet->size(); UChar32 firstSetChar = theSet->charAt(0); - if (firstSetChar == -1) { - // Sets that contain only strings, but no individual chars, - // will end up here. - error(U_REGEX_SET_CONTAINS_STRING); - setSize = 0; - } switch (setSize) { - case 0: + case 0: { - // Set of no elements. Always fails to match. + // Set of no elements. Always fails to match. fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKTRACK, 0), *fStatus); delete theSet; } break; - + case 1: { // The set contains only a single code point. Put it into @@ -1917,8 +2211,8 @@ void RegexCompile::compileSet(UnicodeSet *theSet) delete theSet; } break; - - default: + + default: { // The set contains two or more chars. (the normal case) // Put it into the compiled pattern as a set. @@ -1944,9 +2238,9 @@ void RegexCompile::compileSet(UnicodeSet *theSet) // 2 min count // 3 max count (-1 for unbounded) // 4 ... block to be iterated over -// 5 CTR_LOOP -// -// In +// 5 CTR_LOOP +// +// In //------------------------------------------------------------------------------ void RegexCompile::compileInterval(int32_t InitOp, int32_t LoopOp) { @@ -2020,9 +2314,9 @@ UBool RegexCompile::compileInlineInterval() { // int32_t op = fRXPat->fCompiledPat->elementAti(topOfBlock); - // Compute the pattern location where the inline sequence + // Compute the pattern location where the inline sequence // will end, and set up the state save op that will be needed. - // + // int32_t endOfSequenceLoc = fRXPat->fCompiledPat->size()-1 + fIntervalUpper + (fIntervalUpper-fIntervalLow); int32_t saveOp = URX_BUILD(URX_STATE_SAVE, endOfSequenceLoc); @@ -2127,7 +2421,7 @@ void RegexCompile::matchStartType() { case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match. 
case URX_LD_SP: break; - + case URX_CARET: if (atStart) { fRXPat->fStartType = START_START; @@ -2139,7 +2433,7 @@ void RegexCompile::matchStartType() { fRXPat->fStartType = START_LINE; } break; - + case URX_ONECHAR: if (currentLen == 0) { // This character could appear at the start of a match. @@ -2150,9 +2444,9 @@ void RegexCompile::matchStartType() { currentLen++; atStart = FALSE; break; - - case URX_SETREF: + + case URX_SETREF: if (currentLen == 0) { int32_t sn = URX_VAL(op); U_ASSERT(sn > 0 && sn < fRXPat->fSets->size()); @@ -2189,7 +2483,7 @@ void RegexCompile::matchStartType() { break; - case URX_STATIC_SETREF: + case URX_STATIC_SETREF: if (currentLen == 0) { int32_t sn = URX_VAL(op); U_ASSERT(sn>0 && sn<URX_LAST_SET); @@ -2203,7 +2497,7 @@ void RegexCompile::matchStartType() { - case URX_STAT_SETREF_N: + case URX_STAT_SETREF_N: if (currentLen == 0) { int32_t sn = URX_VAL(op); const UnicodeSet *s = fRXPat->fStaticSets[sn]; @@ -2221,7 +2515,7 @@ void RegexCompile::matchStartType() { case URX_BACKSLASH_D: // Digit Char if (currentLen == 0) { - UnicodeSet s; + UnicodeSet s; s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus); if (URX_VAL(op) != 0) { s.complement(); @@ -2282,7 +2576,7 @@ void RegexCompile::matchStartType() { // Loop of some kind. Can safely ignore, the worst that will happen // is that we understate the true minimum length currentLen = forwardedLength.elementAti(loc+1); - + } else { // Forward jump. Propagate the current min length to the target loc of the jump. 
U_ASSERT(jmpDest <= end+1); @@ -2318,11 +2612,11 @@ void RegexCompile::matchStartType() { if (currentLen < forwardedLength.elementAti(jmpDest)) { forwardedLength.setElementAt(currentLen, jmpDest); } - } + } } atStart = FALSE; break; - + @@ -2346,7 +2640,7 @@ void RegexCompile::matchStartType() { fRXPat->fInitialStringIdx = stringStartIdx; fRXPat->fInitialStringLen = stringLen; } - + currentLen += stringLen; atStart = FALSE; } @@ -2382,10 +2676,10 @@ void RegexCompile::matchStartType() { { // Loop Init Ops. These don't change the min length, but they are 4 word ops // so location must be updated accordingly. - // Loop Init Ops. + // Loop Init Ops. // If the min loop count == 0 // move loc forwards to the end of the loop, skipping over the body. - // If the min count is > 0, + // If the min count is > 0, // continue normal processing of the body of the loop. int32_t loopEndLoc = fRXPat->fCompiledPat->elementAti(loc+1); loopEndLoc = URX_VAL(loopEndLoc); @@ -2398,7 +2692,7 @@ void RegexCompile::matchStartType() { if (forwardedLength.elementAti(loopEndLoc) > currentLen) { forwardedLength.setElementAt(currentLen, loopEndLoc); } - } + } loc+=3; // Skips over operands of CTR_INIT } atStart = FALSE; @@ -2407,17 +2701,17 @@ void RegexCompile::matchStartType() { case URX_CTR_LOOP: case URX_CTR_LOOP_NG: - // Loop ops. + // Loop ops. // The jump is conditional, backwards only. atStart = FALSE; break; - + case URX_LOOP_C: // More loop ops. These state-save to themselves. // don't change the minimum match atStart = FALSE; break; - + case URX_LA_START: case URX_LB_START: @@ -2447,25 +2741,25 @@ void RegexCompile::matchStartType() { } } } - U_ASSERT(loc <= end); + U_ASSERT(loc <= end); } } break; - + case URX_LA_END: case URX_LB_CONT: case URX_LB_END: case URX_LBN_CONT: case URX_LBN_END: - U_ASSERT(FALSE); // Shouldn't get here. These ops should be + U_ASSERT(FALSE); // Shouldn't get here. 
These ops should be // consumed by the scan in URX_LA_START and LB_START break; - + default: U_ASSERT(FALSE); } - + } @@ -2524,7 +2818,7 @@ void RegexCompile::matchStartType() { //------------------------------------------------------------------------------ // // minMatchLength Calculate the length of the shortest string that could -// match the specified pattern. +// match the specified pattern. // Length is in 16 bit code units, not code points. // // The calculated length may not be exact. The returned @@ -2603,10 +2897,10 @@ int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) { case URX_JMP_SAV: case URX_JMP_SAV_X: break; - + // Ops that match a minimum of one character (one or two 16 bit code units.) - // + // case URX_ONECHAR: case URX_STATIC_SETREF: case URX_STAT_SETREF_N: @@ -2661,10 +2955,10 @@ int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) { if (currentLen < forwardedLength.elementAti(jmpDest)) { forwardedLength.setElementAt(currentLen, jmpDest); } - } + } } break; - + case URX_STRING: case URX_STRING_I: @@ -2679,10 +2973,10 @@ int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) { case URX_CTR_INIT: case URX_CTR_INIT_NG: { - // Loop Init Ops. + // Loop Init Ops. // If the min loop count == 0 // move loc forwards to the end of the loop, skipping over the body. - // If the min count is > 0, + // If the min count is > 0, // continue normal processing of the body of the loop. int32_t loopEndLoc = fRXPat->fCompiledPat->elementAti(loc+1); loopEndLoc = URX_VAL(loopEndLoc); @@ -2698,17 +2992,17 @@ int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) { case URX_CTR_LOOP: case URX_CTR_LOOP_NG: - // Loop ops. + // Loop ops. // The jump is conditional, backwards only. break; - + case URX_LOOP_SR_I: case URX_LOOP_DOT_I: case URX_LOOP_C: // More loop ops. These state-save to themselves. // don't change the minimum match - could match nothing at all. 
break; - + case URX_LA_START: case URX_LB_START: @@ -2740,12 +3034,12 @@ int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) { } } } - - U_ASSERT(loc <= end); + + U_ASSERT(loc <= end); } } break; - + case URX_LA_END: case URX_LB_CONT: case URX_LB_END: @@ -2754,11 +3048,11 @@ int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) { // Only come here if the matching URX_LA_START or URX_LB_START was not in the // range being sized, which happens when measuring size of look-behind blocks. break; - + default: U_ASSERT(FALSE); } - + } // We have finished walking through the ops. Check whether some forward jump @@ -2767,7 +3061,7 @@ int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) { currentLen = forwardedLength.elementAti(end+1); U_ASSERT(currentLen>=0 && currentLen < INT32_MAX); } - + return currentLen; } @@ -2776,7 +3070,7 @@ int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) { //------------------------------------------------------------------------------ // // maxMatchLength Calculate the length of the longest string that could -// match the specified pattern. +// match the specified pattern. // Length is in 16 bit code units, not code points. // // The calculated length may not be exact. The returned @@ -2843,7 +3137,7 @@ int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) { case URX_LBN_CONT: case URX_LBN_END: break; - + // Ops that increase that cause an unbounded increase in the length // of a matched string, or that increase it a hard to characterize way. @@ -2858,13 +3152,13 @@ int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) { // Ops that match a max of one character (possibly two 16 bit code units.) 
- // + // case URX_STATIC_SETREF: case URX_STAT_SETREF_N: case URX_SETREF: case URX_BACKSLASH_D: case URX_ONECHAR_I: - case URX_DOTANY_ALL: + case URX_DOTANY_ALL: case URX_DOTANY: currentLen+=2; break; @@ -2878,7 +3172,7 @@ int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) { } break; - // Jumps. + // Jumps. // case URX_JMP: case URX_JMPX: @@ -2922,7 +3216,7 @@ int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) { } } break; - + @@ -2948,8 +3242,8 @@ int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) { // INT32_MAX length will stop the per-instruction loop. currentLen = INT32_MAX; break; - - + + case URX_LA_START: case URX_LA_END: @@ -2957,16 +3251,16 @@ int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) { // it were normal pattern. Gives a too-long match length, // but good enough for now. break; - + // End of look-ahead ops should always be consumed by the processing at // the URX_LA_START op. // U_ASSERT(FALSE); // break; - + case URX_LB_START: { // Look-behind. Scan forward until the matching look-around end, - // without processing the look-behind block. + // without processing the look-behind block. int32_t depth = 0; for (;;) { loc++; @@ -2980,7 +3274,7 @@ int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) { } depth--; } - U_ASSERT(loc < end); + U_ASSERT(loc < end); } } break; @@ -2989,16 +3283,16 @@ int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) { U_ASSERT(FALSE); } - + if (currentLen == INT32_MAX) { // The maximum length is unbounded. // Stop further processing of the pattern. break; } - + } return currentLen; - + } @@ -3154,9 +3448,9 @@ void RegexCompile::OptDotStar() { U_ASSERT(jmpLoc>0); op = fRXPat->fCompiledPat->elementAti(jmpLoc); opType = URX_TYPE(op); - switch(opType) { + switch(opType) { + - case URX_END: case URX_NOP: case URX_END_CAPTURE: @@ -3234,18 +3528,24 @@ void RegexCompile::error(UErrorCode e) { // (Think EBCDIC). 
// static const UChar chCR = 0x0d; // New lines, for terminating comments. -static const UChar chLF = 0x0a; -static const UChar chNEL = 0x85; // NEL newline variant -static const UChar chLS = 0x2028; // Unicode Line Separator +static const UChar chLF = 0x0a; // Line Feed static const UChar chPound = 0x23; // '#', introduces a comment. +static const UChar chDigit0 = 0x30; // '0' +static const UChar chDigit7 = 0x37; // '9' +static const UChar chColon = 0x3A; // ':' static const UChar chE = 0x45; // 'E' -static const UChar chUpperN = 0x4E; -static const UChar chLowerP = 0x70; -static const UChar chUpperP = 0x50; +static const UChar chQ = 0x51; // 'Q' +static const UChar chN = 0x4E; // 'N' +static const UChar chP = 0x50; // 'P' static const UChar chBackSlash = 0x5c; // '\' introduces a char escape -static const UChar chLBracket = 0x5b; -static const UChar chRBracket = 0x5d; -static const UChar chRBrace = 0x7d; +static const UChar chLBracket = 0x5b; // '[' +static const UChar chRBracket = 0x5d; // ']' +static const UChar chUp = 0x5e; // '^' +static const UChar chLowerP = 0x70; +static const UChar chLBrace = 0x7b; // '{' +static const UChar chRBrace = 0x7d; // '}' +static const UChar chNEL = 0x85; // NEL newline variant +static const UChar chLS = 0x2028; // Unicode Line Separator //------------------------------------------------------------------------------ @@ -3278,10 +3578,6 @@ UChar32 RegexCompile::nextCharLL() { // reset the column to 0. fLineNum++; fCharNum=0; - if (fQuoteMode) { - error(U_REGEX_RULE_SYNTAX); - fQuoteMode = FALSE; - } } else { // Character is not starting a new line. Except in the case of a @@ -3343,7 +3639,7 @@ void RegexCompile::nextChar(RegexPatternChar &c) { if (fModeFlags & UREGEX_COMMENTS) { // // We are in free-spacing and comments mode. - // Scan through any white space and comments, until we + // Scan through any white space and comments, until we // reach a significant character or the end of inut. 
for (;;) { if (c.fChar == (UChar32)-1) { @@ -3362,6 +3658,7 @@ void RegexCompile::nextChar(RegexPatternChar &c) { } } } + // TODO: check what Java & Perl do with non-ASCII white spaces. if (uprv_isRuleWhiteSpace(c.fChar) == FALSE) { break; } @@ -3372,9 +3669,9 @@ void RegexCompile::nextChar(RegexPatternChar &c) { // // check for backslash escaped characters. // - int32_t startX = fNextIndex; // start and end positions of the - int32_t endX = fNextIndex; // sequence following the '\' if (c.fChar == chBackSlash) { + int32_t startX = fNextIndex; // start and end positions of the + int32_t endX = fNextIndex; // sequence following the '\' if (RegexStaticSets::gStaticSets->fUnescapeCharSet.contains(peekCharLL())) { // // A '\' sequence that is handled by ICU's standard unescapeAt function. @@ -3390,11 +3687,39 @@ void RegexCompile::nextChar(RegexPatternChar &c) { fCharNum += endX - startX; fNextIndex = endX; } + else if (peekCharLL() == chDigit0) { + // Octal Escape, using Java Regexp Conventions + // which are \0 followed by 1-3 octal digits. + // Different from ICU Unescape handling of Octal, which does not + // require the leading 0. + c.fChar = 0; + nextCharLL(); // Consume the initial 0. + int index; + for (index=0; index<3; index++) { + int32_t ch = peekCharLL(); + if (ch<chDigit0 || ch>chDigit7) { + break; + } + nextCharLL(); + c.fChar <<= 3; + c.fChar += ch&7; + } + if (c.fChar>255) { + error(U_REGEX_OCTAL_TOO_BIG); + } + c.fQuoted = TRUE; + } + else if (peekCharLL() == chQ) { + // "\Q" enter quote mode, which will continue until "\E" + fQuoteMode = TRUE; + nextCharLL(); // discard the 'Q'. + nextChar(c); // recurse to get the real next char. + } else { // We are in a '\' escape that will be handled by the state table scanner. // Just return the backslash, but remember that the following char is to - // be taken literally. TODO: this is awkward, think about alternatives. + // be taken literally. 
fInBackslashQuote = TRUE; } } @@ -3412,59 +3737,60 @@ void RegexCompile::nextChar(RegexPatternChar &c) { //------------------------------------------------------------------------------ // -// scanSet Construct a UnicodeSet from the text at the current scan -// position. Advance the scan position to the first character -// after the set. +// scanNamedChar + // Get a UChar32 from a \N{UNICODE CHARACTER NAME} in the pattern. +// +// The scan position will be at the 'N'. On return +// the scan position should be just after the '}' // -// The scan position is normally under the control of the state machine -// that controls pattern parsing. UnicodeSets, however, are parsed by -// the UnicodeSet constructor, not by the Regex pattern parser. +// Return the UChar32 // //------------------------------------------------------------------------------ -UnicodeSet *RegexCompile::scanSet() { +UChar32 RegexCompile::scanNamedChar() { UnicodeSet *uset = NULL; - ParsePosition pos; - int i; if (U_FAILURE(*fStatus)) { - return NULL; - } - - pos.setIndex(fScanIndex); - UErrorCode localStatus = U_ZERO_ERROR; - uint32_t usetFlags = 0; - if (fModeFlags & UREGEX_CASE_INSENSITIVE) { - usetFlags |= USET_CASE_INSENSITIVE; - } - if (fModeFlags & UREGEX_COMMENTS) { - usetFlags |= USET_IGNORE_SPACE; + return 0; } - uset = new UnicodeSet(fRXPat->fPattern, pos, - usetFlags, NULL, localStatus); - if (U_FAILURE(localStatus)) { - // TODO: Get more accurate position of the error from UnicodeSet's return info. - // UnicodeSet appears to not be reporting correctly at this time. - REGEX_SCAN_DEBUG_PRINTF(("UnicodeSet parse postion.ErrorIndex = %d\n", pos.getIndex())); - error(localStatus); - delete uset; - return NULL; + nextChar(fC); + if (fC.fChar != chLBrace) { + error(U_REGEX_PROPERTY_SYNTAX); + return 0; } - - // Advance the current scan postion over the UnicodeSet. - // Don't just set fScanIndex because the line/char positions maintained - // for error reporting would be thrown off. 
- i = pos.getIndex(); + + UnicodeString charName; for (;;) { - if (fNextIndex >= i) { + nextChar(fC); + if (fC.fChar == chRBrace) { break; } - nextCharLL(); + if (fC.fChar == -1) { + error(U_REGEX_PROPERTY_SYNTAX); + return 0; + } + charName.append(fC.fChar); } + + char name[100]; + if (!uprv_isInvariantUString(charName.getBuffer(), charName.length()) || + charName.length()>=sizeof(name)) { + // All Unicode character names have only invariant characters. + // The API to get a character, given a name, accepts only char *, forcing us to convert, + // which requires this error check + error(U_REGEX_PROPERTY_SYNTAX); + return 0; + } + charName.extract(0, charName.length(), name, sizeof(name), US_INV); - return uset; -} + UChar32 theChar = u_charFromName(U_UNICODE_CHAR_NAME, name, fStatus); + if (U_FAILURE(*fStatus)) { + error(U_REGEX_PROPERTY_SYNTAX); + } + nextChar(fC); // Continue overall regex pattern processing with char after the '}' + return theChar; +} //------------------------------------------------------------------------------ // @@ -3484,44 +3810,297 @@ UnicodeSet *RegexCompile::scanProp() { if (U_FAILURE(*fStatus)) { return NULL; } + U_ASSERT(fC.fChar == chLowerP || fC.fChar == chP); + UBool negated = (fC.fChar == chP); - U_ASSERT(fC.fChar == chLowerP || fC.fChar == chUpperP || fC.fChar == chUpperN); - - // enclose the \p{property} from the regex pattern source in [brackets] - UnicodeString setPattern; - setPattern.append(chLBracket); - setPattern.append(chBackSlash); + UnicodeString propertyName; + nextChar(fC); + if (fC.fChar != chLBrace) { + error(U_REGEX_PROPERTY_SYNTAX); + return NULL; + } for (;;) { - setPattern.append(fC.fChar); + nextChar(fC); if (fC.fChar == chRBrace) { break; } - nextChar(fC); if (fC.fChar == -1) { // Hit the end of the input string without finding the closing '}' error(U_REGEX_PROPERTY_SYNTAX); return NULL; } + propertyName.append(fC.fChar); } - setPattern.append(chRBracket); + uset = createSetForProperty(propertyName, 
negated); + nextChar(fC); // Move input scan to position following the closing '}' + return uset; +} - uint32_t usetFlags = 0; +//------------------------------------------------------------------------------ +// +// scanPosixProp Construct a UnicodeSet from the text at the current scan +// position, which is expected be of the form [:property expression:] +// +// The scan position will be at the opening ':'. On return +// the scan position must be on the closing ']' +// +// Return a UnicodeSet constructed from the pattern, +// or NULL if this is not a valid POSIX-style set expression. +// If not a property expression, restore the initial scan position +// (to the opening ':') +// +// Note: the opening '[:' is not sufficient to guarantee that +// this is a [:property:] expression. +// [:'+=,] is a perfectly good ordinary set expression that +// happens to include ':' as one of its characters. +// +//------------------------------------------------------------------------------ +UnicodeSet *RegexCompile::scanPosixProp() { + UnicodeSet *uset = NULL; + + if (U_FAILURE(*fStatus)) { + return NULL; + } + + U_ASSERT(fC.fChar == chColon); + + // Save the scanner state. + // TODO: move this into the scanner, with the state encapsulated in some way + int32_t savedScanIndex = fScanIndex; + int32_t savedNextIndex = fNextIndex; + UBool savedQuoteMode = fQuoteMode; + UBool savedInBackslashQuote = fInBackslashQuote; + UBool savedEOLComments = fEOLComments; + int32_t savedLineNum = fLineNum; + int32_t savedCharNum = fCharNum; + UChar32 savedLastChar = fLastChar; + UChar32 savedPeekChar = fPeekChar; + RegexPatternChar savedfC = fC; + + // Scan for a closing ]. A little tricky because there are some perverse + // edge cases possible. "[:abc\Qdef;] \E]" is a valid non-property expression, + // ending on the second closing ]. + + UnicodeString propName; + UBool negated = FALSE; + + // Check for and consume the '^' in a negated POSIX property, e.g. 
[:^Letter:] + nextChar(fC); + if (fC.fChar == chUp) { + negated = TRUE; + nextChar(fC); + } + + // Scan for the closing ":]", collecting the property name along the way. + UBool sawPropSetTerminator = FALSE; + for (;;) { + propName.append(fC.fChar); + nextChar(fC); + if (fC.fQuoted || fC.fChar == -1) { + // Escaped characters or end of input - either says this isn't a [:Property:] + break; + } + if (fC.fChar == chColon) { + nextChar(fC); + if (fC.fChar == chRBracket) { + sawPropSetTerminator = TRUE; + } + break; + } + } + + if (sawPropSetTerminator) { + uset = createSetForProperty(propName, negated); + } + else + { + // No closing ":]". + // Restore the original scan position. + // The main scanner will retry the input as a normal set expression, + // not a [:Property:] expression. + fScanIndex = savedScanIndex; + fNextIndex = savedNextIndex; + fQuoteMode = savedQuoteMode; + fInBackslashQuote = savedInBackslashQuote; + fEOLComments = savedEOLComments; + fLineNum = savedLineNum; + fCharNum = savedCharNum; + fLastChar = savedLastChar; + fPeekChar = savedPeekChar; + fC = savedfC; + } + return uset; +} + +// +// Create a Unicode Set from a Unicode Property expression. +// This is common code underlying both \p{...} ane [:...:] expressions. 
+// Includes trying the Java "properties" that aren't supported as +// normal ICU UnicodeSet properties +// +static const UChar posSetPrefix[] = {0x5b, 0x5c, 0x70, 0x7b, 00}; // "[\p{" +static const UChar negSetPrefix[] = {0x5b, 0x5c, 0x50, 0x7b, 00}; // "[\p{" +UnicodeSet *RegexCompile::createSetForProperty(const UnicodeString &propName, UBool negated) { + UnicodeString setExpr; + UnicodeSet *set; + uint32_t usetFlags = 0; + + if (U_FAILURE(*fStatus)) { + return NULL; + } + + // + // First try the property as we received it + // + if (negated) { + setExpr.append(negSetPrefix, -1); + } else { + setExpr.append(posSetPrefix, -1); + } + setExpr.append(propName); + setExpr.append(chRBrace); + setExpr.append(chRBracket); if (fModeFlags & UREGEX_CASE_INSENSITIVE) { usetFlags |= USET_CASE_INSENSITIVE; } - if (fModeFlags & UREGEX_COMMENTS) { - usetFlags |= USET_IGNORE_SPACE; + set = new UnicodeSet(setExpr, usetFlags, NULL, *fStatus); + if (U_SUCCESS(*fStatus)) { + return set; + } + + // + // The property as it was didn't work. + // See if it looks like a Java "InBlockName", which + // we will recast as "Block=BlockName" + // + static const UChar IN[] = {0x49, 0x6E, 0}; // "In" + static const UChar BLOCK[] = {0x42, 0x6C, 0x6f, 0x63, 0x6b, 0x3d, 00}; // "Block=" + if (propName.startsWith(IN, 2) && propName.length()>=3) { + setExpr.truncate(4); // Leaves "[\p{", or "[\P{" + setExpr.append(BLOCK, -1); + setExpr.append(UnicodeString(propName, 2)); // Property with the leading "In" removed. + setExpr.append(chRBrace); + setExpr.append(chRBracket); + *fStatus = U_ZERO_ERROR; + set = new UnicodeSet(setExpr, usetFlags, NULL, *fStatus); + if (U_SUCCESS(*fStatus)) { + return set; + } } + + // + // Try the various Java specific properties. 
+ // These all begin with "java" + // + #define IDENTIFIER_IGNORABLE "[\\u0000-\\u0008\\u000e-\\u001b\\u007f-\\u009f\\p{Cf}]" + static const char *javaProps[][2] = { + {"javaDefined", "\\P{Cn}"}, + {"javaDigit", "\\p{Nd}"}, + {"javaIdentifierIgnorable", IDENTIFIER_IGNORABLE}, + {"javaISOControl", "[\\u0000-\\u001f\\u007f-\\u009f]"}, + {"javaJavaIdentifierPart", "[[\\p{L}\\p{Sc}\\p{Pc}\\p{Nd}\\p{Nl}\\p{Mc}\\p{Mn}]" IDENTIFIER_IGNORABLE "]"}, + {"javaJavaIdentifierStart", "[\\p{L}\\p{Nl}\\p{Sc}\\p{Pc}]"}, + {"javaLetter", "\\p{L}"}, + {"javaLetterOrDigit", "[\\p{L}\\p{Nd}]"}, + {"javaLowerCase", "\\p{Ll}"}, + {"javaMirrored", "\\p{Bidi_Mirrored}"}, + {"javaSpaceChar", "\\p{Z}"}, + {"javaSupplementaryCodePoint", "[\\U00010000-\\U0010ffff]"}, + {"javaTitleCase", "\\p{Lt}"}, + {"javaUnicodeIdentifierStart", "[\\p{L}\\p{Nl}]"}, + {"javaUnicodeIdentifierPart", "[[\\p{L}\\p{Pc}\\p{Nd}\\p{Nl}\\p{Mc}\\p{Mn}]" IDENTIFIER_IGNORABLE "]"}, + {"javaUpperCase", "[\\p{Lu}]"}, + {"javaValidCodePoint", "[\\u0000-\\U0010ffff]"}, + {"javaWhitespace", "[[\\p{Z}-[\\u00a0\\u2007\\u202f]]\\u0009-\\u000d\\u001c-\\u001f]"}, + {NULL, NULL} + }; + - // Build the UnicodeSet from the set pattern we just built up in a string. - uset = new UnicodeSet(setPattern, usetFlags, NULL, *fStatus); - if (U_FAILURE(*fStatus)) { - delete uset; - uset = NULL; + UnicodeString Java("java", -1, UnicodeString::kInvariant); + if (propName.startsWith(Java)) { + int i; + setExpr.remove(); + for (i=0; javaProps[i][0] != NULL; i++) { + if (propName.compare(UnicodeString(javaProps[i][0], -1, UnicodeString::kInvariant))==0) { + setExpr = UnicodeString(javaProps[i][1]); // Default code page conversion here. + break; // Somewhat Inefficient. 
+ } + } + if (setExpr.length()>0) { + *fStatus = U_ZERO_ERROR; + set = new UnicodeSet(setExpr, usetFlags, NULL, *fStatus); + if (U_SUCCESS(*fStatus)) { + if (negated) { + set->complement(); + } + return set; + } + } } + error(*fStatus); + return NULL; +} - nextChar(fC); // Continue overall regex pattern processing with char after the '}' - return uset; + + +// +// SetEval Part of the evaluation of [set expressions]. +// Perform any pending (stacked) operations with precedence +// equal or greater to that of the next operator encountered +// in the expression. +// +void RegexCompile::setEval(int32_t nextOp) { + UnicodeSet *rightOperand = NULL; + UnicodeSet *leftOperand = NULL; + for (;;) { + U_ASSERT(fSetOpStack.empty()==FALSE); + int32_t pendingSetOperation = fSetOpStack.peeki(); + if ((pendingSetOperation&0xffff0000) < (nextOp&0xffff0000)) { + break; + } + fSetOpStack.popi(); + U_ASSERT(fSetStack.empty() == FALSE); + rightOperand = (UnicodeSet *)fSetStack.peek(); + switch (pendingSetOperation) { + case setNegation: + rightOperand->complement(); + break; + case setCaseClose: + // TODO: need a simple close function. 
+ rightOperand->closeOver(USET_CASE_INSENSITIVE); + rightOperand->removeAllStrings(); + break; + case setDifference1: + case setDifference2: + fSetStack.pop(); + leftOperand = (UnicodeSet *)fSetStack.peek(); + leftOperand->removeAll(*rightOperand); + delete rightOperand; + break; + case setIntersection1: + case setIntersection2: + fSetStack.pop(); + leftOperand = (UnicodeSet *)fSetStack.peek(); + leftOperand->retainAll(*rightOperand); + delete rightOperand; + break; + case setUnion: + fSetStack.pop(); + leftOperand = (UnicodeSet *)fSetStack.peek(); + leftOperand->addAll(*rightOperand); + delete rightOperand; + break; + default: + U_ASSERT(FALSE); + break; + } + } + } + +void RegexCompile::setPushOp(int32_t op) { + setEval(op); + fSetOpStack.push(op, *fStatus); + fSetStack.push(new UnicodeSet(), *fStatus); } U_NAMESPACE_END diff --git a/i18n/regexcmp.h b/i18n/regexcmp.h index ac81684a..a0248a3f 100644 --- a/i18n/regexcmp.h +++ b/i18n/regexcmp.h @@ -51,7 +51,7 @@ public: }; RegexCompile(RegexPattern *rp, UErrorCode &e); - + void compile(const UnicodeString &pat, UParseError &pp, UErrorCode &e); @@ -68,7 +68,7 @@ public: // determines the code to be generated when the matching close ) is encountered. 
enum EParenClass { plain = -1, // No special handling - capturing = -2, + capturing = -2, atomic = -3, lookAhead = -4, negLookAhead = -5, @@ -85,8 +85,8 @@ private: UChar32 nextCharLL(); UChar32 peekCharLL(); - UnicodeSet *scanSet(); UnicodeSet *scanProp(); + UnicodeSet *scanPosixProp(); void handleCloseParen(); int32_t blockTopLoc(UBool reserve); // Locate a position in the compiled pattern // at the top of the just completed block @@ -111,6 +111,11 @@ private: void stripNOPs(); void OptDotStar(); + void setEval(int32_t op); + void setPushOp(int32_t op); + UChar32 scanNamedChar(); + UnicodeSet *createSetForProperty(const UnicodeString &propName, UBool negated); + UErrorCode *fStatus; RegexPattern *fRXPat; @@ -125,7 +130,7 @@ private: // is the first character not yet scanned. UBool fQuoteMode; // Scan is in a \Q...\E quoted region UBool fInBackslashQuote; // Scan is between a '\' and the following char. - UBool fEOLComments; // When scan is just after '(?', inhibit #... to + UBool fEOLComments; // When scan is just after '(?', inhibit #... to // end of line comments, in favor of (?#...) comments. int32_t fLineNum; // Line number in input file. int32_t fCharNum; // Char position within the line. @@ -167,7 +172,7 @@ private: UVector32 fParenStack; // parentheses stack. Each frame consists of // the positions of compiled pattern operations - // needing fixup, followed by negative value. The + // needing fixup, followed by negative value. The // first entry in each frame is the position of the // spot reserved for use when a quantifier // needs to add a SAVE at the start of a (block) @@ -194,8 +199,33 @@ private: int32_t fNameStartPos; // Starting position of a \N{NAME} name in a // pattern, valid while remainder of name is // scanned. + + UStack fSetStack; // Stack of UnicodeSets, used while evaluating + // (at compile time) set expressions within + // the pattern. 
+ UStack fSetOpStack; // Stack of pending set operators (&&, --, union) + + UChar32 fLastSetLiteral; // The last single code point added to a set. + // needed when "-y" is scanned, and we need + // to turn "x-y" into a range. + }; +// Constant values to be pushed onto fSetOpStack while scanning & evalueating [set expressions] +// The high 16 bits are the operator precedence, and the low 16 are a code for the operation itself. + +enum SetOperations { + setStart = 0 << 16 | 1, + setEnd = 1 << 16 | 2, + setNegation = 2 << 16 | 3, + setCaseClose = 2 << 16 | 9, + setDifference2 = 3 << 16 | 4, // '--' set difference operator + setIntersection2 = 3 << 16 | 5, // '&&' set intersection operator + setUnion = 4 << 16 | 6, // implicit union of adjacent items + setDifference1 = 4 << 16 | 7, // '-', single dash difference op, for compatibility with old UnicodeSet. + setIntersection1 = 4 << 16 | 8 // '&', single amp intersection op, for compatibility with old UnicodeSet. + }; + U_NAMESPACE_END #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS #endif // RBBISCAN_H diff --git a/i18n/regexcst.h b/i18n/regexcst.h index 6cca8453..5d319371 100644 --- a/i18n/regexcst.h +++ b/i18n/regexcst.h @@ -5,7 +5,7 @@ // It is generated by the Perl script "regexcst.pl" from // the rule parser state definitions file "regexcst.txt". // -// Copyright (C) 2002-2003 International Business Machines Corporation +// Copyright (C) 2002-2007 International Business Machines Corporation // and others. All rights reserved. // //--------------------------------------------------------------------------------- @@ -17,74 +17,100 @@ U_NAMESPACE_BEGIN // Character classes for regex pattern scanning. 
// static const uint8_t kRuleSet_digit_char = 128; - static const uint8_t kRuleSet_white_space = 129; - static const uint8_t kRuleSet_rule_char = 130; + static const uint8_t kRuleSet_rule_char = 129; enum Regex_PatternParseAction { - doPossessivePlus, - doCloseParen, + doLiteralChar, + doSetEnd, + doBackslashA, + doSetBeginUnion, + doNOP, + doSetBackslash_w, + doSetRange, + doBackslashG, + doPerlInline, + doSetAddDash, + doIntevalLowerDigit, doProperty, - doBeginMatchMode, - doOrOperator, + doBackslashX, + doOpenAtomicParen, + doSetLiteralEscaped, + doPatFinish, + doSetBackslash_D, + doSetDifference2, + doNamedChar, + doNGPlus, + doOpenLookBehindNeg, + doIntervalError, + doIntervalSame, + doBackRef, + doPlus, doOpenCaptureParen, - doBadOpenParenType, - doRuleError, - doIntevalLowerDigit, - doBackslashs, - doNGOpt, - doBackslashw, doMismatchedParenErr, + doBeginMatchMode, + doEscapeError, + doOpenNonCaptureParen, + doDollar, + doSetProp, + doIntervalUpperDigit, + doSetBegin, + doBackslashs, doOpenLookBehind, - doBackslashz, - doIntervalError, - doStar, + doSetMatchMode, + doOrOperator, doCaret, - doEnterQuoteMode, - doNGStar, + doMatchModeParen, + doStar, + doOpt, doMatchMode, - doIntervalUpperDigit, + doSuppressComments, + doPossessiveInterval, doOpenLookAheadNeg, - doPlus, - doOpenNonCaptureParen, - doBackslashA, + doBackslashW, + doCloseParen, + doSetOpError, + doIntervalInit, + doSetFinish, + doSetIntersection2, + doNGStar, + doEnterQuoteMode, + doSetAddAmp, doBackslashB, - doNGPlus, - doSetMatchMode, - doPatFinish, + doBackslashw, + doPossessiveOpt, + doSetNegate, + doRuleError, + doBackslashb, + doConditionalExpr, + doPossessivePlus, + doBadOpenParenType, + doNGInterval, + doSetLiteral, + doSetNamedChar, + doBackslashd, + doSetBeginDifference1, doBackslashD, - doPossessiveInterval, - doEscapeError, - doBackslashG, - doSuppressComments, - doMatchModeParen, - doOpt, + doExit, + doSetBackslash_S, doInterval, - doLiteralChar, - doIntervalInit, - doOpenAtomicParen, 
+ doSetNoCloseError, + doNGOpt, + doSetPosixProp, doBackslashS, - doOpenLookAhead, - doBackRef, - doDollar, - doDotAny, - doBackslashW, - doBackslashX, - doScanUnicodeSet, doBackslashZ, - doPerlInline, - doPossessiveOpt, - doNOP, - doConditionalExpr, - doExit, - doNGInterval, - doPatStart, + doSetBeginIntersection1, + doSetBackslash_W, + doSetBackslash_d, + doOpenLookAhead, doBadModeFlag, - doBackslashb, + doPatStart, + doSetNamedRange, doPossessiveStar, - doBackslashd, - doIntervalSame, - doOpenLookBehindNeg, + doEscapedLiteralChar, + doSetBackslash_s, + doBackslashz, + doDotAny, rbbiLastAction}; //------------------------------------------------------------------------------- @@ -106,17 +132,17 @@ static const struct RegexTableEl gRuleParseStateTable[] = { {doNOP, 0, 0, 0, TRUE} , {doPatStart, 255, 2,0, FALSE} // 1 start , {doLiteralChar, 254, 14,0, TRUE} // 2 term - , {doLiteralChar, 130, 14,0, TRUE} // 3 - , {doScanUnicodeSet, 91 /* [ */, 14,0, TRUE} // 4 + , {doLiteralChar, 129, 14,0, TRUE} // 3 + , {doSetBegin, 91 /* [ */, 100, 178, TRUE} // 4 , {doNOP, 40 /* ( */, 27,0, TRUE} // 5 , {doDotAny, 46 /* . */, 14,0, TRUE} // 6 - , {doCaret, 94 /* ^ */, 2,0, TRUE} // 7 - , {doDollar, 36 /* $ */, 2,0, TRUE} // 8 - , {doNOP, 92 /* \ */, 81,0, TRUE} // 9 + , {doCaret, 94 /* ^ */, 14,0, TRUE} // 7 + , {doDollar, 36 /* $ */, 14,0, TRUE} // 8 + , {doNOP, 92 /* \ */, 80,0, TRUE} // 9 , {doOrOperator, 124 /* | */, 2,0, TRUE} // 10 , {doCloseParen, 41 /* ) */, 255,0, TRUE} // 11 , {doPatFinish, 253, 2,0, FALSE} // 12 - , {doRuleError, 255, 101,0, FALSE} // 13 + , {doRuleError, 255, 179,0, FALSE} // 13 , {doNOP, 42 /* * */, 59,0, TRUE} // 14 expr-quant , {doNOP, 43 /* + */, 62,0, TRUE} // 15 , {doNOP, 63 /* ? 
*/, 65,0, TRUE} // 16 @@ -144,14 +170,14 @@ static const struct RegexTableEl gRuleParseStateTable[] = { , {doBeginMatchMode, 119 /* w */, 50,0, FALSE} // 38 , {doBeginMatchMode, 120 /* x */, 50,0, FALSE} // 39 , {doBeginMatchMode, 45 /* - */, 50,0, FALSE} // 40 - , {doConditionalExpr, 40 /* ( */, 101,0, TRUE} // 41 - , {doPerlInline, 123 /* { */, 101,0, TRUE} // 42 - , {doBadOpenParenType, 255, 101,0, FALSE} // 43 + , {doConditionalExpr, 40 /* ( */, 179,0, TRUE} // 41 + , {doPerlInline, 123 /* { */, 179,0, TRUE} // 42 + , {doBadOpenParenType, 255, 179,0, FALSE} // 43 , {doOpenLookBehind, 61 /* = */, 2, 20, TRUE} // 44 open-paren-lookbehind , {doOpenLookBehindNeg, 33 /* ! */, 2, 20, TRUE} // 45 - , {doBadOpenParenType, 255, 101,0, FALSE} // 46 + , {doBadOpenParenType, 255, 179,0, FALSE} // 46 , {doNOP, 41 /* ) */, 255,0, TRUE} // 47 paren-comment - , {doMismatchedParenErr, 253, 101,0, FALSE} // 48 + , {doMismatchedParenErr, 253, 179,0, FALSE} // 48 , {doNOP, 255, 47,0, TRUE} // 49 , {doMatchMode, 105 /* i */, 50,0, TRUE} // 50 paren-flag , {doMatchMode, 109 /* m */, 50,0, TRUE} // 51 @@ -161,7 +187,7 @@ static const struct RegexTableEl gRuleParseStateTable[] = { , {doMatchMode, 45 /* - */, 50,0, TRUE} // 55 , {doSetMatchMode, 41 /* ) */, 2,0, TRUE} // 56 , {doMatchModeParen, 58 /* : */, 2, 14, TRUE} // 57 - , {doBadModeFlag, 255, 101,0, FALSE} // 58 + , {doBadModeFlag, 255, 179,0, FALSE} // 58 , {doNGStar, 63 /* ? */, 20,0, TRUE} // 59 quant-star , {doPossessiveStar, 43 /* + */, 20,0, TRUE} // 60 , {doStar, 255, 20,0, FALSE} // 61 @@ -171,40 +197,118 @@ static const struct RegexTableEl gRuleParseStateTable[] = { , {doNGOpt, 63 /* ? 
*/, 20,0, TRUE} // 65 quant-opt , {doPossessiveOpt, 43 /* + */, 20,0, TRUE} // 66 , {doOpt, 255, 20,0, FALSE} // 67 - , {doNOP, 129, 68,0, TRUE} // 68 interval-open - , {doNOP, 128, 71,0, FALSE} // 69 - , {doIntervalError, 255, 101,0, FALSE} // 70 - , {doIntevalLowerDigit, 128, 71,0, TRUE} // 71 interval-lower - , {doNOP, 44 /* , */, 75,0, TRUE} // 72 - , {doIntervalSame, 125 /* } */, 78,0, TRUE} // 73 - , {doIntervalError, 255, 101,0, FALSE} // 74 - , {doIntervalUpperDigit, 128, 75,0, TRUE} // 75 interval-upper - , {doNOP, 125 /* } */, 78,0, TRUE} // 76 - , {doIntervalError, 255, 101,0, FALSE} // 77 - , {doNGInterval, 63 /* ? */, 20,0, TRUE} // 78 interval-type - , {doPossessiveInterval, 43 /* + */, 20,0, TRUE} // 79 - , {doInterval, 255, 20,0, FALSE} // 80 - , {doBackslashA, 65 /* A */, 2,0, TRUE} // 81 backslash - , {doBackslashB, 66 /* B */, 2,0, TRUE} // 82 - , {doBackslashb, 98 /* b */, 2,0, TRUE} // 83 - , {doBackslashd, 100 /* d */, 14,0, TRUE} // 84 - , {doBackslashD, 68 /* D */, 14,0, TRUE} // 85 - , {doBackslashG, 71 /* G */, 2,0, TRUE} // 86 - , {doProperty, 78 /* N */, 14,0, FALSE} // 87 - , {doProperty, 112 /* p */, 14,0, FALSE} // 88 - , {doProperty, 80 /* P */, 14,0, FALSE} // 89 - , {doEnterQuoteMode, 81 /* Q */, 2,0, TRUE} // 90 - , {doBackslashS, 83 /* S */, 14,0, TRUE} // 91 - , {doBackslashs, 115 /* s */, 14,0, TRUE} // 92 - , {doBackslashW, 87 /* W */, 14,0, TRUE} // 93 - , {doBackslashw, 119 /* w */, 14,0, TRUE} // 94 - , {doBackslashX, 88 /* X */, 14,0, TRUE} // 95 - , {doBackslashZ, 90 /* Z */, 2,0, TRUE} // 96 - , {doBackslashz, 122 /* z */, 2,0, TRUE} // 97 - , {doBackRef, 128, 14,0, TRUE} // 98 - , {doEscapeError, 253, 101,0, FALSE} // 99 - , {doLiteralChar, 255, 14,0, TRUE} // 100 - , {doExit, 255, 101,0, TRUE} // 101 errorDeath + , {doNOP, 128, 70,0, FALSE} // 68 interval-open + , {doIntervalError, 255, 179,0, FALSE} // 69 + , {doIntevalLowerDigit, 128, 70,0, TRUE} // 70 interval-lower + , {doNOP, 44 /* , */, 74,0, TRUE} // 71 + , 
{doIntervalSame, 125 /* } */, 77,0, TRUE} // 72 + , {doIntervalError, 255, 179,0, FALSE} // 73 + , {doIntervalUpperDigit, 128, 74,0, TRUE} // 74 interval-upper + , {doNOP, 125 /* } */, 77,0, TRUE} // 75 + , {doIntervalError, 255, 179,0, FALSE} // 76 + , {doNGInterval, 63 /* ? */, 20,0, TRUE} // 77 interval-type + , {doPossessiveInterval, 43 /* + */, 20,0, TRUE} // 78 + , {doInterval, 255, 20,0, FALSE} // 79 + , {doBackslashA, 65 /* A */, 2,0, TRUE} // 80 backslash + , {doBackslashB, 66 /* B */, 2,0, TRUE} // 81 + , {doBackslashb, 98 /* b */, 2,0, TRUE} // 82 + , {doBackslashd, 100 /* d */, 14,0, TRUE} // 83 + , {doBackslashD, 68 /* D */, 14,0, TRUE} // 84 + , {doBackslashG, 71 /* G */, 2,0, TRUE} // 85 + , {doNamedChar, 78 /* N */, 14,0, FALSE} // 86 + , {doProperty, 112 /* p */, 14,0, FALSE} // 87 + , {doProperty, 80 /* P */, 14,0, FALSE} // 88 + , {doEnterQuoteMode, 81 /* Q */, 2,0, TRUE} // 89 + , {doBackslashS, 83 /* S */, 14,0, TRUE} // 90 + , {doBackslashs, 115 /* s */, 14,0, TRUE} // 91 + , {doBackslashW, 87 /* W */, 14,0, TRUE} // 92 + , {doBackslashw, 119 /* w */, 14,0, TRUE} // 93 + , {doBackslashX, 88 /* X */, 14,0, TRUE} // 94 + , {doBackslashZ, 90 /* Z */, 2,0, TRUE} // 95 + , {doBackslashz, 122 /* z */, 2,0, TRUE} // 96 + , {doBackRef, 128, 14,0, TRUE} // 97 + , {doEscapeError, 253, 179,0, FALSE} // 98 + , {doEscapedLiteralChar, 255, 14,0, TRUE} // 99 + , {doSetNegate, 94 /* ^ */, 103,0, TRUE} // 100 set-open + , {doSetPosixProp, 58 /* : */, 105,0, FALSE} // 101 + , {doNOP, 255, 103,0, FALSE} // 102 + , {doSetLiteral, 93 /* ] */, 118,0, TRUE} // 103 set-open2 + , {doNOP, 255, 108,0, FALSE} // 104 + , {doSetEnd, 93 /* ] */, 255,0, TRUE} // 105 set-posix + , {doNOP, 58 /* : */, 108,0, FALSE} // 106 + , {doRuleError, 255, 179,0, FALSE} // 107 + , {doSetEnd, 93 /* ] */, 255,0, TRUE} // 108 set-start + , {doSetBeginUnion, 91 /* [ */, 100, 125, TRUE} // 109 + , {doNOP, 92 /* \ */, 168,0, TRUE} // 110 + , {doNOP, 45 /* - */, 114,0, TRUE} // 111 + , {doNOP, 
38 /* & */, 116,0, TRUE} // 112 + , {doSetLiteral, 255, 118,0, TRUE} // 113 + , {doRuleError, 45 /* - */, 179,0, FALSE} // 114 set-start-dash + , {doSetAddDash, 255, 118,0, FALSE} // 115 + , {doRuleError, 38 /* & */, 179,0, FALSE} // 116 set-start-amp + , {doSetAddAmp, 255, 118,0, FALSE} // 117 + , {doSetEnd, 93 /* ] */, 255,0, TRUE} // 118 set-after-lit + , {doSetBeginUnion, 91 /* [ */, 100, 125, TRUE} // 119 + , {doNOP, 45 /* - */, 155,0, TRUE} // 120 + , {doNOP, 38 /* & */, 146,0, TRUE} // 121 + , {doNOP, 92 /* \ */, 168,0, TRUE} // 122 + , {doSetNoCloseError, 253, 179,0, FALSE} // 123 + , {doSetLiteral, 255, 118,0, TRUE} // 124 + , {doSetEnd, 93 /* ] */, 255,0, TRUE} // 125 set-after-set + , {doSetBeginUnion, 91 /* [ */, 100, 125, TRUE} // 126 + , {doNOP, 45 /* - */, 148,0, TRUE} // 127 + , {doNOP, 38 /* & */, 143,0, TRUE} // 128 + , {doNOP, 92 /* \ */, 168,0, TRUE} // 129 + , {doSetNoCloseError, 253, 179,0, FALSE} // 130 + , {doSetLiteral, 255, 118,0, TRUE} // 131 + , {doSetEnd, 93 /* ] */, 255,0, TRUE} // 132 set-after-range + , {doSetBeginUnion, 91 /* [ */, 100, 125, TRUE} // 133 + , {doNOP, 45 /* - */, 151,0, TRUE} // 134 + , {doNOP, 38 /* & */, 153,0, TRUE} // 135 + , {doNOP, 92 /* \ */, 168,0, TRUE} // 136 + , {doSetNoCloseError, 253, 179,0, FALSE} // 137 + , {doSetLiteral, 255, 118,0, TRUE} // 138 + , {doSetBeginUnion, 91 /* [ */, 100, 125, TRUE} // 139 set-after-op + , {doSetOpError, 93 /* ] */, 179,0, FALSE} // 140 + , {doNOP, 92 /* \ */, 168,0, TRUE} // 141 + , {doSetLiteral, 255, 118,0, TRUE} // 142 + , {doSetBeginIntersection1, 91 /* [ */, 100, 125, TRUE} // 143 set-set-amp + , {doSetIntersection2, 38 /* & */, 139,0, TRUE} // 144 + , {doSetAddAmp, 255, 118,0, FALSE} // 145 + , {doSetIntersection2, 38 /* & */, 139,0, TRUE} // 146 set-lit-amp + , {doSetAddAmp, 255, 118,0, FALSE} // 147 + , {doSetBeginDifference1, 91 /* [ */, 100, 125, TRUE} // 148 set-set-dash + , {doSetDifference2, 45 /* - */, 139,0, TRUE} // 149 + , {doSetAddDash, 255, 118,0, FALSE} 
// 150 + , {doSetDifference2, 45 /* - */, 139,0, TRUE} // 151 set-range-dash + , {doSetAddDash, 255, 118,0, FALSE} // 152 + , {doSetIntersection2, 38 /* & */, 139,0, TRUE} // 153 set-range-amp + , {doSetAddAmp, 255, 118,0, FALSE} // 154 + , {doSetDifference2, 45 /* - */, 139,0, TRUE} // 155 set-lit-dash + , {doSetAddDash, 91 /* [ */, 118,0, FALSE} // 156 + , {doSetAddDash, 93 /* ] */, 118,0, FALSE} // 157 + , {doNOP, 92 /* \ */, 160,0, TRUE} // 158 + , {doSetRange, 255, 132,0, TRUE} // 159 + , {doSetAddDash, 115 /* s */, 168,0, FALSE} // 160 set-lit-dash-escape + , {doSetAddDash, 83 /* S */, 168,0, FALSE} // 161 + , {doSetAddDash, 119 /* w */, 168,0, FALSE} // 162 + , {doSetAddDash, 87 /* W */, 168,0, FALSE} // 163 + , {doSetAddDash, 100 /* d */, 168,0, FALSE} // 164 + , {doSetAddDash, 68 /* D */, 168,0, FALSE} // 165 + , {doSetNamedRange, 78 /* N */, 132,0, FALSE} // 166 + , {doSetRange, 255, 132,0, TRUE} // 167 + , {doSetProp, 112 /* p */, 125,0, FALSE} // 168 set-escape + , {doSetProp, 80 /* P */, 125,0, FALSE} // 169 + , {doSetNamedChar, 78 /* N */, 118,0, FALSE} // 170 + , {doSetBackslash_s, 115 /* s */, 132,0, TRUE} // 171 + , {doSetBackslash_S, 83 /* S */, 132,0, TRUE} // 172 + , {doSetBackslash_w, 119 /* w */, 132,0, TRUE} // 173 + , {doSetBackslash_W, 87 /* W */, 132,0, TRUE} // 174 + , {doSetBackslash_d, 100 /* d */, 132,0, TRUE} // 175 + , {doSetBackslash_D, 68 /* D */, 132,0, TRUE} // 176 + , {doSetLiteralEscaped, 255, 118,0, TRUE} // 177 + , {doSetFinish, 255, 14,0, FALSE} // 178 set-finish + , {doExit, 255, 179,0, TRUE} // 179 errorDeath }; static const char * const RegexStateNames[] = { 0, "start", @@ -276,7 +380,6 @@ static const char * const RegexStateNames[] = { 0, 0, "interval-open", 0, - 0, "interval-lower", 0, 0, @@ -307,6 +410,85 @@ static const char * const RegexStateNames[] = { 0, 0, 0, 0, + "set-open", + 0, + 0, + "set-open2", + 0, + "set-posix", + 0, + 0, + "set-start", + 0, + 0, + 0, + 0, + 0, + "set-start-dash", + 0, + "set-start-amp", + 
0, + "set-after-lit", + 0, + 0, + 0, + 0, + 0, + 0, + "set-after-set", + 0, + 0, + 0, + 0, + 0, + 0, + "set-after-range", + 0, + 0, + 0, + 0, + 0, + 0, + "set-after-op", + 0, + 0, + 0, + "set-set-amp", + 0, + 0, + "set-lit-amp", + 0, + "set-set-dash", + 0, + 0, + "set-range-dash", + 0, + "set-range-amp", + 0, + "set-lit-dash", + 0, + 0, + 0, + 0, + "set-lit-dash-escape", + 0, + 0, + 0, + 0, + 0, + 0, + 0, + "set-escape", + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + "set-finish", "errorDeath", 0}; diff --git a/i18n/regexcst.pl b/i18n/regexcst.pl index b525cf88..f1dc06af 100644 --- a/i18n/regexcst.pl +++ b/i18n/regexcst.pl @@ -1,7 +1,7 @@ #!/usr/bin/perl # ******************************************************************** # * COPYRIGHT: -# * Copyright (c) 2002-2003, International Business Machines Corporation and +# * Copyright (c) 2002-2007, International Business Machines Corporation and # * others. All Rights Reserved. # ******************************************************************** # @@ -22,10 +22,6 @@ # for the Rule Based Break Iterator Rule Parser. Perhaps they could be # merged? # -#********************************************************************* -# Copyright (C) 2002 International Business Machines Corporation * -# and others. All rights reserved. * -#********************************************************************* $num_states = 1; # Always the state number for the line being compiled. @@ -210,7 +206,7 @@ print "// This file contains the state table for the ICU Regular Expression P print "// It is generated by the Perl script \"regexcst.pl\" from\n"; print "// the rule parser state definitions file \"regexcst.txt\".\n"; print "//\n"; -print "// Copyright (C) 2002-2003 International Business Machines Corporation \n"; +print "// Copyright (C) 2002-2007 International Business Machines Corporation \n"; print "// and others. All rights reserved. 
\n"; print "//\n"; print "//---------------------------------------------------------------------------------\n"; diff --git a/i18n/regexcst.txt b/i18n/regexcst.txt index fec788c1..888a0c42 100644 --- a/i18n/regexcst.txt +++ b/i18n/regexcst.txt @@ -1,7 +1,7 @@ #***************************************************************************** # -# Copyright (C) 2002-2003, International Business Machines Corporation and others. +# Copyright (C) 2002-2007, International Business Machines Corporation and others. # All Rights Reserved. # #***************************************************************************** @@ -25,8 +25,8 @@ # # #StateName: -# input-char n next-state ^push-state action -# input-char n next-state ^push-state action +# input-char n next-state ^push-state action +# input-char n next-state ^push-state action # | | | | | # | | | | |--- action to be performed by state machine # | | | | See function RBBIRuleScanner::doParseActions() @@ -46,7 +46,7 @@ # matches, perform the actions and go to the state specified on this line. # The input character is tested sequentially, in the order written. The characters and # character classes tested for do not need to be mutually exclusive. The first match wins. -# +# @@ -56,27 +56,27 @@ # start: default term doPatStart - - - + + + # # term. At a position where we can accept the start of most items in a pattern. # term: quoted n expr-quant doLiteralChar rule_char n expr-quant doLiteralChar - '[' n expr-quant doScanUnicodeSet - '(' n open-paren + '[' n set-open ^set-finish doSetBegin + '(' n open-paren '.' n expr-quant doDotAny - '^' n term doCaret - '$' n term doDollar + '^' n expr-quant doCaret + '$' n expr-quant doDollar '\' n backslash '|' n term doOrOperator ')' n pop doCloseParen eof term doPatFinish default errorDeath doRuleError - + # @@ -84,14 +84,14 @@ term: # trailing quantifier - *, +, ?, *?, etc. # expr-quant: - '*' n quant-star - '+' n quant-plus - '?' n quant-opt + '*' n quant-star + '+' n quant-plus + '?' 
n quant-opt '{' n interval-open doIntervalInit '(' n open-paren-quant - default expr-cont - - + default expr-cont + + # # expr-cont Expression, continuation. At a point where additional terms are # allowed, but not required. No Quantifiers @@ -99,8 +99,8 @@ expr-quant: expr-cont: '|' n term doOrOperator ')' n pop doCloseParen - default term - + default term + # # open-paren-quant Special case handling for comments appearing before a quantifier, @@ -111,12 +111,12 @@ expr-cont: open-paren-quant: '?' n open-paren-quant2 doSuppressComments default open-paren - + open-paren-quant2: '#' n paren-comment ^expr-quant default open-paren-extended - - + + # # open-paren We've got an open paren. We need to scan further to # determine what kind of quantifier it is - plain (, (?:, (?>, or whatever. @@ -124,7 +124,7 @@ open-paren-quant2: open-paren: '?' n open-paren-extended doSuppressComments default term ^expr-quant doOpenCaptureParen - + open-paren-extended: ':' n term ^expr-quant doOpenNonCaptureParen # (?: '>' n term ^expr-quant doOpenAtomicParen # (?> @@ -141,16 +141,15 @@ open-paren-extended: '(' n errorDeath doConditionalExpr '{' n errorDeath doPerlInline default errorDeath doBadOpenParenType - + open-paren-lookbehind: '=' n term ^expr-cont doOpenLookBehind # (?<= '!' n term ^expr-cont doOpenLookBehindNeg # (?<! default errorDeath doBadOpenParenType - + # # paren-comment We've got a (?# ... ) style comment. Eat pattern text till we get to the ')' -# TODO: should parens nest here? Check what perl does. # paren-comment: ')' n pop @@ -158,8 +157,8 @@ paren-comment: default n paren-comment # -# paren-flag Scanned a (?ismx-ismx flag setting -# +# paren-flag Scanned a (?ismx-ismx flag setting +# paren-flag: 'i' n paren-flag doMatchMode 'm' n paren-flag doMatchMode @@ -170,8 +169,8 @@ paren-flag: ')' n term doSetMatchMode ':' n term ^expr-quant doMatchModeParen default errorDeath doBadModeFlag - - + + # # quant-star Scanning a '*' quantifier. 
Need to look ahead to decide # between plain '*', '*?', '*+' @@ -204,13 +203,12 @@ quant-opt: # # Interval scanning a '{', the opening delimiter for an interval specification -# {number} or {min, max} or {min, } +# {number} or {min, max} or {min,} # interval-open: - white_space n interval-open # TODO: is white space allowed here in non-free mode? - digit_char interval-lower + digit_char interval-lower default errorDeath doIntervalError - + interval-lower: digit_char n interval-lower doIntevalLowerDigit ',' n interval-upper @@ -221,13 +219,13 @@ interval-upper: digit_char n interval-upper doIntervalUpperDigit '}' n interval-type default errorDeath doIntervalError - + interval-type: '?' n expr-cont doNGInterval # {n,m}? '+' n expr-cont doPossessiveInterval # {n,m}+ default expr-cont doInterval # {m,n} - - + + # # backslash # Backslash. Figure out which of the \thingies we have encountered. # The low level next-char function will have preprocessed @@ -239,7 +237,7 @@ backslash: 'd' n expr-quant doBackslashd 'D' n expr-quant doBackslashD 'G' n term doBackslashG - 'N' expr-quant doProperty # \N{NAME} named char + 'N' expr-quant doNamedChar # \N{NAME} named char 'p' expr-quant doProperty # \p{Lu} style property 'P' expr-quant doProperty 'Q' n term doEnterQuoteMode @@ -250,11 +248,210 @@ backslash: 'X' n expr-quant doBackslashX 'Z' n term doBackslashZ 'z' n term doBackslashz - digit_char n expr-quant doBackRef # Will scan multiple digits + digit_char n expr-quant doBackRef # Will scan multiple digits eof errorDeath doEscapeError - default n expr-quant doLiteralChar # Escaped literal char. 
+ default n expr-quant doEscapedLiteralChar + + +# +# [set expression] parsing, +# All states involved in parsing set expressions have names beginning with "set-" +# + +set-open: + '^' n set-open2 doSetNegate + ':' set-posix doSetPosixProp + default set-open2 + +set-open2: + ']' n set-after-lit doSetLiteral + default set-start + +# set-posix: +# scanned a '[:' If it really is a [:property:], doSetPosixProp will have +# moved the scan to the closing ']'. If it wasn't a property +# expression, the scan will still be at the opening ':', which should +# be interpreted as a normal set expression. +set-posix: + ']' n pop doSetEnd + ':' set-start + default errorDeath doRuleError # should not be possible. + +# +# set-start after the [ and special case leading characters (^ and/or ]) but before +# everything else. A '-' is literal at this point. +# +set-start: + ']' n pop doSetEnd + '[' n set-open ^set-after-set doSetBeginUnion + '\' n set-escape + '-' n set-start-dash + '&' n set-start-amp + default n set-after-lit doSetLiteral + +# set-start-dash Turn "[--" into a syntax error. +# "[-x" is good, - and x are literals. +# +set-start-dash: + '-' errorDeath doRuleError + default set-after-lit doSetAddDash + +# set-start-amp Turn "[&&" into a syntax error. +# "[&x" is good, & and x are literals. +# +set-start-amp: + '&' errorDeath doRuleError + default set-after-lit doSetAddAmp + +# +# set-after-lit The last thing scanned was a literal character within a set. +# Can be followed by anything. Single '-' or '&' are +# literals in this context, not operators. 
+set-after-lit: + ']' n pop doSetEnd + '[' n set-open ^set-after-set doSetBeginUnion + '-' n set-lit-dash + '&' n set-lit-amp + '\' n set-escape + eof errorDeath doSetNoCloseError + default n set-after-lit doSetLiteral + +set-after-set: + ']' n pop doSetEnd + '[' n set-open ^set-after-set doSetBeginUnion + '-' n set-set-dash + '&' n set-set-amp + '\' n set-escape + eof errorDeath doSetNoCloseError + default n set-after-lit doSetLiteral + +set-after-range: + ']' n pop doSetEnd + '[' n set-open ^set-after-set doSetBeginUnion + '-' n set-range-dash + '&' n set-range-amp + '\' n set-escape + eof errorDeath doSetNoCloseError + default n set-after-lit doSetLiteral + +# set-after-op +# After a -- or && +# It is an error to close a set at this point. +# +set-after-op: + '[' n set-open ^set-after-set doSetBeginUnion + ']' errorDeath doSetOpError + '\' n set-escape + default n set-after-lit doSetLiteral + +# +# set-set-amp +# Have scanned [[set]& +# Could be a '&' intersection operator, if a set follows. +# Could be the start of a '&&' operator. +# Otherwise is a literal. +set-set-amp: + '[' n set-open ^set-after-set doSetBeginIntersection1 + '&' n set-after-op doSetIntersection2 + default set-after-lit doSetAddAmp + + +# set-lit-amp Have scanned "[literals&" +# Could be a start of "&&" operator or a literal +# In [abc&[def]], the '&' is a literal +# +set-lit-amp: + '&' n set-after-op doSetIntersection2 + default set-after-lit doSetAddAmp + + +# +# set-set-dash +# Have scanned [set]- +# Could be a '-' difference operator, if a [set] follows. +# Could be the start of a '--' operator. +# Otherwise is a literal. +set-set-dash: + '[' n set-open ^set-after-set doSetBeginDifference1 + '-' n set-after-op doSetDifference2 + default set-after-lit doSetAddDash + + +# +# set-range-dash +# scanned a-b- or \w- +# any set or range like item where the trailing single '-' should +# be literal, not a set difference operation. +# A trailing "--" is still a difference operator. 
+set-range-dash: + '-' n set-after-op doSetDifference2 + default set-after-lit doSetAddDash + + +set-range-amp: + '&' n set-after-op doSetIntersection2 + default set-after-lit doSetAddAmp + + +# set-lit-dash +# Have scanned "[literals-" Could be a range or a -- operator or a literal +# In [abc-[def]], the '-' is a literal (confirmed with a Java test) +# [abc-\p{xx} the '-' is a literal +# [abc-] the '-' is a literal +# [ab-xy] the '-' is a range +# +set-lit-dash: + '-' n set-after-op doSetDifference2 + '[' set-after-lit doSetAddDash + ']' set-after-lit doSetAddDash + '\' n set-lit-dash-escape + default n set-after-range doSetRange + +# set-lit-dash-escape +# +# scanned "[literal-\" +# Could be a range, if the \ introduces an escaped literal char or a named char. +# Could be a literal '-', if the '\' introduces a set-like construct e.g. \s or \p{...} +# +set-lit-dash-escape: + 's' set-escape doSetAddDash + 'S' set-escape doSetAddDash + 'w' set-escape doSetAddDash + 'W' set-escape doSetAddDash + 'd' set-escape doSetAddDash + 'D' set-escape doSetAddDash + 'N' set-after-range doSetNamedRange + default n set-after-range doSetRange + + +# +# set-escape +# Common \ escape processing +# +set-escape: + 'p' set-after-set doSetProp + 'P' set-after-set doSetProp + 'N' set-after-lit doSetNamedChar + 's' n set-after-range doSetBackslash_s + 'S' n set-after-range doSetBackslash_S + 'w' n set-after-range doSetBackslash_w + 'W' n set-after-range doSetBackslash_W + 'd' n set-after-range doSetBackslash_d + 'D' n set-after-range doSetBackslash_D + default n set-after-lit doSetLiteralEscaped + +# +# set-finish +# Have just encountered the final ']' that completes a [set], and +# arrived here via a pop. From here, we exit the set parsing world, and go +# back to generic regular expression parsing. +# +set-finish: + default expr-quant doSetFinish + + # # errorDeath. This state is specified as the next state whenever a syntax error # in the source rules is detected. 
Barring bugs, the state machine will never diff --git a/i18n/regeximp.h b/i18n/regeximp.h index fbf70067..6944c08a 100644 --- a/i18n/regeximp.h +++ b/i18n/regeximp.h @@ -1,6 +1,6 @@ -// -// Copyright (C) 2002-2005 International Business Machines Corporation -// and others. All rights reserved. +// +// Copyright (C) 2002-2005 International Business Machines Corporation +// and others. All rights reserved. // // file: regeximp.h // @@ -66,16 +66,16 @@ enum { URX_NOP = 7, URX_START_CAPTURE = 8, // Value field is capture group number. URX_END_CAPTURE = 9, // Value field is capture group number - URX_STATIC_SETREF = 10, // Value field is index of set in array of sets. + URX_STATIC_SETREF = 10, // Value field is index of set in array of sets. URX_SETREF = 11, // Value field is index of set in array of sets. - URX_DOTANY = 12, + URX_DOTANY = 12, URX_JMP = 13, // Value field is destination position in // the pattern. URX_FAIL = 14, // Stop match operation, No match. URX_JMP_SAV = 15, // Operand: JMP destination location URX_BACKSLASH_B = 16, // Value field: 0: \b 1: \B - URX_BACKSLASH_G = 17, + URX_BACKSLASH_G = 17, URX_JMP_SAV_X = 18, // Conditional JMP_SAV, // Used in (x)+, breaks loop on zero length match. // Operand: Jmp destination. @@ -91,7 +91,7 @@ enum { URX_CTR_INIT_NG = 26, // 3 kinds, normal, non-greedy, and possessive. // These are 4 word opcodes. See description. // First Operand: Data loc of counter variable - // 2nd Operand: Pat loc of the URX_CTR_LOOPx + // 2nd Operand: Pat loc of the URX_CTR_LOOPx // at the end of the loop. // 3rd Operand: Minimum count. // 4th Operand: Max count, -1 for unbounded. @@ -118,7 +118,7 @@ enum { // within the matcher stack frame. URX_JMPX = 36, // Conditional JMP. // First Operand: JMP target location. - // Second Operand: Data location containing an + // Second Operand: Data location containing an // input position. 
If current input position == // saved input position, FAIL rather than taking // the JMP @@ -157,7 +157,7 @@ enum { URX_LBN_END = 48, // Negative LookBehind end // Parameter is the data location. // Check that the match ended at the right spot. - URX_STAT_SETREF_N = 49, // Reference to a prebuilt set (e.g. \w), negated + URX_STAT_SETREF_N = 49, // Reference to a prebuilt set (e.g. \w), negated // Operand is index of set in array of sets. URX_LOOP_SR_I = 50, // Init a [set]* loop. // Operand is the sets index in array of user sets. @@ -171,7 +171,7 @@ enum { URX_BACKSLASH_BU = 53 // \b or \B in UREGEX_UWORD mode, using Unicode style // word boundaries. -}; +}; // Keep this list of opcode names in sync with the above enum // Used for debug printing only. @@ -236,14 +236,14 @@ enum { // Convenience macros for assembling and disassembling a compiled operation. // #define URX_BUILD(type, val) (int32_t)((type << 24) | (val)) -#define URX_TYPE(x) ((uint32_t)(x) >> 24) +#define URX_TYPE(x) ((uint32_t)(x) >> 24) #define URX_VAL(x) ((x) & 0xffffff) - + // // Access to Unicode Sets composite character properties // The sets are accessed by the match engine for things like \w (word boundary) -// +// enum { URX_ISWORD_SET = 1, URX_ISALNUM_SET = 2, @@ -297,7 +297,7 @@ enum StartOfMatch { (v)==START_LINE? "START_LINE" : \ (v)==START_STRING? "START_STRING" : \ "ILLEGAL") - + // // 8 bit set, to fast-path latin-1 set membership tests. 
diff --git a/i18n/regexst.cpp b/i18n/regexst.cpp index 41014365..d624766b 100644 --- a/i18n/regexst.cpp +++ b/i18n/regexst.cpp @@ -59,9 +59,6 @@ static const UChar gRuleSet_rule_char_pattern[] = { static const UChar gRuleSet_digit_char_pattern[] = { // [ 0 - 9 ] 0x5b, 0x30, 0x2d, 0x39, 0x5d, 0}; -//static const UnicodeSet *gRuleDigits = NULL; - - // // Here are the backslash escape characters that ICU's unescape() function @@ -73,23 +70,13 @@ static const UChar gUnescapeCharPattern[] = { // -// White space characters that may appear within a pattern in free-form mode -// -static const UChar gRuleWhiteSpacePattern[] = { - /* "[[:Cf:][:WSpace:]]" */ - 91, 91, 58, 67, 102, 58, 93, 91, 58, 87, - 83, 112, 97, 99, 101, 58, 93, 93, 0 }; - - - -// // Unicode Set Definitions for Regular Expression \w // static const UChar gIsWordPattern[] = { // [ \ p { A l p h a b e t i c } 0x5b, 0x5c, 0x70, 0x7b, 0x61, 0x6c, 0x70, 0x68, 0x61, 0x62, 0x65, 0x74, 0x69, 0x63, 0x7d, // \ p { M } Mark - 0x5c, 0x70, 0x7b, 0x4d, 0x7d, + 0x5c, 0x70, 0x7b, 0x4d, 0x7d, // \ p { N d } Digit_Numeric 0x5c, 0x70, 0x7b, 0x4e, 0x64, 0x7d, // \ p { P c } ] Connector_Punctuation @@ -108,8 +95,8 @@ static const UChar gIsSpacePattern[] = { // UnicodeSets used in implementation of Grapheme Cluster detection, \X // static const UChar gGC_ControlPattern[] = { -// [ [ : Z l : ] [ : Z p : ] - 0x5b, 0x5b, 0x3a, 0x5A, 0x6c, 0x3a, 0x5d, 0x5b, 0x3a, 0x5A, 0x70, 0x3a, 0x5d, +// [ [ : Z l : ] [ : Z p : ] + 0x5b, 0x5b, 0x3a, 0x5A, 0x6c, 0x3a, 0x5d, 0x5b, 0x3a, 0x5A, 0x70, 0x3a, 0x5d, // [ : C c : ] [ : C f : ] - 0x5b, 0x3a, 0x43, 0x63, 0x3a, 0x5d, 0x5b, 0x3a, 0x43, 0x66, 0x3a, 0x5d, 0x2d, // [ : G r a p h e m e _ @@ -124,34 +111,35 @@ static const UChar gGC_ExtendPattern[] = { 0x45, 0x78, 0x74, 0x65, 0x6e, 0x64, 0x7d, 0x5d, 0}; static const UChar gGC_LPattern[] = { -// [ \ p { H a n g u l _ S y l +// [ \ p { H a n g u l _ S y l 0x5b, 0x5c, 0x70, 0x7b, 0x48, 0x61, 0x6e, 0x67, 0x75, 0x6c, 0x5f, 0x53, 0x79, 0x6c, // l a b l 
e _ T y p e = L } ] - 0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x4c, 0x7d, 0x5d, 0}; + 0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x4c, 0x7d, 0x5d, 0}; static const UChar gGC_VPattern[] = { -// [ \ p { H a n g u l _ S y l +// [ \ p { H a n g u l _ S y l 0x5b, 0x5c, 0x70, 0x7b, 0x48, 0x61, 0x6e, 0x67, 0x75, 0x6c, 0x5f, 0x53, 0x79, 0x6c, // l a b l e _ T y p e = V } ] - 0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x56, 0x7d, 0x5d, 0}; + 0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x56, 0x7d, 0x5d, 0}; static const UChar gGC_TPattern[] = { -// [ \ p { H a n g u l _ S y l +// [ \ p { H a n g u l _ S y l 0x5b, 0x5c, 0x70, 0x7b, 0x48, 0x61, 0x6e, 0x67, 0x75, 0x6c, 0x5f, 0x53, 0x79, 0x6c, // l a b l e _ T y p e = T } ] - 0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x54, 0x7d, 0x5d, 0}; + 0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x54, 0x7d, 0x5d, 0}; static const UChar gGC_LVPattern[] = { -// [ \ p { H a n g u l _ S y l +// [ \ p { H a n g u l _ S y l 0x5b, 0x5c, 0x70, 0x7b, 0x48, 0x61, 0x6e, 0x67, 0x75, 0x6c, 0x5f, 0x53, 0x79, 0x6c, // l a b l e _ T y p e = L V } ] - 0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x4c, 0x56, 0x7d, 0x5d, 0}; + 0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x4c, 0x56, 0x7d, 0x5d, 0}; static const UChar gGC_LVTPattern[] = { -// [ \ p { H a n g u l _ S y l +// [ \ p { H a n g u l _ S y l 0x5b, 0x5c, 0x70, 0x7b, 0x48, 0x61, 0x6e, 0x67, 0x75, 0x6c, 0x5f, 0x53, 0x79, 0x6c, // l a b l e _ T y p e = L V T } ] - 0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x4c, 0x56, 0x54, 0x7d, 0x5d, 0}; + 0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x54, 0x79, 0x70, 0x65, 0x3d, 0x4c, 0x56, 0x54, 0x7d, 0x5d, 0}; + RegexStaticSets *RegexStaticSets::gStaticSets = NULL; @@ -160,7 +148,7 @@ RegexStaticSets::RegexStaticSets(UErrorCode *status) fUnescapeCharSet(UnicodeString(TRUE, 
gUnescapeCharPattern, -1), *status), fRuleDigitsAlias(NULL) { - // First zero out everything + // First zero out everything int i; for (i=0; i<URX_LAST_SET; i++) { fPropSets[i] = NULL; @@ -171,7 +159,7 @@ fRuleDigitsAlias(NULL) // Then init the sets to their correct values. fPropSets[URX_ISWORD_SET] = new UnicodeSet(UnicodeString(TRUE, gIsWordPattern, -1), *status); - fPropSets[URX_ISSPACE_SET] = new UnicodeSet(UnicodeString(TRUE, gIsSpacePattern, -1), *status); + fPropSets[URX_ISSPACE_SET] = new UnicodeSet(UnicodeString(TRUE, gIsSpacePattern, -1), *status); fPropSets[URX_GC_EXTEND] = new UnicodeSet(UnicodeString(TRUE, gGC_ExtendPattern, -1), *status); fPropSets[URX_GC_CONTROL] = new UnicodeSet(UnicodeString(TRUE, gGC_ControlPattern, -1), *status); fPropSets[URX_GC_L] = new UnicodeSet(UnicodeString(TRUE, gGC_LPattern, -1), *status); @@ -184,14 +172,14 @@ fRuleDigitsAlias(NULL) // The rest of the initialization needs them, so we cannot proceed. return; } - - + + // // The following sets are dynamically constructed, because their // initialization strings would be unreasonable. // - - + + // // "Normal" is the set of characters that don't need special handling // when finding grapheme cluster boundaries. @@ -202,7 +190,7 @@ fRuleDigitsAlias(NULL) fPropSets[URX_GC_NORMAL]->removeAll(*fPropSets[URX_GC_L]); fPropSets[URX_GC_NORMAL]->removeAll(*fPropSets[URX_GC_V]); fPropSets[URX_GC_NORMAL]->removeAll(*fPropSets[URX_GC_T]); - + // Initialize the 8-bit fast bit sets from the parallel full // UnicodeSets. 
for (i=0; i<URX_LAST_SET; i++) { @@ -213,9 +201,8 @@ fRuleDigitsAlias(NULL) } // Sets used while parsing rules, but not referenced from the parse state table - fRuleSets[kRuleSet_rule_char-128] = new UnicodeSet(UnicodeString(TRUE, gRuleSet_rule_char_pattern, -1), *status); - fRuleSets[kRuleSet_white_space-128] = new UnicodeSet(UnicodeString(TRUE, gRuleWhiteSpacePattern, -1), *status); - fRuleSets[kRuleSet_digit_char-128] = new UnicodeSet(UnicodeString(TRUE, gRuleSet_digit_char_pattern, -1), *status); + fRuleSets[kRuleSet_rule_char-128] = new UnicodeSet(UnicodeString(TRUE, gRuleSet_rule_char_pattern, -1), *status); + fRuleSets[kRuleSet_digit_char-128] = new UnicodeSet(UnicodeString(TRUE, gRuleSet_digit_char_pattern, -1), *status); fRuleDigitsAlias = fRuleSets[kRuleSet_digit_char-128]; for (i=0; i<(int32_t)(sizeof(fRuleSets)/sizeof(fRuleSets[0])); i++) { if (fRuleSets[i]) { @@ -281,7 +268,7 @@ void RegexStaticSets::initGlobals(UErrorCode *status) { ucln_i18n_registerCleanup(UCLN_I18N_REGEX, regex_cleanup); } } - + U_NAMESPACE_END #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS diff --git a/i18n/repattrn.cpp b/i18n/repattrn.cpp index bcd11078..8cf55d7b 100644 --- a/i18n/repattrn.cpp +++ b/i18n/repattrn.cpp @@ -1,5 +1,5 @@ // -// file: repattrn.cpp +// file: repattrn.cpp // /* *************************************************************************** @@ -46,7 +46,7 @@ RegexPattern::RegexPattern() { // //-------------------------------------------------------------------------- RegexPattern::RegexPattern(const RegexPattern &other) : UObject(other) { - init(); + init(); *this = other; } @@ -78,9 +78,9 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) { fFrameSize = other.fFrameSize; fDataSize = other.fDataSize; fMaxCaptureDigits = other.fMaxCaptureDigits; - fStaticSets = other.fStaticSets; + fStaticSets = other.fStaticSets; fStaticSets8 = other.fStaticSets8; - + fStartType = other.fStartType; fInitialStringIdx = other.fInitialStringIdx; fInitialStringLen = 
other.fInitialStringLen; @@ -92,9 +92,9 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) { fCompiledPat->assign(*other.fCompiledPat, fDeferredStatus); fGroupMap->assign(*other.fGroupMap, fDeferredStatus); - // Copy the Unicode Sets. + // Copy the Unicode Sets. // Could be made more efficient if the sets were reference counted and shared, - // but I doubt that pattern copying will be particularly common. + // but I doubt that pattern copying will be particularly common. // Note: init() already added an empty element zero to fSets int32_t i; int32_t numSets = other.fSets->size(); @@ -135,7 +135,7 @@ void RegexPattern::init() { fFrameSize = 0; fDataSize = 0; fGroupMap = NULL; - fMaxCaptureDigits = 1; + fMaxCaptureDigits = 1; fStaticSets = NULL; fStaticSets8 = NULL; fStartType = START_NO_INFO; @@ -144,7 +144,7 @@ void RegexPattern::init() { fInitialChars = NULL; fInitialChar = 0; fInitialChars8 = NULL; - + fCompiledPat = new UVector32(fDeferredStatus); fGroupMap = new UVector32(fDeferredStatus); fSets = new UVector(fDeferredStatus); @@ -166,7 +166,7 @@ void RegexPattern::init() { //-------------------------------------------------------------------------- // -// zap Delete everything owned by this RegexPattern. +// zap Delete everything owned by this RegexPattern. 
// //-------------------------------------------------------------------------- void RegexPattern::zap() { @@ -208,7 +208,7 @@ RegexPattern::~RegexPattern() { // Clone // //-------------------------------------------------------------------------- -RegexPattern *RegexPattern::clone() const { +RegexPattern *RegexPattern::clone() const { RegexPattern *copy = new RegexPattern(*this); return copy; } @@ -229,7 +229,7 @@ UBool RegexPattern::operator ==(const RegexPattern &other) const { //--------------------------------------------------------------------- // -// compile +// compile // //--------------------------------------------------------------------- RegexPattern * U_EXPORT2 @@ -244,7 +244,7 @@ RegexPattern::compile(const UnicodeString &regex, } const uint32_t allFlags = UREGEX_CANON_EQ | UREGEX_CASE_INSENSITIVE | UREGEX_COMMENTS | - UREGEX_DOTALL | UREGEX_MULTILINE | UREGEX_UWORD; + UREGEX_DOTALL | UREGEX_MULTILINE | UREGEX_UWORD | UREGEX_ERROR_ON_UNKNOWN_ESCAPES; if ((flags & ~allFlags) != 0) { status = U_REGEX_INVALID_FLAG; @@ -269,19 +269,24 @@ RegexPattern::compile(const UnicodeString &regex, RegexCompile compiler(This, status); compiler.compile(regex, pe, status); + + if (U_FAILURE(status)) { + delete This; + This = NULL; + } return This; } - + // // compile with default flags. 
// RegexPattern * U_EXPORT2 RegexPattern::compile(const UnicodeString &regex, UParseError &pe, - UErrorCode &err) + UErrorCode &err) { - return compile(regex, 0, pe, err); + return compile(regex, 0, pe, err); } @@ -292,10 +297,10 @@ RegexPattern::compile(const UnicodeString &regex, RegexPattern * U_EXPORT2 RegexPattern::compile( const UnicodeString &regex, uint32_t flags, - UErrorCode &err) + UErrorCode &err) { UParseError pe; - return compile(regex, flags, pe, err); + return compile(regex, flags, pe, err); } @@ -326,7 +331,7 @@ RegexMatcher *RegexPattern::matcher(const UnicodeString &input, #if 0 RegexMatcher *RegexPattern::matcher(const UChar * /*input*/, - UErrorCode &status) const + UErrorCode &status) const { /* This should never get called. The API with UnicodeString should be called instead. */ if (U_SUCCESS(status)) { @@ -352,7 +357,7 @@ RegexMatcher *RegexPattern::matcher(UErrorCode &status) const { return NULL; } - retMatcher = new RegexMatcher(this); + retMatcher = new RegexMatcher(this); if (retMatcher == NULL) { status = U_MEMORY_ALLOCATION_ERROR; return NULL; @@ -440,7 +445,7 @@ void RegexPattern::dumpOp(int32_t index) const { if (pinnedType >= sizeof(opNames)/sizeof(char *)) { pinnedType = 0; } - + REGEX_DUMP_DEBUG_PRINTF(("%4d %08x %-15s ", index, op, opNames[pinnedType])); switch (type) { case URX_NOP: @@ -458,7 +463,7 @@ void RegexPattern::dumpOp(int32_t index) const { case URX_CARET_M: // Types with no operand field of interest. break; - + case URX_RESERVED_OP: case URX_START_CAPTURE: case URX_END_CAPTURE: @@ -494,12 +499,12 @@ void RegexPattern::dumpOp(int32_t index) const { // types with an integer operand field. 
REGEX_DUMP_DEBUG_PRINTF(("%d", val)); break; - + case URX_ONECHAR: case URX_ONECHAR_I: REGEX_DUMP_DEBUG_PRINTF(("%c", val<256?val:'?')); break; - + case URX_STRING: case URX_STRING_I: { @@ -543,7 +548,7 @@ void RegexPattern::dumpOp(int32_t index) const { } break; - + default: REGEX_DUMP_DEBUG_PRINTF(("??????")); break; @@ -554,7 +559,7 @@ void RegexPattern::dumpOp(int32_t index) const { #if defined(REGEX_DEBUG) -U_CAPI void U_EXPORT2 +U_CAPI void U_EXPORT2 RegexPatternDump(const RegexPattern *This) { int index; int i; @@ -565,7 +570,7 @@ RegexPatternDump(const RegexPattern *This) { } REGEX_DUMP_DEBUG_PRINTF(("\n")); REGEX_DUMP_DEBUG_PRINTF((" Min Match Length: %d\n", This->fMinMatchLen)); - REGEX_DUMP_DEBUG_PRINTF((" Match Start Type: %s\n", START_OF_MATCH_STR(This->fStartType))); + REGEX_DUMP_DEBUG_PRINTF((" Match Start Type: %s\n", START_OF_MATCH_STR(This->fStartType))); if (This->fStartType == START_STRING) { REGEX_DUMP_DEBUG_PRINTF((" Initial match sting: \"")); for (i=This->fInitialStringIdx; i<This->fInitialStringIdx+This->fInitialStringLen; i++) { @@ -580,7 +585,7 @@ RegexPatternDump(const RegexPattern *This) { REGEX_DUMP_DEBUG_PRINTF((" Match First Chars : ")); for (i=0; i<numSetChars; i++) { UChar32 c = This->fInitialChars->charAt(i); - if (0x20<c && c <0x7e) { + if (0x20<c && c <0x7e) { REGEX_DUMP_DEBUG_PRINTF(("%c ", c)); } else { REGEX_DUMP_DEBUG_PRINTF(("%#x ", c)); diff --git a/i18n/unicode/regex.h b/i18n/unicode/regex.h index ce24ef5f..27f4b404 100644 --- a/i18n/unicode/regex.h +++ b/i18n/unicode/regex.h @@ -16,7 +16,7 @@ #ifndef REGEX_H #define REGEX_H -//#define REGEX_DEBUG +// #define REGEX_DEBUG /** * \file @@ -36,7 +36,7 @@ * operations, for search and replace operations, and for obtaining detailed * information about bounds of a match. 
</p> * - * <p>Note that by constructing <code>RegexMatcher</code> objects directly from regular + * <p>Note that by constructing <code>RegexMatcher</code> objects directly from regular * expression pattern strings application code can be simplified and the explicit * need for <code>RegexPattern</code> objects can usually be eliminated. * </p> @@ -480,7 +480,7 @@ public: * critical that the string not be altered or deleted before use by the regular * expression operations is complete. * - * @param regexp The Regular Expression to be compiled. + * @param regexp The Regular Expression to be compiled. * @param input The string to match. The matcher retains a reference to the * caller's string; mo copy is made. * @param flags Regular expression options, such as case insensitive matching. @@ -709,13 +709,13 @@ public: /** * Resets this matcher with a new input string. This allows instances of RegexMatcher * to be reused, which is more efficient than creating a new RegexMatcher for - * each input string to be processed. + * each input string to be processed. * @param input The new string on which subsequent pattern matches will operate. * The matcher retains a reference to the callers string, and operates * directly on that. Ownership of the string remains with the caller. * Because no copy of the string is made, it is essential that the * caller not delete the string until after regexp operations on it - * are done. + * are done. * @return this RegexMatcher. * @stable ICU 2.4 */ diff --git a/i18n/unicode/uregex.h b/i18n/unicode/uregex.h index 862cf344..3b0e7018 100644 --- a/i18n/unicode/uregex.h +++ b/i18n/unicode/uregex.h @@ -73,7 +73,17 @@ typedef enum URegexpFlag{ * http://unicode.org/reports/tr29/#Word_Boundaries * @stable ICU 2.8 */ - UREGEX_UWORD = 256 + UREGEX_UWORD = 256, + + /** Error on Unrecognized backslash escapes. + * If set, fail with an error on patterns that contain + * backslash-escaped ASCII letters without a known special + * meaning. 
If this flag is not set, these + * escaped letters represent themselves. + * @draft ICU 4.0 + */ + UREGEX_ERROR_ON_UNKNOWN_ESCAPES = 512 + } URegexpFlag; /** |