#
#  start
#       Entry state.  The scan position is at the very beginning of the pattern.
#       Only a '[' or a '\' is accepted here; anything else is a rule error.
#
start:
    '['                  n  set-open            ^set-finish
    '\'                  n  set-escape          ^set-finish
    default                 errorDeath                              doRuleError

#
#  [set expression] parsing.
#       Every state that takes part in parsing a set expression has a name
#       that begins with "set-".
#
set-open:
    '^'                  n  set-open2                               doSetNegate
    ':'                     set-posix                               doSetPosixProp
    default                 set-open2

set-open2:
    ']'                  n  set-after-lit                           doSetLiteral
    default                 set-start

#
#  set-posix
#       A '[:' has been scanned.  If it really is a [:property:],
#       doSetPosixProp will have moved the scan to the closing ']'.
#       If it was not a property expression, the scan is still at the
#       opening ':', which is then treated as part of a normal set expression.
#
set-posix:
    ']'                  n  pop                                     doSetEnd
    ':'                     set-start
    default                 errorDeath                              doRuleError    # should not be possible.

#
#  set-start
#       Positioned after the '[' and after any special-cased leading
#       characters ('^' and/or ']'), but before everything else.
#       A '-' is a literal at this point.
#
set-start:
    ']'                  n  pop                                     doSetEnd
    '['                  n  set-open            ^set-after-set      doSetBeginUnion
    '\'                  n  set-escape
    '-'                  n  set-start-dash
    '&'                  n  set-start-amp
    default              n  set-after-lit                           doSetLiteral

#
#  set-start-dash
#       Makes "[--" a syntax error.
#       "[-x" is fine: both '-' and 'x' are literals.
#
set-start-dash:
    '-'                     errorDeath                              doRuleError
    default                 set-after-lit                           doSetAddDash

#
#  set-start-amp
#       Makes "[&&" a syntax error.
#       "[&x" is fine: both '&' and 'x' are literals.
#
set-start-amp:
    '&'                     errorDeath                              doRuleError
    default                 set-after-lit                           doSetAddAmp

#
#  set-after-lit
#       The last thing scanned was a literal character within a set.
#       Anything may follow.  A single '-' or '&' is a literal in this
#       context, not an operator.
#
set-after-lit:
    ']'                  n  pop                                     doSetEnd
    '['                  n  set-open            ^set-after-set      doSetBeginUnion
    '-'                  n  set-lit-dash
    '&'                  n  set-lit-amp
    '\'                  n  set-escape
    eof                     errorDeath                              doSetNoCloseError
    default              n  set-after-lit                           doSetLiteral

#
#  set-after-set
#       Same rules as set-after-lit, except that '-' and '&' lead to
#       set-set-dash and set-set-amp.
#
set-after-set:
    ']'                  n  pop                                     doSetEnd
    '['                  n  set-open            ^set-after-set      doSetBeginUnion
    '-'                  n  set-set-dash
    '&'                  n  set-set-amp
    '\'                  n  set-escape
    eof                     errorDeath                              doSetNoCloseError
    default              n  set-after-lit                           doSetLiteral

#
#  set-after-range
#       Same rules as set-after-lit, except that '-' and '&' lead to
#       set-range-dash and set-range-amp.
#
set-after-range:
    ']'                  n  pop                                     doSetEnd
    '['                  n  set-open            ^set-after-set      doSetBeginUnion
    '-'                  n  set-range-dash
    '&'                  n  set-range-amp
    '\'                  n  set-escape
    eof                     errorDeath                              doSetNoCloseError
    default              n  set-after-lit                           doSetLiteral

#
#  set-after-op
#       Reached after a "--" or "&&".
#       Closing the set at this point is an error.
#
set-after-op:
    '['                  n  set-open            ^set-after-set      doSetBeginUnion
    ']'                     errorDeath                              doSetOpError
    '\'                  n  set-escape
    default              n  set-after-lit                           doSetLiteral

#
#  set-set-amp
#       Have scanned "[[set]&".
#       This may be a '&' intersection operator, if a set follows,
#       or the start of a "&&" operator.
#       Otherwise the '&' is a literal.
#
set-set-amp:
    '['                  n  set-open            ^set-after-set      doSetBeginIntersection1
    '&'                  n  set-after-op                            doSetIntersection2
    default                 set-after-lit                           doSetAddAmp

#
#  set-lit-amp
#       Have scanned "[literals&".
#       This may be the start of a "&&" operator, or a literal.
#       In [abc&[def]], the '&' is a literal.
#
set-lit-amp:
    '&'                  n  set-after-op                            doSetIntersection2
    default                 set-after-lit                           doSetAddAmp

#
#  set-set-dash
#       Have scanned "[set]-".
#       This may be a '-' difference operator, if a [set] follows,
#       or the start of a "--" operator.
#       Otherwise the '-' is a literal.
#
set-set-dash:
    '['                  n  set-open            ^set-after-set      doSetBeginDifference1
    '-'                  n  set-after-op                            doSetDifference2
    default                 set-after-lit                           doSetAddDash

#
#  set-range-dash
#       Have scanned "a-b-" or "\w-":
#       any set-like or range-like item after which a trailing single '-'
#       should be a literal, not a set difference operation.
#       A trailing "--" is still a difference operator.
#
set-range-dash:
    '-'                  n  set-after-op                            doSetDifference2
    default                 set-after-lit                           doSetAddDash

#
#  set-range-amp
#       Counterpart of set-range-dash for '&': "&&" is the intersection
#       operator, a single '&' is added as a literal.
#
set-range-amp:
    '&'                  n  set-after-op                            doSetIntersection2
    default                 set-after-lit                           doSetAddAmp

#
#  set-lit-dash
#       Have scanned "[literals-".  Could be a range, a "--" operator, or a literal.
#         In [abc-[def]], the '-' is a literal (confirmed with a Java test)
#            [abc-\p{xx}  the '-' is an error
#            [abc-]       the '-' is a literal
#            [ab-xy]      the '-' is a range
#
set-lit-dash:
    '-'                  n  set-after-op                            doSetDifference2
    '['                     set-after-lit                           doSetAddDash
    ']'                     set-after-lit                           doSetAddDash
    '\'                  n  set-lit-dash-escape
    default              n  set-after-range                         doSetRange

#
#  set-lit-dash-escape
#       Have scanned "[literal-\".
#       Could be a range, if the '\' introduces an escaped literal char or a
#       named char.  Otherwise it is an error.
#
#       NOTE(review): the set-lit-dash comment lists "[abc-\p{xx}" as an error,
#       but there is no 'p' / 'P' rule here, so those fall through to the
#       default rule - confirm that doSetRange handles this case.
#
set-lit-dash-escape:
    's'                     errorDeath                              doSetOpError
    'S'                     errorDeath                              doSetOpError
    'w'                     errorDeath                              doSetOpError
    'W'                     errorDeath                              doSetOpError
    'd'                     errorDeath                              doSetOpError
    'D'                     errorDeath                              doSetOpError
    'N'                     set-name-start      ^set-after-range    doStartNamedChar
    'x'                     set-hex-start       ^set-after-range    doStartHex
    default              n  set-after-range                         doSetRange
    # TODO fix 'N', 'x'
    # NOTE(review): unlike the 'N' and 'x' rules in set-escape, the two rules
    #               above carry no 'n' flag - presumably part of what the TODO
    #               refers to.  Left unchanged until the range handling of the
    #               named-char / hex actions is settled.

#
#  set-escape
#       Common back-slash escape processing within set expressions.
#
set-escape:
    'p'                  n  set-prop-start      ^set-after-set      doStartSetProp
    'P'                  n  set-prop-start      ^set-after-set      doStartSetProp
    'N'                  n  set-name-start      ^set-after-lit      doStartNamedChar
    'x'                  n  set-hex-start       ^set-after-lit      doStartHex
    's'                  n  set-after-range                         doSetBackslash_s
    'S'                  n  set-after-range                         doSetBackslash_S
    'w'                  n  set-after-range                         doSetBackslash_w
    'W'                  n  set-after-range                         doSetBackslash_W
    'd'                  n  set-after-range                         doSetBackslash_d
    'D'                  n  set-after-range                         doSetBackslash_D
    default              n  set-after-lit                           doSetLiteralEscaped
    # TODO add \r, \n, etc

#
#  set-prop-start
#       Have scanned "\p" or "\P".  A '{' must follow.
#       Like every other error transition in this table, the failure path
#       names an error action.
#
set-prop-start:
    '{'                  n  set-prop-cont
    default                 errorDeath                              doRuleError

#
#  set-prop-cont
#       Inside "\p{", scanning the property name up to '}', '=' or '≠'.
#       The eof rule stops the scan on an unterminated property; without it,
#       end of input would fall through to the default rule and re-enter
#       this state.
#
set-prop-cont:
    '}'                  n  pop                                     doPropName
    '='                  n  set-value                               doPropRelation
    '≠'                  n  set-value                               doPropRelation
    eof                     errorDeath                              doRuleError
    default              n  set-prop-cont

#
#  set-value
#       Inside "\p{name=" (or "≠"), scanning the property value up to '}'.
#       The eof rule is needed for the same reason as in set-prop-cont.
#
set-value:
    '}'                  n  pop                                     doPropValue
    eof                     errorDeath                              doRuleError
    default              n  set-value

#
#  set-name-start
#       Have scanned "\N".  A '{' must follow.
#
set-name-start:
    '{'                  n  set-name-cont
    default                 errorDeath                              doRuleError

#
#  set-name-cont
#       Inside "\N{", scanning the character name up to '}'.
#       Only space, '-', digits and ASCII letters are accepted.
#
set-name-cont:
    '}'                  n  pop                                     doName
    [\ \-0-9A-Za-z]      n  set-name-cont
    default              n  errorDeath                              doRuleError

#
#  set-hex-start
#       Have scanned "\x".  A '{' must follow.
#
set-hex-start:
    '{'                  n  set-hex-cont
    default                 errorDeath                              doRuleError

#
#  set-hex-cont
#       Inside "\x{", scanning hex digits up to '}'.
#
set-hex-cont:
    '}'                  n  pop                                     doHex
    [0-9A-Fa-f]          n  set-hex-cont
    default              n  errorDeath                              doRuleError

#
#  set-finish
#       Have just encountered the final ']' that completes a [set], and
#       arrived here via a pop.  From here, we exit the set parsing world, and
#       go back to generic regular expression parsing.
#
set-finish:
    default                 exit                                    doSetFinish