aboutsummaryrefslogtreecommitdiff
path: root/src/main/java/org/apache/commons/lang3/text/StrTokenizer.java
diff options
context:
space:
mode:
Diffstat (limited to 'src/main/java/org/apache/commons/lang3/text/StrTokenizer.java')
-rw-r--r--src/main/java/org/apache/commons/lang3/text/StrTokenizer.java1109
1 files changed, 1109 insertions, 0 deletions
diff --git a/src/main/java/org/apache/commons/lang3/text/StrTokenizer.java b/src/main/java/org/apache/commons/lang3/text/StrTokenizer.java
new file mode 100644
index 000000000..3236329a6
--- /dev/null
+++ b/src/main/java/org/apache/commons/lang3/text/StrTokenizer.java
@@ -0,0 +1,1109 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.lang3.text;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+import java.util.ListIterator;
+import java.util.NoSuchElementException;
+import java.util.StringTokenizer;
+
+import org.apache.commons.lang3.ArrayUtils;
+import org.apache.commons.lang3.StringUtils;
+
+/**
+ * Tokenizes a string based on delimiters (separators)
+ * and supporting quoting and ignored character concepts.
+ * <p>
+ * This class can split a String into many smaller strings. It aims
+ * to do a similar job to {@link java.util.StringTokenizer StringTokenizer},
+ * however it offers much more control and flexibility including implementing
+ * the {@link ListIterator} interface. By default, it is set up
+ * like {@link StringTokenizer}.
+ * </p>
+ * <p>
+ * The input String is split into a number of <i>tokens</i>.
+ * Each token is separated from the next String by a <i>delimiter</i>.
+ * One or more delimiter characters must be specified.
+ * </p>
+ * <p>
+ * Each token may be surrounded by quotes.
+ * The <i>quote</i> matcher specifies the quote character(s).
+ * A quote may be escaped within a quoted section by duplicating itself.
+ * </p>
+ * <p>
+ * Between each token and the delimiter are potentially characters that need trimming.
+ * The <i>trimmer</i> matcher specifies these characters.
+ * One usage might be to trim whitespace characters.
+ * </p>
+ * <p>
+ * At any point outside the quotes there might potentially be invalid characters.
+ * The <i>ignored</i> matcher specifies these characters to be removed.
+ * One usage might be to remove new line characters.
+ * </p>
+ * <p>
+ * Empty tokens may be removed or returned as null.
+ * </p>
+ * <pre>
+ * "a,b,c" - Three tokens "a","b","c" (comma delimiter)
+ * " a, b , c " - Three tokens "a","b","c" (default CSV processing trims whitespace)
+ * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
+ * </pre>
+ *
+ * <table>
+ * <caption>StrTokenizer properties and options</caption>
+ * <tr>
+ * <th>Property</th><th>Type</th><th>Default</th>
+ * </tr>
+ * <tr>
+ * <td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td>
+ * </tr>
+ * <tr>
+ * <td>quote</td><td>NoneMatcher</td><td>{}</td>
+ * </tr>
+ * <tr>
+ * <td>ignore</td><td>NoneMatcher</td><td>{}</td>
+ * </tr>
+ * <tr>
+ * <td>emptyTokenAsNull</td><td>boolean</td><td>false</td>
+ * </tr>
+ * <tr>
+ * <td>ignoreEmptyTokens</td><td>boolean</td><td>true</td>
+ * </tr>
+ * </table>
+ *
+ * @since 2.2
+ * @deprecated As of 3.6, use Apache Commons Text
+ * <a href="https://commons.apache.org/proper/commons-text/javadocs/api-release/org/apache/commons/text/StringTokenizer.html">
+ * StringTokenizer</a> instead
+ */
+@Deprecated
+public class StrTokenizer implements ListIterator<String>, Cloneable {
+
+ private static final StrTokenizer CSV_TOKENIZER_PROTOTYPE;
+ private static final StrTokenizer TSV_TOKENIZER_PROTOTYPE;
+ static {
+ CSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
+ CSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.commaMatcher());
+ CSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
+ CSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
+ CSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
+ CSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
+ CSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
+
+ TSV_TOKENIZER_PROTOTYPE = new StrTokenizer();
+ TSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StrMatcher.tabMatcher());
+ TSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StrMatcher.doubleQuoteMatcher());
+ TSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StrMatcher.noneMatcher());
+ TSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StrMatcher.trimMatcher());
+ TSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
+ TSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
+ }
+
+ /** The text to work on. */
+ private char[] chars;
+ /** The parsed tokens */
+ private String[] tokens;
+ /** The current iteration position */
+ private int tokenPos;
+
+ /** The delimiter matcher */
+ private StrMatcher delimMatcher = StrMatcher.splitMatcher();
+ /** The quote matcher */
+ private StrMatcher quoteMatcher = StrMatcher.noneMatcher();
+ /** The ignored matcher */
+ private StrMatcher ignoredMatcher = StrMatcher.noneMatcher();
+ /** The trimmer matcher */
+ private StrMatcher trimmerMatcher = StrMatcher.noneMatcher();
+
+ /** Whether to return empty tokens as null */
+ private boolean emptyAsNull;
+ /** Whether to ignore empty tokens */
+ private boolean ignoreEmptyTokens = true;
+
+
+ /**
+ * Returns a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
+ *
+ * @return a clone of {@code CSV_TOKENIZER_PROTOTYPE}.
+ */
+ private static StrTokenizer getCSVClone() {
+ return (StrTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
+ }
+
+ /**
+ * Gets a new tokenizer instance which parses Comma Separated Value strings,
+ * with no text to tokenize yet. The default for CSV processing
+ * is to trim whitespace from both ends (which can be overridden with
+ * {@link #setTrimmerMatcher(StrMatcher)}).
+ * <p>
+ * You must call a "reset" method to set the string which you want to parse.
+ * </p>
+ * @return a new tokenizer instance which parses Comma Separated Value strings
+ */
+ public static StrTokenizer getCSVInstance() {
+ return getCSVClone();
+ }
+
+ /**
+ * Gets a new tokenizer instance which parses Comma Separated Value strings
+ * initializing it with the given input. The default for CSV processing
+ * is to trim whitespace from both ends (which can be overridden with
+ * {@link #setTrimmerMatcher(StrMatcher)}).
+ *
+ * @param input the text to parse
+ * @return a new tokenizer instance which parses Comma Separated Value strings
+ */
+ public static StrTokenizer getCSVInstance(final String input) {
+ final StrTokenizer tok = getCSVClone();
+ tok.reset(input);
+ return tok;
+ }
+
+ /**
+ * Gets a new tokenizer instance which parses Comma Separated Value strings
+ * initializing it with the given input. The default for CSV processing
+ * is to trim whitespace from both ends (which can be overridden with
+ * {@link #setTrimmerMatcher(StrMatcher)}).
+ *
+ * @param input the characters to parse
+ * @return a new tokenizer instance which parses Comma Separated Value strings
+ */
+ public static StrTokenizer getCSVInstance(final char[] input) {
+ final StrTokenizer tok = getCSVClone();
+ tok.reset(input);
+ return tok;
+ }
+
+ /**
+ * Returns a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
+ *
+ * @return a clone of {@code TSV_TOKENIZER_PROTOTYPE}.
+ */
+ private static StrTokenizer getTSVClone() {
+ return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
+ }
+
+
+ /**
+ * Gets a new tokenizer instance which parses Tab Separated Value strings.
+ * The default for TSV processing is to trim whitespace from both ends
+ * (which can be overridden with {@link #setTrimmerMatcher(StrMatcher)}).
+ * <p>
+ * You must call a "reset" method to set the string which you want to parse.
+ * </p>
+ * @return a new tokenizer instance which parses Tab Separated Value strings.
+ */
+ public static StrTokenizer getTSVInstance() {
+ return getTSVClone();
+ }
+
+ /**
+ * Gets a new tokenizer instance which parses Tab Separated Value strings.
+ * The default for TSV processing is to trim whitespace from both ends
+ * (which can be overridden with {@link #setTrimmerMatcher(StrMatcher)}).
+ * @param input the string to parse
+ * @return a new tokenizer instance which parses Tab Separated Value strings.
+ */
+ public static StrTokenizer getTSVInstance(final String input) {
+ final StrTokenizer tok = getTSVClone();
+ tok.reset(input);
+ return tok;
+ }
+
+ /**
+ * Gets a new tokenizer instance which parses Tab Separated Value strings.
+ * The default for TSV processing is to trim whitespace from both ends
+ * (which can be overridden with {@link #setTrimmerMatcher(StrMatcher)}).
+ * @param input the characters to parse
+ * @return a new tokenizer instance which parses Tab Separated Value strings.
+ */
+ public static StrTokenizer getTSVInstance(final char[] input) {
+ final StrTokenizer tok = getTSVClone();
+ tok.reset(input);
+ return tok;
+ }
+
+ /**
+ * Constructs a tokenizer splitting on space, tab, newline and formfeed
+ * as per StringTokenizer, but with no text to tokenize.
+ * <p>
+ * This constructor is normally used with {@link #reset(String)}.
+ * </p>
+ */
+ public StrTokenizer() {
+ this.chars = null;
+ }
+
+ /**
+ * Constructs a tokenizer splitting on space, tab, newline and formfeed
+ * as per StringTokenizer.
+ *
+ * @param input the string which is to be parsed
+ */
+ public StrTokenizer(final String input) {
+ if (input != null) {
+ chars = input.toCharArray();
+ } else {
+ chars = null;
+ }
+ }
+
+ /**
+ * Constructs a tokenizer splitting on the specified delimiter character.
+ *
+ * @param input the string which is to be parsed
+ * @param delim the field delimiter character
+ */
+ public StrTokenizer(final String input, final char delim) {
+ this(input);
+ setDelimiterChar(delim);
+ }
+
+ /**
+ * Constructs a tokenizer splitting on the specified delimiter string.
+ *
+ * @param input the string which is to be parsed
+ * @param delim the field delimiter string
+ */
+ public StrTokenizer(final String input, final String delim) {
+ this(input);
+ setDelimiterString(delim);
+ }
+
+ /**
+ * Constructs a tokenizer splitting using the specified delimiter matcher.
+ *
+ * @param input the string which is to be parsed
+ * @param delim the field delimiter matcher
+ */
+ public StrTokenizer(final String input, final StrMatcher delim) {
+ this(input);
+ setDelimiterMatcher(delim);
+ }
+
+ /**
+ * Constructs a tokenizer splitting on the specified delimiter character
+ * and handling quotes using the specified quote character.
+ *
+ * @param input the string which is to be parsed
+ * @param delim the field delimiter character
+ * @param quote the field quoted string character
+ */
+ public StrTokenizer(final String input, final char delim, final char quote) {
+ this(input, delim);
+ setQuoteChar(quote);
+ }
+
+ /**
+ * Constructs a tokenizer splitting using the specified delimiter matcher
+ * and handling quotes using the specified quote matcher.
+ *
+ * @param input the string which is to be parsed
+ * @param delim the field delimiter matcher
+ * @param quote the field quoted string matcher
+ */
+ public StrTokenizer(final String input, final StrMatcher delim, final StrMatcher quote) {
+ this(input, delim);
+ setQuoteMatcher(quote);
+ }
+
+ /**
+ * Constructs a tokenizer splitting on space, tab, newline and formfeed
+ * as per StringTokenizer.
+ *
+ * @param input the characters which are to be parsed, stored via {@code ArrayUtils.clone(input)}
+ */
+ public StrTokenizer(final char[] input) {
+ this.chars = ArrayUtils.clone(input);
+ }
+
+ /**
+ * Constructs a tokenizer splitting on the specified character.
+ *
+ * @param input the characters which are to be parsed, stored via {@code ArrayUtils.clone} by {@link #StrTokenizer(char[])}
+ * @param delim the field delimiter character
+ */
+ public StrTokenizer(final char[] input, final char delim) {
+ this(input);
+ setDelimiterChar(delim);
+ }
+
+ /**
+ * Constructs a tokenizer splitting on the specified string.
+ *
+ * @param input the characters which are to be parsed, stored via {@code ArrayUtils.clone} by {@link #StrTokenizer(char[])}
+ * @param delim the field delimiter string
+ */
+ public StrTokenizer(final char[] input, final String delim) {
+ this(input);
+ setDelimiterString(delim);
+ }
+
+ /**
+ * Constructs a tokenizer splitting using the specified delimiter matcher.
+ *
+ * @param input the characters which are to be parsed, stored via {@code ArrayUtils.clone} by {@link #StrTokenizer(char[])}
+ * @param delim the field delimiter matcher
+ */
+ public StrTokenizer(final char[] input, final StrMatcher delim) {
+ this(input);
+ setDelimiterMatcher(delim);
+ }
+
+ /**
+ * Constructs a tokenizer splitting on the specified delimiter character
+ * and handling quotes using the specified quote character.
+ *
+ * @param input the characters which are to be parsed, stored via {@code ArrayUtils.clone} by {@link #StrTokenizer(char[])}
+ * @param delim the field delimiter character
+ * @param quote the field quoted string character
+ */
+ public StrTokenizer(final char[] input, final char delim, final char quote) {
+ this(input, delim);
+ setQuoteChar(quote);
+ }
+
+ /**
+ * Constructs a tokenizer splitting using the specified delimiter matcher
+ * and handling quotes using the specified quote matcher.
+ *
+ * @param input the characters which are to be parsed, stored via {@code ArrayUtils.clone} by {@link #StrTokenizer(char[])}
+ * @param delim the field delimiter matcher
+ * @param quote the field quoted string matcher
+ */
+ public StrTokenizer(final char[] input, final StrMatcher delim, final StrMatcher quote) {
+ this(input, delim);
+ setQuoteMatcher(quote);
+ }
+
+ // API
+ /**
+ * Gets the number of tokens found in the String.
+ *
+ * @return the number of matched tokens
+ */
+ public int size() {
+ checkTokenized();
+ return tokens.length;
+ }
+
+ /**
+ * Gets the next token from the String.
+ * Equivalent to {@link #next()} except it returns null rather than
+ * throwing {@link NoSuchElementException} when no tokens remain.
+ *
+ * @return the next sequential token, or null when no more tokens are found
+ */
+ public String nextToken() {
+ if (hasNext()) {
+ return tokens[tokenPos++];
+ }
+ return null;
+ }
+
+ /**
+ * Gets the previous token from the String.
+ *
+ * @return the previous sequential token, or null when no more tokens are found
+ */
+ public String previousToken() {
+ if (hasPrevious()) {
+ return tokens[--tokenPos];
+ }
+ return null;
+ }
+
+ /**
+ * Gets a copy of the full token list as an independent modifiable array.
+ *
+ * @return the tokens as a String array
+ */
+ public String[] getTokenArray() {
+ checkTokenized();
+ return tokens.clone();
+ }
+
+ /**
+ * Gets a copy of the full token list as an independent modifiable list.
+ *
+ * @return the tokens as a new modifiable {@link List} of Strings
+ */
+ public List<String> getTokenList() {
+ checkTokenized();
+ final List<String> list = new ArrayList<>(tokens.length);
+ list.addAll(Arrays.asList(tokens));
+ return list;
+ }
+
+ /**
+ * Resets this tokenizer, forgetting all parsing and iteration already completed.
+ * <p>
+ * This method allows the same tokenizer to be reused for the same String.
+ * </p>
+ *
+ * @return this, to enable chaining
+ */
+ public StrTokenizer reset() {
+ tokenPos = 0;
+ tokens = null;
+ return this;
+ }
+
+ /**
+ * Reset this tokenizer, giving it a new input string to parse.
+ * In this manner you can re-use a tokenizer with the same settings
+ * on multiple input lines.
+ *
+ * @param input the new string to tokenize, null sets no text to parse
+ * @return this, to enable chaining
+ */
+ public StrTokenizer reset(final String input) {
+ reset();
+ if (input != null) {
+ this.chars = input.toCharArray();
+ } else {
+ this.chars = null;
+ }
+ return this;
+ }
+
+ /**
+ * Reset this tokenizer, giving it a new input string to parse.
+ * In this manner you can re-use a tokenizer with the same settings
+ * on multiple input lines.
+ *
+ * @param input the new character array to tokenize (stored via {@code ArrayUtils.clone(input)}), null sets no text to parse
+ * @return this, to enable chaining
+ */
+ public StrTokenizer reset(final char[] input) {
+ reset();
+ this.chars = ArrayUtils.clone(input);
+ return this;
+ }
+
+ /**
+ * Checks whether there are any more tokens.
+ *
+ * @return true if there are more tokens
+ */
+ @Override
+ public boolean hasNext() {
+ checkTokenized();
+ return tokenPos < tokens.length;
+ }
+
+ /**
+ * Gets the next token.
+ *
+ * @return the next String token
+ * @throws NoSuchElementException if there are no more elements
+ */
+ @Override
+ public String next() {
+ if (hasNext()) {
+ return tokens[tokenPos++];
+ }
+ throw new NoSuchElementException();
+ }
+
+ /**
+ * Gets the index of the next token to return.
+ *
+ * @return the next token index
+ */
+ @Override
+ public int nextIndex() {
+ return tokenPos;
+ }
+
+ /**
+ * Checks whether there are any previous tokens that can be iterated to.
+ *
+ * @return true if there are previous tokens
+ */
+ @Override
+ public boolean hasPrevious() {
+ checkTokenized();
+ return tokenPos > 0;
+ }
+
+ /**
+ * Gets the token previous to the last returned token.
+ *
+ * @return the previous token
+ * @throws NoSuchElementException if there are no previous elements
+ */
+ @Override
+ public String previous() {
+ if (hasPrevious()) {
+ return tokens[--tokenPos];
+ }
+ throw new NoSuchElementException();
+ }
+
+ /**
+ * Gets the index of the previous token.
+ *
+ * @return the previous token index
+ */
+ @Override
+ public int previousIndex() {
+ return tokenPos - 1;
+ }
+
+ /**
+ * Unsupported ListIterator operation.
+ *
+ * @throws UnsupportedOperationException always
+ */
+ @Override
+ public void remove() {
+ throw new UnsupportedOperationException("remove() is unsupported");
+ }
+
+ /**
+ * Unsupported ListIterator operation.
+ * @param obj this parameter ignored.
+ * @throws UnsupportedOperationException always
+ */
+ @Override
+ public void set(final String obj) {
+ throw new UnsupportedOperationException("set() is unsupported");
+ }
+
+ /**
+ * Unsupported ListIterator operation.
+ * @param obj this parameter ignored.
+ * @throws UnsupportedOperationException always
+ */
+ @Override
+ public void add(final String obj) {
+ throw new UnsupportedOperationException("add() is unsupported");
+ }
+
+ /**
+ * Checks if tokenization has been done, and if not then do it.
+ */
+ private void checkTokenized() {
+ if (tokens == null) {
+ if (chars == null) {
+ // still call tokenize as subclass may do some work
+ final List<String> split = tokenize(null, 0, 0);
+ tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY);
+ } else {
+ final List<String> split = tokenize(chars, 0, chars.length);
+ tokens = split.toArray(ArrayUtils.EMPTY_STRING_ARRAY);
+ }
+ }
+ }
+
+ /**
+ * Internal method that performs the tokenization.
+ * <p>
+ * Most users of this class do not need to call this method. This method
+ * will be called automatically by other (public) methods when required.
+ * </p>
+ * <p>
+ * This method exists to allow subclasses to add code before or after the
+ * tokenization. For example, a subclass could alter the character array,
+ * offset or count to be parsed, or call the tokenizer multiple times on
+ * multiple strings. It is also possible to filter the results.
+ * </p>
+ * <p>
+ * {@link StrTokenizer} will always pass a zero offset and a count
+ * equal to the length of the array to this method, however a subclass
+ * may pass other values, or even an entirely different array.
+ * Only the region {@code [offset, offset + count)} is tokenized.
+ * </p>
+ *
+ * @param srcChars the character array being tokenized, may be null
+ * @param offset the start position within the character array, must be valid
+ * @param count the number of characters to tokenize, must be valid
+ * @return the modifiable list of String tokens, or an unmodifiable empty list
+ * if {@code ArrayUtils.isEmpty(srcChars)} reports the array as empty
+ */
+ protected List<String> tokenize(final char[] srcChars, final int offset, final int count) {
+ if (ArrayUtils.isEmpty(srcChars)) {
+ return Collections.emptyList();
+ }
+ final StrBuilder buf = new StrBuilder();
+ final List<String> tokenList = new ArrayList<>();
+ // The read helpers work with an exclusive end position, not a count, so
+ // convert once here; using count directly is only right when offset is 0.
+ final int end = offset + count;
+ int pos = offset;
+
+ // loop around the entire region
+ while (pos >= 0 && pos < end) {
+ // find next token
+ pos = readNextToken(srcChars, pos, end, buf, tokenList);
+
+ // handle case where end of region is a delimiter
+ if (pos >= end) {
+ addToken(tokenList, StringUtils.EMPTY);
+ }
+ }
+ return tokenList;
+ }
+
+ /**
+ * Adds a token to a list, paying attention to the parameters we've set.
+ *
+ * @param list the list to add to
+ * @param tok the token to add
+ */
+ private void addToken(final List<String> list, String tok) {
+ if (StringUtils.isEmpty(tok)) {
+ if (isIgnoreEmptyTokens()) {
+ return;
+ }
+ if (isEmptyTokenAsNull()) {
+ tok = null;
+ }
+ }
+ list.add(tok);
+ }
+
+ /**
+ * Reads character by character through the String to get the next token.
+ *
+ * @param srcChars the character array being tokenized
+ * @param start the first character of field
+ * @param len the exclusive upper bound for positions read from {@code srcChars}
+ * @param workArea a temporary work area
+ * @param tokenList the list of parsed tokens
+ * @return the starting position of the next field (the character
+ * immediately after the delimiter), or -1 if end of string found
+ */
+ private int readNextToken(final char[] srcChars, int start, final int len, final StrBuilder workArea, final List<String> tokenList) {
+ // skip all leading ignored and trimmable characters, unless the
+ // position matches the field delimiter or the quote matcher
+ while (start < len) {
+ final int removeLen = Math.max(
+ getIgnoredMatcher().isMatch(srcChars, start, start, len),
+ getTrimmerMatcher().isMatch(srcChars, start, start, len));
+ if (removeLen == 0 ||
+ getDelimiterMatcher().isMatch(srcChars, start, start, len) > 0 ||
+ getQuoteMatcher().isMatch(srcChars, start, start, len) > 0) {
+ break;
+ }
+ start += removeLen;
+ }
+
+ // handle reaching end
+ if (start >= len) {
+ addToken(tokenList, StringUtils.EMPTY);
+ return -1;
+ }
+
+ // handle empty token
+ final int delimLen = getDelimiterMatcher().isMatch(srcChars, start, start, len);
+ if (delimLen > 0) {
+ addToken(tokenList, StringUtils.EMPTY);
+ return start + delimLen;
+ }
+
+ // handle found token
+ final int quoteLen = getQuoteMatcher().isMatch(srcChars, start, start, len);
+ if (quoteLen > 0) {
+ return readWithQuotes(srcChars, start + quoteLen, len, workArea, tokenList, start, quoteLen);
+ }
+ return readWithQuotes(srcChars, start, len, workArea, tokenList, 0, 0);
+ }
+
+ /**
+ * Reads a possibly quoted string token.
+ *
+ * @param srcChars the character array being tokenized
+ * @param start the first character of field
+ * @param len the exclusive upper bound for positions read from {@code srcChars}
+ * @param workArea a temporary work area
+ * @param tokenList the list of parsed tokens
+ * @param quoteStart the start position of the matched quote, 0 if no quoting
+ * @param quoteLen the length of the matched quote, 0 if no quoting
+ * @return the starting position of the next field (the character
+ * immediately after the delimiter), or -1 if the end of the input
+ * was reached before a delimiter was found
+ */
+ private int readWithQuotes(final char[] srcChars, final int start, final int len, final StrBuilder workArea,
+ final List<String> tokenList, final int quoteStart, final int quoteLen) {
+ // Loop until we've found the end of the quoted
+ // string or the end of the input
+ workArea.clear();
+ int pos = start;
+ boolean quoting = quoteLen > 0;
+ int trimStart = 0;
+
+ while (pos < len) {
+ // quoting mode can occur several times throughout a string
+ // we must switch between quoting and non-quoting until we
+ // encounter a non-quoted delimiter, or end of string
+ if (quoting) {
+ // In quoting mode
+
+ // If we've found a quote character, see if it's
+ // followed by a second quote. If so, then we need
+ // to actually put the quote character into the token
+ // rather than end the token.
+ if (isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
+ if (isQuote(srcChars, pos + quoteLen, len, quoteStart, quoteLen)) {
+ // matched pair of quotes, thus an escaped quote
+ workArea.append(srcChars, pos, quoteLen);
+ pos += quoteLen * 2;
+ trimStart = workArea.size();
+ continue;
+ }
+
+ // end of quoting
+ quoting = false;
+ pos += quoteLen;
+ continue;
+ }
+
+ } else {
+ // Not in quoting mode
+
+ // check for delimiter, and thus end of token
+ final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, start, len);
+ if (delimLen > 0) {
+ // return condition when end of token found
+ addToken(tokenList, workArea.substring(0, trimStart));
+ return pos + delimLen;
+ }
+
+ // check for quote, and thus back into quoting mode
+ if (quoteLen > 0 && isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
+ quoting = true;
+ pos += quoteLen;
+ continue;
+ }
+
+ // check for ignored (outside quotes), and ignore
+ final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, start, len);
+ if (ignoredLen > 0) {
+ pos += ignoredLen;
+ continue;
+ }
+
+ // check for trimmed character
+ // don't yet know if it's at the end, so copy to workArea
+ // use trimStart to keep track of trim at the end
+ final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, start, len);
+ if (trimmedLen > 0) {
+ workArea.append(srcChars, pos, trimmedLen);
+ pos += trimmedLen;
+ continue;
+ }
+ }
+ // copy regular character (inside quotes, or unmatched outside quotes)
+ workArea.append(srcChars[pos++]);
+ trimStart = workArea.size();
+ }
+
+ // return condition when end of string found
+ addToken(tokenList, workArea.substring(0, trimStart));
+ return -1;
+ }
+
+ /**
+ * Checks if the characters at the index specified match the quote
+ * already matched in readNextToken().
+ *
+ * @param srcChars the character array being tokenized
+ * @param pos the position to check for a quote
+ * @param len the length of the character array being tokenized
+ * @param quoteStart the start position of the matched quote, 0 if no quoting
+ * @param quoteLen the length of the matched quote, 0 if no quoting
+ * @return true if a quote is matched
+ */
+ private boolean isQuote(final char[] srcChars, final int pos, final int len, final int quoteStart, final int quoteLen) {
+ for (int i = 0; i < quoteLen; i++) {
+ if (pos + i >= len || srcChars[pos + i] != srcChars[quoteStart + i]) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ /**
+ * Gets the field delimiter matcher.
+ *
+ * @return the delimiter matcher in use
+ */
+ public StrMatcher getDelimiterMatcher() {
+ return this.delimMatcher;
+ }
+
+ /**
+ * Sets the field delimiter matcher.
+ * <p>
+ * The delimiter is used to separate one token from another.
+ * </p>
+ *
+ * @param delim the delimiter matcher to use
+ * @return this, to enable chaining
+ */
+ public StrTokenizer setDelimiterMatcher(final StrMatcher delim) {
+ if (delim == null) {
+ this.delimMatcher = StrMatcher.noneMatcher();
+ } else {
+ this.delimMatcher = delim;
+ }
+ return this;
+ }
+
+ /**
+ * Sets the field delimiter character.
+ *
+ * @param delim the delimiter character to use
+ * @return this, to enable chaining
+ */
+ public StrTokenizer setDelimiterChar(final char delim) {
+ return setDelimiterMatcher(StrMatcher.charMatcher(delim));
+ }
+
+ /**
+ * Sets the field delimiter string.
+ *
+ * @param delim the delimiter string to use
+ * @return this, to enable chaining
+ */
+ public StrTokenizer setDelimiterString(final String delim) {
+ return setDelimiterMatcher(StrMatcher.stringMatcher(delim));
+ }
+
+ /**
+ * Gets the quote matcher currently in use.
+ * <p>
+ * The quote character is used to wrap data between the tokens.
+ * This enables delimiters to be entered as data.
+ * The default value is {@code StrMatcher.noneMatcher()}; the CSV and TSV
+ * factory instances use {@code StrMatcher.doubleQuoteMatcher()}.
+ * </p>
+ *
+ * @return the quote matcher in use
+ */
+ public StrMatcher getQuoteMatcher() {
+ return quoteMatcher;
+ }
+
+ /**
+ * Set the quote matcher to use.
+ * <p>
+ * The quote character is used to wrap data between the tokens.
+ * This enables delimiters to be entered as data.
+ * </p>
+ *
+ * @param quote the quote matcher to use, null ignored
+ * @return this, to enable chaining
+ */
+ public StrTokenizer setQuoteMatcher(final StrMatcher quote) {
+ if (quote != null) {
+ this.quoteMatcher = quote;
+ }
+ return this;
+ }
+
+ /**
+ * Sets the quote character to use.
+ * <p>
+ * The quote character is used to wrap data between the tokens.
+ * This enables delimiters to be entered as data.
+ * </p>
+ *
+ * @param quote the quote character to use
+ * @return this, to enable chaining
+ */
+ public StrTokenizer setQuoteChar(final char quote) {
+ return setQuoteMatcher(StrMatcher.charMatcher(quote));
+ }
+
+ // Ignored
+ /**
+ * Gets the ignored character matcher.
+ * <p>
+ * These characters are ignored when parsing the String, unless they are
+ * within a quoted region.
+ * The default value is not to ignore anything.
+ * </p>
+ *
+ * @return the ignored matcher in use
+ */
+ public StrMatcher getIgnoredMatcher() {
+ return ignoredMatcher;
+ }
+
+ /**
+ * Set the matcher for characters to ignore.
+ * <p>
+ * These characters are ignored when parsing the String, unless they are
+ * within a quoted region.
+ * </p>
+ *
+ * @param ignored the ignored matcher to use, null ignored
+ * @return this, to enable chaining
+ */
+ public StrTokenizer setIgnoredMatcher(final StrMatcher ignored) {
+ if (ignored != null) {
+ this.ignoredMatcher = ignored;
+ }
+ return this;
+ }
+
+ /**
+ * Set the character to ignore.
+ * <p>
+ * This character is ignored when parsing the String, unless it is
+ * within a quoted region.
+ * </p>
+ *
+ * @param ignored the ignored character to use
+ * @return this, to enable chaining
+ */
+ public StrTokenizer setIgnoredChar(final char ignored) {
+ return setIgnoredMatcher(StrMatcher.charMatcher(ignored));
+ }
+
+ /**
+ * Gets the trimmer character matcher.
+ * <p>
+ * These characters are trimmed off on each side of the delimiter
+ * until the token or quote is found.
+ * The default value is not to trim anything.
+ * </p>
+ *
+ * @return the trimmer matcher in use
+ */
+ public StrMatcher getTrimmerMatcher() {
+ return trimmerMatcher;
+ }
+
+ /**
+ * Sets the matcher for characters to trim.
+ * <p>
+ * These characters are trimmed off on each side of the delimiter
+ * until the token or quote is found.
+ * </p>
+ *
+ * @param trimmer the trimmer matcher to use, null ignored
+ * @return this, to enable chaining
+ */
+ public StrTokenizer setTrimmerMatcher(final StrMatcher trimmer) {
+ if (trimmer != null) {
+ this.trimmerMatcher = trimmer;
+ }
+ return this;
+ }
+
+ /**
+ * Gets whether the tokenizer currently returns empty tokens as null.
+ * The default for this property is false.
+ *
+ * @return true if empty tokens are returned as null
+ */
+ public boolean isEmptyTokenAsNull() {
+ return this.emptyAsNull;
+ }
+
+ /**
+ * Sets whether the tokenizer should return empty tokens as null.
+ * The default for this property is false.
+ *
+ * @param emptyAsNull whether empty tokens are returned as null
+ * @return this, to enable chaining
+ */
+ public StrTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) {
+ this.emptyAsNull = emptyAsNull;
+ return this;
+ }
+
+ /**
+ * Gets whether the tokenizer currently ignores empty tokens.
+ * The default for this property is true.
+ *
+ * @return true if empty tokens are not returned
+ */
+ public boolean isIgnoreEmptyTokens() {
+ return ignoreEmptyTokens;
+ }
+
+ /**
+ * Sets whether the tokenizer should ignore and not return empty tokens.
+ * The default for this property is true.
+ *
+ * @param ignoreEmptyTokens whether empty tokens are not returned
+ * @return this, to enable chaining
+ */
+ public StrTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) {
+ this.ignoreEmptyTokens = ignoreEmptyTokens;
+ return this;
+ }
+
+ /**
+ * Gets the String content that the tokenizer is parsing.
+ *
+ * @return the string content being parsed
+ */
+ public String getContent() {
+ if (chars == null) {
+ return null;
+ }
+ return new String(chars);
+ }
+
+ /**
+ * Creates a new instance of this Tokenizer. The new instance is reset so
+ * that it will be at the start of the token list.
+ * If a {@link CloneNotSupportedException} is caught, return {@code null}.
+ *
+ * @return a new instance of this Tokenizer which has been reset.
+ */
+ @Override
+ public Object clone() {
+ try {
+ return cloneReset();
+ } catch (final CloneNotSupportedException ex) {
+ return null;
+ }
+ }
+
+ /**
+ * Creates a new instance of this Tokenizer. The new instance is reset so that
+ * it will be at the start of the token list.
+ *
+ * @return a new instance of this Tokenizer which has been reset.
+ * @throws CloneNotSupportedException if there is a problem cloning
+ */
+ Object cloneReset() throws CloneNotSupportedException {
+ // this method exists to enable 100% test coverage
+ final StrTokenizer cloned = (StrTokenizer) super.clone();
+ if (cloned.chars != null) {
+ cloned.chars = cloned.chars.clone();
+ }
+ cloned.reset();
+ return cloned;
+ }
+
+ /**
+ * Gets a String describing this tokenizer: the token list if tokenization
+ * has already been performed, otherwise a "not tokenized yet" marker.
+ * Calling this method does not itself trigger tokenization.
+ *
+ * @return a description of the current tokenizer state
+ */
+ @Override
+ public String toString() {
+ if (tokens == null) {
+ return "StrTokenizer[not tokenized yet]";
+ }
+ return "StrTokenizer" + getTokenList();
+ }
+
+}