aboutsummaryrefslogtreecommitdiff
path: root/src/main/java/org/apache/commons/lang3/text/translate/NumericEntityUnescaper.java
diff options
context:
space:
mode:
Diffstat (limited to 'src/main/java/org/apache/commons/lang3/text/translate/NumericEntityUnescaper.java')
-rw-r--r--src/main/java/org/apache/commons/lang3/text/translate/NumericEntityUnescaper.java159
1 files changed, 159 insertions, 0 deletions
diff --git a/src/main/java/org/apache/commons/lang3/text/translate/NumericEntityUnescaper.java b/src/main/java/org/apache/commons/lang3/text/translate/NumericEntityUnescaper.java
new file mode 100644
index 000000000..a238c9d43
--- /dev/null
+++ b/src/main/java/org/apache/commons/lang3/text/translate/NumericEntityUnescaper.java
@@ -0,0 +1,159 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.lang3.text.translate;
+
+import java.io.IOException;
+import java.io.Writer;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.EnumSet;
+
+/**
+ * Translate XML numeric entities of the form &#[xX]?\d+;? to
+ * the specific code point.
+ *
+ * Note that the semicolon is optional.
+ *
+ * @since 3.0
+ * @deprecated As of 3.6, use Apache Commons Text
+ * <a href="https://commons.apache.org/proper/commons-text/javadocs/api-release/org/apache/commons/text/translate/NumericEntityUnescaper.html">
+ * NumericEntityUnescaper</a> instead
+ */
+@Deprecated
+public class NumericEntityUnescaper extends CharSequenceTranslator {
+
+ /** Enumerates NumericEntityUnescaper options for unescaping. */
+ public enum OPTION {
+
+ /**
+ * Require a semicolon.
+ */
+ semiColonRequired,
+
+ /**
+ * Do not require a semicolon.
+ */
+ semiColonOptional,
+
+ /**
+ * Throw an exception if a semicolon is missing.
+ */
+ errorIfNoSemiColon
+ }
+
+ // TODO?: Create an OptionsSet class to hide some of the conditional logic below
+ private final EnumSet<OPTION> options;
+
+ /**
+ * Create a UnicodeUnescaper.
+ *
+ * The constructor takes a list of options, only one type of which is currently
+ * available (whether to allow, error or ignore the semicolon on the end of a
+ * numeric entity to being missing).
+ *
+ * For example, to support numeric entities without a ';':
+ * new NumericEntityUnescaper(NumericEntityUnescaper.OPTION.semiColonOptional)
+ * and to throw an IllegalArgumentException when they're missing:
+ * new NumericEntityUnescaper(NumericEntityUnescaper.OPTION.errorIfNoSemiColon)
+ *
+ * Note that the default behavior is to ignore them.
+ *
+ * @param options to apply to this unescaper
+ */
+ public NumericEntityUnescaper(final OPTION... options) {
+ if (options.length > 0) {
+ this.options = EnumSet.copyOf(Arrays.asList(options));
+ } else {
+ this.options = EnumSet.copyOf(Collections.singletonList(OPTION.semiColonRequired));
+ }
+ }
+
+ /**
+ * Whether the passed in option is currently set.
+ *
+ * @param option to check state of
+ * @return whether the option is set
+ */
+ public boolean isSet(final OPTION option) {
+ return options != null && options.contains(option);
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ @Override
+ public int translate(final CharSequence input, final int index, final Writer out) throws IOException {
+ final int seqEnd = input.length();
+ // Uses -2 to ensure there is something after the &#
+ if (input.charAt(index) == '&' && index < seqEnd - 2 && input.charAt(index + 1) == '#') {
+ int start = index + 2;
+ boolean isHex = false;
+
+ final char firstChar = input.charAt(start);
+ if (firstChar == 'x' || firstChar == 'X') {
+ start++;
+ isHex = true;
+
+ // Check there's more than just an x after the &#
+ if (start == seqEnd) {
+ return 0;
+ }
+ }
+
+ int end = start;
+ // Note that this supports character codes without a ; on the end
+ while (end < seqEnd && ( input.charAt(end) >= '0' && input.charAt(end) <= '9' ||
+ input.charAt(end) >= 'a' && input.charAt(end) <= 'f' ||
+ input.charAt(end) >= 'A' && input.charAt(end) <= 'F' ) ) {
+ end++;
+ }
+
+ final boolean semiNext = end != seqEnd && input.charAt(end) == ';';
+
+ if (!semiNext) {
+ if (isSet(OPTION.semiColonRequired)) {
+ return 0;
+ }
+ if (isSet(OPTION.errorIfNoSemiColon)) {
+ throw new IllegalArgumentException("Semi-colon required at end of numeric entity");
+ }
+ }
+
+ final int entityValue;
+ try {
+ if (isHex) {
+ entityValue = Integer.parseInt(input.subSequence(start, end).toString(), 16);
+ } else {
+ entityValue = Integer.parseInt(input.subSequence(start, end).toString(), 10);
+ }
+ } catch(final NumberFormatException nfe) {
+ return 0;
+ }
+
+ if (entityValue > 0xFFFF) {
+ final char[] chars = Character.toChars(entityValue);
+ out.write(chars[0]);
+ out.write(chars[1]);
+ } else {
+ out.write(entityValue);
+ }
+
+ return 2 + end - start + (isHex ? 1 : 0) + (semiNext ? 1 : 0);
+ }
+ return 0;
+ }
+}