core/src/main/java/org/owasp/encoder/Encoder.java


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264

// Copyright (c) 2012 Jeff Ichnowski
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
//     * Redistributions of source code must retain the above
//       copyright notice, this list of conditions and the following
//       disclaimer.
//
//     * Redistributions in binary form must reproduce the above
//       copyright notice, this list of conditions and the following
//       disclaimer in the documentation and/or other materials
//       provided with the distribution.
//
//     * Neither the name of the OWASP nor the names of its
//       contributors may be used to endorse or promote products
//       derived from this software without specific prior written
//       permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
// COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
// INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
// OF THE POSSIBILITY OF SUCH DAMAGE.

package org.owasp.encoder;

import java.nio.CharBuffer;
import java.nio.charset.CoderResult;

/**
 * <p>This is the low-level encoding API.  For each flavor of encoding
 * there is an instance of this class that performs the actual
 * encoding.  Overriding and implementing Encoders outside of the
 * OWASP Encoder's project is not currently supported.</p>
 *
 * <p>Unless otherwise documented, instances of these classes are
 * thread-safe.  Encoders implementations do not generally carry
 * state, and if they do the state will be flush with a call to {@link
 * #encode(java.nio.CharBuffer, java.nio.CharBuffer, boolean)} with
 * {@code endOfInput} set to {@code true}.</p>
 *
 * <p>To use an Encoder instance directly, repeatedly call {@link
 * #encode(java.nio.CharBuffer, java.nio.CharBuffer, boolean)} with
 * the {@code endOfInput} parameter set to {@code false} while there
 * is (the possibility of) more input to encode.  Once there is no
 * more input to encode, call {@link #encode(java.nio.CharBuffer,
 * java.nio.CharBuffer, boolean)} with {@code endOfInput} set to
 * {@code true} until the method returns {@link
 * java.nio.charset.CoderResult#UNDERFLOW}.</p>
 *
 * <p>In general, this class is not expected to be needed directly.
 * Use the {@link Encode} fluent interface for encoding Strings or
 * {@link EncodedWriter} for large blocks of contextual encoding.</p>
 *
 * @author Jeff Ichnowski
 * @see Encode
 * @see EncodedWriter
 */
public abstract class Encoder {
    /**
     * Hexadecimal conversion array.  Package private to prevent corruption.
     */
    static final char[] HEX = "0123456789abcdef".toCharArray();

    /**
     * Bit-shift used for encoding values in hexadecimal.
     */
    static final int HEX_SHIFT = 4;

    /**
     * Bit-mask used for encoding values in hexadecimal.
     */
    static final int HEX_MASK = 0xf;

    /**
     * Package-private constructor to prevent having to support
     * external implementations of this class.  This may be opened up
     * in future releases.
     */
    Encoder() {}

    /**
     * <p>This is the kernel of encoding.  Currently only CharBuffers
     * backed by arrays (i.e. {@link java.nio.CharBuffer#hasArray()}
     * returns {@code true}) are supported.  <strong>Using a
     * direct-mapped CharBuffer will result in an
     * UnsupportedOperationException</strong>, though this behavior
     * may change in future releases.</p>
     *
     * <p>This method should be called repeatedly while {@code
     * endOfInput} set to {@code false} while there is more input.
     * Once there is no more input, this method should be called
     * {@code endOfInput} set to {@code false} until {@link
     * java.nio.charset.CoderResult#UNDERFLOW} is returned.</p>
     *
     * <p>After any call to this method, except when {@code
     * endOfInput} is {@code true} and the method returns {@code
     * UNDERFLOW}, there may be characters left to encode in the
     * {@code input} buffer (i.e. {@code input.hasRemaining() ==
     * true}).  This will happen when the encoder needs to see more
     * input before determining what to do--for example when encoding
     * for CDATA, if the input ends with {@code "foo]]"}, the encoder
     * will need to see the next character to determine if it is a "&gt;"
     * or not.</p>
     *
     * <p>Example usage:</p>
     * <pre>
     *   CharBuffer input = CharBuffer.allocate(1024);
     *   CharBuffer output = CharBuffer.allocate(1024);
     *   CoderResult cr;
     *   // assuming doRead fills in the input buffer or
     *   // returns -1 at end of input
     *   while(doRead(input) != -1) {
     *     input.flip();
     *     for (;;) {
     *       cr = encoder.encode(input, output, false);
     *       if (cr.isUnderflow()) {
     *         break;
     *       }
     *       if (cr.isOverflow()) {
     *         // assuming doWrite flushes the encoded
     *         // characters somewhere.
     *         output.flip();
     *         doWrite(output);
     *         output.compact();
     *       }
     *     }
     *     input.compact();
     *   }
     *
     *   // at end of input
     *   input.flip();
     *   do {
     *     cr = encoder.encode(input, output, true);
     *     output.flip();
     *     doWrite(output);
     *     output.compact();
     *   } while (cr.isOverflow());
     * </pre>
     *
     * @param input the input buffer to encode
     * @param output the output buffer to receive the encoded results
     * @param endOfInput set to {@code true} if there is no more input, and any
     * remaining characters at the end of input will either be encoded or
     * replaced as invalid.
     * @return Either {@link java.nio.charset.CoderResult#UNDERFLOW}
     * or {@link java.nio.charset.CoderResult#OVERFLOW}.  No other
     * CoderResult value will be returned.  Characters or sequences
     * that might conceivably return and invalid or unmappable
     * character result (as part of the nio Charset API) are
     * automatically replaced to avoid security implications.
     */
    public CoderResult encode(CharBuffer input, CharBuffer output, boolean endOfInput) {
        if (input.hasRemaining()) {
            if (input.hasArray() && output.hasArray()) {
                return encodeArrays(input, output, endOfInput);
            } else {
                return encodeBuffers(input, output, endOfInput);
            }
        } else {
            return CoderResult.UNDERFLOW;
        }
    }

    /**
     * The core encoding loop used when both the input and output buffers
     * are array backed.  The loop is expected to fetch the arrays and
     * interact with the arrays directly for performance.
     *
     * @param input the input buffer.
     * @param output the output buffer.
     * @param endOfInput when true, this is the last input to encode
     * @return UNDERFLOW or OVERFLOW
     */
    CoderResult encodeArrays(CharBuffer input, CharBuffer output, boolean endOfInput) {
        throw new UnsupportedOperationException();
    }

    /**
     * The core encoding loop used when either or both input and output
     * buffers are NOT array-backed.  E.g. they are direct buffers or
     * perhaps the input buffer is a read-only wrapper.  In any case,
     * this method is not currently implemented by any of the encoder
     * implementations since it is not expected to be common use-case.
     * The stub is included here for completeness and to demarcate
     * where the non-array-backed use-case would be included.
     *
     * @param input the input buffer.
     * @param output the output buffer.
     * @param endOfInput when true, this is the last input to encode
     * @return never returns.
     * @throws UnsupportedOperationException -- always
     */
    CoderResult encodeBuffers(CharBuffer input, CharBuffer output, boolean endOfInput)
        throws UnsupportedOperationException
    {
        throw new UnsupportedOperationException();
    }

    /**
     * Returns the maximum encoded length (in chars) of an input sequence of
     * {@code n} characters.
     *
     * @param n the number of characters of input
     * @return the worst-case number of characters required to encode
     */
    abstract int maxEncodedLength(int n);

    /**
     * Scans the input string for the first character index that requires
     * encoding.  If the entire input does not require encoding then the
     * length is returned.  This method is used by the Encode.forXYZ methods
     * to return input strings unchanged when possible.
     *
     * @param input the input to check for encoding
     * @param off the offset of the first character to check
     * @param len the number of characters to check
     * @return the index of the first character to encode.  The return value
     * will be {@code off+len} if no characters in the input require encoding.
     */
    abstract int firstEncodedOffset(String input, int off, int len);

    /**
     * Internal helper method to properly position buffers after encoding up
     * until an overflow.
     *
     * @param input the input buffer
     * @param i the array offset in the input buffer (translated to position)
     * @param output the output buffer
     * @param j the array offset in the output buffer (translated to position)
     * @return CoderResult.OVERFLOW
     */
    static CoderResult overflow(CharBuffer input, int i, CharBuffer output, int j) {
        input.position(i - input.arrayOffset());
        output.position(j - output.arrayOffset());
        return CoderResult.OVERFLOW;
    }

    /**
     * Internal helper method to properly position buffers after encoding up
     * until an underflow.
     *
     * @param input the input buffer
     * @param i the array offset in the input buffer (translated to position)
     * @param output the output buffer
     * @param j the array offset in the output buffer (translated to position)
     * @return CoderResult.UNDERFLOW
     */
    static CoderResult underflow(CharBuffer input, int i, CharBuffer output, int j) {
        input.position(i - input.arrayOffset());
        output.position(j - output.arrayOffset());
        return CoderResult.UNDERFLOW;
    }
}