diff options
Diffstat (limited to 'src/google/protobuf/util/internal/json_escaping.cc')
-rw-r--r-- | src/google/protobuf/util/internal/json_escaping.cc | 404 |
1 files changed, 0 insertions, 404 deletions
diff --git a/src/google/protobuf/util/internal/json_escaping.cc b/src/google/protobuf/util/internal/json_escaping.cc deleted file mode 100644 index 06d2791b..00000000 --- a/src/google/protobuf/util/internal/json_escaping.cc +++ /dev/null @@ -1,404 +0,0 @@ -// Protocol Buffers - Google's data interchange format -// Copyright 2008 Google Inc. All rights reserved. -// https://developers.google.com/protocol-buffers/ -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -#include <google/protobuf/util/internal/json_escaping.h> - -#include <google/protobuf/stubs/logging.h> -#include <google/protobuf/stubs/common.h> - -namespace google { -namespace protobuf { -namespace util { -namespace converter { - -namespace { - -// Array of hex characters for conversion to hex. -static const char kHex[] = "0123456789abcdef"; - -// Characters 0x00 to 0x9f are very commonly used, so we provide a special -// table lookup. -// -// For unicode code point ch < 0xa0: -// kCommonEscapes[ch] is the escaped string of ch, if escaping is needed; -// or an empty string, if escaping is not needed. -static const char kCommonEscapes[160][7] = { - // C0 (ASCII and derivatives) control characters - "\\u0000", "\\u0001", "\\u0002", "\\u0003", // 0x00 - "\\u0004", "\\u0005", "\\u0006", "\\u0007", - "\\b", "\\t", "\\n", "\\u000b", - "\\f", "\\r", "\\u000e", "\\u000f", - "\\u0010", "\\u0011", "\\u0012", "\\u0013", // 0x10 - "\\u0014", "\\u0015", "\\u0016", "\\u0017", - "\\u0018", "\\u0019", "\\u001a", "\\u001b", - "\\u001c", "\\u001d", "\\u001e", "\\u001f", - // Escaping of " and \ are required by www.json.org string definition. - // Escaping of < and > are required for HTML security. - "", "", "\\\"", "", "", "", "", "", // 0x20 - "", "", "", "", "", "", "", "", - "", "", "", "", "", "", "", "", // 0x30 - "", "", "", "", "\\u003c", "", "\\u003e", "", - "", "", "", "", "", "", "", "", // 0x40 - "", "", "", "", "", "", "", "", - "", "", "", "", "", "", "", "", // 0x50 - "", "", "", "", "\\\\", "", "", "", - "", "", "", "", "", "", "", "", // 0x60 - "", "", "", "", "", "", "", "", - "", "", "", "", "", "", "", "", // 0x70 - "", "", "", "", "", "", "", "\\u007f", - // C1 (ISO 8859 and Unicode) extended control characters - "\\u0080", "\\u0081", "\\u0082", "\\u0083", // 0x80 - "\\u0084", "\\u0085", "\\u0086", "\\u0087", - "\\u0088", "\\u0089", "\\u008a", "\\u008b", - "\\u008c", "\\u008d", "\\u008e", "\\u008f", - "\\u0090", "\\u0091", "\\u0092", "\\u0093", // 0x90 - "\\u0094", "\\u0095", "\\u0096", "\\u0097", - "\\u0098", "\\u0099", "\\u009a", "\\u009b", - "\\u009c", "\\u009d", "\\u009e", "\\u009f" -}; - -// Determines if the given char value is a unicode high-surrogate code unit. -// Such values do not represent characters by themselves, but are used in the -// representation of supplementary characters in the utf-16 encoding. -inline bool IsHighSurrogate(uint16 c) { - // Optimized form of: - // return c >= kMinHighSurrogate && c <= kMaxHighSurrogate; - // (Reduced from 3 ALU instructions to 2 ALU instructions) - return (c & ~(JsonEscaping::kMaxHighSurrogate - - JsonEscaping::kMinHighSurrogate)) - == JsonEscaping::kMinHighSurrogate; -} - -// Determines if the given char value is a unicode low-surrogate code unit. -// Such values do not represent characters by themselves, but are used in the -// representation of supplementary characters in the utf-16 encoding. -inline bool IsLowSurrogate(uint16 c) { - // Optimized form of: - // return c >= kMinLowSurrogate && c <= kMaxLowSurrogate; - // (Reduced from 3 ALU instructions to 2 ALU instructions) - return (c & ~(JsonEscaping::kMaxLowSurrogate - - JsonEscaping::kMinLowSurrogate)) - == JsonEscaping::kMinLowSurrogate; -} - -// Determines if the given char value is a unicode surrogate code unit (either -// high-surrogate or low-surrogate). -inline bool IsSurrogate(uint32 c) { - // Optimized form of: - // return c >= kMinHighSurrogate && c <= kMaxLowSurrogate; - // (Reduced from 3 ALU instructions to 2 ALU instructions) - return (c & 0xfffff800) == JsonEscaping::kMinHighSurrogate; -} - -// Returns true if the given unicode code point cp is -// in the supplementary character range. -inline bool IsSupplementalCodePoint(uint32 cp) { - // Optimized form of: - // return kMinSupplementaryCodePoint <= cp && cp <= kMaxCodePoint; - // (Reduced from 3 ALU instructions to 2 ALU instructions) - return (cp & ~(JsonEscaping::kMinSupplementaryCodePoint - 1)) - < JsonEscaping::kMaxCodePoint; -} - -// Returns true if the given unicode code point cp is a valid -// unicode code point (i.e. in the range 0 <= cp <= kMaxCodePoint). -inline bool IsValidCodePoint(uint32 cp) { - return cp <= JsonEscaping::kMaxCodePoint; -} - -// Converts the specified surrogate pair to its supplementary code point value. -// It is the callers' responsibility to validate the specified surrogate pair. -inline uint32 ToCodePoint(uint16 high, uint16 low) { - // Optimized form of: - // return ((high - kMinHighSurrogate) << 10) - // + (low - kMinLowSurrogate) - // + kMinSupplementaryCodePoint; - // (Reduced from 5 ALU instructions to 3 ALU instructions) - return (high << 10) + low + - (JsonEscaping::kMinSupplementaryCodePoint - - (static_cast<unsigned>(JsonEscaping::kMinHighSurrogate) << 10) - - JsonEscaping::kMinLowSurrogate); -} - -// Returns the low surrogate for the given unicode code point. The result is -// meaningless if the given code point is not a supplementary character. -inline uint16 ToLowSurrogate(uint32 cp) { - return (cp & (JsonEscaping::kMaxLowSurrogate - - JsonEscaping::kMinLowSurrogate)) - + JsonEscaping::kMinLowSurrogate; -} - -// Returns the high surrogate for the given unicode code point. The result is -// meaningless if the given code point is not a supplementary character. -inline uint16 ToHighSurrogate(uint32 cp) { - return (cp >> 10) + (JsonEscaping::kMinHighSurrogate - - (JsonEscaping::kMinSupplementaryCodePoint >> 10)); -} - -// Input str is encoded in UTF-8. A unicode code point could be encoded in -// UTF-8 using anywhere from 1 to 4 characters, and it could span multiple -// reads of the ByteSource. -// -// This function reads the next unicode code point from the input (str) at -// the given position (index), taking into account any left-over partial -// code point from the previous iteration (cp), together with the number -// of characters left to read to complete this code point (num_left). -// -// This function assumes that the input (str) is valid at the given position -// (index). In order words, at least one character could be read successfully. -// -// The code point read (partial or complete) is stored in (cp). Upon return, -// (num_left) stores the number of characters that has yet to be read in -// order to complete the current unicode code point. If the read is complete, -// then (num_left) is 0. Also, (num_read) is the number of characters read. -// -// Returns false if we encounter an invalid UTF-8 string. Returns true -// otherwise, including the case when we reach the end of the input (str) -// before a complete unicode code point is read. -bool ReadCodePoint(StringPiece str, int index, - uint32 *cp, int* num_left, int *num_read) { - if (*num_left == 0) { - // Last read was complete. Start reading a new unicode code point. - *cp = static_cast<uint8>(str[index++]); - *num_read = 1; - // The length of the code point is determined from reading the first byte. - // - // If the first byte is between: - // 0..0x7f: that's the value of the code point. - // 0x80..0xbf: <invalid> - // 0xc0..0xdf: 11-bit code point encoded in 2 bytes. - // bit 10-6, bit 5-0 - // 0xe0..0xef: 16-bit code point encoded in 3 bytes. - // bit 15-12, bit 11-6, bit 5-0 - // 0xf0..0xf7: 21-bit code point encoded in 4 bytes. - // bit 20-18, bit 17-12, bit 11-6, bit 5-0 - // 0xf8..0xff: <invalid> - // - // Meaning of each bit: - // <msb> bit 7: 0 - single byte code point: bits 6-0 are values. - // 1 - multibyte code point - // bit 6: 0 - subsequent bytes of multibyte code point: - // bits 5-0 are values. - // 1 - first byte of multibyte code point - // bit 5: 0 - first byte of 2-byte code point: bits 4-0 are values. - // 1 - first byte of code point with >= 3 bytes. - // bit 4: 0 - first byte of 3-byte code point: bits 3-0 are values. - // 1 - first byte of code point with >= 4 bytes. - // bit 3: 0 - first byte of 4-byte code point: bits 2-0 are values. - // 1 - reserved for future expansion. - if (*cp <= 0x7f) { - return true; - } else if (*cp <= 0xbf) { - return false; - } else if (*cp <= 0xdf) { - *cp &= 0x1f; - *num_left = 1; - } else if (*cp <= 0xef) { - *cp &= 0x0f; - *num_left = 2; - } else if (*cp <= 0xf7) { - *cp &= 0x07; - *num_left = 3; - } else { - return false; - } - } else { - // Last read was partial. Initialize num_read to 0 and continue reading - // the last unicode code point. - *num_read = 0; - } - while (*num_left > 0 && index < str.size()) { - uint32 ch = static_cast<uint8>(str[index++]); - --(*num_left); - ++(*num_read); - *cp = (*cp << 6) | (ch & 0x3f); - if (ch < 0x80 || ch > 0xbf) return false; - } - return *num_left > 0 || (!IsSurrogate(*cp) && IsValidCodePoint(*cp)); -} - -// Stores the 16-bit unicode code point as its hexadecimal digits in buffer -// and returns a StringPiece that points to this buffer. The input buffer needs -// to be at least 6 bytes long. -StringPiece ToHex(uint16 cp, char* buffer) { - buffer[5] = kHex[cp & 0x0f]; - cp >>= 4; - buffer[4] = kHex[cp & 0x0f]; - cp >>= 4; - buffer[3] = kHex[cp & 0x0f]; - cp >>= 4; - buffer[2] = kHex[cp & 0x0f]; - return StringPiece(buffer).substr(0, 6); -} - -// Stores the 32-bit unicode code point as its hexadecimal digits in buffer -// and returns a StringPiece that points to this buffer. The input buffer needs -// to be at least 12 bytes long. -StringPiece ToSurrogateHex(uint32 cp, char* buffer) { - uint16 low = ToLowSurrogate(cp); - uint16 high = ToHighSurrogate(cp); - - buffer[11] = kHex[low & 0x0f]; - low >>= 4; - buffer[10] = kHex[low & 0x0f]; - low >>= 4; - buffer[9] = kHex[low & 0x0f]; - low >>= 4; - buffer[8] = kHex[low & 0x0f]; - - buffer[5] = kHex[high & 0x0f]; - high >>= 4; - buffer[4] = kHex[high & 0x0f]; - high >>= 4; - buffer[3] = kHex[high & 0x0f]; - high >>= 4; - buffer[2] = kHex[high & 0x0f]; - - return StringPiece(buffer, 12); -} - -// If the given unicode code point needs escaping, then returns the -// escaped form. The returned StringPiece either points to statically -// pre-allocated char[] or to the given buffer. The input buffer needs -// to be at least 12 bytes long. -// -// If the given unicode code point does not need escaping, an empty -// StringPiece is returned. -StringPiece EscapeCodePoint(uint32 cp, char* buffer) { - if (cp < 0xa0) return kCommonEscapes[cp]; - switch (cp) { - // These are not required by json spec - // but used to prevent security bugs in javascript. - case 0xfeff: // Zero width no-break space - case 0xfff9: // Interlinear annotation anchor - case 0xfffa: // Interlinear annotation separator - case 0xfffb: // Interlinear annotation terminator - - case 0x00ad: // Soft-hyphen - case 0x06dd: // Arabic end of ayah - case 0x070f: // Syriac abbreviation mark - case 0x17b4: // Khmer vowel inherent Aq - case 0x17b5: // Khmer vowel inherent Aa - return ToHex(cp, buffer); - - default: - if ((cp >= 0x0600 && cp <= 0x0603) || // Arabic signs - (cp >= 0x200b && cp <= 0x200f) || // Zero width etc. - (cp >= 0x2028 && cp <= 0x202e) || // Separators etc. - (cp >= 0x2060 && cp <= 0x2064) || // Invisible etc. - (cp >= 0x206a && cp <= 0x206f)) { // Shaping etc. - return ToHex(cp, buffer); - } - - if (cp == 0x000e0001 || // Language tag - (cp >= 0x0001d173 && cp <= 0x0001d17a) || // Music formatting - (cp >= 0x000e0020 && cp <= 0x000e007f)) { // TAG symbols - return ToSurrogateHex(cp, buffer); - } - } - return StringPiece(); -} - -// Tries to escape the given code point first. If the given code point -// does not need to be escaped, but force_output is true, then render -// the given multi-byte code point in UTF8 in the buffer and returns it. -StringPiece EscapeCodePoint(uint32 cp, char* buffer, bool force_output) { - StringPiece sp = EscapeCodePoint(cp, buffer); - if (force_output && sp.empty()) { - buffer[5] = (cp & 0x3f) | 0x80; - cp >>= 6; - if (cp <= 0x1f) { - buffer[4] = cp | 0xc0; - sp.set(buffer + 4, 2); - return sp; - } - buffer[4] = (cp & 0x3f) | 0x80; - cp >>= 6; - if (cp <= 0x0f) { - buffer[3] = cp | 0xe0; - sp.set(buffer + 3, 3); - return sp; - } - buffer[3] = (cp & 0x3f) | 0x80; - buffer[2] = ((cp >> 6) & 0x07) | 0xf0; - sp.set(buffer + 2, 4); - } - return sp; -} - -} // namespace - -void JsonEscaping::Escape(strings::ByteSource* input, - strings::ByteSink* output) { - char buffer[12] = "\\udead\\ubee"; - uint32 cp = 0; // Current unicode code point. - int num_left = 0; // Num of chars to read to complete the code point. - while (input->Available() > 0) { - StringPiece str = input->Peek(); - StringPiece escaped; - int i = 0; - int num_read; - bool ok; - bool cp_was_split = num_left > 0; - // Loop until we encounter either - // i) a code point that needs to be escaped; or - // ii) a split code point is completely read; or - // iii) a character that is not a valid utf8; or - // iv) end of the StringPiece str is reached. - do { - ok = ReadCodePoint(str, i, &cp, &num_left, &num_read); - if (num_left > 0 || !ok) break; // case iii or iv - escaped = EscapeCodePoint(cp, buffer, cp_was_split); - if (!escaped.empty()) break; // case i or ii - i += num_read; - num_read = 0; - } while (i < str.length()); // case iv - // First copy the un-escaped prefix, if any, to the output ByteSink. - if (i > 0) input->CopyTo(output, i); - if (num_read > 0) input->Skip(num_read); - if (!ok) { - // Case iii: Report error. - // TODO(wpoon): Add error reporting. - num_left = 0; - } else if (num_left == 0 && !escaped.empty()) { - // Case i or ii: Append the escaped code point to the output ByteSink. - output->Append(escaped.data(), escaped.size()); - } - } - if (num_left > 0) { - // Treat as case iii: report error. - // TODO(wpoon): Add error reporting. - } -} - -} // namespace converter -} // namespace util -} // namespace protobuf -} // namespace google |