diff options
Diffstat (limited to 'utf/unilib.h')
-rw-r--r-- | utf/unilib.h | 95 |
1 files changed, 95 insertions, 0 deletions
diff --git a/utf/unilib.h b/utf/unilib.h new file mode 100644 index 0000000..4cfc787 --- /dev/null +++ b/utf/unilib.h @@ -0,0 +1,95 @@ +/** + * Copyright 2010 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Routines to do manipulation of Unicode characters or text +// +// The StructurallyValid routines accept buffers of arbitrary bytes. +// For CoerceToStructurallyValid(), the input buffer and output buffers may +// point to exactly the same memory. +// +// In all other cases, the UTF-8 string must be structurally valid and +// have all codepoints in the range U+0000 to U+D7FF or U+E000 to U+10FFFF. +// Debug builds take a fatal error for invalid UTF-8 input. +// The input and output buffers may not overlap at all. +// +// The char32 routines are here only for convenience; they convert to UTF-8 +// internally and use the UTF-8 routines. + +#ifndef UTIL_UTF8_UNILIB_H__ +#define UTIL_UTF8_UNILIB_H__ + +#include <string> +#include "base/basictypes.h" + +namespace UniLib { + +// Returns true unless a surrogate code point +inline bool IsValidCodepoint(char32 c) { + // In the range [0, 0xD800) or [0xE000, 0x10FFFF] + return (static_cast<uint32>(c) < 0xD800) + || (c >= 0xE000 && c <= 0x10FFFF); +} + +// Table of UTF-8 character lengths, based on first byte +static const unsigned char kUTF8LenTbl[256] = { + 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, + + 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, + 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, + 3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4 +}; + +// Return length of a single UTF-8 source character +inline int OneCharLen(const char* src) { + return kUTF8LenTbl[*reinterpret_cast<const uint8*>(src)]; +} + +// Return length of a single UTF-8 source character +inline int OneCharLen(const uint8* src) { + return kUTF8LenTbl[*src]; +} + +// Return true if this byte is a trailing UTF-8 byte (10xx xxxx) +inline bool IsTrailByte(char x) { + // return (x & 0xC0) == 0x80; + // Since trail bytes are always in [0x80, 0xBF], we can optimize: + return static_cast<signed char>(x) < -0x40; +} + +// Returns the length in bytes of the prefix of src that is all +// interchange valid UTF-8 +int SpanInterchangeValid(const char* src, int byte_length); +inline int SpanInterchangeValid(const std::string& src) { + return SpanInterchangeValid(src.data(), src.size()); +} + +// Returns true if the source is all interchange valid UTF-8 +// "Interchange valid" is a stronger than structurally valid -- +// no C0 or C1 control codes (other than CR LF HT FF) and no non-characters. +inline bool IsInterchangeValid(const char* src, int byte_length) { + return (byte_length == SpanInterchangeValid(src, byte_length)); +} +inline bool IsInterchangeValid(const std::string& src) { + return IsInterchangeValid(src.data(), src.size()); +} + +} // namespace UniLib + +#endif // UTIL_UTF8_PUBLIC_UNILIB_H_ |