diff options
Diffstat (limited to 'base/i18n/streaming_utf8_validator.h')
-rw-r--r-- | base/i18n/streaming_utf8_validator.h | 66 |
1 files changed, 66 insertions, 0 deletions
diff --git a/base/i18n/streaming_utf8_validator.h b/base/i18n/streaming_utf8_validator.h new file mode 100644 index 0000000000..ebf38a69b3 --- /dev/null +++ b/base/i18n/streaming_utf8_validator.h @@ -0,0 +1,66 @@ +// Copyright 2014 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// A streaming validator for UTF-8. Validation is based on the definition in +// RFC-3629. In particular, it does not reject the invalid characters rejected +// by base::IsStringUTF8(). +// +// The implementation detects errors on the first possible byte. + +#ifndef BASE_I18N_STREAMING_UTF8_VALIDATOR_H_ +#define BASE_I18N_STREAMING_UTF8_VALIDATOR_H_ + +#include <stddef.h> +#include <stdint.h> + +#include <string> + +#include "base/i18n/base_i18n_export.h" +#include "base/macros.h" + +namespace base { + +class BASE_I18N_EXPORT StreamingUtf8Validator { + public: + // The validator exposes 3 states. It starts in state VALID_ENDPOINT. As it + // processes characters it alternates between VALID_ENDPOINT and + // VALID_MIDPOINT. If it encounters an invalid byte or UTF-8 sequence the + // state changes permanently to INVALID. + enum State { + VALID_ENDPOINT, + VALID_MIDPOINT, + INVALID + }; + + StreamingUtf8Validator() : state_(0u) {} + // Trivial destructor intentionally omitted. + + // Validate |size| bytes starting at |data|. If the concatenation of all calls + // to AddBytes() since this object was constructed or reset is a valid UTF-8 + // string, returns VALID_ENDPOINT. If it could be the prefix of a valid UTF-8 + // string, returns VALID_MIDPOINT. If an invalid byte or UTF-8 sequence was + // present, returns INVALID. + State AddBytes(const char* data, size_t size); + + // Return the object to a freshly-constructed state so that it can be re-used. + void Reset(); + + // Validate a complete string using the same criteria. Returns true if the + // string only contains complete, valid UTF-8 codepoints. + static bool Validate(const std::string& string); + + private: + // The current state of the validator. Value 0 is the initial/valid state. + // The state is stored as an offset into |kUtf8ValidatorTables|. The special + // state |kUtf8InvalidState| is invalid. + uint8_t state_; + + // This type could be made copyable but there is currently no use-case for + // it. + DISALLOW_COPY_AND_ASSIGN(StreamingUtf8Validator); +}; + +} // namespace base + +#endif // BASE_I18N_STREAMING_UTF8_VALIDATOR_H_ |