diff options
Diffstat (limited to 'base/i18n/streaming_utf8_validator_perftest.cc')
-rw-r--r-- | base/i18n/streaming_utf8_validator_perftest.cc | 240 |
1 files changed, 240 insertions, 0 deletions
diff --git a/base/i18n/streaming_utf8_validator_perftest.cc b/base/i18n/streaming_utf8_validator_perftest.cc new file mode 100644 index 0000000000..ad328f886d --- /dev/null +++ b/base/i18n/streaming_utf8_validator_perftest.cc @@ -0,0 +1,240 @@ +// Copyright 2014 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// All data that is passed through a WebSocket with type "Text" needs to be +// validated as UTF8. Since this is done on the IO thread, it needs to be +// reasonably fast. + +// We are only interested in the performance on valid UTF8. Invalid UTF8 will +// result in a connection failure, so is unlikely to become a source of +// performance issues. + +#include "base/i18n/streaming_utf8_validator.h" + +#include <stddef.h> + +#include <string> + +#include "base/bind.h" +#include "base/callback.h" +#include "base/macros.h" +#include "base/strings/string_util.h" +#include "base/strings/stringprintf.h" +#include "base/test/perf_time_logger.h" +#include "testing/gtest/include/gtest/gtest.h" + +namespace base { +namespace { + +// We want to test ranges of valid UTF-8 sequences. These ranges are inclusive. +// They are intended to be large enough that the validator needs to do +// meaningful work while being in some sense "realistic" (eg. control characters +// are not included). +const char kOneByteSeqRangeStart[] = " "; // U+0020 +const char kOneByteSeqRangeEnd[] = "~"; // U+007E + +const char kTwoByteSeqRangeStart[] = "\xc2\xa0"; // U+00A0 non-breaking space +const char kTwoByteSeqRangeEnd[] = "\xc9\x8f"; // U+024F small y with stroke + +const char kThreeByteSeqRangeStart[] = "\xe3\x81\x82"; // U+3042 Hiragana "a" +const char kThreeByteSeqRangeEnd[] = "\xe9\xbf\x83"; // U+9FC3 "to blink" + +const char kFourByteSeqRangeStart[] = "\xf0\xa0\x80\x8b"; // U+2000B +const char kFourByteSeqRangeEnd[] = "\xf0\xaa\x9a\xb2"; // U+2A6B2 + +// The different lengths of strings to test. +const size_t kTestLengths[] = {1, 32, 256, 32768, 1 << 20}; + +// Simplest possible byte-at-a-time validator, to provide a baseline +// for comparison. This is only tried on 1-byte UTF-8 sequences, as +// the results will not be meaningful with sequences containing +// top-bit-set bytes. +bool IsString7Bit(const std::string& s) { + for (std::string::const_iterator it = s.begin(); it != s.end(); ++it) { + if (*it & 0x80) + return false; + } + return true; +} + +// Assumes that |previous| is a valid UTF-8 sequence, and attempts to return +// the next one. Is just barely smart enough to iterate through the ranges +// defined about. +std::string NextUtf8Sequence(const std::string& previous) { + DCHECK(StreamingUtf8Validator::Validate(previous)); + std::string next = previous; + for (int i = static_cast<int>(previous.length() - 1); i >= 0; --i) { + // All bytes in a UTF-8 sequence except the first one are + // constrained to the range 0x80 to 0xbf, inclusive. When we + // increment past 0xbf, we carry into the previous byte. + if (i > 0 && next[i] == '\xbf') { + next[i] = '\x80'; + continue; // carry + } + ++next[i]; + break; // no carry + } + DCHECK(StreamingUtf8Validator::Validate(next)) + << "Result \"" << next << "\" failed validation"; + return next; +} + +typedef bool (*TestTargetType)(const std::string&); + +// Run fuction |target| over |test_string| |times| times, and report the results +// using |description|. +bool RunTest(const std::string& description, + TestTargetType target, + const std::string& test_string, + int times) { + base::PerfTimeLogger timer(description.c_str()); + bool result = true; + for (int i = 0; i < times; ++i) { + result = target(test_string) && result; + } + timer.Done(); + return result; +} + +// Construct a string by repeating |input| enough times to equal or exceed +// |length|. +std::string ConstructRepeatedTestString(const std::string& input, + size_t length) { + std::string output = input; + while (output.length() * 2 < length) { + output += output; + } + if (output.length() < length) { + output += ConstructRepeatedTestString(input, length - output.length()); + } + return output; +} + +// Construct a string by expanding the range of UTF-8 sequences +// between |input_start| and |input_end|, inclusive, and then +// repeating the resulting string until it equals or exceeds |length| +// bytes. |input_start| and |input_end| must be valid UTF-8 +// sequences. +std::string ConstructRangedTestString(const std::string& input_start, + const std::string& input_end, + size_t length) { + std::string output = input_start; + std::string input = input_start; + while (output.length() < length && input != input_end) { + input = NextUtf8Sequence(input); + output += input; + } + if (output.length() < length) { + output = ConstructRepeatedTestString(output, length); + } + return output; +} + +struct TestFunctionDescription { + TestTargetType function; + const char* function_name; +}; + +bool IsStringUTF8(const std::string& str) { + return base::IsStringUTF8(base::StringPiece(str)); +} + +// IsString7Bit is intentionally placed last so it can be excluded easily. +const TestFunctionDescription kTestFunctions[] = { + {&StreamingUtf8Validator::Validate, "StreamingUtf8Validator"}, + {&IsStringUTF8, "IsStringUTF8"}, {&IsString7Bit, "IsString7Bit"}}; + +// Construct a test string from |construct_test_string| for each of the lengths +// in |kTestLengths| in turn. For each string, run each test in |test_functions| +// for a number of iterations such that the total number of bytes validated +// is around 16MB. +void RunSomeTests( + const char format[], + base::Callback<std::string(size_t length)> construct_test_string, + const TestFunctionDescription* test_functions, + size_t test_count) { + for (size_t i = 0; i < arraysize(kTestLengths); ++i) { + const size_t length = kTestLengths[i]; + const std::string test_string = construct_test_string.Run(length); + const int real_length = static_cast<int>(test_string.length()); + const int times = (1 << 24) / real_length; + for (size_t test_index = 0; test_index < test_count; ++test_index) { + EXPECT_TRUE(RunTest(StringPrintf(format, + test_functions[test_index].function_name, + real_length, + times), + test_functions[test_index].function, + test_string, + times)); + } + } +} + +TEST(StreamingUtf8ValidatorPerfTest, OneByteRepeated) { + RunSomeTests("%s: bytes=1 repeated length=%d repeat=%d", + base::Bind(ConstructRepeatedTestString, kOneByteSeqRangeStart), + kTestFunctions, + 3); +} + +TEST(StreamingUtf8ValidatorPerfTest, OneByteRange) { + RunSomeTests("%s: bytes=1 ranged length=%d repeat=%d", + base::Bind(ConstructRangedTestString, + kOneByteSeqRangeStart, + kOneByteSeqRangeEnd), + kTestFunctions, + 3); +} + +TEST(StreamingUtf8ValidatorPerfTest, TwoByteRepeated) { + RunSomeTests("%s: bytes=2 repeated length=%d repeat=%d", + base::Bind(ConstructRepeatedTestString, kTwoByteSeqRangeStart), + kTestFunctions, + 2); +} + +TEST(StreamingUtf8ValidatorPerfTest, TwoByteRange) { + RunSomeTests("%s: bytes=2 ranged length=%d repeat=%d", + base::Bind(ConstructRangedTestString, + kTwoByteSeqRangeStart, + kTwoByteSeqRangeEnd), + kTestFunctions, + 2); +} + +TEST(StreamingUtf8ValidatorPerfTest, ThreeByteRepeated) { + RunSomeTests( + "%s: bytes=3 repeated length=%d repeat=%d", + base::Bind(ConstructRepeatedTestString, kThreeByteSeqRangeStart), + kTestFunctions, + 2); +} + +TEST(StreamingUtf8ValidatorPerfTest, ThreeByteRange) { + RunSomeTests("%s: bytes=3 ranged length=%d repeat=%d", + base::Bind(ConstructRangedTestString, + kThreeByteSeqRangeStart, + kThreeByteSeqRangeEnd), + kTestFunctions, + 2); +} + +TEST(StreamingUtf8ValidatorPerfTest, FourByteRepeated) { + RunSomeTests("%s: bytes=4 repeated length=%d repeat=%d", + base::Bind(ConstructRepeatedTestString, kFourByteSeqRangeStart), + kTestFunctions, + 2); +} + +TEST(StreamingUtf8ValidatorPerfTest, FourByteRange) { + RunSomeTests("%s: bytes=4 ranged length=%d repeat=%d", + base::Bind(ConstructRangedTestString, + kFourByteSeqRangeStart, + kFourByteSeqRangeEnd), + kTestFunctions, + 2); +} + +} // namespace +} // namespace base |