diff options
Diffstat (limited to 'base/i18n/build_utf8_validator_tables.cc')
-rw-r--r-- | base/i18n/build_utf8_validator_tables.cc | 466 |
1 files changed, 0 insertions, 466 deletions
diff --git a/base/i18n/build_utf8_validator_tables.cc b/base/i18n/build_utf8_validator_tables.cc deleted file mode 100644 index ae5b1a71e9..0000000000 --- a/base/i18n/build_utf8_validator_tables.cc +++ /dev/null @@ -1,466 +0,0 @@ -// Copyright 2014 The Chromium Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -// Create a state machine for validating UTF-8. The algorithm in brief: -// 1. Convert the complete unicode range of code points, except for the -// surrogate code points, to an ordered array of sequences of bytes in -// UTF-8. -// 2. Convert individual bytes to ranges, starting from the right of each byte -// sequence. For each range, ensure the bytes on the left and the ranges -// on the right are the identical. -// 3. Convert the resulting list of ranges into a state machine, collapsing -// identical states. -// 4. Convert the state machine to an array of bytes. -// 5. Output as a C++ file. -// -// To use: -// $ ninja -C out/Release build_utf8_validator_tables -// $ out/Release/build_utf8_validator_tables -// --output=base/i18n/utf8_validator_tables.cc -// $ git add base/i18n/utf8_validator_tables.cc -// -// Because the table is not expected to ever change, it is checked into the -// repository rather than being regenerated at build time. -// -// This code uses type uint8 throughout to represent bytes, to avoid -// signed/unsigned char confusion. - -#include <stdio.h> -#include <stdlib.h> -#include <string.h> - -#include <algorithm> -#include <map> -#include <string> -#include <vector> - -#include "base/basictypes.h" -#include "base/command_line.h" -#include "base/files/file_path.h" -#include "base/files/file_util.h" -#include "base/logging.h" -#include "base/numerics/safe_conversions.h" -#include "base/strings/stringprintf.h" -#include "third_party/icu/source/common/unicode/utf8.h" - -namespace { - -const char kHelpText[] = - "Usage: build_utf8_validator_tables [ --help ] [ --output=<file> ]\n"; - -const char kProlog[] = - "// Copyright 2013 The Chromium Authors. All rights reserved.\n" - "// Use of this source code is governed by a BSD-style license that can " - "be\n" - "// found in the LICENSE file.\n" - "\n" - "// This file is auto-generated by build_utf8_validator_tables.\n" - "// DO NOT EDIT.\n" - "\n" - "#include \"base/i18n/utf8_validator_tables.h\"\n" - "\n" - "namespace base {\n" - "namespace internal {\n" - "\n" - "const uint8 kUtf8ValidatorTables[] = {\n"; - -const char kEpilog[] = - "};\n" - "\n" - "const size_t kUtf8ValidatorTablesSize = arraysize(kUtf8ValidatorTables);\n" - "\n" - "} // namespace internal\n" - "} // namespace base\n"; - -// Ranges are inclusive at both ends--they represent [from, to] -class Range { - public: - // Ranges always start with just one byte. - explicit Range(uint8 value) : from_(value), to_(value) {} - - // Range objects are copyable and assignable to be used in STL - // containers. Since they only contain non-pointer POD types, the default copy - // constructor, assignment operator and destructor will work. - - // Add a byte to the range. We intentionally only support adding a byte at the - // end, since that is the only operation the code needs. - void AddByte(uint8 to) { - CHECK(to == to_ + 1); - to_ = to; - } - - uint8 from() const { return from_; } - uint8 to() const { return to_; } - - bool operator<(const Range& rhs) const { - return (from() < rhs.from() || (from() == rhs.from() && to() < rhs.to())); - } - - bool operator==(const Range& rhs) const { - return from() == rhs.from() && to() == rhs.to(); - } - - private: - uint8 from_; - uint8 to_; -}; - -// A vector of Ranges is like a simple regular expression--it corresponds to -// a set of strings of the same length that have bytes in each position in -// the appropriate range. -typedef std::vector<Range> StringSet; - -// A UTF-8 "character" is represented by a sequence of bytes. -typedef std::vector<uint8> Character; - -// In the second stage of the algorithm, we want to convert a large list of -// Characters into a small list of StringSets. -struct Pair { - Character character; - StringSet set; -}; - -typedef std::vector<Pair> PairVector; - -// A class to print a table of numbers in the same style as clang-format. -class TablePrinter { - public: - explicit TablePrinter(FILE* stream) - : stream_(stream), values_on_this_line_(0), current_offset_(0) {} - - void PrintValue(uint8 value) { - if (values_on_this_line_ == 0) { - fputs(" ", stream_); - } else if (values_on_this_line_ == kMaxValuesPerLine) { - fprintf(stream_, " // 0x%02x\n ", current_offset_); - values_on_this_line_ = 0; - } - fprintf(stream_, " 0x%02x,", static_cast<int>(value)); - ++values_on_this_line_; - ++current_offset_; - } - - void NewLine() { - while (values_on_this_line_ < kMaxValuesPerLine) { - fputs(" ", stream_); - ++values_on_this_line_; - } - fprintf(stream_, " // 0x%02x\n", current_offset_); - values_on_this_line_ = 0; - } - - private: - // stdio stream. Not owned. - FILE* stream_; - - // Number of values so far printed on this line. - int values_on_this_line_; - - // Total values printed so far. - int current_offset_; - - static const int kMaxValuesPerLine = 8; - - DISALLOW_COPY_AND_ASSIGN(TablePrinter); -}; - -// Start by filling a PairVector with characters. The resulting vector goes from -// "\x00" to "\xf4\x8f\xbf\xbf". -PairVector InitializeCharacters() { - PairVector vector; - for (int i = 0; i <= 0x10FFFF; ++i) { - if (i >= 0xD800 && i < 0xE000) { - // Surrogate codepoints are not permitted. Non-character code points are - // explicitly permitted. - continue; - } - uint8 bytes[4]; - unsigned int offset = 0; - UBool is_error = false; - U8_APPEND(bytes, offset, arraysize(bytes), i, is_error); - DCHECK(!is_error); - DCHECK_GT(offset, 0u); - DCHECK_LE(offset, arraysize(bytes)); - Pair pair = {Character(bytes, bytes + offset), StringSet()}; - vector.push_back(pair); - } - return vector; -} - -// Construct a new Pair from |character| and the concatenation of |new_range| -// and |existing_set|, and append it to |pairs|. -void ConstructPairAndAppend(const Character& character, - const Range& new_range, - const StringSet& existing_set, - PairVector* pairs) { - Pair new_pair = {character, StringSet(1, new_range)}; - new_pair.set.insert( - new_pair.set.end(), existing_set.begin(), existing_set.end()); - pairs->push_back(new_pair); -} - -// Each pass over the PairVector strips one byte off the right-hand-side of the -// characters and adds a range to the set on the right. For example, the first -// pass converts the range from "\xe0\xa0\x80" to "\xe0\xa0\xbf" to ("\xe0\xa0", -// [\x80-\xbf]), then the second pass converts the range from ("\xe0\xa0", -// [\x80-\xbf]) to ("\xe0\xbf", [\x80-\xbf]) to ("\xe0", -// [\xa0-\xbf][\x80-\xbf]). -void MoveRightMostCharToSet(PairVector* pairs) { - PairVector new_pairs; - PairVector::const_iterator it = pairs->begin(); - while (it != pairs->end() && it->character.empty()) { - new_pairs.push_back(*it); - ++it; - } - CHECK(it != pairs->end()); - Character unconverted_bytes(it->character.begin(), it->character.end() - 1); - Range new_range(it->character.back()); - StringSet converted = it->set; - ++it; - while (it != pairs->end()) { - const Pair& current_pair = *it++; - if (current_pair.character.size() == unconverted_bytes.size() + 1 && - std::equal(unconverted_bytes.begin(), - unconverted_bytes.end(), - current_pair.character.begin()) && - converted == current_pair.set) { - // The particular set of UTF-8 codepoints we are validating guarantees - // that each byte range will be contiguous. This would not necessarily be - // true for an arbitrary set of UTF-8 codepoints. - DCHECK_EQ(new_range.to() + 1, current_pair.character.back()); - new_range.AddByte(current_pair.character.back()); - continue; - } - ConstructPairAndAppend(unconverted_bytes, new_range, converted, &new_pairs); - unconverted_bytes = Character(current_pair.character.begin(), - current_pair.character.end() - 1); - new_range = Range(current_pair.character.back()); - converted = current_pair.set; - } - ConstructPairAndAppend(unconverted_bytes, new_range, converted, &new_pairs); - new_pairs.swap(*pairs); -} - -void MoveAllCharsToSets(PairVector* pairs) { - // Since each pass of the function moves one character, and UTF-8 sequences - // are at most 4 characters long, this simply runs the algorithm four times. - for (int i = 0; i < 4; ++i) { - MoveRightMostCharToSet(pairs); - } -#if DCHECK_IS_ON() - for (PairVector::const_iterator it = pairs->begin(); it != pairs->end(); - ++it) { - DCHECK(it->character.empty()); - } -#endif -} - -// Logs the generated string sets in regular-expression style, ie. [\x00-\x7f], -// [\xc2-\xdf][\x80-\xbf], etc. This can be a useful sanity-check that the -// algorithm is working. Use the command-line option -// --vmodule=build_utf8_validator_tables=1 to see this output. -void LogStringSets(const PairVector& pairs) { - for (PairVector::const_iterator pair_it = pairs.begin(); - pair_it != pairs.end(); - ++pair_it) { - std::string set_as_string; - for (StringSet::const_iterator set_it = pair_it->set.begin(); - set_it != pair_it->set.end(); - ++set_it) { - set_as_string += base::StringPrintf("[\\x%02x-\\x%02x]", - static_cast<int>(set_it->from()), - static_cast<int>(set_it->to())); - } - VLOG(1) << set_as_string; - } -} - -// A single state in the state machine is represented by a sorted vector of -// start bytes and target states. All input bytes in the range between the start -// byte and the next entry in the vector (or 0xFF) result in a transition to the -// target state. -struct StateRange { - uint8 from; - uint8 target_state; -}; - -typedef std::vector<StateRange> State; - -// Generates a state where all bytes go to state 1 (invalid). This is also used -// as an initialiser for other states (since bytes from outside the desired -// range are invalid). -State GenerateInvalidState() { - const StateRange range = {0, 1}; - return State(1, range); -} - -// A map from a state (ie. a set of strings which will match from this state) to -// a number (which is an index into the array of states). -typedef std::map<StringSet, uint8> StateMap; - -// Create a new state corresponding to |set|, add it |states| and |state_map| -// and return the index it was given in |states|. -uint8 MakeState(const StringSet& set, - std::vector<State>* states, - StateMap* state_map) { - DCHECK(!set.empty()); - const Range& range = set.front(); - const StringSet rest(set.begin() + 1, set.end()); - const StateMap::const_iterator where = state_map->find(rest); - const uint8 target_state = where == state_map->end() - ? MakeState(rest, states, state_map) - : where->second; - DCHECK_LT(0, range.from()); - DCHECK_LT(range.to(), 0xFF); - const StateRange new_state_initializer[] = { - {0, 1}, {range.from(), target_state}, - {static_cast<uint8>(range.to() + 1), 1}}; - states->push_back( - State(new_state_initializer, - new_state_initializer + arraysize(new_state_initializer))); - const uint8 new_state_number = - base::checked_cast<uint8>(states->size() - 1); - CHECK(state_map->insert(std::make_pair(set, new_state_number)).second); - return new_state_number; -} - -std::vector<State> GenerateStates(const PairVector& pairs) { - // States 0 and 1 are the initial/valid state and invalid state, respectively. - std::vector<State> states(2, GenerateInvalidState()); - StateMap state_map; - state_map.insert(std::make_pair(StringSet(), 0)); - for (PairVector::const_iterator it = pairs.begin(); it != pairs.end(); ++it) { - DCHECK(it->character.empty()); - DCHECK(!it->set.empty()); - const Range& range = it->set.front(); - const StringSet rest(it->set.begin() + 1, it->set.end()); - const StateMap::const_iterator where = state_map.find(rest); - const uint8 target_state = where == state_map.end() - ? MakeState(rest, &states, &state_map) - : where->second; - if (states[0].back().from == range.from()) { - DCHECK_EQ(1, states[0].back().target_state); - states[0].back().target_state = target_state; - DCHECK_LT(range.to(), 0xFF); - const StateRange new_range = {static_cast<uint8>(range.to() + 1), 1}; - states[0].push_back(new_range); - } else { - DCHECK_LT(range.to(), 0xFF); - const StateRange new_range_initializer[] = {{range.from(), target_state}, - {static_cast<uint8>(range.to() + 1), 1}}; - states[0] - .insert(states[0].end(), - new_range_initializer, - new_range_initializer + arraysize(new_range_initializer)); - } - } - return states; -} - -// Output the generated states as a C++ table. Two tricks are used to compact -// the table: each state in the table starts with a shift value which indicates -// how many bits we can discard from the right-hand-side of the byte before -// doing the table lookup. Secondly, only the state-transitions for bytes -// with the top-bit set are included in the table; bytes without the top-bit set -// are just ASCII and are handled directly by the code. -void PrintStates(const std::vector<State>& states, FILE* stream) { - // First calculate the start-offset of each state. This allows the state - // machine to jump directly to the correct offset, avoiding an extra - // indirection. State 0 starts at offset 0. - std::vector<uint8> state_offset(1, 0); - std::vector<uint8> shifts; - uint8 pos = 0; - - for (std::vector<State>::const_iterator state_it = states.begin(); - state_it != states.end(); - ++state_it) { - // We want to set |shift| to the (0-based) index of the least-significant - // set bit in any of the ranges for this state, since this tells us how many - // bits we can discard and still determine what range a byte lies in. Sadly - // it appears that ffs() is not portable, so we do it clumsily. - uint8 shift = 7; - for (State::const_iterator range_it = state_it->begin(); - range_it != state_it->end(); - ++range_it) { - while (shift > 0 && range_it->from % (1 << shift) != 0) { - --shift; - } - } - shifts.push_back(shift); - pos += 1 + (1 << (7 - shift)); - state_offset.push_back(pos); - } - - DCHECK_EQ(129, state_offset[1]); - - fputs(kProlog, stream); - TablePrinter table_printer(stream); - - for (uint8 state_index = 0; state_index < states.size(); ++state_index) { - const uint8 shift = shifts[state_index]; - uint8 next_range = 0; - uint8 target_state = 1; - fprintf(stream, - " // State %d, offset 0x%02x\n", - static_cast<int>(state_index), - static_cast<int>(state_offset[state_index])); - table_printer.PrintValue(shift); - for (int i = 0; i < 0x100; i += (1 << shift)) { - if (next_range < states[state_index].size() && - states[state_index][next_range].from == i) { - target_state = states[state_index][next_range].target_state; - ++next_range; - } - if (i >= 0x80) { - table_printer.PrintValue(state_offset[target_state]); - } - } - table_printer.NewLine(); - } - - fputs(kEpilog, stream); -} - -} // namespace - -int main(int argc, char* argv[]) { - base::CommandLine::Init(argc, argv); - logging::LoggingSettings settings; - settings.logging_dest = logging::LOG_TO_SYSTEM_DEBUG_LOG; - logging::InitLogging(settings); - if (base::CommandLine::ForCurrentProcess()->HasSwitch("help")) { - fwrite(kHelpText, 1, arraysize(kHelpText), stdout); - exit(EXIT_SUCCESS); - } - base::FilePath filename = - base::CommandLine::ForCurrentProcess()->GetSwitchValuePath("output"); - - FILE* output = stdout; - if (!filename.empty()) { - output = base::OpenFile(filename, "wb"); - if (!output) - PLOG(FATAL) << "Couldn't open '" << filename.AsUTF8Unsafe() - << "' for writing"; - } - - // Step 1: Enumerate the characters - PairVector pairs = InitializeCharacters(); - // Step 2: Convert to sets. - MoveAllCharsToSets(&pairs); - if (VLOG_IS_ON(1)) { - LogStringSets(pairs); - } - // Step 3: Generate states. - std::vector<State> states = GenerateStates(pairs); - // Step 4/5: Print output - PrintStates(states, output); - - if (!filename.empty()) { - if (!base::CloseFile(output)) - PLOG(FATAL) << "Couldn't finish writing '" << filename.AsUTF8Unsafe() - << "'"; - } - - return EXIT_SUCCESS; -} |