path: root/pw_tokenizer/token_database_fuzzer.cc

// Copyright 2020 The Pigweed Authors
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy of
// the License at
//
//     https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations under
// the License.

// This file implements a basic fuzz test for the TokenDatabase class. A
// database is created from fuzz data, and a random entry count (also derived
// from the fuzz data) is set. We then iterate over the database and run 'find'
// operations on it.

#include <cstdint>
#include <cstring>
#include <span>
#include <vector>

#include "pw_fuzzer/asan_interface.h"
#include "pw_fuzzer/fuzzed_data_provider.h"
#include "pw_preprocessor/util.h"
#include "pw_tokenizer/token_database.h"

namespace pw::tokenizer {
namespace {

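// Selects whether the fuzzed database image begins with the well-formed
// default header or a fully random one.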
enum FuzzTestType : uint8_t {
  kValidHeader,
  kRandomHeader,
  kMaxValue = kRandomHeader,
};

constexpr size_t kTokenHeaderSize = 16;

// The default maximum length in bytes of the fuzz data provided. Note that
// this needs to change if the fuzzer executable is run with a different
// '-max_len' argument.
constexpr size_t kFuzzDataSizeMax = 4096;

// Location of the 'EntryCount' field in the token header.
constexpr size_t kEntryCountOffset = 8;
constexpr size_t kEntryCountSize = 4;

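// Overwrites the entry count field of the token database header in `buffer`
// with a fuzzer-chosen value.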
void SetTokenEntryCountInBuffer(uint8_t* buffer, uint32_t count) {
  memcpy(buffer + kEntryCountOffset, &count, kEntryCountSize);
}

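// Reads every entry in the database. Combined with the ASAN poisoning done in
// the fuzz target, this helps catch iteration that walks past the provided
// data.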
void IterateOverDatabase(TokenDatabase* const database) {
  for (TokenDatabase::Entry entry : *database) {
    // The contents of the entry are not otherwise used, so read them into
    // volatile variables to keep the accesses from being optimized out during
    // compilation.
    [[maybe_unused]] volatile const char* entry_string = entry.string;
    [[maybe_unused]] volatile uint32_t entry_token = entry.token;
  }
}

}  // namespace

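// libFuzzer entry point. Builds a token database image in a static buffer from
// the fuzz input, then exercises Find() and iteration over the result.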
extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
  constexpr size_t kBufferSizeMax = kFuzzDataSizeMax + kTokenHeaderSize;
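  // Header used for the kValidHeader case: the six-character "TOKENS" tag
  // followed by zero padding (including the implicit terminator), for a total
  // of kTokenHeaderSize bytes.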
  constexpr char kDefaultHeader[] = "TOKENS\0\0\0\0\0\0\0\0\0";
  static uint8_t buffer[kBufferSizeMax];

  if (size > kFuzzDataSizeMax) {
    return 0;
  }

  FuzzedDataProvider provider(data, size);

  // Initialize the token header with either a valid or invalid header
  // based on a random enum consumed from the fuzz data.
  switch (provider.ConsumeEnum<FuzzTestType>()) {
    case kValidHeader:
      memcpy(buffer, kDefaultHeader, kTokenHeaderSize);
      break;

    case kRandomHeader: {
      std::vector<uint8_t> random_header =
          provider.ConsumeBytes<uint8_t>(kTokenHeaderSize);
      random_header.resize(kTokenHeaderSize);
      memcpy(buffer, random_header.data(), kTokenHeaderSize);
      break;
    }
  }

  // Consume a 'test token' integer to look up later in the database.
  uint32_t random_token = provider.ConsumeIntegral<uint32_t>();

  // Consume a 'token count' integer to set as our database entry count.
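  // The count may exceed the number of entries actually encoded in the
  // remaining data, which exercises handling of inconsistent headers.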
  uint32_t random_token_count =
      provider.ConsumeIntegralInRange<uint32_t>(0, kFuzzDataSizeMax);

  // Consume the remaining data. Note that the data corresponding to the
  // string entries in the database is not explicitly null-terminated.
  // TODO(karthikmb): Once OSS-Fuzz updates to Clang 11.0, switch to
  // provider.ConsumeData() to avoid the extra memory and the memcpy call.
  auto consumed_bytes =
      provider.ConsumeBytes<uint8_t>(provider.remaining_bytes());
  if (!consumed_bytes.empty()) {
    memcpy(buffer + kTokenHeaderSize,
           consumed_bytes.data(),
           consumed_bytes.size());
  }

  SetTokenEntryCountInBuffer(buffer, random_token_count);

  // Poison the unused buffer space for this run of the fuzzer to prevent the
  // token database creator from reading past the provided data.
  size_t data_size = kTokenHeaderSize + consumed_bytes.size();
  size_t poisoned_length = kBufferSizeMax - data_size;
  void* poisoned = &buffer[data_size];

  ASAN_POISON_MEMORY_REGION(poisoned, poisoned_length);

  // Create the database from a std::span of the buffer, since the string
  // entries might not be null-terminated and creating a database from a raw
  // buffer has an explicit null-terminated-string requirement specified in the
  // API.
  std::span<uint8_t> data_span(buffer, data_size);
  auto token_database = TokenDatabase::Create<std::span<uint8_t>>(data_span);
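  // Store the result in a volatile so the lookup is not optimized away.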
  [[maybe_unused]] volatile auto match = token_database.Find(random_token);

  IterateOverDatabase(&token_database);

  // Un-poison for the next iteration.
  ASAN_UNPOISON_MEMORY_REGION(poisoned, poisoned_length);

  return 0;
}

}  // namespace pw::tokenizer