aboutsummaryrefslogtreecommitdiff
path: root/icing/testing/random-string.h
blob: a313c1c882f9740f8c1f8df552776661443efea5 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
// Copyright (C) 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef ICING_TESTING_RANDOM_STRING_H_
#define ICING_TESTING_RANDOM_STRING_H_

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <random>
#include <string>
#include <string_view>
#include <vector>

namespace icing {
namespace lib {

// The 62 ASCII alphanumeric characters: digits first, then lowercase letters,
// then uppercase letters.
inline constexpr std::string_view kAlNumAlphabet{
    "0123456789"
    "abcdefghijklmnopqrstuvwxyz"
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"};

// Mean length of a generated token. The average English word is 4.7
// characters long.
inline constexpr int kAvgTokenLen{5};
// Standard deviation of the token length. The value is made up, but it yields
// a fairly reasonable language: the majority of generated words are 3-9
// characters, ~3% of words are >=20 chars, and the longest ones are 27 chars.
// That is roughly consistent with the longest non-contrived English words; see
// https://en.wikipedia.org/wiki/Longest_word_in_English
inline constexpr int kTokenStdDev{7};

// Returns a string of `len` characters, each drawn independently and uniformly
// at random from `alphabet`.
//
//   alphabet: the characters to draw from. A character that appears more than
//             once is proportionally more likely to be picked.
//   len:      the number of characters to generate.
//   gen:      random engine used for every draw; it is dereferenced, so it
//             must not be null. Its state advances as characters are drawn.
//
// Returns an empty string when `alphabet` is empty (there is nothing to draw
// from) or when `len` is 0.
template <typename Gen>
std::string RandomString(const std::string_view alphabet, size_t len,
                         Gen* gen) {
  // Guard against an empty alphabet: `alphabet.size() - 1` would wrap around
  // to SIZE_MAX and every generated index would read out of bounds.
  if (alphabet.empty()) {
    return std::string();
  }

  // Inclusive range [0, size - 1] covers every valid index into `alphabet`.
  std::uniform_int_distribution<size_t> uniform(0u, alphabet.size() - 1);
  std::string result(len, '\0');
  std::generate(
      std::begin(result), std::end(result),
      [&gen, &alphabet, &uniform]() { return alphabet[uniform(*gen)]; });

  return result;
}

// Creates a vector containing num_words randomly-generated words for use by
// documents.
//
// Each word's length is sampled from a normal distribution with mean
// kAvgTokenLen and standard deviation kTokenStdDev and rounded to the nearest
// integer. Samples that round to less than 1 are discarded and redrawn, so
// every word has at least one character. The characters themselves come from
// RandomString over kAlNumAlphabet.
//
//   num_words: the number of words to generate; a value <= 0 yields an empty
//              vector.
//   r:         random engine used for both the lengths and the characters; it
//              is dereferenced, so it must not be null.
template <typename Rand>
std::vector<std::string> CreateLanguages(int num_words, Rand* r) {
  std::vector<std::string> language;
  if (num_words <= 0) {
    return language;
  }
  // The final size is known up front, so allocate once.
  language.reserve(static_cast<size_t>(num_words));

  std::normal_distribution<> norm_dist(kAvgTokenLen, kTokenStdDev);
  // Counting upwards (rather than decrementing num_words) avoids signed
  // overflow for extreme negative inputs, which are already handled above.
  for (int i = 0; i < num_words; ++i) {
    int word_length = 0;
    // Rejection sampling: redraw until the rounded length is at least 1.
    do {
      word_length = static_cast<int>(std::round(norm_dist(*r)));
    } while (word_length < 1);
    language.push_back(RandomString(kAlNumAlphabet,
                                    static_cast<size_t>(word_length), r));
  }
  return language;
}

// Returns a vector containing num_terms unique terms. Terms are created in
// non-random order starting with "a" to "z" to "aa" to "zz", etc.
//
// NOTE(review): only the declaration is in this header; the definition is not
// visible here (presumably in the matching .cc file). Confirm the exact term
// ordering and the behavior for num_terms <= 0 against that implementation.
std::vector<std::string> GenerateUniqueTerms(int num_terms);

}  // namespace lib
}  // namespace icing

#endif  // ICING_TESTING_RANDOM_STRING_H_