diff options
Diffstat (limited to 'icing/testing/random-string.h')
-rw-r--r-- | icing/testing/random-string.h | 25 |
1 files changed, 25 insertions, 0 deletions
// Excerpt of icing/testing/random-string.h covering the declarations touched
// by this change. In the real header these live inside `namespace lib`.

inline constexpr std::string_view kAlNumAlphabet =
    "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";

// Average length of a word in English is 4.7 characters.
inline constexpr int kAvgTokenLen = 5;
// Made-up value. This results in a fairly reasonable language: the majority of
// generated words are 3-9 characters, ~3% of words are >=20 chars, and the
// longest ones are 27 chars (roughly consistent with the longest
// non-contrived English words,
// https://en.wikipedia.org/wiki/Longest_word_in_English).
inline constexpr int kTokenStdDev = 7;

// Defined earlier in this header; its body is not part of this excerpt, so
// only the signature is repeated here.
template <typename Gen>
std::string RandomString(const std::string_view alphabet, size_t len,
                         Gen* gen);

// Creates a vector containing num_words randomly-generated words for use by
// documents.
//
// Each word's length is sampled from a normal distribution with mean
// kAvgTokenLen and standard deviation kTokenStdDev, rounded to the nearest
// integer; samples below 1 are discarded and redrawn. The word itself is
// produced by RandomString over kAlNumAlphabet. Nothing here de-duplicates the
// words, so the same word may appear more than once.
//
// Args:
//   num_words: number of words to generate. Values <= 0 yield an empty vector.
//   r: random generator used both for the length samples and by RandomString.
//      It is dereferenced, so it must not be null.
//
// Returns:
//   The generated words, in generation order.
template <typename Rand>
std::vector<std::string> CreateLanguages(int num_words, Rand* r) {
  std::vector<std::string> language;
  if (num_words <= 0) {
    return language;
  }
  // The final size is known, so allocate once instead of growing repeatedly.
  language.reserve(static_cast<size_t>(num_words));

  std::normal_distribution<> norm_dist(kAvgTokenLen, kTokenStdDev);
  // Count upwards rather than decrementing the parameter: `--num_words` on
  // INT_MIN would be signed overflow.
  for (int i = 0; i < num_words; ++i) {
    int word_length = 0;
    // Rejection-sample until the rounded length is a usable, positive size.
    do {
      word_length = static_cast<int>(std::round(norm_dist(*r)));
    } while (word_length < 1);
    language.push_back(
        RandomString(kAlNumAlphabet, static_cast<size_t>(word_length), r));
  }
  return language;
}

// Returns a vector containing num_terms unique terms. Terms are created in
// non-random order starting with "a" to "z" to "aa" to "zz", etc.
std::vector<std::string> GenerateUniqueTerms(int num_terms);