aboutsummaryrefslogtreecommitdiff
path: root/icing/testing/random-string.h
diff options
context:
space:
mode:
Diffstat (limited to 'icing/testing/random-string.h')
-rw-r--r--icing/testing/random-string.h25
1 files changed, 25 insertions, 0 deletions
diff --git a/icing/testing/random-string.h b/icing/testing/random-string.h
index fd8d87b..a313c1c 100644
--- a/icing/testing/random-string.h
+++ b/icing/testing/random-string.h
@@ -25,6 +25,15 @@ namespace lib {
inline constexpr std::string_view kAlNumAlphabet =
"0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
+// Mean token length: the average English word is about 4.7 characters long.
+inline constexpr int kAvgTokenLen = 5;
+// Made-up standard deviation. It yields a fairly reasonable language: most
+// generated words are 3-9 characters, ~3% of words are >=20 chars, and the
+// longest ones are 27 chars (roughly consistent with the longest
+// non-contrived English words, see
+// https://en.wikipedia.org/wiki/Longest_word_in_English).
+inline constexpr int kTokenStdDev = 7;
+
template <typename Gen>
std::string RandomString(const std::string_view alphabet, size_t len,
Gen* gen) {
@@ -37,6 +46,22 @@ std::string RandomString(const std::string_view alphabet, size_t len,
return result;
}
+// Creates a vector containing num_words randomly-generated kAlNumAlphabet words
+// for use by documents (empty if num_words <= 0). r is the random generator.
+template <typename Rand>
+std::vector<std::string> CreateLanguages(int num_words, Rand* r) {
+ std::vector<std::string> language;
+ std::normal_distribution<> norm_dist(kAvgTokenLen, kTokenStdDev);
+ while (--num_words >= 0) {  // One iteration per requested word.
+ int word_length = 0;
+ while (word_length < 1) {  // Redraw until the rounded length is positive.
+ word_length = std::round(norm_dist(*r));
+ }
+ language.push_back(RandomString(kAlNumAlphabet, word_length, r));
+ }
+ return language;
+}
+
// Returns a vector containing num_terms unique terms. Terms are created in
// non-random order starting with "a" to "z" to "aa" to "zz", etc.
std::vector<std::string> GenerateUniqueTerms(int num_terms);