aboutsummaryrefslogtreecommitdiff
path: root/binary_data_histogram_unittest.cc
blob: ca710102567ab7997012e0e2e85dae37dc8949af (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
// Copyright 2017 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "components/zucchini/binary_data_histogram.h"

#include <stddef.h>

#include <memory>
#include <vector>

#include "components/zucchini/buffer_view.h"
#include "testing/gtest/include/gtest/gtest.h"

namespace zucchini {

// Exercises OutlierDetector on several hand-picked sample sets. In the
// expectations below, DecideOutlier() yields -1 for queries far below the
// samples, 1 for queries far above them, and 0 otherwise.
TEST(OutlierDetectorTest, Basic) {
  // Creates a detector, feeds it every element of |samples| through Add(),
  // and calls Prepare() before handing it back.
  const auto make_detector = [](const std::vector<double>& samples) {
    auto prepared = std::make_unique<OutlierDetector>();
    for (const double sample : samples)
      prepared->Add(sample);
    prepared->Prepare();
    return prepared;
  };

  {
    // Empty sample set: Querying should at least not cause an error.
    auto detector = make_detector({});
    EXPECT_EQ(0, detector->DecideOutlier(0.0));
  }
  {
    // One sample only: Trivially inert, nothing is flagged.
    auto detector = make_detector({0.5});
    EXPECT_EQ(0, detector->DecideOutlier(0.1));
    EXPECT_EQ(0, detector->DecideOutlier(0.5));
    EXPECT_EQ(0, detector->DecideOutlier(0.9));
  }
  {
    // Two equal samples: StdDev is 0, so the built-in tolerance takes over.
    auto detector = make_detector({0.5, 0.5});
    EXPECT_EQ(-1, detector->DecideOutlier(0.3));
    EXPECT_EQ(0, detector->DecideOutlier(0.499));
    EXPECT_EQ(0, detector->DecideOutlier(0.5));
    EXPECT_EQ(0, detector->DecideOutlier(0.501));
    EXPECT_EQ(1, detector->DecideOutlier(0.7));
  }
  {
    // Two distinct samples: The outlier test is fairly lax.
    auto detector = make_detector({0.4, 0.6});
    EXPECT_EQ(-1, detector->DecideOutlier(0.2));
    EXPECT_EQ(0, detector->DecideOutlier(0.3));
    EXPECT_EQ(0, detector->DecideOutlier(0.5));
    EXPECT_EQ(0, detector->DecideOutlier(0.7));
    EXPECT_EQ(1, detector->DecideOutlier(0.8));
  }
  {
    // Same extremes, but with more samples clustered near the mean: The
    // distribution is sharper, so the test becomes stricter.
    auto detector =
        make_detector({0.4, 0.47, 0.48, 0.49, 0.50, 0.51, 0.52, 0.6});
    EXPECT_EQ(-1, detector->DecideOutlier(0.3));
    EXPECT_EQ(0, detector->DecideOutlier(0.4));
    EXPECT_EQ(0, detector->DecideOutlier(0.5));
    EXPECT_EQ(0, detector->DecideOutlier(0.6));
    EXPECT_EQ(1, detector->DecideOutlier(0.7));
  }
  {
    // Shifted so that the mean is 0.3, with the samples given out of order.
    auto detector =
        make_detector({0.28, 0.2, 0.31, 0.4, 0.29, 0.32, 0.27, 0.30});
    EXPECT_EQ(-1, detector->DecideOutlier(0.0));
    EXPECT_EQ(-1, detector->DecideOutlier(0.1));
    EXPECT_EQ(0, detector->DecideOutlier(0.2));
    EXPECT_EQ(0, detector->DecideOutlier(0.3));
    EXPECT_EQ(0, detector->DecideOutlier(0.4));
    EXPECT_EQ(1, detector->DecideOutlier(0.5));
    EXPECT_EQ(1, detector->DecideOutlier(1.0));
  }
  {
    // Typical usage: The candidate outliers (0.0 and 0.6 here) are themselves
    // part of the data that was fed to the detector.
    auto detector =
        make_detector({0.3, 0.29, 0.31, 0.0, 0.3, 0.32, 0.3, 0.29, 0.6});
    EXPECT_EQ(-1, detector->DecideOutlier(0.0));
    EXPECT_EQ(0, detector->DecideOutlier(0.28));
    EXPECT_EQ(0, detector->DecideOutlier(0.29));
    EXPECT_EQ(0, detector->DecideOutlier(0.3));
    EXPECT_EQ(0, detector->DecideOutlier(0.31));
    EXPECT_EQ(0, detector->DecideOutlier(0.32));
    EXPECT_EQ(1, detector->DecideOutlier(0.6));
  }
}

// Checks BinaryDataHistogram::Compute(), IsValid() and Distance() using
// histograms built from every prefix and every suffix of a small fixed byte
// sequence.
TEST(BinaryDataHistogramTest, Basic) {
  // Sentinel meaning "no previous score recorded yet". Every real distance
  // checked in the loops below is expected to be positive, so -1 cannot
  // collide with one.
  constexpr double kUninitScore = -1;

  constexpr uint8_t kTestData[] = {2, 137, 42, 0, 0, 0, 7, 11, 1, 11, 255};
  // Elements are single bytes, so sizeof() is also the element count.
  constexpr size_t n = sizeof(kTestData);
  ConstBufferView region(kTestData, n);

  // Index i holds the histogram of the first i bytes (prefix) and of the
  // bytes from offset i to the end (suffix), respectively.
  std::vector<BinaryDataHistogram> prefix_histograms(n + 1);  // Short to long.
  std::vector<BinaryDataHistogram> suffix_histograms(n + 1);  // Long to short.

  for (size_t i = 0; i <= n; ++i) {
    ConstBufferView prefix(region.begin(), i);
    ConstBufferView suffix(region.begin() + i, n - i);
    // If regions are smaller than 2 bytes then it is invalid. Else valid.
    EXPECT_EQ(prefix.size() >= 2, prefix_histograms[i].Compute(prefix));
    EXPECT_EQ(suffix.size() >= 2, suffix_histograms[i].Compute(suffix));
    // IsValid() returns the same results.
    EXPECT_EQ(prefix.size() >= 2, prefix_histograms[i].IsValid());
    EXPECT_EQ(suffix.size() >= 2, suffix_histograms[i].IsValid());
  }

  // Full-prefix = full-suffix = full data.
  EXPECT_EQ(0.0, prefix_histograms[n].Distance(suffix_histograms[0]));
  EXPECT_EQ(0.0, suffix_histograms[0].Distance(prefix_histograms[n]));

  // Testing heuristics without overreliance on implementation details.

  // Strict prefixes, in increasing size. Compare against full data. Starts at
  // 2 because shorter prefixes produced invalid histograms above.
  double prev_prefix_score = kUninitScore;
  for (size_t i = 2; i < n; ++i) {
    double score = prefix_histograms[i].Distance(prefix_histograms[n]);
    // Positivity.
    EXPECT_GT(score, 0.0);
    // Symmetry.
    EXPECT_EQ(score, prefix_histograms[n].Distance(prefix_histograms[i]));
    // Distance should decrease as prefix gets nearer to full data.
    if (prev_prefix_score != kUninitScore)
      EXPECT_LT(score, prev_prefix_score);
    prev_prefix_score = score;
  }

  // Strict suffixes, in decreasing size. Compare against full data. Stops at
  // n - 2 because shorter suffixes produced invalid histograms above.
  // Initialized from the same sentinel that the guard below compares against,
  // mirroring the prefix loop.
  double prev_suffix_score = kUninitScore;
  for (size_t i = 1; i <= n - 2; ++i) {
    double score = suffix_histograms[i].Distance(suffix_histograms[0]);
    // Positivity.
    EXPECT_GT(score, 0.0);
    // Symmetry.
    EXPECT_EQ(score, suffix_histograms[0].Distance(suffix_histograms[i]));
    // Distance should increase as suffix gets farther from full data.
    if (prev_suffix_score != kUninitScore)
      EXPECT_GT(score, prev_suffix_score);
    prev_suffix_score = score;
  }
}

}  // namespace zucchini