aboutsummaryrefslogtreecommitdiff
path: root/binary_data_histogram.cc
blob: 7f6ece89011c4781237cb156b701ad0eccba7b9d (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
// Copyright 2017 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "components/zucchini/binary_data_histogram.h"

#include <algorithm>
#include <cmath>
#include <limits>

#include "base/check_op.h"
#include "base/format_macros.h"
#include "base/strings/stringprintf.h"

namespace zucchini {

/******** OutlierDetector ********/

OutlierDetector::OutlierDetector() = default;

OutlierDetector::~OutlierDetector() = default;

// For BinaryDataHistogram, |sample| is typically in interval [0, 1].
void OutlierDetector::Add(double sample) {
  ++n_;
  sum_ += sample;
  sum_of_squares_ += sample * sample;
}

void OutlierDetector::Prepare() {
  if (n_ > 0) {
    mean_ = sum_ / n_;
    standard_deviation_ = ::sqrt((sum_of_squares_ - sum_ * mean_) /
                                 std::max(static_cast<size_t>(1), n_ - 1));
  }
}

std::string OutlierDetector::RenderStats() {
  return base::StringPrintf("Mean = %.5f, StdDev = %.5f over %" PRIuS
                            " samples",
                            mean_, standard_deviation_, n_);
}

// Constants are chosen for BinaryDataHistogram, where |sample| is typically in
// [0, 1].
int OutlierDetector::DecideOutlier(double sample) {
  // Lower bound to avoid divide-by-zero and penalizing tight clusters.
  constexpr double kMinTolerance = 0.1;
  // Number of standard deviations away from mean for value to become outlier.
  constexpr double kSigmaBound = 1.9;
  if (n_ <= 1)
    return 0;
  double tolerance = std::max(kMinTolerance, standard_deviation_);
  double num_sigma = (sample - mean_) / tolerance;
  return num_sigma > kSigmaBound ? 1 : num_sigma < -kSigmaBound ? -1 : 0;
}

/******** BinaryDataHistogram ********/

BinaryDataHistogram::BinaryDataHistogram() = default;

BinaryDataHistogram::~BinaryDataHistogram() = default;

bool BinaryDataHistogram::Compute(ConstBufferView region) {
  DCHECK(!histogram_);
  // Binary data with size < 2 are invalid.
  if (region.size() < sizeof(uint16_t))
    return false;
  DCHECK_LE(region.size(),
            static_cast<size_t>(std::numeric_limits<int32_t>::max()));

  histogram_ = std::make_unique<int32_t[]>(kNumBins);
  size_ = region.size();
  // Number of 2-byte intervals fully contained in |region|.
  size_t bound = size_ - sizeof(uint16_t) + 1;
  for (size_t i = 0; i < bound; ++i)
    ++histogram_[region.read<uint16_t>(i)];
  return true;
}

double BinaryDataHistogram::Distance(const BinaryDataHistogram& other) const {
  DCHECK(IsValid() && other.IsValid());
  // Compute Manhattan (L1) distance between respective histograms.
  double total_diff = 0;
  for (int i = 0; i < kNumBins; ++i)
    total_diff += std::abs(histogram_[i] - other.histogram_[i]);
  // Normalize by total size, so result lies in [0, 1].
  return total_diff / (size_ + other.size_);
}

}  // namespace zucchini