summaryrefslogtreecommitdiff
path: root/Stats.h
diff options
context:
space:
mode:
authortanjent@gmail.com <tanjent@gmail.com@77a7d1d3-4c08-bdc2-d393-d5859734b01a>2010-11-05 01:20:58 +0000
committertanjent@gmail.com <tanjent@gmail.com@77a7d1d3-4c08-bdc2-d393-d5859734b01a>2010-11-05 01:20:58 +0000
commitad4b363201477cb33966510b850c2193b1f825fe (patch)
tree9b4bdb1b5d9ed22197655b796a205f3c458ab4fc /Stats.h
parent2e8984f1e39b91dae37a23aea35d30b78f46c096 (diff)
downloadsrc-ad4b363201477cb33966510b850c2193b1f825fe.tar.gz
MurmurHash3 is released to beta
(potentially some constant-tweaking yet to be done, but it is quite usable and all variants pass all tests) git-svn-id: http://smhasher.googlecode.com/svn/trunk@5 77a7d1d3-4c08-bdc2-d393-d5859734b01a
Diffstat (limited to 'Stats.h')
-rw-r--r--Stats.h368
1 files changed, 80 insertions, 288 deletions
diff --git a/Stats.h b/Stats.h
index 5cae64e..b4afe2c 100644
--- a/Stats.h
+++ b/Stats.h
@@ -1,13 +1,12 @@
#pragma once
-#include "Core.h"
+#include "Types.h"
-#include <algorithm>
#include <math.h>
-#include <assert.h>
-#include <float.h>
+#include <vector>
+#include <algorithm> // for std::sort
-double calcScore ( std::vector<int> const & bins, int balls );
+double calcScore ( const int * bins, const int bincount, const int ballcount );
void plot ( double n );
@@ -16,12 +15,11 @@ inline double ExpectedCollisions ( double balls, double bins )
return balls - bins + bins * pow(1 - 1/bins,balls);
}
-double comparenorms ( double u1, double s1, double u2, double s2 );
-void beta2norm ( double a, double b, double & u, double & s );
-
double chooseK ( int b, int k );
double chooseUpToK ( int n, int k );
+//-----------------------------------------------------------------------------
+
inline uint32_t f3mix ( uint32_t k )
{
k ^= k >> 16;
@@ -53,7 +51,6 @@ int CountCollisions ( std::vector<hashtype> const & hashes )
//-----------------------------------------------------------------------------
-/*
template < class keytype, typename hashtype >
int PrintCollisions ( hashfunc<hashtype> hash, std::vector<keytype> & keys )
{
@@ -81,20 +78,18 @@ int PrintCollisions ( hashfunc<hashtype> hash, std::vector<keytype> & keys )
}
else
{
- htab.insert( htab::value_type(h,k);
+ htab.insert( htab::value_type(h,k) );
}
}
return collcount;
}
-*/
//----------------------------------------------------------------------------
template < typename hashtype >
-bool testhashlist( std::vector<hashtype> & hashes, bool testColl, bool testDist, bool drawDiagram )
+bool TestHashList ( std::vector<hashtype> & hashes, bool testColl, bool testDist, bool drawDiagram )
{
- bool verbose = true;
bool result = true;
if(testColl)
@@ -103,39 +98,33 @@ bool testhashlist( std::vector<hashtype> & hashes, bool testColl, bool testDist,
double expected = (double(count) * double(count-1)) / pow(2.0,double(sizeof(hashtype) * 8 + 1));
- if(verbose) printf("Testing collisions - Expected %8.2f, ",expected);
+ printf("Testing collisions - Expected %8.2f, ",expected);
double collcount = 0;
collcount = CountCollisions(hashes);
- if(verbose)
- {
- printf("actual %8.2f (%5.2fx) \n",collcount, collcount / expected);
- }
- else
- {
- double collscore = collcount / expected;
-
- printf("Coll score %5.3f, ",collscore);
- }
+ printf("actual %8.2f (%5.2fx)",collcount, collcount / expected);
// 2x expected collisions = fail
+ // #TODO - collision failure cutoff needs to be expressed as a standard deviation instead
+ // of a scale factor, otherwise we fail erroneously if there are a small expected number
+ // of collisions
+
if(double(collcount) / double(expected) > 2.0)
{
+ printf(" !!!!! ");
result = false;
}
+
+ printf("\n");
}
//----------
if(testDist)
{
- if(verbose) printf("Testing distribution - ");
-
- if(drawDiagram) printf("\n");
-
TestDistribution(hashes,drawDiagram);
}
@@ -145,7 +134,7 @@ bool testhashlist( std::vector<hashtype> & hashes, bool testColl, bool testDist,
//-----------------------------------------------------------------------------
template < class keytype, typename hashtype >
-bool testkeylist ( hashfunc<hashtype> hash, std::vector<keytype> & keys, bool testColl, bool testDist, bool drawDiagram )
+bool TestKeyList ( hashfunc<hashtype> hash, std::vector<keytype> & keys, bool testColl, bool testDist, bool drawDiagram )
{
int keycount = (int)keys.size();
@@ -153,49 +142,22 @@ bool testkeylist ( hashfunc<hashtype> hash, std::vector<keytype> & keys, bool te
hashes.resize(keycount);
- //printf("Hashing keyset");
+ printf("Hashing");
- for(int ikey = 0; ikey < keycount; ikey++)
+ for(int i = 0; i < keycount; i++)
{
- keytype & k = keys[ikey];
+ if(i % (keycount / 10) == 0) printf(".");
- //if(ikey % (keycount / 10) == 0) printf(".");
+ keytype & k = keys[i];
- hashes[ikey] = hash(&k,sizeof(k),0);
+ hash(&k,sizeof(k),0,&hashes[i]);
}
- //printf("\n");
+ printf("\n");
- bool result = testhashlist(hashes,testColl,testDist,drawDiagram);
+ bool result = TestHashList(hashes,testColl,testDist,drawDiagram);
- return result;
-}
-
-//-----------------------------------------------------------------------------
-
-template < typename hashtype >
-bool testkeylist_string ( hashfunc<hashtype> hash, std::vector<std::string> & keys, bool testColl, bool testDist )
-{
- int keycount = (int)keys.size();
-
- std::vector<hashtype> hashes;
-
- hashes.resize(keycount);
-
- //printf("Hashing keyset");
-
- for(int ikey = 0; ikey < keycount; ikey++)
- {
- std::string & k = keys[ikey];
-
- //if(ikey % (keycount / 10) == 0) printf(".");
-
- hashes[ikey] = hash(&k[0],(int)k.size(),0);
- }
-
- //printf("\n");
-
- bool result = testhashlist(hashes,testColl,testDist);
+ printf("\n");
return result;
}
@@ -214,7 +176,7 @@ template < typename hashtype >
double TestDistributionBytepairs ( std::vector<hashtype> & hashes, bool drawDiagram )
{
const int nbytes = sizeof(hashtype);
- const int nbits = nbytes * 8;
+ const int hashbits = nbytes * 8;
const int nbins = 65536;
@@ -222,13 +184,13 @@ double TestDistributionBytepairs ( std::vector<hashtype> & hashes, bool drawDiag
double worst = 0;
- for(int a = 0; a < nbits; a++)
+ for(int a = 0; a < hashbits; a++)
{
if(drawDiagram) if((a % 8 == 0) && (a > 0)) printf("\n");
if(drawDiagram) printf("[");
- for(int b = 0; b < nbits; b++)
+ for(int b = 0; b < hashbits; b++)
{
if(drawDiagram) if((b % 8 == 0) && (b > 0)) printf(" ");
@@ -245,7 +207,7 @@ double TestDistributionBytepairs ( std::vector<hashtype> & hashes, bool drawDiag
bins[pa | (pb << 8)]++;
}
- double s = calcScore(bins,hashes.size());
+ double s = calcScore(bins,bins.size(),hashes.size());
if(drawDiagram) plot(s);
@@ -263,48 +225,60 @@ double TestDistributionBytepairs ( std::vector<hashtype> & hashes, bool drawDiag
//----------------------------------------------------------------------------
-// Measure the distribution "score" for each possible N-bit span up to 16 bits
-// and draw a nice graph of the output. 'X' in graph = 10% deviation from ideal.
+// Measure the distribution "score" for each possible N-bit span up to 20 bits
template< typename hashtype >
double TestDistribution ( std::vector<hashtype> & hashes, bool drawDiagram )
{
- bool verbose = false;
+ printf("Testing distribution - ");
+
+ if(drawDiagram) printf("\n");
+
+ const int hashbits = sizeof(hashtype) * 8;
- const int nbits = sizeof(hashtype) * 8;
- const int maxwidth = 20;
+ int maxwidth = 20;
+
+ // We need at least 5 keys per bin to reliably test distribution biases
+ // down to 1%, so don't bother to test sparser distributions than that
+
+ while(double(hashes.size()) / double(1 << maxwidth) < 5.0)
+ {
+ maxwidth--;
+ }
std::vector<int> bins;
+ bins.resize(1 << maxwidth);
double worst = 0;
int worstStart = -1;
int worstWidth = -1;
- for(int width = 1; width <= maxwidth; width++)
+ for(int start = 0; start < hashbits; start++)
{
- const int bincount = (1 << width);
+ int width = maxwidth;
+ int bincount = (1 << width);
- //If we don't have enough keys to get 2 per bin, skip the test
+ memset(&bins[0],0,sizeof(int)*bincount);
- //if(double(hashes.size()) / double(bincount) < 2.0) continue;
+ for(size_t j = 0; j < hashes.size(); j++)
+ {
+ hashtype & hash = hashes[j];
- if(drawDiagram) printf("%2d - [",width);
+ uint32_t index = window(&hash,sizeof(hash),start,width);
- for(int start = 0; start < nbits; start++)
- {
- bins.clear();
- bins.resize(bincount, 0);
+ bins[index]++;
+ }
- for(size_t j = 0; j < hashes.size(); j++)
- {
- hashtype & hash = hashes[j];
+ // Test the distribution, then fold the bins in half,
+ // repeat until we're down to 256 bins
- uint32_t index = window(&hash,sizeof(hash),start,width);
+ if(drawDiagram) printf("[");
- bins[index]++;
- }
+ while(bincount >= 256)
+ {
+ double n = calcScore(&bins[0],bincount,(int)hashes.size());
- double n = calcScore(bins,(int)hashes.size());
+ if(drawDiagram) plot(n);
if(n > worst)
{
@@ -313,20 +287,25 @@ double TestDistribution ( std::vector<hashtype> & hashes, bool drawDiagram )
worstWidth = width;
}
- if(drawDiagram) plot(n);
+ width--;
+ bincount /= 2;
+
+ if(width < 8) break;
+
+ for(int i = 0; i < bincount; i++)
+ {
+ bins[i] += bins[i+bincount];
+ }
}
if(drawDiagram) printf("]\n");
}
- if(verbose)
- {
- printf("Worst distribution is for (%d:%d) - %f\n",worstStart,(worstStart+worstWidth-1)%32,worst);
- }
- else
- {
- printf("Dist score %6.3f\n",(1.0 - worst) * 100);
- }
+ double pct = worst * 100.0;
+
+ printf("Worst bias is the %3d-bit window at bit %3d - %5.3f%%",worstWidth,worstStart,pct);
+ if(pct >= 1.0) printf(" !!!!! ");
+ printf("\n");
return worst;
}
@@ -337,7 +316,7 @@ double TestDistribution ( std::vector<hashtype> & hashes, bool drawDiagram )
template < typename hashtype >
void TestDistributionFast ( std::vector<hashtype> & hashes, double & dworst, double & davg )
{
- const int nbits = sizeof(hashtype) * 8;
+ const int hashbits = sizeof(hashtype) * 8;
const int nbins = 65536;
std::vector<int> bins(nbins,0);
@@ -345,7 +324,7 @@ void TestDistributionFast ( std::vector<hashtype> & hashes, double & dworst, dou
dworst = -1.0e90;
davg = 0;
- for(int start = 0; start < nbits; start += 8)
+ for(int start = 0; start < hashbits; start += 8)
{
bins.clear();
bins.resize(nbins,0);
@@ -366,194 +345,7 @@ void TestDistributionFast ( std::vector<hashtype> & hashes, double & dworst, dou
if(n > dworst) dworst = n;
}
- davg /= double(nbits/8);
+ davg /= double(hashbits/8);
}
//-----------------------------------------------------------------------------
-
-/*
-struct Stats
-{
- enum mode
- {
- AVALANCHE,
- HISTOGRAM,
- };
-
- Stats ( int mode, std::vector<int> const & bins, int balls )
- {
- switch(mode)
- {
- case AVALANCHE:
- calcAvalanche(bins,balls);
- break;
-
- case HISTOGRAM:
- calcHistogram(bins,balls);
- break;
-
- default:
- assert(false);
- break;
- }
- }
-
- //----------
- // Histogram mode
-
- void calcHistogram ( std::vector<int> const & bins, int balls )
- {
- m_nbins = (int)bins.size();
- m_nballs = balls;
-
- m_mean = 0;
- m_rms = 0;
- m_sigma = 0;
- m_max = -DBL_MAX;
- m_min = DBL_MAX;
-
- for(size_t i = 0; i < bins.size(); i++)
- {
- double x = bins[i];
-
- m_mean += x;
- m_rms += x*x;
-
- m_max = x > m_max ? x : m_max;
- m_min = x < m_min ? x : m_min;
- }
-
- m_mean /= m_nbins;
- m_rms /= m_nbins;
- m_rms = sqrt(m_rms);
-
- for(size_t i = 0; i < bins.size(); i++)
- {
- double d = bins[i] - m_mean;
-
- m_sigma += d*d;
- }
-
- m_sigma /= m_nbins;
- m_sigma = sqrt(m_sigma);
- }
-
- //----------
- // Normalized standard deviation
-
- double calcNSD ( std::vector<int> const & bins, int balls )
- {
- double n = (int)bins.size();
- double k = balls;
- double p = 1.0/n;
-
- double u = k*p;
- double s = sqrt(k*p*(1-p));
-
- double c = 0;
-
- for(size_t i = 0; i < bins.size(); i++)
- {
- double d = bins[i];
-
- d = (d-u)/s;
-
- c += d*d;
- }
-
- m_nsd = sqrt(c / m_nbins);
- }
-
- double calcScore ( std::vector<int> const & bins, int balls )
- {
- double n = (int)bins.size();
- double k = balls;
-
- // compute rms value
-
- double r = 0;
-
- for(size_t i = 0; i < bins.size(); i++)
- {
- double b = bins[i];
-
- r += b*b;
- }
-
- r = sqrt(r / n);
-
- // compute fill factor
-
- double f = (k*k - 1) / (n*r*r - k);
-
- // rescale to (0,1) with 0 = good, 1 = bad
-
- m_score = 1 - (f / n);
- }
-
- //----------
- // Avalanche statistics - convert each table entry to a bias value
- // and compute stats based on that.
-
- void calcAvalanche ( std::vector<int> const & bins, int balls )
- {
- m_nbins = (int)bins.size();
- m_nballs = balls;
-
- m_mean = 0;
- m_rms = 0;
- m_sigma = 0;
- m_max = -DBL_MAX;
- m_min = DBL_MAX;
- m_nbad = 0;
-
- for(size_t i = 0; i < bins.size(); i++)
- {
- double x = (bins[i] / m_nballs) * 2 - 1;
-
- m_mean += x;
- m_rms += x*x;
-
- x = fabs(x);
-
- if(x > 0.7) m_nbad++;
-
- m_max = x > m_max ? x : m_max;
- m_min = x < m_min ? x : m_min;
- }
-
- m_mean /= m_nbins;
- m_rms /= m_nbins;
- m_rms = sqrt(m_rms);
-
- for(size_t i = 0; i < bins.size(); i++)
- {
- double x = (bins[i] / m_nballs) * 2 - 1;
-
- double d = x - m_mean;
-
- m_sigma += d*d;
- }
-
- m_sigma /= m_nbins;
- m_sigma = sqrt(m_sigma);
- }
-
- double m_nbins;
- double m_nballs;
-
- double m_mean;
- double m_rms;
- double m_sigma;
-
- double m_nsd;
- double m_score;
-
- double m_nbad;
-
- double m_max;
- double m_min;
-};
-*/
-
-//-----------------------------------------------------------------------------