author    tanjent@gmail.com <tanjent@gmail.com@77a7d1d3-4c08-bdc2-d393-d5859734b01a>    2010-11-02 00:50:04 +0000
committer tanjent@gmail.com <tanjent@gmail.com@77a7d1d3-4c08-bdc2-d393-d5859734b01a>    2010-11-02 00:50:04 +0000
commit    7e5c363a6f2ae202b928097bd3a1936db46df1b9 (patch)
tree      f780f37fae2211ec8edefe7b3bbdfa04424751ff /Stats.h
parent    b5082b0d1cb91eabc73e91d6db5b5c5035bdbad6 (diff)
download  src-7e5c363a6f2ae202b928097bd3a1936db46df1b9.tar.gz
Initial source checkin
git-svn-id: http://smhasher.googlecode.com/svn/trunk@2 77a7d1d3-4c08-bdc2-d393-d5859734b01a
Diffstat (limited to 'Stats.h')
-rw-r--r--  Stats.h  559
1 file changed, 559 insertions, 0 deletions
diff --git a/Stats.h b/Stats.h
new file mode 100644
index 0000000..5cae64e
--- /dev/null
+++ b/Stats.h
@@ -0,0 +1,559 @@
+#pragma once
+
+#include "Core.h"
+
+#include <algorithm>
+#include <math.h>
+#include <assert.h>
+#include <float.h>
+
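+// calcScore reduces a histogram of bin counts to a single uniformity score
+// (0 = ideal, larger = worse - see the commented-out Stats struct at the end
+// of this file), and plot() prints a one-character bar for a score; both are
+// presumably defined in the accompanying Stats.cpp.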
+double calcScore ( std::vector<int> const & bins, int balls );
+
+void plot ( double n );
+
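+// Classic balls-into-bins result: the expected number of occupied bins is
+// bins * (1 - (1 - 1/bins)^balls), and every ball that lands in an already-
+// occupied bin is a collision, so
+//
+//   collisions = balls - occupied = balls - bins + bins * (1 - 1/bins)^balls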
+inline double ExpectedCollisions ( double balls, double bins )
+{
+ return balls - bins + bins * pow(1 - 1/bins,balls);
+}
+
+double comparenorms ( double u1, double s1, double u2, double s2 );
+void beta2norm ( double a, double b, double & u, double & s );
+
+double chooseK ( int b, int k );
+double chooseUpToK ( int n, int k );
+
+inline uint32_t f3mix ( uint32_t k )
+{
+ k ^= k >> 16;
+ k *= 0x85ebca6b;
+ k ^= k >> 13;
+ k *= 0xc2b2ae35;
+ k ^= k >> 16;
+
+ return k;
+}
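+
+// f3mix is the 32-bit finalizer ("fmix") from MurmurHash3. Every step is a
+// bijection on uint32_t (xorshifts and odd multiplies are invertible), so
+// distinct inputs always produce distinct outputs - handy for generating
+// synthetic "good" hashes in tests.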
+
+//-----------------------------------------------------------------------------
+
+template< typename hashtype >
+int CountCollisions ( std::vector<hashtype> const & hashes )
+{
+ int collcount = 0;
+
+ std::vector<hashtype> temp = hashes;
+ std::sort(temp.begin(),temp.end());
+
+ for(size_t i = 1; i < hashes.size(); i++)
+ {
+ if(temp[i] == temp[i-1]) collcount++;
+ }
+
+ return collcount;
+}
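+
+// Example usage (a sketch): because f3mix is a bijection, hashing distinct
+// inputs with it should report zero collisions.
+//
+//   std::vector<uint32_t> hashes(1000);
+//   for(uint32_t i = 0; i < 1000; i++) hashes[i] = f3mix(i);
+//   printf("%d collisions\n", CountCollisions(hashes));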
+
+//-----------------------------------------------------------------------------
+
+/*
+template < class keytype, typename hashtype >
+int PrintCollisions ( hashfunc<hashtype> hash, std::vector<keytype> & keys )
+{
+ int collcount = 0;
+
+ typedef std::map<hashtype,keytype> htab;
+ htab tab;
+
+	for(size_t i = 0; i < keys.size(); i++)
+	{
+		keytype & k1 = keys[i];
+
+		hashtype h = hash(&k1,sizeof(k1),0);
+
+		typename htab::iterator it = tab.find(h);
+
+		if(it != tab.end())
+		{
+			keytype & k2 = (*it).second;
+
+			collcount++;
+
+			printf("A: ");
+			printbits(&k1,sizeof(k1));
+			printf("B: ");
+			printbits(&k2,sizeof(k2));
+		}
+		else
+		{
+			tab.insert( typename htab::value_type(h,k1) );
+		}
+ }
+
+ return collcount;
+}
+*/
+
+//----------------------------------------------------------------------------
+
+// Forward declaration - testhashlist calls TestDistribution, which is defined
+// further down in this header.
+template< typename hashtype >
+double TestDistribution ( std::vector<hashtype> & hashes, bool drawDiagram );
+
+template < typename hashtype >
+bool testhashlist ( std::vector<hashtype> & hashes, bool testColl, bool testDist, bool drawDiagram )
+{
+ bool verbose = true;
+ bool result = true;
+
+ if(testColl)
+ {
+ size_t count = hashes.size();
+
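+    // Birthday approximation: count*(count-1)/2 pairs of hashes, each pair
+    // colliding with probability 2^-bits, giving count*(count-1) / 2^(bits+1)
+    // expected collisions.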
+ double expected = (double(count) * double(count-1)) / pow(2.0,double(sizeof(hashtype) * 8 + 1));
+
+ if(verbose) printf("Testing collisions - Expected %8.2f, ",expected);
+
+    double collcount = CountCollisions(hashes);
+
+ if(verbose)
+ {
+ printf("actual %8.2f (%5.2fx) \n",collcount, collcount / expected);
+ }
+ else
+ {
+ double collscore = collcount / expected;
+
+ printf("Coll score %5.3f, ",collscore);
+ }
+
+ // 2x expected collisions = fail
+
+ if(double(collcount) / double(expected) > 2.0)
+ {
+ result = false;
+ }
+ }
+
+ //----------
+
+ if(testDist)
+ {
+ if(verbose) printf("Testing distribution - ");
+
+ if(drawDiagram) printf("\n");
+
+ TestDistribution(hashes,drawDiagram);
+ }
+
+ return result;
+}
+
+//-----------------------------------------------------------------------------
+
+template < class keytype, typename hashtype >
+bool testkeylist ( hashfunc<hashtype> hash, std::vector<keytype> & keys, bool testColl, bool testDist, bool drawDiagram )
+{
+ int keycount = (int)keys.size();
+
+ std::vector<hashtype> hashes;
+
+ hashes.resize(keycount);
+
+ //printf("Hashing keyset");
+
+ for(int ikey = 0; ikey < keycount; ikey++)
+ {
+ keytype & k = keys[ikey];
+
+ //if(ikey % (keycount / 10) == 0) printf(".");
+
+ hashes[ikey] = hash(&k,sizeof(k),0);
+ }
+
+ //printf("\n");
+
+ bool result = testhashlist(hashes,testColl,testDist,drawDiagram);
+
+ return result;
+}
+
+//-----------------------------------------------------------------------------
+
+template < typename hashtype >
+bool testkeylist_string ( hashfunc<hashtype> hash, std::vector<std::string> & keys, bool testColl, bool testDist )
+{
+ int keycount = (int)keys.size();
+
+ std::vector<hashtype> hashes;
+
+ hashes.resize(keycount);
+
+ //printf("Hashing keyset");
+
+ for(int ikey = 0; ikey < keycount; ikey++)
+ {
+ std::string & k = keys[ikey];
+
+ //if(ikey % (keycount / 10) == 0) printf(".");
+
+ hashes[ikey] = hash(&k[0],(int)k.size(),0);
+ }
+
+ //printf("\n");
+
+	bool result = testhashlist(hashes,testColl,testDist,false);
+
+ return result;
+}
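+
+// Example usage (a sketch - 'myhash' is a placeholder for whatever
+// hashfunc<uint32_t> the caller has; the wrapper is callable as
+// hash(ptr,len,seed), as above):
+//
+//   std::vector<uint32_t> keys(10000);
+//   for(size_t i = 0; i < keys.size(); i++) keys[i] = (uint32_t)i;
+//   bool ok = testkeylist(myhash,keys,true,true,false);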
+
+//-----------------------------------------------------------------------------
+// Bytepair test - generate 16-bit indices from all possible pairs of 8-bit
+// windows of the hash value, and check the distribution of all of them.
+
+// This is a very good test for catching weak intercorrelations between bits -
+// it is much harder to pass than the normal distribution test. However, it
+// doesn't really model the normal usage of hash functions in hash table
+// lookup, so I'm not sure how useful it is (and hash functions that fail this
+// test but pass the normal distribution test still work well in practice).
+
+template < typename hashtype >
+double TestDistributionBytepairs ( std::vector<hashtype> & hashes, bool drawDiagram )
+{
+ const int nbytes = sizeof(hashtype);
+ const int nbits = nbytes * 8;
+
+ const int nbins = 65536;
+
+ std::vector<int> bins(nbins,0);
+
+ double worst = 0;
+
+ for(int a = 0; a < nbits; a++)
+ {
+ if(drawDiagram) if((a % 8 == 0) && (a > 0)) printf("\n");
+
+ if(drawDiagram) printf("[");
+
+ for(int b = 0; b < nbits; b++)
+ {
+ if(drawDiagram) if((b % 8 == 0) && (b > 0)) printf(" ");
+
+ bins.clear();
+ bins.resize(nbins,0);
+
+ for(size_t i = 0; i < hashes.size(); i++)
+ {
+ hashtype & hash = hashes[i];
+
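+            // window() - the bit-window helper declared elsewhere in the
+            // project - extracts an 8-bit window starting at bit a (resp. b),
+            // wrapping around the end of the hash.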
+ uint32_t pa = window(&hash,sizeof(hash),a,8);
+ uint32_t pb = window(&hash,sizeof(hash),b,8);
+
+ bins[pa | (pb << 8)]++;
+ }
+
+        double s = calcScore(bins,(int)hashes.size());
+
+ if(drawDiagram) plot(s);
+
+ if(s > worst)
+ {
+ worst = s;
+ }
+ }
+
+ if(drawDiagram) printf("]\n");
+ }
+
+ return worst;
+}
+
+
+//----------------------------------------------------------------------------
+// Measure the distribution "score" for each possible N-bit span, for N up to
+// 20 bits, and draw a nice graph of the output. 'X' in the graph = 10%
+// deviation from ideal.
+
+template< typename hashtype >
+double TestDistribution ( std::vector<hashtype> & hashes, bool drawDiagram )
+{
+ bool verbose = false;
+
+ const int nbits = sizeof(hashtype) * 8;
+ const int maxwidth = 20;
+
+ std::vector<int> bins;
+
+ double worst = 0;
+ int worstStart = -1;
+ int worstWidth = -1;
+
+ for(int width = 1; width <= maxwidth; width++)
+ {
+ const int bincount = (1 << width);
+
+ //If we don't have enough keys to get 2 per bin, skip the test
+
+ //if(double(hashes.size()) / double(bincount) < 2.0) continue;
+
+ if(drawDiagram) printf("%2d - [",width);
+
+ for(int start = 0; start < nbits; start++)
+ {
+ bins.clear();
+ bins.resize(bincount, 0);
+
+ for(size_t j = 0; j < hashes.size(); j++)
+ {
+ hashtype & hash = hashes[j];
+
+ uint32_t index = window(&hash,sizeof(hash),start,width);
+
+ bins[index]++;
+ }
+
+ double n = calcScore(bins,(int)hashes.size());
+
+ if(n > worst)
+ {
+ worst = n;
+ worstStart = start;
+ worstWidth = width;
+ }
+
+ if(drawDiagram) plot(n);
+ }
+
+ if(drawDiagram) printf("]\n");
+ }
+
+ if(verbose)
+ {
+ printf("Worst distribution is for (%d:%d) - %f\n",worstStart,(worstStart+worstWidth-1)%32,worst);
+ }
+ else
+ {
+ printf("Dist score %6.3f\n",(1.0 - worst) * 100);
+ }
+
+ return worst;
+}
+
+//-----------------------------------------------------------------------------
+// Simplified test - only check 64k distributions, and only on byte boundaries
+
+template < typename hashtype >
+void TestDistributionFast ( std::vector<hashtype> & hashes, double & dworst, double & davg )
+{
+ const int nbits = sizeof(hashtype) * 8;
+ const int nbins = 65536;
+
+ std::vector<int> bins(nbins,0);
+
+ dworst = -1.0e90;
+ davg = 0;
+
+ for(int start = 0; start < nbits; start += 8)
+ {
+ bins.clear();
+ bins.resize(nbins,0);
+
+ for(size_t j = 0; j < hashes.size(); j++)
+ {
+ hashtype & hash = hashes[j];
+
+ uint32_t index = window(&hash,sizeof(hash),start,16);
+
+ bins[index]++;
+ }
+
+ double n = calcScore(bins,(int)hashes.size());
+
+ davg += n;
+
+ if(n > dworst) dworst = n;
+ }
+
+ davg /= double(nbits/8);
+}
+
+//-----------------------------------------------------------------------------
+
+/*
+struct Stats
+{
+ enum mode
+ {
+ AVALANCHE,
+ HISTOGRAM,
+ };
+
+ Stats ( int mode, std::vector<int> const & bins, int balls )
+ {
+ switch(mode)
+ {
+ case AVALANCHE:
+ calcAvalanche(bins,balls);
+ break;
+
+ case HISTOGRAM:
+ calcHistogram(bins,balls);
+ break;
+
+ default:
+ assert(false);
+ break;
+ }
+ }
+
+ //----------
+ // Histogram mode
+
+ void calcHistogram ( std::vector<int> const & bins, int balls )
+ {
+ m_nbins = (int)bins.size();
+ m_nballs = balls;
+
+ m_mean = 0;
+ m_rms = 0;
+ m_sigma = 0;
+ m_max = -DBL_MAX;
+ m_min = DBL_MAX;
+
+ for(size_t i = 0; i < bins.size(); i++)
+ {
+ double x = bins[i];
+
+ m_mean += x;
+ m_rms += x*x;
+
+ m_max = x > m_max ? x : m_max;
+ m_min = x < m_min ? x : m_min;
+ }
+
+ m_mean /= m_nbins;
+ m_rms /= m_nbins;
+ m_rms = sqrt(m_rms);
+
+ for(size_t i = 0; i < bins.size(); i++)
+ {
+ double d = bins[i] - m_mean;
+
+ m_sigma += d*d;
+ }
+
+ m_sigma /= m_nbins;
+ m_sigma = sqrt(m_sigma);
+ }
+
+ //----------
+ // Normalized standard deviation
+
+ double calcNSD ( std::vector<int> const & bins, int balls )
+ {
+		double n = (double)bins.size();
+ double k = balls;
+ double p = 1.0/n;
+
+ double u = k*p;
+ double s = sqrt(k*p*(1-p));
+
+ double c = 0;
+
+ for(size_t i = 0; i < bins.size(); i++)
+ {
+ double d = bins[i];
+
+ d = (d-u)/s;
+
+ c += d*d;
+ }
+
+		m_nsd = sqrt(c / n);
+
+		return m_nsd;
+	}
+
+ double calcScore ( std::vector<int> const & bins, int balls )
+ {
+		double n = (double)bins.size();
+ double k = balls;
+
+ // compute rms value
+
+ double r = 0;
+
+ for(size_t i = 0; i < bins.size(); i++)
+ {
+ double b = bins[i];
+
+ r += b*b;
+ }
+
+ r = sqrt(r / n);
+
+ // compute fill factor
+
+ double f = (k*k - 1) / (n*r*r - k);
+
+ // rescale to (0,1) with 0 = good, 1 = bad
+
+		m_score = 1 - (f / n);
+
+		return m_score;
+	}
+
+ //----------
+ // Avalanche statistics - convert each table entry to a bias value
+ // and compute stats based on that.
+
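+	// bins[i] counts how often a given output bit flipped over 'balls'
+	// trials; an ideal bit flips exactly half the time, so the bias
+	// (bins[i]/balls)*2 - 1 lies in [-1,1] with 0 ideal. Biases with
+	// magnitude above 0.7 are counted as "bad".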
+ void calcAvalanche ( std::vector<int> const & bins, int balls )
+ {
+ m_nbins = (int)bins.size();
+ m_nballs = balls;
+
+ m_mean = 0;
+ m_rms = 0;
+ m_sigma = 0;
+ m_max = -DBL_MAX;
+ m_min = DBL_MAX;
+ m_nbad = 0;
+
+ for(size_t i = 0; i < bins.size(); i++)
+ {
+ double x = (bins[i] / m_nballs) * 2 - 1;
+
+ m_mean += x;
+ m_rms += x*x;
+
+ x = fabs(x);
+
+ if(x > 0.7) m_nbad++;
+
+ m_max = x > m_max ? x : m_max;
+ m_min = x < m_min ? x : m_min;
+ }
+
+ m_mean /= m_nbins;
+ m_rms /= m_nbins;
+ m_rms = sqrt(m_rms);
+
+ for(size_t i = 0; i < bins.size(); i++)
+ {
+ double x = (bins[i] / m_nballs) * 2 - 1;
+
+ double d = x - m_mean;
+
+ m_sigma += d*d;
+ }
+
+ m_sigma /= m_nbins;
+ m_sigma = sqrt(m_sigma);
+ }
+
+ double m_nbins;
+ double m_nballs;
+
+ double m_mean;
+ double m_rms;
+ double m_sigma;
+
+ double m_nsd;
+ double m_score;
+
+ double m_nbad;
+
+ double m_max;
+ double m_min;
+};
+*/
+
+//-----------------------------------------------------------------------------