summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authortanjent@gmail.com <tanjent@gmail.com@77a7d1d3-4c08-bdc2-d393-d5859734b01a>2011-04-01 08:50:06 +0000
committertanjent@gmail.com <tanjent@gmail.com@77a7d1d3-4c08-bdc2-d393-d5859734b01a>2011-04-01 08:50:06 +0000
commit603c8781c055989d0123fcb61f60871bd26e095c (patch)
tree03fa917ea12582546035f0cc7fbd99447b8a843f
parent96601f2cd5d12b4618ecd830e8a155cad18e15dc (diff)
downloadsrc-603c8781c055989d0123fcb61f60871bd26e095c.tar.gz
Tweak to Murmur3a (yeah, even though I said it was finalized...)
Added key-processing callback experiments, may move all keyset tests to use it git-svn-id: http://smhasher.googlecode.com/svn/trunk@102 77a7d1d3-4c08-bdc2-d393-d5859734b01a
-rw-r--r--Bitvec.cpp46
-rw-r--r--Bitvec.h16
-rw-r--r--KeysetTest.cpp68
-rw-r--r--KeysetTest.h75
-rw-r--r--MurmurHash3.cpp12
-rw-r--r--SMHasher.vcproj8
-rw-r--r--SpeedTest.cpp2
-rw-r--r--Stats.h46
-rw-r--r--Types.h116
-rw-r--r--main.cpp38
10 files changed, 316 insertions, 111 deletions
diff --git a/Bitvec.cpp b/Bitvec.cpp
index 932b200..2160060 100644
--- a/Bitvec.cpp
+++ b/Bitvec.cpp
@@ -14,9 +14,9 @@ void assert ( bool )
//----------------------------------------------------------------------------
-void printbits ( void * blob, int len )
+void printbits ( const void * blob, int len )
{
- uint8_t * data = (uint8_t*)blob;
+ const uint8_t * data = (const uint8_t *)blob;
printf("[");
for(int i = 0; i < len; i++)
@@ -37,7 +37,7 @@ void printbits ( void * blob, int len )
printf("]");
}
-void printbits2 ( uint8_t * k, int nbytes )
+void printbits2 ( const uint8_t * k, int nbytes )
{
printf("[");
@@ -55,7 +55,7 @@ void printbits2 ( uint8_t * k, int nbytes )
printf("]");
}
-void printhex32 ( void * blob, int len )
+void printhex32 ( const void * blob, int len )
{
assert((len & 3) == 0);
@@ -71,7 +71,7 @@ void printhex32 ( void * blob, int len )
printf("}");
}
-void printbytes ( void * blob, int len )
+void printbytes ( const void * blob, int len )
{
uint8_t * d = (uint8_t*)blob;
@@ -85,9 +85,41 @@ void printbytes ( void * blob, int len )
printf(" };");
}
+void printbytes2 ( const void * blob, int len )
+{
+ uint8_t * d = (uint8_t*)blob;
+
+ for(int i = 0; i < len; i++)
+ {
+ printf("%02x ",d[i]);
+ }
+}
+
+//-----------------------------------------------------------------------------
+// Bit-level manipulation
+
+// These two are from the "Bit Twiddling Hacks" webpage
+
+uint32_t popcount ( uint32_t v )
+{
+ v = v - ((v >> 1) & 0x55555555); // reuse input as temporary
+ v = (v & 0x33333333) + ((v >> 2) & 0x33333333); // temp
+ uint32_t c = ((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24; // count
+
+ return c;
+}
+
+uint32_t parity ( uint32_t v )
+{
+ v ^= v >> 1;
+ v ^= v >> 2;
+ v = (v & 0x11111111U) * 0x11111111U;
+ return (v >> 28) & 1;
+}
+
//-----------------------------------------------------------------------------
-uint32_t getbit ( void * block, int len, uint32_t bit )
+uint32_t getbit ( const void * block, int len, uint32_t bit )
{
uint8_t * b = (uint8_t*)block;
@@ -99,7 +131,7 @@ uint32_t getbit ( void * block, int len, uint32_t bit )
return 0;
}
-uint32_t getbit_wrap ( void * block, int len, uint32_t bit )
+uint32_t getbit_wrap ( const void * block, int len, uint32_t bit )
{
uint8_t * b = (uint8_t*)block;
diff --git a/Bitvec.h b/Bitvec.h
index ac91c36..8a3a1b0 100644
--- a/Bitvec.h
+++ b/Bitvec.h
@@ -6,12 +6,16 @@
//-----------------------------------------------------------------------------
-void printbits ( void * blob, int len );
-void printhex32 ( void * blob, int len );
-void printbytes ( void * blob, int len );
+void printbits ( const void * blob, int len );
+void printhex32 ( const void * blob, int len );
+void printbytes ( const void * blob, int len );
+void printbytes2 ( const void * blob, int len );
-uint32_t getbit ( void * blob, int len, uint32_t bit );
-uint32_t getbit_wrap ( void * blob, int len, uint32_t bit );
+uint32_t popcount ( uint32_t v );
+uint32_t parity ( uint32_t v );
+
+uint32_t getbit ( const void * blob, int len, uint32_t bit );
+uint32_t getbit_wrap ( const void * blob, int len, uint32_t bit );
void setbit ( void * blob, int len, uint32_t bit );
void setbit ( void * blob, int len, uint32_t bit, uint32_t val );
@@ -23,7 +27,7 @@ void flipbit ( void * blob, int len, uint32_t bit );
int countbits ( uint32_t v );
int countbits ( std::vector<uint32_t> & v );
-int countbits ( void * blob, int len );
+int countbits ( const void * blob, int len );
void invert ( std::vector<uint32_t> & v );
diff --git a/KeysetTest.cpp b/KeysetTest.cpp
index 6519f8b..f11d512 100644
--- a/KeysetTest.cpp
+++ b/KeysetTest.cpp
@@ -1,7 +1,11 @@
#include "KeysetTest.h"
+#include "Platform.h"
#include "Random.h"
+#include <map>
+#include <set>
+
//-----------------------------------------------------------------------------
// This should hopefully be a thorough and uambiguous test of whether a hash
// is correctly implemented on a given platform
@@ -194,3 +198,67 @@ void AppendedZeroesTest ( pfHash hash, const int hashbits )
}
//-----------------------------------------------------------------------------
+// Generate all keys of up to N bytes containing two non-zero bytes
+
+void TwoBytesKeygen ( int maxlen, KeyCallback & c )
+{
+ //----------
+ // Compute # of keys
+
+ int keycount = 0;
+
+ for(int i = 2; i <= maxlen; i++) keycount += (int)chooseK(i,2);
+
+ keycount *= 255*255;
+
+ for(int i = 2; i <= maxlen; i++) keycount += i*255;
+
+ printf("Keyset 'TwoBytes' - up-to-%d-byte keys, %d total keys\n",maxlen, keycount);
+
+ c.reserve(keycount);
+
+ //----------
+ // Add all keys with one non-zero byte
+
+ uint8_t key[256];
+
+ memset(key,0,256);
+
+ for(int keylen = 2; keylen <= maxlen; keylen++)
+ for(int byteA = 0; byteA < keylen; byteA++)
+ {
+ for(int valA = 1; valA <= 255; valA++)
+ {
+ key[byteA] = (uint8_t)valA;
+
+ c(key,keylen);
+ }
+
+ key[byteA] = 0;
+ }
+
+ //----------
+ // Add all keys with two non-zero bytes
+
+ for(int keylen = 2; keylen <= maxlen; keylen++)
+ for(int byteA = 0; byteA < keylen-1; byteA++)
+ for(int byteB = byteA+1; byteB < keylen; byteB++)
+ {
+ for(int valA = 1; valA <= 255; valA++)
+ {
+ key[byteA] = (uint8_t)valA;
+
+ for(int valB = 1; valB <= 255; valB++)
+ {
+ key[byteB] = (uint8_t)valB;
+ c(key,keylen);
+ }
+
+ key[byteB] = 0;
+ }
+
+ key[byteA] = 0;
+ }
+}
+
+//-----------------------------------------------------------------------------
diff --git a/KeysetTest.h b/KeysetTest.h
index 9e41b6f..55d5d5f 100644
--- a/KeysetTest.h
+++ b/KeysetTest.h
@@ -210,7 +210,7 @@ bool WindowedKeyTest ( hashfunc<hashtype> hash, const int windowbits, bool testC
bool result = true;
- int testcount = (keybits-windowbits);
+ int testcount = keybits;
printf("Keyset 'Windowed' - %3d-bit key, %3d-bit window - %d tests, %d keys per test\n",keybits,windowbits,testcount,keycount);
@@ -223,7 +223,9 @@ bool WindowedKeyTest ( hashfunc<hashtype> hash, const int windowbits, bool testC
for(int i = 0; i < keycount; i++)
{
key = i;
- key = key << minbit;
+ //key = key << minbit;
+
+ lrot(&key,sizeof(keytype),minbit);
hash(&key,sizeof(keytype),0,&hashes[i]);
}
@@ -291,77 +293,20 @@ bool CyclicKeyTest ( pfHash hash, int cycleLen, int cycleReps, const int keycoun
//-----------------------------------------------------------------------------
// Keyset 'TwoBytes' - generate all keys up to length N with two non-zero bytes
+void TwoBytesKeygen ( int maxlen, KeyCallback & c );
+
template < typename hashtype >
-bool TwoBytesTest ( pfHash hash, int maxlen, bool drawDiagram )
+bool TwoBytesTest2 ( pfHash hash, int maxlen, bool drawDiagram )
{
- int keycount = 0;
-
- for(int i = 2; i <= maxlen; i++) keycount += (int)chooseK(i,2);
-
- keycount *= 255*255;
-
- for(int i = 2; i <= maxlen; i++) keycount += i*255;
-
- printf("Keyset 'TwoBytes' - %d keys of up to %d bytes\n",keycount,maxlen);
-
std::vector<hashtype> hashes;
- hashes.resize(keycount);
- int cursor = 0;
-
- uint8_t key[256];
-
- memset(key,0,256);
-
- //----------
- // Add all keys with one non-zero byte
-
- for(int keylen = 2; keylen <= maxlen; keylen++)
- for(int byteA = 0; byteA < keylen; byteA++)
- {
- for(int valA = 1; valA <= 255; valA++)
- {
- key[byteA] = (uint8_t)valA;
-
- assert(cursor <= keycount);
- hash(key,keylen,0,&hashes[cursor++]);
- }
-
- key[byteA] = 0;
- }
-
- //----------
- // Add all keys with two non-zero bytes
-
- for(int keylen = 2; keylen <= maxlen; keylen++)
- for(int byteA = 0; byteA < keylen-1; byteA++)
- for(int byteB = byteA+1; byteB < keylen; byteB++)
- {
- for(int valA = 1; valA <= 255; valA++)
- {
- key[byteA] = (uint8_t)valA;
- for(int valB = 1; valB <= 255; valB++)
- {
- key[byteB] = (uint8_t)valB;
- assert(cursor <= keycount);
- hash(key,keylen,0,&hashes[cursor++]);
- }
+ HashCallback<hashtype> c(hash,hashes);
- key[byteB] = 0;
- }
-
- key[byteA] = 0;
- }
-
- //----------
-
- printf("Actually %d keys\n",cursor);
-
- assert(cursor == keycount);
+ TwoBytesKeygen(maxlen,c);
bool result = true;
- result &= TestHashList(hashes,true,false,drawDiagram);
+ result &= TestHashList(hashes,true,true,drawDiagram);
printf("\n");
return result;
diff --git a/MurmurHash3.cpp b/MurmurHash3.cpp
index 7a6e435..28aa6a0 100644
--- a/MurmurHash3.cpp
+++ b/MurmurHash3.cpp
@@ -20,16 +20,16 @@ FORCE_INLINE uint32_t getblock ( const uint32_t * p, int i )
FORCE_INLINE void bmix32 ( uint32_t & h1, uint32_t & k1,
uint32_t & c1, uint32_t & c2 )
{
- c1 = c1*5+0x7b7d159c;
- c2 = c2*5+0x6bce6396;
-
k1 *= c1;
- k1 = ROTL32(k1,11);
+ k1 = ROTL32(k1,16);
k1 *= c2;
- h1 = ROTL32(h1,13);
- h1 = h1*5+0x52dce729;
h1 ^= k1;
+ h1 = h1*3+0x52dce729;
+ h1 = ROTL32(h1,15);
+
+ c1 = c1*3+0x7b7d159c;
+ c2 = c2*3+0x6bce6396;
}
//----------
diff --git a/SMHasher.vcproj b/SMHasher.vcproj
index bb4125e..dffe6dc 100644
--- a/SMHasher.vcproj
+++ b/SMHasher.vcproj
@@ -435,6 +435,14 @@
>
</File>
<File
+ RelativePath=".\Experiments.cpp"
+ >
+ </File>
+ <File
+ RelativePath=".\Experiments.h"
+ >
+ </File>
+ <File
RelativePath=".\Platform.cpp"
>
</File>
diff --git a/SpeedTest.cpp b/SpeedTest.cpp
index dc6d7cc..a6d131e 100644
--- a/SpeedTest.cpp
+++ b/SpeedTest.cpp
@@ -17,7 +17,7 @@ void BulkSpeedTest ( pfHash hash, uint32_t seed )
printf("Bulk speed test - %d-byte keys\n",blocksize);
- char * block = new char[blocksize + 16];
+ uint8_t * block = new uint8_t[blocksize + 16];
r.rand_p(block,blocksize+16);
diff --git a/Stats.h b/Stats.h
index 5f60c61..5106299 100644
--- a/Stats.h
+++ b/Stats.h
@@ -35,9 +35,13 @@ inline uint32_t f3mix ( uint32_t k )
}
//-----------------------------------------------------------------------------
+// Sort the hash list, count the total number of collisions and return
+// the first N collisions for further processing
template< typename hashtype >
-int CountCollisions ( std::vector<hashtype> & hashes )
+int FindCollisions ( std::vector<hashtype> & hashes,
+ HashSet<hashtype> & collisions,
+ int maxCollisions )
{
int collcount = 0;
@@ -45,7 +49,15 @@ int CountCollisions ( std::vector<hashtype> & hashes )
for(size_t i = 1; i < hashes.size(); i++)
{
- if(hashes[i] == hashes[i-1]) collcount++;
+ if(hashes[i] == hashes[i-1])
+ {
+ collcount++;
+
+ if((int)collisions.size() < maxCollisions)
+ {
+ collisions.insert(hashes[i]);
+ }
+ }
}
return collcount;
@@ -90,11 +102,10 @@ int PrintCollisions ( hashfunc<hashtype> hash, std::vector<keytype> & keys )
//----------------------------------------------------------------------------
template < typename hashtype >
-bool TestHashList ( std::vector<hashtype> & hashes, bool testColl, bool testDist, bool drawDiagram )
+bool TestHashList ( std::vector<hashtype> & hashes, std::vector<hashtype> & collisions, bool testDist, bool drawDiagram )
{
bool result = true;
- if(testColl)
{
size_t count = hashes.size();
@@ -104,10 +115,14 @@ bool TestHashList ( std::vector<hashtype> & hashes, bool testColl, bool testDist
double collcount = 0;
- collcount = CountCollisions(hashes);
+ HashSet<hashtype> collisions;
+
+ collcount = FindCollisions(hashes,collisions,1000);
printf("actual %8.2f (%5.2fx)",collcount, collcount / expected);
+ if(sizeof(hashtype) == sizeof(uint32_t))
+ {
// 2x expected collisions = fail
// #TODO - collision failure cutoff needs to be expressed as a standard deviation instead
@@ -119,6 +134,17 @@ bool TestHashList ( std::vector<hashtype> & hashes, bool testColl, bool testDist
printf(" !!!!! ");
result = false;
}
+ }
+ else
+ {
+ // For all hashes larger than 32 bits, _any_ collisions are a failure.
+
+ if(collcount > 0)
+ {
+ printf(" !!!!! ");
+ result = false;
+ }
+ }
printf("\n");
}
@@ -133,6 +159,16 @@ bool TestHashList ( std::vector<hashtype> & hashes, bool testColl, bool testDist
return result;
}
+//----------
+
+template < typename hashtype >
+bool TestHashList ( std::vector<hashtype> & hashes, bool /*testColl*/, bool testDist, bool drawDiagram )
+{
+ std::vector<hashtype> collisions;
+
+ return TestHashList(hashes,collisions,testDist,drawDiagram);
+}
+
//-----------------------------------------------------------------------------
template < class keytype, typename hashtype >
diff --git a/Types.h b/Types.h
index ddb464b..bf02288 100644
--- a/Types.h
+++ b/Types.h
@@ -3,6 +3,10 @@
#include "Platform.h"
#include "Bitvec.h"
+#include <vector>
+#include <map>
+#include <set>
+
//-----------------------------------------------------------------------------
// If the optimizer detects that a value in a speed test is constant or unused,
// the optimizer may remove references to it or otherwise create code that
@@ -30,6 +34,25 @@ void MixVCode ( const void * blob, int len );
typedef void (*pfHash) ( const void * blob, const int len, const uint32_t seed, void * out );
+struct ByteVec : public std::vector<uint8_t>
+{
+ ByteVec ( const void * key, int len )
+ {
+ resize(len);
+ memcpy(&front(),key,len);
+ }
+};
+
+template< typename hashtype, typename keytype >
+struct CollisionMap : public std::map< hashtype, std::vector<keytype> >
+{
+};
+
+template< typename hashtype >
+struct HashSet : public std::set<hashtype>
+{
+};
+
//-----------------------------------------------------------------------------
template < class T >
@@ -64,6 +87,99 @@ public:
};
//-----------------------------------------------------------------------------
+// Key-processing callback objects. Simplifies keyset testing a bit.
+
+struct KeyCallback
+{
+ KeyCallback() : m_count(0)
+ {
+ }
+
+ virtual void operator() ( const uint8_t * key, int len )
+ {
+ m_count++;
+ }
+
+ virtual void reserve ( int keycount )
+ {
+ };
+
+ int m_count;
+};
+
+//----------
+
+template<typename hashtype>
+struct HashCallback : public KeyCallback
+{
+ typedef std::vector<hashtype> hashvec;
+
+ HashCallback ( pfHash hash, hashvec & hashes ) : m_pfHash(hash), m_hashes(hashes)
+ {
+ m_hashes.clear();
+ }
+
+ virtual void operator () ( const uint8_t * key, int len )
+ {
+ m_hashes.resize(m_hashes.size() + 1);
+
+ m_pfHash(key,len,0,&m_hashes.back());
+ }
+
+ virtual void reserve ( int keycount )
+ {
+ m_hashes.reserve(keycount);
+ }
+
+ hashvec & m_hashes;
+ pfHash m_pfHash;
+
+ //----------
+
+private:
+
+ HashCallback & operator = ( const HashCallback & );
+};
+
+//----------
+
+template<typename hashtype>
+struct CollisionCallback : public KeyCallback
+{
+ typedef HashSet<hashtype> hashset;
+ typedef CollisionMap<hashtype,ByteVec> collmap;
+
+ CollisionCallback ( pfHash hash, hashset & collisions, collmap & cmap )
+ : m_pfHash(hash),
+ m_collisions(collisions),
+ m_collmap(cmap)
+ {
+ }
+
+ virtual void operator () ( const uint8_t * key, int len )
+ {
+ hashtype h;
+
+ m_pfHash(key,len,0,&h);
+
+ if(m_collisions.count(h))
+ {
+ m_collmap[h].push_back( ByteVec(key,len) );
+ }
+ }
+
+ //----------
+
+ pfHash m_pfHash;
+ hashset & m_collisions;
+ collmap & m_collmap;
+
+private:
+
+ CollisionCallback & operator = ( const CollisionCallback & c );
+};
+
+//-----------------------------------------------------------------------------
template < int _bits >
class Blob
diff --git a/main.cpp b/main.cpp
index bc4996c..3d23f21 100644
--- a/main.cpp
+++ b/main.cpp
@@ -66,7 +66,7 @@ HashInfo g_hashes[] =
// MurmurHash3
- { MurmurHash3_x86_32, 32, 0xEA5DFD02, "Murmur3A", "MurmurHash3 for x86, 32-bit" },
+ { MurmurHash3_x86_32, 32, 0xCB75A3F6, "Murmur3A", "MurmurHash3 for x86, 32-bit" },
{ MurmurHash3_x86_128, 128, 0x411C981B, "Murmur3C", "MurmurHash3 for x86, 128-bit" },
{ MurmurHash3_x64_128, 128, 0x04D005BA, "Murmur3F", "MurmurHash3 for x64, 128-bit" },
@@ -220,9 +220,10 @@ void test ( hashfunc<hashtype> hash, HashInfo * info )
}
//-----------------------------------------------------------------------------
- // Bit Independence Criteria
+ // Bit Independence Criteria. Interesting, but doesn't tell us much about
+ // collision or distribution.
- if(g_testBIC /*|| g_testAll*/)
+ if(g_testBIC)
{
printf("[[[ Bit Independence Criteria ]]]\n\n");
@@ -236,7 +237,7 @@ void test ( hashfunc<hashtype> hash, HashInfo * info )
}
//-----------------------------------------------------------------------------
- // Keyset 'Cyclic'
+ // Keyset 'Cyclic' - keys of the form "abcdabcdabcd..."
if(g_testCyclic || g_testAll)
{
@@ -256,23 +257,28 @@ void test ( hashfunc<hashtype> hash, HashInfo * info )
}
//-----------------------------------------------------------------------------
- // Keyset 'TwoBytes'
+ // Keyset 'TwoBytes' - all keys up to N bytes containing two non-zero bytes
- if(g_testTwoBytes)
+ // This generates some huge keysets, 128-bit tests will take ~1.3 gigs of RAM.
+
+ if(g_testTwoBytes || g_testAll)
{
printf("[[[ Keyset 'TwoBytes' Tests ]]]\n\n");
bool result = true;
bool drawDiagram = false;
- result &= TwoBytesTest<hashtype>(hash,24,drawDiagram);
+ for(int i = 4; i <= 20; i += 4)
+ {
+ result &= TwoBytesTest2<hashtype>(hash,i,drawDiagram);
+ }
if(!result) printf("*********FAIL*********\n");
printf("\n");
}
//-----------------------------------------------------------------------------
- // Keyset 'Sparse'
+ // Keyset 'Sparse' - keys with all bits 0 except a few
if(g_testSparse || g_testAll)
{
@@ -295,7 +301,7 @@ void test ( hashfunc<hashtype> hash, HashInfo * info )
}
//-----------------------------------------------------------------------------
- // Keyset 'Permutation'
+ // Keyset 'Permutation' - all possible combinations of a set of blocks
if(g_testPermutation || g_testAll)
{
@@ -543,7 +549,7 @@ int main ( int argc, char ** argv )
hashToTest = argv[1];
}
- SetAffinity(2);
+ SetAffinity(3);
SelfTest();
@@ -561,21 +567,11 @@ int main ( int argc, char ** argv )
//g_testDiffDist = true;
//g_testSparse = true;
//g_testPermutation = true;
+ //g_testWindow = true;
//g_testZeroes = true;
testHash(hashToTest);
- /*
- for(int i = 0; i < sizeof(g_hashes)/sizeof(HashInfo); i++)
- {
- testHash(g_hashes[i].name);
- }
- */
-
- //testHash("murmur3a");
- //testHash("murmur3c");
- //testHash("murmur3f");
-
//----------
int timeEnd = clock();