diff options
author | tanjent@gmail.com <tanjent@gmail.com@77a7d1d3-4c08-bdc2-d393-d5859734b01a> | 2011-04-01 08:50:06 +0000 |
---|---|---|
committer | tanjent@gmail.com <tanjent@gmail.com@77a7d1d3-4c08-bdc2-d393-d5859734b01a> | 2011-04-01 08:50:06 +0000 |
commit | 603c8781c055989d0123fcb61f60871bd26e095c (patch) | |
tree | 03fa917ea12582546035f0cc7fbd99447b8a843f | |
parent | 96601f2cd5d12b4618ecd830e8a155cad18e15dc (diff) | |
download | src-603c8781c055989d0123fcb61f60871bd26e095c.tar.gz |
Tweak to Murmur3a (yeah, even though I said it was finalized...)
Added key-processing callback experiments, may move all keyset tests to use it
git-svn-id: http://smhasher.googlecode.com/svn/trunk@102 77a7d1d3-4c08-bdc2-d393-d5859734b01a
-rw-r--r-- | Bitvec.cpp | 46 | ||||
-rw-r--r-- | Bitvec.h | 16 | ||||
-rw-r--r-- | KeysetTest.cpp | 68 | ||||
-rw-r--r-- | KeysetTest.h | 75 | ||||
-rw-r--r-- | MurmurHash3.cpp | 12 | ||||
-rw-r--r-- | SMHasher.vcproj | 8 | ||||
-rw-r--r-- | SpeedTest.cpp | 2 | ||||
-rw-r--r-- | Stats.h | 46 | ||||
-rw-r--r-- | Types.h | 116 | ||||
-rw-r--r-- | main.cpp | 38 |
10 files changed, 316 insertions, 111 deletions
@@ -14,9 +14,9 @@ void assert ( bool ) //----------------------------------------------------------------------------
-void printbits ( void * blob, int len )
+void printbits ( const void * blob, int len )
{
- uint8_t * data = (uint8_t*)blob;
+ const uint8_t * data = (const uint8_t *)blob;
printf("[");
for(int i = 0; i < len; i++)
@@ -37,7 +37,7 @@ void printbits ( void * blob, int len ) printf("]");
}
-void printbits2 ( uint8_t * k, int nbytes )
+void printbits2 ( const uint8_t * k, int nbytes )
{
printf("[");
@@ -55,7 +55,7 @@ void printbits2 ( uint8_t * k, int nbytes ) printf("]");
}
-void printhex32 ( void * blob, int len )
+void printhex32 ( const void * blob, int len )
{
assert((len & 3) == 0);
@@ -71,7 +71,7 @@ void printhex32 ( void * blob, int len ) printf("}");
}
-void printbytes ( void * blob, int len )
+void printbytes ( const void * blob, int len )
{
uint8_t * d = (uint8_t*)blob;
@@ -85,9 +85,41 @@ void printbytes ( void * blob, int len ) printf(" };");
}
+void printbytes2 ( const void * blob, int len )
+{
+ uint8_t * d = (uint8_t*)blob;
+
+ for(int i = 0; i < len; i++)
+ {
+ printf("%02x ",d[i]);
+ }
+}
+
+//-----------------------------------------------------------------------------
+// Bit-level manipulation
+
+// These two are from the "Bit Twiddling Hacks" webpage
+
+uint32_t popcount ( uint32_t v )
+{
+ v = v - ((v >> 1) & 0x55555555); // reuse input as temporary
+ v = (v & 0x33333333) + ((v >> 2) & 0x33333333); // temp
+ uint32_t c = ((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24; // count
+
+ return c;
+}
+
+uint32_t parity ( uint32_t v )
+{
+ v ^= v >> 1;
+ v ^= v >> 2;
+ v = (v & 0x11111111U) * 0x11111111U;
+ return (v >> 28) & 1;
+}
+
//-----------------------------------------------------------------------------
-uint32_t getbit ( void * block, int len, uint32_t bit )
+uint32_t getbit ( const void * block, int len, uint32_t bit )
{
uint8_t * b = (uint8_t*)block;
@@ -99,7 +131,7 @@ uint32_t getbit ( void * block, int len, uint32_t bit ) return 0;
}
-uint32_t getbit_wrap ( void * block, int len, uint32_t bit )
+uint32_t getbit_wrap ( const void * block, int len, uint32_t bit )
{
uint8_t * b = (uint8_t*)block;
@@ -6,12 +6,16 @@ //-----------------------------------------------------------------------------
-void printbits ( void * blob, int len );
-void printhex32 ( void * blob, int len );
-void printbytes ( void * blob, int len );
+void printbits ( const void * blob, int len );
+void printhex32 ( const void * blob, int len );
+void printbytes ( const void * blob, int len );
+void printbytes2 ( const void * blob, int len );
-uint32_t getbit ( void * blob, int len, uint32_t bit );
-uint32_t getbit_wrap ( void * blob, int len, uint32_t bit );
+uint32_t popcount ( uint32_t v );
+uint32_t parity ( uint32_t v );
+
+uint32_t getbit ( const void * blob, int len, uint32_t bit );
+uint32_t getbit_wrap ( const void * blob, int len, uint32_t bit );
void setbit ( void * blob, int len, uint32_t bit );
void setbit ( void * blob, int len, uint32_t bit, uint32_t val );
@@ -23,7 +27,7 @@ void flipbit ( void * blob, int len, uint32_t bit ); int countbits ( uint32_t v );
int countbits ( std::vector<uint32_t> & v );
-int countbits ( void * blob, int len );
+int countbits ( const void * blob, int len );
void invert ( std::vector<uint32_t> & v );
diff --git a/KeysetTest.cpp b/KeysetTest.cpp index 6519f8b..f11d512 100644 --- a/KeysetTest.cpp +++ b/KeysetTest.cpp @@ -1,7 +1,11 @@ #include "KeysetTest.h"
+#include "Platform.h"
#include "Random.h"
+#include <map>
+#include <set>
+
//-----------------------------------------------------------------------------
// This should hopefully be a thorough and uambiguous test of whether a hash
// is correctly implemented on a given platform
@@ -194,3 +198,67 @@ void AppendedZeroesTest ( pfHash hash, const int hashbits ) }
//-----------------------------------------------------------------------------
+// Generate all keys of up to N bytes containing two non-zero bytes
+
+void TwoBytesKeygen ( int maxlen, KeyCallback & c )
+{
+ //----------
+ // Compute # of keys
+
+ int keycount = 0;
+
+ for(int i = 2; i <= maxlen; i++) keycount += (int)chooseK(i,2);
+
+ keycount *= 255*255;
+
+ for(int i = 2; i <= maxlen; i++) keycount += i*255;
+
+ printf("Keyset 'TwoBytes' - up-to-%d-byte keys, %d total keys\n",maxlen, keycount);
+
+ c.reserve(keycount);
+
+ //----------
+ // Add all keys with one non-zero byte
+
+ uint8_t key[256];
+
+ memset(key,0,256);
+
+ for(int keylen = 2; keylen <= maxlen; keylen++)
+ for(int byteA = 0; byteA < keylen; byteA++)
+ {
+ for(int valA = 1; valA <= 255; valA++)
+ {
+ key[byteA] = (uint8_t)valA;
+
+ c(key,keylen);
+ }
+
+ key[byteA] = 0;
+ }
+
+ //----------
+ // Add all keys with two non-zero bytes
+
+ for(int keylen = 2; keylen <= maxlen; keylen++)
+ for(int byteA = 0; byteA < keylen-1; byteA++)
+ for(int byteB = byteA+1; byteB < keylen; byteB++)
+ {
+ for(int valA = 1; valA <= 255; valA++)
+ {
+ key[byteA] = (uint8_t)valA;
+
+ for(int valB = 1; valB <= 255; valB++)
+ {
+ key[byteB] = (uint8_t)valB;
+ c(key,keylen);
+ }
+
+ key[byteB] = 0;
+ }
+
+ key[byteA] = 0;
+ }
+}
+
+//-----------------------------------------------------------------------------
diff --git a/KeysetTest.h b/KeysetTest.h index 9e41b6f..55d5d5f 100644 --- a/KeysetTest.h +++ b/KeysetTest.h @@ -210,7 +210,7 @@ bool WindowedKeyTest ( hashfunc<hashtype> hash, const int windowbits, bool testC bool result = true;
- int testcount = (keybits-windowbits);
+ int testcount = keybits;
printf("Keyset 'Windowed' - %3d-bit key, %3d-bit window - %d tests, %d keys per test\n",keybits,windowbits,testcount,keycount);
@@ -223,7 +223,9 @@ bool WindowedKeyTest ( hashfunc<hashtype> hash, const int windowbits, bool testC for(int i = 0; i < keycount; i++)
{
key = i;
- key = key << minbit;
+ //key = key << minbit;
+
+ lrot(&key,sizeof(keytype),minbit);
hash(&key,sizeof(keytype),0,&hashes[i]);
}
@@ -291,77 +293,20 @@ bool CyclicKeyTest ( pfHash hash, int cycleLen, int cycleReps, const int keycoun //-----------------------------------------------------------------------------
// Keyset 'TwoBytes' - generate all keys up to length N with two non-zero bytes
+void TwoBytesKeygen ( int maxlen, KeyCallback & c );
+
template < typename hashtype >
-bool TwoBytesTest ( pfHash hash, int maxlen, bool drawDiagram )
+bool TwoBytesTest2 ( pfHash hash, int maxlen, bool drawDiagram )
{
- int keycount = 0;
-
- for(int i = 2; i <= maxlen; i++) keycount += (int)chooseK(i,2);
-
- keycount *= 255*255;
-
- for(int i = 2; i <= maxlen; i++) keycount += i*255;
-
- printf("Keyset 'TwoBytes' - %d keys of up to %d bytes\n",keycount,maxlen);
-
std::vector<hashtype> hashes;
- hashes.resize(keycount);
- int cursor = 0;
-
- uint8_t key[256];
-
- memset(key,0,256);
-
- //----------
- // Add all keys with one non-zero byte
-
- for(int keylen = 2; keylen <= maxlen; keylen++)
- for(int byteA = 0; byteA < keylen; byteA++)
- {
- for(int valA = 1; valA <= 255; valA++)
- {
- key[byteA] = (uint8_t)valA;
-
- assert(cursor <= keycount);
- hash(key,keylen,0,&hashes[cursor++]);
- }
-
- key[byteA] = 0;
- }
-
- //----------
- // Add all keys with two non-zero bytes
-
- for(int keylen = 2; keylen <= maxlen; keylen++)
- for(int byteA = 0; byteA < keylen-1; byteA++)
- for(int byteB = byteA+1; byteB < keylen; byteB++)
- {
- for(int valA = 1; valA <= 255; valA++)
- {
- key[byteA] = (uint8_t)valA;
- for(int valB = 1; valB <= 255; valB++)
- {
- key[byteB] = (uint8_t)valB;
- assert(cursor <= keycount);
- hash(key,keylen,0,&hashes[cursor++]);
- }
+ HashCallback<hashtype> c(hash,hashes);
- key[byteB] = 0;
- }
-
- key[byteA] = 0;
- }
-
- //----------
-
- printf("Actually %d keys\n",cursor);
-
- assert(cursor == keycount);
+ TwoBytesKeygen(maxlen,c);
bool result = true;
- result &= TestHashList(hashes,true,false,drawDiagram);
+ result &= TestHashList(hashes,true,true,drawDiagram);
printf("\n");
return result;
diff --git a/MurmurHash3.cpp b/MurmurHash3.cpp index 7a6e435..28aa6a0 100644 --- a/MurmurHash3.cpp +++ b/MurmurHash3.cpp @@ -20,16 +20,16 @@ FORCE_INLINE uint32_t getblock ( const uint32_t * p, int i ) FORCE_INLINE void bmix32 ( uint32_t & h1, uint32_t & k1,
uint32_t & c1, uint32_t & c2 )
{
- c1 = c1*5+0x7b7d159c;
- c2 = c2*5+0x6bce6396;
-
k1 *= c1;
- k1 = ROTL32(k1,11);
+ k1 = ROTL32(k1,16);
k1 *= c2;
- h1 = ROTL32(h1,13);
- h1 = h1*5+0x52dce729;
h1 ^= k1;
+ h1 = h1*3+0x52dce729;
+ h1 = ROTL32(h1,15);
+
+ c1 = c1*3+0x7b7d159c;
+ c2 = c2*3+0x6bce6396;
}
//----------
diff --git a/SMHasher.vcproj b/SMHasher.vcproj index bb4125e..dffe6dc 100644 --- a/SMHasher.vcproj +++ b/SMHasher.vcproj @@ -435,6 +435,14 @@ >
</File>
<File
+ RelativePath=".\Experiments.cpp"
+ >
+ </File>
+ <File
+ RelativePath=".\Experiments.h"
+ >
+ </File>
+ <File
RelativePath=".\Platform.cpp"
>
</File>
diff --git a/SpeedTest.cpp b/SpeedTest.cpp index dc6d7cc..a6d131e 100644 --- a/SpeedTest.cpp +++ b/SpeedTest.cpp @@ -17,7 +17,7 @@ void BulkSpeedTest ( pfHash hash, uint32_t seed ) printf("Bulk speed test - %d-byte keys\n",blocksize);
- char * block = new char[blocksize + 16];
+ uint8_t * block = new uint8_t[blocksize + 16];
r.rand_p(block,blocksize+16);
@@ -35,9 +35,13 @@ inline uint32_t f3mix ( uint32_t k ) }
//-----------------------------------------------------------------------------
+// Sort the hash list, count the total number of collisions and return
+// the first N collisions for further processing
template< typename hashtype >
-int CountCollisions ( std::vector<hashtype> & hashes )
+int FindCollisions ( std::vector<hashtype> & hashes,
+ HashSet<hashtype> & collisions,
+ int maxCollisions )
{
int collcount = 0;
@@ -45,7 +49,15 @@ int CountCollisions ( std::vector<hashtype> & hashes ) for(size_t i = 1; i < hashes.size(); i++)
{
- if(hashes[i] == hashes[i-1]) collcount++;
+ if(hashes[i] == hashes[i-1])
+ {
+ collcount++;
+
+ if((int)collisions.size() < maxCollisions)
+ {
+ collisions.insert(hashes[i]);
+ }
+ }
}
return collcount;
@@ -90,11 +102,10 @@ int PrintCollisions ( hashfunc<hashtype> hash, std::vector<keytype> & keys ) //----------------------------------------------------------------------------
template < typename hashtype >
-bool TestHashList ( std::vector<hashtype> & hashes, bool testColl, bool testDist, bool drawDiagram )
+bool TestHashList ( std::vector<hashtype> & hashes, std::vector<hashtype> & collisions, bool testDist, bool drawDiagram )
{
bool result = true;
- if(testColl)
{
size_t count = hashes.size();
@@ -104,10 +115,14 @@ bool TestHashList ( std::vector<hashtype> & hashes, bool testColl, bool testDist double collcount = 0;
- collcount = CountCollisions(hashes);
+ HashSet<hashtype> collisions;
+
+ collcount = FindCollisions(hashes,collisions,1000);
printf("actual %8.2f (%5.2fx)",collcount, collcount / expected);
+ if(sizeof(hashtype) == sizeof(uint32_t))
+ {
// 2x expected collisions = fail
// #TODO - collision failure cutoff needs to be expressed as a standard deviation instead
@@ -119,6 +134,17 @@ bool TestHashList ( std::vector<hashtype> & hashes, bool testColl, bool testDist printf(" !!!!! ");
result = false;
}
+ }
+ else
+ {
+ // For all hashes larger than 32 bits, _any_ collisions are a failure.
+
+ if(collcount > 0)
+ {
+ printf(" !!!!! ");
+ result = false;
+ }
+ }
printf("\n");
}
@@ -133,6 +159,16 @@ bool TestHashList ( std::vector<hashtype> & hashes, bool testColl, bool testDist return result;
}
+//----------
+
+template < typename hashtype >
+bool TestHashList ( std::vector<hashtype> & hashes, bool /*testColl*/, bool testDist, bool drawDiagram )
+{
+ std::vector<hashtype> collisions;
+
+ return TestHashList(hashes,collisions,testDist,drawDiagram);
+}
+
//-----------------------------------------------------------------------------
template < class keytype, typename hashtype >
@@ -3,6 +3,10 @@ #include "Platform.h"
#include "Bitvec.h"
+#include <vector>
+#include <map>
+#include <set>
+
//-----------------------------------------------------------------------------
// If the optimizer detects that a value in a speed test is constant or unused,
// the optimizer may remove references to it or otherwise create code that
@@ -30,6 +34,25 @@ void MixVCode ( const void * blob, int len ); typedef void (*pfHash) ( const void * blob, const int len, const uint32_t seed, void * out );
+struct ByteVec : public std::vector<uint8_t>
+{
+ ByteVec ( const void * key, int len )
+ {
+ resize(len);
+ memcpy(&front(),key,len);
+ }
+};
+
+template< typename hashtype, typename keytype >
+struct CollisionMap : public std::map< hashtype, std::vector<keytype> >
+{
+};
+
+template< typename hashtype >
+struct HashSet : public std::set<hashtype>
+{
+};
+
//-----------------------------------------------------------------------------
template < class T >
@@ -64,6 +87,99 @@ public: };
//-----------------------------------------------------------------------------
+// Key-processing callback objects. Simplifies keyset testing a bit.
+
+struct KeyCallback
+{
+ KeyCallback() : m_count(0)
+ {
+ }
+
+ virtual void operator() ( const uint8_t * key, int len )
+ {
+ m_count++;
+ }
+
+ virtual void reserve ( int keycount )
+ {
+ };
+
+ int m_count;
+};
+
+//----------
+
+template<typename hashtype>
+struct HashCallback : public KeyCallback
+{
+ typedef std::vector<hashtype> hashvec;
+
+ HashCallback ( pfHash hash, hashvec & hashes ) : m_pfHash(hash), m_hashes(hashes)
+ {
+ m_hashes.clear();
+ }
+
+ virtual void operator () ( const uint8_t * key, int len )
+ {
+ m_hashes.resize(m_hashes.size() + 1);
+
+ m_pfHash(key,len,0,&m_hashes.back());
+ }
+
+ virtual void reserve ( int keycount )
+ {
+ m_hashes.reserve(keycount);
+ }
+
+ hashvec & m_hashes;
+ pfHash m_pfHash;
+
+ //----------
+
+private:
+
+ HashCallback & operator = ( const HashCallback & );
+};
+
+//----------
+
+template<typename hashtype>
+struct CollisionCallback : public KeyCallback
+{
+ typedef HashSet<hashtype> hashset;
+ typedef CollisionMap<hashtype,ByteVec> collmap;
+
+ CollisionCallback ( pfHash hash, hashset & collisions, collmap & cmap )
+ : m_pfHash(hash),
+ m_collisions(collisions),
+ m_collmap(cmap)
+ {
+ }
+
+ virtual void operator () ( const uint8_t * key, int len )
+ {
+ hashtype h;
+
+ m_pfHash(key,len,0,&h);
+
+ if(m_collisions.count(h))
+ {
+ m_collmap[h].push_back( ByteVec(key,len) );
+ }
+ }
+
+ //----------
+
+ pfHash m_pfHash;
+ hashset & m_collisions;
+ collmap & m_collmap;
+
+private:
+
+ CollisionCallback & operator = ( const CollisionCallback & c );
+};
+
+//-----------------------------------------------------------------------------
template < int _bits >
class Blob
@@ -66,7 +66,7 @@ HashInfo g_hashes[] = // MurmurHash3
- { MurmurHash3_x86_32, 32, 0xEA5DFD02, "Murmur3A", "MurmurHash3 for x86, 32-bit" },
+ { MurmurHash3_x86_32, 32, 0xCB75A3F6, "Murmur3A", "MurmurHash3 for x86, 32-bit" },
{ MurmurHash3_x86_128, 128, 0x411C981B, "Murmur3C", "MurmurHash3 for x86, 128-bit" },
{ MurmurHash3_x64_128, 128, 0x04D005BA, "Murmur3F", "MurmurHash3 for x64, 128-bit" },
@@ -220,9 +220,10 @@ void test ( hashfunc<hashtype> hash, HashInfo * info ) }
//-----------------------------------------------------------------------------
- // Bit Independence Criteria
+ // Bit Independence Criteria. Interesting, but doesn't tell us much about
+ // collision or distribution.
- if(g_testBIC /*|| g_testAll*/)
+ if(g_testBIC)
{
printf("[[[ Bit Independence Criteria ]]]\n\n");
@@ -236,7 +237,7 @@ void test ( hashfunc<hashtype> hash, HashInfo * info ) }
//-----------------------------------------------------------------------------
- // Keyset 'Cyclic'
+ // Keyset 'Cyclic' - keys of the form "abcdabcdabcd..."
if(g_testCyclic || g_testAll)
{
@@ -256,23 +257,28 @@ void test ( hashfunc<hashtype> hash, HashInfo * info ) }
//-----------------------------------------------------------------------------
- // Keyset 'TwoBytes'
+ // Keyset 'TwoBytes' - all keys up to N bytes containing two non-zero bytes
- if(g_testTwoBytes)
+ // This generates some huge keysets, 128-bit tests will take ~1.3 gigs of RAM.
+
+ if(g_testTwoBytes || g_testAll)
{
printf("[[[ Keyset 'TwoBytes' Tests ]]]\n\n");
bool result = true;
bool drawDiagram = false;
- result &= TwoBytesTest<hashtype>(hash,24,drawDiagram);
+ for(int i = 4; i <= 20; i += 4)
+ {
+ result &= TwoBytesTest2<hashtype>(hash,i,drawDiagram);
+ }
if(!result) printf("*********FAIL*********\n");
printf("\n");
}
//-----------------------------------------------------------------------------
- // Keyset 'Sparse'
+ // Keyset 'Sparse' - keys with all bits 0 except a few
if(g_testSparse || g_testAll)
{
@@ -295,7 +301,7 @@ void test ( hashfunc<hashtype> hash, HashInfo * info ) }
//-----------------------------------------------------------------------------
- // Keyset 'Permutation'
+ // Keyset 'Permutation' - all possible combinations of a set of blocks
if(g_testPermutation || g_testAll)
{
@@ -543,7 +549,7 @@ int main ( int argc, char ** argv ) hashToTest = argv[1];
}
- SetAffinity(2);
+ SetAffinity(3);
SelfTest();
@@ -561,21 +567,11 @@ int main ( int argc, char ** argv ) //g_testDiffDist = true;
//g_testSparse = true;
//g_testPermutation = true;
+ //g_testWindow = true;
//g_testZeroes = true;
testHash(hashToTest);
- /*
- for(int i = 0; i < sizeof(g_hashes)/sizeof(HashInfo); i++)
- {
- testHash(g_hashes[i].name);
- }
- */
-
- //testHash("murmur3a");
- //testHash("murmur3c");
- //testHash("murmur3f");
-
//----------
int timeEnd = clock();
|