summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authoraappleby@google.com <aappleby@google.com@77a7d1d3-4c08-bdc2-d393-d5859734b01a>2011-04-04 22:42:08 +0000
committeraappleby@google.com <aappleby@google.com@77a7d1d3-4c08-bdc2-d393-d5859734b01a>2011-04-04 22:42:08 +0000
commit6bde2782cde84aea2002eed32ffd12c49d72544d (patch)
tree7d7662091895048c246dfc481110c0c0b9695e0a
parenta6248fc797e4a4491f58c345bad6973fe67ef4b1 (diff)
downloadsrc-6bde2782cde84aea2002eed32ffd12c49d72544d.tar.gz
Add improved timing code that attempts to filter out spurious timing results
git-svn-id: http://smhasher.googlecode.com/svn/trunk@119 77a7d1d3-4c08-bdc2-d393-d5859734b01a
-rw-r--r--Platform.h2
-rw-r--r--SpeedTest.cpp266
2 files changed, 197 insertions, 71 deletions
diff --git a/Platform.h b/Platform.h
index f15580e..8bb0d58 100644
--- a/Platform.h
+++ b/Platform.h
@@ -11,6 +11,7 @@ void SetAffinity ( int cpu );
#if defined(_MSC_VER)
#define FORCE_INLINE __forceinline
+#define NEVER_INLINE __declspec(noinline)
#include <stdlib.h>
#include <math.h> // Has to be included before intrin.h or VC complains about 'ceil'
@@ -40,6 +41,7 @@ void SetAffinity ( int cpu );
#include <stdint.h>
#define FORCE_INLINE __attribute__((always_inline))
+#define NEVER_INLINE __attribute__((noinline))
inline uint32_t rotl32 ( uint32_t x, int8_t r )
{
diff --git a/SpeedTest.cpp b/SpeedTest.cpp
index d0bba8b..9012ae3 100644
--- a/SpeedTest.cpp
+++ b/SpeedTest.cpp
@@ -4,117 +4,241 @@
#include <stdio.h> // for printf
#include <memory.h> // for memset
+#include <math.h> // for sqrt
+#include <algorithm> // for sort
//-----------------------------------------------------------------------------
-// 256k blocks seem to give the best results.
+// We view our timing values as a series of random variables V that has been
+// contaminated with occasional outliers due to cache misses, thread
+// preemption, etcetera. To filter out the outliers, we search for the largest
+// subset of V such that all its values are within three standard deviations
+// of the mean.
-void BulkSpeedTest ( pfHash hash, uint32_t seed )
+double CalcMean ( std::vector<double> & v )
{
- Rand r(seed);
+ double mean = 0;
- const int trials = 29999;
- const int blocksize = 256 * 1024;
-
- printf("Bulk speed test - %d-byte keys\n",blocksize);
-
- uint8_t * block = new uint8_t[blocksize + 16];
+ for(int i = 0; i < (int)v.size(); i++)
+ {
+ mean += v[i];
+ }
+
+ mean /= double(v.size());
+
+ return mean;
+}
- r.rand_p(block,blocksize+16);
+double CalcMean ( std::vector<double> & v, int a, int b )
+{
+ double mean = 0;
+
+ for(int i = a; i <= b; i++)
+ {
+ mean += v[i];
+ }
+
+ mean /= (b-a+1);
+
+ return mean;
+}
- uint32_t temp[16];
+double CalcStdv ( std::vector<double> & v, int a, int b )
+{
+ double mean = CalcMean(v,a,b);
- for(int align = 0; align < 8; align++)
+ double stdv = 0;
+
+ for(int i = a; i <= b; i++)
{
- double bestbpc = 0;
-
- for(int itrial = 0; itrial < trials; itrial++)
- {
- int64_t begin,end;
+ double x = v[i] - mean;
+
+ stdv += x*x;
+ }
+
+ stdv = sqrt(stdv / (b-a+1));
+
+ return stdv;
+}
- begin = rdtsc();
+// Return true if the largest value in v[0,len) is more than three
+// standard deviations from the mean
- hash(block + align,blocksize,itrial,temp);
+bool ContainsOutlier ( std::vector<double> & v, int len )
+{
+ double mean = 0;
+
+ for(int i = 0; i < len; i++)
+ {
+ mean += v[i];
+ }
+
+ mean /= double(len);
+
+ double stdv = 0;
+
+ for(int i = 0; i < len; i++)
+ {
+ double x = v[i] - mean;
+ stdv += x*x;
+ }
+
+ stdv = sqrt(stdv / double(len));
- end = rdtsc();
+ double cutoff = mean + stdv*3;
+
+ return v[len-1] > cutoff;
+}
- blackhole(temp[0]);
+// Do a binary search to find the largest subset of v that does not contain
+// outliers.
- double cycles = double(end-begin);
- if(cycles > 0)
- {
- double bpc = double(blocksize) / cycles;
- if(bpc > bestbpc) bestbpc = bpc;
- }
+void FilterOutliers ( std::vector<double> & v )
+{
+ std::sort(v.begin(),v.end());
+
+ int len = 0;
+
+ for(int x = 0x40000000; x; x = x >> 1 )
+ {
+ if((len | x) >= v.size()) continue;
+
+ if(!ContainsOutlier(v,len | x))
+ {
+ len |= x;
}
-
- double bestbps = (bestbpc * 3000000000.0 / 1048576.0);
- printf("Alignment %2d - %6.3f bytes/cycle - %7.2f MiB/sec @ 3 ghz\n",align,bestbpc,bestbps);
}
+
+ v.resize(len);
+}
- delete [] block;
+// Iteratively tighten the set to find a subset that does not contain
+// outliers. I'm not positive this works correctly in all cases.
+
+void FilterOutliers2 ( std::vector<double> & v )
+{
+ std::sort(v.begin(),v.end());
+
+ int a = 0;
+ int b = (int)(v.size() - 1);
+
+ for(int i = 0; i < 10; i++)
+ {
+ //printf("%d %d\n",a,b);
+
+ double mean = CalcMean(v,a,b);
+ double stdv = CalcStdv(v,a,b);
+
+ double cutA = mean - stdv*3;
+ double cutB = mean + stdv*3;
+
+ while((a < b) && (v[a] < cutA)) a++;
+ while((b > a) && (v[b] > cutB)) b--;
+ }
+
+ std::vector<double> v2;
+
+ v2.insert(v2.begin(),v.begin()+a,v.begin()+b+1);
+
+ v.swap(v2);
}
//-----------------------------------------------------------------------------
+// We really want the rdtsc() calls to bracket the function call as tightly
+// as possible, but that's hard to do portably. We'll try and get as close as
+// possible by marking the function as NEVER_INLINE (to keep the optimizer from
+// moving it) and marking the timing variables as "volatile register".
-void TinySpeedTest ( pfHash hash, int hashsize, int keysize, uint32_t seed, bool verbose, double & outCycles )
+NEVER_INLINE int64_t timehash ( pfHash hash, const void * key, int len, int seed )
{
- const int trials = 300000;
-
- if(verbose) printf("Small key speed test - %4d-byte keys - ",keysize);
+ volatile register int64_t begin,end;
+
+ uint32_t temp[16];
+
+ begin = rdtsc();
+ hash(key,len,seed,temp);
+
+ end = rdtsc();
+
+ return end-begin;
+}
+
+//-----------------------------------------------------------------------------
+
+double SpeedTest ( pfHash hash, uint32_t seed, const int trials, const int blocksize, const int align )
+{
Rand r(seed);
+
+ uint8_t * buf = new uint8_t[blocksize + 64];
- uint8_t * h = new uint8_t[hashsize];
- uint8_t * k = new uint8_t[keysize];
+ uint64_t t1 = reinterpret_cast<uint64_t>(buf);
+
+ t1 = (t1 + 15) & 0xFFFFFFFFFFFFFFF0;
+ t1 += align;
- memset(h,0,hashsize);
- memset(k,0,keysize);
+ uint8_t * block = reinterpret_cast<uint8_t*>(t1);
- double bestcycles = 1e9;
+ r.rand_p(block,blocksize);
- for(int itrial = 0; itrial < trials; itrial++)
- {
- volatile int64_t begin,end;
+ //----------
- rand_p(k,keysize);
+ std::vector<double> times;
+ times.reserve(trials);
- MixVCode(h,4);
+ for(int itrial = 0; itrial < trials; itrial++)
+ {
+ r.rand_p(block,blocksize);
- begin = rdtsc();
+ double t = timehash(hash,block,blocksize,itrial);
- hash(k,keysize,itrial,h); hash(k,keysize,itrial,h); hash(k,keysize,itrial,h); hash(k,keysize,itrial,h);
- hash(k,keysize,itrial,h); hash(k,keysize,itrial,h); hash(k,keysize,itrial,h); hash(k,keysize,itrial,h);
- hash(k,keysize,itrial,h); hash(k,keysize,itrial,h); hash(k,keysize,itrial,h); hash(k,keysize,itrial,h);
- hash(k,keysize,itrial,h); hash(k,keysize,itrial,h); hash(k,keysize,itrial,h); hash(k,keysize,itrial,h);
-
- hash(k,keysize,itrial,h); hash(k,keysize,itrial,h); hash(k,keysize,itrial,h); hash(k,keysize,itrial,h);
- hash(k,keysize,itrial,h); hash(k,keysize,itrial,h); hash(k,keysize,itrial,h); hash(k,keysize,itrial,h);
- hash(k,keysize,itrial,h); hash(k,keysize,itrial,h); hash(k,keysize,itrial,h); hash(k,keysize,itrial,h);
- hash(k,keysize,itrial,h); hash(k,keysize,itrial,h); hash(k,keysize,itrial,h); hash(k,keysize,itrial,h);
+ if(t > 0) times.push_back(t);
+ }
- hash(k,keysize,itrial,h); hash(k,keysize,itrial,h); hash(k,keysize,itrial,h); hash(k,keysize,itrial,h);
- hash(k,keysize,itrial,h); hash(k,keysize,itrial,h); hash(k,keysize,itrial,h); hash(k,keysize,itrial,h);
- hash(k,keysize,itrial,h); hash(k,keysize,itrial,h); hash(k,keysize,itrial,h); hash(k,keysize,itrial,h);
- hash(k,keysize,itrial,h); hash(k,keysize,itrial,h); hash(k,keysize,itrial,h); hash(k,keysize,itrial,h);
+ //----------
+
+ std::sort(times.begin(),times.end());
+
+ FilterOutliers(times);
+
+ delete [] buf;
+
+ return CalcMean(times);
+}
- hash(k,keysize,itrial,h); hash(k,keysize,itrial,h); hash(k,keysize,itrial,h); hash(k,keysize,itrial,h);
- hash(k,keysize,itrial,h); hash(k,keysize,itrial,h); hash(k,keysize,itrial,h); hash(k,keysize,itrial,h);
- hash(k,keysize,itrial,h); hash(k,keysize,itrial,h); hash(k,keysize,itrial,h); hash(k,keysize,itrial,h);
- hash(k,keysize,itrial,h); hash(k,keysize,itrial,h); hash(k,keysize,itrial,h); hash(k,keysize,itrial,h);
+//-----------------------------------------------------------------------------
+// 256k blocks seem to give the best results.
- end = rdtsc();
+void BulkSpeedTest ( pfHash hash, uint32_t seed )
+{
+ Rand r(seed);
+
+ const int trials = 999;
+ const int blocksize = 256 * 1024;
- MixVCode(h,4);
- //printf("0x%08x\n",g_verify);
+ printf("Bulk speed test - %d-byte keys\n",blocksize);
- double cycles = double(end-begin) / 64;
- if((cycles > 0) && (cycles < bestcycles)) bestcycles = cycles;
+ for(int align = 0; align < 8; align++)
+ {
+ double cycles = SpeedTest(hash,seed,trials,blocksize,align);
+
+ double bestbpc = double(blocksize)/cycles;
+
+ double bestbps = (bestbpc * 3000000000.0 / 1048576.0);
+ printf("Alignment %2d - %6.3f bytes/cycle - %7.2f MiB/sec @ 3 ghz\n",align,bestbpc,bestbps);
}
+}
- double bestbpc = double(keysize) / bestcycles;
- if(verbose) printf("%8.2f cycles/hash, %8.4f bytes/cycle\n",bestcycles,bestbpc);
+//-----------------------------------------------------------------------------
+
+void TinySpeedTest ( pfHash hash, int hashsize, int keysize, uint32_t seed, bool verbose, double & /*outCycles*/ )
+{
+ const int trials = 999999;
- outCycles = bestcycles;
+ if(verbose) printf("Small key speed test - %4d-byte keys - ",keysize);
+
+ double cycles = SpeedTest(hash,seed,trials,keysize,0);
+
+ printf("%8.2f cycles/hash\n",cycles);
}
//-----------------------------------------------------------------------------