Add improved timing code that attempts to filter out spurious timing results

git-svn-id: http://smhasher.googlecode.com/svn/trunk@119 77a7d1d3-4c08-bdc2-d393-d5859734b01a
author: aappleby@google.com <aappleby@google.com@77a7d1d3-4c08-bdc2-d393-d5859734b01a> 2011-04-04 22:42:08 +0000
committer: aappleby@google.com <aappleby@google.com@77a7d1d3-4c08-bdc2-d393-d5859734b01a> 2011-04-04 22:42:08 +0000
commit: 6bde2782cde84aea2002eed32ffd12c49d72544d (patch)
tree: 7d7662091895048c246dfc481110c0c0b9695e0a
parent: a6248fc797e4a4491f58c345bad6973fe67ef4b1 (diff)
download: src-6bde2782cde84aea2002eed32ffd12c49d72544d.tar.gz
2 files changed, 197 insertions, 71 deletions
diff --git a/Platform.h b/Platform.h
index f15580e..8bb0d58 100644
--- a/Platform.h
+++ b/Platform.h
@@ -11,6 +11,7 @@ void SetAffinity ( int cpu );
 #if defined(_MSC_VER)
 
 #define FORCE_INLINE	__forceinline
+#define	NEVER_INLINE  __declspec(noinline)
 
 #include <stdlib.h>
 #include <math.h>   // Has to be included before intrin.h or VC complains about 'ceil'
@@ -40,6 +41,7 @@ void SetAffinity ( int cpu );
 #include <stdint.h>
 
 #define	FORCE_INLINE __attribute__((always_inline))
+#define	NEVER_INLINE __attribute__((noinline))
 
 inline uint32_t rotl32 ( uint32_t x, int8_t r )
 {
diff --git a/SpeedTest.cpp b/SpeedTest.cpp
index d0bba8b..9012ae3 100644
--- a/SpeedTest.cpp
+++ b/SpeedTest.cpp
@@ -4,117 +4,241 @@
 
 #include <stdio.h>   // for printf
 #include <memory.h>  // for memset
+#include <math.h>    // for sqrt
+#include <algorithm> // for sort
 
 //-----------------------------------------------------------------------------
-// 256k blocks seem to give the best results.
+// We view our timing values as a series of random variables V that has been
+// contaminated with occasional outliers due to cache misses, thread
+// preemption, etcetera. To filter out the outliers, we search for the largest
+// subset of V such that all its values are within three standard deviations
+// of the mean.
 
-void BulkSpeedTest ( pfHash hash, uint32_t seed )
+double CalcMean ( std::vector<double> & v )
 {
-  Rand r(seed);
+  double mean = 0;
   
-  const int trials = 29999;
-  const int blocksize = 256 * 1024;
-
-  printf("Bulk speed test - %d-byte keys\n",blocksize);
-
-  uint8_t * block = new uint8_t[blocksize + 16];
+  for(int i = 0; i < (int)v.size(); i++)
+  {
+    mean += v[i];
+  }
+  
+  mean /= double(v.size());
+  
+  return mean;
+}
 
-  r.rand_p(block,blocksize+16);
+double CalcMean ( std::vector<double> & v, int a, int b )
+{
+  double mean = 0;
+  
+  for(int i = a; i <= b; i++)
+  {
+    mean += v[i];
+  }
+  
+  mean /= (b-a+1);
+  
+  return mean;
+}
 
-  uint32_t temp[16];
+double CalcStdv ( std::vector<double> & v, int a, int b )
+{
+  double mean = CalcMean(v,a,b);
 
-  for(int align = 0; align < 8; align++)
+  double stdv = 0;
+  
+  for(int i = a; i <= b; i++)
   {
-    double bestbpc = 0;
-
-    for(int itrial = 0; itrial < trials; itrial++)
-    {
-      int64_t begin,end;
+    double x = v[i] - mean;
+    
+    stdv += x*x;
+  }
+  
+  stdv = sqrt(stdv / (b-a+1));
+  
+  return stdv;
+}
 
-      begin = rdtsc();
+// Return true if the largest value in v[0,len) is more than three
+// standard deviations from the mean
 
-      hash(block + align,blocksize,itrial,temp);
+bool ContainsOutlier ( std::vector<double> & v, int len )
+{
+  double mean = 0;
+  
+  for(int i = 0; i < len; i++)
+  {
+    mean += v[i];
+  }
+  
+  mean /= double(len);
+  
+  double stdv = 0;
+  
+  for(int i = 0; i < len; i++)
+  {
+    double x = v[i] - mean;
+    stdv += x*x;
+  }
+  
+  stdv = sqrt(stdv / double(len));
 
-      end = rdtsc();
+  double cutoff = mean + stdv*3;
+  
+  return v[len-1] > cutoff;  
+}
 
-      blackhole(temp[0]);
+// Do a binary search to find the largest subset of v that does not contain
+// outliers.
 
-      double cycles = double(end-begin);
-      if(cycles > 0)
-      {
-        double bpc = double(blocksize) / cycles;
-        if(bpc > bestbpc) bestbpc = bpc;
-      }
+void FilterOutliers ( std::vector<double> & v )
+{
+  std::sort(v.begin(),v.end());
+  
+  int len = 0;
+  
+  for(int x = 0x40000000; x; x = x >> 1 )
+  {
+    if((len | x) >= v.size()) continue;
+    
+    if(!ContainsOutlier(v,len | x))
+    {
+      len |= x;
     }
-
-    double bestbps = (bestbpc * 3000000000.0 / 1048576.0);
-    printf("Alignment %2d - %6.3f bytes/cycle - %7.2f MiB/sec @ 3 ghz\n",align,bestbpc,bestbps);
   }
+  
+  v.resize(len);
+}
 
-  delete [] block;
+// Iteratively tighten the set to find a subset that does not contain
+// outliers. I'm not positive this works correctly in all cases.
+
+void FilterOutliers2 ( std::vector<double> & v )
+{
+  std::sort(v.begin(),v.end());
+  
+  int a = 0;
+  int b = (int)(v.size() - 1);
+  
+  for(int i = 0; i < 10; i++)
+  {
+    //printf("%d %d\n",a,b);
+  
+    double mean = CalcMean(v,a,b);
+    double stdv = CalcStdv(v,a,b);
+    
+    double cutA = mean - stdv*3;  
+    double cutB = mean + stdv*3;
+    
+    while((a < b) && (v[a] < cutA)) a++;
+    while((b > a) && (v[b] > cutB)) b--;
+  }
+  
+  std::vector<double> v2;
+  
+  v2.insert(v2.begin(),v.begin()+a,v.begin()+b+1);
+  
+  v.swap(v2);
 }
 
 //-----------------------------------------------------------------------------
+// We really want the rdtsc() calls to bracket the function call as tightly
+// as possible, but that's hard to do portably. We'll try and get as close as
+// possible by marking the function as NEVER_INLINE (to keep the optimizer from
+// moving it) and marking the timing variables as "volatile register".
 
-void TinySpeedTest ( pfHash hash, int hashsize, int keysize, uint32_t seed, bool verbose, double & outCycles )
+NEVER_INLINE int64_t timehash ( pfHash hash, const void * key, int len, int seed )
 {
-  const int trials = 300000;
-
-  if(verbose) printf("Small key speed test - %4d-byte keys - ",keysize);
+  volatile register int64_t begin,end;
+  
+  uint32_t temp[16];
+  
+  begin = rdtsc();
   
+  hash(key,len,seed,temp);
+  
+  end = rdtsc();
+  
+  return end-begin;
+}
+
+//-----------------------------------------------------------------------------
+
+double SpeedTest ( pfHash hash, uint32_t seed, const int trials, const int blocksize, const int align )
+{
   Rand r(seed);
+  
+  uint8_t * buf = new uint8_t[blocksize + 64];
 
-  uint8_t * h = new uint8_t[hashsize];
-  uint8_t * k = new uint8_t[keysize];
+  uint64_t t1 = reinterpret_cast<uint64_t>(buf);
+  
+  t1 = (t1 + 15) & 0xFFFFFFFFFFFFFFF0;
+  t1 += align;
   
-  memset(h,0,hashsize);
-  memset(k,0,keysize);
+  uint8_t * block = reinterpret_cast<uint8_t*>(t1);
 
-  double bestcycles = 1e9;
+  r.rand_p(block,blocksize);
 
-  for(int itrial = 0; itrial < trials; itrial++)
-  {
-    volatile int64_t begin,end;
+  //----------
 
-    rand_p(k,keysize);
+  std::vector<double> times;
+  times.reserve(trials);
 
-    MixVCode(h,4);
+  for(int itrial = 0; itrial < trials; itrial++)
+  {
+    r.rand_p(block,blocksize);
     
-    begin = rdtsc();
+    double t = timehash(hash,block,blocksize,itrial);
     
-    hash(k,keysize,itrial,h);   hash(k,keysize,itrial,h);   hash(k,keysize,itrial,h);   hash(k,keysize,itrial,h);
-    hash(k,keysize,itrial,h);   hash(k,keysize,itrial,h);   hash(k,keysize,itrial,h);   hash(k,keysize,itrial,h);
-    hash(k,keysize,itrial,h);   hash(k,keysize,itrial,h);   hash(k,keysize,itrial,h);   hash(k,keysize,itrial,h);
-    hash(k,keysize,itrial,h);   hash(k,keysize,itrial,h);   hash(k,keysize,itrial,h);   hash(k,keysize,itrial,h);
-
-    hash(k,keysize,itrial,h);   hash(k,keysize,itrial,h);   hash(k,keysize,itrial,h);   hash(k,keysize,itrial,h);
-    hash(k,keysize,itrial,h);   hash(k,keysize,itrial,h);   hash(k,keysize,itrial,h);   hash(k,keysize,itrial,h);
-    hash(k,keysize,itrial,h);   hash(k,keysize,itrial,h);   hash(k,keysize,itrial,h);   hash(k,keysize,itrial,h);
-    hash(k,keysize,itrial,h);   hash(k,keysize,itrial,h);   hash(k,keysize,itrial,h);   hash(k,keysize,itrial,h);
+    if(t > 0) times.push_back(t);
+  }
 
-    hash(k,keysize,itrial,h);   hash(k,keysize,itrial,h);   hash(k,keysize,itrial,h);   hash(k,keysize,itrial,h);
-    hash(k,keysize,itrial,h);   hash(k,keysize,itrial,h);   hash(k,keysize,itrial,h);   hash(k,keysize,itrial,h);
-    hash(k,keysize,itrial,h);   hash(k,keysize,itrial,h);   hash(k,keysize,itrial,h);   hash(k,keysize,itrial,h);
-    hash(k,keysize,itrial,h);   hash(k,keysize,itrial,h);   hash(k,keysize,itrial,h);   hash(k,keysize,itrial,h);
+  //----------
+  
+  std::sort(times.begin(),times.end());
+  
+  FilterOutliers(times);
+  
+  delete [] buf;
+  
+  return CalcMean(times);
+}
 
-    hash(k,keysize,itrial,h);   hash(k,keysize,itrial,h);   hash(k,keysize,itrial,h);   hash(k,keysize,itrial,h);
-    hash(k,keysize,itrial,h);   hash(k,keysize,itrial,h);   hash(k,keysize,itrial,h);   hash(k,keysize,itrial,h);
-    hash(k,keysize,itrial,h);   hash(k,keysize,itrial,h);   hash(k,keysize,itrial,h);   hash(k,keysize,itrial,h);
-    hash(k,keysize,itrial,h);   hash(k,keysize,itrial,h);   hash(k,keysize,itrial,h);   hash(k,keysize,itrial,h);
+//-----------------------------------------------------------------------------
+// 256k blocks seem to give the best results.
 
-    end = rdtsc();
+void BulkSpeedTest ( pfHash hash, uint32_t seed )
+{
+  Rand r(seed);
+  
+  const int trials = 999;
+  const int blocksize = 256 * 1024;
 
-    MixVCode(h,4);
-    //printf("0x%08x\n",g_verify);
+  printf("Bulk speed test - %d-byte keys\n",blocksize);
 
-    double cycles = double(end-begin) / 64;
-    if((cycles > 0) && (cycles < bestcycles)) bestcycles = cycles;
+  for(int align = 0; align < 8; align++)
+  {
+    double cycles = SpeedTest(hash,seed,trials,blocksize,align);
+    
+    double bestbpc = double(blocksize)/cycles;
+    
+    double bestbps = (bestbpc * 3000000000.0 / 1048576.0);
+    printf("Alignment %2d - %6.3f bytes/cycle - %7.2f MiB/sec @ 3 ghz\n",align,bestbpc,bestbps);
   }
+}
 
-  double bestbpc = double(keysize) / bestcycles;
-  if(verbose) printf("%8.2f cycles/hash, %8.4f bytes/cycle\n",bestcycles,bestbpc);
+//-----------------------------------------------------------------------------
+
+void TinySpeedTest ( pfHash hash, int hashsize, int keysize, uint32_t seed, bool verbose, double & /*outCycles*/ )
+{
+  const int trials = 999999;
 
-  outCycles = bestcycles;
+  if(verbose) printf("Small key speed test - %4d-byte keys - ",keysize);
+  
+  double cycles = SpeedTest(hash,seed,trials,keysize,0);
+  
+  printf("%8.2f cycles/hash\n",cycles);  
 }
 
 //-----------------------------------------------------------------------------
author	aappleby@google.com <aappleby@google.com@77a7d1d3-4c08-bdc2-d393-d5859734b01a>	2011-04-04 22:42:08 +0000
committer	aappleby@google.com <aappleby@google.com@77a7d1d3-4c08-bdc2-d393-d5859734b01a>	2011-04-04 22:42:08 +0000
commit	6bde2782cde84aea2002eed32ffd12c49d72544d (patch)
tree	7d7662091895048c246dfc481110c0c0b9695e0a
parent	a6248fc797e4a4491f58c345bad6973fe67ef4b1 (diff)
download	src-6bde2782cde84aea2002eed32ffd12c49d72544d.tar.gz