Update stressapptest to 1.0.3.

* /dev/shm support to allow >1.4G memory usage for 32-bit apps.
* Some ARM support.
* x86 SSE support.
author: nick.j.sanders <nick.j.sanders@93e54ea4-8218-11de-8aaf-8d8425684b44> 2010-05-14 03:47:11 +0000
committer: nick.j.sanders <nick.j.sanders@93e54ea4-8218-11de-8aaf-8d8425684b44> 2010-05-14 03:47:11 +0000
commit: 6d1e64db329883e43dbca06471c093fc23dc9a2e (patch)
tree: f5c51b3fc301cfdee87aa54dd5a92bb16854b599 /src
parent: eea0aac32a3f522ea51d389f44dcd8abcfc5a6e0 (diff)
download: stressapptest-6d1e64db329883e43dbca06471c093fc23dc9a2e.tar.gz
14 files changed, 351 insertions, 148 deletions
diff --git a/src/adler32memcpy.cc b/src/adler32memcpy.cc
index 529dcc4..69324f7 100644
--- a/src/adler32memcpy.cc
+++ b/src/adler32memcpy.cc
@@ -225,19 +225,41 @@ bool AdlerMemcpyWarmC(uint64 *dstmem64, uint64 *srcmem64,
 // x86_64 SSE2 assembly implementation of fast and stressful Adler memory copy.
 bool AdlerMemcpyAsm(uint64 *dstmem64, uint64 *srcmem64,
                     unsigned int size_in_bytes, AdlerChecksum *checksum) {
-// Use assembly implementation only with 64bit compilation.
-#ifndef STRESSAPPTEST_CPU_X86_64
-  // Fall back to C implementation for 32bit compilation.
-  return AdlerMemcpyWarmC(dstmem64, srcmem64, size_in_bytes, checksum);
-#else
+// Use assembly implementation where supported.
+#if defined(STRESSAPPTEST_CPU_X86_64) || defined(STRESSAPPTEST_CPU_I686)
+
+// Pull a bit of tricky preprocessing to make the inline asm both
+// 32 bit and 64 bit.
+#ifdef STRESSAPPTEST_CPU_I686  // Instead of coding both, x86...
+#define rAX "%%eax"
+#define rCX "%%ecx"
+#define rDX "%%edx"
+#define rBX "%%ebx"
+#define rSP "%%esp"
+#define rBP "%%ebp"
+#define rSI "%%esi"
+#define rDI "%%edi"
+#endif
+
+#ifdef STRESSAPPTEST_CPU_X86_64  // ...and x64, we use rXX macros.
+#define rAX "%%rax"
+#define rCX "%%rcx"
+#define rDX "%%rdx"
+#define rBX "%%rbx"
+#define rSP "%%rsp"
+#define rBP "%%rbp"
+#define rSI "%%rsi"
+#define rDI "%%rdi"
+#endif
+
   // Elements 0 to 3 are used for holding checksum terms a1, a2,
   // b1, b2 respectively. These elements are filled by asm code.
   // Elements 4 and 5 are used by asm code to for ANDing MMX data and removing
   // 2 words from each MMX register (A MMX reg has 4 words, by ANDing we are
   // setting word index 0 and word index 2 to zero).
   // Element 6 and 7 are used for setting a1 and a2 to 1.
-  volatile uint64 checksum_arr[] = {0, 0, 0, 0,
-    0x00000000ffffffffUL, 0x00000000ffffffffUL, 1, 1};
+  volatile uint64 checksum_arr[] __attribute__ ((aligned(16))) =
+      {0, 0, 0, 0, 0x00000000ffffffffUL, 0x00000000ffffffffUL, 1, 1};
 
   if ((size_in_bytes >> 19) > 0) {
     // Size is too large. Must be less than 2^19 bytes = 512 KB.
@@ -245,23 +267,24 @@ bool AdlerMemcpyAsm(uint64 *dstmem64, uint64 *srcmem64,
   }
 
   // Number of 32-bit words which are not added to a1/a2 in the main loop.
-  uint64 remaining_words = (size_in_bytes % 48) / 4;
+  uint32 remaining_words = (size_in_bytes % 48) / 4;
 
   // Since we are moving 48 bytes at a time number of iterations = total size/48
   // is value of counter.
-  uint64 num_of_48_byte_units = size_in_bytes / 48;
+  uint32 num_of_48_byte_units = size_in_bytes / 48;
 
-  asm volatile(
+  asm volatile (
       // Source address is in ESI (extended source index)
       // destination is in EDI (extended destination index)
-      // and counter is already in ECX (extended counter index).
-      "cmp  $0, %%ecx;"   // Compare counter to zero.
+      // and counter is already in ECX (extended counter
+      // index).
+      "cmp  $0, " rCX ";"   // Compare counter to zero.
       "jz END;"
 
       // XMM6 is initialized with 1 and XMM7 with 0.
-      "prefetchnta  0(%%rsi);"
-      "prefetchnta 64(%%rsi);"
-      "movdqu   48(%%rax), %%xmm6;"
+      "prefetchnta  0(" rSI ");"
+      "prefetchnta 64(" rSI ");"
+      "movdqu   48(" rAX "), %%xmm6;"
       "xorps      %%xmm7, %%xmm7;"
 
       // Start of the loop which copies 48 bytes from source to dst each time.
@@ -269,28 +292,28 @@ bool AdlerMemcpyAsm(uint64 *dstmem64, uint64 *srcmem64,
 
       // Make 6 moves each of 16 bytes from srcmem to XMM registers.
       // We are using 2 words out of 4 words in each XMM register,
-      // word index 0 and word index 2)
-      "movdqa   0(%%rsi), %%xmm0;"
-      "movdqu   4(%%rsi), %%xmm1;"  // Be careful to use unaligned move here.
-      "movdqa  16(%%rsi), %%xmm2;"
-      "movdqu  20(%%rsi), %%xmm3;"
-      "movdqa  32(%%rsi), %%xmm4;"
-      "movdqu  36(%%rsi), %%xmm5;"
+      // word index 0 and word index 2
+      "movdqa   0(" rSI "), %%xmm0;"
+      "movdqu   4(" rSI "), %%xmm1;"  // Be careful to use unaligned move here.
+      "movdqa  16(" rSI "), %%xmm2;"
+      "movdqu  20(" rSI "), %%xmm3;"
+      "movdqa  32(" rSI "), %%xmm4;"
+      "movdqu  36(" rSI "), %%xmm5;"
 
       // Move 3 * 16 bytes from XMM registers to dstmem.
       // Note: this copy must be performed before pinsrw instructions since
       // they will modify the XMM registers.
-      "movntdq %%xmm0,  0(%%rdi);"
-      "movntdq %%xmm2, 16(%%rdi);"
-      "movntdq %%xmm4, 32(%%rdi);"
+      "movntdq %%xmm0,  0(" rDI ");"
+      "movntdq %%xmm2, 16(" rDI ");"
+      "movntdq %%xmm4, 32(" rDI ");"
 
       // Sets the word[1] and word[3] of XMM0 to XMM5 to zero.
-      "andps 32(%%rax), %%xmm0;"
-      "andps 32(%%rax), %%xmm1;"
-      "andps 32(%%rax), %%xmm2;"
-      "andps 32(%%rax), %%xmm3;"
-      "andps 32(%%rax), %%xmm4;"
-      "andps 32(%%rax), %%xmm5;"
+      "andps 32(" rAX "), %%xmm0;"
+      "andps 32(" rAX "), %%xmm1;"
+      "andps 32(" rAX "), %%xmm2;"
+      "andps 32(" rAX "), %%xmm3;"
+      "andps 32(" rAX "), %%xmm4;"
+      "andps 32(" rAX "), %%xmm5;"
 
       // Add XMM0 to XMM6 and then add XMM6 to XMM7.
       // Repeat this for XMM1, ..., XMM5.
@@ -311,43 +334,43 @@ bool AdlerMemcpyAsm(uint64 *dstmem64, uint64 *srcmem64,
       "paddq %%xmm6, %%xmm7;"
 
       // Increment ESI and EDI by 48 bytes and decrement counter by 1.
-      "add $48, %%rsi;"
-      "add $48, %%rdi;"
-      "prefetchnta  0(%%rsi);"
-      "prefetchnta 64(%%rsi);"
-      "dec  %%rcx;"
+      "add $48, " rSI ";"
+      "add $48, " rDI ";"
+      "prefetchnta  0(" rSI ");"
+      "prefetchnta 64(" rSI ");"
+      "dec " rCX ";"
       "jnz TOP;"
 
       // Now only remaining_words 32-bit words are left.
       // make a loop, add first two words to a1 and next two to a2 (just like
       // above loop, the only extra thing we are doing is rechecking
-      // %rdx (=remaining_words) everytime we add a number to a1/a2.
+      // rDX (=remaining_words) everytime we add a number to a1/a2.
       "REM_IS_STILL_NOT_ZERO:\n"
       // Unless remaining_words becomes less than 4 words(16 bytes)
       // there is not much issue and remaining_words will always
       // be a multiple of four by assumption.
-      "cmp $4, %%rdx;"
+      "cmp $4, " rDX ";"
       // In case for some weird reasons if remaining_words becomes
       // less than 4 but not zero then also break the code and go off to END.
       "jl END;"
       // Otherwise just go on and copy data in chunks of 4-words at a time till
       // whole data (<48 bytes) is copied.
-      "movdqa  0(%%rsi), %%xmm0;"      // Copy next 4-words to XMM0 and to XMM1.
+      "movdqa  0(" rSI "), %%xmm0;"    // Copy next 4-words to XMM0 and to XMM1.
 
-      "movdqa  0(%%rsi), %%xmm5;"      // Accomplish movdqu 4(%%rsi) without
+      "movdqa  0(" rSI "), %%xmm5;"    // Accomplish movdqu 4(%rSI) without
       "pshufd $0x39, %%xmm5, %%xmm1;"  // indexing off memory boundary.
 
-      "movntdq %%xmm0,  0(%%rdi);"     // Copy 4-words to destination.
-      "andps  32(%%rax), %%xmm0;"
-      "andps  32(%%rax), %%xmm1;"
+      "movntdq %%xmm0,  0(" rDI ");"   // Copy 4-words to destination.
+      "andps  32(" rAX "), %%xmm0;"
+      "andps  32(" rAX "), %%xmm1;"
       "paddq     %%xmm0, %%xmm6;"
       "paddq     %%xmm6, %%xmm7;"
       "paddq     %%xmm1, %%xmm6;"
       "paddq     %%xmm6, %%xmm7;"
-      "add $16, %%rsi;"
-      "add $16, %%rdi;"
-      "sub $4, %%rdx;"
-      // Decrement %%rdx by 4 since %%rdx is number of 32-bit
+      "add $16, " rSI ";"
+      "add $16, " rDI ";"
+      "sub $4, " rDX ";"
+      // Decrement %rDX by 4 since %rDX is number of 32-bit
       // words left after considering all 48-byte units.
       "jmp REM_IS_STILL_NOT_ZERO;"
 
@@ -356,8 +379,8 @@ bool AdlerMemcpyAsm(uint64 *dstmem64, uint64 *srcmem64,
       // 64 bit numbers and have to be converted to 64 bit numbers)
       // seems like Adler128 (since size of each part is 4 byte rather than
       // 1 byte).
-      "movdqa %%xmm6,   0(%%rax);"
-      "movdqa %%xmm7,  16(%%rax);"
+      "movdqa %%xmm6,   0(" rAX ");"
+      "movdqa %%xmm7,  16(" rAX ");"
       "sfence;"
 
       // No output registers.
@@ -376,5 +399,8 @@ bool AdlerMemcpyAsm(uint64 *dstmem64, uint64 *srcmem64,
   // that there is no problem with memory this just mean that data was copied
   // from src to dst and checksum was calculated successfully).
   return true;
+#else
+  // Fall back to C implementation for anything else.
+  return AdlerMemcpyWarmC(dstmem64, srcmem64, size_in_bytes, checksum);
 #endif
 }
diff --git a/src/disk_blocks.h b/src/disk_blocks.h
index f4ca93f..cb634c9 100644
--- a/src/disk_blocks.h
+++ b/src/disk_blocks.h
@@ -100,7 +100,7 @@ class DiskBlockTable {
   typedef vector<int64> PosToAddrVector;
   PosToAddrVector pos_to_addr_;
   AddrToBlockMap addr_to_block_;
-  int64 nelems_;
+  uint64 nelems_;
   int sector_size_;          // Sector size, in bytes
   int write_block_size_;     // Block size, in bytes
   string device_name_;       // Device name
diff --git a/src/finelock_queue.cc b/src/finelock_queue.cc
index 569903a..8d914b8 100644
--- a/src/finelock_queue.cc
+++ b/src/finelock_queue.cc
@@ -45,7 +45,7 @@ FineLockPEQueue::FineLockPEQueue(
   queue_metric_ = kTouch;
 
   {  // Init all the page locks.
-    for (int64 i = 0; i < q_size_; i++) {
+    for (uint64 i = 0; i < q_size_; i++) {
         pthread_mutex_init(&(pagelocks_[i]), NULL);
         // Pages start out owned (locked) by Sat::InitializePages.
         // A locked state indicates that the page state is unknown,
@@ -147,7 +147,7 @@ int64 FineLockPEQueue::getC(int64 m) {
 
 // Destructor: Clean-up allocated memory and destroy pthread locks.
 FineLockPEQueue::~FineLockPEQueue() {
-  int64 i;
+  uint64 i;
   for (i = 0; i < q_size_; i++)
     pthread_mutex_destroy(&(pagelocks_[i]));
   delete[] pagelocks_;
@@ -173,11 +173,11 @@ bool FineLockPEQueue::QueueAnalysis() {
   }
 
   // Bucketize the page counts by highest bit set.
-  for (int64 i = 0; i < q_size_; i++) {
+  for (uint64 i = 0; i < q_size_; i++) {
     uint32 readcount = pages_[i].touch;
     int b = 0;
     for (b = 0; b < 31; b++) {
-      if (readcount < (1 << b))
+      if (readcount < (1u << b))
         break;
     }
 
@@ -271,7 +271,7 @@ bool FineLockPEQueue::GetPageFromPhysical(uint64 paddr,
                                           struct page_entry *pe) {
   // Traverse through array until finding a page
   // that contains the address we want..
-  for (int64 i = 0; i < q_size_; i++) {
+  for (uint64 i = 0; i < q_size_; i++) {
     uint64 page_addr = pages_[i].paddr;
     // This assumes linear vaddr.
     if ((page_addr <= paddr) && (page_addr + page_size_ > paddr)) {
@@ -335,7 +335,7 @@ bool FineLockPEQueue::GetRandomWithPredicateTag(struct page_entry *pe,
   uint64 next_try = 1;
 
   // Traverse through array until finding a page meeting given predicate.
-  for (int64 i = 0; i < q_size_; i++) {
+  for (uint64 i = 0; i < q_size_; i++) {
     uint64 index = (next_try + first_try) % q_size_;
     // Go through the loop linear conguentially. We are offsetting by
     // 'first_try' so this path will be a different sequence for every
diff --git a/src/finelock_queue.h b/src/finelock_queue.h
index 54b154e..2de5a46 100644
--- a/src/finelock_queue.h
+++ b/src/finelock_queue.h
@@ -57,7 +57,9 @@ class FineLockPEQueue {
   uint64 GetRandom64FromSlot(int slot);
 
   // Helper function to check index range, returns true if index is valid.
-  bool valid_index(int64 index) { return index >= 0 && index < q_size_; }
+  bool valid_index(int64 index) {
+    return index >= 0 && static_cast<uint64>(index) < q_size_;
+  }
 
   // Returns true if page entry is valid, false otherwise.
   static bool page_is_valid(struct page_entry *pe) {
@@ -85,7 +87,7 @@ class FineLockPEQueue {
 
   pthread_mutex_t *pagelocks_;  // Per-page-entry locks.
   struct page_entry *pages_;     // Where page entries are held.
-  int64 q_size_;                 // Size of the queue.
+  uint64 q_size_;                // Size of the queue.
   int64 page_size_;              // For calculating array index from offset.
 
   enum {
diff --git a/src/logger.cc b/src/logger.cc
index 81f1e3e..e4ecb03 100644
--- a/src/logger.cc
+++ b/src/logger.cc
@@ -38,7 +38,7 @@ void Logger::VLogF(int priority, const char *format, va_list args) {
   }
   char buffer[4096];
   int length = vsnprintf(buffer, sizeof buffer, format, args);
-  if (length >= sizeof buffer) {
+  if (static_cast<size_t>(length) >= sizeof buffer) {
     length = sizeof buffer;
     buffer[sizeof buffer - 1] = '\n';
   }
@@ -96,7 +96,8 @@ void Logger::QueueLogLine(string *line) {
 
 namespace {
 void WriteToFile(const string& line, int fd) {
-  LOGGER_ASSERT(write(fd, line.data(), line.size()) == line.size());
+  LOGGER_ASSERT(write(fd, line.data(), line.size()) ==
+                static_cast<ssize_t>(line.size()));
 }
 }
 
diff --git a/src/logger.h b/src/logger.h
index 3eaea57..1d70107 100644
--- a/src/logger.h
+++ b/src/logger.h
@@ -28,7 +28,7 @@
 // Attempts to log additional lines will block when the queue reaches this size.
 // Due to how the logging thread works, up to twice this many log lines may be
 // outstanding at any point.
-static const int kMaxQueueSize = 250;
+static const size_t kMaxQueueSize = 250;
 
 
 // This is only for use by the Logger class, do not use it elsewhere!
diff --git a/src/os.cc b/src/os.cc
index 4784028..1340d6b 100644
--- a/src/os.cc
+++ b/src/os.cc
@@ -53,8 +53,12 @@ OsLayer::OsLayer() {
   testmemsize_ = 0;
   totalmemsize_ = 0;
   min_hugepages_bytes_ = 0;
-  error_injection_ = false;
   normal_mem_ = true;
+  use_hugepages_ = false;
+  use_posix_shm_ = false;
+  dynamic_mapped_shmem_ = false;
+  shmid_ = 0;
+
   time_initialized_ = 0;
 
   regionsize_ = 0;
@@ -64,6 +68,13 @@ OsLayer::OsLayer() {
   num_cpus_per_node_ = 0;
   error_diagnoser_ = 0;
   err_log_callback_ = 0;
+  error_injection_ = false;
+
+  void *pvoid = 0;
+  address_mode_ = sizeof(pvoid) * 8;
+
+  has_clflush_ = false;
+  has_sse2_ = false;
 }
 
 // OsLayer cleanup.
@@ -75,8 +86,9 @@ OsLayer::~OsLayer() {
 // OsLayer initialization.
 bool OsLayer::Initialize() {
   time_initialized_ = time(NULL);
-  use_hugepages_ = false;
-  shmid_ = 0;
+  // Detect asm support.
+  GetFeatures();
+
   if (num_cpus_ == 0) {
     num_nodes_ = 1;
     num_cpus_ = sysconf(_SC_NPROCESSORS_ONLN);
@@ -129,13 +141,53 @@ list<string> OsLayer::FindFileDevices() {
   return locations;
 }
 
+
+// Get HW core features from cpuid instruction.
+void OsLayer::GetFeatures() {
+#if defined(STRESSAPPTEST_CPU_X86_64) || defined(STRESSAPPTEST_CPU_I686)
+  // CPUID features documented at:
+  // http://www.sandpile.org/ia32/cpuid.htm
+  int ax, bx, cx, dx;
+  __asm__ __volatile__ (
+      "cpuid": "=a" (ax), "=b" (bx), "=c" (cx), "=d" (dx) : "a" (1));
+  has_clflush_ = (dx >> 19) & 1;
+  has_sse2_ = (dx >> 26) & 1;
+
+  logprintf(9, "Log: has clflush: %s, has sse2: %s\n",
+            has_clflush_ ? "true" : "false",
+            has_sse2_ ? "true" : "false");
+#elif defined(STRESSAPPTEST_CPU_PPC)
+  // All PPC implementations have cache flush instructions.
+  has_clflush_ = true;
+#elif defined(STRESSAPPTEST_CPU_ARMV7A)
+#warning "Unsupported CPU type ARMV7A: unable to determine feature set."
+#else
+#warning "Unsupported CPU type: unable to determine feature set."
+#endif
+}
+
+
 // We need to flush the cacheline here.
 void OsLayer::Flush(void *vaddr) {
   // Use the generic flush. This function is just so we can override
   // this if we are so inclined.
-  FastFlush(vaddr);
+  if (has_clflush_)
+    FastFlush(vaddr);
+}
+
+
+// Run the SSE2 asm copy when the CPU supports it, else the C copy.
+bool OsLayer::AdlerMemcpyWarm(uint64 *dstmem, uint64 *srcmem,
+                              unsigned int size_in_bytes,
+                              AdlerChecksum *checksum) {
+  if (has_sse2_) {
+    return AdlerMemcpyAsm(dstmem, srcmem, size_in_bytes, checksum);
+  } else {
+    return AdlerMemcpyWarmC(dstmem, srcmem, size_in_bytes, checksum);
+  }
 }
 
+
 // Translate user virtual to physical address.
 int OsLayer::FindDimm(uint64 addr, char *buf, int len) {
   char tmpbuf[256];
@@ -317,65 +369,155 @@ bool OsLayer::AllocateTestMem(int64 length, uint64 paddr_base) {
   // Try hugepages first.
   void *buf = 0;
 
+  sat_assert(length >= 0);
+
   if (paddr_base)
     logprintf(0, "Process Error: non zero paddr_base %#llx is not supported,"
               " ignore.\n", paddr_base);
 
-  {  // Allocate hugepage mapped memory.
-    int shmid;
-    void *shmaddr;
-
-    if ((shmid = shmget(2, length,
-            SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W)) < 0) {
-      int err = errno;
-      char errtxt[256] = "";
-      strerror_r(err, errtxt, sizeof(errtxt));
-      logprintf(12, "Log: failed to allocate shared mem object - err %d (%s)\n",
-                err, errtxt);
-      goto hugepage_failover;
-    }
+  // Determine optimal memory allocation path.
+  bool prefer_hugepages = false;
+  bool prefer_posix_shm = false;
+  bool prefer_dynamic_mapping = false;
 
-    shmaddr = shmat(shmid, NULL, NULL);
-    if (shmaddr == reinterpret_cast<void*>(-1)) {
-      int err = errno;
-      char errtxt[256] = "";
-      strerror_r(err, errtxt, sizeof(errtxt));
-      logprintf(0, "Log: failed to attach shared mem object - err %d (%s).\n",
-                err, errtxt);
-      if (shmctl(shmid, IPC_RMID, NULL) < 0) {
+  // Are there enough hugepages?
+  int64 hugepagesize = FindHugePages() * 2 * kMegabyte;
+  // TODO(nsanders): Is there enough /dev/shm? Is there enough free memory?
+  if ((length >= 1400LL * kMegabyte) && (address_mode_ == 32)) {
+    prefer_dynamic_mapping = true;
+    prefer_posix_shm = true;
+    logprintf(3, "Log: Prefer POSIX shared memory allocation.\n");
+    logprintf(3, "Log: You may need to run "
+                 "'sudo mount -o remount,size=100\% /dev/shm.'\n");
+  } else if (hugepagesize >= length) {
+    prefer_hugepages = true;
+    logprintf(3, "Log: Prefer using hugepace allocation.\n");
+  } else {
+    logprintf(3, "Log: Prefer plain malloc memory allocation.\n");
+  }
+
+  // Allocate hugepage mapped memory.
+  if (prefer_hugepages) {
+    do { // Allow break statement.
+      int shmid;
+      void *shmaddr;
+
+      if ((shmid = shmget(2, length,
+              SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W)) < 0) {
         int err = errno;
-        char errtxt[256] = "";
-        strerror_r(err, errtxt, sizeof(errtxt));
-        logprintf(0, "Log: failed to remove shared mem object - err %d (%s).\n",
-                  err, errtxt);
+        string errtxt = ErrorString(err);
+        logprintf(3, "Log: failed to allocate shared hugepage "
+                      "object - err %d (%s)\n",
+                  err, errtxt.c_str());
+        logprintf(3, "Log: sysctl -w vm.nr_hugepages=XXX allows hugepages.\n");
+        break;
       }
-      goto hugepage_failover;
-    }
-    use_hugepages_ = true;
-    shmid_ = shmid;
-    buf = shmaddr;
-    logprintf(0, "Log: Using hugepages 0x%x at %p.\n", shmid, shmaddr);
+
+      shmaddr = shmat(shmid, NULL, NULL);
+      if (shmaddr == reinterpret_cast<void*>(-1)) {
+        int err = errno;
+        string errtxt = ErrorString(err);
+        logprintf(0, "Log: failed to attach shared "
+                     "hugepage object - err %d (%s).\n",
+                  err, errtxt.c_str());
+        if (shmctl(shmid, IPC_RMID, NULL) < 0) {
+          int err = errno;
+          string errtxt = ErrorString(err);
+          logprintf(0, "Log: failed to remove shared "
+                       "hugepage object - err %d (%s).\n",
+                    err, errtxt.c_str());
+        }
+        break;
+      }
+      use_hugepages_ = true;
+      shmid_ = shmid;
+      buf = shmaddr;
+      logprintf(0, "Log: Using shared hugepage object 0x%x at %p.\n",
+                shmid, shmaddr);
+    } while (0);
   }
-  hugepage_failover:
 
+  if ((!use_hugepages_) && prefer_posix_shm) {
+    do {
+      int shm_object;
+      void *shmaddr = NULL;
+
+      shm_object = shm_open("/stressapptest", O_CREAT | O_RDWR, S_IRWXU);
+      if (shm_object < 0) {
+        int err = errno;
+        string errtxt = ErrorString(err);
+        logprintf(3, "Log: failed to allocate shared "
+                      "smallpage object - err %d (%s)\n",
+                  err, errtxt.c_str());
+        break;
+      }
+
+      if (0 > ftruncate(shm_object, length)) {
+        int err = errno;
+        string errtxt = ErrorString(err);
+        logprintf(3, "Log: failed to ftruncate shared "
+                      "smallpage object - err %d (%s)\n",
+                  err, errtxt.c_str());
+        break;
+      }
+
+      // 32 bit linux apps can only use ~1.4G of address space.
+      // Use dynamic mapping for allocations larger than that.
+      // Currently perf hit is ~10% for this.
+      if (prefer_dynamic_mapping) {
+        dynamic_mapped_shmem_ = true;
+      } else {
+        // Do a full mapping here otherwise.
+        shmaddr = mmap64(NULL, length, PROT_READ | PROT_WRITE,
+                         MAP_SHARED | MAP_NORESERVE | MAP_LOCKED | MAP_POPULATE,
+                         shm_object, NULL);
+        if (shmaddr == reinterpret_cast<void*>(-1)) {
+          int err = errno;
+          string errtxt = ErrorString(err);
+          logprintf(0, "Log: failed to map shared "
+                       "smallpage object - err %d (%s).\n",
+                    err, errtxt.c_str());
+          break;
+        }
+      }
+
+      use_posix_shm_ = true;
+      shmid_ = shm_object;
+      buf = shmaddr;
+      char location_message[256] = "";
+      if (dynamic_mapped_shmem_) {
+        sprintf(location_message, "mapped as needed");
+      } else {
+        sprintf(location_message, "at %p", shmaddr);
+      }
+      logprintf(0, "Log: Using posix shared memory object 0x%x %s.\n",
+                shm_object, location_message);
+    } while (0);
+    shm_unlink("/stressapptest");
+  }
 
-  if (!use_hugepages_) {
+  if (!use_hugepages_ && !use_posix_shm_) {
     // Use memalign to ensure that blocks are aligned enough for disk direct IO.
     buf = static_cast<char*>(memalign(4096, length));
-    if (buf)
+    if (buf) {
       logprintf(0, "Log: Using memaligned allocation at %p.\n", buf);
-    else
+    } else {
       logprintf(0, "Process Error: memalign returned 0\n");
+      if ((length >= 1499LL * kMegabyte) && (address_mode_ == 32)) {
+        logprintf(0, "Log: You are trying to allocate > 1.4G on a 32 "
+                     "bit process. Please setup shared memory.\n");
+      }
+    }
   }
 
   testmem_ = buf;
-  if (buf) {
+  if (buf || dynamic_mapped_shmem_) {
     testmemsize_ = length;
   } else {
     testmemsize_ = 0;
   }
 
-  return (buf != 0);
+  return (buf != 0) || dynamic_mapped_shmem_;
 }
 
 // Free the test memory.
@@ -384,6 +526,11 @@ void OsLayer::FreeTestMem() {
     if (use_hugepages_) {
       shmdt(testmem_);
       shmctl(shmid_, IPC_RMID, NULL);
+    } else if (use_posix_shm_) {
+      if (!dynamic_mapped_shmem_) {
+        munmap(testmem_, testmemsize_);
+      }
+      close(shmid_);
     } else {
       free(testmem_);
     }
@@ -396,11 +543,37 @@ void OsLayer::FreeTestMem() {
 // Prepare the target memory. It may requre mapping in, or this may be a noop.
 void *OsLayer::PrepareTestMem(uint64 offset, uint64 length) {
   sat_assert((offset + length) <= testmemsize_);
+  if (dynamic_mapped_shmem_) {
+    // TODO(nsanders): Check if we can support MAP_NONBLOCK,
+    // and evaluate performance hit from not using it.
+    void * mapping = mmap64(NULL, length, PROT_READ | PROT_WRITE,
+                     MAP_SHARED | MAP_NORESERVE | MAP_LOCKED | MAP_POPULATE,
+                     shmid_, offset);
+    if (mapping == MAP_FAILED) {
+      string errtxt = ErrorString(errno);
+      logprintf(0, "Process Error: PrepareTestMem mmap64(%llx, %llx) failed. "
+                   "error: %s.\n",
+                offset, length, errtxt.c_str());
+      sat_assert(0);
+    }
+    return mapping;
+  }
+
   return reinterpret_cast<void*>(reinterpret_cast<char*>(testmem_) + offset);
 }
 
 // Release the test memory resources, if any.
 void OsLayer::ReleaseTestMem(void *addr, uint64 offset, uint64 length) {
+  if (dynamic_mapped_shmem_) {
+    int retval = munmap(addr, length);
+    if (retval == -1) {
+      string errtxt = ErrorString(errno);
+      logprintf(0, "Process Error: ReleaseTestMem munmap(%p, %llx) failed. "
+                   "error: %s.\n",
+                addr, length, errtxt.c_str());
+      sat_assert(0);
+    }
+  }
 }
 
 // No error polling on unknown systems.
@@ -453,7 +626,7 @@ uint32 OsLayer::PciRead(int fd, uint32 offset, int width) {
     logprintf(0, "Process Error: Can't seek %x\n", offset);
     return 0;
   }
-  if (read(fd, &datacast, size) != size) {
+  if (read(fd, &datacast, size) != static_cast<ssize_t>(size)) {
     logprintf(0, "Process Error: Can't read %x\n", offset);
     return 0;
   }
@@ -502,7 +675,7 @@ void OsLayer::PciWrite(int fd, uint32 offset, uint32 value, int width) {
     logprintf(0, "Process Error: Can't seek %x\n", offset);
     return;
   }
-  if (write(fd, &datacast, size) != size) {
+  if (write(fd, &datacast, size) != static_cast<ssize_t>(size)) {
     logprintf(0, "Process Error: Can't write %x to %x\n", datacast.l32, offset);
     return;
   }
diff --git a/src/os.h b/src/os.h
index 9ed04d5..28c8a2a 100644
--- a/src/os.h
+++ b/src/os.h
@@ -125,6 +125,8 @@ class OsLayer {
     asm volatile("mfence");
     asm volatile("clflush (%0)" :: "r" (vaddr));
     asm volatile("mfence");
+#elif defined(STRESSAPPTEST_CPU_ARMV7A)
+  #warning "Unsupported CPU type ARMV7A: Unable to force cache flushes."
 #else
   #warning "Unsupported CPU type: Unable to force cache flushes."
 #endif
@@ -152,6 +154,9 @@ class OsLayer {
     datacast_t data;
     __asm __volatile("rdtsc" : "=a" (data.l32.l), "=d"(data.l32.h));
     tsc = data.l64;
+#elif defined(STRESSAPPTEST_CPU_ARMV7A)
+  #warning "Unsupported CPU type ARMV7A: your build may not function correctly"
+    tsc = 0;
 #else
   #warning "Unsupported CPU type: your build may not function correctly"
     tsc = 0;
@@ -181,6 +186,8 @@ class OsLayer {
 
   // Returns 32 for 32-bit, 64 for 64-bit.
   virtual int AddressMode();
+  // Update OsLayer state regarding cpu support for various features.
+  virtual void GetFeatures();
 
   // Open, read, write pci cfg through /proc/bus/pci. fd is /proc/pci file.
   virtual int PciOpen(int bus, int device, int function);
@@ -217,12 +224,10 @@ class OsLayer {
   // Detect all PCI Devices.
   virtual PCIDevices GetPCIDevices();
 
-  // Default platform dependent warm Adler memcpy to C implementation
-  // for compatibility.
+  // Disambiguate between different "warm" memcopies.
   virtual bool AdlerMemcpyWarm(uint64 *dstmem, uint64 *srcmem,
                                unsigned int size_in_bytes,
-                               AdlerChecksum *checksum)
-    {return AdlerMemcpyWarmC(dstmem, srcmem, size_in_bytes, checksum);}
+                               AdlerChecksum *checksum);
 
   // Store a callback to use to print
   // app-specific info about the last error location.
@@ -237,12 +242,14 @@ class OsLayer {
 
  protected:
   void *testmem_;                // Location of test memory.
-  int64 testmemsize_;            // Size of test memory.
+  uint64 testmemsize_;           // Size of test memory.
   int64 totalmemsize_;           // Size of available memory.
   int64 min_hugepages_bytes_;    // Minimum hugepages size.
   bool  error_injection_;        // Do error injection?
   bool  normal_mem_;             // Memory DMA capable?
   bool  use_hugepages_;          // Use hugepage shmem?
+  bool  use_posix_shm_;          // Use 4k page shmem?
+  bool  dynamic_mapped_shmem_;   // Conserve virtual address space.
   int   shmid_;                  // Handle to shmem
 
   int64 regionsize_;             // Size of memory "regions"
@@ -250,6 +257,10 @@ class OsLayer {
   int   num_cpus_;               // Number of cpus in the system.
   int   num_nodes_;              // Number of nodes in the system.
   int   num_cpus_per_node_;      // Number of cpus per node in the system.
+  int   address_mode_;           // Are we running 32 or 64 bit?
+  bool  has_sse2_;               // Do we have sse2 instructions?
+  bool  has_clflush_;            // Do we have clflush instructions?
+
 
   time_t time_initialized_;      // Start time of test.
 
diff --git a/src/pattern.cc b/src/pattern.cc
index 2fb552a..9f22674 100644
--- a/src/pattern.cc
+++ b/src/pattern.cc
@@ -393,7 +393,7 @@ int PatternList::Destroy() {
 
 // Return pattern numbered "i"
 Pattern *PatternList::GetPattern(int i) {
-  if (i < size_) {
+  if (static_cast<unsigned int>(i) < size_) {
     return &patterns_[i];
   }
 
diff --git a/src/pattern.h b/src/pattern.h
index b1168aa..181f839 100644
--- a/src/pattern.h
+++ b/src/pattern.h
@@ -102,7 +102,7 @@ class PatternList {
  private:
   vector<class Pattern> patterns_;
   int weightcount_;  // Total count of pattern weights.
-  int size_;
+  unsigned int size_;
   int initialized_;
   DISALLOW_COPY_AND_ASSIGN(PatternList);
 };
diff --git a/src/sat.cc b/src/sat.cc
index 06b4c65..bed62b7 100644
--- a/src/sat.cc
+++ b/src/sat.cc
@@ -164,26 +164,6 @@ bool Sat::CheckEnvironment() {
     return false;
   }
 
-  if ((address_mode_ == 32) &&
-      (os_->normal_mem()) &&
-      (size_ >= 1499 * kMegabyte)) {
-    if (run_on_anything_) {
-      int64 new_size_mb = 1499;
-      logprintf(1, "Log: 32 bit binary: reducing from %lldMB to %lldMB\n",
-                size_mb_,
-                new_size_mb);
-      size_mb_ = new_size_mb;
-      size_ = size_mb_ * kMegabyte;
-    } else {
-      logprintf(0, "Process Error: %dMB test memory too large "
-                   "for 32 bit binary.\n",
-                static_cast<int>(size_ / kMegabyte));
-      logprintf(0, "Log: Command line option '-A' bypasses this error.\n");
-      bad_status();
-      return false;
-    }
-  }
-
   // If platform is 32 bit Xeon, floor memory size to multiple of 4.
   if (address_mode_ == 32) {
     size_mb_ = (size_mb_ / 4) * 4;
@@ -350,7 +330,7 @@ void Sat::AddrMapUpdate(struct page_entry *pe) {
   for (int i = 0; i < page_length_; i += 4096) {
     uint64 paddr = os_->VirtualToPhysical(base + i);
 
-    int offset = paddr / 4096 / 8;
+    uint32 offset = paddr / 4096 / 8;
     unsigned char mask = 1 << ((paddr / 4096) % 8);
 
     if (offset >= arraysize) {
@@ -969,7 +949,8 @@ bool Sat::ParseArgs(int argc, char **argv) {
   }
 
   // Set disk_pages_ if filesize or page size changed.
-  if (filesize != page_length_ * disk_pages_) {
+  if (filesize != static_cast<uint64>(page_length_) *
+                  static_cast<uint64>(disk_pages_)) {
     disk_pages_ = filesize / page_length_;
     if (disk_pages_ == 0)
       disk_pages_ = 1;
@@ -1014,7 +995,7 @@ void Sat::PrintHelp() {
          " --force_errors_like_crazy   inject a lot of false errors "
          "to test error handling\n"
          " -F               don't result check each transaction\n"
-         "--stop_on_errors  Stop after finding the first error.\n"
+         " --stop_on_errors  Stop after finding the first error.\n"
          " --read-block-size     size of block for reading (-d)\n"
          " --write-block-size    size of block for writing (-d). If not "
          "defined, the size of block for writing will be defined as the "
@@ -1041,7 +1022,7 @@ void Sat::PrintHelp() {
          " --pause_duration duration (in seconds) of each pause\n"
          " --local_numa : choose memory regions associated with "
          "each CPU to be tested by that CPU\n"
-         "--remote_numa : choose memory regions not associated with "
+         " --remote_numa : choose memory regions not associated with "
          "each CPU to be tested by that CPU\n");
 }
 
@@ -1850,7 +1831,7 @@ bool Sat::Cleanup() {
     delete[] page_bitmap_;
   }
 
-  for (int i = 0; i < blocktables_.size(); i++) {
+  for (size_t i = 0; i < blocktables_.size(); i++) {
     delete blocktables_[i];
   }
 
diff --git a/src/sat.h b/src/sat.h
index 950270f..b48f519 100644
--- a/src/sat.h
+++ b/src/sat.h
@@ -164,7 +164,7 @@ class Sat {
 
   bool error_injection_;              // Simulate errors, for unittests.
   bool crazy_error_injection_;        // Simulate lots of errors.
-  int64 max_errorcount_;              // Number of errors before forced exit.
+  uint64 max_errorcount_;             // Number of errors before forced exit.
   int run_on_anything_;               // Ignore unknown machine ereor.
   int use_logfile_;                   // Log to a file.
   char logfilename_[255];             // Name of file to log to.
diff --git a/src/stressapptest_config.h.in b/src/stressapptest_config.h.in
index 535bb34..b78857c 100644
--- a/src/stressapptest_config.h.in
+++ b/src/stressapptest_config.h.in
@@ -148,6 +148,9 @@
 /* Define to 1 if strerror_r returns char *. */
 #undef STRERROR_R_CHAR_P
 
+/* Defined if the target CPU is armv7a */
+#undef STRESSAPPTEST_CPU_ARMV7A
+
 /* Defined if the target CPU is i686 */
 #undef STRESSAPPTEST_CPU_I686
 
diff --git a/src/worker.cc b/src/worker.cc
index c568064..2fab28e 100644
--- a/src/worker.cc
+++ b/src/worker.cc
@@ -86,6 +86,9 @@ namespace {
     int cpu;
 #if defined(STRESSAPPTEST_CPU_X86_64) || defined(STRESSAPPTEST_CPU_I686)
     __asm __volatile("cpuid" : "=b" (cpu) : "a" (1) : "cx", "dx");
+#elif defined(STRESSAPPTEST_CPU_ARMV7A)
+  #warning "Unsupported CPU type ARMV7A: unable to determine core ID."
+    cpu = 0;
 #else
   #warning "Unsupported CPU type: unable to determine core ID."
     cpu = 0;
@@ -1953,7 +1956,6 @@ bool FileThread::Work() {
   }
 
   pages_copied_ = loops * sat_->disk_pages();
-  status_ = result;
 
   // Clean up.
   CloseFile(fd);
@@ -1961,7 +1963,10 @@ bool FileThread::Work() {
 
   logprintf(9, "Log: Completed %d: file thread status %d, %d pages copied\n",
             thread_num_, status_, pages_copied_);
-  return result;
+  // Failure to read from device indicates hardware,
+  // rather than procedural SW error.
+  status_ = true;
+  return true;
 }
 
 bool NetworkThread::IsNetworkStopSet() {
@@ -2259,7 +2264,7 @@ bool NetworkListenThread::ReapSlaves() {
   // Gather status and reap threads.
   logprintf(12, "Log: Joining all outstanding threads\n");
 
-  for (int i = 0; i < child_workers_.size(); i++) {
+  for (size_t i = 0; i < child_workers_.size(); i++) {
     NetworkSlaveThread& child_thread = child_workers_[i]->thread;
     logprintf(12, "Log: Joining slave thread %d\n", i);
     child_thread.JoinThread();
@@ -2689,7 +2694,7 @@ bool DiskThread::GetDiskSize(int fd) {
       return false;
     }
 
-    // If an Elephant is initialized with status DEAD its size will be zero.
+    // Zero size indicates a nonworking device.
     if (block_size == 0) {
       os_->ErrorReport(device_name_.c_str(), "device-size-zero", 1);
       ++errorcount_;
@@ -2734,11 +2739,11 @@ int64 DiskThread::GetTime() {
 }
 
 // Do randomized reads and (possibly) writes on a device.
-// Return false on fatal error, either SW or HW.
+// Return false on fatal SW error, true on SW success,
+// regardless of whether HW failed.
 bool DiskThread::DoWork(int fd) {
   int64 block_num = 0;
   int64 num_segments;
-  bool result = true;
 
   if (segment_size_ == -1) {
     num_segments = 1;
@@ -2775,7 +2780,8 @@ bool DiskThread::DoWork(int fd) {
               non_destructive_ ? "(disabled) " : "",
               device_name_.c_str(), thread_num_);
     while (IsReadyToRunNoPause() &&
-           in_flight_sectors_.size() < queue_size_ + 1) {
+           in_flight_sectors_.size() <
+               static_cast<size_t>(queue_size_ + 1)) {
       // Confine testing to a particular segment of the disk.
       int64 segment = (block_num / blocks_per_segment_) % num_segments;
       if (!non_destructive_ &&
@@ -2810,7 +2816,7 @@ bool DiskThread::DoWork(int fd) {
       if (!non_destructive_) {
         if (!WriteBlockToDisk(fd, block)) {
           block_table_->RemoveBlock(block);
-          return false;
+          return true;
         }
         blocks_written_++;
       }
@@ -2829,14 +2835,14 @@ bool DiskThread::DoWork(int fd) {
       BlockData *block = in_flight_sectors_.front();
       in_flight_sectors_.pop();
       if (!ValidateBlockOnDisk(fd, block))
-        return false;
+        return true;
       block_table_->RemoveBlock(block);
       blocks_read_++;
     }
   }
 
   pages_copied_ = blocks_written_ + blocks_read_;
-  return result;
+  return true;
 }
 
 // Do an asynchronous disk I/O operation.
@@ -2923,7 +2929,7 @@ bool DiskThread::AsyncDiskIO(IoOp op, int fd, void *buf, int64 size,
 
   // event.res contains the number of bytes written/read or
   // error if < 0, I think.
-  if (event.res != size) {
+  if (event.res != static_cast<uint64>(size)) {
     errorcount_++;
     os_->ErrorReport(device_name_.c_str(), operations[op].error_str, 1);
author	nick.j.sanders <nick.j.sanders@93e54ea4-8218-11de-8aaf-8d8425684b44>	2010-05-14 03:47:11 +0000
committer	nick.j.sanders <nick.j.sanders@93e54ea4-8218-11de-8aaf-8d8425684b44>	2010-05-14 03:47:11 +0000
commit	6d1e64db329883e43dbca06471c093fc23dc9a2e (patch)
tree	f5c51b3fc301cfdee87aa54dd5a92bb16854b599 /src
parent	eea0aac32a3f522ea51d389f44dcd8abcfc5a6e0 (diff)
download	stressapptest-6d1e64db329883e43dbca06471c093fc23dc9a2e.tar.gz