author    nick.j.sanders <nick.j.sanders@93e54ea4-8218-11de-8aaf-8d8425684b44>  2010-05-14 03:47:11 +0000
committer nick.j.sanders <nick.j.sanders@93e54ea4-8218-11de-8aaf-8d8425684b44>  2010-05-14 03:47:11 +0000
commit    6d1e64db329883e43dbca06471c093fc23dc9a2e (patch)
tree      f5c51b3fc301cfdee87aa54dd5a92bb16854b599 /src
parent    eea0aac32a3f522ea51d389f44dcd8abcfc5a6e0 (diff)
Update stressapptest to 1.0.3.
* /dev/shm support to allow >1.4G memory usage for 32 bit apps (see the sketch below).
* Some ARM support.
* x86 SSE support.
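The /dev/shm support works by backing the test memory with a POSIX shared memory object and mapping windows of it in and out on demand, so a 32 bit process can test more memory than fits in its ~1.4G of usable address space. A minimal sketch of that pattern, with hypothetical names (the real logic is in OsLayer::AllocateTestMem and OsLayer::PrepareTestMem below):

// Sketch: window-mapped POSIX shared memory for >1.4G on a 32 bit app.
// Build 32 bit with -D_FILE_OFFSET_BITS=64 so off_t can span the object.
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

static int shm_fd = -1;

// Create a shared memory object large enough for the whole test region.
bool AllocateBacking(off_t total_bytes) {
  shm_fd = shm_open("/stressapptest_demo", O_CREAT | O_RDWR, S_IRWXU);
  if (shm_fd < 0) return false;
  if (ftruncate(shm_fd, total_bytes) < 0) return false;
  shm_unlink("/stressapptest_demo");  // Object stays alive via the open fd.
  return true;
}

// Map only the window under test; unmapping it again afterwards keeps the
// process inside its small virtual address space.
void *MapWindow(off_t offset, size_t length) {
  void *p = mmap(NULL, length, PROT_READ | PROT_WRITE, MAP_SHARED,
                 shm_fd, offset);
  return (p == MAP_FAILED) ? NULL : p;
}

void UnmapWindow(void *addr, size_t length) { munmap(addr, length); }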
Diffstat (limited to 'src')
-rw-r--r--  src/adler32memcpy.cc          | 124
-rw-r--r--  src/disk_blocks.h             |   2
-rw-r--r--  src/finelock_queue.cc         |  12
-rw-r--r--  src/finelock_queue.h          |   6
-rw-r--r--  src/logger.cc                 |   5
-rw-r--r--  src/logger.h                  |   2
-rw-r--r--  src/os.cc                     | 259
-rw-r--r--  src/os.h                      |  21
-rw-r--r--  src/pattern.cc                |   2
-rw-r--r--  src/pattern.h                 |   2
-rw-r--r--  src/sat.cc                    |  31
-rw-r--r--  src/sat.h                     |   2
-rw-r--r--  src/stressapptest_config.h.in |   3
-rw-r--r--  src/worker.cc                 |  28
14 files changed, 351 insertions, 148 deletions
diff --git a/src/adler32memcpy.cc b/src/adler32memcpy.cc
index 529dcc4..69324f7 100644
--- a/src/adler32memcpy.cc
+++ b/src/adler32memcpy.cc
@@ -225,19 +225,41 @@ bool AdlerMemcpyWarmC(uint64 *dstmem64, uint64 *srcmem64,
// x86_64 SSE2 assembly implementation of fast and stressful Adler memory copy.
bool AdlerMemcpyAsm(uint64 *dstmem64, uint64 *srcmem64,
unsigned int size_in_bytes, AdlerChecksum *checksum) {
-// Use assembly implementation only with 64bit compilation.
-#ifndef STRESSAPPTEST_CPU_X86_64
- // Fall back to C implementation for 32bit compilation.
- return AdlerMemcpyWarmC(dstmem64, srcmem64, size_in_bytes, checksum);
-#else
+// Use assembly implementation where supported.
+#if defined(STRESSAPPTEST_CPU_X86_64) || defined(STRESSAPPTEST_CPU_I686)
+
+// Pull a bit of tricky preprocessing to make the inline asm both
+// 32 bit and 64 bit.
+#ifdef STRESSAPPTEST_CPU_I686 // Instead of coding both, x86...
+#define rAX "%%eax"
+#define rCX "%%ecx"
+#define rDX "%%edx"
+#define rBX "%%ebx"
+#define rSP "%%esp"
+#define rBP "%%ebp"
+#define rSI "%%esi"
+#define rDI "%%edi"
+#endif
+
+#ifdef STRESSAPPTEST_CPU_X86_64 // ...and x64, we use rXX macros.
+#define rAX "%%rax"
+#define rCX "%%rcx"
+#define rDX "%%rdx"
+#define rBX "%%rbx"
+#define rSP "%%rsp"
+#define rBP "%%rbp"
+#define rSI "%%rsi"
+#define rDI "%%rdi"
+#endif
+
// Elements 0 to 3 are used for holding checksum terms a1, a2,
// b1, b2 respectively. These elements are filled by asm code.
// Elements 4 and 5 are used by asm code for ANDing MMX data and removing
// 2 words from each MMX register (an MMX reg has 4 words; by ANDing we are
// setting word index 0 and word index 2 to zero).
// Elements 6 and 7 are used for setting a1 and a2 to 1.
- volatile uint64 checksum_arr[] = {0, 0, 0, 0,
- 0x00000000ffffffffUL, 0x00000000ffffffffUL, 1, 1};
+ volatile uint64 checksum_arr[] __attribute__ ((aligned(16))) =
+ {0, 0, 0, 0, 0x00000000ffffffffUL, 0x00000000ffffffffUL, 1, 1};
if ((size_in_bytes >> 19) > 0) {
// Size is too large. Must be less than 2^19 bytes = 512 KB.
@@ -245,23 +267,24 @@ bool AdlerMemcpyAsm(uint64 *dstmem64, uint64 *srcmem64,
}
// Number of 32-bit words which are not added to a1/a2 in the main loop.
- uint64 remaining_words = (size_in_bytes % 48) / 4;
+ uint32 remaining_words = (size_in_bytes % 48) / 4;
// Since we are moving 48 bytes at a time, the number of iterations
// (total size / 48) is the value of the counter.
- uint64 num_of_48_byte_units = size_in_bytes / 48;
+ uint32 num_of_48_byte_units = size_in_bytes / 48;
- asm volatile(
+ asm volatile (
// Source address is in ESI (extended source index)
// destination is in EDI (extended destination index)
- // and counter is already in ECX (extended counter index).
- "cmp $0, %%ecx;" // Compare counter to zero.
+ // and counter is already in ECX (extended counter
+ // index).
+ "cmp $0, " rCX ";" // Compare counter to zero.
"jz END;"
// XMM6 is initialized with 1 and XMM7 with 0.
- "prefetchnta 0(%%rsi);"
- "prefetchnta 64(%%rsi);"
- "movdqu 48(%%rax), %%xmm6;"
+ "prefetchnta 0(" rSI ");"
+ "prefetchnta 64(" rSI ");"
+ "movdqu 48(" rAX "), %%xmm6;"
"xorps %%xmm7, %%xmm7;"
// Start of the loop which copies 48 bytes from source to dst each time.
@@ -269,28 +292,28 @@ bool AdlerMemcpyAsm(uint64 *dstmem64, uint64 *srcmem64,
// Make 6 moves each of 16 bytes from srcmem to XMM registers.
// We are using 2 words out of 4 words in each XMM register,
- // word index 0 and word index 2)
- "movdqa 0(%%rsi), %%xmm0;"
- "movdqu 4(%%rsi), %%xmm1;" // Be careful to use unaligned move here.
- "movdqa 16(%%rsi), %%xmm2;"
- "movdqu 20(%%rsi), %%xmm3;"
- "movdqa 32(%%rsi), %%xmm4;"
- "movdqu 36(%%rsi), %%xmm5;"
+ // word index 0 and word index 2
+ "movdqa 0(" rSI "), %%xmm0;"
+ "movdqu 4(" rSI "), %%xmm1;" // Be careful to use unaligned move here.
+ "movdqa 16(" rSI "), %%xmm2;"
+ "movdqu 20(" rSI "), %%xmm3;"
+ "movdqa 32(" rSI "), %%xmm4;"
+ "movdqu 36(" rSI "), %%xmm5;"
// Move 3 * 16 bytes from XMM registers to dstmem.
// Note: this copy must be performed before pinsrw instructions since
// they will modify the XMM registers.
- "movntdq %%xmm0, 0(%%rdi);"
- "movntdq %%xmm2, 16(%%rdi);"
- "movntdq %%xmm4, 32(%%rdi);"
+ "movntdq %%xmm0, 0(" rDI ");"
+ "movntdq %%xmm2, 16(" rDI ");"
+ "movntdq %%xmm4, 32(" rDI ");"
// Sets the word[1] and word[3] of XMM0 to XMM5 to zero.
- "andps 32(%%rax), %%xmm0;"
- "andps 32(%%rax), %%xmm1;"
- "andps 32(%%rax), %%xmm2;"
- "andps 32(%%rax), %%xmm3;"
- "andps 32(%%rax), %%xmm4;"
- "andps 32(%%rax), %%xmm5;"
+ "andps 32(" rAX "), %%xmm0;"
+ "andps 32(" rAX "), %%xmm1;"
+ "andps 32(" rAX "), %%xmm2;"
+ "andps 32(" rAX "), %%xmm3;"
+ "andps 32(" rAX "), %%xmm4;"
+ "andps 32(" rAX "), %%xmm5;"
// Add XMM0 to XMM6 and then add XMM6 to XMM7.
// Repeat this for XMM1, ..., XMM5.
@@ -311,43 +334,43 @@ bool AdlerMemcpyAsm(uint64 *dstmem64, uint64 *srcmem64,
"paddq %%xmm6, %%xmm7;"
// Increment ESI and EDI by 48 bytes and decrement counter by 1.
- "add $48, %%rsi;"
- "add $48, %%rdi;"
- "prefetchnta 0(%%rsi);"
- "prefetchnta 64(%%rsi);"
- "dec %%rcx;"
+ "add $48, " rSI ";"
+ "add $48, " rDI ";"
+ "prefetchnta 0(" rSI ");"
+ "prefetchnta 64(" rSI ");"
+ "dec " rCX ";"
"jnz TOP;"
// Now only remaining_words 32-bit words are left.
// Make a loop: add the first two words to a1 and the next two to a2 (just
// like the above loop); the only extra thing we are doing is rechecking
- // %rdx (=remaining_words) everytime we add a number to a1/a2.
+ // rDX (=remaining_words) every time we add a number to a1/a2.
"REM_IS_STILL_NOT_ZERO:\n"
// Unless remaining_words becomes less than 4 words (16 bytes),
// there is not much issue and remaining_words will always
// be a multiple of four by assumption.
- "cmp $4, %%rdx;"
+ "cmp $4, " rDX ";"
// If for some weird reason remaining_words becomes less than 4
// but not zero, then also break out of the loop and go off to END.
"jl END;"
// Otherwise just go on and copy data in chunks of 4-words at a time till
// whole data (<48 bytes) is copied.
- "movdqa 0(%%rsi), %%xmm0;" // Copy next 4-words to XMM0 and to XMM1.
+ "movdqa 0(" rSI "), %%xmm0;" // Copy next 4-words to XMM0 and to XMM1.
- "movdqa 0(%%rsi), %%xmm5;" // Accomplish movdqu 4(%%rsi) without
+ "movdqa 0(" rSI "), %%xmm5;" // Accomplish movdqu 4(%rSI) without
"pshufd $0x39, %%xmm5, %%xmm1;" // indexing off memory boundary.
- "movntdq %%xmm0, 0(%%rdi);" // Copy 4-words to destination.
- "andps 32(%%rax), %%xmm0;"
- "andps 32(%%rax), %%xmm1;"
+ "movntdq %%xmm0, 0(" rDI ");" // Copy 4-words to destination.
+ "andps 32(" rAX "), %%xmm0;"
+ "andps 32(" rAX "), %%xmm1;"
"paddq %%xmm0, %%xmm6;"
"paddq %%xmm6, %%xmm7;"
"paddq %%xmm1, %%xmm6;"
"paddq %%xmm6, %%xmm7;"
- "add $16, %%rsi;"
- "add $16, %%rdi;"
- "sub $4, %%rdx;"
- // Decrement %%rdx by 4 since %%rdx is number of 32-bit
+ "add $16, " rSI ";"
+ "add $16, " rDI ";"
+ "sub $4, " rDX ";"
+ // Decrement %rDX by 4 since %rDX is number of 32-bit
// words left after considering all 48-byte units.
"jmp REM_IS_STILL_NOT_ZERO;"
@@ -356,8 +379,8 @@ bool AdlerMemcpyAsm(uint64 *dstmem64, uint64 *srcmem64,
// 64 bit numbers and have to be converted to 64 bit numbers)
// seems like Adler128 (since size of each part is 4 byte rather than
// 1 byte).
- "movdqa %%xmm6, 0(%%rax);"
- "movdqa %%xmm7, 16(%%rax);"
+ "movdqa %%xmm6, 0(" rAX ");"
+ "movdqa %%xmm7, 16(" rAX ");"
"sfence;"
// No output registers.
@@ -376,5 +399,8 @@ bool AdlerMemcpyAsm(uint64 *dstmem64, uint64 *srcmem64,
// that there is no problem with memory; this just means that data was copied
// from src to dst and checksum was calculated successfully).
return true;
+#else
+ // Fall back to C implementation for anything else.
+ return AdlerMemcpyWarmC(dstmem64, srcmem64, size_in_bytes, checksum);
#endif
}
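The register-name macros above carry the whole 32/64 bit port of this routine: the asm body is written once against rAX, rSI, and friends, and adjacent string literals concatenate at compile time into either the e-register or r-register spellings. A minimal sketch of the same trick in isolation (hypothetical example, assuming GCC-style inline asm):

// One asm body, two register widths, via string-literal pasting.
#if defined(__x86_64__)
#define rAX "%%rax"
#else
#define rAX "%%eax"
#endif

static inline unsigned long RoundTrip(unsigned long v) {
  unsigned long out;
  asm volatile("mov %1, " rAX ";"  // Expands to %%rax or %%eax.
               "mov " rAX ", %0;"
               : "=r"(out)
               : "r"(v)
               : "eax");  // Clobbering eax covers rax as well.
  return out;
}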
diff --git a/src/disk_blocks.h b/src/disk_blocks.h
index f4ca93f..cb634c9 100644
--- a/src/disk_blocks.h
+++ b/src/disk_blocks.h
@@ -100,7 +100,7 @@ class DiskBlockTable {
typedef vector<int64> PosToAddrVector;
PosToAddrVector pos_to_addr_;
AddrToBlockMap addr_to_block_;
- int64 nelems_;
+ uint64 nelems_;
int sector_size_; // Sector size, in bytes
int write_block_size_; // Block size, in bytes
string device_name_; // Device name
diff --git a/src/finelock_queue.cc b/src/finelock_queue.cc
index 569903a..8d914b8 100644
--- a/src/finelock_queue.cc
+++ b/src/finelock_queue.cc
@@ -45,7 +45,7 @@ FineLockPEQueue::FineLockPEQueue(
queue_metric_ = kTouch;
{ // Init all the page locks.
- for (int64 i = 0; i < q_size_; i++) {
+ for (uint64 i = 0; i < q_size_; i++) {
pthread_mutex_init(&(pagelocks_[i]), NULL);
// Pages start out owned (locked) by Sat::InitializePages.
// A locked state indicates that the page state is unknown,
@@ -147,7 +147,7 @@ int64 FineLockPEQueue::getC(int64 m) {
// Destructor: Clean-up allocated memory and destroy pthread locks.
FineLockPEQueue::~FineLockPEQueue() {
- int64 i;
+ uint64 i;
for (i = 0; i < q_size_; i++)
pthread_mutex_destroy(&(pagelocks_[i]));
delete[] pagelocks_;
@@ -173,11 +173,11 @@ bool FineLockPEQueue::QueueAnalysis() {
}
// Bucketize the page counts by highest bit set.
- for (int64 i = 0; i < q_size_; i++) {
+ for (uint64 i = 0; i < q_size_; i++) {
uint32 readcount = pages_[i].touch;
int b = 0;
for (b = 0; b < 31; b++) {
- if (readcount < (1 << b))
+ if (readcount < (1u << b))
break;
}
@@ -271,7 +271,7 @@ bool FineLockPEQueue::GetPageFromPhysical(uint64 paddr,
struct page_entry *pe) {
// Traverse through array until finding a page
// that contains the address we want.
- for (int64 i = 0; i < q_size_; i++) {
+ for (uint64 i = 0; i < q_size_; i++) {
uint64 page_addr = pages_[i].paddr;
// This assumes linear vaddr.
if ((page_addr <= paddr) && (page_addr + page_size_ > paddr)) {
@@ -335,7 +335,7 @@ bool FineLockPEQueue::GetRandomWithPredicateTag(struct page_entry *pe,
uint64 next_try = 1;
// Traverse through array until finding a page meeting given predicate.
- for (int64 i = 0; i < q_size_; i++) {
+ for (uint64 i = 0; i < q_size_; i++) {
uint64 index = (next_try + first_try) % q_size_;
// Go through the loop linear congruentially. We are offsetting by
// 'first_try' so this path will be a different sequence for every
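The predicate search above depends on a full-cycle walk: starting from a random first_try and advancing by a linear congruential step (derived by getC() above), every slot is visited exactly once per search, in a different order each time. A generic sketch of the idea, with hypothetical names and the key assumption spelled out:

// Sketch: probe all n slots exactly once in a scrambled order.
// Requires gcd(stride, n) == 1 for the walk to be full-cycle; the real
// code derives a suitable increment via FineLockPEQueue::getC().
#include <stdint.h>

struct Slot { int state; };

static int64_t FindMatching(const Slot *slots, uint64_t n,
                            uint64_t first_try, uint64_t stride,
                            bool (*pred)(const Slot&)) {
  uint64_t idx = first_try % n;
  for (uint64_t k = 0; k < n; ++k) {
    if (pred(slots[idx])) return static_cast<int64_t>(idx);
    idx = (idx + stride) % n;
  }
  return -1;  // No slot satisfied the predicate.
}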
diff --git a/src/finelock_queue.h b/src/finelock_queue.h
index 54b154e..2de5a46 100644
--- a/src/finelock_queue.h
+++ b/src/finelock_queue.h
@@ -57,7 +57,9 @@ class FineLockPEQueue {
uint64 GetRandom64FromSlot(int slot);
// Helper function to check index range, returns true if index is valid.
- bool valid_index(int64 index) { return index >= 0 && index < q_size_; }
+ bool valid_index(int64 index) {
+ return index >= 0 && static_cast<uint64>(index) < q_size_;
+ }
// Returns true if page entry is valid, false otherwise.
static bool page_is_valid(struct page_entry *pe) {
@@ -85,7 +87,7 @@ class FineLockPEQueue {
pthread_mutex_t *pagelocks_; // Per-page-entry locks.
struct page_entry *pages_; // Where page entries are held.
- int64 q_size_; // Size of the queue.
+ uint64 q_size_; // Size of the queue.
int64 page_size_; // For calculating array index from offset.
enum {
diff --git a/src/logger.cc b/src/logger.cc
index 81f1e3e..e4ecb03 100644
--- a/src/logger.cc
+++ b/src/logger.cc
@@ -38,7 +38,7 @@ void Logger::VLogF(int priority, const char *format, va_list args) {
}
char buffer[4096];
int length = vsnprintf(buffer, sizeof buffer, format, args);
- if (length >= sizeof buffer) {
+ if (static_cast<size_t>(length) >= sizeof buffer) {
length = sizeof buffer;
buffer[sizeof buffer - 1] = '\n';
}
@@ -96,7 +96,8 @@ void Logger::QueueLogLine(string *line) {
namespace {
void WriteToFile(const string& line, int fd) {
- LOGGER_ASSERT(write(fd, line.data(), line.size()) == line.size());
+ LOGGER_ASSERT(write(fd, line.data(), line.size()) ==
+ static_cast<ssize_t>(line.size()));
}
}
diff --git a/src/logger.h b/src/logger.h
index 3eaea57..1d70107 100644
--- a/src/logger.h
+++ b/src/logger.h
@@ -28,7 +28,7 @@
// Attempts to log additional lines will block when the queue reaches this size.
// Due to how the logging thread works, up to twice this many log lines may be
// outstanding at any point.
-static const int kMaxQueueSize = 250;
+static const size_t kMaxQueueSize = 250;
// This is only for use by the Logger class, do not use it elsewhere!
diff --git a/src/os.cc b/src/os.cc
index 4784028..1340d6b 100644
--- a/src/os.cc
+++ b/src/os.cc
@@ -53,8 +53,12 @@ OsLayer::OsLayer() {
testmemsize_ = 0;
totalmemsize_ = 0;
min_hugepages_bytes_ = 0;
- error_injection_ = false;
normal_mem_ = true;
+ use_hugepages_ = false;
+ use_posix_shm_ = false;
+ dynamic_mapped_shmem_ = false;
+ shmid_ = 0;
+
time_initialized_ = 0;
regionsize_ = 0;
@@ -64,6 +68,13 @@ OsLayer::OsLayer() {
num_cpus_per_node_ = 0;
error_diagnoser_ = 0;
err_log_callback_ = 0;
+ error_injection_ = false;
+
+ void *pvoid = 0;
+ address_mode_ = sizeof(pvoid) * 8;
+
+ has_clflush_ = false;
+ has_sse2_ = false;
}
// OsLayer cleanup.
@@ -75,8 +86,9 @@ OsLayer::~OsLayer() {
// OsLayer initialization.
bool OsLayer::Initialize() {
time_initialized_ = time(NULL);
- use_hugepages_ = false;
- shmid_ = 0;
+ // Detect asm support.
+ GetFeatures();
+
if (num_cpus_ == 0) {
num_nodes_ = 1;
num_cpus_ = sysconf(_SC_NPROCESSORS_ONLN);
@@ -129,13 +141,53 @@ list<string> OsLayer::FindFileDevices() {
return locations;
}
+
+// Get HW core features from cpuid instruction.
+void OsLayer::GetFeatures() {
+#if defined(STRESSAPPTEST_CPU_X86_64) || defined(STRESSAPPTEST_CPU_I686)
+ // CPUID features documented at:
+ // http://www.sandpile.org/ia32/cpuid.htm
+ int ax, bx, cx, dx;
+ __asm__ __volatile__ (
+ "cpuid": "=a" (ax), "=b" (bx), "=c" (cx), "=d" (dx) : "a" (1));
+ has_clflush_ = (dx >> 19) & 1;
+ has_sse2_ = (dx >> 26) & 1;
+
+ logprintf(9, "Log: has clflush: %s, has sse2: %s\n",
+ has_clflush_ ? "true" : "false",
+ has_sse2_ ? "true" : "false");
+#elif defined(STRESSAPPTEST_CPU_PPC)
+ // All PPC implementations have cache flush instructions.
+ has_clflush_ = true;
+#elif defined(STRESSAPPTEST_CPU_ARMV7A)
+#warning "Unsupported CPU type ARMV7A: unable to determine feature set."
+#else
+#warning "Unsupported CPU type: unable to determine feature set."
+#endif
+}
+
+
// We need to flush the cacheline here.
void OsLayer::Flush(void *vaddr) {
// Use the generic flush. This function is just so we can override
// this if we are so inclined.
- FastFlush(vaddr);
+ if (has_clflush_)
+ FastFlush(vaddr);
+}
+
+
+// Run C or ASM copy as appropriate.
+bool OsLayer::AdlerMemcpyWarm(uint64 *dstmem, uint64 *srcmem,
+ unsigned int size_in_bytes,
+ AdlerChecksum *checksum) {
+ if (has_sse2_) {
+ return AdlerMemcpyAsm(dstmem, srcmem, size_in_bytes, checksum);
+ } else {
+ return AdlerMemcpyWarmC(dstmem, srcmem, size_in_bytes, checksum);
+ }
}
+
// Translate user virtual to physical address.
int OsLayer::FindDimm(uint64 addr, char *buf, int len) {
char tmpbuf[256];
@@ -317,65 +369,155 @@ bool OsLayer::AllocateTestMem(int64 length, uint64 paddr_base) {
// Try hugepages first.
void *buf = 0;
+ sat_assert(length >= 0);
+
if (paddr_base)
logprintf(0, "Process Error: non zero paddr_base %#llx is not supported,"
" ignore.\n", paddr_base);
- { // Allocate hugepage mapped memory.
- int shmid;
- void *shmaddr;
-
- if ((shmid = shmget(2, length,
- SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W)) < 0) {
- int err = errno;
- char errtxt[256] = "";
- strerror_r(err, errtxt, sizeof(errtxt));
- logprintf(12, "Log: failed to allocate shared mem object - err %d (%s)\n",
- err, errtxt);
- goto hugepage_failover;
- }
+ // Determine optimal memory allocation path.
+ bool prefer_hugepages = false;
+ bool prefer_posix_shm = false;
+ bool prefer_dynamic_mapping = false;
- shmaddr = shmat(shmid, NULL, NULL);
- if (shmaddr == reinterpret_cast<void*>(-1)) {
- int err = errno;
- char errtxt[256] = "";
- strerror_r(err, errtxt, sizeof(errtxt));
- logprintf(0, "Log: failed to attach shared mem object - err %d (%s).\n",
- err, errtxt);
- if (shmctl(shmid, IPC_RMID, NULL) < 0) {
+ // Are there enough hugepages?
+ int64 hugepagesize = FindHugePages() * 2 * kMegabyte;
+ // TODO(nsanders): Is there enough /dev/shm? Is there enough free memory?
+ if ((length >= 1400LL * kMegabyte) && (address_mode_ == 32)) {
+ prefer_dynamic_mapping = true;
+ prefer_posix_shm = true;
+ logprintf(3, "Log: Prefer POSIX shared memory allocation.\n");
+ logprintf(3, "Log: You may need to run "
+ "'sudo mount -o remount,size=100\% /dev/shm.'\n");
+ } else if (hugepagesize >= length) {
+ prefer_hugepages = true;
+ logprintf(3, "Log: Prefer using hugepace allocation.\n");
+ } else {
+ logprintf(3, "Log: Prefer plain malloc memory allocation.\n");
+ }
+
+ // Allocate hugepage mapped memory.
+ if (prefer_hugepages) {
+ do { // Allow break statement.
+ int shmid;
+ void *shmaddr;
+
+ if ((shmid = shmget(2, length,
+ SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W)) < 0) {
int err = errno;
- char errtxt[256] = "";
- strerror_r(err, errtxt, sizeof(errtxt));
- logprintf(0, "Log: failed to remove shared mem object - err %d (%s).\n",
- err, errtxt);
+ string errtxt = ErrorString(err);
+ logprintf(3, "Log: failed to allocate shared hugepage "
+ "object - err %d (%s)\n",
+ err, errtxt.c_str());
+ logprintf(3, "Log: sysctl -w vm.nr_hugepages=XXX allows hugepages.\n");
+ break;
}
- goto hugepage_failover;
- }
- use_hugepages_ = true;
- shmid_ = shmid;
- buf = shmaddr;
- logprintf(0, "Log: Using hugepages 0x%x at %p.\n", shmid, shmaddr);
+
+ shmaddr = shmat(shmid, NULL, NULL);
+ if (shmaddr == reinterpret_cast<void*>(-1)) {
+ int err = errno;
+ string errtxt = ErrorString(err);
+ logprintf(0, "Log: failed to attach shared "
+ "hugepage object - err %d (%s).\n",
+ err, errtxt.c_str());
+ if (shmctl(shmid, IPC_RMID, NULL) < 0) {
+ int err = errno;
+ string errtxt = ErrorString(err);
+ logprintf(0, "Log: failed to remove shared "
+ "hugepage object - err %d (%s).\n",
+ err, errtxt.c_str());
+ }
+ break;
+ }
+ use_hugepages_ = true;
+ shmid_ = shmid;
+ buf = shmaddr;
+ logprintf(0, "Log: Using shared hugepage object 0x%x at %p.\n",
+ shmid, shmaddr);
+ } while (0);
}
- hugepage_failover:
+ if ((!use_hugepages_) && prefer_posix_shm) {
+ do {
+ int shm_object;
+ void *shmaddr = NULL;
+
+ shm_object = shm_open("/stressapptest", O_CREAT | O_RDWR, S_IRWXU);
+ if (shm_object < 0) {
+ int err = errno;
+ string errtxt = ErrorString(err);
+ logprintf(3, "Log: failed to allocate shared "
+ "smallpage object - err %d (%s)\n",
+ err, errtxt.c_str());
+ break;
+ }
+
+ if (0 > ftruncate(shm_object, length)) {
+ int err = errno;
+ string errtxt = ErrorString(err);
+ logprintf(3, "Log: failed to ftruncate shared "
+ "smallpage object - err %d (%s)\n",
+ err, errtxt.c_str());
+ break;
+ }
+
+ // 32 bit linux apps can only use ~1.4G of address space.
+ // Use dynamic mapping for allocations larger than that.
+ // Currently perf hit is ~10% for this.
+ if (prefer_dynamic_mapping) {
+ dynamic_mapped_shmem_ = true;
+ } else {
+ // Do a full mapping here otherwise.
+ shmaddr = mmap64(NULL, length, PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_NORESERVE | MAP_LOCKED | MAP_POPULATE,
+ shm_object, NULL);
+ if (shmaddr == reinterpret_cast<void*>(-1)) {
+ int err = errno;
+ string errtxt = ErrorString(err);
+ logprintf(0, "Log: failed to map shared "
+ "smallpage object - err %d (%s).\n",
+ err, errtxt.c_str());
+ break;
+ }
+ }
+
+ use_posix_shm_ = true;
+ shmid_ = shm_object;
+ buf = shmaddr;
+ char location_message[256] = "";
+ if (dynamic_mapped_shmem_) {
+ sprintf(location_message, "mapped as needed");
+ } else {
+ sprintf(location_message, "at %p", shmaddr);
+ }
+ logprintf(0, "Log: Using posix shared memory object 0x%x %s.\n",
+ shm_object, location_message);
+ } while (0);
+ shm_unlink("/stressapptest");
+ }
- if (!use_hugepages_) {
+ if (!use_hugepages_ && !use_posix_shm_) {
// Use memalign to ensure that blocks are aligned enough for disk direct IO.
buf = static_cast<char*>(memalign(4096, length));
- if (buf)
+ if (buf) {
logprintf(0, "Log: Using memaligned allocation at %p.\n", buf);
- else
+ } else {
logprintf(0, "Process Error: memalign returned 0\n");
+ if ((length >= 1499LL * kMegabyte) && (address_mode_ == 32)) {
+ logprintf(0, "Log: You are trying to allocate > 1.4G on a 32 "
+ "bit process. Please setup shared memory.\n");
+ }
+ }
}
testmem_ = buf;
- if (buf) {
+ if (buf || dynamic_mapped_shmem_) {
testmemsize_ = length;
} else {
testmemsize_ = 0;
}
- return (buf != 0);
+ return (buf != 0) || dynamic_mapped_shmem_;
}
// Free the test memory.
@@ -384,6 +526,11 @@ void OsLayer::FreeTestMem() {
if (use_hugepages_) {
shmdt(testmem_);
shmctl(shmid_, IPC_RMID, NULL);
+ } else if (use_posix_shm_) {
+ if (!dynamic_mapped_shmem_) {
+ munmap(testmem_, testmemsize_);
+ }
+ close(shmid_);
} else {
free(testmem_);
}
@@ -396,11 +543,37 @@ void OsLayer::FreeTestMem() {
// Prepare the target memory. It may require mapping in, or this may be a noop.
void *OsLayer::PrepareTestMem(uint64 offset, uint64 length) {
sat_assert((offset + length) <= testmemsize_);
+ if (dynamic_mapped_shmem_) {
+ // TODO(nsanders): Check if we can support MAP_NONBLOCK,
+ // and evaluate performance hit from not using it.
+ void * mapping = mmap64(NULL, length, PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_NORESERVE | MAP_LOCKED | MAP_POPULATE,
+ shmid_, offset);
+ if (mapping == MAP_FAILED) {
+ string errtxt = ErrorString(errno);
+ logprintf(0, "Process Error: PrepareTestMem mmap64(%llx, %llx) failed. "
+ "error: %s.\n",
+ offset, length, errtxt.c_str());
+ sat_assert(0);
+ }
+ return mapping;
+ }
+
return reinterpret_cast<void*>(reinterpret_cast<char*>(testmem_) + offset);
}
// Release the test memory resources, if any.
void OsLayer::ReleaseTestMem(void *addr, uint64 offset, uint64 length) {
+ if (dynamic_mapped_shmem_) {
+ int retval = munmap(addr, length);
+ if (retval == -1) {
+ string errtxt = ErrorString(errno);
+ logprintf(0, "Process Error: ReleaseTestMem munmap(%p, %llx) failed. "
+ "error: %s.\n",
+ addr, length, errtxt.c_str());
+ sat_assert(0);
+ }
+ }
}
// No error polling on unknown systems.
@@ -453,7 +626,7 @@ uint32 OsLayer::PciRead(int fd, uint32 offset, int width) {
logprintf(0, "Process Error: Can't seek %x\n", offset);
return 0;
}
- if (read(fd, &datacast, size) != size) {
+ if (read(fd, &datacast, size) != static_cast<ssize_t>(size)) {
logprintf(0, "Process Error: Can't read %x\n", offset);
return 0;
}
@@ -502,7 +675,7 @@ void OsLayer::PciWrite(int fd, uint32 offset, uint32 value, int width) {
logprintf(0, "Process Error: Can't seek %x\n", offset);
return;
}
- if (write(fd, &datacast, size) != size) {
+ if (write(fd, &datacast, size) != static_cast<ssize_t>(size)) {
logprintf(0, "Process Error: Can't write %x to %x\n", datacast.l32, offset);
return;
}
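The new GetFeatures() above keys everything off CPUID leaf 1: EDX bit 19 is the clflush flag and EDX bit 26 is SSE2, and those two bits now gate OsLayer::Flush() and the asm Adler memcpy at runtime. The same probe as a standalone sketch, assuming a GCC-compatible compiler on x86/x86_64:

// Runtime CPUID probe mirroring OsLayer::GetFeatures().
// EDX bit 19 = clflush, EDX bit 26 = SSE2.
static void ProbeFeatures(bool *has_clflush, bool *has_sse2) {
  int ax, bx, cx, dx;
  __asm__ __volatile__("cpuid"
                       : "=a"(ax), "=b"(bx), "=c"(cx), "=d"(dx)
                       : "a"(1));
  *has_clflush = (dx >> 19) & 1;
  *has_sse2 = (dx >> 26) & 1;
}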
diff --git a/src/os.h b/src/os.h
index 9ed04d5..28c8a2a 100644
--- a/src/os.h
+++ b/src/os.h
@@ -125,6 +125,8 @@ class OsLayer {
asm volatile("mfence");
asm volatile("clflush (%0)" :: "r" (vaddr));
asm volatile("mfence");
+#elif defined(STRESSAPPTEST_CPU_ARMV7A)
+ #warning "Unsupported CPU type ARMV7A: Unable to force cache flushes."
#else
#warning "Unsupported CPU type: Unable to force cache flushes."
#endif
@@ -152,6 +154,9 @@ class OsLayer {
datacast_t data;
__asm __volatile("rdtsc" : "=a" (data.l32.l), "=d"(data.l32.h));
tsc = data.l64;
+#elif defined(STRESSAPPTEST_CPU_ARMV7A)
+ #warning "Unsupported CPU type ARMV7A: your build may not function correctly"
+ tsc = 0;
#else
#warning "Unsupported CPU type: your build may not function correctly"
tsc = 0;
@@ -181,6 +186,8 @@ class OsLayer {
// Returns 32 for 32-bit, 64 for 64-bit.
virtual int AddressMode();
+ // Update OsLayer state regarding cpu support for various features.
+ virtual void GetFeatures();
// Open, read, write pci cfg through /proc/bus/pci. fd is /proc/pci file.
virtual int PciOpen(int bus, int device, int function);
@@ -217,12 +224,10 @@ class OsLayer {
// Detect all PCI Devices.
virtual PCIDevices GetPCIDevices();
- // Default platform dependent warm Adler memcpy to C implementation
- // for compatibility.
+ // Disambiguate between different "warm" memcopies.
virtual bool AdlerMemcpyWarm(uint64 *dstmem, uint64 *srcmem,
unsigned int size_in_bytes,
- AdlerChecksum *checksum)
- {return AdlerMemcpyWarmC(dstmem, srcmem, size_in_bytes, checksum);}
+ AdlerChecksum *checksum);
// Store a callback to use to print
// app-specific info about the last error location.
@@ -237,12 +242,14 @@ class OsLayer {
protected:
void *testmem_; // Location of test memory.
- int64 testmemsize_; // Size of test memory.
+ uint64 testmemsize_; // Size of test memory.
int64 totalmemsize_; // Size of available memory.
int64 min_hugepages_bytes_; // Minimum hugepages size.
bool error_injection_; // Do error injection?
bool normal_mem_; // Memory DMA capable?
bool use_hugepages_; // Use hugepage shmem?
+ bool use_posix_shm_; // Use 4k page shmem?
+ bool dynamic_mapped_shmem_; // Conserve virtual address space.
int shmid_; // Handle to shmem
int64 regionsize_; // Size of memory "regions"
@@ -250,6 +257,10 @@ class OsLayer {
int num_cpus_; // Number of cpus in the system.
int num_nodes_; // Number of nodes in the system.
int num_cpus_per_node_; // Number of cpus per node in the system.
+ int address_mode_; // Are we running 32 or 64 bit?
+ bool has_sse2_; // Do we have sse2 instructions?
+ bool has_clflush_; // Do we have clflush instructions?
+
time_t time_initialized_; // Start time of test.
diff --git a/src/pattern.cc b/src/pattern.cc
index 2fb552a..9f22674 100644
--- a/src/pattern.cc
+++ b/src/pattern.cc
@@ -393,7 +393,7 @@ int PatternList::Destroy() {
// Return pattern numbered "i"
Pattern *PatternList::GetPattern(int i) {
- if (i < size_) {
+ if (static_cast<unsigned int>(i) < size_) {
return &patterns_[i];
}
diff --git a/src/pattern.h b/src/pattern.h
index b1168aa..181f839 100644
--- a/src/pattern.h
+++ b/src/pattern.h
@@ -102,7 +102,7 @@ class PatternList {
private:
vector<class Pattern> patterns_;
int weightcount_; // Total count of pattern weights.
- int size_;
+ unsigned int size_;
int initialized_;
DISALLOW_COPY_AND_ASSIGN(PatternList);
};
diff --git a/src/sat.cc b/src/sat.cc
index 06b4c65..bed62b7 100644
--- a/src/sat.cc
+++ b/src/sat.cc
@@ -164,26 +164,6 @@ bool Sat::CheckEnvironment() {
return false;
}
- if ((address_mode_ == 32) &&
- (os_->normal_mem()) &&
- (size_ >= 1499 * kMegabyte)) {
- if (run_on_anything_) {
- int64 new_size_mb = 1499;
- logprintf(1, "Log: 32 bit binary: reducing from %lldMB to %lldMB\n",
- size_mb_,
- new_size_mb);
- size_mb_ = new_size_mb;
- size_ = size_mb_ * kMegabyte;
- } else {
- logprintf(0, "Process Error: %dMB test memory too large "
- "for 32 bit binary.\n",
- static_cast<int>(size_ / kMegabyte));
- logprintf(0, "Log: Command line option '-A' bypasses this error.\n");
- bad_status();
- return false;
- }
- }
-
// If platform is 32 bit Xeon, floor memory size to multiple of 4.
if (address_mode_ == 32) {
size_mb_ = (size_mb_ / 4) * 4;
@@ -350,7 +330,7 @@ void Sat::AddrMapUpdate(struct page_entry *pe) {
for (int i = 0; i < page_length_; i += 4096) {
uint64 paddr = os_->VirtualToPhysical(base + i);
- int offset = paddr / 4096 / 8;
+ uint32 offset = paddr / 4096 / 8;
unsigned char mask = 1 << ((paddr / 4096) % 8);
if (offset >= arraysize) {
@@ -969,7 +949,8 @@ bool Sat::ParseArgs(int argc, char **argv) {
}
// Set disk_pages_ if filesize or page size changed.
- if (filesize != page_length_ * disk_pages_) {
+ if (filesize != static_cast<uint64>(page_length_) *
+ static_cast<uint64>(disk_pages_)) {
disk_pages_ = filesize / page_length_;
if (disk_pages_ == 0)
disk_pages_ = 1;
@@ -1014,7 +995,7 @@ void Sat::PrintHelp() {
" --force_errors_like_crazy inject a lot of false errors "
"to test error handling\n"
" -F don't result check each transaction\n"
- "--stop_on_errors Stop after finding the first error.\n"
+ " --stop_on_errors Stop after finding the first error.\n"
" --read-block-size size of block for reading (-d)\n"
" --write-block-size size of block for writing (-d). If not "
"defined, the size of block for writing will be defined as the "
@@ -1041,7 +1022,7 @@ void Sat::PrintHelp() {
" --pause_duration duration (in seconds) of each pause\n"
" --local_numa : choose memory regions associated with "
"each CPU to be tested by that CPU\n"
- "--remote_numa : choose memory regions not associated with "
+ " --remote_numa : choose memory regions not associated with "
"each CPU to be tested by that CPU\n");
}
@@ -1850,7 +1831,7 @@ bool Sat::Cleanup() {
delete[] page_bitmap_;
}
- for (int i = 0; i < blocktables_.size(); i++) {
+ for (size_t i = 0; i < blocktables_.size(); i++) {
delete blocktables_[i];
}
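The AddrMapUpdate hunk above operates on a flat bitmap with one bit per 4096-byte physical page: the byte index is paddr / 4096 / 8 and the bit within that byte is (paddr / 4096) % 8. A minimal sketch of that bookkeeping (hypothetical helper):

// Sketch: mark a physical page in a one-bit-per-4KiB-page bitmap.
#include <stdint.h>

static inline void MarkPage(unsigned char *bitmap, uint64_t paddr) {
  uint64_t page = paddr / 4096;  // Which 4 KiB page the address is in.
  bitmap[page / 8] |= static_cast<unsigned char>(1u << (page % 8));
}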
diff --git a/src/sat.h b/src/sat.h
index 950270f..b48f519 100644
--- a/src/sat.h
+++ b/src/sat.h
@@ -164,7 +164,7 @@ class Sat {
bool error_injection_; // Simulate errors, for unittests.
bool crazy_error_injection_; // Simulate lots of errors.
- int64 max_errorcount_; // Number of errors before forced exit.
+ uint64 max_errorcount_; // Number of errors before forced exit.
int run_on_anything_; // Ignore unknown machine error.
int use_logfile_; // Log to a file.
char logfilename_[255]; // Name of file to log to.
diff --git a/src/stressapptest_config.h.in b/src/stressapptest_config.h.in
index 535bb34..b78857c 100644
--- a/src/stressapptest_config.h.in
+++ b/src/stressapptest_config.h.in
@@ -148,6 +148,9 @@
/* Define to 1 if strerror_r returns char *. */
#undef STRERROR_R_CHAR_P
+/* Defined if the target CPU is armv7a */
+#undef STRESSAPPTEST_CPU_ARMV7A
+
/* Defined if the target CPU is i686 */
#undef STRESSAPPTEST_CPU_I686
diff --git a/src/worker.cc b/src/worker.cc
index c568064..2fab28e 100644
--- a/src/worker.cc
+++ b/src/worker.cc
@@ -86,6 +86,9 @@ namespace {
int cpu;
#if defined(STRESSAPPTEST_CPU_X86_64) || defined(STRESSAPPTEST_CPU_I686)
__asm __volatile("cpuid" : "=b" (cpu) : "a" (1) : "cx", "dx");
+#elif defined(STRESSAPPTEST_CPU_ARMV7A)
+ #warning "Unsupported CPU type ARMV7A: unable to determine core ID."
+ cpu = 0;
#else
#warning "Unsupported CPU type: unable to determine core ID."
cpu = 0;
@@ -1953,7 +1956,6 @@ bool FileThread::Work() {
}
pages_copied_ = loops * sat_->disk_pages();
- status_ = result;
// Clean up.
CloseFile(fd);
@@ -1961,7 +1963,10 @@ bool FileThread::Work() {
logprintf(9, "Log: Completed %d: file thread status %d, %d pages copied\n",
thread_num_, status_, pages_copied_);
- return result;
+ // Failure to read from device indicates hardware,
+ // rather than procedural SW error.
+ status_ = true;
+ return true;
}
bool NetworkThread::IsNetworkStopSet() {
@@ -2259,7 +2264,7 @@ bool NetworkListenThread::ReapSlaves() {
// Gather status and reap threads.
logprintf(12, "Log: Joining all outstanding threads\n");
- for (int i = 0; i < child_workers_.size(); i++) {
+ for (size_t i = 0; i < child_workers_.size(); i++) {
NetworkSlaveThread& child_thread = child_workers_[i]->thread;
logprintf(12, "Log: Joining slave thread %d\n", i);
child_thread.JoinThread();
@@ -2689,7 +2694,7 @@ bool DiskThread::GetDiskSize(int fd) {
return false;
}
- // If an Elephant is initialized with status DEAD its size will be zero.
+ // Zero size indicates a nonworking device.
if (block_size == 0) {
os_->ErrorReport(device_name_.c_str(), "device-size-zero", 1);
++errorcount_;
@@ -2734,11 +2739,11 @@ int64 DiskThread::GetTime() {
}
// Do randomized reads and (possibly) writes on a device.
-// Return false on fatal error, either SW or HW.
+// Return false on fatal SW error, true on SW success,
+// regardless of whether HW failed.
bool DiskThread::DoWork(int fd) {
int64 block_num = 0;
int64 num_segments;
- bool result = true;
if (segment_size_ == -1) {
num_segments = 1;
@@ -2775,7 +2780,8 @@ bool DiskThread::DoWork(int fd) {
non_destructive_ ? "(disabled) " : "",
device_name_.c_str(), thread_num_);
while (IsReadyToRunNoPause() &&
- in_flight_sectors_.size() < queue_size_ + 1) {
+ in_flight_sectors_.size() <
+ static_cast<size_t>(queue_size_ + 1)) {
// Confine testing to a particular segment of the disk.
int64 segment = (block_num / blocks_per_segment_) % num_segments;
if (!non_destructive_ &&
@@ -2810,7 +2816,7 @@ bool DiskThread::DoWork(int fd) {
if (!non_destructive_) {
if (!WriteBlockToDisk(fd, block)) {
block_table_->RemoveBlock(block);
- return false;
+ return true;
}
blocks_written_++;
}
@@ -2829,14 +2835,14 @@ bool DiskThread::DoWork(int fd) {
BlockData *block = in_flight_sectors_.front();
in_flight_sectors_.pop();
if (!ValidateBlockOnDisk(fd, block))
- return false;
+ return true;
block_table_->RemoveBlock(block);
blocks_read_++;
}
}
pages_copied_ = blocks_written_ + blocks_read_;
- return result;
+ return true;
}
// Do an asynchronous disk I/O operation.
@@ -2923,7 +2929,7 @@ bool DiskThread::AsyncDiskIO(IoOp op, int fd, void *buf, int64 size,
// event.res contains the number of bytes written/read or
// error if < 0, I think.
- if (event.res != size) {
+ if (event.res != static_cast<uint64>(size)) {
errorcount_++;
os_->ErrorReport(device_name_.c_str(), operations[op].error_str, 1);