diff options
author | nick.j.sanders <nick.j.sanders@93e54ea4-8218-11de-8aaf-8d8425684b44> | 2010-05-14 03:47:11 +0000 |
---|---|---|
committer | nick.j.sanders <nick.j.sanders@93e54ea4-8218-11de-8aaf-8d8425684b44> | 2010-05-14 03:47:11 +0000 |
commit | 6d1e64db329883e43dbca06471c093fc23dc9a2e (patch) | |
tree | f5c51b3fc301cfdee87aa54dd5a92bb16854b599 /src | |
parent | eea0aac32a3f522ea51d389f44dcd8abcfc5a6e0 (diff) | |
download | stressapptest-6d1e64db329883e43dbca06471c093fc23dc9a2e.tar.gz |
Update stressapptest to 1.0.3.
* /dev/shm support to allow more than 1.4 GB of memory usage for 32-bit apps.
* Some ARM support.
* x86 SSE support.
Diffstat (limited to 'src')
-rw-r--r-- | src/adler32memcpy.cc | 124 | ||||
-rw-r--r-- | src/disk_blocks.h | 2 | ||||
-rw-r--r-- | src/finelock_queue.cc | 12 | ||||
-rw-r--r-- | src/finelock_queue.h | 6 | ||||
-rw-r--r-- | src/logger.cc | 5 | ||||
-rw-r--r-- | src/logger.h | 2 | ||||
-rw-r--r-- | src/os.cc | 259 | ||||
-rw-r--r-- | src/os.h | 21 | ||||
-rw-r--r-- | src/pattern.cc | 2 | ||||
-rw-r--r-- | src/pattern.h | 2 | ||||
-rw-r--r-- | src/sat.cc | 31 | ||||
-rw-r--r-- | src/sat.h | 2 | ||||
-rw-r--r-- | src/stressapptest_config.h.in | 3 | ||||
-rw-r--r-- | src/worker.cc | 28 |
14 files changed, 351 insertions, 148 deletions
diff --git a/src/adler32memcpy.cc b/src/adler32memcpy.cc index 529dcc4..69324f7 100644 --- a/src/adler32memcpy.cc +++ b/src/adler32memcpy.cc @@ -225,19 +225,41 @@ bool AdlerMemcpyWarmC(uint64 *dstmem64, uint64 *srcmem64, // x86_64 SSE2 assembly implementation of fast and stressful Adler memory copy. bool AdlerMemcpyAsm(uint64 *dstmem64, uint64 *srcmem64, unsigned int size_in_bytes, AdlerChecksum *checksum) { -// Use assembly implementation only with 64bit compilation. -#ifndef STRESSAPPTEST_CPU_X86_64 - // Fall back to C implementation for 32bit compilation. - return AdlerMemcpyWarmC(dstmem64, srcmem64, size_in_bytes, checksum); -#else +// Use assembly implementation where supported. +#if defined(STRESSAPPTEST_CPU_X86_64) || defined(STRESSAPPTEST_CPU_I686) + +// Pull a bit of tricky preprocessing to make the inline asm both +// 32 bit and 64 bit. +#ifdef STRESSAPPTEST_CPU_I686 // Instead of coding both, x86... +#define rAX "%%eax" +#define rCX "%%ecx" +#define rDX "%%edx" +#define rBX "%%ebx" +#define rSP "%%esp" +#define rBP "%%ebp" +#define rSI "%%esi" +#define rDI "%%edi" +#endif + +#ifdef STRESSAPPTEST_CPU_X86_64 // ...and x64, we use rXX macros. +#define rAX "%%rax" +#define rCX "%%rcx" +#define rDX "%%rdx" +#define rBX "%%rbx" +#define rSP "%%rsp" +#define rBP "%%rbp" +#define rSI "%%rsi" +#define rDI "%%rdi" +#endif + // Elements 0 to 3 are used for holding checksum terms a1, a2, // b1, b2 respectively. These elements are filled by asm code. // Elements 4 and 5 are used by asm code to for ANDing MMX data and removing // 2 words from each MMX register (A MMX reg has 4 words, by ANDing we are // setting word index 0 and word index 2 to zero). // Element 6 and 7 are used for setting a1 and a2 to 1. 
- volatile uint64 checksum_arr[] = {0, 0, 0, 0, - 0x00000000ffffffffUL, 0x00000000ffffffffUL, 1, 1}; + volatile uint64 checksum_arr[] __attribute__ ((aligned(16))) = + {0, 0, 0, 0, 0x00000000ffffffffUL, 0x00000000ffffffffUL, 1, 1}; if ((size_in_bytes >> 19) > 0) { // Size is too large. Must be less than 2^19 bytes = 512 KB. @@ -245,23 +267,24 @@ bool AdlerMemcpyAsm(uint64 *dstmem64, uint64 *srcmem64, } // Number of 32-bit words which are not added to a1/a2 in the main loop. - uint64 remaining_words = (size_in_bytes % 48) / 4; + uint32 remaining_words = (size_in_bytes % 48) / 4; // Since we are moving 48 bytes at a time number of iterations = total size/48 // is value of counter. - uint64 num_of_48_byte_units = size_in_bytes / 48; + uint32 num_of_48_byte_units = size_in_bytes / 48; - asm volatile( + asm volatile ( // Source address is in ESI (extended source index) // destination is in EDI (extended destination index) - // and counter is already in ECX (extended counter index). - "cmp $0, %%ecx;" // Compare counter to zero. + // and counter is already in ECX (extended counter + // index). + "cmp $0, " rCX ";" // Compare counter to zero. "jz END;" // XMM6 is initialized with 1 and XMM7 with 0. - "prefetchnta 0(%%rsi);" - "prefetchnta 64(%%rsi);" - "movdqu 48(%%rax), %%xmm6;" + "prefetchnta 0(" rSI ");" + "prefetchnta 64(" rSI ");" + "movdqu 48(" rAX "), %%xmm6;" "xorps %%xmm7, %%xmm7;" // Start of the loop which copies 48 bytes from source to dst each time. @@ -269,28 +292,28 @@ bool AdlerMemcpyAsm(uint64 *dstmem64, uint64 *srcmem64, // Make 6 moves each of 16 bytes from srcmem to XMM registers. // We are using 2 words out of 4 words in each XMM register, - // word index 0 and word index 2) - "movdqa 0(%%rsi), %%xmm0;" - "movdqu 4(%%rsi), %%xmm1;" // Be careful to use unaligned move here. 
- "movdqa 16(%%rsi), %%xmm2;" - "movdqu 20(%%rsi), %%xmm3;" - "movdqa 32(%%rsi), %%xmm4;" - "movdqu 36(%%rsi), %%xmm5;" + // word index 0 and word index 2 + "movdqa 0(" rSI "), %%xmm0;" + "movdqu 4(" rSI "), %%xmm1;" // Be careful to use unaligned move here. + "movdqa 16(" rSI "), %%xmm2;" + "movdqu 20(" rSI "), %%xmm3;" + "movdqa 32(" rSI "), %%xmm4;" + "movdqu 36(" rSI "), %%xmm5;" // Move 3 * 16 bytes from XMM registers to dstmem. // Note: this copy must be performed before pinsrw instructions since // they will modify the XMM registers. - "movntdq %%xmm0, 0(%%rdi);" - "movntdq %%xmm2, 16(%%rdi);" - "movntdq %%xmm4, 32(%%rdi);" + "movntdq %%xmm0, 0(" rDI ");" + "movntdq %%xmm2, 16(" rDI ");" + "movntdq %%xmm4, 32(" rDI ");" // Sets the word[1] and word[3] of XMM0 to XMM5 to zero. - "andps 32(%%rax), %%xmm0;" - "andps 32(%%rax), %%xmm1;" - "andps 32(%%rax), %%xmm2;" - "andps 32(%%rax), %%xmm3;" - "andps 32(%%rax), %%xmm4;" - "andps 32(%%rax), %%xmm5;" + "andps 32(" rAX "), %%xmm0;" + "andps 32(" rAX "), %%xmm1;" + "andps 32(" rAX "), %%xmm2;" + "andps 32(" rAX "), %%xmm3;" + "andps 32(" rAX "), %%xmm4;" + "andps 32(" rAX "), %%xmm5;" // Add XMM0 to XMM6 and then add XMM6 to XMM7. // Repeat this for XMM1, ..., XMM5. @@ -311,43 +334,43 @@ bool AdlerMemcpyAsm(uint64 *dstmem64, uint64 *srcmem64, "paddq %%xmm6, %%xmm7;" // Increment ESI and EDI by 48 bytes and decrement counter by 1. - "add $48, %%rsi;" - "add $48, %%rdi;" - "prefetchnta 0(%%rsi);" - "prefetchnta 64(%%rsi);" - "dec %%rcx;" + "add $48, " rSI ";" + "add $48, " rDI ";" + "prefetchnta 0(" rSI ");" + "prefetchnta 64(" rSI ");" + "dec " rCX ";" "jnz TOP;" // Now only remaining_words 32-bit words are left. // make a loop, add first two words to a1 and next two to a2 (just like // above loop, the only extra thing we are doing is rechecking - // %rdx (=remaining_words) everytime we add a number to a1/a2. + // rDX (=remaining_words) everytime we add a number to a1/a2. 
"REM_IS_STILL_NOT_ZERO:\n" // Unless remaining_words becomes less than 4 words(16 bytes) // there is not much issue and remaining_words will always // be a multiple of four by assumption. - "cmp $4, %%rdx;" + "cmp $4, " rDX ";" // In case for some weird reasons if remaining_words becomes // less than 4 but not zero then also break the code and go off to END. "jl END;" // Otherwise just go on and copy data in chunks of 4-words at a time till // whole data (<48 bytes) is copied. - "movdqa 0(%%rsi), %%xmm0;" // Copy next 4-words to XMM0 and to XMM1. + "movdqa 0(" rSI "), %%xmm0;" // Copy next 4-words to XMM0 and to XMM1. - "movdqa 0(%%rsi), %%xmm5;" // Accomplish movdqu 4(%%rsi) without + "movdqa 0(" rSI "), %%xmm5;" // Accomplish movdqu 4(%rSI) without "pshufd $0x39, %%xmm5, %%xmm1;" // indexing off memory boundary. - "movntdq %%xmm0, 0(%%rdi);" // Copy 4-words to destination. - "andps 32(%%rax), %%xmm0;" - "andps 32(%%rax), %%xmm1;" + "movntdq %%xmm0, 0(" rDI ");" // Copy 4-words to destination. + "andps 32(" rAX "), %%xmm0;" + "andps 32(" rAX "), %%xmm1;" "paddq %%xmm0, %%xmm6;" "paddq %%xmm6, %%xmm7;" "paddq %%xmm1, %%xmm6;" "paddq %%xmm6, %%xmm7;" - "add $16, %%rsi;" - "add $16, %%rdi;" - "sub $4, %%rdx;" - // Decrement %%rdx by 4 since %%rdx is number of 32-bit + "add $16, " rSI ";" + "add $16, " rDI ";" + "sub $4, " rDX ";" + // Decrement %rDX by 4 since %rDX is number of 32-bit // words left after considering all 48-byte units. "jmp REM_IS_STILL_NOT_ZERO;" @@ -356,8 +379,8 @@ bool AdlerMemcpyAsm(uint64 *dstmem64, uint64 *srcmem64, // 64 bit numbers and have to be converted to 64 bit numbers) // seems like Adler128 (since size of each part is 4 byte rather than // 1 byte). - "movdqa %%xmm6, 0(%%rax);" - "movdqa %%xmm7, 16(%%rax);" + "movdqa %%xmm6, 0(" rAX ");" + "movdqa %%xmm7, 16(" rAX ");" "sfence;" // No output registers. 
@@ -376,5 +399,8 @@ bool AdlerMemcpyAsm(uint64 *dstmem64, uint64 *srcmem64, // that there is no problem with memory this just mean that data was copied // from src to dst and checksum was calculated successfully). return true; +#else + // Fall back to C implementation for anything else. + return AdlerMemcpyWarmC(dstmem64, srcmem64, size_in_bytes, checksum); #endif } diff --git a/src/disk_blocks.h b/src/disk_blocks.h index f4ca93f..cb634c9 100644 --- a/src/disk_blocks.h +++ b/src/disk_blocks.h @@ -100,7 +100,7 @@ class DiskBlockTable { typedef vector<int64> PosToAddrVector; PosToAddrVector pos_to_addr_; AddrToBlockMap addr_to_block_; - int64 nelems_; + uint64 nelems_; int sector_size_; // Sector size, in bytes int write_block_size_; // Block size, in bytes string device_name_; // Device name diff --git a/src/finelock_queue.cc b/src/finelock_queue.cc index 569903a..8d914b8 100644 --- a/src/finelock_queue.cc +++ b/src/finelock_queue.cc @@ -45,7 +45,7 @@ FineLockPEQueue::FineLockPEQueue( queue_metric_ = kTouch; { // Init all the page locks. - for (int64 i = 0; i < q_size_; i++) { + for (uint64 i = 0; i < q_size_; i++) { pthread_mutex_init(&(pagelocks_[i]), NULL); // Pages start out owned (locked) by Sat::InitializePages. // A locked state indicates that the page state is unknown, @@ -147,7 +147,7 @@ int64 FineLockPEQueue::getC(int64 m) { // Destructor: Clean-up allocated memory and destroy pthread locks. FineLockPEQueue::~FineLockPEQueue() { - int64 i; + uint64 i; for (i = 0; i < q_size_; i++) pthread_mutex_destroy(&(pagelocks_[i])); delete[] pagelocks_; @@ -173,11 +173,11 @@ bool FineLockPEQueue::QueueAnalysis() { } // Bucketize the page counts by highest bit set. 
- for (int64 i = 0; i < q_size_; i++) { + for (uint64 i = 0; i < q_size_; i++) { uint32 readcount = pages_[i].touch; int b = 0; for (b = 0; b < 31; b++) { - if (readcount < (1 << b)) + if (readcount < (1u << b)) break; } @@ -271,7 +271,7 @@ bool FineLockPEQueue::GetPageFromPhysical(uint64 paddr, struct page_entry *pe) { // Traverse through array until finding a page // that contains the address we want.. - for (int64 i = 0; i < q_size_; i++) { + for (uint64 i = 0; i < q_size_; i++) { uint64 page_addr = pages_[i].paddr; // This assumes linear vaddr. if ((page_addr <= paddr) && (page_addr + page_size_ > paddr)) { @@ -335,7 +335,7 @@ bool FineLockPEQueue::GetRandomWithPredicateTag(struct page_entry *pe, uint64 next_try = 1; // Traverse through array until finding a page meeting given predicate. - for (int64 i = 0; i < q_size_; i++) { + for (uint64 i = 0; i < q_size_; i++) { uint64 index = (next_try + first_try) % q_size_; // Go through the loop linear conguentially. We are offsetting by // 'first_try' so this path will be a different sequence for every diff --git a/src/finelock_queue.h b/src/finelock_queue.h index 54b154e..2de5a46 100644 --- a/src/finelock_queue.h +++ b/src/finelock_queue.h @@ -57,7 +57,9 @@ class FineLockPEQueue { uint64 GetRandom64FromSlot(int slot); // Helper function to check index range, returns true if index is valid. - bool valid_index(int64 index) { return index >= 0 && index < q_size_; } + bool valid_index(int64 index) { + return index >= 0 && static_cast<uint64>(index) < q_size_; + } // Returns true if page entry is valid, false otherwise. static bool page_is_valid(struct page_entry *pe) { @@ -85,7 +87,7 @@ class FineLockPEQueue { pthread_mutex_t *pagelocks_; // Per-page-entry locks. struct page_entry *pages_; // Where page entries are held. - int64 q_size_; // Size of the queue. + uint64 q_size_; // Size of the queue. int64 page_size_; // For calculating array index from offset. 
enum { diff --git a/src/logger.cc b/src/logger.cc index 81f1e3e..e4ecb03 100644 --- a/src/logger.cc +++ b/src/logger.cc @@ -38,7 +38,7 @@ void Logger::VLogF(int priority, const char *format, va_list args) { } char buffer[4096]; int length = vsnprintf(buffer, sizeof buffer, format, args); - if (length >= sizeof buffer) { + if (static_cast<size_t>(length) >= sizeof buffer) { length = sizeof buffer; buffer[sizeof buffer - 1] = '\n'; } @@ -96,7 +96,8 @@ void Logger::QueueLogLine(string *line) { namespace { void WriteToFile(const string& line, int fd) { - LOGGER_ASSERT(write(fd, line.data(), line.size()) == line.size()); + LOGGER_ASSERT(write(fd, line.data(), line.size()) == + static_cast<ssize_t>(line.size())); } } diff --git a/src/logger.h b/src/logger.h index 3eaea57..1d70107 100644 --- a/src/logger.h +++ b/src/logger.h @@ -28,7 +28,7 @@ // Attempts to log additional lines will block when the queue reaches this size. // Due to how the logging thread works, up to twice this many log lines may be // outstanding at any point. -static const int kMaxQueueSize = 250; +static const size_t kMaxQueueSize = 250; // This is only for use by the Logger class, do not use it elsewhere! @@ -53,8 +53,12 @@ OsLayer::OsLayer() { testmemsize_ = 0; totalmemsize_ = 0; min_hugepages_bytes_ = 0; - error_injection_ = false; normal_mem_ = true; + use_hugepages_ = false; + use_posix_shm_ = false; + dynamic_mapped_shmem_ = false; + shmid_ = 0; + time_initialized_ = 0; regionsize_ = 0; @@ -64,6 +68,13 @@ OsLayer::OsLayer() { num_cpus_per_node_ = 0; error_diagnoser_ = 0; err_log_callback_ = 0; + error_injection_ = false; + + void *pvoid = 0; + address_mode_ = sizeof(pvoid) * 8; + + has_clflush_ = false; + has_sse2_ = false; } // OsLayer cleanup. @@ -75,8 +86,9 @@ OsLayer::~OsLayer() { // OsLayer initialization. bool OsLayer::Initialize() { time_initialized_ = time(NULL); - use_hugepages_ = false; - shmid_ = 0; + // Detect asm support. 
+ GetFeatures(); + if (num_cpus_ == 0) { num_nodes_ = 1; num_cpus_ = sysconf(_SC_NPROCESSORS_ONLN); @@ -129,13 +141,53 @@ list<string> OsLayer::FindFileDevices() { return locations; } + +// Get HW core features from cpuid instruction. +void OsLayer::GetFeatures() { +#if defined(STRESSAPPTEST_CPU_X86_64) || defined(STRESSAPPTEST_CPU_I686) + // CPUID features documented at: + // http://www.sandpile.org/ia32/cpuid.htm + int ax, bx, cx, dx; + __asm__ __volatile__ ( + "cpuid": "=a" (ax), "=b" (bx), "=c" (cx), "=d" (dx) : "a" (1)); + has_clflush_ = (dx >> 19) & 1; + has_sse2_ = (dx >> 26) & 1; + + logprintf(9, "Log: has clflush: %s, has sse2: %s\n", + has_clflush_ ? "true" : "false", + has_sse2_ ? "true" : "false"); +#elif defined(STRESSAPPTEST_CPU_PPC) + // All PPC implementations have cache flush instructions. + has_clflush_ = true; +#elif defined(STRESSAPPTEST_CPU_ARMV7A) +#warning "Unsupported CPU type ARMV7A: unable to determine feature set." +#else +#warning "Unsupported CPU type: unable to determine feature set." +#endif +} + + // We need to flush the cacheline here. void OsLayer::Flush(void *vaddr) { // Use the generic flush. This function is just so we can override // this if we are so inclined. - FastFlush(vaddr); + if (has_clflush_) + FastFlush(vaddr); +} + + +// Run C or ASM copy as appropriate.. +bool OsLayer::AdlerMemcpyWarm(uint64 *dstmem, uint64 *srcmem, + unsigned int size_in_bytes, + AdlerChecksum *checksum) { + if (has_sse2_) { + return AdlerMemcpyAsm(dstmem, srcmem, size_in_bytes, checksum); + } else { + return AdlerMemcpyWarmC(dstmem, srcmem, size_in_bytes, checksum); + } } + // Translate user virtual to physical address. int OsLayer::FindDimm(uint64 addr, char *buf, int len) { char tmpbuf[256]; @@ -317,65 +369,155 @@ bool OsLayer::AllocateTestMem(int64 length, uint64 paddr_base) { // Try hugepages first. 
void *buf = 0; + sat_assert(length >= 0); + if (paddr_base) logprintf(0, "Process Error: non zero paddr_base %#llx is not supported," " ignore.\n", paddr_base); - { // Allocate hugepage mapped memory. - int shmid; - void *shmaddr; - - if ((shmid = shmget(2, length, - SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W)) < 0) { - int err = errno; - char errtxt[256] = ""; - strerror_r(err, errtxt, sizeof(errtxt)); - logprintf(12, "Log: failed to allocate shared mem object - err %d (%s)\n", - err, errtxt); - goto hugepage_failover; - } + // Determine optimal memory allocation path. + bool prefer_hugepages = false; + bool prefer_posix_shm = false; + bool prefer_dynamic_mapping = false; - shmaddr = shmat(shmid, NULL, NULL); - if (shmaddr == reinterpret_cast<void*>(-1)) { - int err = errno; - char errtxt[256] = ""; - strerror_r(err, errtxt, sizeof(errtxt)); - logprintf(0, "Log: failed to attach shared mem object - err %d (%s).\n", - err, errtxt); - if (shmctl(shmid, IPC_RMID, NULL) < 0) { + // Are there enough hugepages? + int64 hugepagesize = FindHugePages() * 2 * kMegabyte; + // TODO(nsanders): Is there enough /dev/shm? Is there enough free memeory? + if ((length >= 1400LL * kMegabyte) && (address_mode_ == 32)) { + prefer_dynamic_mapping = true; + prefer_posix_shm = true; + logprintf(3, "Log: Prefer POSIX shared memory allocation.\n"); + logprintf(3, "Log: You may need to run " + "'sudo mount -o remount,size=100\% /dev/shm.'\n"); + } else if (hugepagesize >= length) { + prefer_hugepages = true; + logprintf(3, "Log: Prefer using hugepace allocation.\n"); + } else { + logprintf(3, "Log: Prefer plain malloc memory allocation.\n"); + } + + // Allocate hugepage mapped memory. + if (prefer_hugepages) { + do { // Allow break statement. 
+ int shmid; + void *shmaddr; + + if ((shmid = shmget(2, length, + SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W)) < 0) { int err = errno; - char errtxt[256] = ""; - strerror_r(err, errtxt, sizeof(errtxt)); - logprintf(0, "Log: failed to remove shared mem object - err %d (%s).\n", - err, errtxt); + string errtxt = ErrorString(err); + logprintf(3, "Log: failed to allocate shared hugepage " + "object - err %d (%s)\n", + err, errtxt.c_str()); + logprintf(3, "Log: sysctl -w vm.nr_hugepages=XXX allows hugepages.\n"); + break; } - goto hugepage_failover; - } - use_hugepages_ = true; - shmid_ = shmid; - buf = shmaddr; - logprintf(0, "Log: Using hugepages 0x%x at %p.\n", shmid, shmaddr); + + shmaddr = shmat(shmid, NULL, NULL); + if (shmaddr == reinterpret_cast<void*>(-1)) { + int err = errno; + string errtxt = ErrorString(err); + logprintf(0, "Log: failed to attach shared " + "hugepage object - err %d (%s).\n", + err, errtxt.c_str()); + if (shmctl(shmid, IPC_RMID, NULL) < 0) { + int err = errno; + string errtxt = ErrorString(err); + logprintf(0, "Log: failed to remove shared " + "hugepage object - err %d (%s).\n", + err, errtxt.c_str()); + } + break; + } + use_hugepages_ = true; + shmid_ = shmid; + buf = shmaddr; + logprintf(0, "Log: Using shared hugepage object 0x%x at %p.\n", + shmid, shmaddr); + } while (0); } - hugepage_failover: + if ((!use_hugepages_) && prefer_posix_shm) { + do { + int shm_object; + void *shmaddr = NULL; + + shm_object = shm_open("/stressapptest", O_CREAT | O_RDWR, S_IRWXU); + if (shm_object < 0) { + int err = errno; + string errtxt = ErrorString(err); + logprintf(3, "Log: failed to allocate shared " + "smallpage object - err %d (%s)\n", + err, errtxt.c_str()); + break; + } + + if (0 > ftruncate(shm_object, length)) { + int err = errno; + string errtxt = ErrorString(err); + logprintf(3, "Log: failed to ftruncate shared " + "smallpage object - err %d (%s)\n", + err, errtxt.c_str()); + break; + } + + // 32 bit linux apps can only use ~1.4G of address space. 
+ // Use dynamic mapping for allocations larger than that. + // Currently perf hit is ~10% for this. + if (prefer_dynamic_mapping) { + dynamic_mapped_shmem_ = true; + } else { + // Do a full mapping here otherwise. + shmaddr = mmap64(NULL, length, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_NORESERVE | MAP_LOCKED | MAP_POPULATE, + shm_object, NULL); + if (shmaddr == reinterpret_cast<void*>(-1)) { + int err = errno; + string errtxt = ErrorString(err); + logprintf(0, "Log: failed to map shared " + "smallpage object - err %d (%s).\n", + err, errtxt.c_str()); + break; + } + } + + use_posix_shm_ = true; + shmid_ = shm_object; + buf = shmaddr; + char location_message[256] = ""; + if (dynamic_mapped_shmem_) { + sprintf(location_message, "mapped as needed"); + } else { + sprintf(location_message, "at %p", shmaddr); + } + logprintf(0, "Log: Using posix shared memory object 0x%x %s.\n", + shm_object, location_message); + } while (0); + shm_unlink("/stressapptest"); + } - if (!use_hugepages_) { + if (!use_hugepages_ && !use_posix_shm_) { // Use memalign to ensure that blocks are aligned enough for disk direct IO. buf = static_cast<char*>(memalign(4096, length)); - if (buf) + if (buf) { logprintf(0, "Log: Using memaligned allocation at %p.\n", buf); - else + } else { logprintf(0, "Process Error: memalign returned 0\n"); + if ((length >= 1499LL * kMegabyte) && (address_mode_ == 32)) { + logprintf(0, "Log: You are trying to allocate > 1.4G on a 32 " + "bit process. Please setup shared memory.\n"); + } + } } testmem_ = buf; - if (buf) { + if (buf || dynamic_mapped_shmem_) { testmemsize_ = length; } else { testmemsize_ = 0; } - return (buf != 0); + return (buf != 0) || dynamic_mapped_shmem_; } // Free the test memory. 
@@ -384,6 +526,11 @@ void OsLayer::FreeTestMem() { if (use_hugepages_) { shmdt(testmem_); shmctl(shmid_, IPC_RMID, NULL); + } else if (use_posix_shm_) { + if (!dynamic_mapped_shmem_) { + munmap(testmem_, testmemsize_); + } + close(shmid_); } else { free(testmem_); } @@ -396,11 +543,37 @@ void OsLayer::FreeTestMem() { // Prepare the target memory. It may requre mapping in, or this may be a noop. void *OsLayer::PrepareTestMem(uint64 offset, uint64 length) { sat_assert((offset + length) <= testmemsize_); + if (dynamic_mapped_shmem_) { + // TODO(nsanders): Check if we can support MAP_NONBLOCK, + // and evaluate performance hit from not using it. + void * mapping = mmap64(NULL, length, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_NORESERVE | MAP_LOCKED | MAP_POPULATE, + shmid_, offset); + if (mapping == MAP_FAILED) { + string errtxt = ErrorString(errno); + logprintf(0, "Process Error: PrepareTestMem mmap64(%llx, %llx) failed. " + "error: %s.\n", + offset, length, errtxt.c_str()); + sat_assert(0); + } + return mapping; + } + return reinterpret_cast<void*>(reinterpret_cast<char*>(testmem_) + offset); } // Release the test memory resources, if any. void OsLayer::ReleaseTestMem(void *addr, uint64 offset, uint64 length) { + if (dynamic_mapped_shmem_) { + int retval = munmap(addr, length); + if (retval == -1) { + string errtxt = ErrorString(errno); + logprintf(0, "Process Error: ReleaseTestMem munmap(%p, %llx) failed. " + "error: %s.\n", + addr, length, errtxt.c_str()); + sat_assert(0); + } + } } // No error polling on unknown systems. 
@@ -453,7 +626,7 @@ uint32 OsLayer::PciRead(int fd, uint32 offset, int width) { logprintf(0, "Process Error: Can't seek %x\n", offset); return 0; } - if (read(fd, &datacast, size) != size) { + if (read(fd, &datacast, size) != static_cast<ssize_t>(size)) { logprintf(0, "Process Error: Can't read %x\n", offset); return 0; } @@ -502,7 +675,7 @@ void OsLayer::PciWrite(int fd, uint32 offset, uint32 value, int width) { logprintf(0, "Process Error: Can't seek %x\n", offset); return; } - if (write(fd, &datacast, size) != size) { + if (write(fd, &datacast, size) != static_cast<ssize_t>(size)) { logprintf(0, "Process Error: Can't write %x to %x\n", datacast.l32, offset); return; } @@ -125,6 +125,8 @@ class OsLayer { asm volatile("mfence"); asm volatile("clflush (%0)" :: "r" (vaddr)); asm volatile("mfence"); +#elif defined(STRESSAPPTEST_CPU_ARMV7A) + #warning "Unsupported CPU type ARMV7A: Unable to force cache flushes." #else #warning "Unsupported CPU type: Unable to force cache flushes." #endif @@ -152,6 +154,9 @@ class OsLayer { datacast_t data; __asm __volatile("rdtsc" : "=a" (data.l32.l), "=d"(data.l32.h)); tsc = data.l64; +#elif defined(STRESSAPPTEST_CPU_ARMV7A) + #warning "Unsupported CPU type ARMV7A: your build may not function correctly" + tsc = 0; #else #warning "Unsupported CPU type: your build may not function correctly" tsc = 0; @@ -181,6 +186,8 @@ class OsLayer { // Returns 32 for 32-bit, 64 for 64-bit. virtual int AddressMode(); + // Update OsLayer state regarding cpu support for various features. + virtual void GetFeatures(); // Open, read, write pci cfg through /proc/bus/pci. fd is /proc/pci file. virtual int PciOpen(int bus, int device, int function); @@ -217,12 +224,10 @@ class OsLayer { // Detect all PCI Devices. virtual PCIDevices GetPCIDevices(); - // Default platform dependent warm Adler memcpy to C implementation - // for compatibility. + // Disambiguate between different "warm" memcopies. 
virtual bool AdlerMemcpyWarm(uint64 *dstmem, uint64 *srcmem, unsigned int size_in_bytes, - AdlerChecksum *checksum) - {return AdlerMemcpyWarmC(dstmem, srcmem, size_in_bytes, checksum);} + AdlerChecksum *checksum); // Store a callback to use to print // app-specific info about the last error location. @@ -237,12 +242,14 @@ class OsLayer { protected: void *testmem_; // Location of test memory. - int64 testmemsize_; // Size of test memory. + uint64 testmemsize_; // Size of test memory. int64 totalmemsize_; // Size of available memory. int64 min_hugepages_bytes_; // Minimum hugepages size. bool error_injection_; // Do error injection? bool normal_mem_; // Memory DMA capable? bool use_hugepages_; // Use hugepage shmem? + bool use_posix_shm_; // Use 4k page shmem? + bool dynamic_mapped_shmem_; // Conserve virtual address space. int shmid_; // Handle to shmem int64 regionsize_; // Size of memory "regions" @@ -250,6 +257,10 @@ class OsLayer { int num_cpus_; // Number of cpus in the system. int num_nodes_; // Number of nodes in the system. int num_cpus_per_node_; // Number of cpus per node in the system. + int address_mode_; // Are we running 32 or 64 bit? + bool has_sse2_; // Do we have sse2 instructions? + bool has_clflush_; // Do we have clflush instructions? + time_t time_initialized_; // Start time of test. diff --git a/src/pattern.cc b/src/pattern.cc index 2fb552a..9f22674 100644 --- a/src/pattern.cc +++ b/src/pattern.cc @@ -393,7 +393,7 @@ int PatternList::Destroy() { // Return pattern numbered "i" Pattern *PatternList::GetPattern(int i) { - if (i < size_) { + if (static_cast<unsigned int>(i) < size_) { return &patterns_[i]; } diff --git a/src/pattern.h b/src/pattern.h index b1168aa..181f839 100644 --- a/src/pattern.h +++ b/src/pattern.h @@ -102,7 +102,7 @@ class PatternList { private: vector<class Pattern> patterns_; int weightcount_; // Total count of pattern weights. 
- int size_; + unsigned int size_; int initialized_; DISALLOW_COPY_AND_ASSIGN(PatternList); }; @@ -164,26 +164,6 @@ bool Sat::CheckEnvironment() { return false; } - if ((address_mode_ == 32) && - (os_->normal_mem()) && - (size_ >= 1499 * kMegabyte)) { - if (run_on_anything_) { - int64 new_size_mb = 1499; - logprintf(1, "Log: 32 bit binary: reducing from %lldMB to %lldMB\n", - size_mb_, - new_size_mb); - size_mb_ = new_size_mb; - size_ = size_mb_ * kMegabyte; - } else { - logprintf(0, "Process Error: %dMB test memory too large " - "for 32 bit binary.\n", - static_cast<int>(size_ / kMegabyte)); - logprintf(0, "Log: Command line option '-A' bypasses this error.\n"); - bad_status(); - return false; - } - } - // If platform is 32 bit Xeon, floor memory size to multiple of 4. if (address_mode_ == 32) { size_mb_ = (size_mb_ / 4) * 4; @@ -350,7 +330,7 @@ void Sat::AddrMapUpdate(struct page_entry *pe) { for (int i = 0; i < page_length_; i += 4096) { uint64 paddr = os_->VirtualToPhysical(base + i); - int offset = paddr / 4096 / 8; + uint32 offset = paddr / 4096 / 8; unsigned char mask = 1 << ((paddr / 4096) % 8); if (offset >= arraysize) { @@ -969,7 +949,8 @@ bool Sat::ParseArgs(int argc, char **argv) { } // Set disk_pages_ if filesize or page size changed. - if (filesize != page_length_ * disk_pages_) { + if (filesize != static_cast<uint64>(page_length_) * + static_cast<uint64>(disk_pages_)) { disk_pages_ = filesize / page_length_; if (disk_pages_ == 0) disk_pages_ = 1; @@ -1014,7 +995,7 @@ void Sat::PrintHelp() { " --force_errors_like_crazy inject a lot of false errors " "to test error handling\n" " -F don't result check each transaction\n" - "--stop_on_errors Stop after finding the first error.\n" + " --stop_on_errors Stop after finding the first error.\n" " --read-block-size size of block for reading (-d)\n" " --write-block-size size of block for writing (-d). 
If not " "defined, the size of block for writing will be defined as the " @@ -1041,7 +1022,7 @@ void Sat::PrintHelp() { " --pause_duration duration (in seconds) of each pause\n" " --local_numa : choose memory regions associated with " "each CPU to be tested by that CPU\n" - "--remote_numa : choose memory regions not associated with " + " --remote_numa : choose memory regions not associated with " "each CPU to be tested by that CPU\n"); } @@ -1850,7 +1831,7 @@ bool Sat::Cleanup() { delete[] page_bitmap_; } - for (int i = 0; i < blocktables_.size(); i++) { + for (size_t i = 0; i < blocktables_.size(); i++) { delete blocktables_[i]; } @@ -164,7 +164,7 @@ class Sat { bool error_injection_; // Simulate errors, for unittests. bool crazy_error_injection_; // Simulate lots of errors. - int64 max_errorcount_; // Number of errors before forced exit. + uint64 max_errorcount_; // Number of errors before forced exit. int run_on_anything_; // Ignore unknown machine ereor. int use_logfile_; // Log to a file. char logfilename_[255]; // Name of file to log to. diff --git a/src/stressapptest_config.h.in b/src/stressapptest_config.h.in index 535bb34..b78857c 100644 --- a/src/stressapptest_config.h.in +++ b/src/stressapptest_config.h.in @@ -148,6 +148,9 @@ /* Define to 1 if strerror_r returns char *. */ #undef STRERROR_R_CHAR_P +/* Defined if the target CPU is armv7a */ +#undef STRESSAPPTEST_CPU_ARMV7A + /* Defined if the target CPU is i686 */ #undef STRESSAPPTEST_CPU_I686 diff --git a/src/worker.cc b/src/worker.cc index c568064..2fab28e 100644 --- a/src/worker.cc +++ b/src/worker.cc @@ -86,6 +86,9 @@ namespace { int cpu; #if defined(STRESSAPPTEST_CPU_X86_64) || defined(STRESSAPPTEST_CPU_I686) __asm __volatile("cpuid" : "=b" (cpu) : "a" (1) : "cx", "dx"); +#elif defined(STRESSAPPTEST_CPU_ARMV7A) + #warning "Unsupported CPU type ARMV7A: unable to determine core ID." + cpu = 0; #else #warning "Unsupported CPU type: unable to determine core ID." 
cpu = 0; @@ -1953,7 +1956,6 @@ bool FileThread::Work() { } pages_copied_ = loops * sat_->disk_pages(); - status_ = result; // Clean up. CloseFile(fd); @@ -1961,7 +1963,10 @@ bool FileThread::Work() { logprintf(9, "Log: Completed %d: file thread status %d, %d pages copied\n", thread_num_, status_, pages_copied_); - return result; + // Failure to read from device indicates hardware, + // rather than procedural SW error. + status_ = true; + return true; } bool NetworkThread::IsNetworkStopSet() { @@ -2259,7 +2264,7 @@ bool NetworkListenThread::ReapSlaves() { // Gather status and reap threads. logprintf(12, "Log: Joining all outstanding threads\n"); - for (int i = 0; i < child_workers_.size(); i++) { + for (size_t i = 0; i < child_workers_.size(); i++) { NetworkSlaveThread& child_thread = child_workers_[i]->thread; logprintf(12, "Log: Joining slave thread %d\n", i); child_thread.JoinThread(); @@ -2689,7 +2694,7 @@ bool DiskThread::GetDiskSize(int fd) { return false; } - // If an Elephant is initialized with status DEAD its size will be zero. + // Zero size indicates nonworking device.. if (block_size == 0) { os_->ErrorReport(device_name_.c_str(), "device-size-zero", 1); ++errorcount_; @@ -2734,11 +2739,11 @@ int64 DiskThread::GetTime() { } // Do randomized reads and (possibly) writes on a device. -// Return false on fatal error, either SW or HW. +// Return false on fatal SW error, true on SW success, +// regardless of whether HW failed. bool DiskThread::DoWork(int fd) { int64 block_num = 0; int64 num_segments; - bool result = true; if (segment_size_ == -1) { num_segments = 1; @@ -2775,7 +2780,8 @@ bool DiskThread::DoWork(int fd) { non_destructive_ ? "(disabled) " : "", device_name_.c_str(), thread_num_); while (IsReadyToRunNoPause() && - in_flight_sectors_.size() < queue_size_ + 1) { + in_flight_sectors_.size() < + static_cast<size_t>(queue_size_ + 1)) { // Confine testing to a particular segment of the disk. 
int64 segment = (block_num / blocks_per_segment_) % num_segments; if (!non_destructive_ && @@ -2810,7 +2816,7 @@ bool DiskThread::DoWork(int fd) { if (!non_destructive_) { if (!WriteBlockToDisk(fd, block)) { block_table_->RemoveBlock(block); - return false; + return true; } blocks_written_++; } @@ -2829,14 +2835,14 @@ bool DiskThread::DoWork(int fd) { BlockData *block = in_flight_sectors_.front(); in_flight_sectors_.pop(); if (!ValidateBlockOnDisk(fd, block)) - return false; + return true; block_table_->RemoveBlock(block); blocks_read_++; } } pages_copied_ = blocks_written_ + blocks_read_; - return result; + return true; } // Do an asynchronous disk I/O operation. @@ -2923,7 +2929,7 @@ bool DiskThread::AsyncDiskIO(IoOp op, int fd, void *buf, int64 size, // event.res contains the number of bytes written/read or // error if < 0, I think. - if (event.res != size) { + if (event.res != static_cast<uint64>(size)) { errorcount_++; os_->ErrorReport(device_name_.c_str(), operations[op].error_str, 1); |