aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authornick.j.sanders <nick.j.sanders@gmail.com>2014-02-11 05:57:33 +0000
committernick.j.sanders <nick.j.sanders@gmail.com>2014-02-11 05:57:33 +0000
commit3c1c63e2c8620aeb552aba19374c7af134bb63fd (patch)
treeffec5c7f3f69f8b32f434de929ef7ebcc6aaed92 /src
parent2ea87b7996f4f433d5d946eaf8f0d2f6fd18c144 (diff)
downloadstressapptest-3c1c63e2c8620aeb552aba19374c7af134bb63fd.tar.gz
Add NEON checksum and some bugfixes
* Add NEON copy and checksum for "-W" on ARM * Fix timer overflow for long runs under 32 bit. * Fix assert on checksum failure without miscompare. * Improve checksum error printout.
Diffstat (limited to 'src')
-rw-r--r--src/adler32memcpy.cc119
-rw-r--r--src/os.cc11
-rw-r--r--src/os.h8
-rw-r--r--src/sat.cc2
-rw-r--r--src/sattypes.h2
-rw-r--r--src/worker.cc10
-rw-r--r--src/worker.h8
7 files changed, 141 insertions, 19 deletions
diff --git a/src/adler32memcpy.cc b/src/adler32memcpy.cc
index 69324f7..47c6262 100644
--- a/src/adler32memcpy.cc
+++ b/src/adler32memcpy.cc
@@ -70,7 +70,7 @@ bool AdlerChecksum::Equals(const AdlerChecksum &other) const {
// Returns string representation of the Adler checksum.
string AdlerChecksum::ToHexString() const {
char buffer[128];
- snprintf(buffer, sizeof(buffer), "%llx%llx%llx%llx", a1_, a2_, b1_, b2_);
+ snprintf(buffer, sizeof(buffer), "%016llx %016llx %016llx %016llx", a1_, a2_, b1_, b2_);
return string(buffer);
}
@@ -399,7 +399,124 @@ bool AdlerMemcpyAsm(uint64 *dstmem64, uint64 *srcmem64,
// that there is no problem with memory this just mean that data was copied
// from src to dst and checksum was calculated successfully).
return true;
+#elif defined(STRESSAPPTEST_CPU_ARMV7A) && defined(__ARM_NEON__)
+ // Elements 0 to 3 are used for holding checksum terms a1, a2,
+ // b1, b2 respectively. These elements are filled by asm code.
+ // Checksum is seeded with the null checksum.
+ volatile uint64 checksum_arr[] __attribute__ ((aligned(16))) =
+ {1, 1, 0, 0};
+
+ if ((size_in_bytes >> 19) > 0) {
+ // Size is too large. Must be less than 2^19 bytes = 512 KB.
+ return false;
+ }
+
+ // Since we are moving 64 bytes at a time number of iterations = total size/64
+ uint32 blocks = size_in_bytes / 64;
+
+ uint64 *dst = dstmem64;
+ uint64 *src = srcmem64;
+
+ #define src_r "r3"
+ #define dst_r "r4"
+ #define blocks_r "r5"
+ #define crc_r "r6"
+
+ asm volatile (
+ "mov "src_r", %[src]; \n"
+ "mov "dst_r", %[dst]; \n"
+ "mov "crc_r", %[crc]; \n"
+ "mov "blocks_r", %[blocks]; \n"
+
+ // Loop over block count.
+ "cmp "blocks_r", #0; \n" // Compare counter to zero.
+ "ble END; \n"
+
+
+ // Preload upcoming cacheline.
+ "pld ["src_r", #0x0]; \n"
+ "pld ["src_r", #0x20]; \n"
+
+ // Init checksum
+ "vldm "crc_r", {q0}; \n"
+ "vmov.i32 q1, #0; \n"
+
+ // Start of the loop which copies 64 bytes from source to dst each time.
+ "TOP: \n"
+
+ // Make 4 moves each of 16 bytes from srcmem to qX registers.
+ // We are using 2 words out of 4 words in each qX register,
+ // word index 0 and word index 2. We'll swizzle them in a bit.
+ // Copy it.
+ "vldm "src_r"!, {q8, q9, q10, q11}; \n"
+ "vstm "dst_r"!, {q8, q9, q10, q11}; \n"
+
+ // Arrange it.
+ "vmov.i64 q12, #0; \n"
+ "vmov.i64 q13, #0; \n"
+ "vmov.i64 q14, #0; \n"
+ "vmov.i64 q15, #0; \n"
+ // This exchanges words 1,3 in the filled registers with
+ // words 0,2 in the empty registers.
+ "vtrn.32 q8, q12; \n"
+ "vtrn.32 q9, q13; \n"
+ "vtrn.32 q10, q14; \n"
+ "vtrn.32 q11, q15; \n"
+
+ // Sum into q0, then into q1.
+ // Repeat this for q8 - q15.
+ // Overflow can occur only if there are more
+ // than 2^16 additions => more than 2^17 words => more than 2^19 bytes so
+ // if size_in_bytes > 2^19 then overflow occurs.
+ "vadd.i64 q0, q0, q8; \n"
+ "vadd.i64 q1, q1, q0; \n"
+ "vadd.i64 q0, q0, q12; \n"
+ "vadd.i64 q1, q1, q0; \n"
+ "vadd.i64 q0, q0, q9; \n"
+ "vadd.i64 q1, q1, q0; \n"
+ "vadd.i64 q0, q0, q13; \n"
+ "vadd.i64 q1, q1, q0; \n"
+
+ "vadd.i64 q0, q0, q10; \n"
+ "vadd.i64 q1, q1, q0; \n"
+ "vadd.i64 q0, q0, q14; \n"
+ "vadd.i64 q1, q1, q0; \n"
+ "vadd.i64 q0, q0, q11; \n"
+ "vadd.i64 q1, q1, q0; \n"
+ "vadd.i64 q0, q0, q15; \n"
+ "vadd.i64 q1, q1, q0; \n"
+
+ // Increment counter and loop.
+ "sub "blocks_r", "blocks_r", #1; \n"
+ "cmp "blocks_r", #0; \n" // Compare counter to zero.
+ "bgt TOP; \n"
+
+
+ "END:\n"
+ // Report checksum values A and B (both right now are two concatenated
+ // 64 bit numbers and have to be converted to 64 bit numbers)
+ // seems like Adler128 (since size of each part is 4 byte rather than
+ // 1 byte).
+ "vstm "crc_r", {q0, q1}; \n"
+
+ // Output registers.
+ :
+ // Input registers.
+ : [src] "r"(src), [dst] "r"(dst), [blocks] "r"(blocks) , [crc] "r"(checksum_arr)
+ : "memory", "cc", "r3", "r4", "r5", "r6", "q0", "q1", "q8","q9","q10", "q11", "q12","q13","q14","q15"
+ ); // asm.
+
+ if (checksum != NULL) {
+ checksum->Set(checksum_arr[0], checksum_arr[1],
+ checksum_arr[2], checksum_arr[3]);
+ }
+
+ // Everything went fine, so return true (this does not mean
+ // that there is no problem with memory; this just means that data was copied
+ // from src to dst and checksum was calculated successfully).
+ return true;
#else
+ #warning "No vector copy defined for this architecture."
// Fall back to C implementation for anything else.
return AdlerMemcpyWarmC(dstmem64, srcmem64, size_in_bytes, checksum);
#endif
diff --git a/src/os.cc b/src/os.cc
index 6358398..7c4e3d1 100644
--- a/src/os.cc
+++ b/src/os.cc
@@ -79,7 +79,7 @@ OsLayer::OsLayer() {
address_mode_ = sizeof(pvoid) * 8;
has_clflush_ = false;
- has_sse2_ = false;
+ has_vector_ = false;
use_flush_page_cache_ = false;
@@ -183,15 +183,18 @@ void OsLayer::GetFeatures() {
unsigned int eax = 1, ebx, ecx, edx;
cpuid(&eax, &ebx, &ecx, &edx);
has_clflush_ = (edx >> 19) & 1;
- has_sse2_ = (edx >> 26) & 1;
+ has_vector_ = (edx >> 26) & 1; // SSE2 caps bit.
logprintf(9, "Log: has clflush: %s, has sse2: %s\n",
has_clflush_ ? "true" : "false",
- has_sse2_ ? "true" : "false");
+ has_vector_ ? "true" : "false");
#elif defined(STRESSAPPTEST_CPU_PPC)
// All PPC implementations have cache flush instructions.
has_clflush_ = true;
#elif defined(STRESSAPPTEST_CPU_ARMV7A)
+ // TODO(nsanders): add detect from /proc/cpuinfo or /proc/self/auxv.
+ // For now assume neon and don't run -W if you don't have it.
+ has_vector_ = true; // NEON.
#warning "Unsupported CPU type ARMV7A: unable to determine feature set."
#else
#warning "Unsupported CPU type: unable to determine feature set."
@@ -253,7 +256,7 @@ void OsLayer::Flush(void *vaddr) {
bool OsLayer::AdlerMemcpyWarm(uint64 *dstmem, uint64 *srcmem,
unsigned int size_in_bytes,
AdlerChecksum *checksum) {
- if (has_sse2_) {
+ if (has_vector_) {
return AdlerMemcpyAsm(dstmem, srcmem, size_in_bytes, checksum);
} else {
return AdlerMemcpyWarmC(dstmem, srcmem, size_in_bytes, checksum);
diff --git a/src/os.h b/src/os.h
index 13660d8..2272e4d 100644
--- a/src/os.h
+++ b/src/os.h
@@ -17,6 +17,7 @@
#define STRESSAPPTEST_OS_H_
#include <dirent.h>
+#include <unistd.h>
#include <sys/syscall.h>
#include <string>
@@ -153,7 +154,6 @@ class OsLayer {
asm volatile("clflush (%0)" : : "r" (vaddr));
asm volatile("mfence");
#elif defined(STRESSAPPTEST_CPU_ARMV7A)
- #warning "Unsupported CPU type ARMV7A: Using syscall to cache flush."
// ARMv7a cachelines are 8 words (32 bytes).
syscall(__ARM_NR_cacheflush, vaddr, reinterpret_cast<char*>(vaddr) + 32, 0);
#else
@@ -267,10 +267,10 @@ class OsLayer {
__asm __volatile("rdtsc" : "=a" (data.l32.l), "=d"(data.l32.h));
tsc = data.l64;
#elif defined(STRESSAPPTEST_CPU_ARMV7A)
- #warning "Unsupported CPU type ARMV7A: your build may not function correctly"
+ #warning "Unsupported CPU type ARMV7A: your timer may not function correctly"
tsc = 0;
#else
- #warning "Unsupported CPU type: your build may not function correctly"
+ #warning "Unsupported CPU type: your timer may not function correctly"
tsc = 0;
#endif
return (tsc);
@@ -381,7 +381,7 @@ class OsLayer {
int num_nodes_; // Number of nodes in the system.
int num_cpus_per_node_; // Number of cpus per node in the system.
int address_mode_; // Are we running 32 or 64 bit?
- bool has_sse2_; // Do we have sse2 instructions?
+ bool has_vector_; // Do we have sse2/neon instructions?
bool has_clflush_; // Do we have clflush instructions?
bool use_flush_page_cache_; // Do we need to flush the page cache?
diff --git a/src/sat.cc b/src/sat.cc
index 57fd4fe..56c6b66 100644
--- a/src/sat.cc
+++ b/src/sat.cc
@@ -1614,7 +1614,7 @@ void Sat::AnalysisAllStats() {
map_it != workers_map_.end(); ++map_it) {
for (WorkerVector::const_iterator it = map_it->second->begin();
it != map_it->second->end(); ++it) {
- thread_runtime_sec = (*it)->GetRunDurationUSec()*1.0/1000000;
+ thread_runtime_sec = (*it)->GetRunDurationUSec()*1.0/1000000.;
total_data += (*it)->GetMemoryCopiedData();
total_data += (*it)->GetDeviceCopiedData();
if (thread_runtime_sec > max_runtime_sec) {
diff --git a/src/sattypes.h b/src/sattypes.h
index e51db31..79bb47d 100644
--- a/src/sattypes.h
+++ b/src/sattypes.h
@@ -225,6 +225,8 @@ inline void cpuid(
#endif // defined(__PIC__) && defined(STRESSAPPTEST_CPU_I686)
#elif defined(STRESSAPPTEST_CPU_PPC)
return;
+#elif defined(STRESSAPPTEST_CPU_ARMV7A)
+ return;
#else
#warning "Unsupported CPU type."
#endif
diff --git a/src/worker.cc b/src/worker.cc
index dcffd4e..0864661 100644
--- a/src/worker.cc
+++ b/src/worker.cc
@@ -1359,10 +1359,10 @@ int WorkerThread::CrcWarmCopyPage(struct page_entry *dstpe,
blocksize,
currentblock * blocksize, 0);
if (errorcount == 0) {
- logprintf(0, "Log: CrcWarmCopyPage CRC mismatch %s != %s, "
+ logprintf(0, "Log: CrcWarmCopyPage CRC mismatch expected: %s != actual: %s, "
"but no miscompares found. Retrying with fresh data.\n",
- crc.ToHexString().c_str(),
- expectedcrc->ToHexString().c_str());
+ expectedcrc->ToHexString().c_str(),
+ crc.ToHexString().c_str() );
if (!tag_mode_) {
// Copy the data originally read from this region back again.
// This data should have any corruption read originally while
@@ -1382,7 +1382,7 @@ int WorkerThread::CrcWarmCopyPage(struct page_entry *dstpe,
expectedcrc->ToHexString().c_str());
struct ErrorRecord er;
er.actual = sourcemem[0];
- er.expected = 0x0;
+ er.expected = 0xbad;
er.vaddr = sourcemem;
ProcessError(&er, 0, "Hardware Error");
}
@@ -1954,7 +1954,7 @@ bool FileThread::Work() {
// Load patterns into page records.
page_recs_ = new struct PageRec[sat_->disk_pages()];
for (int i = 0; i < sat_->disk_pages(); i++) {
- page_recs_[i].pattern = new struct Pattern();
+ page_recs_[i].pattern = new class Pattern();
}
// Loop until done.
diff --git a/src/worker.h b/src/worker.h
index 6f9fde7..091d96b 100644
--- a/src/worker.h
+++ b/src/worker.h
@@ -240,7 +240,7 @@ class WorkerThread {
int64 ReadThreadTimer() {
struct timeval end_time_;
gettimeofday(&end_time_, NULL);
- return (end_time_.tv_sec - start_time_.tv_sec)*1000000 +
+ return (end_time_.tv_sec - start_time_.tv_sec)*1000000ULL +
(end_time_.tv_usec - start_time_.tv_usec);
}
// Stops per-WorkerThread timer and records thread run duration.
@@ -264,10 +264,10 @@ class WorkerThread {
// Calculate worker thread specific bandwidth.
virtual float GetMemoryBandwidth()
{return GetMemoryCopiedData() / (
- runduration_usec_ * 1.0 / 1000000);}
+ runduration_usec_ * 1.0 / 1000000.);}
virtual float GetDeviceBandwidth()
{return GetDeviceCopiedData() / (
- runduration_usec_ * 1.0 / 1000000);}
+ runduration_usec_ * 1.0 / 1000000.);}
void set_cpu_mask(cpu_set_t *mask) {
memcpy(&cpu_mask_, mask, sizeof(*mask));
@@ -421,7 +421,7 @@ class FileThread : public WorkerThread {
// Record of where these pages were sourced from, and what
// potentially broken components they passed through.
struct PageRec {
- struct Pattern *pattern; // This is the data it should contain.
+ class Pattern *pattern; // This is the data it should contain.
void *src; // This is the memory location the data was sourced from.
void *dst; // This is where it ended up.
};