aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNick Sanders <nsanders@chromium.org>2020-03-18 14:27:07 -0700
committerNick Sanders <nsanders@chromium.org>2020-03-18 14:27:07 -0700
commit13bdf568b05f5a36e1374de45d37d80a2a0268c2 (patch)
treea83b13e8ef93742c2eccfb451b6c4b2d93a1522b
parentfb72e5e5f0879231f38e0e826a98a6ca2d1ca38e (diff)
downloadstressapptest-13bdf568b05f5a36e1374de45d37d80a2a0268c2.tar.gz
Print pattern name, CPU on error
Print pattern name and source CPU on error, for better debuggability. BUG=None TEST=run with --force_errors Signed-off-by: Nick Sanders <nsanders@chromium.org>
-rw-r--r--src/queue.h3
-rw-r--r--src/worker.cc52
-rw-r--r--src/worker.h1
3 files changed, 48 insertions, 8 deletions
diff --git a/src/queue.h b/src/queue.h
index a6296b1..d1920a5 100644
--- a/src/queue.h
+++ b/src/queue.h
@@ -44,6 +44,7 @@ struct page_entry {
int32 tag; // These are tags for use in NUMA affinity or other uses.
uint32 touch; // Counter of the number of reads from this page.
uint64 ts; // Timestamp of the last read from this page.
+ uint32 lastcpu; // Last CPU to write this page.
class Pattern *lastpattern; // Expected Pattern at last read.
};
@@ -54,7 +55,7 @@ static inline void init_pe(struct page_entry *pe) {
pe->tag = kInvalidTag;
pe->touch = 0;
pe->ts = 0;
- pe->lastpattern = NULL;
+ pe->lastcpu = 0;
}
// This is a threadsafe randomized queue of pages for
diff --git a/src/worker.cc b/src/worker.cc
index 922d2c1..d8f37eb 100644
--- a/src/worker.cc
+++ b/src/worker.cc
@@ -117,6 +117,8 @@ struct ErrorRecord {
uint64 paddr; // This is the bus address, if available.
uint64 *tagvaddr; // This holds the tag value if this data was tagged.
uint64 tagpaddr; // This holds the physical address corresponding to the tag.
+ uint32 lastcpu; // This holds the CPU recorded as probably writing this data.
+ const char *patternname; // This holds the pattern name of the expected data.
};
// This is a helper function to create new threads with pthreads.
@@ -462,6 +464,9 @@ bool WorkerThread::FillPage(struct page_entry *pe) {
return 0;
}
+ // Tag this page as written from the current CPU.
+ pe->lastcpu = sched_getcpu();
+
// Mask is the bitmask of indexes used by the pattern.
// It is the pattern size -1. Size is always a power of 2.
uint64 *memwords = static_cast<uint64*>(pe->addr);
@@ -514,6 +519,8 @@ bool FillThread::FillPageRandom(struct page_entry *pe) {
// Choose a random pattern for this block.
pe->pattern = patternlist_->GetRandomPattern();
+ pe->lastcpu = sched_getcpu();
+
if (pe->pattern == 0) {
logprintf(0, "Process Error: Null data pattern\n");
return 0;
@@ -604,17 +611,19 @@ void WorkerThread::ProcessError(struct ErrorRecord *error,
(error->vaddr), 1);
logprintf(priority,
- "%s: miscompare on CPU %d(0x%s) at %p(0x%llx:%s): "
- "read:0x%016llx, reread:0x%016llx expected:0x%016llx\n",
+ "%s: miscompare on CPU %d(<-%d) at %p(0x%llx:%s): "
+ "read:0x%016llx, reread:0x%016llx expected:0x%016llx. '%s'%s.\n",
message,
core_id,
- CurrentCpusFormat().c_str(),
+ error->lastcpu,
error->vaddr,
error->paddr,
dimm_string,
error->actual,
error->reread,
- error->expected);
+ error->expected,
+ (error->patternname) ? error->patternname : "None",
+ (error->reread == error->expected) ? " read error" : "");
}
@@ -681,7 +690,8 @@ void FileThread::ProcessError(struct ErrorRecord *error,
dimm_string,
error->actual,
error->reread,
- error->expected);
+ error->expected,
+ (error->patternname) ? error->patternname : "None");
// Overwrite incorrect data with correct data to prevent
// future miscompares when this data is reused.
@@ -694,6 +704,7 @@ void FileThread::ProcessError(struct ErrorRecord *error,
// Print errors on mismatches.
int WorkerThread::CheckRegion(void *addr,
class Pattern *pattern,
+ uint32 lastcpu,
int64 length,
int offset,
int64 pattern_offset) {
@@ -729,6 +740,8 @@ int WorkerThread::CheckRegion(void *addr,
recorded[errors].actual = actual;
recorded[errors].expected = expected;
recorded[errors].vaddr = &memblock[i];
+ recorded[errors].patternname = pattern->name();
+ recorded[errors].lastcpu = lastcpu;
errors++;
} else {
page_error = true;
@@ -902,6 +915,7 @@ int WorkerThread::CrcCheckPage(struct page_entry *srcpe) {
expectedcrc->ToHexString().c_str());
int errorcount = CheckRegion(memslice,
srcpe->pattern,
+ srcpe->lastcpu,
blocksize,
currentblock * blocksize, 0);
if (errorcount == 0) {
@@ -920,6 +934,7 @@ int WorkerThread::CrcCheckPage(struct page_entry *srcpe) {
uint64 *memslice = memblock + blocks * blockwords;
errors += CheckRegion(memslice,
srcpe->pattern,
+ srcpe->lastcpu,
leftovers,
blocks * blocksize, 0);
}
@@ -1212,6 +1227,7 @@ int WorkerThread::CrcCopyPage(struct page_entry *dstpe,
expectedcrc->ToHexString().c_str());
int errorcount = CheckRegion(sourcemem,
srcpe->pattern,
+ srcpe->lastcpu,
blocksize,
currentblock * blocksize, 0);
if (errorcount == 0) {
@@ -1226,6 +1242,7 @@ int WorkerThread::CrcCopyPage(struct page_entry *dstpe,
memcpy(sourcemem, targetmem, blocksize);
errorcount = CheckRegion(sourcemem,
srcpe->pattern,
+ srcpe->lastcpu,
blocksize,
currentblock * blocksize, 0);
if (errorcount == 0) {
@@ -1240,6 +1257,9 @@ int WorkerThread::CrcCopyPage(struct page_entry *dstpe,
er.actual = sourcemem[0];
er.expected = 0xbad00000ull << 32;
er.vaddr = sourcemem;
+ er.lastcpu = srcpe->lastcpu;
+ logprintf(0, "Process Error: lastCPU %d\n", srcpe->lastcpu);
+ er.patternname = srcpe->pattern->name();
ProcessError(&er, 0, "Hardware Error");
errors += 1;
errorcount_ ++;
@@ -1258,6 +1278,7 @@ int WorkerThread::CrcCopyPage(struct page_entry *dstpe,
errors += CheckRegion(sourcemem,
srcpe->pattern,
+ srcpe->lastcpu,
leftovers,
blocks * blocksize, 0);
int leftoverwords = leftovers / wordsize_;
@@ -1268,6 +1289,7 @@ int WorkerThread::CrcCopyPage(struct page_entry *dstpe,
// Update pattern reference to reflect new contents.
dstpe->pattern = srcpe->pattern;
+ dstpe->lastcpu = sched_getcpu();
// Clean clean clean the errors away.
if (errors) {
@@ -1300,6 +1322,7 @@ int InvertThread::InvertPageDown(struct page_entry *srcpe) {
}
}
+ srcpe->lastcpu = sched_getcpu();
return 0;
}
@@ -1322,6 +1345,8 @@ int InvertThread::InvertPageUp(struct page_entry *srcpe) {
OsLayer::FastFlush(&sourcemem[i]);
}
}
+
+ srcpe->lastcpu = sched_getcpu();
return 0;
}
@@ -1358,6 +1383,7 @@ int WorkerThread::CrcWarmCopyPage(struct page_entry *dstpe,
expectedcrc->ToHexString().c_str());
int errorcount = CheckRegion(sourcemem,
srcpe->pattern,
+ srcpe->lastcpu,
blocksize,
currentblock * blocksize, 0);
if (errorcount == 0) {
@@ -1372,6 +1398,7 @@ int WorkerThread::CrcWarmCopyPage(struct page_entry *dstpe,
memcpy(sourcemem, targetmem, blocksize);
errorcount = CheckRegion(sourcemem,
srcpe->pattern,
+ srcpe->lastcpu,
blocksize,
currentblock * blocksize, 0);
if (errorcount == 0) {
@@ -1386,6 +1413,8 @@ int WorkerThread::CrcWarmCopyPage(struct page_entry *dstpe,
er.actual = sourcemem[0];
er.expected = 0xbad;
er.vaddr = sourcemem;
+ er.lastcpu = srcpe->lastcpu;
+ er.patternname = srcpe->pattern->name();
ProcessError(&er, 0, "Hardware Error");
errors ++;
errorcount_ ++;
@@ -1404,6 +1433,7 @@ int WorkerThread::CrcWarmCopyPage(struct page_entry *dstpe,
errors += CheckRegion(sourcemem,
srcpe->pattern,
+ srcpe->lastcpu,
leftovers,
blocks * blocksize, 0);
int leftoverwords = leftovers / wordsize_;
@@ -1414,6 +1444,8 @@ int WorkerThread::CrcWarmCopyPage(struct page_entry *dstpe,
// Update pattern reference to reflect new contents.
dstpe->pattern = srcpe->pattern;
+ dstpe->lastcpu = sched_getcpu();
+
// Clean clean clean the errors away.
if (errors) {
@@ -1494,7 +1526,7 @@ bool CopyThread::Work() {
// Force errors for unittests.
if (sat_->error_injection()) {
- if (loops == 8) {
+ if ((random() % 50000) == 8) {
char *addr = reinterpret_cast<char*>(src.addr);
int offset = random() % sat_->page_length();
addr[offset] = 0xba;
@@ -1509,6 +1541,7 @@ bool CopyThread::Work() {
} else {
memcpy(dst.addr, src.addr, sat_->page_length());
dst.pattern = src.pattern;
+ dst.lastcpu = sched_getcpu();
}
result = result && sat_->PutValid(&dst);
@@ -1753,6 +1786,7 @@ bool FileThread::SectorValidatePage(const struct PageRec &page,
tag[sec].sector,
page.src, page.dst);
+ errorcount_ += 1;
logprintf(5, "Sector Error: Sector tag @ 0x%x, pass %d/%d. "
"sec %x/%x, block %d/%d, magic %x/%x, File: %s \n",
block * page_length + 512 * sec,
@@ -1844,6 +1878,7 @@ bool FileThread::GetEmptyPage(struct page_entry *dst) {
dst->addr = local_page_;
dst->offset = 0;
dst->pattern = 0;
+ dst->lastcpu = 0;
}
return true;
}
@@ -1899,6 +1934,7 @@ bool FileThread::ReadPages(int fd) {
return false;
// Retrieve expected pattern.
dst.pattern = page_recs_[i].pattern;
+ dst.lastcpu = sched_getcpu();
// Update page recordpage record.
page_recs_[i].dst = dst.addr;
@@ -2230,6 +2266,7 @@ bool NetworkThread::Work() {
// Update pattern reference to reflect new contents.
dst.pattern = src.pattern;
+ dst.lastcpu = sched_getcpu();
// Do the network read.
if (!(result = result && ReceivePage(sock, &dst)))
@@ -3149,7 +3186,7 @@ bool DiskThread::ValidateBlockOnDisk(int fd, BlockData *block) {
// In non-destructive mode, don't compare the block to the pattern since
// the block was never written to disk in the first place.
if (!non_destructive_) {
- if (CheckRegion(block_buffer_, block->pattern(), current_bytes,
+ if (CheckRegion(block_buffer_, block->pattern(), 0, current_bytes,
0, bytes_read)) {
os_->ErrorReport(device_name_.c_str(), "disk-pattern-error", 1);
errorcount_ += 1;
@@ -3387,6 +3424,7 @@ bool MemoryRegionThread::Work() {
phase_ = kPhaseCopy;
CrcCopyPage(&memregion_pe, &source_pe);
memregion_pe.pattern = source_pe.pattern;
+ memregion_pe.lastcpu = sched_getcpu();
// Error injection for CRC Check.
if ((sat_->error_injection() || error_injection_) && loops == 2) {
diff --git a/src/worker.h b/src/worker.h
index 091d96b..3398208 100644
--- a/src/worker.h
+++ b/src/worker.h
@@ -320,6 +320,7 @@ class WorkerThread {
// Compare a region of memory with a known data patter, and report errors.
virtual int CheckRegion(void *addr,
class Pattern *pat,
+ uint32 lastcpu,
int64 length,
int offset,
int64 patternoffset);