author    njn <njn@a5019735-40e9-0310-863c-91ae7b9d1cf9>  2010-10-06 22:46:31 +0000
committer njn <njn@a5019735-40e9-0310-863c-91ae7b9d1cf9>  2010-10-06 22:46:31 +0000
commit    2d853a1f8a153ee1ed99c5e1166d69dd4c6574d8 (patch)
tree      228507c154f7de887f5ca001de0540eb3fc5ca97
parent    f252299f95af007e6e94061e4d44a01421b3eb11 (diff)
Change Cachegrind/Callgrind to talk about the LL (last-level) cache instead
of the L2 cache. This is to accommodate machines with three levels of cache.
We still only simulate two levels, the first and the last.

git-svn-id: svn://svn.valgrind.org/valgrind/trunk@11404 a5019735-40e9-0310-863c-91ae7b9d1cf9
-rw-r--r--  NEWS                                         |  19
-rw-r--r--  cachegrind/cg-arm.c                          |   4
-rw-r--r--  cachegrind/cg-ppc32.c                        |   4
-rw-r--r--  cachegrind/cg-ppc64.c                        |   4
-rw-r--r--  cachegrind/cg-x86-amd64.c                    | 188
-rw-r--r--  cachegrind/cg_arch.h                         |  10
-rw-r--r--  cachegrind/cg_main.c                         | 155
-rw-r--r--  cachegrind/cg_sim.c                          |   8
-rw-r--r--  cachegrind/docs/cg-manual.xml                | 122
-rw-r--r--  cachegrind/tests/chdir.stderr.exp            |  14
-rw-r--r--  cachegrind/tests/dlclose.stderr.exp          |  14
-rwxr-xr-x  cachegrind/tests/filter_stderr               |   8
-rw-r--r--  cachegrind/tests/notpower2.stderr.exp        |  14
-rw-r--r--  cachegrind/tests/notpower2.vgtest            |   2
-rw-r--r--  cachegrind/tests/wrap5.stderr.exp            |  14
-rw-r--r--  cachegrind/tests/x86/fpu-28-108.stderr.exp   |  14
-rw-r--r--  callgrind/docs/cl-format.xml                 |   4
-rw-r--r--  callgrind/docs/cl-manual.xml                 |  20
-rw-r--r--  callgrind/sim.c                              | 282
-rwxr-xr-x  callgrind/tests/filter_stderr                |   8
-rw-r--r--  callgrind/tests/notpower2-hwpref.stderr.exp  |  16
-rw-r--r--  callgrind/tests/notpower2-hwpref.vgtest      |   2
-rw-r--r--  callgrind/tests/notpower2-use.stderr.exp     |  16
-rw-r--r--  callgrind/tests/notpower2-use.vgtest         |   2
-rw-r--r--  callgrind/tests/notpower2-wb.stderr.exp      |  16
-rw-r--r--  callgrind/tests/notpower2-wb.vgtest          |   2
-rw-r--r--  callgrind/tests/notpower2.stderr.exp         |  16
-rw-r--r--  callgrind/tests/notpower2.vgtest             |   2
-rw-r--r--  callgrind/tests/simwork-both.stderr.exp      |  16
-rw-r--r--  callgrind/tests/simwork-cache.stderr.exp     |  16
-rw-r--r--  callgrind/tests/simwork1.stderr.exp          |  16
-rw-r--r--  callgrind/tests/simwork2.stderr.exp          |  16
-rw-r--r--  callgrind/tests/simwork3.stderr.exp          |  16
-rw-r--r--  callgrind/tests/threads-use.stderr.exp       |  16
34 files changed, 586 insertions, 490 deletions
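The gist of the change as a standalone sketch: after cache auto-detection, the simulated last-level (LL) cache is the L3 if one was found, otherwise the L2. pick_LL below is a hypothetical illustration, not a function from the patch; the real logic is inlined in the detection code, and cache_t matches cachegrind/cg_arch.h.

/* Hypothetical helper showing how the LL configuration is chosen. */
typedef struct {
   int size;       /* bytes */
   int assoc;
   int line_size;  /* bytes */
} cache_t;

static void pick_LL(cache_t* LLc, const cache_t* L2c,
                    const cache_t* L3c, int L3_found)
{
   /* An L3, when present, becomes the simulated last-level cache and
      the L2 data is discarded; otherwise the L2 is the LL. */
   *LLc = L3_found ? *L3c : *L2c;
}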
diff --git a/NEWS b/NEWS
index ead2949dd..928ee0228 100644
--- a/NEWS
+++ b/NEWS
@@ -16,6 +16,20 @@ Improvements:
--threshold option has changed; this is unlikely to affect many people, if
you do use it please see the user manual for details.
+- Callgrind can now do branch prediction simulation, similar to Cachegrind.
+  In addition, it can optionally count the number of executed global bus events.
+  Both can be used for a better approximation of a "Cycle Estimation" derived
+  event (you need to update the event formula in KCachegrind yourself).
+
+- Cachegrind and Callgrind now refer to the LL (last-level) cache rather
+ than the L2 cache. This is to accommodate machines with three levels of
+ caches -- if Cachegrind/Callgrind auto-detects the cache configuration of
+ such a machine it will run the simulation as if the L2 cache isn't
+ present. This means the results are less likely to match the true result
+ for the machine, but Cachegrind/Callgrind's results are already only
+ approximate, and should not be considered authoritative. The results are
+ still useful for giving a general idea about a program's locality.
+
- Massif has a new option, --pages-as-heap, which is disabled by default.
When enabled, instead of tracking allocations at the level of heap blocks
(as allocated with malloc/new/new[]), it instead tracks memory allocations
@@ -24,11 +38,6 @@ Improvements:
harder than the heap-level output, but this option is useful if you want
to account for every byte of memory used by a program.
-- Callgrind now can do branch prediction simulation, similar to Cachegrind.
- In addition, it optionally can count the number of executed global bus events.
- Both can be used for a better approximation of a "Cycle Estimation" as
- derived event (you need to update the event formula in KCachegrind yourself).
-
- Added new memcheck command-line option --show-possibly-lost.
diff --git a/cachegrind/cg-arm.c b/cachegrind/cg-arm.c
index 27f5d0da9..e37d0c0a1 100644
--- a/cachegrind/cg-arm.c
+++ b/cachegrind/cg-arm.c
@@ -37,13 +37,13 @@
#include "cg_arch.h"
-void VG_(configure_caches)(cache_t* I1c, cache_t* D1c, cache_t* L2c,
+void VG_(configure_caches)(cache_t* I1c, cache_t* D1c, cache_t* LLc,
Bool all_caches_clo_defined)
{
// Set caches to default (for Cortex-A8 ?)
*I1c = (cache_t) { 16384, 4, 64 };
*D1c = (cache_t) { 16384, 4, 64 };
- *L2c = (cache_t) { 262144, 8, 64 };
+ *LLc = (cache_t) { 262144, 8, 64 };
if (!all_caches_clo_defined) {
VG_(message)(Vg_DebugMsg,
diff --git a/cachegrind/cg-ppc32.c b/cachegrind/cg-ppc32.c
index ce39c2ee7..5920c0501 100644
--- a/cachegrind/cg-ppc32.c
+++ b/cachegrind/cg-ppc32.c
@@ -37,13 +37,13 @@
#include "cg_arch.h"
-void VG_(configure_caches)(cache_t* I1c, cache_t* D1c, cache_t* L2c,
+void VG_(configure_caches)(cache_t* I1c, cache_t* D1c, cache_t* LLc,
Bool all_caches_clo_defined)
{
// Set caches to default.
*I1c = (cache_t) { 65536, 2, 64 };
*D1c = (cache_t) { 65536, 2, 64 };
- *L2c = (cache_t) { 262144, 8, 64 };
+ *LLc = (cache_t) { 262144, 8, 64 };
// Warn if config not completely specified from cmd line. Note that
// this message is slightly different from the one we give on x86/AMD64
diff --git a/cachegrind/cg-ppc64.c b/cachegrind/cg-ppc64.c
index 82993f853..973664b24 100644
--- a/cachegrind/cg-ppc64.c
+++ b/cachegrind/cg-ppc64.c
@@ -37,13 +37,13 @@
#include "cg_arch.h"
-void VG_(configure_caches)(cache_t* I1c, cache_t* D1c, cache_t* L2c,
+void VG_(configure_caches)(cache_t* I1c, cache_t* D1c, cache_t* LLc,
Bool all_caches_clo_defined)
{
// Set caches to default.
*I1c = (cache_t) { 65536, 2, 64 };
*D1c = (cache_t) { 65536, 2, 64 };
- *L2c = (cache_t) { 262144, 8, 64 };
+ *LLc = (cache_t) { 262144, 8, 64 };
// Warn if config not completely specified from cmd line. Note that
// this message is slightly different from the one we give on x86/AMD64
diff --git a/cachegrind/cg-x86-amd64.c b/cachegrind/cg-x86-amd64.c
index 350606b2a..6794319b7 100644
--- a/cachegrind/cg-x86-amd64.c
+++ b/cachegrind/cg-x86-amd64.c
@@ -54,9 +54,12 @@ static void micro_ops_warn(Int actual_size, Int used_size, Int line_size)
* array of pre-defined configurations for various parts of the memory
* hierarchy.
* According to Intel Processor Identification, App Note 485.
+ *
+ * If an L3 cache is found, then data for it rather than the L2
+ * is returned via *LLc.
*/
static
-Int Intel_cache_info(Int level, cache_t* I1c, cache_t* D1c, cache_t* L2c)
+Int Intel_cache_info(Int level, cache_t* I1c, cache_t* D1c, cache_t* LLc)
{
Int cpuid1_eax;
Int cpuid1_ignore;
@@ -65,6 +68,14 @@ Int Intel_cache_info(Int level, cache_t* I1c, cache_t* D1c, cache_t* L2c)
UChar info[16];
Int i, trials;
Bool L2_found = False;
+ /* If we see L3 cache info, copy it into L3c. Then, at the end,
+ copy it into *LLc. Hence if an L3 cache is specified, *LLc will
+ eventually contain a description of it rather than the L2 cache.
+ The use of the L3c intermediary makes this process independent
+ of the order in which the cache specifications appear in
+ info[]. */
+ Bool L3_found = False;
+ cache_t L3c = { 0, 0, 0 };
if (level < 2) {
VG_(dmsg)("warning: CPUID level < 2 for Intel processor (%d)\n", level);
@@ -121,18 +132,39 @@ Int Intel_cache_info(Int level, cache_t* I1c, cache_t* D1c, cache_t* L2c)
case 0x90: case 0x96: case 0x9b:
VG_(tool_panic)("IA-64 cache detected?!");
- case 0x22: case 0x23: case 0x25: case 0x29:
- case 0x46: case 0x47: case 0x4a: case 0x4b: case 0x4c: case 0x4d:
- case 0xe2: case 0xe3: case 0xe4: case 0xea: case 0xeb: case 0xec:
- VG_(dmsg)("warning: L3 cache detected but ignored\n");
- break;
+ /* L3 cache info. */
+ case 0x22: L3c = (cache_t) { 512, 4, 64 }; L3_found = True; break;
+ case 0x23: L3c = (cache_t) { 1024, 8, 64 }; L3_found = True; break;
+ case 0x25: L3c = (cache_t) { 2048, 8, 64 }; L3_found = True; break;
+ case 0x29: L3c = (cache_t) { 4096, 8, 64 }; L3_found = True; break;
+ case 0x46: L3c = (cache_t) { 4096, 4, 64 }; L3_found = True; break;
+ case 0x47: L3c = (cache_t) { 8192, 8, 64 }; L3_found = True; break;
+ case 0x4a: L3c = (cache_t) { 6144, 12, 64 }; L3_found = True; break;
+ case 0x4b: L3c = (cache_t) { 8192, 16, 64 }; L3_found = True; break;
+ case 0x4c: L3c = (cache_t) { 12288, 12, 64 }; L3_found = True; break;
+ case 0x4d: L3c = (cache_t) { 16384, 16, 64 }; L3_found = True; break;
+ case 0xd0: L3c = (cache_t) { 512, 4, 64 }; L3_found = True; break;
+ case 0xd1: L3c = (cache_t) { 1024, 4, 64 }; L3_found = True; break;
+ case 0xd2: L3c = (cache_t) { 2048, 4, 64 }; L3_found = True; break;
+ case 0xd6: L3c = (cache_t) { 1024, 8, 64 }; L3_found = True; break;
+ case 0xd7: L3c = (cache_t) { 2048, 8, 64 }; L3_found = True; break;
+ case 0xd8: L3c = (cache_t) { 4096, 8, 64 }; L3_found = True; break;
+ case 0xdc: L3c = (cache_t) { 1536, 12, 64 }; L3_found = True; break;
+ case 0xdd: L3c = (cache_t) { 3072, 12, 64 }; L3_found = True; break;
+ case 0xde: L3c = (cache_t) { 6144, 12, 64 }; L3_found = True; break;
+ case 0xe2: L3c = (cache_t) { 2048, 16, 64 }; L3_found = True; break;
+ case 0xe3: L3c = (cache_t) { 4096, 16, 64 }; L3_found = True; break;
+ case 0xe4: L3c = (cache_t) { 8192, 16, 64 }; L3_found = True; break;
+ case 0xea: L3c = (cache_t) { 12288, 24, 64 }; L3_found = True; break;
+ case 0xeb: L3c = (cache_t) { 18432, 24, 64 }; L3_found = True; break;
+ case 0xec: L3c = (cache_t) { 24576, 24, 64 }; L3_found = True; break;
/* Described as "MLC" in Intel documentation */
- case 0x21: *L2c = (cache_t) { 256, 8, 64 }; L2_found = True; break;
+ case 0x21: *LLc = (cache_t) { 256, 8, 64 }; L2_found = True; break;
/* These are sectored, whatever that means */
- case 0x39: *L2c = (cache_t) { 128, 4, 64 }; L2_found = True; break;
- case 0x3c: *L2c = (cache_t) { 256, 4, 64 }; L2_found = True; break;
+ case 0x39: *LLc = (cache_t) { 128, 4, 64 }; L2_found = True; break;
+ case 0x3c: *LLc = (cache_t) { 256, 4, 64 }; L2_found = True; break;
/* If a P6 core, this means "no L2 cache".
If a P4 core, this means "no L3 cache".
@@ -141,20 +173,21 @@ Int Intel_cache_info(Int level, cache_t* I1c, cache_t* D1c, cache_t* L2c)
case 0x40:
break;
- case 0x41: *L2c = (cache_t) { 128, 4, 32 }; L2_found = True; break;
- case 0x42: *L2c = (cache_t) { 256, 4, 32 }; L2_found = True; break;
- case 0x43: *L2c = (cache_t) { 512, 4, 32 }; L2_found = True; break;
- case 0x44: *L2c = (cache_t) { 1024, 4, 32 }; L2_found = True; break;
- case 0x45: *L2c = (cache_t) { 2048, 4, 32 }; L2_found = True; break;
- case 0x48: *L2c = (cache_t) { 3072,12, 64 }; L2_found = True; break;
+ case 0x41: *LLc = (cache_t) { 128, 4, 32 }; L2_found = True; break;
+ case 0x42: *LLc = (cache_t) { 256, 4, 32 }; L2_found = True; break;
+ case 0x43: *LLc = (cache_t) { 512, 4, 32 }; L2_found = True; break;
+ case 0x44: *LLc = (cache_t) { 1024, 4, 32 }; L2_found = True; break;
+ case 0x45: *LLc = (cache_t) { 2048, 4, 32 }; L2_found = True; break;
+ case 0x48: *LLc = (cache_t) { 3072, 12, 64 }; L2_found = True; break;
+ case 0x4e: *LLc = (cache_t) { 6144, 24, 64 }; L2_found = True; break;
case 0x49:
- if ((family == 15) && (model == 6))
- /* On Xeon MP (family F, model 6), this is for L3 */
- VG_(dmsg)("warning: L3 cache detected but ignored\n");
- else
- *L2c = (cache_t) { 4096, 16, 64 }; L2_found = True;
- break;
- case 0x4e: *L2c = (cache_t) { 6144, 24, 64 }; L2_found = True; break;
+ if (family == 15 && model == 6) {
+ /* On Xeon MP (family F, model 6), this is for L3 */
+ L3c = (cache_t) { 4096, 16, 64 }; L3_found = True;
+ } else {
+ *LLc = (cache_t) { 4096, 16, 64 }; L2_found = True;
+ }
+ break;
/* These are sectored, whatever that means */
case 0x60: *D1c = (cache_t) { 16, 8, 64 }; break; /* sectored */
@@ -181,26 +214,24 @@ Int Intel_cache_info(Int level, cache_t* I1c, cache_t* D1c, cache_t* L2c)
break;
/* not sectored, whatever that might mean */
- case 0x78: *L2c = (cache_t) { 1024, 4, 64 }; L2_found = True; break;
+ case 0x78: *LLc = (cache_t) { 1024, 4, 64 }; L2_found = True; break;
/* These are sectored, whatever that means */
- case 0x79: *L2c = (cache_t) { 128, 8, 64 }; L2_found = True; break;
- case 0x7a: *L2c = (cache_t) { 256, 8, 64 }; L2_found = True; break;
- case 0x7b: *L2c = (cache_t) { 512, 8, 64 }; L2_found = True; break;
- case 0x7c: *L2c = (cache_t) { 1024, 8, 64 }; L2_found = True; break;
- case 0x7d: *L2c = (cache_t) { 2048, 8, 64 }; L2_found = True; break;
- case 0x7e: *L2c = (cache_t) { 256, 8, 128 }; L2_found = True; break;
-
- case 0x7f: *L2c = (cache_t) { 512, 2, 64 }; L2_found = True; break;
- case 0x80: *L2c = (cache_t) { 512, 8, 64 }; L2_found = True; break;
-
- case 0x81: *L2c = (cache_t) { 128, 8, 32 }; L2_found = True; break;
- case 0x82: *L2c = (cache_t) { 256, 8, 32 }; L2_found = True; break;
- case 0x83: *L2c = (cache_t) { 512, 8, 32 }; L2_found = True; break;
- case 0x84: *L2c = (cache_t) { 1024, 8, 32 }; L2_found = True; break;
- case 0x85: *L2c = (cache_t) { 2048, 8, 32 }; L2_found = True; break;
- case 0x86: *L2c = (cache_t) { 512, 4, 64 }; L2_found = True; break;
- case 0x87: *L2c = (cache_t) { 1024, 8, 64 }; L2_found = True; break;
+ case 0x79: *LLc = (cache_t) { 128, 8, 64 }; L2_found = True; break;
+ case 0x7a: *LLc = (cache_t) { 256, 8, 64 }; L2_found = True; break;
+ case 0x7b: *LLc = (cache_t) { 512, 8, 64 }; L2_found = True; break;
+ case 0x7c: *LLc = (cache_t) { 1024, 8, 64 }; L2_found = True; break;
+ case 0x7d: *LLc = (cache_t) { 2048, 8, 64 }; L2_found = True; break;
+ case 0x7e: *LLc = (cache_t) { 256, 8, 128 }; L2_found = True; break;
+ case 0x7f: *LLc = (cache_t) { 512, 2, 64 }; L2_found = True; break;
+ case 0x80: *LLc = (cache_t) { 512, 8, 64 }; L2_found = True; break;
+ case 0x81: *LLc = (cache_t) { 128, 8, 32 }; L2_found = True; break;
+ case 0x82: *LLc = (cache_t) { 256, 8, 32 }; L2_found = True; break;
+ case 0x83: *LLc = (cache_t) { 512, 8, 32 }; L2_found = True; break;
+ case 0x84: *LLc = (cache_t) { 1024, 8, 32 }; L2_found = True; break;
+ case 0x85: *LLc = (cache_t) { 2048, 8, 32 }; L2_found = True; break;
+ case 0x86: *LLc = (cache_t) { 512, 4, 64 }; L2_found = True; break;
+ case 0x87: *LLc = (cache_t) { 1024, 8, 64 }; L2_found = True; break;
/* Ignore prefetch information */
case 0xf0: case 0xf1:
@@ -213,8 +244,15 @@ Int Intel_cache_info(Int level, cache_t* I1c, cache_t* D1c, cache_t* L2c)
}
}
+ /* If we found an L3 cache, throw away the L2 data and use the L3's instead. */
+ if (L3_found) {
+ VG_(dmsg)("warning: L3 cache found, using its data for the LL simulation.\n");
+ *LLc = L3c;
+ L2_found = True;
+ }
+
if (!L2_found)
- VG_(dmsg)("warning: L2 cache not installed, ignore L2 results.\n");
+ VG_(dmsg)("warning: L2 cache not installed, ignore LL results.\n");
return 0;
}
@@ -241,14 +279,37 @@ Int Intel_cache_info(Int level, cache_t* I1c, cache_t* D1c, cache_t* L2c)
* 0x630) have a bug and misreport their L2 size as 1KB (it's really 64KB),
* so we detect that.
*
- * Returns 0 on success, non-zero on failure.
+ * Returns 0 on success, non-zero on failure. As with the Intel code
+ * above, if an L3 cache is found, then data for it rather than the L2
+ * is returned via *LLc.
*/
+
+/* A small helper */
+static Int decode_AMD_cache_L2_L3_assoc ( Int bits_15_12 )
+{
+ /* Decode an L2/L3 associativity indication. It is encoded
+ differently from the I1/D1 associativity. Returns 1
+ (direct-map) as a safe but suboptimal result for unknown
+ encodings. */
+ switch (bits_15_12 & 0xF) {
+ case 1: return 1; case 2: return 2;
+ case 4: return 4; case 6: return 8;
+ case 8: return 16; case 0xA: return 32;
+ case 0xB: return 48; case 0xC: return 64;
+ case 0xD: return 96; case 0xE: return 128;
+ case 0xF: /* fully associative */
+ case 0: /* L2/L3 cache or TLB is disabled */
+ default:
+ return 1;
+ }
+}
+
static
-Int AMD_cache_info(cache_t* I1c, cache_t* D1c, cache_t* L2c)
+Int AMD_cache_info(cache_t* I1c, cache_t* D1c, cache_t* LLc)
{
UInt ext_level;
UInt dummy, model;
- UInt I1i, D1i, L2i;
+ UInt I1i, D1i, L2i, L3i;
VG_(cpuid)(0x80000000, &ext_level, &dummy, &dummy, &dummy);
@@ -259,7 +320,7 @@ Int AMD_cache_info(cache_t* I1c, cache_t* D1c, cache_t* L2c)
}
VG_(cpuid)(0x80000005, &dummy, &dummy, &D1i, &I1i);
- VG_(cpuid)(0x80000006, &dummy, &dummy, &L2i, &dummy);
+ VG_(cpuid)(0x80000006, &dummy, &dummy, &L2i, &L3i);
VG_(cpuid)(0x1, &model, &dummy, &dummy, &dummy);
@@ -277,15 +338,26 @@ Int AMD_cache_info(cache_t* I1c, cache_t* D1c, cache_t* L2c)
I1c->assoc = (I1i >> 16) & 0xff;
I1c->line_size = (I1i >> 0) & 0xff;
- L2c->size = (L2i >> 16) & 0xffff; /* Nb: different bits used for L2 */
- L2c->assoc = (L2i >> 12) & 0xf;
- L2c->line_size = (L2i >> 0) & 0xff;
+ LLc->size = (L2i >> 16) & 0xffff; /* Nb: different bits used for L2 */
+ LLc->assoc = decode_AMD_cache_L2_L3_assoc((L2i >> 12) & 0xf);
+ LLc->line_size = (L2i >> 0) & 0xff;
+
+ if (((L3i >> 18) & 0x3fff) > 0) {
+ /* There's an L3 cache. Replace *LLc contents with this info. */
+ /* NB: the test in the if is "if L3 size > 0 ". I don't know if
+ this is the right way to test presence-vs-absence of L3. I
+ can't see any guidance on this in the AMD documentation. */
+ LLc->size = ((L3i >> 18) & 0x3fff) * 512;
+ LLc->assoc = decode_AMD_cache_L2_L3_assoc((L3i >> 12) & 0xf);
+ LLc->line_size = (L3i >> 0) & 0xff;
+ VG_(dmsg)("warning: L3 cache found, using its data for the L2 simulation.\n");
+ }
return 0;
}
static
-Int get_caches_from_CPUID(cache_t* I1c, cache_t* D1c, cache_t* L2c)
+Int get_caches_from_CPUID(cache_t* I1c, cache_t* D1c, cache_t* LLc)
{
Int level, ret;
Char vendor_id[13];
@@ -306,10 +378,10 @@ Int get_caches_from_CPUID(cache_t* I1c, cache_t* D1c, cache_t* L2c)
/* Only handling Intel and AMD chips... no Cyrix, Transmeta, etc */
if (0 == VG_(strcmp)(vendor_id, "GenuineIntel")) {
- ret = Intel_cache_info(level, I1c, D1c, L2c);
+ ret = Intel_cache_info(level, I1c, D1c, LLc);
} else if (0 == VG_(strcmp)(vendor_id, "AuthenticAMD")) {
- ret = AMD_cache_info(I1c, D1c, L2c);
+ ret = AMD_cache_info(I1c, D1c, LLc);
} else if (0 == VG_(strcmp)(vendor_id, "CentaurHauls")) {
/* Total kludge. Pretend to be a VIA Nehemiah. */
@@ -319,9 +391,9 @@ Int get_caches_from_CPUID(cache_t* I1c, cache_t* D1c, cache_t* L2c)
I1c->size = 64;
I1c->assoc = 4;
I1c->line_size = 16;
- L2c->size = 64;
- L2c->assoc = 16;
- L2c->line_size = 16;
+ LLc->size = 64;
+ LLc->assoc = 16;
+ LLc->line_size = 16;
ret = 0;
} else {
@@ -332,13 +404,13 @@ Int get_caches_from_CPUID(cache_t* I1c, cache_t* D1c, cache_t* L2c)
/* Successful! Convert sizes from KB to bytes */
I1c->size *= 1024;
D1c->size *= 1024;
- L2c->size *= 1024;
+ LLc->size *= 1024;
return ret;
}
-void VG_(configure_caches)(cache_t* I1c, cache_t* D1c, cache_t* L2c,
+void VG_(configure_caches)(cache_t* I1c, cache_t* D1c, cache_t* LLc,
Bool all_caches_clo_defined)
{
Int res;
@@ -346,10 +418,10 @@ void VG_(configure_caches)(cache_t* I1c, cache_t* D1c, cache_t* L2c,
// Set caches to default.
*I1c = (cache_t) { 65536, 2, 64 };
*D1c = (cache_t) { 65536, 2, 64 };
- *L2c = (cache_t) { 262144, 8, 64 };
+ *LLc = (cache_t) { 262144, 8, 64 };
// Then replace with any info we can get from CPUID.
- res = get_caches_from_CPUID(I1c, D1c, L2c);
+ res = get_caches_from_CPUID(I1c, D1c, LLc);
// Warn if CPUID failed and config not completely specified from cmd line.
if (res != 0 && !all_caches_clo_defined) {
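The AMD L3 decoding above can be exercised standalone. The field positions (size in 512 KB units in EDX bits 31:18, associativity code in bits 15:12, line size in bits 7:0) are taken from the patch; decode_assoc mirrors decode_AMD_cache_L2_L3_assoc, and the EDX value here is made up for illustration.

#include <stdio.h>

static int decode_assoc(int bits_15_12)   /* same table as the patch */
{
   switch (bits_15_12 & 0xF) {
      case 1:   return 1;   case 2:   return 2;
      case 4:   return 4;   case 6:   return 8;
      case 8:   return 16;  case 0xA: return 32;
      case 0xB: return 48;  case 0xC: return 64;
      case 0xD: return 96;  case 0xE: return 128;
      default:  return 1;   /* disabled or fully associative: fall back */
   }
}

int main(void)
{
   unsigned L3i = 0x00306040;   /* hypothetical CPUID 0x80000006 EDX */
   int size_kb   = ((L3i >> 18) & 0x3fff) * 512;
   int assoc     = decode_assoc((L3i >> 12) & 0xf);
   int line_size = L3i & 0xff;
   printf("L3: %d KB, %d-way, %d B lines\n", size_kb, assoc, line_size);
   return 0;   /* prints: L3: 6144 KB, 8-way, 64 B lines */
}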
diff --git a/cachegrind/cg_arch.h b/cachegrind/cg_arch.h
index 7a8d17103..23f1a2cd8 100644
--- a/cachegrind/cg_arch.h
+++ b/cachegrind/cg_arch.h
@@ -33,14 +33,14 @@
// For cache simulation
typedef struct {
- int size; // bytes
- int assoc;
- int line_size; // bytes
+ Int size; // bytes
+ Int assoc;
+ Int line_size; // bytes
} cache_t;
-// Gives the configuration of I1, D1 and L2 caches. They get overridden
+// Gives the configuration of I1, D1 and LL caches. They get overridden
// by any cache configurations specified on the command line.
-void VG_(configure_caches)(cache_t* I1c, cache_t* D1c, cache_t* L2c,
+void VG_(configure_caches)(cache_t* I1c, cache_t* D1c, cache_t* LLc,
Bool all_caches_clo_defined);
#endif // __CG_ARCH_H
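One property the manual change later in this patch spells out: a cache configuration is only usable for simulation when its number of sets is a power of two. The real validation lives in check_cache() in cg_main.c; this sketch is just the arithmetic.

#include <stdbool.h>

/* Sets = size / (assoc * line_size); must be a positive power of two. */
static bool sets_power_of_two(int size, int assoc, int line_size)
{
   int sets = size / (assoc * line_size);
   return sets > 0 && (sets & (sets - 1)) == 0;
}
/* e.g. the 262144 B, 8-way, 64 B LL default gives 512 sets: valid. */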
diff --git a/cachegrind/cg_main.c b/cachegrind/cg_main.c
index 84341d477..ecdd706f0 100644
--- a/cachegrind/cg_main.c
+++ b/cachegrind/cg_main.c
@@ -77,7 +77,7 @@ typedef
struct {
ULong a; /* total # memory accesses of this kind */
ULong m1; /* misses in the first level cache */
- ULong m2; /* misses in the second level cache */
+ ULong mL; /* misses in the second level cache */
}
CacheCC;
@@ -268,13 +268,13 @@ static LineCC* get_lineCC(Addr origAddr)
lineCC->loc.line = loc.line;
lineCC->Ir.a = 0;
lineCC->Ir.m1 = 0;
- lineCC->Ir.m2 = 0;
+ lineCC->Ir.mL = 0;
lineCC->Dr.a = 0;
lineCC->Dr.m1 = 0;
- lineCC->Dr.m2 = 0;
+ lineCC->Dr.mL = 0;
lineCC->Dw.a = 0;
lineCC->Dw.m1 = 0;
- lineCC->Dw.m2 = 0;
+ lineCC->Dw.mL = 0;
lineCC->Bc.b = 0;
lineCC->Bc.mp = 0;
lineCC->Bi.b = 0;
@@ -319,7 +319,7 @@ void log_1I_0D_cache_access(InstrInfo* n)
//VG_(printf)("1I_0D : CCaddr=0x%010lx, iaddr=0x%010lx, isize=%lu\n",
// n, n->instr_addr, n->instr_len);
cachesim_I1_doref(n->instr_addr, n->instr_len,
- &n->parent->Ir.m1, &n->parent->Ir.m2);
+ &n->parent->Ir.m1, &n->parent->Ir.mL);
n->parent->Ir.a++;
}
@@ -331,10 +331,10 @@ void log_2I_0D_cache_access(InstrInfo* n, InstrInfo* n2)
// n, n->instr_addr, n->instr_len,
// n2, n2->instr_addr, n2->instr_len);
cachesim_I1_doref(n->instr_addr, n->instr_len,
- &n->parent->Ir.m1, &n->parent->Ir.m2);
+ &n->parent->Ir.m1, &n->parent->Ir.mL);
n->parent->Ir.a++;
cachesim_I1_doref(n2->instr_addr, n2->instr_len,
- &n2->parent->Ir.m1, &n2->parent->Ir.m2);
+ &n2->parent->Ir.m1, &n2->parent->Ir.mL);
n2->parent->Ir.a++;
}
@@ -348,13 +348,13 @@ void log_3I_0D_cache_access(InstrInfo* n, InstrInfo* n2, InstrInfo* n3)
// n2, n2->instr_addr, n2->instr_len,
// n3, n3->instr_addr, n3->instr_len);
cachesim_I1_doref(n->instr_addr, n->instr_len,
- &n->parent->Ir.m1, &n->parent->Ir.m2);
+ &n->parent->Ir.m1, &n->parent->Ir.mL);
n->parent->Ir.a++;
cachesim_I1_doref(n2->instr_addr, n2->instr_len,
- &n2->parent->Ir.m1, &n2->parent->Ir.m2);
+ &n2->parent->Ir.m1, &n2->parent->Ir.mL);
n2->parent->Ir.a++;
cachesim_I1_doref(n3->instr_addr, n3->instr_len,
- &n3->parent->Ir.m1, &n3->parent->Ir.m2);
+ &n3->parent->Ir.m1, &n3->parent->Ir.mL);
n3->parent->Ir.a++;
}
@@ -365,11 +365,11 @@ void log_1I_1Dr_cache_access(InstrInfo* n, Addr data_addr, Word data_size)
// " daddr=0x%010lx, dsize=%lu\n",
// n, n->instr_addr, n->instr_len, data_addr, data_size);
cachesim_I1_doref(n->instr_addr, n->instr_len,
- &n->parent->Ir.m1, &n->parent->Ir.m2);
+ &n->parent->Ir.m1, &n->parent->Ir.mL);
n->parent->Ir.a++;
cachesim_D1_doref(data_addr, data_size,
- &n->parent->Dr.m1, &n->parent->Dr.m2);
+ &n->parent->Dr.m1, &n->parent->Dr.mL);
n->parent->Dr.a++;
}
@@ -380,11 +380,11 @@ void log_1I_1Dw_cache_access(InstrInfo* n, Addr data_addr, Word data_size)
// " daddr=0x%010lx, dsize=%lu\n",
// n, n->instr_addr, n->instr_len, data_addr, data_size);
cachesim_I1_doref(n->instr_addr, n->instr_len,
- &n->parent->Ir.m1, &n->parent->Ir.m2);
+ &n->parent->Ir.m1, &n->parent->Ir.mL);
n->parent->Ir.a++;
cachesim_D1_doref(data_addr, data_size,
- &n->parent->Dw.m1, &n->parent->Dw.m2);
+ &n->parent->Dw.m1, &n->parent->Dw.mL);
n->parent->Dw.a++;
}
@@ -394,7 +394,7 @@ void log_0I_1Dr_cache_access(InstrInfo* n, Addr data_addr, Word data_size)
//VG_(printf)("0I_1Dr: CCaddr=0x%010lx, daddr=0x%010lx, dsize=%lu\n",
// n, data_addr, data_size);
cachesim_D1_doref(data_addr, data_size,
- &n->parent->Dr.m1, &n->parent->Dr.m2);
+ &n->parent->Dr.m1, &n->parent->Dr.mL);
n->parent->Dr.a++;
}
@@ -404,7 +404,7 @@ void log_0I_1Dw_cache_access(InstrInfo* n, Addr data_addr, Word data_size)
//VG_(printf)("0I_1Dw: CCaddr=0x%010lx, daddr=0x%010lx, dsize=%lu\n",
// n, data_addr, data_size);
cachesim_D1_doref(data_addr, data_size,
- &n->parent->Dw.m1, &n->parent->Dw.m2);
+ &n->parent->Dw.m1, &n->parent->Dw.mL);
n->parent->Dw.a++;
}
@@ -1234,7 +1234,7 @@ IRSB* cg_instrument ( VgCallbackClosure* closure,
static cache_t clo_I1_cache = UNDEFINED_CACHE;
static cache_t clo_D1_cache = UNDEFINED_CACHE;
-static cache_t clo_L2_cache = UNDEFINED_CACHE;
+static cache_t clo_LL_cache = UNDEFINED_CACHE;
// Checks cache config is ok. Returns NULL if ok, or a pointer to an error
// string otherwise.
@@ -1273,7 +1273,7 @@ static Char* check_cache(cache_t* cache)
}
static
-void configure_caches(cache_t* I1c, cache_t* D1c, cache_t* L2c)
+void configure_caches(cache_t* I1c, cache_t* D1c, cache_t* LLc)
{
#define DEFINED(L) (-1 != L.size || -1 != L.assoc || -1 != L.line_size)
@@ -1283,22 +1283,22 @@ void configure_caches(cache_t* I1c, cache_t* D1c, cache_t* L2c)
Bool all_caches_clo_defined =
(DEFINED(clo_I1_cache) &&
DEFINED(clo_D1_cache) &&
- DEFINED(clo_L2_cache));
+ DEFINED(clo_LL_cache));
// Set the cache config (using auto-detection, if supported by the
// architecture).
- VG_(configure_caches)( I1c, D1c, L2c, all_caches_clo_defined );
+ VG_(configure_caches)( I1c, D1c, LLc, all_caches_clo_defined );
// Check the default/auto-detected values.
checkRes = check_cache(I1c); tl_assert(!checkRes);
checkRes = check_cache(D1c); tl_assert(!checkRes);
- checkRes = check_cache(L2c); tl_assert(!checkRes);
+ checkRes = check_cache(LLc); tl_assert(!checkRes);
// Then replace with any defined on the command line. (Already checked in
// parse_cache_opt().)
if (DEFINED(clo_I1_cache)) { *I1c = clo_I1_cache; }
if (DEFINED(clo_D1_cache)) { *D1c = clo_D1_cache; }
- if (DEFINED(clo_L2_cache)) { *L2c = clo_L2_cache; }
+ if (DEFINED(clo_LL_cache)) { *LLc = clo_LL_cache; }
if (VG_(clo_verbosity) >= 2) {
VG_(umsg)("Cache configuration used:\n");
@@ -1306,8 +1306,8 @@ void configure_caches(cache_t* I1c, cache_t* D1c, cache_t* L2c)
I1c->size, I1c->assoc, I1c->line_size);
VG_(umsg)(" D1: %dB, %d-way, %dB lines\n",
D1c->size, D1c->assoc, D1c->line_size);
- VG_(umsg)(" L2: %dB, %d-way, %dB lines\n",
- L2c->size, L2c->assoc, L2c->line_size);
+ VG_(umsg)(" LL: %dB, %d-way, %dB lines\n",
+ LLc->size, LLc->assoc, LLc->line_size);
}
#undef CMD_LINE_DEFINED
}
@@ -1354,12 +1354,12 @@ static void fprint_CC_table_and_calc_totals(void)
VG_(free)(cachegrind_out_file);
}
- // "desc:" lines (giving I1/D1/L2 cache configuration). The spaces after
+ // "desc:" lines (giving I1/D1/LL cache configuration). The spaces after
// the 2nd colon makes cg_annotate's output look nicer.
VG_(sprintf)(buf, "desc: I1 cache: %s\n"
"desc: D1 cache: %s\n"
- "desc: L2 cache: %s\n",
- I1.desc_line, D1.desc_line, L2.desc_line);
+ "desc: LL cache: %s\n",
+ I1.desc_line, D1.desc_line, LL.desc_line);
VG_(write)(fd, (void*)buf, VG_(strlen)(buf));
// "cmd:" line
@@ -1379,11 +1379,11 @@ static void fprint_CC_table_and_calc_totals(void)
}
// "events:" line
if (clo_cache_sim && clo_branch_sim) {
- VG_(sprintf)(buf, "\nevents: Ir I1mr I2mr Dr D1mr D2mr Dw D1mw D2mw "
+ VG_(sprintf)(buf, "\nevents: Ir I1mr ILmr Dr D1mr DLmr Dw D1mw DLmw "
"Bc Bcm Bi Bim\n");
}
else if (clo_cache_sim && !clo_branch_sim) {
- VG_(sprintf)(buf, "\nevents: Ir I1mr I2mr Dr D1mr D2mr Dw D1mw D2mw "
+ VG_(sprintf)(buf, "\nevents: Ir I1mr ILmr Dr D1mr DLmr Dw D1mw DLmw "
"\n");
}
else if (!clo_cache_sim && clo_branch_sim) {
@@ -1430,9 +1430,9 @@ static void fprint_CC_table_and_calc_totals(void)
" %llu %llu %llu"
" %llu %llu %llu %llu\n",
lineCC->loc.line,
- lineCC->Ir.a, lineCC->Ir.m1, lineCC->Ir.m2,
- lineCC->Dr.a, lineCC->Dr.m1, lineCC->Dr.m2,
- lineCC->Dw.a, lineCC->Dw.m1, lineCC->Dw.m2,
+ lineCC->Ir.a, lineCC->Ir.m1, lineCC->Ir.mL,
+ lineCC->Dr.a, lineCC->Dr.m1, lineCC->Dr.mL,
+ lineCC->Dw.a, lineCC->Dw.m1, lineCC->Dw.mL,
lineCC->Bc.b, lineCC->Bc.mp,
lineCC->Bi.b, lineCC->Bi.mp);
}
@@ -1441,9 +1441,9 @@ static void fprint_CC_table_and_calc_totals(void)
" %llu %llu %llu"
" %llu %llu %llu\n",
lineCC->loc.line,
- lineCC->Ir.a, lineCC->Ir.m1, lineCC->Ir.m2,
- lineCC->Dr.a, lineCC->Dr.m1, lineCC->Dr.m2,
- lineCC->Dw.a, lineCC->Dw.m1, lineCC->Dw.m2);
+ lineCC->Ir.a, lineCC->Ir.m1, lineCC->Ir.mL,
+ lineCC->Dr.a, lineCC->Dr.m1, lineCC->Dr.mL,
+ lineCC->Dw.a, lineCC->Dw.m1, lineCC->Dw.mL);
}
else if (!clo_cache_sim && clo_branch_sim) {
VG_(sprintf)(buf, "%u %llu"
@@ -1464,13 +1464,13 @@ static void fprint_CC_table_and_calc_totals(void)
// Update summary stats
Ir_total.a += lineCC->Ir.a;
Ir_total.m1 += lineCC->Ir.m1;
- Ir_total.m2 += lineCC->Ir.m2;
+ Ir_total.mL += lineCC->Ir.mL;
Dr_total.a += lineCC->Dr.a;
Dr_total.m1 += lineCC->Dr.m1;
- Dr_total.m2 += lineCC->Dr.m2;
+ Dr_total.mL += lineCC->Dr.mL;
Dw_total.a += lineCC->Dw.a;
Dw_total.m1 += lineCC->Dw.m1;
- Dw_total.m2 += lineCC->Dw.m2;
+ Dw_total.mL += lineCC->Dw.mL;
Bc_total.b += lineCC->Bc.b;
Bc_total.mp += lineCC->Bc.mp;
Bi_total.b += lineCC->Bi.b;
@@ -1487,9 +1487,9 @@ static void fprint_CC_table_and_calc_totals(void)
" %llu %llu %llu"
" %llu %llu %llu"
" %llu %llu %llu %llu\n",
- Ir_total.a, Ir_total.m1, Ir_total.m2,
- Dr_total.a, Dr_total.m1, Dr_total.m2,
- Dw_total.a, Dw_total.m1, Dw_total.m2,
+ Ir_total.a, Ir_total.m1, Ir_total.mL,
+ Dr_total.a, Dr_total.m1, Dr_total.mL,
+ Dw_total.a, Dw_total.m1, Dw_total.mL,
Bc_total.b, Bc_total.mp,
Bi_total.b, Bi_total.mp);
}
@@ -1498,9 +1498,9 @@ static void fprint_CC_table_and_calc_totals(void)
" %llu %llu %llu"
" %llu %llu %llu"
" %llu %llu %llu\n",
- Ir_total.a, Ir_total.m1, Ir_total.m2,
- Dr_total.a, Dr_total.m1, Dr_total.m2,
- Dw_total.a, Dw_total.m1, Dw_total.m2);
+ Ir_total.a, Ir_total.m1, Ir_total.mL,
+ Dr_total.a, Dr_total.m1, Dr_total.mL,
+ Dw_total.a, Dw_total.m1, Dw_total.mL);
}
else if (!clo_cache_sim && clo_branch_sim) {
VG_(sprintf)(buf, "summary:"
@@ -1537,8 +1537,8 @@ static void cg_fini(Int exitcode)
CacheCC D_total;
BranchCC B_total;
- ULong L2_total_m, L2_total_mr, L2_total_mw,
- L2_total, L2_total_r, L2_total_w;
+ ULong LL_total_m, LL_total_mr, LL_total_mw,
+ LL_total, LL_total_r, LL_total_w;
Int l1, l2, l3;
fprint_CC_table_and_calc_totals();
@@ -1565,21 +1565,21 @@ static void cg_fini(Int exitcode)
miss numbers */
if (clo_cache_sim) {
VG_(umsg)(fmt, "I1 misses: ", Ir_total.m1);
- VG_(umsg)(fmt, "L2i misses: ", Ir_total.m2);
+ VG_(umsg)(fmt, "LLi misses: ", Ir_total.mL);
if (0 == Ir_total.a) Ir_total.a = 1;
VG_(percentify)(Ir_total.m1, Ir_total.a, 2, l1+1, buf1);
VG_(umsg)("I1 miss rate: %s\n", buf1);
- VG_(percentify)(Ir_total.m2, Ir_total.a, 2, l1+1, buf1);
- VG_(umsg)("L2i miss rate: %s\n", buf1);
+ VG_(percentify)(Ir_total.mL, Ir_total.a, 2, l1+1, buf1);
+ VG_(umsg)("LLi miss rate: %s\n", buf1);
VG_(umsg)("\n");
/* D cache results. Use the D_refs.rd and D_refs.wr values to
* determine the width of columns 2 & 3. */
D_total.a = Dr_total.a + Dw_total.a;
D_total.m1 = Dr_total.m1 + Dw_total.m1;
- D_total.m2 = Dr_total.m2 + Dw_total.m2;
+ D_total.mL = Dr_total.mL + Dw_total.mL;
/* Make format string, getting width right for numbers */
VG_(sprintf)(fmt, "%%s %%,%dllu (%%,%dllu rd + %%,%dllu wr)\n",
@@ -1589,8 +1589,8 @@ static void cg_fini(Int exitcode)
D_total.a, Dr_total.a, Dw_total.a);
VG_(umsg)(fmt, "D1 misses: ",
D_total.m1, Dr_total.m1, Dw_total.m1);
- VG_(umsg)(fmt, "L2d misses: ",
- D_total.m2, Dr_total.m2, Dw_total.m2);
+ VG_(umsg)(fmt, "LLd misses: ",
+ D_total.mL, Dr_total.mL, Dw_total.mL);
if (0 == D_total.a) D_total.a = 1;
if (0 == Dr_total.a) Dr_total.a = 1;
@@ -1600,30 +1600,30 @@ static void cg_fini(Int exitcode)
VG_(percentify)(Dw_total.m1, Dw_total.a, 1, l3+1, buf3);
VG_(umsg)("D1 miss rate: %s (%s + %s )\n", buf1, buf2,buf3);
- VG_(percentify)( D_total.m2, D_total.a, 1, l1+1, buf1);
- VG_(percentify)(Dr_total.m2, Dr_total.a, 1, l2+1, buf2);
- VG_(percentify)(Dw_total.m2, Dw_total.a, 1, l3+1, buf3);
- VG_(umsg)("L2d miss rate: %s (%s + %s )\n", buf1, buf2,buf3);
+ VG_(percentify)( D_total.mL, D_total.a, 1, l1+1, buf1);
+ VG_(percentify)(Dr_total.mL, Dr_total.a, 1, l2+1, buf2);
+ VG_(percentify)(Dw_total.mL, Dw_total.a, 1, l3+1, buf3);
+ VG_(umsg)("LLd miss rate: %s (%s + %s )\n", buf1, buf2,buf3);
VG_(umsg)("\n");
- /* L2 overall results */
+ /* LL overall results */
- L2_total = Dr_total.m1 + Dw_total.m1 + Ir_total.m1;
- L2_total_r = Dr_total.m1 + Ir_total.m1;
- L2_total_w = Dw_total.m1;
- VG_(umsg)(fmt, "L2 refs: ",
- L2_total, L2_total_r, L2_total_w);
+ LL_total = Dr_total.m1 + Dw_total.m1 + Ir_total.m1;
+ LL_total_r = Dr_total.m1 + Ir_total.m1;
+ LL_total_w = Dw_total.m1;
+ VG_(umsg)(fmt, "LL refs: ",
+ LL_total, LL_total_r, LL_total_w);
- L2_total_m = Dr_total.m2 + Dw_total.m2 + Ir_total.m2;
- L2_total_mr = Dr_total.m2 + Ir_total.m2;
- L2_total_mw = Dw_total.m2;
- VG_(umsg)(fmt, "L2 misses: ",
- L2_total_m, L2_total_mr, L2_total_mw);
+ LL_total_m = Dr_total.mL + Dw_total.mL + Ir_total.mL;
+ LL_total_mr = Dr_total.mL + Ir_total.mL;
+ LL_total_mw = Dw_total.mL;
+ VG_(umsg)(fmt, "LL misses: ",
+ LL_total_m, LL_total_mr, LL_total_mw);
- VG_(percentify)(L2_total_m, (Ir_total.a + D_total.a), 1, l1+1, buf1);
- VG_(percentify)(L2_total_mr, (Ir_total.a + Dr_total.a), 1, l2+1, buf2);
- VG_(percentify)(L2_total_mw, Dw_total.a, 1, l3+1, buf3);
- VG_(umsg)("L2 miss rate: %s (%s + %s )\n", buf1, buf2,buf3);
+ VG_(percentify)(LL_total_m, (Ir_total.a + D_total.a), 1, l1+1, buf1);
+ VG_(percentify)(LL_total_mr, (Ir_total.a + Dr_total.a), 1, l2+1, buf2);
+ VG_(percentify)(LL_total_mw, Dw_total.a, 1, l3+1, buf3);
+ VG_(umsg)("LL miss rate: %s (%s + %s )\n", buf1, buf2,buf3);
}
/* If branch profiling is enabled, show branch overall results. */
@@ -1760,8 +1760,9 @@ static Bool cg_process_cmd_line_option(Char* arg)
parse_cache_opt(&clo_I1_cache, arg, tmp_str);
else if VG_STR_CLO(arg, "--D1", tmp_str)
parse_cache_opt(&clo_D1_cache, arg, tmp_str);
- else if VG_STR_CLO(arg, "--L2", tmp_str)
- parse_cache_opt(&clo_L2_cache, arg, tmp_str);
+ else if (VG_STR_CLO(arg, "--L2", tmp_str) || // for backwards compatibility
+ VG_STR_CLO(arg, "--LL", tmp_str))
+ parse_cache_opt(&clo_LL_cache, arg, tmp_str);
else if VG_STR_CLO( arg, "--cachegrind-out-file", clo_cachegrind_out_file) {}
else if VG_BOOL_CLO(arg, "--cache-sim", clo_cache_sim) {}
@@ -1777,7 +1778,7 @@ static void cg_print_usage(void)
VG_(printf)(
" --I1=<size>,<assoc>,<line_size> set I1 cache manually\n"
" --D1=<size>,<assoc>,<line_size> set D1 cache manually\n"
-" --L2=<size>,<assoc>,<line_size> set L2 cache manually\n"
+" --LL=<size>,<assoc>,<line_size> set LL cache manually\n"
" --cache-sim=yes|no [yes] collect cache stats?\n"
" --branch-sim=yes|no [no] collect branch prediction stats?\n"
" --cachegrind-out-file=<file> output file name [cachegrind.out.%%p]\n"
@@ -1819,7 +1820,7 @@ static void cg_pre_clo_init(void)
static void cg_post_clo_init(void)
{
- cache_t I1c, D1c, L2c;
+ cache_t I1c, D1c, LLc;
CC_table =
VG_(OSetGen_Create)(offsetof(LineCC, loc),
@@ -1837,11 +1838,11 @@ static void cg_post_clo_init(void)
VG_(malloc), "cg.main.cpci.3",
VG_(free));
- configure_caches(&I1c, &D1c, &L2c);
+ configure_caches(&I1c, &D1c, &LLc);
cachesim_I1_initcache(I1c);
cachesim_D1_initcache(D1c);
- cachesim_L2_initcache(L2c);
+ cachesim_LL_initcache(LLc);
}
VG_DETERMINE_INTERFACE_VERSION(cg_pre_clo_init)
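Note that --L2 is kept above as a backwards-compatible alias for the new --LL option. A self-contained sketch of that shape, where str_clo() is a hypothetical stand-in for Valgrind's VG_STR_CLO macro:

#include <string.h>

/* Hypothetical: matches "<opt>=<value>", returning the value or NULL. */
static const char* str_clo(const char* arg, const char* opt)
{
   size_t n = strlen(opt);
   return (strncmp(arg, opt, n) == 0 && arg[n] == '=') ? arg + n + 1
                                                       : NULL;
}

static const char* LL_cache_arg(const char* arg)
{
   const char* val;
   if ((val = str_clo(arg, "--L2")) != NULL)   /* old spelling, kept */
      return val;
   if ((val = str_clo(arg, "--LL")) != NULL)   /* new spelling */
      return val;
   return NULL;
}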
diff --git a/cachegrind/cg_sim.c b/cachegrind/cg_sim.c
index a55a1e4ca..0b8a1d7c9 100644
--- a/cachegrind/cg_sim.c
+++ b/cachegrind/cg_sim.c
@@ -96,7 +96,7 @@ static void cachesim_##L##_initcache(cache_t config) \
/* bigger than its usual limit. Inlining gains around 5--10% speedup. */ \
__attribute__((always_inline)) \
static __inline__ \
-void cachesim_##L##_doref(Addr a, UChar size, ULong* m1, ULong *m2) \
+void cachesim_##L##_doref(Addr a, UChar size, ULong* m1, ULong *mL) \
{ \
UInt set1 = ( a >> L.line_size_bits) & (L.sets_min_1); \
UInt set2 = ((a+size-1) >> L.line_size_bits) & (L.sets_min_1); \
@@ -188,9 +188,9 @@ miss_treatment: \
return; \
}
-CACHESIM(L2, (*m2)++ );
-CACHESIM(I1, { (*m1)++; cachesim_L2_doref(a, size, m1, m2); } );
-CACHESIM(D1, { (*m1)++; cachesim_L2_doref(a, size, m1, m2); } );
+CACHESIM(LL, (*mL)++ );
+CACHESIM(I1, { (*m1)++; cachesim_LL_doref(a, size, m1, mL); } );
+CACHESIM(D1, { (*m1)++; cachesim_LL_doref(a, size, m1, mL); } );
/*--------------------------------------------------------------------*/
/*--- end cg_sim.c ---*/
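The renamed CACHESIM instances implement a strict two-level accounting: a first-level miss bumps m1 and falls through to the last-level cache, whose misses bump mL. A sketch of that flow, with trivial stubs standing in for the macro-generated lookups:

typedef unsigned long long ULong;

/* Stubs standing in for the macro-generated cache lookups. */
static int D1_lookup_is_hit(unsigned long a, unsigned char sz)
   { (void)a; (void)sz; return 0; }
static int LL_lookup_is_hit(unsigned long a, unsigned char sz)
   { (void)a; (void)sz; return 0; }

static void D1_doref_sketch(unsigned long a, unsigned char size,
                            ULong* m1, ULong* mL)
{
   if (D1_lookup_is_hit(a, size))
      return;                      /* L1 hit: no miss counters change */
   (*m1)++;                        /* L1 miss */
   if (!LL_lookup_is_hit(a, size))
      (*mL)++;                     /* LL miss as well */
}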
diff --git a/cachegrind/docs/cg-manual.xml b/cachegrind/docs/cg-manual.xml
index b312771f9..b5a820b1c 100644
--- a/cachegrind/docs/cg-manual.xml
+++ b/cachegrind/docs/cg-manual.xml
@@ -16,33 +16,45 @@ Valgrind command line.</para>
<para>Cachegrind simulates how your program interacts with a machine's cache
hierarchy and (optionally) branch predictor. It simulates a machine with
-independent first level instruction and data caches (I1 and D1), backed by a
-unified second level cache (L2). This configuration is used by almost all
-modern machines.</para>
+independent first-level instruction and data caches (I1 and D1), backed by a
+unified second-level cache (L2). This exactly matches the configuration of
+many modern machines.</para>
+
+<para>However, some modern machines have three levels of cache. For these
+machines (in the cases where Cachegrind can auto-detect the cache
+configuration) Cachegrind simulates the first-level and third-level caches.
+The reason for this choice is that the L3 cache has the most influence on
+runtime, as it masks accesses to main memory. Furthermore, the L1 caches
+often have low associativity, so simulating them can detect cases where the
+code interacts badly with this cache (eg. traversing a matrix column-wise
+with the row length being a power of 2).</para>
+
+<para>Therefore, Cachegrind always refers to the I1, D1 and LL (last-level)
+caches.</para>
<para>
-It gathers the following statistics (abbreviations used for each statistic
+Cachegrind gathers the following statistics (the abbreviation used for each statistic
is given in parentheses):</para>
<itemizedlist>
<listitem>
<para>I cache reads (<computeroutput>Ir</computeroutput>,
which equals the number of instructions executed),
I1 cache read misses (<computeroutput>I1mr</computeroutput>) and
- L2 cache instruction read misses (<computeroutput>I1mr</computeroutput>).
+ LL cache instruction read misses (<computeroutput>ILmr</computeroutput>).
</para>
</listitem>
<listitem>
<para>D cache reads (<computeroutput>Dr</computeroutput>, which
equals the number of memory reads),
D1 cache read misses (<computeroutput>D1mr</computeroutput>), and
- L2 cache data read misses (<computeroutput>D2mr</computeroutput>).
+ LL cache data read misses (<computeroutput>DLmr</computeroutput>).
</para>
</listitem>
<listitem>
<para>D cache writes (<computeroutput>Dw</computeroutput>, which equals
the number of memory writes),
D1 cache write misses (<computeroutput>D1mw</computeroutput>), and
- L2 cache data write misses (<computeroutput>D2mw</computeroutput>).
+ LL cache data write misses (<computeroutput>DLmw</computeroutput>).
</para>
</listitem>
<listitem>
@@ -59,10 +71,10 @@ is given in parentheses):</para>
<para>Note that D1 total accesses is given by
<computeroutput>D1mr</computeroutput> +
-<computeroutput>D1mw</computeroutput>, and that L2 total
-accesses is given by <computeroutput>I2mr</computeroutput> +
-<computeroutput>D2mr</computeroutput> +
-<computeroutput>D2mw</computeroutput>.
+<computeroutput>D1mw</computeroutput>, and that LL total
+accesses is given by <computeroutput>ILmr</computeroutput> +
+<computeroutput>DLmr</computeroutput> +
+<computeroutput>DLmw</computeroutput>.
</para>
<para>These statistics are presented for the entire program and for each
@@ -70,7 +82,7 @@ function in the program. You can also annotate each line of source code in
the program with the counts that were caused directly by it.</para>
<para>On a modern machine, an L1 miss will typically cost
-around 10 cycles, an L2 miss can cost as much as 200
+around 10 cycles, an LL miss can cost as much as 200
cycles, and a mispredicted branch costs in the region of 10
to 30 cycles. Detailed cache and branch profiling can be very useful
for understanding how your program interacts with the machine and thus how
@@ -118,24 +130,24 @@ summary statistics that look like this will be printed:</para>
<programlisting><![CDATA[
==31751== I refs: 27,742,716
==31751== I1 misses: 276
-==31751== L2i misses: 275
+==31751== LLi misses: 275
==31751== I1 miss rate: 0.0%
-==31751== L2i miss rate: 0.0%
+==31751== LLi miss rate: 0.0%
==31751==
==31751== D refs: 15,430,290 (10,955,517 rd + 4,474,773 wr)
==31751== D1 misses: 41,185 ( 21,905 rd + 19,280 wr)
-==31751== L2d misses: 23,085 ( 3,987 rd + 19,098 wr)
+==31751== LLd misses: 23,085 ( 3,987 rd + 19,098 wr)
==31751== D1 miss rate: 0.2% ( 0.1% + 0.4%)
-==31751== L2d miss rate: 0.1% ( 0.0% + 0.4%)
+==31751== LLd miss rate: 0.1% ( 0.0% + 0.4%)
==31751==
-==31751== L2 misses: 23,360 ( 4,262 rd + 19,098 wr)
-==31751== L2 miss rate: 0.0% ( 0.0% + 0.4%)]]></programlisting>
+==31751== LL misses: 23,360 ( 4,262 rd + 19,098 wr)
+==31751== LL miss rate: 0.0% ( 0.0% + 0.4%)]]></programlisting>
<para>Cache accesses for instruction fetches are summarised
first, giving the number of fetches made (this is the number of
instructions executed, which can be useful to know in its own
-right), the number of I1 misses, and the number of L2 instruction
-(<computeroutput>L2i</computeroutput>) misses.</para>
+right), the number of I1 misses, and the number of LL instruction
+(<computeroutput>LLi</computeroutput>) misses.</para>
<para>Cache accesses for data follow. The information is similar
to that of the instruction fetches, except that the values are
@@ -144,12 +156,12 @@ also shown split between reads and writes (note each row's
<computeroutput>wr</computeroutput> values add up to the row's
total).</para>
-<para>Combined instruction and data figures for the L2 cache
-follow that. Note that the L2 miss rate is computed relative to the total
+<para>Combined instruction and data figures for the LL cache
+follow that. Note that the LL miss rate is computed relative to the total
number of memory accesses, not the number of L1 misses. I.e. it is
-<computeroutput>(I2mr + D2mr + D2mw) / (Ir + Dr + Dw)</computeroutput>
+<computeroutput>(ILmr + DLmr + DLmw) / (Ir + Dr + Dw)</computeroutput>
not
-<computeroutput>(I2mr + D2mr + D2mw) / (I1mr + D1mr + D1mw)</computeroutput>
+<computeroutput>(ILmr + DLmr + DLmw) / (I1mr + D1mr + D1mw)</computeroutput>
</para>
<para>Branch prediction statistics are not collected by default.
@@ -208,11 +220,11 @@ wide if possible, as the output lines can be quite long.</para>
--------------------------------------------------------------------------------
I1 cache: 65536 B, 64 B, 2-way associative
D1 cache: 65536 B, 64 B, 2-way associative
-L2 cache: 262144 B, 64 B, 8-way associative
+LL cache: 262144 B, 64 B, 8-way associative
Command: concord vg_to_ucode.c
-Events recorded: Ir I1mr I2mr Dr D1mr D2mr Dw D1mw D2mw
-Events shown: Ir I1mr I2mr Dr D1mr D2mr Dw D1mw D2mw
-Event sort order: Ir I1mr I2mr Dr D1mr D2mr Dw D1mw D2mw
+Events recorded: Ir I1mr ILmr Dr D1mr DLmr Dw D1mw DLmw
+Events shown: Ir I1mr ILmr Dr D1mr DLmr Dw D1mw DLmw
+Event sort order: Ir I1mr ILmr Dr D1mr DLmr Dw D1mw DLmw
Threshold: 99%
Chosen for annotation:
Auto-annotation: off
@@ -224,7 +236,7 @@ Auto-annotation: off
<itemizedlist>
<listitem>
- <para>I1 cache, D1 cache, L2 cache: cache configuration. So
+ <para>I1 cache, D1 cache, LL cache: cache configuration. So
you know the configuration with which these results were
obtained.</para>
</listitem>
@@ -300,7 +312,7 @@ program:</para>
<programlisting><![CDATA[
--------------------------------------------------------------------------------
-Ir I1mr I2mr Dr D1mr D2mr Dw D1mw D2mw
+Ir I1mr ILmr Dr D1mr DLmr Dw D1mw DLmw
--------------------------------------------------------------------------------
27,742,716 276 275 10,955,517 21,905 3,987 4,474,773 19,280 19,098 PROGRAM TOTALS]]></programlisting>
@@ -312,7 +324,7 @@ These are similar to the summary provided when Cachegrind finishes running.
<programlisting><![CDATA[
--------------------------------------------------------------------------------
-Ir I1mr I2mr Dr D1mr D2mr Dw D1mw D2mw file:function
+Ir I1mr ILmr Dr D1mr DLmr Dw D1mw DLmw file:function
--------------------------------------------------------------------------------
8,821,482 5 5 2,242,702 1,621 73 1,794,230 0 0 getc.c:_IO_getc
5,222,023 4 4 2,276,334 16 12 875,959 1 1 concord.c:get_word
@@ -367,7 +379,7 @@ produces the same output as above followed by an annotated version of
--------------------------------------------------------------------------------
-- User-annotated source: concord.c
--------------------------------------------------------------------------------
-Ir I1mr I2mr Dr D1mr D2mr Dw D1mw D2mw
+Ir I1mr ILmr Dr D1mr DLmr Dw D1mw DLmw
. . . . . . . . . void init_hash_table(char *file_name, Word_Node *table[])
3 1 1 . . . 1 0 0 {
@@ -687,7 +699,7 @@ programs. It does however check that the
<computeroutput>Events:</computeroutput> lines of all the inputs are
identical, so as to ensure that the addition of costs makes sense.
For example, it would be nonsensical for it to add a number indicating
-D1 read references to a number from a different file indicating L2
+D1 read references to a number from a different file indicating LL
write misses.</para>
<para>
@@ -746,7 +758,7 @@ programs. It does however check that the
<computeroutput>Events:</computeroutput> lines of all the inputs are
identical, so as to ensure that the addition of costs makes sense.
For example, it would be nonsensical for it to add a number indicating
-D1 read references to a number from a different file indicating L2
+D1 read references to a number from a different file indicating LL
write misses.</para>
<para>
@@ -810,12 +822,12 @@ this case.</para>
</listitem>
</varlistentry>
- <varlistentry id="opt.L2" xreflabel="--L2">
+ <varlistentry id="opt.LL" xreflabel="--LL">
<term>
- <option><![CDATA[--L2=<size>,<associativity>,<line size> ]]></option>
+ <option><![CDATA[--LL=<size>,<associativity>,<line size> ]]></option>
</term>
<listitem>
- <para>Specify the size, associativity and line size of the level 2
+ <para>Specify the size, associativity and line size of the last-level
cache.</para>
</listitem>
</varlistentry>
@@ -903,9 +915,9 @@ this case.</para>
order). Default is to use all present in the
<filename>cachegrind.out.&lt;pid&gt;</filename> file (and
use the order in the file). Useful if you want to concentrate on, for
- example, I cache misses (<option>--show=I1mr,I2mr</option>), or data
- read misses (<option>--show=D1mr,D2mr</option>), or L2 data misses
- (<option>--show=D2mr,D2mw</option>). Best used in conjunction with
+ example, I cache misses (<option>--show=I1mr,ILmr</option>), or data
+ read misses (<option>--show=D1mr,DLmr</option>), or LL data misses
+ (<option>--show=DLmr,DLmw</option>). Best used in conjunction with
<option>--sort</option>.</para>
</listitem>
</varlistentry>
@@ -935,9 +947,9 @@ this case.</para>
events by appending any events for the
<option>--sort</option> option with a colon
and a number (no spaces, though). E.g. if you want to see
- each function that covers more than 1% of L2 read misses or 1% of L2
+ each function that covers more than 1% of LL read misses or 1% of LL
write misses, use this option:</para>
- <para><option>--sort=D2mr:1,D2mw:1</option></para>
+ <para><option>--sort=DLmr:1,DLmw:1</option></para>
</listitem>
</varlistentry>
@@ -1059,13 +1071,13 @@ information, but they can still be very useful for identifying
bottlenecks.</para>
<para>
-After that, we have found that L2 misses are typically a much bigger source
+After that, we have found that LL misses are typically a much bigger source
of slow-downs than L1 misses. So it's worth looking for any snippets of
-code with high <computeroutput>D2mr</computeroutput> or
-<computeroutput>D2mw</computeroutput> counts. (You can use
-<option>--show=D2mr
---sort=D2mr</option> with cg_annotate to focus just on
-<literal>D2mr</literal> counts, for example.) If you find any, it's still
+code with high <computeroutput>DLmr</computeroutput> or
+<computeroutput>DLmw</computeroutput> counts. (You can use
+<option>--show=DLmr
+--sort=DLmr</option> with cg_annotate to focus just on
+<literal>DLmr</literal> counts, for example.) If you find any, it's still
not always easy to work out how to improve things. You need to have a
reasonable understanding of how caches work, the principles of locality, and
your program's data access patterns. Improving things may require
@@ -1153,12 +1165,12 @@ follows:</para>
</listitem>
<listitem>
- <para>Inclusive L2 cache: the L2 cache typically replicates all
+ <para>Inclusive LL cache: the LL cache typically replicates all
the entries of the L1 caches, because fetching into L1 involves
- fetching into L2 first (this does not guarantee strict inclusiveness,
- as lines evicted from L2 still could reside in L1). This is
+ fetching into LL first (this does not guarantee strict inclusiveness,
+ as lines evicted from LL still could reside in L1). This is
standard on Pentium chips, but AMD Opterons, Athlons and Durons
- use an exclusive L2 cache that only holds
+ use an exclusive LL cache that only holds
blocks evicted from L1. Ditto most modern VIA CPUs.</para>
</listitem>
@@ -1172,10 +1184,10 @@ early incarnation that doesn't give any cache information, then
Cachegrind will fall back to using a default configuration (that
of a model 3/4 Athlon). Cachegrind will tell you if this
happens. You can manually specify one, two or all three levels
-(I1/D1/L2) of the cache from the command line using the
+(I1/D1/LL) of the cache from the command line using the
<option>--I1</option>,
<option>--D1</option> and
-<option>--L2</option> options.
+<option>--LL</option> options.
For cache parameters to be valid for simulation, the number
of sets (with associativity being the number of cache lines in
each set) has to be a power of two.</para>
@@ -1186,7 +1198,7 @@ determine the cache configuration, so you will
need to specify it with the
<option>--I1</option>,
<option>--D1</option> and
-<option>--L2</option> options.</para>
+<option>--LL</option> options.</para>
<para>Other noteworthy behaviour:</para>
diff --git a/cachegrind/tests/chdir.stderr.exp b/cachegrind/tests/chdir.stderr.exp
index 8eaf65446..e8084c12c 100644
--- a/cachegrind/tests/chdir.stderr.exp
+++ b/cachegrind/tests/chdir.stderr.exp
@@ -2,16 +2,16 @@
I refs:
I1 misses:
-L2i misses:
+LLi misses:
I1 miss rate:
-L2i miss rate:
+LLi miss rate:
D refs:
D1 misses:
-L2d misses:
+LLd misses:
D1 miss rate:
-L2d miss rate:
+LLd miss rate:
-L2 refs:
-L2 misses:
-L2 miss rate:
+LL refs:
+LL misses:
+LL miss rate:
diff --git a/cachegrind/tests/dlclose.stderr.exp b/cachegrind/tests/dlclose.stderr.exp
index 8eaf65446..e8084c12c 100644
--- a/cachegrind/tests/dlclose.stderr.exp
+++ b/cachegrind/tests/dlclose.stderr.exp
@@ -2,16 +2,16 @@
I refs:
I1 misses:
-L2i misses:
+LLi misses:
I1 miss rate:
-L2i miss rate:
+LLi miss rate:
D refs:
D1 misses:
-L2d misses:
+LLd misses:
D1 miss rate:
-L2d miss rate:
+LLd miss rate:
-L2 refs:
-L2 misses:
-L2 miss rate:
+LL refs:
+LL misses:
+LL miss rate:
diff --git a/cachegrind/tests/filter_stderr b/cachegrind/tests/filter_stderr
index 6ec44bf7a..6a14e344a 100755
--- a/cachegrind/tests/filter_stderr
+++ b/cachegrind/tests/filter_stderr
@@ -7,11 +7,11 @@ $dir/../../tests/filter_stderr_basic |
# Remove "Cachegrind, ..." line and the following copyright line.
sed "/^Cachegrind, a cache and branch-prediction profiler/ , /./ d" |
-# Remove numbers from I/D/L2 "refs:" lines
-perl -p -e 's/((I|D|L2) *refs:)[ 0-9,()+rdw]*$/\1/' |
+# Remove numbers from I/D/LL "refs:" lines
+perl -p -e 's/((I|D|LL) *refs:)[ 0-9,()+rdw]*$/\1/' |
-# Remove numbers from I1/D1/L2/L2i/L2d "misses:" and "miss rates:" lines
-perl -p -e 's/((I1|D1|L2|L2i|L2d) *(misses|miss rate):)[ 0-9,()+rdw%\.]*$/\1/' |
+# Remove numbers from I1/D1/LL/LLi/LLd "misses:" and "miss rates:" lines
+perl -p -e 's/((I1|D1|LL|LLi|LLd) *(misses|miss rate):)[ 0-9,()+rdw%\.]*$/\1/' |
# Remove CPUID warnings lines for P4s and other machines
sed "/warning: Pentium 4 with 12 KB micro-op instruction trace cache/d" |
diff --git a/cachegrind/tests/notpower2.stderr.exp b/cachegrind/tests/notpower2.stderr.exp
index 8eaf65446..e8084c12c 100644
--- a/cachegrind/tests/notpower2.stderr.exp
+++ b/cachegrind/tests/notpower2.stderr.exp
@@ -2,16 +2,16 @@
I refs:
I1 misses:
-L2i misses:
+LLi misses:
I1 miss rate:
-L2i miss rate:
+LLi miss rate:
D refs:
D1 misses:
-L2d misses:
+LLd misses:
D1 miss rate:
-L2d miss rate:
+LLd miss rate:
-L2 refs:
-L2 misses:
-L2 miss rate:
+LL refs:
+LL misses:
+LL miss rate:
diff --git a/cachegrind/tests/notpower2.vgtest b/cachegrind/tests/notpower2.vgtest
index 132cfe592..21caffe94 100644
--- a/cachegrind/tests/notpower2.vgtest
+++ b/cachegrind/tests/notpower2.vgtest
@@ -1,3 +1,3 @@
prog: ../../tests/true
-vgopts: --I1=32768,8,64 --D1=24576,6,64 --L2=3145728,12,64
+vgopts: --I1=32768,8,64 --D1=24576,6,64 --LL=3145728,12,64
cleanup: rm cachegrind.out.*
diff --git a/cachegrind/tests/wrap5.stderr.exp b/cachegrind/tests/wrap5.stderr.exp
index 8eaf65446..e8084c12c 100644
--- a/cachegrind/tests/wrap5.stderr.exp
+++ b/cachegrind/tests/wrap5.stderr.exp
@@ -2,16 +2,16 @@
I refs:
I1 misses:
-L2i misses:
+LLi misses:
I1 miss rate:
-L2i miss rate:
+LLi miss rate:
D refs:
D1 misses:
-L2d misses:
+LLd misses:
D1 miss rate:
-L2d miss rate:
+LLd miss rate:
-L2 refs:
-L2 misses:
-L2 miss rate:
+LL refs:
+LL misses:
+LL miss rate:
diff --git a/cachegrind/tests/x86/fpu-28-108.stderr.exp b/cachegrind/tests/x86/fpu-28-108.stderr.exp
index 8eaf65446..e8084c12c 100644
--- a/cachegrind/tests/x86/fpu-28-108.stderr.exp
+++ b/cachegrind/tests/x86/fpu-28-108.stderr.exp
@@ -2,16 +2,16 @@
I refs:
I1 misses:
-L2i misses:
+LLi misses:
I1 miss rate:
-L2i miss rate:
+LLi miss rate:
D refs:
D1 misses:
-L2d misses:
+LLd misses:
D1 miss rate:
-L2d miss rate:
+LLd miss rate:
-L2 refs:
-L2 misses:
-L2 miss rate:
+LL refs:
+LL misses:
+LL miss rate:
diff --git a/callgrind/docs/cl-format.xml b/callgrind/docs/cl-format.xml
index 97b3543cb..7fce3188b 100644
--- a/callgrind/docs/cl-format.xml
+++ b/callgrind/docs/cl-format.xml
@@ -414,7 +414,7 @@ for "Ir and "Dr".</para>
<para>This specifies various information for this dump. For some
types, the semantic is defined, but any description type is allowed.
Unknown types should be ignored.</para>
- <para>There are the types "I1 cache", "D1 cache", "L2 cache", which
+ <para>There are the types "I1 cache", "D1 cache", "LL cache", which
specify parameters used for the cache simulator. These are the only
types originally used by Cachegrind. Additionally, Callgrind uses
the following types: "Timerange" gives a rough range of the basic
@@ -457,7 +457,7 @@ for "Ir and "Dr".</para>
<para><command>I1mr</command>: Instruction Level 1 read cache miss</para>
</listitem>
<listitem>
- <para><command>I2mr</command>: Instruction Level 2 read cache miss</para>
+ <para><command>ILmr</command>: Instruction last-level read cache miss</para>
</listitem>
<listitem>
<para>...</para>
diff --git a/callgrind/docs/cl-manual.xml b/callgrind/docs/cl-manual.xml
index e2289ff55..3f8330eaf 100644
--- a/callgrind/docs/cl-manual.xml
+++ b/callgrind/docs/cl-manual.xml
@@ -933,9 +933,9 @@ Also see <xref linkend="cl-manual.cycles"/>.</para>
<para>Specify if you want to do full cache simulation. By default,
only instruction read accesses will be counted ("Ir").
With cache simulation, further event counters are enabled:
- Cache misses on instruction reads ("I1mr"/"I2mr"),
- data read accesses ("Dr") and related cache misses ("D1mr"/"D2mr"),
- data write accesses ("Dw") and related cache misses ("D1mw"/"D2mw").
+ Cache misses on instruction reads ("I1mr"/"ILmr"),
+ data read accesses ("Dr") and related cache misses ("D1mr"/"DLmr"),
+ data write accesses ("Dw") and related cache misses ("D1mw"/"DLmw").
For more information, see <xref linkend="cg-manual"/>.
</para>
</listitem>
@@ -972,13 +972,13 @@ Also see <xref linkend="cl-manual.cycles"/>.</para>
</term>
<listitem>
<para>Specify whether write-back behavior should be simulated, allowing
- to distinguish L2 caches misses with and without write backs.
+ to distinguish LL cache misses with and without write backs.
The cache model of Cachegrind/Callgrind does not specify write-through
vs. write-back behavior, and this also is not relevant for the number
of generated miss counts. However, with explicit write-back simulation
it can be decided whether a miss triggers not only the loading of a new
cache line, but also if a write back of a dirty cache line had to take
- place before. The new dirty miss events are I2dmr, D2dmr, and D2dmw,
+ place before. The new dirty miss events are ILdmr, DLdmr, and DLdmw,
for misses because of instruction read, data read, and data write,
respectively. As they produce two memory transactions, they should
account for a doubled time estimation in relation to a normal miss.
@@ -1016,13 +1016,13 @@ Also see <xref linkend="cl-manual.cycles"/>.</para>
bad access behavior). The new counters are defined in a way such
that worse behavior results in higher cost.
AcCost1 and AcCost2 are counters showing bad temporal locality
- for L1 and L2 caches, respectively. This is done by summing up
+ for L1 and LL caches, respectively. This is done by summing up
reciprocal values of the numbers of accesses of each cache line,
multiplied by 1000 (as only integer costs are allowed). E.g. for
a given source line with 5 read accesses, a value of 5000 AcCost
means that for every access, a new cache line was loaded and directly
evicted afterwards without further accesses. Similarly, SpLoss1/2
- shows bad spatial locality for L1 and L2 caches, respectively. It
+ shows bad spatial locality for L1 and LL caches, respectively. It
gives the <emphasis>spatial loss</emphasis> count of bytes which
 were loaded into cache but never accessed. It pinpoints code
accessing data in a way such that cache space is wasted. This hints
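The two formulas reduce to a few lines of integer arithmetic. The helper below is hypothetical, but it reproduces the manual's example of five single-use loads yielding an AcCost of 5000:

    #include <stdio.h>

    /* Per eviction: cost 1000/accesses (integer division), plus the
     * count of bytes loaded but never touched. Matches the formulas
     * in the text, not the tool's internal data structures. */
    static unsigned ac_cost, sp_loss;

    static void on_evict(unsigned accesses, unsigned used_bytes,
                         unsigned line_size)
    {
        ac_cost += 1000 / accesses;    /* 1 access -> 1000, 10 -> 100 */
        sp_loss += line_size - used_bytes;
    }

    int main(void)
    {
        for (int i = 0; i < 5; i++)    /* 5 loads, each used once: */
            on_evict(1, 4, 64);        /* AcCost 5000, SpLoss 300 */
        printf("AcCost=%u SpLoss=%u\n", ac_cost, sp_loss);
        return 0;
    }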
@@ -1059,12 +1059,12 @@ Also see <xref linkend="cl-manual.cycles"/>.</para>
</listitem>
</varlistentry>
- <varlistentry id="opt.L2" xreflabel="--L2">
+ <varlistentry id="opt.LL" xreflabel="--LL">
<term>
- <option><![CDATA[--L2=<size>,<associativity>,<line size> ]]></option>
+ <option><![CDATA[--LL=<size>,<associativity>,<line size> ]]></option>
</term>
<listitem>
- <para>Specify the size, associativity and line size of the level 2
+ <para>Specify the size, associativity and line size of the last-level
cache.</para>
</listitem>
</varlistentry>
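As a quick sanity check of such a spec: assuming the usual set-associative geometry, the set count follows from the three parameters as sets = size / (associativity * line size). For the --LL=3145728,12,64 configuration used by the test files further down this gives 4096 sets:

    #include <stdio.h>

    int main(void)
    {
        long size = 3145728, assoc = 12, line = 64;
        printf("sets = %ld\n", size / (assoc * line));   /* -> 4096 */
        return 0;
    }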
diff --git a/callgrind/sim.c b/callgrind/sim.c
index cb41d57d1..2b8cbe473 100644
--- a/callgrind/sim.c
+++ b/callgrind/sim.c
@@ -91,7 +91,7 @@ typedef struct {
* States of flat caches in our model.
* We use a 2-level hierarchy,
*/
-static cache_t2 I1, D1, L2;
+static cache_t2 I1, D1, LL;
/* Lower bits of cache tags are used as flags for a cache line */
#define CACHELINE_FLAGMASK (MIN_LINE_SIZE-1)
@@ -123,8 +123,8 @@ static Int off_I1_AcCost = 0;
static Int off_I1_SpLoss = 1;
static Int off_D1_AcCost = 0;
static Int off_D1_SpLoss = 1;
-static Int off_L2_AcCost = 2;
-static Int off_L2_SpLoss = 3;
+static Int off_LL_AcCost = 2;
+static Int off_LL_SpLoss = 3;
/* Cache access types */
typedef enum { Read = 0, Write = CACHELINE_DIRTY } RefType;
@@ -135,7 +135,7 @@ typedef enum { Hit = 0, Miss, MissDirty } CacheResult;
/* Result of a reference into a hierarchical cache model */
typedef enum {
L1_Hit,
- L2_Hit,
+ LL_Hit,
MemAccess,
WriteBackMemAccess } CacheModelResult;
@@ -231,7 +231,7 @@ static void print_cache(cache_t2* c)
/*------------------------------------------------------------*/
/*
- * Simple model: L1 & L2 Write Through
+ * Simple model: L1 & LL Write Through
 * Does not distinguish between read and write references
*
* Simulator functions:
@@ -305,7 +305,7 @@ static
CacheModelResult cachesim_I1_ref(Addr a, UChar size)
{
if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
- if ( cachesim_ref( &L2, a, size) == Hit ) return L2_Hit;
+ if ( cachesim_ref( &LL, a, size) == Hit ) return LL_Hit;
return MemAccess;
}
@@ -313,7 +313,7 @@ static
CacheModelResult cachesim_D1_ref(Addr a, UChar size)
{
if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
- if ( cachesim_ref( &L2, a, size) == Hit ) return L2_Hit;
+ if ( cachesim_ref( &LL, a, size) == Hit ) return LL_Hit;
return MemAccess;
}
@@ -323,7 +323,7 @@ CacheModelResult cachesim_D1_ref(Addr a, UChar size)
/*------------------------------------------------------------*/
/*
- * More complex model: L1 Write-through, L2 Write-back
+ * More complex model: L1 Write-through, LL Write-back
 * This needs to distinguish between read and write references.
*
* Simulator functions:
@@ -412,8 +412,8 @@ static
CacheModelResult cachesim_I1_Read(Addr a, UChar size)
{
if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
- switch( cachesim_ref_wb( &L2, Read, a, size) ) {
- case Hit: return L2_Hit;
+ switch( cachesim_ref_wb( &LL, Read, a, size) ) {
+ case Hit: return LL_Hit;
case Miss: return MemAccess;
default: break;
}
@@ -424,8 +424,8 @@ static
CacheModelResult cachesim_D1_Read(Addr a, UChar size)
{
if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
- switch( cachesim_ref_wb( &L2, Read, a, size) ) {
- case Hit: return L2_Hit;
+ switch( cachesim_ref_wb( &LL, Read, a, size) ) {
+ case Hit: return LL_Hit;
case Miss: return MemAccess;
default: break;
}
@@ -437,14 +437,14 @@ CacheModelResult cachesim_D1_Write(Addr a, UChar size)
{
if ( cachesim_ref( &D1, a, size) == Hit ) {
 /* Even for an L1 hit, the write-through L1 passes
- * the write to the L2 to make the L2 line dirty.
+ * the write to the LL to make the LL line dirty.
* But this causes no latency, so return the hit.
*/
- cachesim_ref_wb( &L2, Write, a, size);
+ cachesim_ref_wb( &LL, Write, a, size);
return L1_Hit;
}
- switch( cachesim_ref_wb( &L2, Write, a, size) ) {
- case Hit: return L2_Hit;
+ switch( cachesim_ref_wb( &LL, Write, a, size) ) {
+ case Hit: return LL_Hit;
case Miss: return MemAccess;
default: break;
}
@@ -479,10 +479,10 @@ void prefetch_clear(void)
* One stream can be detected per 4k page.
*/
static __inline__
-void prefetch_L2_doref(Addr a)
+void prefetch_LL_doref(Addr a)
{
UInt stream = (a >> PF_PAGEBITS) % PF_STREAMS;
- UInt block = ( a >> L2.line_size_bits);
+ UInt block = ( a >> LL.line_size_bits);
if (block != pf_lastblock[stream]) {
if (pf_seqblocks[stream] == 0) {
@@ -494,7 +494,7 @@ void prefetch_L2_doref(Addr a)
pf_seqblocks[stream]++;
if (pf_seqblocks[stream] >= 2) {
prefetch_up++;
- cachesim_ref(&L2, a + 5 * L2.line_size,1);
+ cachesim_ref(&LL, a + 5 * LL.line_size,1);
}
}
else pf_seqblocks[stream] = 0;
@@ -504,7 +504,7 @@ void prefetch_L2_doref(Addr a)
pf_seqblocks[stream]--;
if (pf_seqblocks[stream] <= -2) {
prefetch_down++;
- cachesim_ref(&L2, a - 5 * L2.line_size,1);
+ cachesim_ref(&LL, a - 5 * LL.line_size,1);
}
}
else pf_seqblocks[stream] = 0;
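In short, the simulated prefetcher above works per 4 KB page: once two consecutive LL-line-sized blocks of a page are touched in ascending order, it touches the line five LL lines ahead (five behind for a descending stream) via an extra reference into LL; no separate prefetch buffer is modelled.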
@@ -519,8 +519,8 @@ static
CacheModelResult prefetch_I1_ref(Addr a, UChar size)
{
if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
- prefetch_L2_doref(a);
- if ( cachesim_ref( &L2, a, size) == Hit ) return L2_Hit;
+ prefetch_LL_doref(a);
+ if ( cachesim_ref( &LL, a, size) == Hit ) return LL_Hit;
return MemAccess;
}
@@ -528,8 +528,8 @@ static
CacheModelResult prefetch_D1_ref(Addr a, UChar size)
{
if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
- prefetch_L2_doref(a);
- if ( cachesim_ref( &L2, a, size) == Hit ) return L2_Hit;
+ prefetch_LL_doref(a);
+ if ( cachesim_ref( &LL, a, size) == Hit ) return LL_Hit;
return MemAccess;
}
@@ -540,9 +540,9 @@ static
CacheModelResult prefetch_I1_Read(Addr a, UChar size)
{
if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
- prefetch_L2_doref(a);
- switch( cachesim_ref_wb( &L2, Read, a, size) ) {
- case Hit: return L2_Hit;
+ prefetch_LL_doref(a);
+ switch( cachesim_ref_wb( &LL, Read, a, size) ) {
+ case Hit: return LL_Hit;
case Miss: return MemAccess;
default: break;
}
@@ -553,9 +553,9 @@ static
CacheModelResult prefetch_D1_Read(Addr a, UChar size)
{
if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
- prefetch_L2_doref(a);
- switch( cachesim_ref_wb( &L2, Read, a, size) ) {
- case Hit: return L2_Hit;
+ prefetch_LL_doref(a);
+ switch( cachesim_ref_wb( &LL, Read, a, size) ) {
+ case Hit: return LL_Hit;
case Miss: return MemAccess;
default: break;
}
@@ -565,17 +565,17 @@ CacheModelResult prefetch_D1_Read(Addr a, UChar size)
static
CacheModelResult prefetch_D1_Write(Addr a, UChar size)
{
- prefetch_L2_doref(a);
+ prefetch_LL_doref(a);
if ( cachesim_ref( &D1, a, size) == Hit ) {
 /* Even for an L1 hit, the write-through L1 passes
- * the write to the L2 to make the L2 line dirty.
+ * the write to the LL to make the LL line dirty.
* But this causes no latency, so return the hit.
*/
- cachesim_ref_wb( &L2, Write, a, size);
+ cachesim_ref_wb( &LL, Write, a, size);
return L1_Hit;
}
- switch( cachesim_ref_wb( &L2, Write, a, size) ) {
- case Hit: return L2_Hit;
+ switch( cachesim_ref_wb( &LL, Write, a, size) ) {
+ case Hit: return LL_Hit;
case Miss: return MemAccess;
default: break;
}
@@ -736,7 +736,7 @@ static CacheModelResult cacheuse##_##L##_doRead(Addr a, UChar size) \
/* Second case: word straddles two lines. */ \
/* Nb: this is a fast way of doing ((set1+1) % L.sets) */ \
} else if (((set1 + 1) & (L.sets-1)) == set2) { \
- Int miss1=0, miss2=0; /* 0: L1 hit, 1:L1 miss, 2:L2 miss */ \
+ Int miss1=0, miss2=0; /* 0: L1 hit, 1:L1 miss, 2:LL miss */ \
set = &(L.tags[set1 * L.assoc]); \
use_mask = L.line_start_mask[a & L.line_size_mask]; \
if (tag == (set[0] & L.tag_mask)) { \
@@ -809,7 +809,7 @@ block2: \
idx = (set2 * L.assoc) + tmp_tag; \
miss2 = update_##L##_use(&L, idx, \
use_mask, (a+size-1) &~ L.line_size_mask); \
- return (miss1==MemAccess || miss2==MemAccess) ? MemAccess:L2_Hit; \
+ return (miss1==MemAccess || miss2==MemAccess) ? MemAccess:LL_Hit; \
\
} else { \
VG_(printf)("addr: %#lx size: %u sets: %d %d", a, size, set1, set2); \
@@ -837,13 +837,13 @@ static __inline__ unsigned int countBits(unsigned int bits)
return c;
}
-static void update_L2_use(int idx, Addr memline)
+static void update_LL_use(int idx, Addr memline)
{
- line_loaded* loaded = &(L2.loaded[idx]);
- line_use* use = &(L2.use[idx]);
- int i = ((32 - countBits(use->mask)) * L2.line_size)>>5;
+ line_loaded* loaded = &(LL.loaded[idx]);
+ line_use* use = &(LL.use[idx]);
+ int i = ((32 - countBits(use->mask)) * LL.line_size)>>5;
- CLG_DEBUG(2, " L2.miss [%d]: at %#lx accessing memline %#lx\n",
+ CLG_DEBUG(2, " LL.miss [%d]: at %#lx accessing memline %#lx\n",
idx, CLG_(bb_base) + current_ii->instr_offset, memline);
if (use->count>0) {
CLG_DEBUG(2, " old: used %d, loss bits %d (%08x) [line %#lx from %#lx]\n",
@@ -852,8 +852,8 @@ static void update_L2_use(int idx, Addr memline)
CLG_(current_state).collect, loaded->use_base);
if (CLG_(current_state).collect && loaded->use_base) {
- (loaded->use_base)[off_L2_AcCost] += 1000 / use->count;
- (loaded->use_base)[off_L2_SpLoss] += i;
+ (loaded->use_base)[off_LL_AcCost] += 1000 / use->count;
+ (loaded->use_base)[off_LL_SpLoss] += i;
}
}
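The loss computation above treats the 32-bit use mask as 32 equal chunks of the line. A self-contained sketch (the body of countBits is not visible in this hunk, so Kernighan's bit-count is assumed as an equivalent):

    #include <stdio.h>

    /* Assumed equivalent of countBits: clears the lowest set bit
     * per iteration, counting the set bits of the use mask. */
    static unsigned count_bits(unsigned bits)
    {
        unsigned c = 0;
        for (; bits; bits &= bits - 1) c++;
        return c;
    }

    int main(void)
    {
        unsigned mask = 0x000000ff;    /* 8 of 32 chunks were used */
        unsigned line_size = 64;
        /* as in update_LL_use: bytes loaded but never accessed */
        printf("loss = %u\n", (32 - count_bits(mask)) * line_size / 32);
        return 0;                      /* -> 48 of 64 bytes wasted */
    }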
@@ -868,53 +868,53 @@ static void update_L2_use(int idx, Addr memline)
}
static
-CacheModelResult cacheuse_L2_access(Addr memline, line_loaded* l1_loaded)
+CacheModelResult cacheuse_LL_access(Addr memline, line_loaded* l1_loaded)
{
- UInt setNo = (memline >> L2.line_size_bits) & (L2.sets_min_1);
- UWord* set = &(L2.tags[setNo * L2.assoc]);
- UWord tag = memline & L2.tag_mask;
+ UInt setNo = (memline >> LL.line_size_bits) & (LL.sets_min_1);
+ UWord* set = &(LL.tags[setNo * LL.assoc]);
+ UWord tag = memline & LL.tag_mask;
int i, j, idx;
UWord tmp_tag;
- CLG_DEBUG(6,"L2.Acc(Memline %#lx): Set %d\n", memline, setNo);
+ CLG_DEBUG(6,"LL.Acc(Memline %#lx): Set %d\n", memline, setNo);
- if (tag == (set[0] & L2.tag_mask)) {
- idx = (setNo * L2.assoc) + (set[0] & ~L2.tag_mask);
- l1_loaded->dep_use = &(L2.use[idx]);
+ if (tag == (set[0] & LL.tag_mask)) {
+ idx = (setNo * LL.assoc) + (set[0] & ~LL.tag_mask);
+ l1_loaded->dep_use = &(LL.use[idx]);
CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): => %08x, count %d\n",
- idx, L2.loaded[idx].memline, L2.loaded[idx].iaddr,
- L2.use[idx].mask, L2.use[idx].count);
- return L2_Hit;
+ idx, LL.loaded[idx].memline, LL.loaded[idx].iaddr,
+ LL.use[idx].mask, LL.use[idx].count);
+ return LL_Hit;
}
- for (i = 1; i < L2.assoc; i++) {
- if (tag == (set[i] & L2.tag_mask)) {
+ for (i = 1; i < LL.assoc; i++) {
+ if (tag == (set[i] & LL.tag_mask)) {
tmp_tag = set[i];
for (j = i; j > 0; j--) {
set[j] = set[j - 1];
}
set[0] = tmp_tag;
- idx = (setNo * L2.assoc) + (tmp_tag & ~L2.tag_mask);
- l1_loaded->dep_use = &(L2.use[idx]);
+ idx = (setNo * LL.assoc) + (tmp_tag & ~LL.tag_mask);
+ l1_loaded->dep_use = &(LL.use[idx]);
CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): => %08x, count %d\n",
- i, idx, L2.loaded[idx].memline, L2.loaded[idx].iaddr,
- L2.use[idx].mask, L2.use[idx].count);
- return L2_Hit;
+ i, idx, LL.loaded[idx].memline, LL.loaded[idx].iaddr,
+ LL.use[idx].mask, LL.use[idx].count);
+ return LL_Hit;
}
}
/* A miss; install this tag as MRU, shuffle rest down. */
- tmp_tag = set[L2.assoc - 1] & ~L2.tag_mask;
- for (j = L2.assoc - 1; j > 0; j--) {
+ tmp_tag = set[LL.assoc - 1] & ~LL.tag_mask;
+ for (j = LL.assoc - 1; j > 0; j--) {
set[j] = set[j - 1];
}
set[0] = tag | tmp_tag;
- idx = (setNo * L2.assoc) + tmp_tag;
- l1_loaded->dep_use = &(L2.use[idx]);
+ idx = (setNo * LL.assoc) + tmp_tag;
+ l1_loaded->dep_use = &(LL.use[idx]);
- update_L2_use(idx, memline);
+ update_LL_use(idx, memline);
return MemAccess;
}
@@ -943,7 +943,7 @@ static CacheModelResult update##_##L##_use(cache_t2* cache, int idx, \
(loaded->use_base)[off_##L##_AcCost] += 1000 / use->count; \
(loaded->use_base)[off_##L##_SpLoss] += c; \
\
- /* FIXME (?): L1/L2 line sizes must be equal ! */ \
+ /* FIXME (?): L1/LL line sizes must be equal ! */ \
loaded->dep_use->mask |= use->mask; \
loaded->dep_use->count += use->count; \
} \
@@ -957,8 +957,8 @@ static CacheModelResult update##_##L##_use(cache_t2* cache, int idx, \
CLG_(current_state).nonskipped->skipped : \
CLG_(cost_base) + current_ii->cost_offset; \
\
- if (memline == 0) return L2_Hit; \
- return cacheuse_L2_access(memline, loaded); \
+ if (memline == 0) return LL_Hit; \
+ return cacheuse_LL_access(memline, loaded); \
}
UPDATE_USE(I1);
@@ -991,10 +991,10 @@ void cacheuse_finish(void)
if (D1.loaded[i].use_base)
update_D1_use( &D1, i, 0,0);
- if (L2.use)
- for (i = 0; i < L2.sets * L2.assoc; i++)
- if (L2.loaded[i].use_base)
- update_L2_use(i, 0);
+ if (LL.use)
+ for (i = 0; i < LL.sets * LL.assoc; i++)
+ if (LL.loaded[i].use_base)
+ update_LL_use(i, 0);
}
@@ -1020,7 +1020,7 @@ void inc_costs(CacheModelResult r, ULong* c1, ULong* c2)
c2[2]++;
// fall through
- case L2_Hit:
+ case LL_Hit:
c1[1]++;
c2[1]++;
// fall through
@@ -1036,9 +1036,9 @@ Char* cacheRes(CacheModelResult r)
{
switch(r) {
case L1_Hit: return "L1 Hit ";
- case L2_Hit: return "L2 Hit ";
- case MemAccess: return "L2 Miss";
- case WriteBackMemAccess: return "L2 Miss (dirty)";
+ case LL_Hit: return "LL Hit ";
+ case MemAccess: return "LL Miss";
+ case WriteBackMemAccess: return "LL Miss (dirty)";
default:
tl_assert(0);
}
@@ -1268,7 +1268,7 @@ static void log_0I1Dw(InstrInfo* ii, Addr data_addr, Word data_size)
static cache_t clo_I1_cache = UNDEFINED_CACHE;
static cache_t clo_D1_cache = UNDEFINED_CACHE;
-static cache_t clo_L2_cache = UNDEFINED_CACHE;
+static cache_t clo_LL_cache = UNDEFINED_CACHE;
// Checks cache config is ok. Returns NULL if ok, or a pointer to an error
@@ -1308,7 +1308,7 @@ static Char* check_cache(cache_t* cache)
}
static
-void configure_caches(cache_t* I1c, cache_t* D1c, cache_t* L2c)
+void configure_caches(cache_t* I1c, cache_t* D1c, cache_t* LLc)
{
#define DEFINED(L) (-1 != L.size || -1 != L.assoc || -1 != L.line_size)
@@ -1317,30 +1317,30 @@ void configure_caches(cache_t* I1c, cache_t* D1c, cache_t* L2c)
Bool all_caches_clo_defined =
(DEFINED(clo_I1_cache) &&
DEFINED(clo_D1_cache) &&
- DEFINED(clo_L2_cache));
+ DEFINED(clo_LL_cache));
// Set the cache config (using auto-detection, if supported by the
// architecture).
- VG_(configure_caches)( I1c, D1c, L2c, all_caches_clo_defined );
+ VG_(configure_caches)( I1c, D1c, LLc, all_caches_clo_defined );
// Check the default/auto-detected values.
checkRes = check_cache(I1c); tl_assert(!checkRes);
checkRes = check_cache(D1c); tl_assert(!checkRes);
- checkRes = check_cache(L2c); tl_assert(!checkRes);
+ checkRes = check_cache(LLc); tl_assert(!checkRes);
// Then replace with any defined on the command line.
if (DEFINED(clo_I1_cache)) { *I1c = clo_I1_cache; }
if (DEFINED(clo_D1_cache)) { *D1c = clo_D1_cache; }
- if (DEFINED(clo_L2_cache)) { *L2c = clo_L2_cache; }
+ if (DEFINED(clo_LL_cache)) { *LLc = clo_LL_cache; }
if (VG_(clo_verbosity) > 1) {
- VG_(message)(Vg_UserMsg, "Cache configuration used:\n");
- VG_(message)(Vg_UserMsg, " I1: %dB, %d-way, %dB lines\n",
- I1c->size, I1c->assoc, I1c->line_size);
- VG_(message)(Vg_UserMsg, " D1: %dB, %d-way, %dB lines\n",
- D1c->size, D1c->assoc, D1c->line_size);
- VG_(message)(Vg_UserMsg, " L2: %dB, %d-way, %dB lines\n",
- L2c->size, L2c->assoc, L2c->line_size);
+ VG_(umsg)("Cache configuration used:\n");
+ VG_(umsg)(" I1: %dB, %d-way, %dB lines\n",
+ I1c->size, I1c->assoc, I1c->line_size);
+ VG_(umsg)(" D1: %dB, %d-way, %dB lines\n",
+ D1c->size, D1c->assoc, D1c->line_size);
+ VG_(umsg)(" LL: %dB, %d-way, %dB lines\n",
+ LLc->size, LLc->assoc, LLc->line_size);
}
#undef CMD_LINE_DEFINED
}
@@ -1350,7 +1350,7 @@ void configure_caches(cache_t* I1c, cache_t* D1c, cache_t* L2c)
static void cachesim_post_clo_init(void)
{
/* Cache configurations. */
- cache_t I1c, D1c, L2c;
+ cache_t I1c, D1c, LLc;
/* Initialize access handlers */
if (!CLG_(clo).simulate_cache) {
@@ -1374,15 +1374,15 @@ static void cachesim_post_clo_init(void)
}
/* Configuration of caches only needed with real cache simulation */
- configure_caches(&I1c, &D1c, &L2c);
+ configure_caches(&I1c, &D1c, &LLc);
I1.name = "I1";
D1.name = "D1";
- L2.name = "L2";
+ LL.name = "LL";
cachesim_initcache(I1c, &I1);
cachesim_initcache(D1c, &D1);
- cachesim_initcache(L2c, &L2);
+ cachesim_initcache(LLc, &LL);
/* the other cache simulators use the standard helpers
* with dispatching via simulator struct */
@@ -1463,7 +1463,7 @@ void cachesim_clear(void)
{
cachesim_clearcache(&I1);
cachesim_clearcache(&D1);
- cachesim_clearcache(&L2);
+ cachesim_clearcache(&LL);
prefetch_clear();
}
@@ -1474,7 +1474,7 @@ static void cachesim_getdesc(Char* buf)
Int p;
p = VG_(sprintf)(buf, "\ndesc: I1 cache: %s\n", I1.desc_line);
p += VG_(sprintf)(buf+p, "desc: D1 cache: %s\n", D1.desc_line);
- VG_(sprintf)(buf+p, "desc: L2 cache: %s\n", L2.desc_line);
+ VG_(sprintf)(buf+p, "desc: LL cache: %s\n", LL.desc_line);
}
static
@@ -1490,11 +1490,12 @@ void cachesim_print_opts(void)
" --cacheuse=no|yes Collect cache block use [no]\n"
" --I1=<size>,<assoc>,<line_size> set I1 cache manually\n"
" --D1=<size>,<assoc>,<line_size> set D1 cache manually\n"
-" --L2=<size>,<assoc>,<line_size> set L2 cache manually\n"
+" --LL=<size>,<assoc>,<line_size> set LL cache manually\n"
);
}
-static void parse_opt ( cache_t* cache, char* opt, Char* optval )
+static void parse_opt ( cache_t* cache,
+ char* opt, Char* optval, UChar kind )
{
Long i1, i2, i3;
Char* endptr;
@@ -1550,11 +1551,12 @@ static Bool cachesim_parse_opt(Char* arg)
}
else if VG_STR_CLO(arg, "--I1", tmp_str)
- parse_opt(&clo_I1_cache, arg, tmp_str);
+ parse_opt(&clo_I1_cache, arg, tmp_str, 'i');
else if VG_STR_CLO(arg, "--D1", tmp_str)
- parse_opt(&clo_D1_cache, arg, tmp_str);
- else if VG_STR_CLO(arg, "--L2", tmp_str)
- parse_opt(&clo_L2_cache, arg, tmp_str);
+ parse_opt(&clo_D1_cache, arg, tmp_str, '1');
+ else if (VG_STR_CLO(arg, "--L2", tmp_str) || // for backwards compatibility
+ VG_STR_CLO(arg, "--LL", tmp_str))
+ parse_opt(&clo_LL_cache, arg, tmp_str, '2');
else
return False;
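Note that the hunk above keeps --L2 as a backwards-compatible alias, so existing scripts keep working: --L2=3145728,12,64 and --LL=3145728,12,64 now configure the same last-level cache.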
@@ -1613,8 +1615,8 @@ static
void cachesim_printstat(Int l1, Int l2, Int l3)
{
FullCost total = CLG_(total_cost), D_total = 0;
- ULong L2_total_m, L2_total_mr, L2_total_mw,
- L2_total, L2_total_r, L2_total_w;
+ ULong LL_total_m, LL_total_mr, LL_total_mw,
+ LL_total, LL_total_r, LL_total_w;
char buf1[RESULTS_BUF_LEN],
buf2[RESULTS_BUF_LEN],
buf3[RESULTS_BUF_LEN];
@@ -1632,7 +1634,7 @@ void cachesim_printstat(Int l1, Int l2, Int l3)
VG_(message)(Vg_UserMsg, "I1 misses: %s\n", buf1);
commify(total[fullOffset(EG_IR) +2], l1, buf1);
- VG_(message)(Vg_UserMsg, "L2i misses: %s\n", buf1);
+ VG_(message)(Vg_UserMsg, "LLi misses: %s\n", buf1);
p = 100;
@@ -1645,7 +1647,7 @@ void cachesim_printstat(Int l1, Int l2, Int l3)
percentify(total[fullOffset(EG_IR)+2] * 100 * p /
total[fullOffset(EG_IR)], p, l1+1, buf1);
- VG_(message)(Vg_UserMsg, "L2i miss rate: %s\n", buf1);
+ VG_(message)(Vg_UserMsg, "LLi miss rate: %s\n", buf1);
VG_(message)(Vg_UserMsg, "\n");
/* D cache results.
@@ -1673,7 +1675,7 @@ void cachesim_printstat(Int l1, Int l2, Int l3)
commify( D_total[2], l1, buf1);
commify(total[fullOffset(EG_DR)+2], l2, buf2);
commify(total[fullOffset(EG_DW)+2], l3, buf3);
- VG_(message)(Vg_UserMsg, "L2d misses: %s (%s rd + %s wr)\n",
+ VG_(message)(Vg_UserMsg, "LLd misses: %s (%s rd + %s wr)\n",
buf1, buf2, buf3);
p = 10;
@@ -1695,50 +1697,50 @@ void cachesim_printstat(Int l1, Int l2, Int l3)
total[fullOffset(EG_DR)], p, l2+1, buf2);
percentify(total[fullOffset(EG_DW)+2] * 100 * p /
total[fullOffset(EG_DW)], p, l3+1, buf3);
- VG_(message)(Vg_UserMsg, "L2d miss rate: %s (%s + %s )\n",
+ VG_(message)(Vg_UserMsg, "LLd miss rate: %s (%s + %s )\n",
buf1, buf2,buf3);
VG_(message)(Vg_UserMsg, "\n");
- /* L2 overall results */
+ /* LL overall results */
- L2_total =
+ LL_total =
total[fullOffset(EG_DR) +1] +
total[fullOffset(EG_DW) +1] +
total[fullOffset(EG_IR) +1];
- L2_total_r =
+ LL_total_r =
total[fullOffset(EG_DR) +1] +
total[fullOffset(EG_IR) +1];
- L2_total_w = total[fullOffset(EG_DW) +1];
- commify(L2_total, l1, buf1);
- commify(L2_total_r, l2, buf2);
- commify(L2_total_w, l3, buf3);
- VG_(message)(Vg_UserMsg, "L2 refs: %s (%s rd + %s wr)\n",
+ LL_total_w = total[fullOffset(EG_DW) +1];
+ commify(LL_total, l1, buf1);
+ commify(LL_total_r, l2, buf2);
+ commify(LL_total_w, l3, buf3);
+ VG_(message)(Vg_UserMsg, "LL refs: %s (%s rd + %s wr)\n",
buf1, buf2, buf3);
- L2_total_m =
+ LL_total_m =
total[fullOffset(EG_DR) +2] +
total[fullOffset(EG_DW) +2] +
total[fullOffset(EG_IR) +2];
- L2_total_mr =
+ LL_total_mr =
total[fullOffset(EG_DR) +2] +
total[fullOffset(EG_IR) +2];
- L2_total_mw = total[fullOffset(EG_DW) +2];
- commify(L2_total_m, l1, buf1);
- commify(L2_total_mr, l2, buf2);
- commify(L2_total_mw, l3, buf3);
- VG_(message)(Vg_UserMsg, "L2 misses: %s (%s rd + %s wr)\n",
+ LL_total_mw = total[fullOffset(EG_DW) +2];
+ commify(LL_total_m, l1, buf1);
+ commify(LL_total_mr, l2, buf2);
+ commify(LL_total_mw, l3, buf3);
+ VG_(message)(Vg_UserMsg, "LL misses: %s (%s rd + %s wr)\n",
buf1, buf2, buf3);
- percentify(L2_total_m * 100 * p /
+ percentify(LL_total_m * 100 * p /
(total[fullOffset(EG_IR)] + D_total[0]), p, l1+1, buf1);
- percentify(L2_total_mr * 100 * p /
+ percentify(LL_total_mr * 100 * p /
(total[fullOffset(EG_IR)] + total[fullOffset(EG_DR)]),
p, l2+1, buf2);
- percentify(L2_total_mw * 100 * p /
+ percentify(LL_total_mw * 100 * p /
total[fullOffset(EG_DW)], p, l3+1, buf3);
- VG_(message)(Vg_UserMsg, "L2 miss rate: %s (%s + %s )\n",
+ VG_(message)(Vg_UserMsg, "LL miss rate: %s (%s + %s )\n",
buf1, buf2,buf3);
}
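Stripped of the formatting helpers, the LL statistics printed above reduce to simple ratios (offset +1 into an event group is the first-level miss count, +2 the last-level miss count):

    LL refs      = I1mr + D1mr + D1mw        (every L1 miss is an LL access)
    LL misses    = ILmr + DLmr + DLmw
    LL miss rate = (ILmr + DLmr + DLmw) / (Ir + Dr + Dw)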
@@ -1760,14 +1762,14 @@ void CLG_(init_eventsets)()
if (!CLG_(clo).simulate_cache)
CLG_(register_event_group)(EG_IR, "Ir");
else if (!clo_simulate_writeback) {
- CLG_(register_event_group3)(EG_IR, "Ir", "I1mr", "I2mr");
- CLG_(register_event_group3)(EG_DR, "Dr", "D1mr", "D2mr");
- CLG_(register_event_group3)(EG_DW, "Dw", "D1mw", "D2mw");
+ CLG_(register_event_group3)(EG_IR, "Ir", "I1mr", "ILmr");
+ CLG_(register_event_group3)(EG_DR, "Dr", "D1mr", "DLmr");
+ CLG_(register_event_group3)(EG_DW, "Dw", "D1mw", "DLmw");
}
else { // clo_simulate_writeback
- CLG_(register_event_group4)(EG_IR, "Ir", "I1mr", "I2mr", "I2dmr");
- CLG_(register_event_group4)(EG_DR, "Dr", "D1mr", "D2mr", "D2dmr");
- CLG_(register_event_group4)(EG_DW, "Dw", "D1mw", "D2mw", "D2dmw");
+ CLG_(register_event_group4)(EG_IR, "Ir", "I1mr", "ILmr", "ILdmr");
+ CLG_(register_event_group4)(EG_DR, "Dr", "D1mr", "DLmr", "DLdmr");
+ CLG_(register_event_group4)(EG_DW, "Dw", "D1mw", "DLmw", "DLdmw");
}
if (CLG_(clo).simulate_branch) {
@@ -1807,12 +1809,12 @@ void CLG_(init_eventsets)()
CLG_(append_event)(CLG_(dumpmap), "I1mr");
CLG_(append_event)(CLG_(dumpmap), "D1mr");
CLG_(append_event)(CLG_(dumpmap), "D1mw");
- CLG_(append_event)(CLG_(dumpmap), "I2mr");
- CLG_(append_event)(CLG_(dumpmap), "D2mr");
- CLG_(append_event)(CLG_(dumpmap), "D2mw");
- CLG_(append_event)(CLG_(dumpmap), "I2dmr");
- CLG_(append_event)(CLG_(dumpmap), "D2dmr");
- CLG_(append_event)(CLG_(dumpmap), "D2dmw");
+ CLG_(append_event)(CLG_(dumpmap), "ILmr");
+ CLG_(append_event)(CLG_(dumpmap), "DLmr");
+ CLG_(append_event)(CLG_(dumpmap), "DLmw");
+ CLG_(append_event)(CLG_(dumpmap), "ILdmr");
+ CLG_(append_event)(CLG_(dumpmap), "DLdmr");
+ CLG_(append_event)(CLG_(dumpmap), "DLdmw");
CLG_(append_event)(CLG_(dumpmap), "Bc");
CLG_(append_event)(CLG_(dumpmap), "Bcm");
CLG_(append_event)(CLG_(dumpmap), "Bi");
diff --git a/callgrind/tests/filter_stderr b/callgrind/tests/filter_stderr
index d2d754456..26bc3c04e 100755
--- a/callgrind/tests/filter_stderr
+++ b/callgrind/tests/filter_stderr
@@ -13,11 +13,11 @@ sed "/^For interactive control,.*$/d" |
# Remove numbers from "Collected" line
sed "s/^\(Collected *:\)[ 0-9]*$/\1/" |
-# Remove numbers from I/D/L2 "refs:" lines
-perl -p -e 's/((I|D|L2) *refs:)[ 0-9,()+rdw]*$/\1/' |
+# Remove numbers from I/D/LL "refs:" lines
+perl -p -e 's/((I|D|LL) *refs:)[ 0-9,()+rdw]*$/\1/' |
-# Remove numbers from I1/D1/L2/L2i/L2d "misses:" and "miss rates:" lines
-perl -p -e 's/((I1|D1|L2|L2i|L2d) *(misses|miss rate):)[ 0-9,()+rdw%\.]*$/\1/' |
+# Remove numbers from I1/D1/LL/LLi/LLd "misses:" and "miss rates:" lines
+perl -p -e 's/((I1|D1|LL|LLi|LLd) *(misses|miss rate):)[ 0-9,()+rdw%\.]*$/\1/' |
# Remove numbers from "Branches:", "Mispredicts:", and "Mispred rate:" lines
perl -p -e 's/((Branches|Mispredicts|Mispred rate):)[ 0-9,()+condi%\.]*$/\1/' |
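For example, these filters reduce a report line such as "LL misses: 5,432 (4,000 rd + 1,432 wr)" to the bare "LL misses:", so the expected outputs below stay independent of machine-specific counts.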
diff --git a/callgrind/tests/notpower2-hwpref.stderr.exp b/callgrind/tests/notpower2-hwpref.stderr.exp
index 0705c1c84..974550a01 100644
--- a/callgrind/tests/notpower2-hwpref.stderr.exp
+++ b/callgrind/tests/notpower2-hwpref.stderr.exp
@@ -1,20 +1,20 @@
-Events : Ir Dr Dw I1mr D1mr D1mw I2mr D2mr D2mw
+Events : Ir Dr Dw I1mr D1mr D1mw ILmr DLmr DLmw
Collected :
I refs:
I1 misses:
-L2i misses:
+LLi misses:
I1 miss rate:
-L2i miss rate:
+LLi miss rate:
D refs:
D1 misses:
-L2d misses:
+LLd misses:
D1 miss rate:
-L2d miss rate:
+LLd miss rate:
-L2 refs:
-L2 misses:
-L2 miss rate:
+LL refs:
+LL misses:
+LL miss rate:
diff --git a/callgrind/tests/notpower2-hwpref.vgtest b/callgrind/tests/notpower2-hwpref.vgtest
index 9da7dced2..1be3b13e3 100644
--- a/callgrind/tests/notpower2-hwpref.vgtest
+++ b/callgrind/tests/notpower2-hwpref.vgtest
@@ -1,3 +1,3 @@
prog: ../../tests/true
-vgopts: --I1=32768,8,64 --D1=24576,6,64 --L2=3145728,12,64 --simulate-hwpref=yes
+vgopts: --I1=32768,8,64 --D1=24576,6,64 --LL=3145728,12,64 --simulate-hwpref=yes
cleanup: rm callgrind.out.*
diff --git a/callgrind/tests/notpower2-use.stderr.exp b/callgrind/tests/notpower2-use.stderr.exp
index ea9acc89b..6d41645f5 100644
--- a/callgrind/tests/notpower2-use.stderr.exp
+++ b/callgrind/tests/notpower2-use.stderr.exp
@@ -1,20 +1,20 @@
-Events : Ir Dr Dw I1mr D1mr D1mw I2mr D2mr D2mw AcCost1 SpLoss1 AcCost2 SpLoss2
+Events : Ir Dr Dw I1mr D1mr D1mw ILmr DLmr DLmw AcCost1 SpLoss1 AcCost2 SpLoss2
Collected :
I refs:
I1 misses:
-L2i misses:
+LLi misses:
I1 miss rate:
-L2i miss rate:
+LLi miss rate:
D refs:
D1 misses:
-L2d misses:
+LLd misses:
D1 miss rate:
-L2d miss rate:
+LLd miss rate:
-L2 refs:
-L2 misses:
-L2 miss rate:
+LL refs:
+LL misses:
+LL miss rate:
diff --git a/callgrind/tests/notpower2-use.vgtest b/callgrind/tests/notpower2-use.vgtest
index b8312a76b..23cec4a56 100644
--- a/callgrind/tests/notpower2-use.vgtest
+++ b/callgrind/tests/notpower2-use.vgtest
@@ -1,3 +1,3 @@
prog: ../../tests/true
-vgopts: --I1=32768,8,64 --D1=24576,6,64 --L2=3145728,12,64 --cacheuse=yes
+vgopts: --I1=32768,8,64 --D1=24576,6,64 --LL=3145728,12,64 --cacheuse=yes
cleanup: rm callgrind.out.*
diff --git a/callgrind/tests/notpower2-wb.stderr.exp b/callgrind/tests/notpower2-wb.stderr.exp
index 90da3e4ce..461ac9601 100644
--- a/callgrind/tests/notpower2-wb.stderr.exp
+++ b/callgrind/tests/notpower2-wb.stderr.exp
@@ -1,20 +1,20 @@
-Events : Ir Dr Dw I1mr D1mr D1mw I2mr D2mr D2mw I2dmr D2dmr D2dmw
+Events : Ir Dr Dw I1mr D1mr D1mw ILmr DLmr DLmw ILdmr DLdmr DLdmw
Collected :
I refs:
I1 misses:
-L2i misses:
+LLi misses:
I1 miss rate:
-L2i miss rate:
+LLi miss rate:
D refs:
D1 misses:
-L2d misses:
+LLd misses:
D1 miss rate:
-L2d miss rate:
+LLd miss rate:
-L2 refs:
-L2 misses:
-L2 miss rate:
+LL refs:
+LL misses:
+LL miss rate:
diff --git a/callgrind/tests/notpower2-wb.vgtest b/callgrind/tests/notpower2-wb.vgtest
index 34a1f6b33..6cd016f0b 100644
--- a/callgrind/tests/notpower2-wb.vgtest
+++ b/callgrind/tests/notpower2-wb.vgtest
@@ -1,3 +1,3 @@
prog: ../../tests/true
-vgopts: --I1=32768,8,64 --D1=24576,6,64 --L2=3145728,12,64 --simulate-wb=yes
+vgopts: --I1=32768,8,64 --D1=24576,6,64 --LL=3145728,12,64 --simulate-wb=yes
cleanup: rm callgrind.out.*
diff --git a/callgrind/tests/notpower2.stderr.exp b/callgrind/tests/notpower2.stderr.exp
index 0705c1c84..974550a01 100644
--- a/callgrind/tests/notpower2.stderr.exp
+++ b/callgrind/tests/notpower2.stderr.exp
@@ -1,20 +1,20 @@
-Events : Ir Dr Dw I1mr D1mr D1mw I2mr D2mr D2mw
+Events : Ir Dr Dw I1mr D1mr D1mw ILmr DLmr DLmw
Collected :
I refs:
I1 misses:
-L2i misses:
+LLi misses:
I1 miss rate:
-L2i miss rate:
+LLi miss rate:
D refs:
D1 misses:
-L2d misses:
+LLd misses:
D1 miss rate:
-L2d miss rate:
+LLd miss rate:
-L2 refs:
-L2 misses:
-L2 miss rate:
+LL refs:
+LL misses:
+LL miss rate:
diff --git a/callgrind/tests/notpower2.vgtest b/callgrind/tests/notpower2.vgtest
index 73823d749..83b994652 100644
--- a/callgrind/tests/notpower2.vgtest
+++ b/callgrind/tests/notpower2.vgtest
@@ -1,3 +1,3 @@
prog: ../../tests/true
-vgopts: --I1=32768,8,64 --D1=24576,6,64 --L2=3145728,12,64
+vgopts: --I1=32768,8,64 --D1=24576,6,64 --LL=3145728,12,64
cleanup: rm callgrind.out.*
diff --git a/callgrind/tests/simwork-both.stderr.exp b/callgrind/tests/simwork-both.stderr.exp
index b742c213b..f8fb40278 100644
--- a/callgrind/tests/simwork-both.stderr.exp
+++ b/callgrind/tests/simwork-both.stderr.exp
@@ -1,23 +1,23 @@
-Events : Ir Dr Dw I1mr D1mr D1mw I2mr D2mr D2mw Bc Bcm Bi Bim
+Events : Ir Dr Dw I1mr D1mr D1mw ILmr DLmr DLmw Bc Bcm Bi Bim
Collected :
I refs:
I1 misses:
-L2i misses:
+LLi misses:
I1 miss rate:
-L2i miss rate:
+LLi miss rate:
D refs:
D1 misses:
-L2d misses:
+LLd misses:
D1 miss rate:
-L2d miss rate:
+LLd miss rate:
-L2 refs:
-L2 misses:
-L2 miss rate:
+LL refs:
+LL misses:
+LL miss rate:
Branches:
Mispredicts:
diff --git a/callgrind/tests/simwork-cache.stderr.exp b/callgrind/tests/simwork-cache.stderr.exp
index 0705c1c84..974550a01 100644
--- a/callgrind/tests/simwork-cache.stderr.exp
+++ b/callgrind/tests/simwork-cache.stderr.exp
@@ -1,20 +1,20 @@
-Events : Ir Dr Dw I1mr D1mr D1mw I2mr D2mr D2mw
+Events : Ir Dr Dw I1mr D1mr D1mw ILmr DLmr DLmw
Collected :
I refs:
I1 misses:
-L2i misses:
+LLi misses:
I1 miss rate:
-L2i miss rate:
+LLi miss rate:
D refs:
D1 misses:
-L2d misses:
+LLd misses:
D1 miss rate:
-L2d miss rate:
+LLd miss rate:
-L2 refs:
-L2 misses:
-L2 miss rate:
+LL refs:
+LL misses:
+LL miss rate:
diff --git a/callgrind/tests/simwork1.stderr.exp b/callgrind/tests/simwork1.stderr.exp
index 0705c1c84..974550a01 100644
--- a/callgrind/tests/simwork1.stderr.exp
+++ b/callgrind/tests/simwork1.stderr.exp
@@ -1,20 +1,20 @@
-Events : Ir Dr Dw I1mr D1mr D1mw I2mr D2mr D2mw
+Events : Ir Dr Dw I1mr D1mr D1mw ILmr DLmr DLmw
Collected :
I refs:
I1 misses:
-L2i misses:
+LLi misses:
I1 miss rate:
-L2i miss rate:
+LLi miss rate:
D refs:
D1 misses:
-L2d misses:
+LLd misses:
D1 miss rate:
-L2d miss rate:
+LLd miss rate:
-L2 refs:
-L2 misses:
-L2 miss rate:
+LL refs:
+LL misses:
+LL miss rate:
diff --git a/callgrind/tests/simwork2.stderr.exp b/callgrind/tests/simwork2.stderr.exp
index 90da3e4ce..461ac9601 100644
--- a/callgrind/tests/simwork2.stderr.exp
+++ b/callgrind/tests/simwork2.stderr.exp
@@ -1,20 +1,20 @@
-Events : Ir Dr Dw I1mr D1mr D1mw I2mr D2mr D2mw I2dmr D2dmr D2dmw
+Events : Ir Dr Dw I1mr D1mr D1mw ILmr DLmr DLmw ILdmr DLdmr DLdmw
Collected :
I refs:
I1 misses:
-L2i misses:
+LLi misses:
I1 miss rate:
-L2i miss rate:
+LLi miss rate:
D refs:
D1 misses:
-L2d misses:
+LLd misses:
D1 miss rate:
-L2d miss rate:
+LLd miss rate:
-L2 refs:
-L2 misses:
-L2 miss rate:
+LL refs:
+LL misses:
+LL miss rate:
diff --git a/callgrind/tests/simwork3.stderr.exp b/callgrind/tests/simwork3.stderr.exp
index ea9acc89b..6d41645f5 100644
--- a/callgrind/tests/simwork3.stderr.exp
+++ b/callgrind/tests/simwork3.stderr.exp
@@ -1,20 +1,20 @@
-Events : Ir Dr Dw I1mr D1mr D1mw I2mr D2mr D2mw AcCost1 SpLoss1 AcCost2 SpLoss2
+Events : Ir Dr Dw I1mr D1mr D1mw ILmr DLmr DLmw AcCost1 SpLoss1 AcCost2 SpLoss2
Collected :
I refs:
I1 misses:
-L2i misses:
+LLi misses:
I1 miss rate:
-L2i miss rate:
+LLi miss rate:
D refs:
D1 misses:
-L2d misses:
+LLd misses:
D1 miss rate:
-L2d miss rate:
+LLd miss rate:
-L2 refs:
-L2 misses:
-L2 miss rate:
+LL refs:
+LL misses:
+LL miss rate:
diff --git a/callgrind/tests/threads-use.stderr.exp b/callgrind/tests/threads-use.stderr.exp
index 4f0bb9948..c8fd75e04 100644
--- a/callgrind/tests/threads-use.stderr.exp
+++ b/callgrind/tests/threads-use.stderr.exp
@@ -1,20 +1,20 @@
-Events : Ir Dr Dw I1mr D1mr D1mw I2mr D2mr D2mw AcCost1 SpLoss1 AcCost2 SpLoss2 Ge sysCount sysTime
+Events : Ir Dr Dw I1mr D1mr D1mw ILmr DLmr DLmw AcCost1 SpLoss1 AcCost2 SpLoss2 Ge sysCount sysTime
Collected :
I refs:
I1 misses:
-L2i misses:
+LLi misses:
I1 miss rate:
-L2i miss rate:
+LLi miss rate:
D refs:
D1 misses:
-L2d misses:
+LLd misses:
D1 miss rate:
-L2d miss rate:
+LLd miss rate:
-L2 refs:
-L2 misses:
-L2 miss rate:
+LL refs:
+LL misses:
+LL miss rate: