author     weidendo <weidendo@a5019735-40e9-0310-863c-91ae7b9d1cf9>   2009-06-15 00:16:36 +0000
committer  weidendo <weidendo@a5019735-40e9-0310-863c-91ae7b9d1cf9>   2009-06-15 00:16:36 +0000
commit     0a1951d64df79f98f885426671fc9d3982647a6b (patch)
tree       f9a44bf8c444e9da77016303bdbf1a3ee1ab61dd
parent     0b23d6eb63a4146dfa17304a2e76ce91f4d5e001 (diff)
download   valgrind-0a1951d64df79f98f885426671fc9d3982647a6b.tar.gz
Callgrind: fix instrumentation for arbitrary events per guest instruction
(should fix bug 169505)
This uses the same event queue scheme as Cachegrind and Lackey, and the
same kind of helpers (1/2/3 Ir events, Ir+Dr, Dr, Ir+Dw, Dw).
Note that in contrast to Cachegrind, Callgrind interprets a modify event
as Dw (otherwise the cache model generating write-back events would not work).
Callgrind uses per-(guest)instruction event sets for cost counters.
A per-instruction event set is incrementally extended as events for the
same guest instruction are flushed. Event sets always start with Ir counters;
depending on the order of the Dr/Dw events that follow, the sets IrDr(Dw)
and IrDw(Dr) exist.
Per-instruction event sets are now consistently named according to event
ordering. The event set "sim" was a subset of "full"; it was never used
and has been removed.
git-svn-id: svn://svn.valgrind.org/valgrind/trunk@10321 a5019735-40e9-0310-863c-91ae7b9d1cf9
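
To make the per-instruction event-set extension rule above concrete, here is a
minimal, self-contained C sketch. The enum values mirror the
UIr/UIrDr/UIrDw/UIrDrDw/UIrDwDr sets introduced by the patch, but the extend()
helper and its types are hypothetical stand-ins for illustration, not
Callgrind API:

    #include <assert.h>
    #include <stdio.h>

    /* Hypothetical stand-ins for Callgrind's per-instruction event sets
     * (CLG_(sets).UIr etc.); names follow the patch, logic is a sketch. */
    typedef enum { ES_UIr, ES_UIrDr, ES_UIrDw, ES_UIrDrDw, ES_UIrDwDr } ESet;

    /* Extend an instruction's event set when a Dr or Dw/Dm event for it
     * is flushed. Sets always start with Ir; the first data event decides
     * between UIrDr and UIrDw, a second of the other kind then yields
     * UIrDrDw or UIrDwDr. */
    static ESet extend(ESet es, int is_write)
    {
        if (is_write) {
            switch (es) {
            case ES_UIr:   return ES_UIrDw;    /* first data event: write */
            case ES_UIrDr: return ES_UIrDrDw;  /* read seen first, add Dw */
            default:       return es;          /* Dw counter already there */
            }
        } else {
            switch (es) {
            case ES_UIr:   return ES_UIrDr;    /* first data event: read  */
            case ES_UIrDw: return ES_UIrDwDr;  /* write seen first, add Dr */
            default:       return es;          /* Dr counter already there */
            }
        }
    }

    int main(void)
    {
        /* A load-op-store insn: Ir, then Dr, then Dw => UIrDrDw. */
        ESet es = ES_UIr;
        es = extend(es, 0);
        es = extend(es, 1);
        assert(es == ES_UIrDrDw);
        printf("event set: UIrDrDw\n");
        return 0;
    }

A load-op-store instruction thus ends up with UIrDrDw, while an instruction
whose first data event is a write gets UIrDw (or UIrDwDr if a read follows).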
-rw-r--r--  callgrind/bbcc.c   |    2
-rw-r--r--  callgrind/debug.c  |    6
-rw-r--r--  callgrind/global.h |   29
-rw-r--r--  callgrind/main.c   | 1090
-rw-r--r--  callgrind/sim.c    |  472
5 files changed, 933 insertions, 666 deletions
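
The main.c rework below queues memory events per superblock and merges a Dw
that immediately follows a Dr of the same instruction, size, and address into
a single Dm (modify) event, so a load-op-store instruction counts as one
memory reference. A minimal sketch of that merge check, with simplified
stand-in types (the real code compares IR address expressions via eqIRAtom()):

    #include <stdbool.h>

    /* Simplified stand-ins for Callgrind's Event queue entries. */
    typedef enum { Ev_Ir, Ev_Dr, Ev_Dw, Ev_Dm } EventTag;

    typedef struct {
        EventTag      tag;
        const void*   inode;  /* InstrInfo of the originating insn      */
        unsigned long ea;     /* effective address (stands in for IRAtom*) */
        int           szB;    /* access size in bytes                   */
    } Event;

    /* On a new write: if the most recent queued event is a read by the
     * same instruction, with the same size and address, turn it into a
     * modify instead of appending a second event (mirrors addEvent_Dw). */
    static bool try_merge_write(Event* q, int used,
                                const void* inode, unsigned long ea, int szB)
    {
        if (used > 0) {
            Event* last = &q[used - 1];
            if (last->tag == Ev_Dr && last->inode == inode
                && last->szB == szB && last->ea == ea) {
                last->tag = Ev_Dm;  /* read + write at same EA => modify */
                return true;
            }
        }
        return false;               /* caller appends a normal Ev_Dw */
    }

    int main(void)
    {
        Event q[4] = { { Ev_Ir, (void*)1, 0, 0 },
                       { Ev_Dr, (void*)1, 0x1000, 4 } };
        int used = 2;
        /* write by the same insn to the same 4-byte location: merged */
        if (!try_merge_write(q, used, (void*)1, 0x1000, 4)) used++;
        return q[1].tag == Ev_Dm ? 0 : 1;
    }

If the check fails, the caller appends an ordinary Ev_Dw entry, flushing the
queue first when it is full.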
diff --git a/callgrind/bbcc.c b/callgrind/bbcc.c index dfe737b2f..7917c2526 100644 --- a/callgrind/bbcc.c +++ b/callgrind/bbcc.c @@ -601,7 +601,7 @@ void CLG_(setup_bbcc)(BB* bb) if (!CLG_(clo).simulate_cache) { /* update Ir cost */ int instr_count = last_bb->jmp[passed].instr+1; - CLG_(current_state).cost[CLG_(sets).off_sim_Ir] += instr_count; + CLG_(current_state).cost[CLG_(sets).off_full_Ir] += instr_count; } } diff --git a/callgrind/debug.c b/callgrind/debug.c index f04bab414..2ac38a297 100644 --- a/callgrind/debug.c +++ b/callgrind/debug.c @@ -217,9 +217,9 @@ void CLG_(print_short_jcc)(jCC* jcc) bb_jmpaddr(jcc->from->bb), bb_addr(jcc->to->bb), jcc->call_counter, - jcc->cost ? jcc->cost[CLG_(sets).off_sim_Ir]:0, - jcc->cost ? jcc->cost[CLG_(sets).off_sim_Dr]:0, - jcc->cost ? jcc->cost[CLG_(sets).off_sim_Dw]:0); + jcc->cost ? jcc->cost[CLG_(sets).off_full_Ir]:0, + jcc->cost ? jcc->cost[CLG_(sets).off_full_Dr]:0, + jcc->cost ? jcc->cost[CLG_(sets).off_full_Dw]:0); else VG_(printf)("[Skipped JCC]"); } diff --git a/callgrind/global.h b/callgrind/global.h index 461218a8c..367f2d7d3 100644 --- a/callgrind/global.h +++ b/callgrind/global.h @@ -270,7 +270,6 @@ typedef struct _InstrInfo InstrInfo; struct _InstrInfo { UInt instr_offset; UInt instr_size; - UInt data_size; UInt cost_offset; EventSet* eventset; }; @@ -657,19 +656,19 @@ struct cachesim_if void (*finish)(void); void (*log_1I0D)(InstrInfo*) VG_REGPARM(1); + void (*log_2I0D)(InstrInfo*, InstrInfo*) VG_REGPARM(2); + void (*log_3I0D)(InstrInfo*, InstrInfo*, InstrInfo*) VG_REGPARM(3); - void (*log_1I1Dr)(InstrInfo*, Addr) VG_REGPARM(2); - void (*log_1I1Dw)(InstrInfo*, Addr) VG_REGPARM(2); - void (*log_1I2D)(InstrInfo*, Addr, Addr) VG_REGPARM(3); + void (*log_1I1Dr)(InstrInfo*, Addr, Word) VG_REGPARM(3); + void (*log_1I1Dw)(InstrInfo*, Addr, Word) VG_REGPARM(3); - void (*log_0I1Dr)(InstrInfo*, Addr) VG_REGPARM(2); - void (*log_0I1Dw)(InstrInfo*, Addr) VG_REGPARM(2); - void (*log_0I2D)(InstrInfo*, Addr, Addr) VG_REGPARM(3); + void (*log_0I1Dr)(InstrInfo*, Addr, Word) VG_REGPARM(3); + void (*log_0I1Dw)(InstrInfo*, Addr, Word) VG_REGPARM(3); // function names of helpers (for debugging generated code) - Char *log_1I0D_name; - Char *log_1I1Dr_name, *log_1I1Dw_name, *log_1I2D_name; - Char *log_0I1Dr_name, *log_0I1Dw_name, *log_0I2D_name; + Char *log_1I0D_name, *log_2I0D_name, *log_3I0D_name; + Char *log_1I1Dr_name, *log_1I1Dw_name; + Char *log_0I1Dr_name, *log_0I1Dw_name; }; @@ -687,15 +686,13 @@ void CLG_(print_debug_usage)(void); /* from sim.c */ struct event_sets { - EventSet *use, *Ir, *Dr, *Dw; - EventSet *D0, *D1r, *D1w, *D2; - EventSet *sim; - EventSet *full; /* sim plus user events */ + EventSet *Use, *Ir, *Dr, *Dw; + EventSet *UIr, *UIrDr, *UIrDrDw, *UIrDw, *UIrDwDr; + EventSet *full; /* offsets into eventsets */ - Int off_sim_Ir, off_sim_Dr, off_sim_Dw; Int off_full_Ir, off_full_Dr, off_full_Dw; - Int off_full_user, off_full_alloc, off_full_systime; + Int off_full_alloc, off_full_systime; }; extern struct event_sets CLG_(sets); diff --git a/callgrind/main.c b/callgrind/main.c index 68d13814d..f2d125037 100644 --- a/callgrind/main.c +++ b/callgrind/main.c @@ -94,224 +94,490 @@ static void CLG_(init_statistics)(Statistics* s) } - - /*------------------------------------------------------------*/ -/*--- Cache simulation instrumentation phase ---*/ +/*--- Instrumentation structures and event queue handling ---*/ /*------------------------------------------------------------*/ +/* Maintain an ordered list of memory events which are 
outstanding, in + the sense that no IR has yet been generated to do the relevant + helper calls. The BB is scanned top to bottom and memory events + are added to the end of the list, merging with the most recent + notified event where possible (Dw immediately following Dr and + having the same size and EA can be merged). + + This merging is done so that for architectures which have + load-op-store instructions (x86, amd64), the insn is treated as if + it makes just one memory reference (a modify), rather than two (a + read followed by a write at the same address). + + At various points the list will need to be flushed, that is, IR + generated from it. That must happen before any possible exit from + the block (the end, or an IRStmt_Exit). Flushing also takes place + when there is no space to add a new event. + + If we require the simulation statistics to be up to date with + respect to possible memory exceptions, then the list would have to + be flushed before each memory reference. That would however lose + performance by inhibiting event-merging during flushing. + + Flushing the list consists of walking it start to end and emitting + instrumentation IR for each event, in the order in which they + appear. It may be possible to emit a single call for two adjacent + events in order to reduce the number of helper function calls made. + For example, it could well be profitable to handle two adjacent Ir + events with a single helper call. */ + +typedef + IRExpr + IRAtom; + +typedef + enum { + Ev_Ir, // Instruction read + Ev_Dr, // Data read + Ev_Dw, // Data write + Ev_Dm, // Data modify (read then write) + } + EventTag; + +typedef + struct { + EventTag tag; + InstrInfo* inode; + union { + struct { + } Ir; + struct { + IRAtom* ea; + Int szB; + } Dr; + struct { + IRAtom* ea; + Int szB; + } Dw; + struct { + IRAtom* ea; + Int szB; + } Dm; + } Ev; + } + Event; + +static void init_Event ( Event* ev ) { + VG_(memset)(ev, 0, sizeof(Event)); +} + +static IRAtom* get_Event_dea ( Event* ev ) { + switch (ev->tag) { + case Ev_Dr: return ev->Ev.Dr.ea; + case Ev_Dw: return ev->Ev.Dw.ea; + case Ev_Dm: return ev->Ev.Dm.ea; + default: tl_assert(0); + } +} + +static Int get_Event_dszB ( Event* ev ) { + switch (ev->tag) { + case Ev_Dr: return ev->Ev.Dr.szB; + case Ev_Dw: return ev->Ev.Dw.szB; + case Ev_Dm: return ev->Ev.Dm.szB; + default: tl_assert(0); + } +} + + +/* Up to this many unnotified events are allowed. Number is + arbitrary. Larger numbers allow more event merging to occur, but + potentially induce more spilling due to extending live ranges of + address temporaries. */ +#define N_EVENTS 16 + + +/* A struct which holds all the running state during instrumentation. + Mostly to avoid passing loads of parameters everywhere. */ +typedef struct { + /* The current outstanding-memory-event list. */ + Event events[N_EVENTS]; + Int events_used; + + /* The array of InstrInfo's is part of BB struct. */ + BB* bb; -static Bool loadStoreAddrsMatch(IRExpr* loadAddrExpr, IRExpr* storeAddrExpr) + /* BB seen before (ie. re-instrumentation) */ + Bool seen_before; + + /* Number InstrInfo bins 'used' so far. */ + UInt ii_index; + + // current offset of guest instructions from BB start + UInt instr_offset; + + /* The output SB being constructed. */ + IRSB* sbOut; +} ClgState; + + +static void showEvent ( Event* ev ) { - // I'm assuming that for 'modify' instructions, that Vex always makes - // the loadAddrExpr and storeAddrExpr be of the same type, ie. both Tmp - // expressions, or both Const expressions. 
- CLG_ASSERT(isIRAtom(loadAddrExpr)); - CLG_ASSERT(isIRAtom(storeAddrExpr)); - return eqIRAtom(loadAddrExpr, storeAddrExpr); + switch (ev->tag) { + case Ev_Ir: + VG_(printf)("Ir (InstrInfo %p) at +%d\n", + ev->inode, ev->inode->instr_offset); + break; + case Ev_Dr: + VG_(printf)("Dr (InstrInfo %p) at +%d %d EA=", + ev->inode, ev->inode->instr_offset, ev->Ev.Dr.szB); + ppIRExpr(ev->Ev.Dr.ea); + VG_(printf)("\n"); + break; + case Ev_Dw: + VG_(printf)("Dw (InstrInfo %p) at +%d %d EA=", + ev->inode, ev->inode->instr_offset, ev->Ev.Dw.szB); + ppIRExpr(ev->Ev.Dw.ea); + VG_(printf)("\n"); + break; + case Ev_Dm: + VG_(printf)("Dm (InstrInfo %p) at +%d %d EA=", + ev->inode, ev->inode->instr_offset, ev->Ev.Dm.szB); + ppIRExpr(ev->Ev.Dm.ea); + VG_(printf)("\n"); + break; + default: + tl_assert(0); + break; + } } -static -EventSet* insert_simcall(IRSB* bbOut, InstrInfo* ii, UInt dataSize, - Bool instrIssued, - IRExpr* loadAddrExpr, IRExpr* storeAddrExpr) +/* Generate code for all outstanding memory events, and mark the queue + empty. Code is generated into cgs->sbOut, and this activity + 'consumes' slots in cgs->bb. */ + +static void flushEvents ( ClgState* clgs ) { - HChar* helperName; - void* helperAddr; - Int argc; - EventSet* es; - IRExpr *arg1, *arg2 = 0, *arg3 = 0, **argv; - IRDirty* di; - - /* Check type of original instruction regarding memory access, - * and collect info to be able to generate fitting helper call - */ - if (!loadAddrExpr && !storeAddrExpr) { - // no load/store - CLG_ASSERT(0 == dataSize); - if (instrIssued) { - helperName = 0; - helperAddr = 0; - } - else { - helperName = CLG_(cachesim).log_1I0D_name; - helperAddr = CLG_(cachesim).log_1I0D; - } - argc = 1; - es = CLG_(sets).D0; - - } else if (loadAddrExpr && !storeAddrExpr) { - // load - CLG_ASSERT( isIRAtom(loadAddrExpr) ); - if (instrIssued) { - helperName = CLG_(cachesim).log_0I1Dr_name; - helperAddr = CLG_(cachesim).log_0I1Dr; - } - else { - helperName = CLG_(cachesim).log_1I1Dr_name; - helperAddr = CLG_(cachesim).log_1I1Dr; - } - argc = 2; - arg2 = loadAddrExpr; - es = CLG_(sets).D1r; - - } else if (!loadAddrExpr && storeAddrExpr) { - // store - CLG_ASSERT( isIRAtom(storeAddrExpr) ); - if (instrIssued) { - helperName = CLG_(cachesim).log_0I1Dw_name; - helperAddr = CLG_(cachesim).log_0I1Dw; - } - else { - helperName = CLG_(cachesim).log_1I1Dw_name; - helperAddr = CLG_(cachesim).log_1I1Dw; - } - argc = 2; - arg2 = storeAddrExpr; - es = CLG_(sets).D1w; - - } else { - CLG_ASSERT( loadAddrExpr && storeAddrExpr ); - CLG_ASSERT( isIRAtom(loadAddrExpr) ); - CLG_ASSERT( isIRAtom(storeAddrExpr) ); - - if ( loadStoreAddrsMatch(loadAddrExpr, storeAddrExpr) ) { - /* modify: suppose write access, as this is - * more resource consuming (as in callgrind for VG2) - * Cachegrind does a read here (!) - * DISCUSS: Best way depends on simulation model? 
- */ - if (instrIssued) { - helperName = CLG_(cachesim).log_0I1Dw_name; - helperAddr = CLG_(cachesim).log_0I1Dw; + Int i, regparms, inew; + Char* helperName; + void* helperAddr; + IRExpr** argv; + IRExpr* i_node_expr; + IRDirty* di; + Event* ev; + Event* ev2; + Event* ev3; + + if (!clgs->seen_before) { + // extend event sets as needed + // available sets: D0 Dr + for(i=0; i<clgs->events_used; i++) { + ev = &clgs->events[i]; + switch(ev->tag) { + case Ev_Ir: + // Ir event always is first for a guest instruction + CLG_ASSERT(ev->inode->eventset == 0); + ev->inode->eventset = CLG_(sets).UIr; + break; + case Ev_Dr: + // extend event set by Dr counter + if ((ev->inode->eventset == CLG_(sets).UIrDr) || + (ev->inode->eventset == CLG_(sets).UIrDrDw) || + (ev->inode->eventset == CLG_(sets).UIrDwDr)) + break; + if (ev->inode->eventset == CLG_(sets).UIrDw) { + ev->inode->eventset = CLG_(sets).UIrDwDr; + break; + } + CLG_ASSERT(ev->inode->eventset == CLG_(sets).UIr); + ev->inode->eventset = CLG_(sets).UIrDr; + break; + case Ev_Dw: + case Ev_Dm: + // extend event set by Dw counter + if ((ev->inode->eventset == CLG_(sets).UIrDw) || + (ev->inode->eventset == CLG_(sets).UIrDwDr) || + (ev->inode->eventset == CLG_(sets).UIrDrDw)) + break; + if (ev->inode->eventset == CLG_(sets).UIrDr) { + ev->inode->eventset = CLG_(sets).UIrDrDw; + break; + } + CLG_ASSERT(ev->inode->eventset == CLG_(sets).UIr); + ev->inode->eventset = CLG_(sets).UIrDw; + break; + default: + tl_assert(0); + } + } + } + + for(i = 0; i < clgs->events_used; i = inew) { + + helperName = NULL; + helperAddr = NULL; + argv = NULL; + regparms = 0; + + /* generate IR to notify event i and possibly the ones + immediately following it. */ + tl_assert(i >= 0 && i < clgs->events_used); + + ev = &clgs->events[i]; + ev2 = ( i < clgs->events_used-1 ? &clgs->events[i+1] : NULL ); + ev3 = ( i < clgs->events_used-2 ? &clgs->events[i+2] : NULL ); + + CLG_DEBUGIF(5) { + VG_(printf)(" flush "); + showEvent( ev ); + } + + i_node_expr = mkIRExpr_HWord( (HWord)ev->inode ); + + /* Decide on helper fn to call and args to pass it, and advance + i appropriately. + Dm events have same effect as Dw events */ + switch (ev->tag) { + case Ev_Ir: + /* Merge an Ir with a following Dr. */ + if (ev2 && ev2->tag == Ev_Dr) { + /* Why is this true? It's because we're merging an Ir + with a following Dr. The Ir derives from the + instruction's IMark and the Dr from data + references which follow it. In short it holds + because each insn starts with an IMark, hence an + Ev_Ir, and so these Dr must pertain to the + immediately preceding Ir. Same applies to analogous + assertions in the subsequent cases. */ + tl_assert(ev2->inode == ev->inode); + helperName = CLG_(cachesim).log_1I1Dr_name; + helperAddr = CLG_(cachesim).log_1I1Dr; + argv = mkIRExprVec_3( i_node_expr, + get_Event_dea(ev2), + mkIRExpr_HWord( get_Event_dszB(ev2) ) ); + regparms = 3; + inew = i+2; } - else { - helperName = CLG_(cachesim).log_1I1Dw_name; - helperAddr = CLG_(cachesim).log_1I1Dw; + /* Merge an Ir with a following Dw/Dm. 
*/ + else + if (ev2 && (ev2->tag == Ev_Dw || ev2->tag == Ev_Dm)) { + tl_assert(ev2->inode == ev->inode); + helperName = CLG_(cachesim).log_1I1Dw_name; + helperAddr = CLG_(cachesim).log_1I1Dw; + argv = mkIRExprVec_3( i_node_expr, + get_Event_dea(ev2), + mkIRExpr_HWord( get_Event_dszB(ev2) ) ); + regparms = 3; + inew = i+2; } - argc = 2; - arg2 = storeAddrExpr; - es = CLG_(sets).D1w; - - } else { - // load/store - if (instrIssued) { - helperName = CLG_(cachesim).log_0I2D_name; - helperAddr = CLG_(cachesim).log_0I2D; + /* Merge an Ir with two following Irs. */ + else + if (ev2 && ev3 && ev2->tag == Ev_Ir && ev3->tag == Ev_Ir) { + helperName = CLG_(cachesim).log_3I0D_name; + helperAddr = CLG_(cachesim).log_3I0D; + argv = mkIRExprVec_3( i_node_expr, + mkIRExpr_HWord( (HWord)ev2->inode ), + mkIRExpr_HWord( (HWord)ev3->inode ) ); + regparms = 3; + inew = i+3; } + /* Merge an Ir with one following Ir. */ + else + if (ev2 && ev2->tag == Ev_Ir) { + helperName = CLG_(cachesim).log_2I0D_name; + helperAddr = CLG_(cachesim).log_2I0D; + argv = mkIRExprVec_2( i_node_expr, + mkIRExpr_HWord( (HWord)ev2->inode ) ); + regparms = 2; + inew = i+2; + } + /* No merging possible; emit as-is. */ else { - helperName = CLG_(cachesim).log_1I2D_name; - helperAddr = CLG_(cachesim).log_1I2D; + helperName = CLG_(cachesim).log_1I0D_name; + helperAddr = CLG_(cachesim).log_1I0D; + argv = mkIRExprVec_1( i_node_expr ); + regparms = 1; + inew = i+1; } - argc = 3; - arg2 = loadAddrExpr; - arg3 = storeAddrExpr; - es = CLG_(sets).D2; - } - } + break; + case Ev_Dr: + /* Data read or modify */ + helperName = CLG_(cachesim).log_0I1Dr_name; + helperAddr = CLG_(cachesim).log_0I1Dr; + argv = mkIRExprVec_3( i_node_expr, + get_Event_dea(ev), + mkIRExpr_HWord( get_Event_dszB(ev) ) ); + regparms = 3; + inew = i+1; + break; + case Ev_Dw: + case Ev_Dm: + /* Data write */ + helperName = CLG_(cachesim).log_0I1Dw_name; + helperAddr = CLG_(cachesim).log_0I1Dw; + argv = mkIRExprVec_3( i_node_expr, + get_Event_dea(ev), + mkIRExpr_HWord( get_Event_dszB(ev) ) ); + regparms = 3; + inew = i+1; + break; + default: + tl_assert(0); + } - /* helper could be unset depending on the simulator used */ - if (helperAddr == 0) return 0; - - /* Setup 1st arg: InstrInfo */ - arg1 = mkIRExpr_HWord( (HWord)ii ); - - // Add call to the instrumentation function - if (argc == 1) - argv = mkIRExprVec_1(arg1); - else if (argc == 2) - argv = mkIRExprVec_2(arg1, arg2); - else if (argc == 3) - argv = mkIRExprVec_3(arg1, arg2, arg3); - else - VG_(tool_panic)("argc... not 1 or 2 or 3?"); - - di = unsafeIRDirty_0_N( argc, helperName, - VG_(fnptr_to_fnentry)( helperAddr ), argv); - addStmtToIRSB( bbOut, IRStmt_Dirty(di) ); + CLG_DEBUGIF(5) { + if (inew > i+1) { + VG_(printf)(" merge "); + showEvent( ev2 ); + } + if (inew > i+2) { + VG_(printf)(" merge "); + showEvent( ev3 ); + } + if (helperAddr) + VG_(printf)(" call %s (%p)\n", + helperName, helperAddr); + } + + /* helper could be unset depending on the simulator used */ + if (helperAddr == 0) continue; + + /* Add the helper. 
*/ + tl_assert(helperName); + tl_assert(helperAddr); + tl_assert(argv); + di = unsafeIRDirty_0_N( regparms, + helperName, VG_(fnptr_to_fnentry)( helperAddr ), + argv ); + addStmtToIRSB( clgs->sbOut, IRStmt_Dirty(di) ); + } - return es; + clgs->events_used = 0; } +static void addEvent_Ir ( ClgState* clgs, InstrInfo* inode ) +{ + Event* evt; + tl_assert(clgs->seen_before || (inode->eventset == 0)); + if (!CLG_(clo).simulate_cache) return; + + if (clgs->events_used == N_EVENTS) + flushEvents(clgs); + tl_assert(clgs->events_used >= 0 && clgs->events_used < N_EVENTS); + evt = &clgs->events[clgs->events_used]; + init_Event(evt); + evt->tag = Ev_Ir; + evt->inode = inode; + clgs->events_used++; +} + +static +void addEvent_Dr ( ClgState* clgs, InstrInfo* inode, Int datasize, IRAtom* ea ) +{ + Event* evt; + tl_assert(isIRAtom(ea)); + tl_assert(datasize >= 1 && datasize <= MIN_LINE_SIZE); + if (!CLG_(clo).simulate_cache) return; + + if (clgs->events_used == N_EVENTS) + flushEvents(clgs); + tl_assert(clgs->events_used >= 0 && clgs->events_used < N_EVENTS); + evt = &clgs->events[clgs->events_used]; + init_Event(evt); + evt->tag = Ev_Dr; + evt->inode = inode; + evt->Ev.Dr.szB = datasize; + evt->Ev.Dr.ea = ea; + clgs->events_used++; +} -/* Instrumentation before a conditional jump or at the end - * of each original instruction. - * Fills the InstrInfo struct if not seen before - */ static -void endOfInstr(IRSB* bbOut, InstrInfo* ii, Bool bb_seen_before, - UInt instr_offset, UInt instrLen, UInt dataSize, - UInt* cost_offset, Bool instrIssued, - IRExpr* loadAddrExpr, IRExpr* storeAddrExpr) +void addEvent_Dw ( ClgState* clgs, InstrInfo* inode, Int datasize, IRAtom* ea ) { - IRType wordTy; - EventSet* es; - - // Stay sane ... - CLG_ASSERT(sizeof(HWord) == sizeof(void*)); - if (sizeof(HWord) == 4) { - wordTy = Ity_I32; - } else - if (sizeof(HWord) == 8) { - wordTy = Ity_I64; - } else { - VG_(tool_panic)("endOfInstr: strange word size"); + Event* lastEvt; + Event* evt; + tl_assert(isIRAtom(ea)); + tl_assert(datasize >= 1 && datasize <= MIN_LINE_SIZE); + if (!CLG_(clo).simulate_cache) return; + + /* Is it possible to merge this write with the preceding read? */ + lastEvt = &clgs->events[clgs->events_used-1]; + if (clgs->events_used > 0 + && lastEvt->tag == Ev_Dr + && lastEvt->Ev.Dr.szB == datasize + && lastEvt->inode == inode + && eqIRAtom(lastEvt->Ev.Dr.ea, ea)) + { + lastEvt->tag = Ev_Dm; + return; } - if (loadAddrExpr) - CLG_ASSERT(wordTy == typeOfIRExpr(bbOut->tyenv, loadAddrExpr)); - if (storeAddrExpr) - CLG_ASSERT(wordTy == typeOfIRExpr(bbOut->tyenv, storeAddrExpr)); - - // Large (eg. 28B, 108B, 512B on x86) data-sized instructions will be - // done inaccurately, but they're very rare and this avoids errors from - // hitting more than two cache lines in the simulation. - if (dataSize > MIN_LINE_SIZE) dataSize = MIN_LINE_SIZE; - - /* returns 0 if simulator needs no instrumentation */ - es = insert_simcall(bbOut, ii, dataSize, instrIssued, - loadAddrExpr, storeAddrExpr); - - CLG_DEBUG(5, " Instr +%2d (Size %d, DSize %d): ESet %s (Size %d)\n", - instr_offset, instrLen, dataSize, - es ? es->name : (Char*)"(no instrumentation)", - es ? es->size : 0); - - if (bb_seen_before) { - CLG_DEBUG(5, " before: Instr +%2d (Size %d, DSize %d)\n", - ii->instr_offset, ii->instr_size, ii->data_size); - - CLG_ASSERT(ii->instr_offset == instr_offset); - CLG_ASSERT(ii->instr_size == instrLen); - CLG_ASSERT(ii->cost_offset == *cost_offset); - CLG_ASSERT(ii->eventset == es); - - /* Only check size if data size >0. 
- * This is needed: e.g. for rep or cmov x86 instructions, the same InstrInfo - * is used both for 2 simulator calls: for the pure instruction fetch and - * separately for an memory access (which may not happen depending on flags). - * If checked always, this triggers an assertion failure on retranslation. - */ - if (dataSize>0) CLG_ASSERT(ii->data_size == dataSize); + /* No. Add as normal. */ + if (clgs->events_used == N_EVENTS) + flushEvents(clgs); + tl_assert(clgs->events_used >= 0 && clgs->events_used < N_EVENTS); + evt = &clgs->events[clgs->events_used]; + init_Event(evt); + evt->tag = Ev_Dw; + evt->inode = inode; + evt->Ev.Dw.szB = datasize; + evt->Ev.Dw.ea = ea; + clgs->events_used++; +} +/* Initialise or check (if already seen before) an InstrInfo for next insn. + We only can set instr_offset/instr_size here. The required event set and + resulting cost offset depend on events (Ir/Dr/Dw/Dm) in guest + instructions. The event set is extended as required on flush of the event + queue (when Dm events were determined), cost offsets are determined at + end of BB instrumentation. */ +static +InstrInfo* next_InstrInfo ( ClgState* clgs, UInt instr_size ) +{ + InstrInfo* ii; + tl_assert(clgs->ii_index >= 0); + tl_assert(clgs->ii_index < clgs->bb->instr_count); + ii = &clgs->bb->instr[ clgs->ii_index ]; + + if (clgs->seen_before) { + CLG_ASSERT(ii->instr_offset == clgs->instr_offset); + CLG_ASSERT(ii->instr_size == instr_size); } else { - ii->instr_offset = instr_offset; - ii->instr_size = instrLen; - ii->cost_offset = *cost_offset; - ii->eventset = es; - - /* data size only relevant if >0 */ - if (dataSize > 0) ii->data_size = dataSize; + ii->instr_offset = clgs->instr_offset; + ii->instr_size = instr_size; + ii->cost_offset = 0; + ii->eventset = 0; + } + clgs->ii_index++; + clgs->instr_offset += instr_size; + CLG_(stat).distinct_instrs++; - CLG_(stat).distinct_instrs++; - } + return ii; +} - *cost_offset += es ? es->size : 0; +// return total number of cost values needed for this BB +static +UInt update_cost_offsets( ClgState* clgs ) +{ + Int i; + InstrInfo* ii; + UInt cost_offset = 0; + + CLG_ASSERT(clgs->bb->instr_count == clgs->ii_index); + for(i=0; i<clgs->ii_index; i++) { + ii = &clgs->bb->instr[i]; + if (clgs->seen_before) { + CLG_ASSERT(ii->cost_offset == cost_offset); + } else + ii->cost_offset = cost_offset; + cost_offset += ii->eventset ? 
ii->eventset->size : 0; + } + return cost_offset; } +/*------------------------------------------------------------*/ +/*--- Instrumentation ---*/ +/*------------------------------------------------------------*/ + #if defined(VG_BIGENDIAN) # define CLGEndness Iend_BE #elif defined(VG_LITTLEENDIAN) @@ -344,7 +610,7 @@ Addr IRConst2Addr(IRConst* con) * * Called from CLG_(get_bb) */ -void CLG_(collectBlockInfo)(IRSB* bbIn, +void CLG_(collectBlockInfo)(IRSB* sbIn, /*INOUT*/ UInt* instrs, /*INOUT*/ UInt* cjmps, /*INOUT*/ Bool* cjmp_inverted) @@ -360,10 +626,10 @@ void CLG_(collectBlockInfo)(IRSB* bbIn, // nothing to do with client code Bool inPreamble = True; - if (!bbIn) return; + if (!sbIn) return; - for (i = 0; i < bbIn->stmts_used; i++) { - st = bbIn->stmts[i]; + for (i = 0; i < sbIn->stmts_used; i++) { + st = sbIn->stmts[i]; if (Ist_IMark == st->tag) { inPreamble = False; @@ -377,7 +643,7 @@ void CLG_(collectBlockInfo)(IRSB* bbIn, if (Ist_Exit == st->tag) { jumpDst = IRConst2Addr(st->Ist.Exit.dst); toNextInstr = (jumpDst == instrAddr + instrLen); - + (*cjmps)++; } } @@ -389,98 +655,6 @@ void CLG_(collectBlockInfo)(IRSB* bbIn, } static -void collectStatementInfo(IRTypeEnv* tyenv, IRStmt* st, - Addr* instrAddr, UInt* instrLen, - IRExpr** loadAddrExpr, IRExpr** storeAddrExpr, - UInt* dataSize) -{ - CLG_ASSERT(isFlatIRStmt(st)); - - switch (st->tag) { - case Ist_NoOp: - break; - - case Ist_AbiHint: - /* ABI hints aren't interesting. Ignore. */ - break; - - case Ist_IMark: - /* st->Ist.IMark.addr is a 64-bit int. ULong_to_Ptr casts this - to the host's native pointer type; if that is 32 bits then it - discards the upper 32 bits. If we are cachegrinding on a - 32-bit host then we are also ensured that the guest word size - is 32 bits, due to the assertion in cg_instrument that the - host and guest word sizes must be the same. Hence - st->Ist.IMark.addr will have been derived from a 32-bit guest - code address and truncation of it is safe. I believe this - assignment should be correct for both 32- and 64-bit - machines. */ - *instrAddr = (Addr)ULong_to_Ptr(st->Ist.IMark.addr); - *instrLen = st->Ist.IMark.len; - break; - - case Ist_WrTmp: { - IRExpr* data = st->Ist.WrTmp.data; - if (data->tag == Iex_Load) { - IRExpr* aexpr = data->Iex.Load.addr; - CLG_ASSERT( isIRAtom(aexpr) ); - // Note also, endianness info is ignored. I guess that's not - // interesting. - // XXX: repe cmpsb does two loads... the first one is ignored here! - //tl_assert( NULL == *loadAddrExpr ); // XXX: ??? - *loadAddrExpr = aexpr; - *dataSize = sizeofIRType(data->Iex.Load.ty); - } - break; - } - - case Ist_Store: { - IRExpr* data = st->Ist.Store.data; - IRExpr* aexpr = st->Ist.Store.addr; - CLG_ASSERT( isIRAtom(aexpr) ); - if ( NULL == *storeAddrExpr ) { - /* this is a kludge: ignore all except the first store from - an instruction. */ - *storeAddrExpr = aexpr; - *dataSize = sizeofIRType(typeOfIRExpr(tyenv, data)); - } - break; - } - - case Ist_Dirty: { - IRDirty* d = st->Ist.Dirty.details; - if (d->mFx != Ifx_None) { - /* This dirty helper accesses memory. Collect the - details. 
*/ - CLG_ASSERT(d->mAddr != NULL); - CLG_ASSERT(d->mSize != 0); - *dataSize = d->mSize; - if (d->mFx == Ifx_Read || d->mFx == Ifx_Modify) - *loadAddrExpr = d->mAddr; - if (d->mFx == Ifx_Write || d->mFx == Ifx_Modify) - *storeAddrExpr = d->mAddr; - } else { - CLG_ASSERT(d->mAddr == NULL); - CLG_ASSERT(d->mSize == 0); - } - break; - } - - case Ist_Put: - case Ist_PutI: - case Ist_MBE: - case Ist_Exit: - break; - - default: - VG_(printf)("\n"); - ppIRStmt(st); - VG_(printf)("\n"); - VG_(tool_panic)("Callgrind: unhandled IRStmt"); - } -} - -static void addConstMemStoreStmt( IRSB* bbOut, UWord addr, UInt val, IRType hWordTy) { addStmtToIRSB( bbOut, @@ -491,29 +665,56 @@ void addConstMemStoreStmt( IRSB* bbOut, UWord addr, UInt val, IRType hWordTy) IRExpr_Const(IRConst_U32(val)) )); } + +/* add helper call to setup_bbcc, with pointer to BB struct as argument + * + * precondition for setup_bbcc: + * - jmps_passed has number of cond.jumps passed in last executed BB + * - current_bbcc has a pointer to the BBCC of the last executed BB + * Thus, if bbcc_jmpkind is != -1 (JmpNone), + * current_bbcc->bb->jmp_addr + * gives the address of the jump source. + * + * the setup does 2 things: + * - trace call: + * * Unwind own call stack, i.e sync our ESP with real ESP + * This is for ESP manipulation (longjmps, C++ exec handling) and RET + * * For CALLs or JMPs crossing objects, record call arg + + * push are on own call stack + * + * - prepare for cache log functions: + * set current_bbcc to BBCC that gets the costs for this BB execution + * attached + */ +static +void addBBSetupCall(ClgState* clgs) +{ + IRDirty* di; + IRExpr *arg1, **argv; + + arg1 = mkIRExpr_HWord( (HWord)clgs->bb ); + argv = mkIRExprVec_1(arg1); + di = unsafeIRDirty_0_N( 1, "setup_bbcc", + VG_(fnptr_to_fnentry)( & CLG_(setup_bbcc) ), + argv); + addStmtToIRSB( clgs->sbOut, IRStmt_Dirty(di) ); +} + + static IRSB* CLG_(instrument)( VgCallbackClosure* closure, - IRSB* bbIn, + IRSB* sbIn, VexGuestLayout* layout, VexGuestExtents* vge, IRType gWordTy, IRType hWordTy ) { - Int i; - IRSB* bbOut; - IRStmt* st, *stnext; - Addr instrAddr, origAddr; - UInt instrLen = 0, dataSize; - UInt instrCount, costOffset; - IRExpr *loadAddrExpr, *storeAddrExpr; - - BB* bb; + Int i, isize; + IRStmt* st; + Addr origAddr; + InstrInfo* curr_inode = NULL; + ClgState clgs; + UInt cJumps = 0; - IRDirty* di; - IRExpr *arg1, **argv; - - Bool bb_seen_before = False; - UInt cJumps = 0, cJumpsCorrected; - Bool beforeIBoundary, instrIssued; if (gWordTy != hWordTy) { /* We don't currently support this case. */ @@ -524,173 +725,206 @@ IRSB* CLG_(instrument)( VgCallbackClosure* closure, if (! 
CLG_(instrument_state)) { CLG_DEBUG(5, "instrument(BB %#lx) [Instrumentation OFF]\n", (Addr)closure->readdr); - return bbIn; + return sbIn; } CLG_DEBUG(3, "+ instrument(BB %#lx)\n", (Addr)closure->readdr); /* Set up SB for instrumented IR */ - bbOut = deepCopyIRSBExceptStmts(bbIn); + clgs.sbOut = deepCopyIRSBExceptStmts(sbIn); // Copy verbatim any IR preamble preceding the first IMark i = 0; - while (i < bbIn->stmts_used && bbIn->stmts[i]->tag != Ist_IMark) { - addStmtToIRSB( bbOut, bbIn->stmts[i] ); + while (i < sbIn->stmts_used && sbIn->stmts[i]->tag != Ist_IMark) { + addStmtToIRSB( clgs.sbOut, sbIn->stmts[i] ); i++; } // Get the first statement, and origAddr from it - CLG_ASSERT(bbIn->stmts_used > 0); - st = bbIn->stmts[i]; + CLG_ASSERT(sbIn->stmts_used >0); + CLG_ASSERT(i < sbIn->stmts_used); + st = sbIn->stmts[i]; CLG_ASSERT(Ist_IMark == st->tag); - instrAddr = origAddr = (Addr)st->Ist.IMark.addr; + + origAddr = (Addr)st->Ist.IMark.addr; CLG_ASSERT(origAddr == st->Ist.IMark.addr); // XXX: check no overflow - /* Get BB (creating if necessary). + /* Get BB struct (creating if necessary). * JS: The hash table is keyed with orig_addr_noredir -- important! * JW: Why? If it is because of different chasing of the redirection, * this is not needed, as chasing is switched off in callgrind */ - bb = CLG_(get_bb)(origAddr, bbIn, &bb_seen_before); - //bb = CLG_(get_bb)(orig_addr_noredir, bbIn, &bb_seen_before); - - /* - * Precondition: - * - jmps_passed has number of cond.jumps passed in last executed BB - * - current_bbcc has a pointer to the BBCC of the last executed BB - * Thus, if bbcc_jmpkind is != -1 (JmpNone), - * current_bbcc->bb->jmp_addr - * gives the address of the jump source. - * - * The BBCC setup does 2 things: - * - trace call: - * * Unwind own call stack, i.e sync our ESP with real ESP - * This is for ESP manipulation (longjmps, C++ exec handling) and RET - * * For CALLs or JMPs crossing objects, record call arg + - * push are on own call stack - * - * - prepare for cache log functions: - * Set current_bbcc to BBCC that gets the costs for this BB execution - * attached - */ + clgs.bb = CLG_(get_bb)(origAddr, sbIn, &(clgs.seen_before)); + + addBBSetupCall(&clgs); + + // Set up running state + clgs.events_used = 0; + clgs.ii_index = 0; + clgs.instr_offset = 0; + + for (/*use current i*/; i < sbIn->stmts_used; i++) { + + st = sbIn->stmts[i]; + CLG_ASSERT(isFlatIRStmt(st)); + + switch (st->tag) { + case Ist_NoOp: + case Ist_AbiHint: + case Ist_Put: + case Ist_PutI: + case Ist_MBE: + break; + + case Ist_IMark: { + CLG_ASSERT(clgs.instr_offset == (Addr)st->Ist.IMark.addr - origAddr); + isize = st->Ist.IMark.len; + // If Vex fails to decode an instruction, the size will be zero. + // Pretend otherwise. + if (isize == 0) isize = VG_MIN_INSTR_SZB; + + // Sanity-check size. + tl_assert( (VG_MIN_INSTR_SZB <= isize && isize <= VG_MAX_INSTR_SZB) + || VG_CLREQ_SZB == isize ); + + // Init the inode, record it as the current one. + // Subsequent Dr/Dw/Dm events from the same instruction will + // also use it. + curr_inode = next_InstrInfo (&clgs, isize); + + addEvent_Ir( &clgs, curr_inode ); + break; + } + + case Ist_WrTmp: { + IRExpr* data = st->Ist.WrTmp.data; + if (data->tag == Iex_Load) { + IRExpr* aexpr = data->Iex.Load.addr; + // Note also, endianness info is ignored. I guess + // that's not interesting. 
+ addEvent_Dr( &clgs, curr_inode, + sizeofIRType(data->Iex.Load.ty), aexpr ); + } + break; + } + + case Ist_Store: { + IRExpr* data = st->Ist.Store.data; + IRExpr* aexpr = st->Ist.Store.addr; + addEvent_Dw( &clgs, curr_inode, + sizeofIRType(typeOfIRExpr(sbIn->tyenv, data)), aexpr ); + break; + } + + case Ist_Dirty: { + Int dataSize; + IRDirty* d = st->Ist.Dirty.details; + if (d->mFx != Ifx_None) { + /* This dirty helper accesses memory. Collect the details. */ + tl_assert(d->mAddr != NULL); + tl_assert(d->mSize != 0); + dataSize = d->mSize; + // Large (eg. 28B, 108B, 512B on x86) data-sized + // instructions will be done inaccurately, but they're + // very rare and this avoids errors from hitting more + // than two cache lines in the simulation. + if (dataSize > MIN_LINE_SIZE) + dataSize = MIN_LINE_SIZE; + if (d->mFx == Ifx_Read || d->mFx == Ifx_Modify) + addEvent_Dr( &clgs, curr_inode, dataSize, d->mAddr ); + if (d->mFx == Ifx_Write || d->mFx == Ifx_Modify) + addEvent_Dw( &clgs, curr_inode, dataSize, d->mAddr ); + } else { + tl_assert(d->mAddr == NULL); + tl_assert(d->mSize == 0); + } + break; + } - // helper call to setup_bbcc, with pointer to basic block info struct as argument - arg1 = mkIRExpr_HWord( (HWord)bb ); - argv = mkIRExprVec_1(arg1); - di = unsafeIRDirty_0_N( 1, "setup_bbcc", - VG_(fnptr_to_fnentry)( & CLG_(setup_bbcc) ), - argv); - addStmtToIRSB( bbOut, IRStmt_Dirty(di) ); - - instrCount = 0; - costOffset = 0; - - // loop for each host instruction (starting from 'i') - do { - - // We should be at an IMark statement - CLG_ASSERT(Ist_IMark == st->tag); - - // Reset stuff for this original instruction - loadAddrExpr = storeAddrExpr = NULL; - instrIssued = False; - dataSize = 0; - - // Process all the statements for this original instruction (ie. until - // the next IMark statement, or the end of the block) - do { - i++; - stnext = ( i < bbIn->stmts_used ? bbIn->stmts[i] : NULL ); - beforeIBoundary = !stnext || (Ist_IMark == stnext->tag); - collectStatementInfo(bbIn->tyenv, st, &instrAddr, &instrLen, - &loadAddrExpr, &storeAddrExpr, &dataSize); - - // instrument a simulator call before conditional jumps - if (st->tag == Ist_Exit) { - // Nb: instrLen will be zero if Vex failed to decode it. - // Also Client requests can appear to be very large (eg. 18 - // bytes on x86) because they are really multiple instructions. - CLG_ASSERT( 0 == instrLen || - bbIn->jumpkind == Ijk_ClientReq || - (instrLen >= VG_MIN_INSTR_SZB && - instrLen <= VG_MAX_INSTR_SZB) ); - - // Add instrumentation before this statement - endOfInstr(bbOut, &(bb->instr[instrCount]), bb_seen_before, - instrAddr - origAddr, instrLen, dataSize, &costOffset, - instrIssued, loadAddrExpr, storeAddrExpr); - - // prepare for a possible further simcall in same host instr - loadAddrExpr = storeAddrExpr = NULL; - instrIssued = True; - - if (!bb_seen_before) { - bb->jmp[cJumps].instr = instrCount; - bb->jmp[cJumps].skip = False; - } - - /* Update global variable jmps_passed (this is before the jump!) - * A correction is needed if VEX inverted the last jump condition - */ - cJumpsCorrected = cJumps; - if ((cJumps+1 == bb->cjmp_count) && bb->cjmp_inverted) cJumpsCorrected++; - addConstMemStoreStmt( bbOut, (UWord) &CLG_(current_state).jmps_passed, - cJumpsCorrected, hWordTy); - - cJumps++; - } + case Ist_Exit: { + UInt jmps_passed; + + /* We may never reach the next statement, so need to flush + all outstanding transactions now. 
*/ + flushEvents( &clgs ); - addStmtToIRSB( bbOut, st ); - st = stnext; - } - while (!beforeIBoundary); + CLG_ASSERT(clgs.ii_index>0); + if (!clgs.seen_before) { + clgs.bb->jmp[cJumps].instr = clgs.ii_index-1; + clgs.bb->jmp[cJumps].skip = False; + } + + /* Update global variable jmps_passed before the jump + * A correction is needed if VEX inverted the last jump condition + */ + jmps_passed = cJumps; + if ((cJumps+1 == clgs.bb->cjmp_count) && clgs.bb->cjmp_inverted) + jmps_passed++; + addConstMemStoreStmt( clgs.sbOut, + (UWord) &CLG_(current_state).jmps_passed, + jmps_passed, hWordTy); + cJumps++; + + break; + } + + default: + tl_assert(0); + break; + } - // Add instrumentation for this original instruction. - if (!instrIssued || (loadAddrExpr != 0) || (storeAddrExpr !=0)) - endOfInstr(bbOut, &(bb->instr[instrCount]), bb_seen_before, - instrAddr - origAddr, instrLen, dataSize, &costOffset, - instrIssued, loadAddrExpr, storeAddrExpr); + /* Copy the original statement */ + addStmtToIRSB( clgs.sbOut, st ); - instrCount++; + CLG_DEBUGIF(5) { + VG_(printf)(" pass "); + ppIRStmt(st); + VG_(printf)("\n"); + } } - while (st); - /* Always update global variable jmps_passed (at end of BB) + /* At the end of the bb. Flush outstandings. */ + flushEvents( &clgs ); + + /* Always update global variable jmps_passed at end of bb. * A correction is needed if VEX inverted the last jump condition */ - cJumpsCorrected = cJumps; - if (bb->cjmp_inverted) cJumpsCorrected--; - addConstMemStoreStmt( bbOut, (UWord) &CLG_(current_state).jmps_passed, - cJumpsCorrected, hWordTy); + { + UInt jmps_passed = cJumps; + if (clgs.bb->cjmp_inverted) jmps_passed--; + addConstMemStoreStmt( clgs.sbOut, + (UWord) &CLG_(current_state).jmps_passed, + jmps_passed, hWordTy); + } + CLG_ASSERT(clgs.bb->cjmp_count == cJumps); + CLG_ASSERT(clgs.bb->instr_count = clgs.ii_index); /* This stores the instr of the call/ret at BB end */ - bb->jmp[cJumps].instr = instrCount-1; + clgs.bb->jmp[cJumps].instr = clgs.ii_index-1; - CLG_ASSERT(bb->cjmp_count == cJumps); - CLG_ASSERT(bb->instr_count == instrCount); - - instrAddr += instrLen; - if (bb_seen_before) { - CLG_ASSERT(bb->instr_len == instrAddr - origAddr); - CLG_ASSERT(bb->cost_count == costOffset); - CLG_ASSERT(bb->jmpkind == bbIn->jumpkind); + if (clgs.seen_before) { + CLG_ASSERT(clgs.bb->cost_count == update_cost_offsets(&clgs)); + CLG_ASSERT(clgs.bb->instr_len = clgs.instr_offset); + CLG_ASSERT(clgs.bb->jmpkind == sbIn->jumpkind); } else { - bb->instr_len = instrAddr - origAddr; - bb->cost_count = costOffset; - bb->jmpkind = bbIn->jumpkind; + clgs.bb->cost_count = update_cost_offsets(&clgs); + clgs.bb->instr_len = clgs.instr_offset; + clgs.bb->jmpkind = sbIn->jumpkind; } - + CLG_DEBUG(3, "- instrument(BB %#lx): byteLen %u, CJumps %u, CostLen %u\n", - origAddr, bb->instr_len, bb->cjmp_count, bb->cost_count); + origAddr, clgs.bb->instr_len, + clgs.bb->cjmp_count, clgs.bb->cost_count); if (cJumps>0) { CLG_DEBUG(3, " [ "); for (i=0;i<cJumps;i++) - CLG_DEBUG(3, "%d ", bb->jmp[i].instr); - CLG_DEBUG(3, "], last inverted: %s \n", bb->cjmp_inverted ? "yes":"no"); + CLG_DEBUG(3, "%d ", clgs.bb->jmp[i].instr); + CLG_DEBUG(3, "], last inverted: %s \n", + clgs.bb->cjmp_inverted ? 
"yes":"no"); } - return bbOut; + return clgs.sbOut; } /*--------------------------------------------------------------------*/ diff --git a/callgrind/sim.c b/callgrind/sim.c index 9edbecc17..9e53f8916 100644 --- a/callgrind/sim.c +++ b/callgrind/sim.c @@ -113,22 +113,21 @@ static Bool clo_collect_cacheuse = False; * - BBCC* nonskipped (only != 0 when in a function not skipped) */ -/* Offset to events in event set, used in log_* functions */ -static Int off_D0_Ir; -static Int off_D1r_Ir; -static Int off_D1r_Dr; -static Int off_D1w_Ir; -static Int off_D1w_Dw; -static Int off_D2_Ir; -static Int off_D2_Dr; -static Int off_D2_Dw; +/* Offset to events in event set, used in log_* functions + * <off_EventSet_BasicEventSet>: offset where basic set is found + */ +static Int off_UIr_Ir; +static Int off_UIrDr_Ir, off_UIrDr_Dr; +static Int off_UIrDrDw_Ir, off_UIrDrDw_Dr, off_UIrDrDw_Dw; +static Int off_UIrDw_Ir, off_UIrDw_Dw; +static Int off_UIrDwDr_Ir, off_UIrDwDr_Dr, off_UIrDwDr_Dw; static Addr bb_base; static ULong* cost_base; static InstrInfo* current_ii; /* Cache use offsets */ -/* FIXME: The offsets are only correct because all eventsets get +/* The offsets are only correct because all per-instruction event sets get * the "Use" set added first ! */ static Int off_I1_AcCost = 0; @@ -984,13 +983,13 @@ static void cacheuse_finish(void) { int i; - InstrInfo ii = { 0,0,0,0,0 }; + InstrInfo ii = { 0,0,0,0 }; if (!CLG_(current_state).collect) return; bb_base = 0; current_ii = ⅈ - cost_base = 0; + cost_base = 0; /* update usage counters */ if (I1.use) @@ -1043,6 +1042,19 @@ void inc_costs(CacheModelResult r, ULong* c1, ULong* c2) } } +static +Char* cacheRes(CacheModelResult r) +{ + switch(r) { + case L1_Hit: return "L1 Hit "; + case L2_Hit: return "L2 Hit "; + case MemAccess: return "L2 Miss"; + case WriteBackMemAccess: return "L2 Miss (dirty)"; + default: + tl_assert(0); + } + return "??"; +} VG_REGPARM(1) static void log_1I0D(InstrInfo* ii) @@ -1052,37 +1064,101 @@ static void log_1I0D(InstrInfo* ii) current_ii = ii; IrRes = (*simulator.I1_Read)(bb_base + ii->instr_offset, ii->instr_size); - CLG_DEBUG(6, "log_1I0D: Ir=%#lx/%u => Ir %d\n", - bb_base + ii->instr_offset, ii->instr_size, IrRes); + CLG_DEBUG(6, "log_1I0D: Ir %#lx/%u => %s\n", + bb_base + ii->instr_offset, ii->instr_size, cacheRes(IrRes)); if (CLG_(current_state).collect) { ULong* cost_Ir; - + if (CLG_(current_state).nonskipped) cost_Ir = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_full_Ir; else - cost_Ir = cost_base + ii->cost_offset + off_D0_Ir; + cost_Ir = cost_base + ii->cost_offset + off_UIr_Ir; inc_costs(IrRes, cost_Ir, CLG_(current_state).cost + CLG_(sets).off_full_Ir ); } } +VG_REGPARM(2) +static void log_2I0D(InstrInfo* ii1, InstrInfo* ii2) +{ + CacheModelResult Ir1Res, Ir2Res; + ULong *global_cost_Ir; + + current_ii = ii1; + Ir1Res = (*simulator.I1_Read)(bb_base + ii1->instr_offset, ii1->instr_size); + current_ii = ii2; + Ir2Res = (*simulator.I1_Read)(bb_base + ii2->instr_offset, ii2->instr_size); + + CLG_DEBUG(6, "log_2I0D: Ir1 %#lx/%u => %s, Ir2 %#lx/%u => %s\n", + bb_base + ii1->instr_offset, ii1->instr_size, cacheRes(Ir1Res), + bb_base + ii2->instr_offset, ii2->instr_size, cacheRes(Ir2Res) ); + + if (!CLG_(current_state).collect) return; + + global_cost_Ir = CLG_(current_state).cost + CLG_(sets).off_full_Ir; + if (CLG_(current_state).nonskipped) { + ULong* skipped_cost_Ir = CLG_(current_state).nonskipped->skipped + + CLG_(sets).off_full_Ir; + inc_costs(Ir1Res, global_cost_Ir, skipped_cost_Ir); + inc_costs(Ir2Res, 
global_cost_Ir, skipped_cost_Ir); + return; + } + + inc_costs(Ir1Res, global_cost_Ir, cost_base + ii1->cost_offset + off_UIr_Ir); + inc_costs(Ir2Res, global_cost_Ir, cost_base + ii2->cost_offset + off_UIr_Ir); +} + +VG_REGPARM(3) +static void log_3I0D(InstrInfo* ii1, InstrInfo* ii2, InstrInfo* ii3) +{ + CacheModelResult Ir1Res, Ir2Res, Ir3Res; + ULong *global_cost_Ir; + + current_ii = ii1; + Ir1Res = (*simulator.I1_Read)(bb_base + ii1->instr_offset, ii1->instr_size); + current_ii = ii2; + Ir2Res = (*simulator.I1_Read)(bb_base + ii2->instr_offset, ii2->instr_size); + current_ii = ii3; + Ir3Res = (*simulator.I1_Read)(bb_base + ii3->instr_offset, ii3->instr_size); + + CLG_DEBUG(6, "log_3I0D: Ir1 %#lx/%u => %s, Ir2 %#lx/%u => %s, Ir3 %#lx/%u => %s\n", + bb_base + ii1->instr_offset, ii1->instr_size, cacheRes(Ir1Res), + bb_base + ii2->instr_offset, ii2->instr_size, cacheRes(Ir2Res), + bb_base + ii3->instr_offset, ii3->instr_size, cacheRes(Ir3Res) ); + + if (!CLG_(current_state).collect) return; + + global_cost_Ir = CLG_(current_state).cost + CLG_(sets).off_full_Ir; + if (CLG_(current_state).nonskipped) { + ULong* skipped_cost_Ir = CLG_(current_state).nonskipped->skipped + + CLG_(sets).off_full_Ir; + inc_costs(Ir1Res, global_cost_Ir, skipped_cost_Ir); + inc_costs(Ir2Res, global_cost_Ir, skipped_cost_Ir); + inc_costs(Ir3Res, global_cost_Ir, skipped_cost_Ir); + return; + } + + inc_costs(Ir1Res, global_cost_Ir, cost_base + ii1->cost_offset + off_UIr_Ir); + inc_costs(Ir2Res, global_cost_Ir, cost_base + ii2->cost_offset + off_UIr_Ir); + inc_costs(Ir3Res, global_cost_Ir, cost_base + ii3->cost_offset + off_UIr_Ir); +} /* Instruction doing a read access */ -VG_REGPARM(2) -static void log_1I1Dr(InstrInfo* ii, Addr data) +VG_REGPARM(3) +static void log_1I1Dr(InstrInfo* ii, Addr data_addr, Word data_size) { CacheModelResult IrRes, DrRes; current_ii = ii; IrRes = (*simulator.I1_Read)(bb_base + ii->instr_offset, ii->instr_size); - DrRes = (*simulator.D1_Read)(data, ii->data_size); + DrRes = (*simulator.D1_Read)(data_addr, data_size); - CLG_DEBUG(6, "log_1I1Dr: Ir=%#lx/%u, Dr=%#lx/%u => Ir %d, Dr %d\n", - bb_base + ii->instr_offset, ii->instr_size, - data, ii->data_size, IrRes, DrRes); + CLG_DEBUG(6, "log_1I1Dr: Ir %#lx/%u => %s, Dr %#lx/%lu => %s\n", + bb_base + ii->instr_offset, ii->instr_size, cacheRes(IrRes), + data_addr, data_size, cacheRes(DrRes)); if (CLG_(current_state).collect) { ULong *cost_Ir, *cost_Dr; @@ -1092,8 +1168,11 @@ static void log_1I1Dr(InstrInfo* ii, Addr data) cost_Dr = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_full_Dr; } else { - cost_Ir = cost_base + ii->cost_offset + off_D1r_Ir; - cost_Dr = cost_base + ii->cost_offset + off_D1r_Dr; + // event set must be UIrDr or extension + CLG_ASSERT((ii->eventset == CLG_(sets).UIrDr) || + (ii->eventset == CLG_(sets).UIrDrDw)); + cost_Ir = cost_base + ii->cost_offset + off_UIrDr_Ir; + cost_Dr = cost_base + ii->cost_offset + off_UIrDr_Dr; } inc_costs(IrRes, cost_Ir, @@ -1104,16 +1183,16 @@ static void log_1I1Dr(InstrInfo* ii, Addr data) } -VG_REGPARM(2) -static void log_0I1Dr(InstrInfo* ii, Addr data) +VG_REGPARM(3) +static void log_0I1Dr(InstrInfo* ii, Addr data_addr, Word data_size) { CacheModelResult DrRes; current_ii = ii; - DrRes = (*simulator.D1_Read)(data, ii->data_size); + DrRes = (*simulator.D1_Read)(data_addr, data_size); - CLG_DEBUG(6, "log_0I1Dr: Dr=%#lx/%u => Dr %d\n", - data, ii->data_size, DrRes); + CLG_DEBUG(6, "log_0I1Dr: Dr %#lx/%lu => %s\n", + data_addr, data_size, cacheRes(DrRes)); if (CLG_(current_state).collect) 
{ ULong *cost_Dr; @@ -1122,9 +1201,15 @@ static void log_0I1Dr(InstrInfo* ii, Addr data) cost_Dr = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_full_Dr; } else { - cost_Dr = cost_base + ii->cost_offset + off_D1r_Dr; + Int off_Dr; + if (ii->eventset == CLG_(sets).UIrDr) off_Dr = off_UIrDr_Dr; + else if (ii->eventset == CLG_(sets).UIrDrDw) off_Dr = off_UIrDrDw_Dr; + else if (ii->eventset == CLG_(sets).UIrDwDr) off_Dr = off_UIrDwDr_Dr; + else CLG_ASSERT(0); + + cost_Dr = cost_base + ii->cost_offset + off_Dr; } - + inc_costs(DrRes, cost_Dr, CLG_(current_state).cost + CLG_(sets).off_full_Dr ); } @@ -1133,29 +1218,33 @@ static void log_0I1Dr(InstrInfo* ii, Addr data) /* Instruction doing a write access */ -VG_REGPARM(2) -static void log_1I1Dw(InstrInfo* ii, Addr data) +VG_REGPARM(3) +static void log_1I1Dw(InstrInfo* ii, Addr data_addr, Word data_size) { CacheModelResult IrRes, DwRes; current_ii = ii; IrRes = (*simulator.I1_Read)(bb_base + ii->instr_offset, ii->instr_size); - DwRes = (*simulator.D1_Write)(data, ii->data_size); + DwRes = (*simulator.D1_Write)(data_addr, data_size); - CLG_DEBUG(6, "log_1I1Dw: Ir=%#lx/%u, Dw=%#lx/%u => Ir %d, Dw %d\n", - bb_base + ii->instr_offset, ii->instr_size, - data, ii->data_size, IrRes, DwRes); + CLG_DEBUG(6, "log_1I1Dw: Ir %#lx/%u => %s, Dw %#lx/%lu => %s\n", + bb_base + ii->instr_offset, ii->instr_size, cacheRes(IrRes), + data_addr, data_size, cacheRes(DwRes)); if (CLG_(current_state).collect) { ULong *cost_Ir, *cost_Dw; if (CLG_(current_state).nonskipped) { - cost_Ir = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Ir; - cost_Dw = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Dw; + cost_Ir = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_full_Ir; + cost_Dw = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_full_Dw; } else { - cost_Ir = cost_base + ii->cost_offset + off_D1w_Ir; - cost_Dw = cost_base + ii->cost_offset + off_D1w_Dw; + // This helper is called when a Dr event follows Ir; + // Event set must be UIrDw or extension + CLG_ASSERT((ii->eventset == CLG_(sets).UIrDw) || + (ii->eventset == CLG_(sets).UIrDwDr)); + cost_Ir = cost_base + ii->cost_offset + off_UIrDw_Ir; + cost_Dw = cost_base + ii->cost_offset + off_UIrDw_Dw; } inc_costs(IrRes, cost_Ir, @@ -1165,16 +1254,16 @@ static void log_1I1Dw(InstrInfo* ii, Addr data) } } -VG_REGPARM(2) -static void log_0I1Dw(InstrInfo* ii, Addr data) +VG_REGPARM(3) +static void log_0I1Dw(InstrInfo* ii, Addr data_addr, Word data_size) { CacheModelResult DwRes; current_ii = ii; - DwRes = (*simulator.D1_Write)(data, ii->data_size); + DwRes = (*simulator.D1_Write)(data_addr, data_size); - CLG_DEBUG(6, "log_0I1Dw: Dw=%#lx/%u => Dw %d\n", - data, ii->data_size, DwRes); + CLG_DEBUG(6, "log_0I1Dw: Dw %#lx/%lu => %s\n", + data_addr, data_size, cacheRes(DwRes)); if (CLG_(current_state).collect) { ULong *cost_Dw; @@ -1183,7 +1272,13 @@ static void log_0I1Dw(InstrInfo* ii, Addr data) cost_Dw = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_full_Dw; } else { - cost_Dw = cost_base + ii->cost_offset + off_D1w_Dw; + Int off_Dw; + if (ii->eventset == CLG_(sets).UIrDw) off_Dw = off_UIrDw_Dw; + else if (ii->eventset == CLG_(sets).UIrDwDr) off_Dw = off_UIrDwDr_Dw; + else if (ii->eventset == CLG_(sets).UIrDrDw) off_Dw = off_UIrDrDw_Dw; + else CLG_ASSERT(0); + + cost_Dw = cost_base + ii->cost_offset + off_Dw; } inc_costs(DwRes, cost_Dw, @@ -1191,77 +1286,6 @@ static void log_0I1Dw(InstrInfo* ii, Addr data) } } -/* Instruction doing a read and a write access */ - 
-VG_REGPARM(3) -static void log_1I2D(InstrInfo* ii, Addr data1, Addr data2) -{ - CacheModelResult IrRes, DrRes, DwRes; - - current_ii = ii; - IrRes = (*simulator.I1_Read)(bb_base + ii->instr_offset, ii->instr_size); - DrRes = (*simulator.D1_Read)(data1, ii->data_size); - DwRes = (*simulator.D1_Write)(data2, ii->data_size); - - CLG_DEBUG(6, - "log_1I2D: Ir=%#lx/%u, Dr=%#lx/%u, Dw=%#lx/%u => Ir %d, Dr %d, Dw %d\n", - bb_base + ii->instr_offset, ii->instr_size, - data1, ii->data_size, data2, ii->data_size, IrRes, DrRes, DwRes); - - if (CLG_(current_state).collect) { - ULong *cost_Ir, *cost_Dr, *cost_Dw; - - if (CLG_(current_state).nonskipped) { - cost_Ir = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Ir; - cost_Dr = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Dr; - cost_Dw = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Dw; - } - else { - cost_Ir = cost_base + ii->cost_offset + off_D2_Ir; - cost_Dr = cost_base + ii->cost_offset + off_D2_Dr; - cost_Dw = cost_base + ii->cost_offset + off_D2_Dw; - } - - inc_costs(IrRes, cost_Ir, - CLG_(current_state).cost + CLG_(sets).off_full_Ir ); - inc_costs(DrRes, cost_Dr, - CLG_(current_state).cost + CLG_(sets).off_full_Dr ); - inc_costs(DwRes, cost_Dw, - CLG_(current_state).cost + CLG_(sets).off_full_Dw ); - } -} - -VG_REGPARM(3) -static void log_0I2D(InstrInfo* ii, Addr data1, Addr data2) -{ - CacheModelResult DrRes, DwRes; - - current_ii = ii; - DrRes = (*simulator.D1_Read)(data1, ii->data_size); - DwRes = (*simulator.D1_Write)(data2, ii->data_size); - - CLG_DEBUG(6, - "log_0D2D: Dr=%#lx/%u, Dw=%#lx/%u => Dr %d, Dw %d\n", - data1, ii->data_size, data2, ii->data_size, DrRes, DwRes); - - if (CLG_(current_state).collect) { - ULong *cost_Dr, *cost_Dw; - - if (CLG_(current_state).nonskipped) { - cost_Dr = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Dr; - cost_Dw = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Dw; - } - else { - cost_Dr = cost_base + ii->cost_offset + off_D2_Dr; - cost_Dw = cost_base + ii->cost_offset + off_D2_Dw; - } - - inc_costs(DrRes, cost_Dr, - CLG_(current_state).cost + CLG_(sets).off_full_Dr ); - inc_costs(DwRes, cost_Dw, - CLG_(current_state).cost + CLG_(sets).off_full_Dw ); - } -} /*------------------------------------------------------------*/ @@ -1369,20 +1393,20 @@ static void cachesim_post_clo_init(void) if (!CLG_(clo).simulate_cache) { CLG_(cachesim).log_1I0D = 0; CLG_(cachesim).log_1I0D_name = "(no function)"; + CLG_(cachesim).log_2I0D = 0; + CLG_(cachesim).log_2I0D_name = "(no function)"; + CLG_(cachesim).log_3I0D = 0; + CLG_(cachesim).log_3I0D_name = "(no function)"; CLG_(cachesim).log_1I1Dr = 0; - CLG_(cachesim).log_1I1Dw = 0; - CLG_(cachesim).log_1I2D = 0; CLG_(cachesim).log_1I1Dr_name = "(no function)"; + CLG_(cachesim).log_1I1Dw = 0; CLG_(cachesim).log_1I1Dw_name = "(no function)"; - CLG_(cachesim).log_1I2D_name = "(no function)"; CLG_(cachesim).log_0I1Dr = 0; - CLG_(cachesim).log_0I1Dw = 0; - CLG_(cachesim).log_0I2D = 0; CLG_(cachesim).log_0I1Dr_name = "(no function)"; + CLG_(cachesim).log_0I1Dw = 0; CLG_(cachesim).log_0I1Dw_name = "(no function)"; - CLG_(cachesim).log_0I2D_name = "(no function)"; return; } @@ -1402,20 +1426,20 @@ static void cachesim_post_clo_init(void) CLG_(cachesim).log_1I0D = log_1I0D; CLG_(cachesim).log_1I0D_name = "log_1I0D"; + CLG_(cachesim).log_2I0D = log_2I0D; + CLG_(cachesim).log_2I0D_name = "log_2I0D"; + CLG_(cachesim).log_3I0D = log_3I0D; + CLG_(cachesim).log_3I0D_name = "log_3I0D"; CLG_(cachesim).log_1I1Dr = 
log_1I1Dr; CLG_(cachesim).log_1I1Dw = log_1I1Dw; - CLG_(cachesim).log_1I2D = log_1I2D; CLG_(cachesim).log_1I1Dr_name = "log_1I1Dr"; CLG_(cachesim).log_1I1Dw_name = "log_1I1Dw"; - CLG_(cachesim).log_1I2D_name = "log_1I2D"; CLG_(cachesim).log_0I1Dr = log_0I1Dr; CLG_(cachesim).log_0I1Dw = log_0I1Dw; - CLG_(cachesim).log_0I2D = log_0I2D; CLG_(cachesim).log_0I1Dr_name = "log_0I1Dr"; CLG_(cachesim).log_0I1Dw_name = "log_0I1Dw"; - CLG_(cachesim).log_0I2D_name = "log_0I2D"; if (clo_collect_cacheuse) { @@ -1763,26 +1787,29 @@ struct event_sets CLG_(sets); void CLG_(init_eventsets)(Int max_user) { EventType * e1, *e2, *e3, *e4; - EventSet *Ir, *Dr, *Dw; - EventSet *D0, *D1r, *D1w, *D2; - EventSet *sim, *full; - EventSet *use; + // Basic event sets from which others are composed + EventSet *Use, *Ir, *Dr, *Dw; + // Compositions of basic sets used for per-instruction counters + EventSet *UIr, *UIrDr, *UIrDrDw, *UIrDw, *UIrDwDr; + // Composition used for global counters and aggregation + EventSet *full; int sizeOfUseIr; - use = CLG_(get_eventset)("Use", 4); + // the "Use" events types only are used with "cacheuse" simulation + Use = CLG_(get_eventset)("Use", 4); if (clo_collect_cacheuse) { /* if TUse is 0, there was never a load, and no loss, too */ e1 = CLG_(register_eventtype)("AcCost1"); - CLG_(add_eventtype)(use, e1); + CLG_(add_eventtype)(Use, e1); e1 = CLG_(register_eventtype)("SpLoss1"); - CLG_(add_eventtype)(use, e1); + CLG_(add_eventtype)(Use, e1); e1 = CLG_(register_eventtype)("AcCost2"); - CLG_(add_eventtype)(use, e1); + CLG_(add_eventtype)(Use, e1); e1 = CLG_(register_eventtype)("SpLoss2"); - CLG_(add_eventtype)(use, e1); + CLG_(add_eventtype)(Use, e1); } - Ir = CLG_(get_eventset)("Ir", 4); + Ir = CLG_(get_eventset)("Ir", 4); Dr = CLG_(get_eventset)("Dr", 4); Dw = CLG_(get_eventset)("Dw", 4); if (CLG_(clo).simulate_cache) { @@ -1822,74 +1849,76 @@ void CLG_(init_eventsets)(Int max_user) CLG_(add_eventtype)(Ir, e1); } - sizeOfUseIr = use->size + Ir->size; - D0 = CLG_(get_eventset)("D0", sizeOfUseIr); - CLG_(add_eventset)(D0, use); - off_D0_Ir = CLG_(add_eventset)(D0, Ir); - - D1r = CLG_(get_eventset)("D1r", sizeOfUseIr + Dr->size); - CLG_(add_eventset)(D1r, use); - off_D1r_Ir = CLG_(add_eventset)(D1r, Ir); - off_D1r_Dr = CLG_(add_eventset)(D1r, Dr); - - D1w = CLG_(get_eventset)("D1w", sizeOfUseIr + Dw->size); - CLG_(add_eventset)(D1w, use); - off_D1w_Ir = CLG_(add_eventset)(D1w, Ir); - off_D1w_Dw = CLG_(add_eventset)(D1w, Dw); - - D2 = CLG_(get_eventset)("D2", sizeOfUseIr + Dr->size + Dw->size); - CLG_(add_eventset)(D2, use); - off_D2_Ir = CLG_(add_eventset)(D2, Ir); - off_D2_Dr = CLG_(add_eventset)(D2, Dr); - off_D2_Dw = CLG_(add_eventset)(D2, Dw); - - sim = CLG_(get_eventset)("sim", sizeOfUseIr + Dr->size + Dw->size); - CLG_(add_eventset)(sim, use); - CLG_(sets).off_sim_Ir = CLG_(add_eventset)(sim, Ir); - CLG_(sets).off_sim_Dr = CLG_(add_eventset)(sim, Dr); - CLG_(sets).off_sim_Dw = CLG_(add_eventset)(sim, Dw); + // Self cost event sets per guest instruction (U used only for cacheUse). + // Each basic event set only appears once, as eg. multiple different Dr's + // in one guest instruction are counted in the same counter. 
- if (CLG_(clo).collect_alloc) max_user += 2; - if (CLG_(clo).collect_systime) max_user += 2; + sizeOfUseIr = Use->size + Ir->size; + UIr = CLG_(get_eventset)("UIr", sizeOfUseIr); + CLG_(add_eventset)(UIr, Use); + off_UIr_Ir = CLG_(add_eventset)(UIr, Ir); - full = CLG_(get_eventset)("full", sim->size + max_user); - CLG_(add_eventset)(full, sim); - CLG_(sets).off_full_Ir = CLG_(sets).off_sim_Ir; - CLG_(sets).off_full_Dr = CLG_(sets).off_sim_Dr; - CLG_(sets).off_full_Dw = CLG_(sets).off_sim_Dw; + UIrDr = CLG_(get_eventset)("UIrDr", sizeOfUseIr + Dr->size); + CLG_(add_eventset)(UIrDr, Use); + off_UIrDr_Ir = CLG_(add_eventset)(UIrDr, Ir); + off_UIrDr_Dr = CLG_(add_eventset)(UIrDr, Dr); - CLG_(sets).use = use; - CLG_(sets).Ir = Ir; - CLG_(sets).Dr = Dr; - CLG_(sets).Dw = Dw; + UIrDrDw = CLG_(get_eventset)("IrDrDw", sizeOfUseIr + Dr->size + Dw->size); + CLG_(add_eventset)(UIrDrDw, Use); + off_UIrDrDw_Ir = CLG_(add_eventset)(UIrDrDw, Ir); + off_UIrDrDw_Dr = CLG_(add_eventset)(UIrDrDw, Dr); + off_UIrDrDw_Dw = CLG_(add_eventset)(UIrDrDw, Dw); - CLG_(sets).D0 = D0; - CLG_(sets).D1r = D1r; - CLG_(sets).D1w = D1w; - CLG_(sets).D2 = D2; + UIrDw = CLG_(get_eventset)("UIrDw", sizeOfUseIr + Dw->size); + CLG_(add_eventset)(UIrDw, Use); + off_UIrDw_Ir = CLG_(add_eventset)(UIrDw, Ir); + off_UIrDw_Dw = CLG_(add_eventset)(UIrDw, Dw); + + UIrDwDr = CLG_(get_eventset)("IrDwDr", sizeOfUseIr + Dw->size + Dr->size); + CLG_(add_eventset)(UIrDwDr, Use); + off_UIrDwDr_Ir = CLG_(add_eventset)(UIrDrDw, Ir); + off_UIrDwDr_Dw = CLG_(add_eventset)(UIrDrDw, Dw); + off_UIrDwDr_Dr = CLG_(add_eventset)(UIrDrDw, Dr); - CLG_(sets).sim = sim; - CLG_(sets).full = full; + // the "full" event set is used as global counter and for aggregation + if (CLG_(clo).collect_alloc) max_user += 2; + if (CLG_(clo).collect_systime) max_user += 2; + full = CLG_(get_eventset)("full", + sizeOfUseIr + Dr->size + Dw->size + max_user); + CLG_(add_eventset)(full, Use); + CLG_(sets).off_full_Ir = CLG_(add_eventset)(full, Ir); + CLG_(sets).off_full_Dr = CLG_(add_eventset)(full, Dr); + CLG_(sets).off_full_Dw = CLG_(add_eventset)(full, Dw); if (CLG_(clo).collect_alloc) { - e1 = CLG_(register_eventtype)("allocCount"); - e2 = CLG_(register_eventtype)("allocSize"); - CLG_(sets).off_full_user = CLG_(add_dep_event2)(full, e1,e2); + e1 = CLG_(register_eventtype)("allocCount"); + e2 = CLG_(register_eventtype)("allocSize"); + CLG_(sets).off_full_alloc = CLG_(add_dep_event2)(full, e1,e2); } - if (CLG_(clo).collect_systime) { - e1 = CLG_(register_eventtype)("sysCount"); - e2 = CLG_(register_eventtype)("sysTime"); - CLG_(sets).off_full_systime = CLG_(add_dep_event2)(full, e1,e2); + e1 = CLG_(register_eventtype)("sysCount"); + e2 = CLG_(register_eventtype)("sysTime"); + CLG_(sets).off_full_systime = CLG_(add_dep_event2)(full, e1,e2); } + CLG_(sets).Use = Use; + CLG_(sets).Ir = Ir; + CLG_(sets).Dr = Dr; + CLG_(sets).Dw = Dw; + CLG_(sets).UIr = UIr; + CLG_(sets).UIrDr = UIrDr; + CLG_(sets).UIrDrDw = UIrDrDw; + CLG_(sets).UIrDw = UIrDw; + CLG_(sets).UIrDwDr = UIrDwDr; + CLG_(sets).full = full; + + CLG_DEBUGIF(1) { CLG_DEBUG(1, "EventSets:\n"); - CLG_(print_eventset)(-2, use); + CLG_(print_eventset)(-2, Use); CLG_(print_eventset)(-2, Ir); CLG_(print_eventset)(-2, Dr); CLG_(print_eventset)(-2, Dw); - CLG_(print_eventset)(-2, sim); CLG_(print_eventset)(-2, full); } @@ -1924,34 +1953,41 @@ static void add_and_zero_Dx(EventSet* es, SimCost dst, ULong* cost) { /* if eventset use is defined, it is always first (hardcoded!) 
*/ - CLG_(add_and_zero_cost)( CLG_(sets).use, dst, cost); + CLG_(add_and_zero_cost)( CLG_(sets).Use, dst, cost); - /* FIXME: This is hardcoded... */ - if (es == CLG_(sets).D0) { - CLG_(add_and_zero_cost)( CLG_(sets).Ir, dst + CLG_(sets).off_sim_Ir, - cost + off_D0_Ir); + if (es == CLG_(sets).UIr) { + CLG_(add_and_zero_cost)( CLG_(sets).Ir, dst + CLG_(sets).off_full_Ir, + cost + off_UIr_Ir); } - else if (es == CLG_(sets).D1r) { - CLG_(add_and_zero_cost)( CLG_(sets).Ir, dst + CLG_(sets).off_sim_Ir, - cost + off_D1r_Ir); - CLG_(add_and_zero_cost)( CLG_(sets).Dr, dst + CLG_(sets).off_sim_Dr, - cost + off_D1r_Dr); + else if (es == CLG_(sets).UIrDr) { + CLG_(add_and_zero_cost)( CLG_(sets).Ir, dst + CLG_(sets).off_full_Ir, + cost + off_UIrDr_Ir); + CLG_(add_and_zero_cost)( CLG_(sets).Dr, dst + CLG_(sets).off_full_Dr, + cost + off_UIrDr_Dr); } - else if (es == CLG_(sets).D1w) { - CLG_(add_and_zero_cost)( CLG_(sets).Ir, dst + CLG_(sets).off_sim_Ir, - cost + off_D1w_Ir); - CLG_(add_and_zero_cost)( CLG_(sets).Dw, dst + CLG_(sets).off_sim_Dw, - cost + off_D1w_Dw); + else if (es == CLG_(sets).UIrDrDw) { + CLG_(add_and_zero_cost)( CLG_(sets).Ir, dst + CLG_(sets).off_full_Ir, + cost + off_UIrDrDw_Ir); + CLG_(add_and_zero_cost)( CLG_(sets).Dr, dst + CLG_(sets).off_full_Dr, + cost + off_UIrDrDw_Dr); + CLG_(add_and_zero_cost)( CLG_(sets).Dw, dst + CLG_(sets).off_full_Dw, + cost + off_UIrDrDw_Dw); } - else { - CLG_ASSERT(es == CLG_(sets).D2); - CLG_(add_and_zero_cost)( CLG_(sets).Ir, dst + CLG_(sets).off_sim_Ir, - cost + off_D2_Ir); - CLG_(add_and_zero_cost)( CLG_(sets).Dr, dst + CLG_(sets).off_sim_Dr, - cost + off_D2_Dr); - CLG_(add_and_zero_cost)( CLG_(sets).Dw, dst + CLG_(sets).off_sim_Dw, - cost + off_D2_Dw); + else if (es == CLG_(sets).UIrDw) { + CLG_(add_and_zero_cost)( CLG_(sets).Ir, dst + CLG_(sets).off_full_Ir, + cost + off_UIrDw_Ir); + CLG_(add_and_zero_cost)( CLG_(sets).Dw, dst + CLG_(sets).off_full_Dw, + cost + off_UIrDw_Dw); + } + else if (es == CLG_(sets).UIrDwDr) { + CLG_(add_and_zero_cost)( CLG_(sets).Ir, dst + CLG_(sets).off_full_Ir, + cost + off_UIrDwDr_Ir); + CLG_(add_and_zero_cost)( CLG_(sets).Dw, dst + CLG_(sets).off_full_Dw, + cost + off_UIrDwDr_Dw); + CLG_(add_and_zero_cost)( CLG_(sets).Dr, dst + CLG_(sets).off_full_Dr, + cost + off_UIrDwDr_Dr); } + else CLG_ASSERT(0); } /* this is called at dump time for every instruction executed */ @@ -1959,7 +1995,7 @@ static void cachesim_add_icost(SimCost cost, BBCC* bbcc, InstrInfo* ii, ULong exe_count) { if (!CLG_(clo).simulate_cache) - cost[CLG_(sets).off_sim_Ir] += exe_count; + cost[CLG_(sets).off_full_Ir] += exe_count; else { #if 0 @@ -2019,24 +2055,24 @@ struct cachesim_if CLG_(cachesim) = { /* these will be set by cachesim_post_clo_init */ .log_1I0D = 0, + .log_2I0D = 0, + .log_3I0D = 0, .log_1I1Dr = 0, .log_1I1Dw = 0, - .log_1I2D = 0, .log_0I1Dr = 0, .log_0I1Dw = 0, - .log_0I2D = 0, .log_1I0D_name = "(no function)", + .log_2I0D_name = "(no function)", + .log_3I0D_name = "(no function)", .log_1I1Dr_name = "(no function)", .log_1I1Dw_name = "(no function)", - .log_1I2D_name = "(no function)", .log_0I1Dr_name = "(no function)", .log_0I1Dw_name = "(no function)", - .log_0I2D_name = "(no function)" }; |
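
A side note on the interface change running through sim.c: the log_* helpers
now receive the access size as an explicit third argument (hence
VG_REGPARM(3)) instead of reading ii->data_size, which is why data_size could
be dropped from InstrInfo. A hedged sketch of the new shape, with mocked-up
types rather than the real Valgrind declarations:

    #include <stdio.h>

    /* Hypothetical mock of the reworked helper interface: the access size
     * travels as a call argument instead of living in InstrInfo, so one
     * InstrInfo can serve data accesses of different sizes. */
    typedef unsigned long Addr;
    typedef long Word;

    typedef struct { unsigned instr_offset, instr_size; } InstrInfo;

    /* Old shape (pre-patch):  void log_0I1Dr(InstrInfo* ii, Addr a);
     *                         with the size read from ii->data_size.
     * New shape (this patch): the size is explicit per call. */
    static void log_0I1Dr(InstrInfo* ii, Addr data_addr, Word data_size)
    {
        printf("Dr at insn+%u: addr=%#lx size=%ld\n",
               ii->instr_offset, data_addr, data_size);
    }

    int main(void)
    {
        InstrInfo ii = { 12, 3 };
        /* e.g. a rep-style insn issuing two reads of different widths */
        log_0I1Dr(&ii, 0x1000, 4);
        log_0I1Dr(&ii, 0x2000, 8);
        return 0;
    }

With the size passed per call, one InstrInfo can back several accesses of
different widths (e.g. rep-prefixed instructions), which the old
per-InstrInfo data_size field could not express.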