summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorYabin Cui <yabinc@google.com>2016-10-24 19:13:09 -0700
committerYabin Cui <yabinc@google.com>2016-10-26 17:08:13 -0700
commit2ea6de11962fea5613f15308b202fb505e57ae9b (patch)
treeb0e52846150f77724aa8c2e4d6500d06b58b13cd
parente7448f00cf02ebe92f41881cf9c4b198cadbb8af (diff)
downloadextras-2ea6de11962fea5613f15308b202fb505e57ae9b.tar.gz
simpleperf: merge records from different buffers in memory.
By reading records from all buffers at once, we can merge records in memory instead of sorting them in perf.data. To make it clear, this patch only contains the code to merge records in memory, and I will remove old method later. Bug: http://b/32343227 Test: run simpleperf_unit_test. Test: run simpleperf_runtest.py. Change-Id: Iea2da06c072243c2014f43c8aa6d96a23cfb9123
-rw-r--r--simpleperf/cmd_record.cpp2
-rw-r--r--simpleperf/event_fd.cpp28
-rw-r--r--simpleperf/event_fd.h2
-rw-r--r--simpleperf/event_selection_set.cpp106
-rw-r--r--simpleperf/event_selection_set.h16
-rw-r--r--simpleperf/perf_clock.cpp7
-rw-r--r--simpleperf/record.cpp6
-rw-r--r--simpleperf/record.h5
8 files changed, 124 insertions, 48 deletions
diff --git a/simpleperf/cmd_record.cpp b/simpleperf/cmd_record.cpp
index 0462eba3..1d4af87e 100644
--- a/simpleperf/cmd_record.cpp
+++ b/simpleperf/cmd_record.cpp
@@ -313,6 +313,7 @@ bool RecordCommand::Run(const std::vector<std::string>& args) {
if (!event_selection_set_.FinishReadMmapEventData()) {
return false;
}
+ // TODO: remove SortDataSection as we have merged records in memory.
if (!record_file_writer_->SortDataSection()) {
return false;
}
@@ -592,6 +593,7 @@ bool RecordCommand::SetEventSelectionFlags() {
}
}
event_selection_set_.SetInherit(child_inherit_);
+ // TODO: remove SetLowWatermark() as we have merged records in memory.
// If Unwinding while recording, records are used before being sorted.
// By using low watermark, records are almost sorted when read from kernel.
if (dwarf_callchain_sampling_ && unwind_dwarf_callchain_ && !post_unwind_) {
diff --git a/simpleperf/event_fd.cpp b/simpleperf/event_fd.cpp
index 39df0852..e01147b0 100644
--- a/simpleperf/event_fd.cpp
+++ b/simpleperf/event_fd.cpp
@@ -37,8 +37,6 @@
#include "perf_event.h"
#include "utils.h"
-std::vector<char> EventFd::data_process_buffer_;
-
static int perf_event_open(const perf_event_attr& attr, pid_t pid, int cpu,
int group_fd, unsigned long flags) { // NOLINT
return syscall(__NR_perf_event_open, &attr, pid, cpu, group_fd, flags);
@@ -157,9 +155,6 @@ bool EventFd::CreateMappedBuffer(size_t mmap_pages, bool report_error) {
mmap_metadata_page_ = reinterpret_cast<perf_event_mmap_page*>(mmap_addr_);
mmap_data_buffer_ = reinterpret_cast<char*>(mmap_addr_) + page_size;
mmap_data_buffer_size_ = mmap_len_ - page_size;
- if (data_process_buffer_.size() < mmap_data_buffer_size_) {
- data_process_buffer_.resize(mmap_data_buffer_size_);
- }
return true;
}
@@ -189,7 +184,7 @@ void EventFd::DestroyMappedBuffer() {
}
}
-size_t EventFd::GetAvailableMmapData(const char** pdata) {
+size_t EventFd::GetAvailableMmapData(std::vector<char>& buffer, size_t& buffer_pos) {
if (!HasMappedBuffer()) {
return 0;
}
@@ -216,18 +211,27 @@ size_t EventFd::GetAvailableMmapData(const char** pdata) {
// No available data.
return 0;
}
+ size_t read_bytes;
+ if (read_head < write_head) {
+ read_bytes = write_head - read_head;
+ } else {
+ read_bytes = mmap_data_buffer_size_ - read_head + write_head;
+ }
+ // Extend the buffer if it is not big enough.
+ if (buffer.size() < buffer_pos + read_bytes) {
+ buffer.resize(buffer_pos + read_bytes);
+ }
// Make sure we can see the data after the fence.
std::atomic_thread_fence(std::memory_order_acquire);
- // Copy records from mapped buffer to data_process_buffer. Note that records
- // can be wrapped at the end of the mapped buffer.
- char* to = data_process_buffer_.data();
+ // Copy records from mapped buffer. Note that records can be wrapped at the
+ // end of the mapped buffer.
+ char* to = &buffer[buffer_pos];
if (read_head < write_head) {
char* from = mmap_data_buffer_ + read_head;
size_t n = write_head - read_head;
memcpy(to, from, n);
- to += n;
} else {
char* from = mmap_data_buffer_ + read_head;
size_t n = mmap_data_buffer_size_ - read_head;
@@ -236,10 +240,8 @@ size_t EventFd::GetAvailableMmapData(const char** pdata) {
from = mmap_data_buffer_;
n = write_head;
memcpy(to, from, n);
- to += n;
}
- size_t read_bytes = to - data_process_buffer_.data();
- *pdata = data_process_buffer_.data();
+ buffer_pos += read_bytes;
DiscardMmapData(read_bytes);
return read_bytes;
}
diff --git a/simpleperf/event_fd.h b/simpleperf/event_fd.h
index bb5fb543..aaba0ef2 100644
--- a/simpleperf/event_fd.h
+++ b/simpleperf/event_fd.h
@@ -76,7 +76,7 @@ class EventFd {
// When the kernel writes new sampled records to the mapped area, we can get
// them by returning the start address and size of the data.
- size_t GetAvailableMmapData(const char** pdata);
+ size_t GetAvailableMmapData(std::vector<char>& buffer, size_t& buffer_pos);
// [callback] is called when there is data available in the mapped buffer.
bool StartPolling(IOEventLoop& loop, const std::function<bool()>& callback);
diff --git a/simpleperf/event_selection_set.cpp b/simpleperf/event_selection_set.cpp
index 9202bfad..5bdf5397 100644
--- a/simpleperf/event_selection_set.cpp
+++ b/simpleperf/event_selection_set.cpp
@@ -481,8 +481,8 @@ bool EventSelectionSet::PrepareToReadMmapEventData(
for (auto& selection : group) {
for (auto& event_fd : selection.event_fds) {
if (event_fd->HasMappedBuffer()) {
- if (!event_fd->StartPolling(loop, [&]() {
- return ReadMmapEventDataForFd(event_fd.get());
+ if (!event_fd->StartPolling(loop, [this]() {
+ return ReadMmapEventData();
})) {
return false;
}
@@ -497,43 +497,85 @@ bool EventSelectionSet::PrepareToReadMmapEventData(
return true;
}
-bool EventSelectionSet::ReadMmapEventDataForFd(EventFd* event_fd) {
- const char* data;
- // Call GetAvailableMmapData() only once instead of calling in a loop, because
- // 1) A mapped buffer caches data before needing to be read again. By default
- // it raises read Event when half full.
- // 2) Spinning on one mapped buffer can make other mapped buffers overflow.
- size_t size = event_fd->GetAvailableMmapData(&data);
- if (size == 0) {
- return true;
- }
- std::vector<std::unique_ptr<Record>> records =
- ReadRecordsFromBuffer(event_fd->attr(), data, size);
- for (auto& r : records) {
- if (!record_callback_(r.get())) {
- return false;
- }
+// When reading from mmap buffers, we prefer reading from all buffers at once
+// rather than reading one buffer at a time, because reading all buffers at
+// once lets us easily merge records from different buffers in memory.
+// Otherwise, we have to sort records with greater effort.
+bool EventSelectionSet::ReadMmapEventData() {
+ size_t head_size = 0;
+ std::vector<RecordBufferHead>& heads = record_buffer_heads_;
+ if (heads.empty()) {
+ heads.resize(1);
}
- return true;
-}
+ heads[0].current_pos = 0;
+ size_t buffer_pos = 0;
-bool EventSelectionSet::FinishReadMmapEventData() {
- // Read each mapped buffer once, because some data may exist in the buffers
- // but is not much enough to raise read Events.
for (auto& group : groups_) {
for (auto& selection : group) {
for (auto& event_fd : selection.event_fds) {
if (event_fd->HasMappedBuffer()) {
- if (!ReadMmapEventDataForFd(event_fd.get())) {
- return false;
+ if (event_fd->GetAvailableMmapData(record_buffer_, buffer_pos) != 0) {
+ heads[head_size].end_pos = buffer_pos;
+ heads[head_size].attr = &selection.event_attr;
+ head_size++;
+ if (heads.size() == head_size) {
+ heads.resize(head_size + 1);
+ }
+ heads[head_size].current_pos = buffer_pos;
}
}
}
}
}
+
+ if (head_size == 1) {
+ // Only one buffer has data, process it directly.
+ std::vector<std::unique_ptr<Record>> records =
+ ReadRecordsFromBuffer(*heads[0].attr,
+ record_buffer_.data(), buffer_pos);
+ for (auto& r : records) {
+ if (!record_callback_(r.get())) {
+ return false;
+ }
+ }
+ } else {
+ // Use a priority queue to merge records from different buffers. As
+ // records from the same buffer are already ordered by time, we only
+ // need to compare the earliest unread record from each buffer. Each
+ // time a record is popped from the queue, we push the next record
+ // from the same buffer into the queue.
+ auto comparator = [&](RecordBufferHead* h1, RecordBufferHead* h2) {
+ return h1->timestamp > h2->timestamp;
+ };
+ std::priority_queue<RecordBufferHead*, std::vector<RecordBufferHead*>, decltype(comparator)> q(comparator);
+ for (size_t i = 0; i < head_size; ++i) {
+ RecordBufferHead& h = heads[i];
+ h.r = ReadRecordFromBuffer(*h.attr, &record_buffer_[h.current_pos]);
+ h.timestamp = h.r->Timestamp();
+ h.current_pos += h.r->size();
+ q.push(&h);
+ }
+ while (!q.empty()) {
+ RecordBufferHead* h = q.top();
+ q.pop();
+ if (!record_callback_(h->r.get())) {
+ return false;
+ }
+ if (h->current_pos < h->end_pos) {
+ h->r = ReadRecordFromBuffer(*h->attr, &record_buffer_[h->current_pos]);
+ h->timestamp = h->r->Timestamp();
+ h->current_pos += h->r->size();
+ q.push(h);
+ }
+ }
+ }
return true;
}
+bool EventSelectionSet::FinishReadMmapEventData() {
+ return ReadMmapEventData();
+}
+
bool EventSelectionSet::HandleCpuHotplugEvents(
IOEventLoop& loop, const std::vector<int>& monitored_cpus,
double check_interval_in_sec) {
@@ -576,6 +618,13 @@ bool EventSelectionSet::DetectCpuHotplugEvents() {
}
bool EventSelectionSet::HandleCpuOfflineEvent(int cpu) {
+ if (!for_stat_cmd_) {
+ // Read mmap data here, so we won't lose the existing records of the
+ // offlined cpu.
+ if (!ReadMmapEventData()) {
+ return false;
+ }
+ }
for (auto& group : groups_) {
for (auto& selection : group) {
for (auto it = selection.event_fds.begin();
@@ -589,9 +638,6 @@ bool EventSelectionSet::HandleCpuOfflineEvent(int cpu) {
selection.hotplugged_counters.push_back(counter);
} else {
if ((*it)->HasMappedBuffer()) {
- if (!ReadMmapEventDataForFd(it->get())) {
- return false;
- }
if (!(*it)->StopPolling()) {
return false;
}
@@ -674,8 +720,8 @@ bool EventSelectionSet::CreateMappedBufferForCpu(int cpu) {
}
}
if (fd_with_buffer != nullptr &&
- !fd_with_buffer->StartPolling(*loop_, [this, fd_with_buffer]() {
- return ReadMmapEventDataForFd(fd_with_buffer);
+ !fd_with_buffer->StartPolling(*loop_, [this]() {
+ return ReadMmapEventData();
})) {
return false;
}
diff --git a/simpleperf/event_selection_set.h b/simpleperf/event_selection_set.h
index 32904c12..2ec58402 100644
--- a/simpleperf/event_selection_set.h
+++ b/simpleperf/event_selection_set.h
@@ -134,7 +134,7 @@ class EventSelectionSet {
std::string* failed_event_type);
bool MmapEventFiles(size_t mmap_pages, bool report_error);
- bool ReadMmapEventDataForFd(EventFd* event_fd);
+ bool ReadMmapEventData();
bool DetectCpuHotplugEvents();
bool HandleCpuOnlineEvent(int cpu);
@@ -154,6 +154,20 @@ class EventSelectionSet {
std::set<int> monitored_cpus_;
std::vector<int> online_cpus_;
+ // Records from all mapped buffers are stored in record_buffer_, and each
+ // RecordBufferHead manages the records read from one mapped buffer. Keep
+ // record_buffer_heads_ and record_buffer_ as members to avoid allocating
+ // them on the heap each time ReadMmapEventData() is called.
+ struct RecordBufferHead {
+ size_t current_pos; // current position in record_buffer_
+ size_t end_pos; // end position in record_buffer_
+ perf_event_attr* attr;
+ uint64_t timestamp;
+ std::unique_ptr<Record> r;
+ };
+ std::vector<RecordBufferHead> record_buffer_heads_;
+ std::vector<char> record_buffer_;
+
DISALLOW_COPY_AND_ASSIGN(EventSelectionSet);
};
diff --git a/simpleperf/perf_clock.cpp b/simpleperf/perf_clock.cpp
index f6f65110..127470e7 100644
--- a/simpleperf/perf_clock.cpp
+++ b/simpleperf/perf_clock.cpp
@@ -127,10 +127,11 @@ static bool GetClockDiff(int64_t* clock_diff_in_ns) {
return false;
}
- const char* data;
- size_t size = event_fd->GetAvailableMmapData(&data);
+ std::vector<char> buffer;
+ size_t buffer_pos = 0;
+ size_t size = event_fd->GetAvailableMmapData(buffer, buffer_pos);
std::vector<std::unique_ptr<Record>> records =
- ReadRecordsFromBuffer(attr, data, size);
+ ReadRecordsFromBuffer(attr, buffer.data(), size);
uint64_t perf_time_in_ns = 0;
for (auto& r : records) {
if (r->type() == PERF_RECORD_MMAP) {
diff --git a/simpleperf/record.cpp b/simpleperf/record.cpp
index be92ac1b..86da0657 100644
--- a/simpleperf/record.cpp
+++ b/simpleperf/record.cpp
@@ -918,6 +918,12 @@ std::vector<std::unique_ptr<Record>> ReadRecordsFromBuffer(
return result;
}
+std::unique_ptr<Record> ReadRecordFromBuffer(const perf_event_attr& attr,
+ const char* p) {
+ auto header = reinterpret_cast<const perf_event_header*>(p);
+ return ReadRecordFromBuffer(attr, header->type, p);
+}
+
bool RecordCache::RecordWithSeq::IsHappensBefore(
const RecordWithSeq& other) const {
bool is_sample = (record->type() == PERF_RECORD_SAMPLE);
diff --git a/simpleperf/record.h b/simpleperf/record.h
index 2cbe35a0..1d308691 100644
--- a/simpleperf/record.h
+++ b/simpleperf/record.h
@@ -517,6 +517,11 @@ std::unique_ptr<Record> ReadRecordFromOwnedBuffer(const perf_event_attr& attr,
std::vector<std::unique_ptr<Record>> ReadRecordsFromBuffer(
const perf_event_attr& attr, const char* buf, size_t buf_size);
+// Read one record from the buffer pointed to by [p]. The returned record
+// does not own the buffer.
+std::unique_ptr<Record> ReadRecordFromBuffer(const perf_event_attr& attr,
+ const char* p);
+
// RecordCache is a cache used when receiving records from the kernel.
// It sorts received records based on type and timestamp, and pops records
// in sorted order. Records from the kernel need to be sorted because