Snap for 5110273 from 295f386099df283f36505edc41a8f656ac3bc11e to qt-release

Change-Id: I71d8d4f30d2e983b6c4ee7d8b8f3c34c10f3c446
author: android-build-team Robot <android-build-team-robot@google.com> 2018-11-04 03:05:48 +0000
committer: android-build-team Robot <android-build-team-robot@google.com> 2018-11-04 03:05:48 +0000
commit: d847f84a424ec4513a35dca61629f4b121e7e25c (patch)
tree: 293b6b89a54667c8fd3a2895acbb6448475daee6
parent: c68bc3578dbd02bee8c8cedd23dcbb1223b0fc81 (diff)
parent: 295f386099df283f36505edc41a8f656ac3bc11e (diff)
download: google-benchmark-d847f84a424ec4513a35dca61629f4b121e7e25c.tar.gz
48 files changed, 2049 insertions, 865 deletions
diff --git a/.travis.yml b/.travis.yml
index 168ed0b..4625dfb 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -129,6 +129,11 @@ matrix:
         - COMPILER=clang++ BUILD_TYPE=Release
     - os: osx
       osx_image: xcode8.3
+      compiler: clang
+      env:
+        - COMPILER=clang++ BUILD_TYPE=Release BUILD_32_BITS=ON
+    - os: osx
+      osx_image: xcode8.3
       compiler: gcc
       env:
         - COMPILER=g++-7 C_COMPILER=gcc-7  BUILD_TYPE=Debug
diff --git a/AUTHORS b/AUTHORS
index f821903..daea1f6 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -19,6 +19,7 @@ Dominik Czarnota <dominik.b.czarnota@gmail.com>
 Eric Fiselier <eric@efcs.ca>
 Eugene Zhuk <eugene.zhuk@gmail.com>
 Evgeny Safronov <division494@gmail.com>
+Federico Ficarelli <federico.ficarelli@gmail.com>
 Felix Homann <linuxaudio@showlabor.de>
 Google Inc.
 International Business Machines Corporation
diff --git a/Android.bp b/Android.bp
index 1f709f3..67faefb 100644
--- a/Android.bp
+++ b/Android.bp
@@ -31,8 +31,10 @@ cc_library_static {
     ],
 
     srcs: [
+        "src/benchmark_api_internal.cc",
         "src/benchmark.cc",
         "src/benchmark_register.cc",
+        "src/benchmark_runner.cc",
         "src/colorprint.cc",
         "src/commandlineflags.cc",
         "src/complexity.cc",
@@ -50,3 +52,8 @@ cc_library_static {
     export_include_dirs: ["include"],
 }
 
+cc_test {
+    name: "google-benchmark-test",
+    srcs: ["test/basic_test.cc"],
+    static_libs: ["libgoogle-benchmark"],
+}
diff --git a/CMakeLists.txt b/CMakeLists.txt
index c7fe7a8..1c57db9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -16,7 +16,11 @@ option(BENCHMARK_ENABLE_TESTING "Enable testing of the benchmark library." ON)
 option(BENCHMARK_ENABLE_EXCEPTIONS "Enable the use of exceptions in the benchmark library." ON)
 option(BENCHMARK_ENABLE_LTO "Enable link time optimisation of the benchmark library." OFF)
 option(BENCHMARK_USE_LIBCXX "Build and test using libc++ as the standard library." OFF)
-option(BENCHMARK_BUILD_32_BITS "Build a 32 bit version of the library." OFF)
+if(NOT MSVC)
+  option(BENCHMARK_BUILD_32_BITS "Build a 32 bit version of the library." OFF)
+else()
+  set(BENCHMARK_BUILD_32_BITS OFF CACHE BOOL "Build a 32 bit version of the library - unsupported when using MSVC)" FORCE)
+endif()
 option(BENCHMARK_ENABLE_INSTALL "Enable installation of benchmark. (Projects embedding benchmark may want to turn this OFF.)" ON)
 
 # Allow unmet dependencies to be met using CMake's ExternalProject mechanics, which
@@ -90,7 +94,7 @@ if (BENCHMARK_BUILD_32_BITS)
   add_required_cxx_compiler_flag(-m32)
 endif()
 
-if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
+if (MSVC)
   # Turn compiler warnings up to 11
   string(REGEX REPLACE "[-/]W[1-4]" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /W4")
@@ -142,6 +146,12 @@ else()
   # Disable warnings regarding deprecated parts of the library while building
   # and testing those parts of the library.
   add_cxx_compiler_flag(-Wno-deprecated-declarations)
+  if (CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
+    # Intel silently ignores '-Wno-deprecated-declarations',
+    # warning no. 1786 must be explicitly disabled.
+    # See #631 for rationale.
+    add_cxx_compiler_flag(-wd1786)
+  endif()
   # Disable deprecation warnings for release builds (when -Werror is enabled).
   add_cxx_compiler_flag(-Wno-deprecated RELEASE)
   add_cxx_compiler_flag(-Wno-deprecated RELWITHDEBINFO)
@@ -183,7 +193,7 @@ else()
       if (GCC_RANLIB)
         set(CMAKE_RANLIB ${GCC_RANLIB})
       endif()
-    elseif("${CMAKE_C_COMPILER_ID}" STREQUAL "Clang")
+    elseif("${CMAKE_C_COMPILER_ID}" MATCHES "Clang")
       include(llvm-toolchain)
     endif()
   endif()
@@ -208,7 +218,7 @@ else()
 endif()
 
 if (BENCHMARK_USE_LIBCXX)
-  if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
+  if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
     add_cxx_compiler_flag(-stdlib=libc++)
   elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU" OR
           "${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel")
diff --git a/CONTRIBUTORS b/CONTRIBUTORS
index 1cf04db..2ff2f2a 100644
--- a/CONTRIBUTORS
+++ b/CONTRIBUTORS
@@ -34,6 +34,7 @@ Dominik Czarnota <dominik.b.czarnota@gmail.com>
 Eric Fiselier <eric@efcs.ca>
 Eugene Zhuk <eugene.zhuk@gmail.com>
 Evgeny Safronov <division494@gmail.com>
+Federico Ficarelli <federico.ficarelli@gmail.com>
 Felix Homann <linuxaudio@showlabor.de>
 Ismael Jimenez Martinez <ismael.jimenez.martinez@gmail.com>
 Jern-Kuan Leong <jernkuan@gmail.com>
diff --git a/METADATA b/METADATA
index edc1d25..586285d 100644
--- a/METADATA
+++ b/METADATA
@@ -9,10 +9,10 @@ third_party {
     type: GIT
     value: "https://github.com/google/benchmark.git"
   }
-  version: "5946795e82f778fff891c37f56c0d7a76a118bf9"
+  version: "c6193afe7eb1eb7802e34833e55e1528cb65c533"
   last_upgrade_date {
     year: 2018
-    month: 7
-    day: 6
+    month: 10
+    day: 29
   }
 }
diff --git a/README.md b/README.md
index 80e69f6..3820395 100644
--- a/README.md
+++ b/README.md
@@ -6,11 +6,9 @@
 
 A library to support the benchmarking of functions, similar to unit-tests.
 
-Discussion group: https://groups.google.com/d/forum/benchmark-discuss
+[Discussion group](https://groups.google.com/d/forum/benchmark-discuss)
 
-IRC channel: https://freenode.net #googlebenchmark
-
-[Known issues and common problems](#known-issues)
+IRC channel: [freenode](https://freenode.net) #googlebenchmark
 
 [Additional Tooling Documentation](docs/tools.md)
 
@@ -47,11 +45,10 @@ to `CMAKE_ARGS`.
 
 For Ubuntu and Debian Based System
 
-First make sure you have git and cmake installed (If not please install it)
+First make sure you have git and cmake installed (If not please install them)
 
 ```
-sudo apt-get install git
-sudo apt-get install cmake
+sudo apt-get install git cmake
 ```
 
 Now, let's clone the repository and build it
@@ -59,22 +56,20 @@ Now, let's clone the repository and build it
 ```
 git clone https://github.com/google/benchmark.git
 cd benchmark
-git clone https://github.com/google/googletest.git
+# If you want to build tests and don't use BENCHMARK_DOWNLOAD_DEPENDENCIES, then
+# git clone https://github.com/google/googletest.git
 mkdir build
 cd build
 cmake .. -DCMAKE_BUILD_TYPE=RELEASE
 make
 ```
 
-We need to install the library globally now
+If you need to install the library globally
 
 ```
 sudo make install
 ```
 
-Now you have google/benchmark installed in your machine
-Note: Don't forget to link to pthread library while building
-
 ## Stable and Experimental Library Versions
 
 The main branch contains the latest stable version of the benchmarking library;
@@ -87,15 +82,16 @@ to use, test, and provide feedback on the new features are encouraged to try
 this branch. However, this branch provides no stability guarantees and reserves
 the right to change and break the API at any time.
 
-## Prerequisite knowledge
-
-Before attempting to understand this framework one should ideally have some familiarity with the structure and format of the Google Test framework, upon which it is based. Documentation for Google Test, including a "Getting Started" (primer) guide, is available here:
-https://github.com/google/googletest/blob/master/googletest/docs/primer.md
+## Further knowledge
 
+It may help to read the [Google Test documentation](https://github.com/google/googletest/blob/master/googletest/docs/primer.md)
+as some of the structural aspects of the APIs are similar.
 
 ## Example usage
 ### Basic usage
-Define a function that executes the code to be measured.
+Define a function that executes the code to be measured, register it as a
+benchmark function using the `BENCHMARK` macro, and ensure an appropriate `main`
+function is available:
 
 ```c++
 #include <benchmark/benchmark.h>
@@ -123,7 +119,23 @@ Don't forget to inform your linker to add benchmark library e.g. through
 `BENCHMARK_MAIN();` at the end of the source file and link against 
 `-lbenchmark_main` to get the same default behavior.
 
-The benchmark library will reporting the timing for the code within the `for(...)` loop.
+The benchmark library will measure and report the timing for code within the
+`for(...)` loop.
+
+#### Platform-specific libraries
+When the library is built using GCC it is necessary to link with the pthread
+library due to how GCC implements `std::thread`. Failing to link to pthread will
+lead to runtime exceptions (unless you're using libc++), not linker errors. See
+[issue #67](https://github.com/google/benchmark/issues/67) for more details. You
+can link to pthread by adding `-pthread` to your linker command. Note, you can
+also use `-lpthread`, but there are potential issues with ordering of command
+line parameters if you use that.
+
+If you're running benchmarks on Windows, the shlwapi library (`-lshlwapi`) is
+also required.
+
+If you're running benchmarks on solaris, you'll want the kstat library linked in
+too (`-lkstat`).
 
 ### Passing arguments
 Sometimes a family of benchmarks can be implemented with just one routine that
@@ -522,15 +534,7 @@ order to manually set the time unit, you can specify it manually:
 BENCHMARK(BM_test)->Unit(benchmark::kMillisecond);
 ```
 
-## Controlling number of iterations
-In all cases, the number of iterations for which the benchmark is run is
-governed by the amount of time the benchmark takes. Concretely, the number of
-iterations is at least one, not more than 1e9, until CPU time is greater than
-the minimum time, or the wallclock time is 5x minimum time. The minimum time is
-set as a flag `--benchmark_min_time` or per benchmark by calling `MinTime` on
-the registered benchmark object.
-
-## Reporting the mean, median and standard deviation by repeated benchmarks
+### Reporting the mean, median and standard deviation by repeated benchmarks
 By default each benchmark is run once and that single result is reported.
 However benchmarks are often noisy and a single result may not be representative
 of the overall behavior. For this reason it's possible to repeatedly rerun the
@@ -541,12 +545,20 @@ The number of runs of each benchmark is specified globally by the
 `Repetitions` on the registered benchmark object. When a benchmark is run more
 than once the mean, median and standard deviation of the runs will be reported.
 
-Additionally the `--benchmark_report_aggregates_only={true|false}` flag or
-`ReportAggregatesOnly(bool)` function can be used to change how repeated tests
-are reported. By default the result of each repeated run is reported. When this
-option is `true` only the mean, median and standard deviation of the runs is reported.
-Calling `ReportAggregatesOnly(bool)` on a registered benchmark object overrides
-the value of the flag for that benchmark.
+Additionally the `--benchmark_report_aggregates_only={true|false}`,
+`--benchmark_display_aggregates_only={true|false}` flags or
+`ReportAggregatesOnly(bool)`, `DisplayAggregatesOnly(bool)` functions can be
+used to change how repeated tests are reported. By default the result of each
+repeated run is reported. When `report aggregates only` option is `true`,
+only the aggregates (i.e. mean, median and standard deviation, maybe complexity
+measurements if they were requested) of the runs is reported, to both the
+reporters - standard output (console), and the file.
+However when only the `display aggregates only` option is `true`,
+only the aggregates are displayed in the standard output, while the file
+output still contains everything.
+Calling `ReportAggregatesOnly(bool)` / `DisplayAggregatesOnly(bool)` on a
+registered benchmark object overrides the value of the appropriate flag for that
+benchmark.
 
 ## User-defined statistics for repeated benchmarks
 While having mean, median and standard deviation is nice, this may not be
@@ -653,9 +665,12 @@ In multithreaded benchmarks, each counter is set on the calling thread only.
 When the benchmark finishes, the counters from each thread will be summed;
 the resulting sum is the value which will be shown for the benchmark.
 
-The `Counter` constructor accepts two parameters: the value as a `double`
-and a bit flag which allows you to show counters as rates and/or as
-per-thread averages:
+The `Counter` constructor accepts three parameters: the value as a `double`
+; a bit flag which allows you to show counters as rates, and/or as per-thread
+iteration, and/or as per-thread averages, and/or iteration invariants;
+and a flag specifying the 'unit' - i.e. is 1k a 1000 (default,
+`benchmark::Counter::OneK::kIs1000`), or 1024
+(`benchmark::Counter::OneK::kIs1024`)?
 
 ```c++
   // sets a simple counter
@@ -671,6 +686,9 @@ per-thread averages:
 
   // There's also a combined flag:
   state.counters["FooAvgRate"] = Counter(numFoos,benchmark::Counter::kAvgThreadsRate);
+
+  // This says that we process with the rate of state.range(0) bytes every iteration:
+  state.counters["BytesProcessed"] = Counter(state.range(0), benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::OneK::kIs1024);
 ```
 
 When you're compiling in C++11 mode or later you can use `insert()` with
@@ -810,8 +828,29 @@ BM_memcpy/32          12 ns         12 ns   54687500
 BM_memcpy/32k       1834 ns       1837 ns     357143
 ```
 
+## Runtime and reporting considerations
+When the benchmark binary is executed, each benchmark function is run serially.
+The number of iterations to run is determined dynamically by running the
+benchmark a few times and measuring the time taken and ensuring that the
+ultimate result will be statistically stable. As such, faster benchmark
+functions will be run for more iterations than slower benchmark functions, and
+the number of iterations is thus reported.
 
-## Output Formats
+In all cases, the number of iterations for which the benchmark is run is
+governed by the amount of time the benchmark takes. Concretely, the number of
+iterations is at least one, not more than 1e9, until CPU time is greater than
+the minimum time, or the wallclock time is 5x minimum time. The minimum time is
+set per benchmark by calling `MinTime` on the registered benchmark object.
+
+Average timings are then reported over the iterations run. If multiple
+repetitions are requested using the `--benchmark_repetitions` command-line
+option, or at registration time, the benchmark function will be run several
+times and statistical results across these repetitions will also be reported.
+
+As well as the per-benchmark entries, a preamble in the report will include
+information about the machine on which the benchmarks are run.
+
+### Output Formats
 The library supports multiple output formats. Use the
 `--benchmark_format=<console|json|csv>` flag to set the format type. `console`
 is the default format.
@@ -879,14 +918,19 @@ name,iterations,real_time,cpu_time,bytes_per_second,items_per_second,label
 "BM_SetInsert/1024/10",106365,17238.4,8421.53,4.74973e+06,1.18743e+06,
 ```
 
-## Output Files
+### Output Files
 The library supports writing the output of the benchmark to a file specified
 by `--benchmark_out=<filename>`. The format of the output can be specified
 using `--benchmark_out_format={json|console|csv}`. Specifying
 `--benchmark_out` does not suppress the console output.
 
+## Result comparison
+
+It is possible to compare the benchmarking results. See [Additional Tooling Documentation](docs/tools.md)
+
 ## Debug vs Release
-By default, benchmark builds as a debug library. You will see a warning in the output when this is the case. To build it as a release library instead, use:
+By default, benchmark builds as a debug library. You will see a warning in the
+output when this is the case. To build it as a release library instead, use:
 
 ```
 cmake -DCMAKE_BUILD_TYPE=Release
@@ -898,16 +942,11 @@ To enable link-time optimisation, use
 cmake -DCMAKE_BUILD_TYPE=Release -DBENCHMARK_ENABLE_LTO=true
 ```
 
-If you are using gcc, you might need to set `GCC_AR` and `GCC_RANLIB` cmake cache variables, if autodetection fails.
-If you are using clang, you may need to set `LLVMAR_EXECUTABLE`, `LLVMNM_EXECUTABLE` and `LLVMRANLIB_EXECUTABLE` cmake cache variables.
+If you are using gcc, you might need to set `GCC_AR` and `GCC_RANLIB` cmake
+cache variables, if autodetection fails.
 
-## Linking against the library
-
-When the library is built using GCC it is necessary to link with `-pthread`,
-due to how GCC implements `std::thread`.
-
-For GCC 4.x failing to link to pthreads will lead to runtime exceptions, not linker errors.
-See [issue #67](https://github.com/google/benchmark/issues/67) for more details.
+If you are using clang, you may need to set `LLVMAR_EXECUTABLE`,
+`LLVMNM_EXECUTABLE` and `LLVMRANLIB_EXECUTABLE` cmake cache variables.
 
 ## Compiler Support
 
@@ -937,14 +976,3 @@ sudo cpupower frequency-set --governor performance
 ./mybench
 sudo cpupower frequency-set --governor powersave
 ```
-
-# Known Issues
-
-### Windows with CMake
-
-* Users must manually link `shlwapi.lib`. Failure to do so may result
-in unresolved symbols.
-
-### Solaris
-
-* Users must explicitly link with kstat library (-lkstat compilation flag).
diff --git a/appveyor.yml b/appveyor.yml
index e99c6e7..cf24019 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -20,12 +20,6 @@ environment:
     - compiler: msvc-14-seh
       generator: "Visual Studio 14 2015 Win64"
 
-    - compiler: msvc-12-seh
-      generator: "Visual Studio 12 2013"
-
-    - compiler: msvc-12-seh
-      generator: "Visual Studio 12 2013 Win64"
-
     - compiler: gcc-5.3.0-posix
       generator: "MinGW Makefiles"
       cxx_path: 'C:\mingw-w64\i686-5.3.0-posix-dwarf-rt_v4-rev0\mingw32\bin'
diff --git a/cmake/HandleGTest.cmake b/cmake/HandleGTest.cmake
index 7ce1a63..b9c1443 100644
--- a/cmake/HandleGTest.cmake
+++ b/cmake/HandleGTest.cmake
@@ -5,7 +5,7 @@ macro(build_external_gtest)
   include(ExternalProject)
   set(GTEST_FLAGS "")
   if (BENCHMARK_USE_LIBCXX)
-    if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
+    if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
       list(APPEND GTEST_FLAGS -stdlib=libc++)
     else()
       message(WARNING "Unsupported compiler (${CMAKE_CXX_COMPILER}) when using libc++")
@@ -76,11 +76,11 @@ macro(build_external_gtest)
 endmacro(build_external_gtest)
 
 if (BENCHMARK_ENABLE_GTEST_TESTS)
-  if (IS_DIRECTORY ${CMAKE_SOURCE_DIR}/googletest)
-    set(GTEST_ROOT "${CMAKE_SOURCE_DIR}/googletest")
+  if (IS_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/googletest)
+    set(GTEST_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/googletest")
     set(INSTALL_GTEST OFF CACHE INTERNAL "")
     set(INSTALL_GMOCK OFF CACHE INTERNAL "")
-    add_subdirectory(${CMAKE_SOURCE_DIR}/googletest)
+    add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/googletest)
     set(GTEST_BOTH_LIBRARIES gtest gmock gmock_main)
     foreach(HEADER test mock)
       # CMake 2.8 and older don't respect INTERFACE_INCLUDE_DIRECTORIES, so we
diff --git a/docs/tools.md b/docs/tools.md
index 70500bd..4a3b2e9 100644
--- a/docs/tools.md
+++ b/docs/tools.md
@@ -1,84 +1,25 @@
 # Benchmark Tools
 
-## compare_bench.py
-
-The `compare_bench.py` utility which can be used to compare the result of benchmarks.
-The program is invoked like:
-
-``` bash
-$ compare_bench.py <old-benchmark> <new-benchmark> [benchmark options]...
-```
-
-Where `<old-benchmark>` and `<new-benchmark>` either specify a benchmark executable file, or a JSON output file. The type of the input file is automatically detected. If a benchmark executable is specified then the benchmark is run to obtain the results. Otherwise the results are simply loaded from the output file.
-
-`[benchmark options]` will be passed to the benchmarks invocations. They can be anything that binary accepts, be it either normal `--benchmark_*` parameters, or some custom parameters your binary takes.
-
-The sample output using the JSON test files under `Inputs/` gives:
-
-``` bash
-$ ./compare_bench.py ./gbench/Inputs/test1_run1.json ./gbench/Inputs/test1_run2.json
-Comparing ./gbench/Inputs/test1_run1.json to ./gbench/Inputs/test1_run2.json
-Benchmark                        Time             CPU      Time Old      Time New       CPU Old       CPU New
--------------------------------------------------------------------------------------------------------------
-BM_SameTimes                  +0.0000         +0.0000            10            10            10            10
-BM_2xFaster                   -0.5000         -0.5000            50            25            50            25
-BM_2xSlower                   +1.0000         +1.0000            50           100            50           100
-BM_1PercentFaster             -0.0100         -0.0100           100            99           100            99
-BM_1PercentSlower             +0.0100         +0.0100           100           101           100           101
-BM_10PercentFaster            -0.1000         -0.1000           100            90           100            90
-BM_10PercentSlower            +0.1000         +0.1000           100           110           100           110
-BM_100xSlower                +99.0000        +99.0000           100         10000           100         10000
-BM_100xFaster                 -0.9900         -0.9900         10000           100         10000           100
-BM_10PercentCPUToTime         +0.1000         -0.1000           100           110           100            90
-BM_ThirdFaster                -0.3333         -0.3334           100            67           100            67
-BM_BadTimeUnit                -0.9000         +0.2000             0             0             0             1
-```
-
-As you can note, the values in `Time` and `CPU` columns are calculated as `(new - old) / |old|`.
+## compare.py
 
-When a benchmark executable is run, the raw output from the benchmark is printed in real time to stdout. The sample output using `benchmark/basic_test` for both arguments looks like:
+The `compare.py` can be used to compare the result of benchmarks.
 
-```
-./compare_bench.py  test/basic_test test/basic_test  --benchmark_filter=BM_empty.*
-RUNNING: test/basic_test --benchmark_filter=BM_empty.* --benchmark_out=/tmp/tmpN7LF3a
-Run on (8 X 4000 MHz CPU s)
-2017-11-07 23:28:36
----------------------------------------------------------------------
-Benchmark                              Time           CPU Iterations
----------------------------------------------------------------------
-BM_empty                               4 ns          4 ns  170178757
-BM_empty/threads:8                     1 ns          7 ns  103868920
-BM_empty_stop_start                    0 ns          0 ns 1000000000
-BM_empty_stop_start/threads:8          0 ns          0 ns 1403031720
-RUNNING: /test/basic_test --benchmark_filter=BM_empty.* --benchmark_out=/tmp/tmplvrIp8
-Run on (8 X 4000 MHz CPU s)
-2017-11-07 23:28:38
----------------------------------------------------------------------
-Benchmark                              Time           CPU Iterations
----------------------------------------------------------------------
-BM_empty                               4 ns          4 ns  169534855
-BM_empty/threads:8                     1 ns          7 ns  104188776
-BM_empty_stop_start                    0 ns          0 ns 1000000000
-BM_empty_stop_start/threads:8          0 ns          0 ns 1404159424
-Comparing ../build/test/basic_test to ../build/test/basic_test
-Benchmark                                Time             CPU      Time Old      Time New       CPU Old       CPU New
----------------------------------------------------------------------------------------------------------------------
-BM_empty                              -0.0048         -0.0049             4             4             4             4
-BM_empty/threads:8                    -0.0123         -0.0054             1             1             7             7
-BM_empty_stop_start                   -0.0000         -0.0000             0             0             0             0
-BM_empty_stop_start/threads:8         -0.0029         +0.0001             0             0             0             0
+**NOTE**: the utility relies on the scipy package which can be installed using [these instructions](https://www.scipy.org/install.html).
 
-```
+### Displaying aggregates only
 
-As you can note, the values in `Time` and `CPU` columns are calculated as `(new - old) / |old|`.
-Obviously this example doesn't give any useful output, but it's intended to show the output format when 'compare_bench.py' needs to run benchmarks.
+The switch `-a` / `--display_aggregates_only` can be used to control the
+displayment of the normal iterations vs the aggregates. When passed, it will
+be passthrough to the benchmark binaries to be run, and will be accounted for
+in the tool itself; only the aggregates will be displayed, but not normal runs.
+It only affects the display, the separate runs will still be used to calculate
+the U test.
 
-## compare.py
+### Modes of operation
 
-The `compare.py` can be used to compare the result of benchmarks.
 There are three modes of operation:
 
-1. Just compare two benchmarks, what `compare_bench.py` did.
+1. Just compare two benchmarks
 The program is invoked like:
 
 ``` bash
@@ -240,3 +181,19 @@ Benchmark                               Time             CPU      Time Old
 ```
 This is a mix of the previous two modes, two (potentially different) benchmark binaries are run, and a different filter is applied to each one.
 As you can note, the values in `Time` and `CPU` columns are calculated as `(new - old) / |old|`.
+
+### U test
+
+If there is a sufficient repetition count of the benchmarks, the tool can do
+a [U Test](https://en.wikipedia.org/wiki/Mann%E2%80%93Whitney_U_test), of the
+null hypothesis that it is equally likely that a randomly selected value from
+one sample will be less than or greater than a randomly selected value from a
+second sample.
+
+If the calculated p-value is below this value is lower than the significance
+level alpha, then the result is said to be statistically significant and the
+null hypothesis is rejected. Which in other words means that the two benchmarks
+aren't identical.
+
+**WARNING**: requires **LARGE** (no less than 9) number of repetitions to be
+meaningful!
diff --git a/include/benchmark/benchmark.h b/include/benchmark/benchmark.h
index 193fffc..157a717 100644
--- a/include/benchmark/benchmark.h
+++ b/include/benchmark/benchmark.h
@@ -241,8 +241,21 @@ BENCHMARK(BM_test)->Unit(benchmark::kMillisecond);
 #define BENCHMARK_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
 #endif
 
+#ifndef __has_builtin
+#define __has_builtin(x) 0
+#endif
+
+#if defined(__GNUC__) || __has_builtin(__builtin_unreachable)
+  #define BENCHMARK_UNREACHABLE() __builtin_unreachable()
+#elif defined(_MSC_VER)
+  #define BENCHMARK_UNREACHABLE() __assume(false)
+#else
+  #define BENCHMARK_UNREACHABLE() ((void)0)
+#endif
+
 namespace benchmark {
 class BenchmarkReporter;
+class MemoryManager;
 
 void Initialize(int* argc, char** argv);
 
@@ -255,7 +268,7 @@ bool ReportUnrecognizedArguments(int argc, char** argv);
 // of each matching benchmark. Otherwise run each matching benchmark and
 // report the results.
 //
-// The second and third overload use the specified 'console_reporter' and
+// The second and third overload use the specified 'display_reporter' and
 //  'file_reporter' respectively. 'file_reporter' will write to the file
 //  specified
 //   by '--benchmark_output'. If '--benchmark_output' is not given the
@@ -263,16 +276,13 @@ bool ReportUnrecognizedArguments(int argc, char** argv);
 //
 // RETURNS: The number of matching benchmarks.
 size_t RunSpecifiedBenchmarks();
-size_t RunSpecifiedBenchmarks(BenchmarkReporter* console_reporter);
-size_t RunSpecifiedBenchmarks(BenchmarkReporter* console_reporter,
+size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter);
+size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter,
                               BenchmarkReporter* file_reporter);
 
-// If this routine is called, peak memory allocation past this point in the
-// benchmark is reported at the end of the benchmark report line. (It is
-// computed by running the benchmark once with a single iteration and a memory
-// tracer.)
-// TODO(dominic)
-// void MemoryUsage();
+// Register a MemoryManager instance that will be used to collect and report
+// allocation measurements for benchmark runs.
+void RegisterMemoryManager(MemoryManager* memory_manager);
 
 namespace internal {
 class Benchmark;
@@ -363,11 +373,20 @@ class Counter {
     kAvgIterationsRate = kIsRate | kAvgIterations
   };
 
+  enum OneK {
+    // 1'000 items per 1k
+    kIs1000 = 1000,
+    // 1'024 items per 1k
+    kIs1024 = 1024
+  };
+
   double value;
   Flags flags;
+  OneK oneK;
 
   BENCHMARK_ALWAYS_INLINE
-  Counter(double v = 0., Flags f = kDefaults) : value(v), flags(f) {}
+  Counter(double v = 0., Flags f = kDefaults, OneK k = kIs1000)
+      : value(v), flags(f), oneK(k) {}
 
   BENCHMARK_ALWAYS_INLINE operator double const&() const { return value; }
   BENCHMARK_ALWAYS_INLINE operator double&() { return value; }
@@ -406,22 +425,35 @@ struct Statistics {
   std::string name_;
   StatisticsFunc* compute_;
 
-  Statistics(std::string name, StatisticsFunc* compute)
+  Statistics(const std::string& name, StatisticsFunc* compute)
       : name_(name), compute_(compute) {}
 };
 
 namespace internal {
+struct BenchmarkInstance;
 class ThreadTimer;
 class ThreadManager;
 
-enum ReportMode
+enum AggregationReportMode
 #if defined(BENCHMARK_HAS_CXX11)
     : unsigned
 #else
 #endif
-{ RM_Unspecified,  // The mode has not been manually specified
-  RM_Default,      // The mode is user-specified as default.
-  RM_ReportAggregatesOnly };
+{
+  // The mode has not been manually specified
+  ARM_Unspecified = 0,
+  // The mode is user-specified.
+  // This may or may not be set when the following bit-flags are set.
+  ARM_Default = 1U << 0U,
+  // File reporter should only output aggregates.
+  ARM_FileReportAggregatesOnly = 1U << 1U,
+  // Display reporter should only output aggregates
+  ARM_DisplayReportAggregatesOnly = 1U << 2U,
+  // Both reporters should only display aggregates.
+  ARM_ReportAggregatesOnly =
+      ARM_FileReportAggregatesOnly | ARM_DisplayReportAggregatesOnly
+};
+
 }  // namespace internal
 
 // State is passed to a running Benchmark and contains state for the
@@ -517,16 +549,21 @@ class State {
 
   // Set the number of bytes processed by the current benchmark
   // execution.  This routine is typically called once at the end of a
-  // throughput oriented benchmark.  If this routine is called with a
-  // value > 0, the report is printed in MB/sec instead of nanoseconds
-  // per iteration.
+  // throughput oriented benchmark.
   //
   // REQUIRES: a benchmark has exited its benchmarking loop.
   BENCHMARK_ALWAYS_INLINE
-  void SetBytesProcessed(int64_t bytes) { bytes_processed_ = bytes; }
+  void SetBytesProcessed(int64_t bytes) {
+    counters["bytes_per_second"] =
+        Counter(static_cast<double>(bytes), Counter::kIsRate, Counter::kIs1024);
+  }
 
   BENCHMARK_ALWAYS_INLINE
-  int64_t bytes_processed() const { return bytes_processed_; }
+  int64_t bytes_processed() const {
+    if (counters.find("bytes_per_second") != counters.end())
+      return static_cast<int64_t>(counters.at("bytes_per_second"));
+    return 0;
+  }
 
   // If this routine is called with complexity_n > 0 and complexity report is
   // requested for the
@@ -546,10 +583,17 @@ class State {
   //
   // REQUIRES: a benchmark has exited its benchmarking loop.
   BENCHMARK_ALWAYS_INLINE
-  void SetItemsProcessed(int64_t items) { items_processed_ = items; }
+  void SetItemsProcessed(int64_t items) {
+    counters["items_per_second"] =
+        Counter(static_cast<double>(items), benchmark::Counter::kIsRate);
+  }
 
   BENCHMARK_ALWAYS_INLINE
-  int64_t items_processed() const { return items_processed_; }
+  int64_t items_processed() const {
+    if (counters.find("items_per_second") != counters.end())
+      return static_cast<int64_t>(counters.at("items_per_second"));
+    return 0;
+  }
 
   // If this routine is called, the specified label is printed at the
   // end of the benchmark report line for the currently executing
@@ -612,9 +656,6 @@ class State {
  private:  // items we don't need on the first cache line
   std::vector<int64_t> range_;
 
-  int64_t bytes_processed_;
-  int64_t items_processed_;
-
   int64_t complexity_n_;
 
  public:
@@ -625,12 +666,11 @@ class State {
   // Number of threads concurrently executing the benchmark.
   const int threads;
 
-  // TODO(EricWF) make me private
+ private:
   State(size_t max_iters, const std::vector<int64_t>& ranges, int thread_i,
         int n_threads, internal::ThreadTimer* timer,
         internal::ThreadManager* manager);
 
- private:
   void StartKeepRunning();
   // Implementation of KeepRunning() and KeepRunningBatch().
   // is_batch must be true unless n is 1.
@@ -638,7 +678,8 @@ class State {
   void FinishKeepRunning();
   internal::ThreadTimer* timer_;
   internal::ThreadManager* manager_;
-  BENCHMARK_DISALLOW_COPY_AND_ASSIGN(State);
+
+  friend struct internal::BenchmarkInstance;
 };
 
 inline BENCHMARK_ALWAYS_INLINE bool State::KeepRunning() {
@@ -827,8 +868,12 @@ class Benchmark {
   // Specify if each repetition of the benchmark should be reported separately
   // or if only the final statistics should be reported. If the benchmark
   // is not repeated then the single result is always reported.
+  // Applies to *ALL* reporters (display and file).
   Benchmark* ReportAggregatesOnly(bool value = true);
 
+  // Same as ReportAggregatesOnly(), but applies to display reporter only.
+  Benchmark* DisplayAggregatesOnly(bool value = true);
+
   // If a particular benchmark is I/O bound, runs multiple threads internally or
   // if for some reason CPU timings are not representative, call this method. If
   // called, the elapsed time will be used to control how many iterations are
@@ -888,9 +933,6 @@ class Benchmark {
 
   virtual void Run(State& state) = 0;
 
-  // Used inside the benchmark implementation
-  struct Instance;
-
  protected:
   explicit Benchmark(const char* name);
   Benchmark(Benchmark const&);
@@ -902,7 +944,7 @@ class Benchmark {
   friend class BenchmarkFamilies;
 
   std::string name_;
-  ReportMode report_mode_;
+  AggregationReportMode aggregation_report_mode_;
   std::vector<std::string> arg_names_;       // Args for all benchmark runs
   std::vector<std::vector<int64_t> > args_;  // Args for all benchmark runs
   TimeUnit time_unit_;
@@ -1266,23 +1308,30 @@ class BenchmarkReporter {
   };
 
   struct Run {
+    enum RunType { RT_Iteration, RT_Aggregate };
+
     Run()
-        : error_occurred(false),
+        : run_type(RT_Iteration),
+          error_occurred(false),
           iterations(1),
           time_unit(kNanosecond),
           real_accumulated_time(0),
           cpu_accumulated_time(0),
-          bytes_per_second(0),
-          items_per_second(0),
           max_heapbytes_used(0),
           complexity(oNone),
           complexity_lambda(),
           complexity_n(0),
           report_big_o(false),
           report_rms(false),
-          counters() {}
-
-    std::string benchmark_name;
+          counters(),
+          has_memory_result(false),
+          allocs_per_iter(0.0),
+          max_bytes_used(0) {}
+
+    std::string benchmark_name() const;
+    std::string run_name;
+    RunType run_type;          // is this a measurement, or an aggregate?
+    std::string aggregate_name;
     std::string report_label;  // Empty if not set by benchmark.
     bool error_occurred;
     std::string error_message;
@@ -1304,10 +1353,6 @@ class BenchmarkReporter {
     // accumulated time.
     double GetAdjustedCPUTime() const;
 
-    // Zero if not set by benchmark.
-    double bytes_per_second;
-    double items_per_second;
-
     // This is set to 0.0 if memory tracing is not enabled.
     double max_heapbytes_used;
 
@@ -1324,6 +1369,11 @@ class BenchmarkReporter {
     bool report_rms;
 
     UserCounters counters;
+
+    // Memory metrics.
+    bool has_memory_result;
+    double allocs_per_iter;
+    int64_t max_bytes_used;
   };
 
   // Construct a BenchmarkReporter with the output stream set to 'std::cout'
@@ -1438,6 +1488,29 @@ class BENCHMARK_DEPRECATED_MSG("The CSV Reporter will be removed in a future rel
   std::set<std::string> user_counter_names_;
 };
 
+// If a MemoryManager is registered, it can be used to collect and report
+// allocation metrics for a run of the benchmark.
+class MemoryManager {
+ public:
+  struct Result {
+    Result() : num_allocs(0), max_bytes_used(0) {}
+
+    // The number of allocations made in total between Start and Stop.
+    int64_t num_allocs;
+
+    // The peak memory use between Start and Stop.
+    int64_t max_bytes_used;
+  };
+
+  virtual ~MemoryManager() {}
+
+  // Implement this to start recording allocation information.
+  virtual void Start() = 0;
+
+  // Implement this to stop recording and fill out the given Result structure.
+  virtual void Stop(Result* result) = 0;
+};
+
 inline const char* GetTimeUnitString(TimeUnit unit) {
   switch (unit) {
     case kMillisecond:
@@ -1445,9 +1518,9 @@ inline const char* GetTimeUnitString(TimeUnit unit) {
     case kMicrosecond:
       return "us";
     case kNanosecond:
-    default:
       return "ns";
   }
+  BENCHMARK_UNREACHABLE();
 }
 
 inline double GetTimeUnitMultiplier(TimeUnit unit) {
@@ -1457,9 +1530,9 @@ inline double GetTimeUnitMultiplier(TimeUnit unit) {
     case kMicrosecond:
       return 1e6;
     case kNanosecond:
-    default:
       return 1e9;
   }
+  BENCHMARK_UNREACHABLE();
 }
 
 }  // namespace benchmark
diff --git a/src/benchmark.cc b/src/benchmark.cc
index b14bc62..410493f 100644
--- a/src/benchmark.cc
+++ b/src/benchmark.cc
@@ -14,6 +14,7 @@
 
 #include "benchmark/benchmark.h"
 #include "benchmark_api_internal.h"
+#include "benchmark_runner.h"
 #include "internal_macros.h"
 
 #ifndef BENCHMARK_OS_WINDOWS
@@ -34,6 +35,7 @@
 #include <memory>
 #include <string>
 #include <thread>
+#include <utility>
 
 #include "check.h"
 #include "colorprint.h"
@@ -72,10 +74,19 @@ DEFINE_int32(benchmark_repetitions, 1,
              "The number of runs of each benchmark. If greater than 1, the "
              "mean and standard deviation of the runs will be reported.");
 
-DEFINE_bool(benchmark_report_aggregates_only, false,
-            "Report the result of each benchmark repetitions. When 'true' is "
-            "specified only the mean, standard deviation, and other statistics "
-            "are reported for repeated benchmarks.");
+DEFINE_bool(
+    benchmark_report_aggregates_only, false,
+    "Report the result of each benchmark repetitions. When 'true' is specified "
+    "only the mean, standard deviation, and other statistics are reported for "
+    "repeated benchmarks. Affects all reporters.");
+
+DEFINE_bool(
+    benchmark_display_aggregates_only, false,
+    "Display the result of each benchmark repetitions. When 'true' is "
+    "specified only the mean, standard deviation, and other statistics are "
+    "displayed for repeated benchmarks. Unlike "
+    "benchmark_report_aggregates_only, only affects the display reporter, but "
+    "*NOT* file reporter, which will still contain all the output.");
 
 DEFINE_string(benchmark_format, "console",
               "The format to use for console output. Valid values are "
@@ -103,193 +114,11 @@ DEFINE_int32(v, 0, "The level of verbose logging to output");
 
 namespace benchmark {
 
-namespace {
-static const size_t kMaxIterations = 1000000000;
-}  // end namespace
-
 namespace internal {
 
+// FIXME: wouldn't LTO mess this up?
 void UseCharPointer(char const volatile*) {}
 
-namespace {
-
-BenchmarkReporter::Run CreateRunReport(
-    const benchmark::internal::Benchmark::Instance& b,
-    const internal::ThreadManager::Result& results, double seconds) {
-  // Create report about this benchmark run.
-  BenchmarkReporter::Run report;
-
-  report.benchmark_name = b.name;
-  report.error_occurred = results.has_error_;
-  report.error_message = results.error_message_;
-  report.report_label = results.report_label_;
-  // This is the total iterations across all threads.
-  report.iterations = results.iterations;
-  report.time_unit = b.time_unit;
-
-  if (!report.error_occurred) {
-    double bytes_per_second = 0;
-    if (results.bytes_processed > 0 && seconds > 0.0) {
-      bytes_per_second = (results.bytes_processed / seconds);
-    }
-    double items_per_second = 0;
-    if (results.items_processed > 0 && seconds > 0.0) {
-      items_per_second = (results.items_processed / seconds);
-    }
-
-    if (b.use_manual_time) {
-      report.real_accumulated_time = results.manual_time_used;
-    } else {
-      report.real_accumulated_time = results.real_time_used;
-    }
-    report.cpu_accumulated_time = results.cpu_time_used;
-    report.bytes_per_second = bytes_per_second;
-    report.items_per_second = items_per_second;
-    report.complexity_n = results.complexity_n;
-    report.complexity = b.complexity;
-    report.complexity_lambda = b.complexity_lambda;
-    report.statistics = b.statistics;
-    report.counters = results.counters;
-    internal::Finish(&report.counters, results.iterations, seconds, b.threads);
-  }
-  return report;
-}
-
-// Execute one thread of benchmark b for the specified number of iterations.
-// Adds the stats collected for the thread into *total.
-void RunInThread(const benchmark::internal::Benchmark::Instance* b,
-                 size_t iters, int thread_id,
-                 internal::ThreadManager* manager) {
-  internal::ThreadTimer timer;
-  State st(iters, b->arg, thread_id, b->threads, &timer, manager);
-  b->benchmark->Run(st);
-  CHECK(st.iterations() >= st.max_iterations)
-      << "Benchmark returned before State::KeepRunning() returned false!";
-  {
-    MutexLock l(manager->GetBenchmarkMutex());
-    internal::ThreadManager::Result& results = manager->results;
-    results.iterations += st.iterations();
-    results.cpu_time_used += timer.cpu_time_used();
-    results.real_time_used += timer.real_time_used();
-    results.manual_time_used += timer.manual_time_used();
-    results.bytes_processed += st.bytes_processed();
-    results.items_processed += st.items_processed();
-    results.complexity_n += st.complexity_length_n();
-    internal::Increment(&results.counters, st.counters);
-  }
-  manager->NotifyThreadComplete();
-}
-
-std::vector<BenchmarkReporter::Run> RunBenchmark(
-    const benchmark::internal::Benchmark::Instance& b,
-    std::vector<BenchmarkReporter::Run>* complexity_reports) {
-  std::vector<BenchmarkReporter::Run> reports;  // return value
-
-  const bool has_explicit_iteration_count = b.iterations != 0;
-  size_t iters = has_explicit_iteration_count ? b.iterations : 1;
-  std::unique_ptr<internal::ThreadManager> manager;
-  std::vector<std::thread> pool(b.threads - 1);
-  const int repeats =
-      b.repetitions != 0 ? b.repetitions : FLAGS_benchmark_repetitions;
-  const bool report_aggregates_only =
-      repeats != 1 &&
-      (b.report_mode == internal::RM_Unspecified
-           ? FLAGS_benchmark_report_aggregates_only
-           : b.report_mode == internal::RM_ReportAggregatesOnly);
-  for (int repetition_num = 0; repetition_num < repeats; repetition_num++) {
-    for (;;) {
-      // Try benchmark
-      VLOG(2) << "Running " << b.name << " for " << iters << "\n";
-
-      manager.reset(new internal::ThreadManager(b.threads));
-      for (std::size_t ti = 0; ti < pool.size(); ++ti) {
-        pool[ti] = std::thread(&RunInThread, &b, iters,
-                               static_cast<int>(ti + 1), manager.get());
-      }
-      RunInThread(&b, iters, 0, manager.get());
-      manager->WaitForAllThreads();
-      for (std::thread& thread : pool) thread.join();
-      internal::ThreadManager::Result results;
-      {
-        MutexLock l(manager->GetBenchmarkMutex());
-        results = manager->results;
-      }
-      manager.reset();
-      // Adjust real/manual time stats since they were reported per thread.
-      results.real_time_used /= b.threads;
-      results.manual_time_used /= b.threads;
-
-      VLOG(2) << "Ran in " << results.cpu_time_used << "/"
-              << results.real_time_used << "\n";
-
-      // Base decisions off of real time if requested by this benchmark.
-      double seconds = results.cpu_time_used;
-      if (b.use_manual_time) {
-        seconds = results.manual_time_used;
-      } else if (b.use_real_time) {
-        seconds = results.real_time_used;
-      }
-
-      const double min_time =
-          !IsZero(b.min_time) ? b.min_time : FLAGS_benchmark_min_time;
-
-      // clang-format off
-      // turn off clang-format since it mangles prettiness here
-      // Determine if this run should be reported; Either it has
-      // run for a sufficient amount of time or because an error was reported.
-      const bool should_report =  repetition_num > 0
-        || has_explicit_iteration_count  // An exact iteration count was requested
-        || results.has_error_
-        || iters >= kMaxIterations  // No chance to try again, we hit the limit.
-        || seconds >= min_time  // the elapsed time is large enough
-        // CPU time is specified but the elapsed real time greatly exceeds the
-        // minimum time. Note that user provided timers are except from this
-        // sanity check.
-        || ((results.real_time_used >= 5 * min_time) && !b.use_manual_time);
-      // clang-format on
-
-      if (should_report) {
-        BenchmarkReporter::Run report = CreateRunReport(b, results, seconds);
-        if (!report.error_occurred && b.complexity != oNone)
-          complexity_reports->push_back(report);
-        reports.push_back(report);
-        break;
-      }
-
-      // See how much iterations should be increased by
-      // Note: Avoid division by zero with max(seconds, 1ns).
-      double multiplier = min_time * 1.4 / std::max(seconds, 1e-9);
-      // If our last run was at least 10% of FLAGS_benchmark_min_time then we
-      // use the multiplier directly. Otherwise we use at most 10 times
-      // expansion.
-      // NOTE: When the last run was at least 10% of the min time the max
-      // expansion should be 14x.
-      bool is_significant = (seconds / min_time) > 0.1;
-      multiplier = is_significant ? multiplier : std::min(10.0, multiplier);
-      if (multiplier <= 1.0) multiplier = 2.0;
-      double next_iters = std::max(multiplier * iters, iters + 1.0);
-      if (next_iters > kMaxIterations) {
-        next_iters = kMaxIterations;
-      }
-      VLOG(3) << "Next iters: " << next_iters << ", " << multiplier << "\n";
-      iters = static_cast<int>(next_iters + 0.5);
-    }
-  }
-  // Calculate additional statistics
-  auto stat_reports = ComputeStats(reports);
-  if ((b.complexity != oNone) && b.last_benchmark_instance) {
-    auto additional_run_stats = ComputeBigO(*complexity_reports);
-    stat_reports.insert(stat_reports.end(), additional_run_stats.begin(),
-                        additional_run_stats.end());
-    complexity_reports->clear();
-  }
-
-  if (report_aggregates_only) reports.clear();
-  reports.insert(reports.end(), stat_reports.begin(), stat_reports.end());
-  return reports;
-}
-
-}  // namespace
 }  // namespace internal
 
 State::State(size_t max_iters, const std::vector<int64_t>& ranges, int thread_i,
@@ -302,8 +131,6 @@ State::State(size_t max_iters, const std::vector<int64_t>& ranges, int thread_i,
       finished_(false),
       error_occurred_(false),
       range_(ranges),
-      bytes_processed_(0),
-      items_processed_(0),
       complexity_n_(0),
       counters(),
       thread_index(thread_i),
@@ -394,25 +221,25 @@ void State::FinishKeepRunning() {
 namespace internal {
 namespace {
 
-void RunBenchmarks(const std::vector<Benchmark::Instance>& benchmarks,
-                   BenchmarkReporter* console_reporter,
+void RunBenchmarks(const std::vector<BenchmarkInstance>& benchmarks,
+                   BenchmarkReporter* display_reporter,
                    BenchmarkReporter* file_reporter) {
   // Note the file_reporter can be null.
-  CHECK(console_reporter != nullptr);
+  CHECK(display_reporter != nullptr);
 
   // Determine the width of the name field using a minimum width of 10.
-  bool has_repetitions = FLAGS_benchmark_repetitions > 1;
+  bool might_have_aggregates = FLAGS_benchmark_repetitions > 1;
   size_t name_field_width = 10;
   size_t stat_field_width = 0;
-  for (const Benchmark::Instance& benchmark : benchmarks) {
+  for (const BenchmarkInstance& benchmark : benchmarks) {
     name_field_width =
         std::max<size_t>(name_field_width, benchmark.name.size());
-    has_repetitions |= benchmark.repetitions > 1;
+    might_have_aggregates |= benchmark.repetitions > 1;
 
     for (const auto& Stat : *benchmark.statistics)
       stat_field_width = std::max<size_t>(stat_field_width, Stat.name_.size());
   }
-  if (has_repetitions) name_field_width += 1 + stat_field_width;
+  if (might_have_aggregates) name_field_width += 1 + stat_field_width;
 
   // Print header here
   BenchmarkReporter::Context context;
@@ -429,22 +256,36 @@ void RunBenchmarks(const std::vector<Benchmark::Instance>& benchmarks,
     std::flush(reporter->GetErrorStream());
   };
 
-  if (console_reporter->ReportContext(context) &&
+  if (display_reporter->ReportContext(context) &&
       (!file_reporter || file_reporter->ReportContext(context))) {
-    flushStreams(console_reporter);
+    flushStreams(display_reporter);
     flushStreams(file_reporter);
+
     for (const auto& benchmark : benchmarks) {
-      std::vector<BenchmarkReporter::Run> reports =
-          RunBenchmark(benchmark, &complexity_reports);
-      console_reporter->ReportRuns(reports);
-      if (file_reporter) file_reporter->ReportRuns(reports);
-      flushStreams(console_reporter);
+      RunResults run_results = RunBenchmark(benchmark, &complexity_reports);
+
+      auto report = [&run_results](BenchmarkReporter* reporter,
+                                   bool report_aggregates_only) {
+        assert(reporter);
+        // If there are no aggregates, do output non-aggregates.
+        report_aggregates_only &= !run_results.aggregates_only.empty();
+        if (!report_aggregates_only)
+          reporter->ReportRuns(run_results.non_aggregates);
+        if (!run_results.aggregates_only.empty())
+          reporter->ReportRuns(run_results.aggregates_only);
+      };
+
+      report(display_reporter, run_results.display_report_aggregates_only);
+      if (file_reporter)
+        report(file_reporter, run_results.file_report_aggregates_only);
+
+      flushStreams(display_reporter);
       flushStreams(file_reporter);
     }
   }
-  console_reporter->Finalize();
+  display_reporter->Finalize();
   if (file_reporter) file_reporter->Finalize();
-  flushStreams(console_reporter);
+  flushStreams(display_reporter);
   flushStreams(file_reporter);
 }
 
@@ -471,15 +312,20 @@ bool IsZero(double n) {
 
 ConsoleReporter::OutputOptions GetOutputOptions(bool force_no_color) {
   int output_opts = ConsoleReporter::OO_Defaults;
-  if ((FLAGS_benchmark_color == "auto" && IsColorTerminal()) ||
-      IsTruthyFlagValue(FLAGS_benchmark_color)) {
+  auto is_benchmark_color = [force_no_color] () -> bool {
+    if (force_no_color) {
+      return false;
+    }
+    if (FLAGS_benchmark_color == "auto") {
+      return IsColorTerminal();
+    }
+    return IsTruthyFlagValue(FLAGS_benchmark_color);
+  };
+  if (is_benchmark_color()) {
     output_opts |= ConsoleReporter::OO_Color;
   } else {
     output_opts &= ~ConsoleReporter::OO_Color;
   }
-  if (force_no_color) {
-    output_opts &= ~ConsoleReporter::OO_Color;
-  }
   if (FLAGS_benchmark_counters_tabular) {
     output_opts |= ConsoleReporter::OO_Tabular;
   } else {
@@ -494,11 +340,11 @@ size_t RunSpecifiedBenchmarks() {
   return RunSpecifiedBenchmarks(nullptr, nullptr);
 }
 
-size_t RunSpecifiedBenchmarks(BenchmarkReporter* console_reporter) {
-  return RunSpecifiedBenchmarks(console_reporter, nullptr);
+size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter) {
+  return RunSpecifiedBenchmarks(display_reporter, nullptr);
 }
 
-size_t RunSpecifiedBenchmarks(BenchmarkReporter* console_reporter,
+size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter,
                               BenchmarkReporter* file_reporter) {
   std::string spec = FLAGS_benchmark_filter;
   if (spec.empty() || spec == "all")
@@ -506,15 +352,15 @@ size_t RunSpecifiedBenchmarks(BenchmarkReporter* console_reporter,
 
   // Setup the reporters
   std::ofstream output_file;
-  std::unique_ptr<BenchmarkReporter> default_console_reporter;
+  std::unique_ptr<BenchmarkReporter> default_display_reporter;
   std::unique_ptr<BenchmarkReporter> default_file_reporter;
-  if (!console_reporter) {
-    default_console_reporter = internal::CreateReporter(
+  if (!display_reporter) {
+    default_display_reporter = internal::CreateReporter(
         FLAGS_benchmark_format, internal::GetOutputOptions());
-    console_reporter = default_console_reporter.get();
+    display_reporter = default_display_reporter.get();
   }
-  auto& Out = console_reporter->GetOutputStream();
-  auto& Err = console_reporter->GetErrorStream();
+  auto& Out = display_reporter->GetOutputStream();
+  auto& Err = display_reporter->GetErrorStream();
 
   std::string const& fname = FLAGS_benchmark_out;
   if (fname.empty() && file_reporter) {
@@ -538,7 +384,7 @@ size_t RunSpecifiedBenchmarks(BenchmarkReporter* console_reporter,
     file_reporter->SetErrorStream(&output_file);
   }
 
-  std::vector<internal::Benchmark::Instance> benchmarks;
+  std::vector<internal::BenchmarkInstance> benchmarks;
   if (!FindBenchmarksInternal(spec, &benchmarks, &Err)) return 0;
 
   if (benchmarks.empty()) {
@@ -549,12 +395,16 @@ size_t RunSpecifiedBenchmarks(BenchmarkReporter* console_reporter,
   if (FLAGS_benchmark_list_tests) {
     for (auto const& benchmark : benchmarks) Out << benchmark.name << "\n";
   } else {
-    internal::RunBenchmarks(benchmarks, console_reporter, file_reporter);
+    internal::RunBenchmarks(benchmarks, display_reporter, file_reporter);
   }
 
   return benchmarks.size();
 }
 
+void RegisterMemoryManager(MemoryManager* manager) {
+  internal::memory_manager = manager;
+}
+
 namespace internal {
 
 void PrintUsageAndExit() {
@@ -564,7 +414,8 @@ void PrintUsageAndExit() {
           "          [--benchmark_filter=<regex>]\n"
           "          [--benchmark_min_time=<min_time>]\n"
           "          [--benchmark_repetitions=<num_repetitions>]\n"
-          "          [--benchmark_report_aggregates_only={true|false}\n"
+          "          [--benchmark_report_aggregates_only={true|false}]\n"
+          "          [--benchmark_display_aggregates_only={true|false}]\n"
           "          [--benchmark_format=<console|json|csv>]\n"
           "          [--benchmark_out=<filename>]\n"
           "          [--benchmark_out_format=<json|console|csv>]\n"
@@ -588,6 +439,8 @@ void ParseCommandLineFlags(int* argc, char** argv) {
                        &FLAGS_benchmark_repetitions) ||
         ParseBoolFlag(argv[i], "benchmark_report_aggregates_only",
                       &FLAGS_benchmark_report_aggregates_only) ||
+        ParseBoolFlag(argv[i], "benchmark_display_aggregates_only",
+                      &FLAGS_benchmark_display_aggregates_only) ||
         ParseStringFlag(argv[i], "benchmark_format", &FLAGS_benchmark_format) ||
         ParseStringFlag(argv[i], "benchmark_out", &FLAGS_benchmark_out) ||
         ParseStringFlag(argv[i], "benchmark_out_format",
diff --git a/src/benchmark_api_internal.cc b/src/benchmark_api_internal.cc
new file mode 100644
index 0000000..8d31083
--- /dev/null
+++ b/src/benchmark_api_internal.cc
@@ -0,0 +1,15 @@
+#include "benchmark_api_internal.h"
+
+namespace benchmark {
+namespace internal {
+
+State BenchmarkInstance::Run(
+    size_t iters, int thread_id, internal::ThreadTimer* timer,
+    internal::ThreadManager* manager) const {
+  State st(iters, arg, thread_id, threads, timer, manager);
+  benchmark->Run(st);
+  return st;
+}
+
+}  // internal
+}  // benchmark
diff --git a/src/benchmark_api_internal.h b/src/benchmark_api_internal.h
index dd7a3ff..0524a85 100644
--- a/src/benchmark_api_internal.h
+++ b/src/benchmark_api_internal.h
@@ -2,10 +2,12 @@
 #define BENCHMARK_API_INTERNAL_H
 
 #include "benchmark/benchmark.h"
+#include "commandlineflags.h"
 
 #include <cmath>
 #include <iosfwd>
 #include <limits>
+#include <memory>
 #include <string>
 #include <vector>
 
@@ -13,10 +15,10 @@ namespace benchmark {
 namespace internal {
 
 // Information kept per benchmark we may want to run
-struct Benchmark::Instance {
+struct BenchmarkInstance {
   std::string name;
   Benchmark* benchmark;
-  ReportMode report_mode;
+  AggregationReportMode aggregation_report_mode;
   std::vector<int64_t> arg;
   TimeUnit time_unit;
   int range_multiplier;
@@ -31,10 +33,13 @@ struct Benchmark::Instance {
   double min_time;
   size_t iterations;
   int threads;  // Number of concurrent threads to us
+
+  State Run(size_t iters, int thread_id, internal::ThreadTimer* timer,
+            internal::ThreadManager* manager) const;
 };
 
 bool FindBenchmarksInternal(const std::string& re,
-                            std::vector<Benchmark::Instance>* benchmarks,
+                            std::vector<BenchmarkInstance>* benchmarks,
                             std::ostream* Err);
 
 bool IsZero(double n);
diff --git a/src/benchmark_register.cc b/src/benchmark_register.cc
index 26a8972..a85a4b4 100644
--- a/src/benchmark_register.cc
+++ b/src/benchmark_register.cc
@@ -78,7 +78,7 @@ class BenchmarkFamilies {
   // Extract the list of benchmark instances that match the specified
   // regular expression.
   bool FindBenchmarks(std::string re,
-                      std::vector<Benchmark::Instance>* benchmarks,
+                      std::vector<BenchmarkInstance>* benchmarks,
                       std::ostream* Err);
 
  private:
@@ -107,7 +107,7 @@ void BenchmarkFamilies::ClearBenchmarks() {
 }
 
 bool BenchmarkFamilies::FindBenchmarks(
-    std::string spec, std::vector<Benchmark::Instance>* benchmarks,
+    std::string spec, std::vector<BenchmarkInstance>* benchmarks,
     std::ostream* ErrStream) {
   CHECK(ErrStream);
   auto& Err = *ErrStream;
@@ -152,10 +152,10 @@ bool BenchmarkFamilies::FindBenchmarks(
 
     for (auto const& args : family->args_) {
       for (int num_threads : *thread_counts) {
-        Benchmark::Instance instance;
+        BenchmarkInstance instance;
         instance.name = family->name_;
         instance.benchmark = family.get();
-        instance.report_mode = family->report_mode_;
+        instance.aggregation_report_mode = family->aggregation_report_mode_;
         instance.arg = args;
         instance.time_unit = family->time_unit_;
         instance.range_multiplier = family->range_multiplier_;
@@ -225,7 +225,7 @@ Benchmark* RegisterBenchmarkInternal(Benchmark* bench) {
 // FIXME: This function is a hack so that benchmark.cc can access
 // `BenchmarkFamilies`
 bool FindBenchmarksInternal(const std::string& re,
-                            std::vector<Benchmark::Instance>* benchmarks,
+                            std::vector<BenchmarkInstance>* benchmarks,
                             std::ostream* Err) {
   return BenchmarkFamilies::GetInstance()->FindBenchmarks(re, benchmarks, Err);
 }
@@ -236,7 +236,7 @@ bool FindBenchmarksInternal(const std::string& re,
 
 Benchmark::Benchmark(const char* name)
     : name_(name),
-      report_mode_(RM_Unspecified),
+      aggregation_report_mode_(ARM_Unspecified),
       time_unit_(kNanosecond),
       range_multiplier_(kRangeMultiplier),
       min_time_(0),
@@ -369,7 +369,23 @@ Benchmark* Benchmark::Repetitions(int n) {
 }
 
 Benchmark* Benchmark::ReportAggregatesOnly(bool value) {
-  report_mode_ = value ? RM_ReportAggregatesOnly : RM_Default;
+  aggregation_report_mode_ = value ? ARM_ReportAggregatesOnly : ARM_Default;
+  return this;
+}
+
+Benchmark* Benchmark::DisplayAggregatesOnly(bool value) {
+  // If we were called, the report mode is no longer 'unspecified', in any case.
+  aggregation_report_mode_ = static_cast<AggregationReportMode>(
+      aggregation_report_mode_ | ARM_Default);
+
+  if (value) {
+    aggregation_report_mode_ = static_cast<AggregationReportMode>(
+        aggregation_report_mode_ | ARM_DisplayReportAggregatesOnly);
+  } else {
+    aggregation_report_mode_ = static_cast<AggregationReportMode>(
+        aggregation_report_mode_ & ~ARM_DisplayReportAggregatesOnly);
+  }
+
   return this;
 }
 
diff --git a/src/benchmark_runner.cc b/src/benchmark_runner.cc
new file mode 100644
index 0000000..38faeec
--- /dev/null
+++ b/src/benchmark_runner.cc
@@ -0,0 +1,350 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "benchmark_runner.h"
+#include "benchmark/benchmark.h"
+#include "benchmark_api_internal.h"
+#include "internal_macros.h"
+
+#ifndef BENCHMARK_OS_WINDOWS
+#ifndef BENCHMARK_OS_FUCHSIA
+#include <sys/resource.h>
+#endif
+#include <sys/time.h>
+#include <unistd.h>
+#endif
+
+#include <algorithm>
+#include <atomic>
+#include <condition_variable>
+#include <cstdio>
+#include <cstdlib>
+#include <fstream>
+#include <iostream>
+#include <memory>
+#include <string>
+#include <thread>
+#include <utility>
+
+#include "check.h"
+#include "colorprint.h"
+#include "commandlineflags.h"
+#include "complexity.h"
+#include "counter.h"
+#include "internal_macros.h"
+#include "log.h"
+#include "mutex.h"
+#include "re.h"
+#include "statistics.h"
+#include "string_util.h"
+#include "thread_manager.h"
+#include "thread_timer.h"
+
+namespace benchmark {
+
+namespace internal {
+
+MemoryManager* memory_manager = nullptr;
+
+namespace {
+
+static const size_t kMaxIterations = 1000000000;
+
+BenchmarkReporter::Run CreateRunReport(
+    const benchmark::internal::BenchmarkInstance& b,
+    const internal::ThreadManager::Result& results, size_t memory_iterations,
+    const MemoryManager::Result& memory_result, double seconds) {
+  // Create report about this benchmark run.
+  BenchmarkReporter::Run report;
+
+  report.run_name = b.name;
+  report.error_occurred = results.has_error_;
+  report.error_message = results.error_message_;
+  report.report_label = results.report_label_;
+  // This is the total iterations across all threads.
+  report.iterations = results.iterations;
+  report.time_unit = b.time_unit;
+
+  if (!report.error_occurred) {
+    if (b.use_manual_time) {
+      report.real_accumulated_time = results.manual_time_used;
+    } else {
+      report.real_accumulated_time = results.real_time_used;
+    }
+    report.cpu_accumulated_time = results.cpu_time_used;
+    report.complexity_n = results.complexity_n;
+    report.complexity = b.complexity;
+    report.complexity_lambda = b.complexity_lambda;
+    report.statistics = b.statistics;
+    report.counters = results.counters;
+
+    if (memory_iterations > 0) {
+      report.has_memory_result = true;
+      report.allocs_per_iter =
+          memory_iterations ? static_cast<double>(memory_result.num_allocs) /
+                                  memory_iterations
+                            : 0;
+      report.max_bytes_used = memory_result.max_bytes_used;
+    }
+
+    internal::Finish(&report.counters, results.iterations, seconds, b.threads);
+  }
+  return report;
+}
+
+// Execute one thread of benchmark b for the specified number of iterations.
+// Adds the stats collected for the thread into *total.
+void RunInThread(const BenchmarkInstance* b, size_t iters, int thread_id,
+                 ThreadManager* manager) {
+  internal::ThreadTimer timer;
+  State st = b->Run(iters, thread_id, &timer, manager);
+  CHECK(st.iterations() >= st.max_iterations)
+      << "Benchmark returned before State::KeepRunning() returned false!";
+  {
+    MutexLock l(manager->GetBenchmarkMutex());
+    internal::ThreadManager::Result& results = manager->results;
+    results.iterations += st.iterations();
+    results.cpu_time_used += timer.cpu_time_used();
+    results.real_time_used += timer.real_time_used();
+    results.manual_time_used += timer.manual_time_used();
+    results.complexity_n += st.complexity_length_n();
+    internal::Increment(&results.counters, st.counters);
+  }
+  manager->NotifyThreadComplete();
+}
+
+class BenchmarkRunner {
+ public:
+  BenchmarkRunner(const benchmark::internal::BenchmarkInstance& b_,
+                  std::vector<BenchmarkReporter::Run>* complexity_reports_)
+      : b(b_),
+        complexity_reports(*complexity_reports_),
+        min_time(!IsZero(b.min_time) ? b.min_time : FLAGS_benchmark_min_time),
+        repeats(b.repetitions != 0 ? b.repetitions
+                                   : FLAGS_benchmark_repetitions),
+        has_explicit_iteration_count(b.iterations != 0),
+        pool(b.threads - 1),
+        iters(has_explicit_iteration_count ? b.iterations : 1) {
+    run_results.display_report_aggregates_only =
+        (FLAGS_benchmark_report_aggregates_only ||
+         FLAGS_benchmark_display_aggregates_only);
+    run_results.file_report_aggregates_only =
+        FLAGS_benchmark_report_aggregates_only;
+    if (b.aggregation_report_mode != internal::ARM_Unspecified) {
+      run_results.display_report_aggregates_only =
+          (b.aggregation_report_mode &
+           internal::ARM_DisplayReportAggregatesOnly);
+      run_results.file_report_aggregates_only =
+          (b.aggregation_report_mode & internal::ARM_FileReportAggregatesOnly);
+    }
+
+    for (int repetition_num = 0; repetition_num < repeats; repetition_num++) {
+      const bool is_the_first_repetition = repetition_num == 0;
+      DoOneRepetition(is_the_first_repetition);
+    }
+
+    // Calculate additional statistics
+    run_results.aggregates_only = ComputeStats(run_results.non_aggregates);
+
+    // Maybe calculate complexity report
+    if ((b.complexity != oNone) && b.last_benchmark_instance) {
+      auto additional_run_stats = ComputeBigO(complexity_reports);
+      run_results.aggregates_only.insert(run_results.aggregates_only.end(),
+                                         additional_run_stats.begin(),
+                                         additional_run_stats.end());
+      complexity_reports.clear();
+    }
+  }
+
+  RunResults&& get_results() { return std::move(run_results); }
+
+ private:
+  RunResults run_results;
+
+  const benchmark::internal::BenchmarkInstance& b;
+  std::vector<BenchmarkReporter::Run>& complexity_reports;
+
+  const double min_time;
+  const int repeats;
+  const bool has_explicit_iteration_count;
+
+  std::vector<std::thread> pool;
+
+  size_t iters;  // preserved between repetitions!
+  // So only the first repetition has to find/calculate it,
+  // the other repetitions will just use that precomputed iteration count.
+
+  struct IterationResults {
+    internal::ThreadManager::Result results;
+    size_t iters;
+    double seconds;
+  };
+  IterationResults DoNIterations() {
+    VLOG(2) << "Running " << b.name << " for " << iters << "\n";
+
+    std::unique_ptr<internal::ThreadManager> manager;
+    manager.reset(new internal::ThreadManager(b.threads));
+
+    // Run all but one thread in separate threads
+    for (std::size_t ti = 0; ti < pool.size(); ++ti) {
+      pool[ti] = std::thread(&RunInThread, &b, iters, static_cast<int>(ti + 1),
+                             manager.get());
+    }
+    // And run one thread here directly.
+    // (If we were asked to run just one thread, we don't create new threads.)
+    // Yes, we need to do this here *after* we start the separate threads.
+    RunInThread(&b, iters, 0, manager.get());
+
+    // The main thread has finished. Now let's wait for the other threads.
+    manager->WaitForAllThreads();
+    for (std::thread& thread : pool) thread.join();
+
+    IterationResults i;
+    // Acquire the measurements/counters from the manager, UNDER THE LOCK!
+    {
+      MutexLock l(manager->GetBenchmarkMutex());
+      i.results = manager->results;
+    }
+
+    // And get rid of the manager.
+    manager.reset();
+
+    // Adjust real/manual time stats since they were reported per thread.
+    i.results.real_time_used /= b.threads;
+    i.results.manual_time_used /= b.threads;
+
+    VLOG(2) << "Ran in " << i.results.cpu_time_used << "/"
+            << i.results.real_time_used << "\n";
+
+    // So for how long were we running?
+    i.iters = iters;
+    // Base decisions off of real time if requested by this benchmark.
+    i.seconds = i.results.cpu_time_used;
+    if (b.use_manual_time) {
+      i.seconds = i.results.manual_time_used;
+    } else if (b.use_real_time) {
+      i.seconds = i.results.real_time_used;
+    }
+
+    return i;
+  }
+
+  size_t PredictNumItersNeeded(const IterationResults& i) const {
+    // See how much iterations should be increased by.
+    // Note: Avoid division by zero with max(seconds, 1ns).
+    double multiplier = min_time * 1.4 / std::max(i.seconds, 1e-9);
+    // If our last run was at least 10% of FLAGS_benchmark_min_time then we
+    // use the multiplier directly.
+    // Otherwise we use at most 10 times expansion.
+    // NOTE: When the last run was at least 10% of the min time the max
+    // expansion should be 14x.
+    bool is_significant = (i.seconds / min_time) > 0.1;
+    multiplier = is_significant ? multiplier : std::min(10.0, multiplier);
+    if (multiplier <= 1.0) multiplier = 2.0;
+
+    // So what seems to be the sufficiently-large iteration count? Round up.
+    const size_t max_next_iters =
+        0.5 + std::max(multiplier * i.iters, i.iters + 1.0);
+    // But we do have *some* sanity limits though..
+    const size_t next_iters = std::min(max_next_iters, kMaxIterations);
+
+    VLOG(3) << "Next iters: " << next_iters << ", " << multiplier << "\n";
+    return next_iters;  // round up before conversion to integer.
+  }
+
+  bool ShouldReportIterationResults(const IterationResults& i) const {
+    // Determine if this run should be reported;
+    // Either it has run for a sufficient amount of time
+    // or because an error was reported.
+    return i.results.has_error_ ||
+           i.iters >= kMaxIterations ||  // Too many iterations already.
+           i.seconds >= min_time ||      // The elapsed time is large enough.
+           // CPU time is specified but the elapsed real time greatly exceeds
+           // the minimum time.
+           // Note that user provided timers are except from this sanity check.
+           ((i.results.real_time_used >= 5 * min_time) && !b.use_manual_time);
+  }
+
+  void DoOneRepetition(bool is_the_first_repetition) {
+    IterationResults i;
+
+    // We *may* be gradually increasing the length (iteration count)
+    // of the benchmark until we decide the results are significant.
+    // And once we do, we report those last results and exit.
+    // Please do note that the if there are repetitions, the iteration count
+    // is *only* calculated for the *first* repetition, and other repetitions
+    // simply use that precomputed iteration count.
+    for (;;) {
+      i = DoNIterations();
+
+      // Do we consider the results to be significant?
+      // If we are doing repetitions, and the first repetition was already done,
+      // it has calculated the correct iteration time, so we have run that very
+      // iteration count just now. No need to calculate anything. Just report.
+      // Else, the normal rules apply.
+      const bool results_are_significant = !is_the_first_repetition ||
+                                           has_explicit_iteration_count ||
+                                           ShouldReportIterationResults(i);
+
+      if (results_are_significant) break;  // Good, let's report them!
+
+      // Nope, bad iteration. Let's re-estimate the hopefully-sufficient
+      // iteration count, and run the benchmark again...
+
+      iters = PredictNumItersNeeded(i);
+      assert(iters > i.iters &&
+             "if we did more iterations than we want to do the next time, "
+             "then we should have accepted the current iteration run.");
+    }
+
+    // Oh, one last thing, we need to also produce the 'memory measurements'..
+    MemoryManager::Result memory_result;
+    size_t memory_iterations = 0;
+    if (memory_manager != nullptr) {
+      // Only run a few iterations to reduce the impact of one-time
+      // allocations in benchmarks that are not properly managed.
+      memory_iterations = std::min<size_t>(16, iters);
+      memory_manager->Start();
+      std::unique_ptr<internal::ThreadManager> manager;
+      manager.reset(new internal::ThreadManager(1));
+      RunInThread(&b, memory_iterations, 0, manager.get());
+      manager->WaitForAllThreads();
+      manager.reset();
+
+      memory_manager->Stop(&memory_result);
+    }
+
+    // Ok, now actualy report.
+    BenchmarkReporter::Run report = CreateRunReport(
+        b, i.results, memory_iterations, memory_result, i.seconds);
+
+    if (!report.error_occurred && b.complexity != oNone)
+      complexity_reports.push_back(report);
+
+    run_results.non_aggregates.push_back(report);
+  }
+};
+
+}  // end namespace
+
+RunResults RunBenchmark(
+    const benchmark::internal::BenchmarkInstance& b,
+    std::vector<BenchmarkReporter::Run>* complexity_reports) {
+  internal::BenchmarkRunner r(b, complexity_reports);
+  return r.get_results();
+}
+
+}  // end namespace internal
+
+}  // end namespace benchmark
diff --git a/src/benchmark_runner.h b/src/benchmark_runner.h
new file mode 100644
index 0000000..96e8282
--- /dev/null
+++ b/src/benchmark_runner.h
@@ -0,0 +1,51 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef BENCHMARK_RUNNER_H_
+#define BENCHMARK_RUNNER_H_
+
+#include "benchmark_api_internal.h"
+#include "internal_macros.h"
+
+DECLARE_double(benchmark_min_time);
+
+DECLARE_int32(benchmark_repetitions);
+
+DECLARE_bool(benchmark_report_aggregates_only);
+
+DECLARE_bool(benchmark_display_aggregates_only);
+
+namespace benchmark {
+
+namespace internal {
+
+extern MemoryManager* memory_manager;
+
+struct RunResults {
+  std::vector<BenchmarkReporter::Run> non_aggregates;
+  std::vector<BenchmarkReporter::Run> aggregates_only;
+
+  bool display_report_aggregates_only = false;
+  bool file_report_aggregates_only = false;
+};
+
+RunResults RunBenchmark(
+    const benchmark::internal::BenchmarkInstance& b,
+    std::vector<BenchmarkReporter::Run>* complexity_reports);
+
+}  // namespace internal
+
+}  // end namespace benchmark
+
+#endif  // BENCHMARK_RUNNER_H_
diff --git a/src/colorprint.cc b/src/colorprint.cc
index 2dec4a8..fff6a98 100644
--- a/src/colorprint.cc
+++ b/src/colorprint.cc
@@ -25,7 +25,7 @@
 #include "internal_macros.h"
 
 #ifdef BENCHMARK_OS_WINDOWS
-#include <Windows.h>
+#include <windows.h>
 #include <io.h>
 #else
 #include <unistd.h>
diff --git a/src/complexity.cc b/src/complexity.cc
index 02fe3fe..0be73e0 100644
--- a/src/complexity.cc
+++ b/src/complexity.cc
@@ -36,10 +36,10 @@ BigOFunc* FittingCurve(BigO complexity) {
       return [](int64_t n) -> double { return std::pow(n, 3); };
     case oLogN:
       /* Note: can't use log2 because Android's GNU STL lacks it */
-      return [](int64_t n) { return kLog2E * log(n); };
+      return [](int64_t n) { return kLog2E * log(static_cast<double>(n)); };
     case oNLogN:
       /* Note: can't use log2 because Android's GNU STL lacks it */
-      return [](int64_t n) { return kLog2E * n * log(n); };
+      return [](int64_t n) { return kLog2E * n * log(static_cast<double>(n)); };
     case o1:
     default:
       return [](int64_t) { return 1.0; };
@@ -182,12 +182,15 @@ std::vector<BenchmarkReporter::Run> ComputeBigO(
     result_cpu = MinimalLeastSq(n, cpu_time, reports[0].complexity);
     result_real = MinimalLeastSq(n, real_time, result_cpu.complexity);
   }
-  std::string benchmark_name =
-      reports[0].benchmark_name.substr(0, reports[0].benchmark_name.find('/'));
+
+  std::string run_name = reports[0].benchmark_name().substr(
+      0, reports[0].benchmark_name().find('/'));
 
   // Get the data from the accumulator to BenchmarkReporter::Run's.
   Run big_o;
-  big_o.benchmark_name = benchmark_name + "_BigO";
+  big_o.run_name = run_name;
+  big_o.run_type = BenchmarkReporter::Run::RT_Aggregate;
+  big_o.aggregate_name = "BigO";
   big_o.iterations = 0;
   big_o.real_accumulated_time = result_real.coef;
   big_o.cpu_accumulated_time = result_cpu.coef;
@@ -203,8 +206,10 @@ std::vector<BenchmarkReporter::Run> ComputeBigO(
 
   // Only add label to mean/stddev if it is same for all runs
   Run rms;
+  rms.run_name = run_name;
   big_o.report_label = reports[0].report_label;
-  rms.benchmark_name = benchmark_name + "_RMS";
+  rms.run_type = BenchmarkReporter::Run::RT_Aggregate;
+  rms.aggregate_name = "RMS";
   rms.report_label = big_o.report_label;
   rms.iterations = 0;
   rms.real_accumulated_time = result_real.rms / multiplier;
diff --git a/src/console_reporter.cc b/src/console_reporter.cc
index 48920ca..7de7138 100644
--- a/src/console_reporter.cc
+++ b/src/console_reporter.cc
@@ -106,7 +106,7 @@ void ConsoleReporter::PrintRunData(const Run& result) {
   auto name_color =
       (result.report_big_o || result.report_rms) ? COLOR_BLUE : COLOR_GREEN;
   printer(Out, name_color, "%-*s ", name_field_width_,
-          result.benchmark_name.c_str());
+          result.benchmark_name().c_str());
 
   if (result.error_occurred) {
     printer(Out, COLOR_RED, "ERROR OCCURRED: \'%s\'",
@@ -114,18 +114,6 @@ void ConsoleReporter::PrintRunData(const Run& result) {
     printer(Out, COLOR_DEFAULT, "\n");
     return;
   }
-  // Format bytes per second
-  std::string rate;
-  if (result.bytes_per_second > 0) {
-    rate = StrCat(" ", HumanReadableNumber(result.bytes_per_second), "B/s");
-  }
-
-  // Format items per second
-  std::string items;
-  if (result.items_per_second > 0) {
-    items =
-        StrCat(" ", HumanReadableNumber(result.items_per_second), " items/s");
-  }
 
   const double real_time = result.GetAdjustedRealTime();
   const double cpu_time = result.GetAdjustedCPUTime();
@@ -150,7 +138,7 @@ void ConsoleReporter::PrintRunData(const Run& result) {
   for (auto& c : result.counters) {
     const std::size_t cNameLen = std::max(std::string::size_type(10),
                                           c.first.length());
-    auto const& s = HumanReadableNumber(c.second.value, 1000);
+    auto const& s = HumanReadableNumber(c.second.value, c.second.oneK);
     if (output_options_ & OO_Tabular) {
       if (c.second.flags & Counter::kIsRate) {
         printer(Out, COLOR_DEFAULT, " %*s/s", cNameLen - 2, s.c_str());
@@ -164,14 +152,6 @@ void ConsoleReporter::PrintRunData(const Run& result) {
     }
   }
 
-  if (!rate.empty()) {
-    printer(Out, COLOR_DEFAULT, " %*s", 13, rate.c_str());
-  }
-
-  if (!items.empty()) {
-    printer(Out, COLOR_DEFAULT, " %*s", 18, items.c_str());
-  }
-
   if (!result.report_label.empty()) {
     printer(Out, COLOR_DEFAULT, " %s", result.report_label.c_str());
   }
diff --git a/src/csv_reporter.cc b/src/csv_reporter.cc
index 4a64190..d2f1d27 100644
--- a/src/csv_reporter.cc
+++ b/src/csv_reporter.cc
@@ -49,6 +49,8 @@ void CSVReporter::ReportRuns(const std::vector<Run>& reports) {
     // save the names of all the user counters
     for (const auto& run : reports) {
       for (const auto& cnt : run.counters) {
+        if (cnt.first == "bytes_per_second" || cnt.first == "items_per_second")
+          continue;
         user_counter_names_.insert(cnt.first);
       }
     }
@@ -69,6 +71,8 @@ void CSVReporter::ReportRuns(const std::vector<Run>& reports) {
     // check that all the current counters are saved in the name set
     for (const auto& run : reports) {
       for (const auto& cnt : run.counters) {
+        if (cnt.first == "bytes_per_second" || cnt.first == "items_per_second")
+          continue;
         CHECK(user_counter_names_.find(cnt.first) != user_counter_names_.end())
             << "All counters must be present in each run. "
             << "Counter named \"" << cnt.first
@@ -88,7 +92,7 @@ void CSVReporter::PrintRunData(const Run& run) {
 
   // Field with embedded double-quote characters must be doubled and the field
   // delimited with double-quotes.
-  std::string name = run.benchmark_name;
+  std::string name = run.benchmark_name();
   ReplaceAll(&name, "\"", "\"\"");
   Out << '"' << name << "\",";
   if (run.error_occurred) {
@@ -117,12 +121,12 @@ void CSVReporter::PrintRunData(const Run& run) {
   }
   Out << ",";
 
-  if (run.bytes_per_second > 0.0) {
-    Out << run.bytes_per_second;
+  if (run.counters.find("bytes_per_second") != run.counters.end()) {
+    Out << run.counters.at("bytes_per_second");
   }
   Out << ",";
-  if (run.items_per_second > 0.0) {
-    Out << run.items_per_second;
+  if (run.counters.find("items_per_second") != run.counters.end()) {
+    Out << run.counters.at("items_per_second");
   }
   Out << ",";
   if (!run.report_label.empty()) {
diff --git a/src/cycleclock.h b/src/cycleclock.h
index 00d5764..f5e37b0 100644
--- a/src/cycleclock.h
+++ b/src/cycleclock.h
@@ -41,7 +41,7 @@ extern "C" uint64_t __rdtsc();
 #pragma intrinsic(__rdtsc)
 #endif
 
-#ifndef BENCHMARK_OS_WINDOWS
+#if !defined(BENCHMARK_OS_WINDOWS) || defined(BENCHMARK_OS_MINGW)
 #include <sys/time.h>
 #include <time.h>
 #endif
diff --git a/src/internal_macros.h b/src/internal_macros.h
index b7e9203..5dbf4fd 100644
--- a/src/internal_macros.h
+++ b/src/internal_macros.h
@@ -11,9 +11,6 @@
 #ifndef __has_feature
 #define __has_feature(x) 0
 #endif
-#ifndef __has_builtin
-#define __has_builtin(x) 0
-#endif
 
 #if defined(__clang__)
   #if !defined(COMPILER_CLANG)
@@ -43,6 +40,9 @@
   #define BENCHMARK_OS_CYGWIN 1
 #elif defined(_WIN32)
   #define BENCHMARK_OS_WINDOWS 1
+  #if defined(__MINGW32__)
+    #define BENCHMARK_OS_MINGW 1
+  #endif
 #elif defined(__APPLE__)
   #define BENCHMARK_OS_APPLE 1
   #include "TargetConditionals.h"
@@ -87,14 +87,6 @@
   #define BENCHMARK_MAYBE_UNUSED
 #endif
 
-#if defined(COMPILER_GCC) || __has_builtin(__builtin_unreachable)
-  #define BENCHMARK_UNREACHABLE() __builtin_unreachable()
-#elif defined(COMPILER_MSVC)
-  #define BENCHMARK_UNREACHABLE() __assume(false)
-#else
-  #define BENCHMARK_UNREACHABLE() ((void)0)
-#endif
-
 // clang-format on
 
 #endif  // BENCHMARK_INTERNAL_MACROS_H_
diff --git a/src/json_reporter.cc b/src/json_reporter.cc
index 611605a..6866c71 100644
--- a/src/json_reporter.cc
+++ b/src/json_reporter.cc
@@ -78,7 +78,12 @@ bool JSONReporter::ReportContext(const Context& context) {
   out << indent << FormatKV("date", walltime_value) << ",\n";
 
   if (Context::executable_name) {
-    out << indent << FormatKV("executable", Context::executable_name) << ",\n";
+    // windows uses backslash for its path separator,
+    // which must be escaped in JSON otherwise it blows up conforming JSON
+    // decoders
+    std::string executable_name = Context::executable_name;
+    ReplaceAll(&executable_name, "\\", "\\\\");
+    out << indent << FormatKV("executable", executable_name) << ",\n";
   }
 
   CPUInfo const& info = context.cpu_info;
@@ -154,7 +159,20 @@ void JSONReporter::Finalize() {
 void JSONReporter::PrintRunData(Run const& run) {
   std::string indent(6, ' ');
   std::ostream& out = GetOutputStream();
-  out << indent << FormatKV("name", run.benchmark_name) << ",\n";
+  out << indent << FormatKV("name", run.benchmark_name()) << ",\n";
+  out << indent << FormatKV("run_name", run.run_name) << ",\n";
+  out << indent << FormatKV("run_type", [&run]() -> const char* {
+    switch (run.run_type) {
+      case BenchmarkReporter::Run::RT_Iteration:
+        return "iteration";
+      case BenchmarkReporter::Run::RT_Aggregate:
+        return "aggregate";
+    }
+    BENCHMARK_UNREACHABLE();
+  }()) << ",\n";
+  if (run.run_type == BenchmarkReporter::Run::RT_Aggregate) {
+    out << indent << FormatKV("aggregate_name", run.aggregate_name) << ",\n";
+  }
   if (run.error_occurred) {
     out << indent << FormatKV("error_occurred", run.error_occurred) << ",\n";
     out << indent << FormatKV("error_message", run.error_message) << ",\n";
@@ -175,17 +193,16 @@ void JSONReporter::PrintRunData(Run const& run) {
   } else if (run.report_rms) {
     out << indent << FormatKV("rms", run.GetAdjustedCPUTime());
   }
-  if (run.bytes_per_second > 0.0) {
-    out << ",\n"
-        << indent << FormatKV("bytes_per_second", run.bytes_per_second);
-  }
-  if (run.items_per_second > 0.0) {
-    out << ",\n"
-        << indent << FormatKV("items_per_second", run.items_per_second);
-  }
+
   for (auto& c : run.counters) {
     out << ",\n" << indent << FormatKV(c.first, c.second);
   }
+
+  if (run.has_memory_result) {
+    out << ",\n" << indent << FormatKV("allocs_per_iter", run.allocs_per_iter);
+    out << ",\n" << indent << FormatKV("max_bytes_used", run.max_bytes_used);
+  }
+
   if (!run.report_label.empty()) {
     out << ",\n" << indent << FormatKV("label", run.report_label);
   }
diff --git a/src/reporter.cc b/src/reporter.cc
index 541661a..fe86617 100644
--- a/src/reporter.cc
+++ b/src/reporter.cc
@@ -72,6 +72,14 @@ const char *BenchmarkReporter::Context::executable_name;
 
 BenchmarkReporter::Context::Context() : cpu_info(CPUInfo::Get()) {}
 
+std::string BenchmarkReporter::Run::benchmark_name() const {
+  std::string name = run_name;
+  if (run_type == RT_Aggregate) {
+    name += "_" + aggregate_name;
+  }
+  return name;
+}
+
 double BenchmarkReporter::Run::GetAdjustedRealTime() const {
   double new_time = real_accumulated_time * GetTimeUnitMultiplier(time_unit);
   if (iterations != 0) new_time /= static_cast<double>(iterations);
diff --git a/src/sleep.cc b/src/sleep.cc
index 54aa04a..1512ac9 100644
--- a/src/sleep.cc
+++ b/src/sleep.cc
@@ -21,7 +21,7 @@
 #include "internal_macros.h"
 
 #ifdef BENCHMARK_OS_WINDOWS
-#include <Windows.h>
+#include <windows.h>
 #endif
 
 namespace benchmark {
diff --git a/src/statistics.cc b/src/statistics.cc
index 612dda2..e821aec 100644
--- a/src/statistics.cc
+++ b/src/statistics.cc
@@ -91,13 +91,9 @@ std::vector<BenchmarkReporter::Run> ComputeStats(
   // Accumulators.
   std::vector<double> real_accumulated_time_stat;
   std::vector<double> cpu_accumulated_time_stat;
-  std::vector<double> bytes_per_second_stat;
-  std::vector<double> items_per_second_stat;
 
   real_accumulated_time_stat.reserve(reports.size());
   cpu_accumulated_time_stat.reserve(reports.size());
-  bytes_per_second_stat.reserve(reports.size());
-  items_per_second_stat.reserve(reports.size());
 
   // All repetitions should be run with the same number of iterations so we
   // can take this information from the first benchmark.
@@ -123,13 +119,11 @@ std::vector<BenchmarkReporter::Run> ComputeStats(
 
   // Populate the accumulators.
   for (Run const& run : reports) {
-    CHECK_EQ(reports[0].benchmark_name, run.benchmark_name);
+    CHECK_EQ(reports[0].benchmark_name(), run.benchmark_name());
     CHECK_EQ(run_iterations, run.iterations);
     if (run.error_occurred) continue;
     real_accumulated_time_stat.emplace_back(run.real_accumulated_time);
     cpu_accumulated_time_stat.emplace_back(run.cpu_accumulated_time);
-    items_per_second_stat.emplace_back(run.items_per_second);
-    bytes_per_second_stat.emplace_back(run.bytes_per_second);
     // user counters
     for (auto const& cnt : run.counters) {
       auto it = counter_stats.find(cnt.first);
@@ -147,24 +141,43 @@ std::vector<BenchmarkReporter::Run> ComputeStats(
     }
   }
 
+  const double iteration_rescale_factor =
+      double(reports.size()) / double(run_iterations);
+
   for (const auto& Stat : *reports[0].statistics) {
     // Get the data from the accumulator to BenchmarkReporter::Run's.
     Run data;
-    data.benchmark_name = reports[0].benchmark_name + "_" + Stat.name_;
+    data.run_name = reports[0].benchmark_name();
+    data.run_type = BenchmarkReporter::Run::RT_Aggregate;
+    data.aggregate_name = Stat.name_;
     data.report_label = report_label;
-    data.iterations = run_iterations;
+
+    // It is incorrect to say that an aggregate is computed over
+    // run's iterations, because those iterations already got averaged.
+    // Similarly, if there are N repetitions with 1 iterations each,
+    // an aggregate will be computed over N measurements, not 1.
+    // Thus it is best to simply use the count of separate reports.
+    data.iterations = reports.size();
 
     data.real_accumulated_time = Stat.compute_(real_accumulated_time_stat);
     data.cpu_accumulated_time = Stat.compute_(cpu_accumulated_time_stat);
-    data.bytes_per_second = Stat.compute_(bytes_per_second_stat);
-    data.items_per_second = Stat.compute_(items_per_second_stat);
+
+    // We will divide these times by data.iterations when reporting, but the
+    // data.iterations is not nessesairly the scale of these measurements,
+    // because in each repetition, these timers are sum over all the iterations.
+    // And if we want to say that the stats are over N repetitions and not
+    // M iterations, we need to multiply these by (N/M).
+    data.real_accumulated_time *= iteration_rescale_factor;
+    data.cpu_accumulated_time *= iteration_rescale_factor;
 
     data.time_unit = reports[0].time_unit;
 
     // user counters
     for (auto const& kv : counter_stats) {
+      // Do *NOT* rescale the custom counters. They are already properly scaled.
       const auto uc_stat = Stat.compute_(kv.second.s);
-      auto c = Counter(uc_stat, counter_stats[kv.first].c.flags);
+      auto c = Counter(uc_stat, counter_stats[kv.first].c.flags,
+                       counter_stats[kv.first].c.oneK);
       data.counters[kv.first] = c;
     }
 
diff --git a/src/sysinfo.cc b/src/sysinfo.cc
index 73064b9..d740047 100644
--- a/src/sysinfo.cc
+++ b/src/sysinfo.cc
@@ -15,10 +15,10 @@
 #include "internal_macros.h"
 
 #ifdef BENCHMARK_OS_WINDOWS
-#include <Shlwapi.h>
+#include <shlwapi.h>
 #undef StrCat  // Don't let StrCat in string_util.h be renamed to lstrcatA
-#include <VersionHelpers.h>
-#include <Windows.h>
+#include <versionhelpers.h>
+#include <windows.h>
 #else
 #include <fcntl.h>
 #ifndef BENCHMARK_OS_FUCHSIA
@@ -288,7 +288,7 @@ std::vector<CPUInfo::CacheInfo> GetCacheSizesMacOSX() {
     std::string name;
     std::string type;
     int level;
-    size_t num_sharing;
+    uint64_t num_sharing;
   } Cases[] = {{"hw.l1dcachesize", "Data", 1, CacheCounts[1]},
                {"hw.l1icachesize", "Instruction", 1, CacheCounts[1]},
                {"hw.l2cachesize", "Unified", 2, CacheCounts[2]},
@@ -404,7 +404,13 @@ int GetNumCPUs() {
     if (ln.empty()) continue;
     size_t SplitIdx = ln.find(':');
     std::string value;
+#if defined(__s390__)
+    // s390 has another format in /proc/cpuinfo
+    // it needs to be parsed differently
+    if (SplitIdx != std::string::npos) value = ln.substr(Key.size()+1,SplitIdx-Key.size()-1);
+#else
     if (SplitIdx != std::string::npos) value = ln.substr(SplitIdx + 1);
+#endif
     if (ln.size() >= Key.size() && ln.compare(0, Key.size(), Key) == 0) {
       NumCPUs++;
       if (!value.empty()) {
diff --git a/src/thread_manager.h b/src/thread_manager.h
index 82b4d72..6e274c7 100644
--- a/src/thread_manager.h
+++ b/src/thread_manager.h
@@ -42,8 +42,6 @@ class ThreadManager {
     double real_time_used = 0;
     double cpu_time_used = 0;
     double manual_time_used = 0;
-    int64_t bytes_processed = 0;
-    int64_t items_processed = 0;
     int64_t complexity_n = 0;
     std::string report_label_;
     std::string error_message_;
diff --git a/src/timers.cc b/src/timers.cc
index 2010e24..7613ff9 100644
--- a/src/timers.cc
+++ b/src/timers.cc
@@ -16,10 +16,10 @@
 #include "internal_macros.h"
 
 #ifdef BENCHMARK_OS_WINDOWS
-#include <Shlwapi.h>
+#include <shlwapi.h>
 #undef StrCat  // Don't let StrCat in string_util.h be renamed to lstrcatA
-#include <VersionHelpers.h>
-#include <Windows.h>
+#include <versionhelpers.h>
+#include <windows.h>
 #else
 #include <fcntl.h>
 #ifndef BENCHMARK_OS_FUCHSIA
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index f49ca51..f15ce20 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -125,9 +125,21 @@ add_test(templated_fixture_test templated_fixture_test --benchmark_min_time=0.01
 compile_output_test(user_counters_test)
 add_test(user_counters_test user_counters_test --benchmark_min_time=0.01)
 
+compile_output_test(report_aggregates_only_test)
+add_test(report_aggregates_only_test report_aggregates_only_test --benchmark_min_time=0.01)
+
+compile_output_test(display_aggregates_only_test)
+add_test(display_aggregates_only_test display_aggregates_only_test --benchmark_min_time=0.01)
+
 compile_output_test(user_counters_tabular_test)
 add_test(user_counters_tabular_test user_counters_tabular_test --benchmark_counters_tabular=true --benchmark_min_time=0.01)
 
+compile_output_test(user_counters_thousands_test)
+add_test(user_counters_thousands_test user_counters_thousands_test --benchmark_min_time=0.01)
+
+compile_output_test(memory_manager_test)
+add_test(memory_manager_test memory_manager_test --benchmark_min_time=0.01)
+
 check_cxx_compiler_flag(-std=c++03 BENCHMARK_HAS_CXX03_FLAG)
 if (BENCHMARK_HAS_CXX03_FLAG)
   compile_benchmark_test(cxx03_test)
diff --git a/test/complexity_test.cc b/test/complexity_test.cc
index c732e6e..323ddfe 100644
--- a/test/complexity_test.cc
+++ b/test/complexity_test.cc
@@ -12,9 +12,10 @@ namespace {
 #define ADD_COMPLEXITY_CASES(...) \
   int CONCAT(dummy, __LINE__) = AddComplexityTest(__VA_ARGS__)
 
-int AddComplexityTest(std::string big_o_test_name, std::string rms_test_name,
-                      std::string big_o) {
-  SetSubstitutions({{"%bigo_name", big_o_test_name},
+int AddComplexityTest(std::string test_name, std::string big_o_test_name,
+                      std::string rms_test_name, std::string big_o) {
+  SetSubstitutions({{"%name", test_name},
+                    {"%bigo_name", big_o_test_name},
                     {"%rms_name", rms_test_name},
                     {"%bigo_str", "[ ]* %float " + big_o},
                     {"%bigo", big_o},
@@ -25,12 +26,18 @@ int AddComplexityTest(std::string big_o_test_name, std::string rms_test_name,
        {"^%bigo_name", MR_Not},  // Assert we we didn't only matched a name.
        {"^%rms_name %rms %rms[ ]*$", MR_Next}});
   AddCases(TC_JSONOut, {{"\"name\": \"%bigo_name\",$"},
+                        {"\"run_name\": \"%name\",$", MR_Next},
+                        {"\"run_type\": \"aggregate\",$", MR_Next},
+                        {"\"aggregate_name\": \"BigO\",$", MR_Next},
                         {"\"cpu_coefficient\": %float,$", MR_Next},
                         {"\"real_coefficient\": %float,$", MR_Next},
                         {"\"big_o\": \"%bigo\",$", MR_Next},
                         {"\"time_unit\": \"ns\"$", MR_Next},
                         {"}", MR_Next},
                         {"\"name\": \"%rms_name\",$"},
+                        {"\"run_name\": \"%name\",$", MR_Next},
+                        {"\"run_type\": \"aggregate\",$", MR_Next},
+                        {"\"aggregate_name\": \"RMS\",$", MR_Next},
                         {"\"rms\": %float$", MR_Next},
                         {"}", MR_Next}});
   AddCases(TC_CSVOut, {{"^\"%bigo_name\",,%float,%float,%bigo,,,,,$"},
@@ -59,6 +66,7 @@ BENCHMARK(BM_Complexity_O1)->Range(1, 1 << 18)->Complexity([](int64_t) {
   return 1.0;
 });
 
+const char *one_test_name = "BM_Complexity_O1";
 const char *big_o_1_test_name = "BM_Complexity_O1_BigO";
 const char *rms_o_1_test_name = "BM_Complexity_O1_RMS";
 const char *enum_big_o_1 = "\\([0-9]+\\)";
@@ -69,13 +77,16 @@ const char *auto_big_o_1 = "(\\([0-9]+\\))|(lgN)";
 const char *lambda_big_o_1 = "f\\(N\\)";
 
 // Add enum tests
-ADD_COMPLEXITY_CASES(big_o_1_test_name, rms_o_1_test_name, enum_big_o_1);
+ADD_COMPLEXITY_CASES(one_test_name, big_o_1_test_name, rms_o_1_test_name,
+                     enum_big_o_1);
 
 // Add auto enum tests
-ADD_COMPLEXITY_CASES(big_o_1_test_name, rms_o_1_test_name, auto_big_o_1);
+ADD_COMPLEXITY_CASES(one_test_name, big_o_1_test_name, rms_o_1_test_name,
+                     auto_big_o_1);
 
 // Add lambda tests
-ADD_COMPLEXITY_CASES(big_o_1_test_name, rms_o_1_test_name, lambda_big_o_1);
+ADD_COMPLEXITY_CASES(one_test_name, big_o_1_test_name, rms_o_1_test_name,
+                     lambda_big_o_1);
 
 // ========================================================================= //
 // --------------------------- Testing BigO O(N) --------------------------- //
@@ -85,7 +96,7 @@ std::vector<int> ConstructRandomVector(int64_t size) {
   std::vector<int> v;
   v.reserve(static_cast<int>(size));
   for (int i = 0; i < size; ++i) {
-    v.push_back(std::rand() % size);
+    v.push_back(static_cast<int>(std::rand() % size));
   }
   return v;
 }
@@ -106,22 +117,25 @@ BENCHMARK(BM_Complexity_O_N)
 BENCHMARK(BM_Complexity_O_N)
     ->RangeMultiplier(2)
     ->Range(1 << 10, 1 << 16)
-    ->Complexity([](int64_t n) -> double { return n; });
+    ->Complexity([](int64_t n) -> double { return static_cast<double>(n); });
 BENCHMARK(BM_Complexity_O_N)
     ->RangeMultiplier(2)
     ->Range(1 << 10, 1 << 16)
     ->Complexity();
 
+const char *n_test_name = "BM_Complexity_O_N";
 const char *big_o_n_test_name = "BM_Complexity_O_N_BigO";
 const char *rms_o_n_test_name = "BM_Complexity_O_N_RMS";
 const char *enum_auto_big_o_n = "N";
 const char *lambda_big_o_n = "f\\(N\\)";
 
 // Add enum tests
-ADD_COMPLEXITY_CASES(big_o_n_test_name, rms_o_n_test_name, enum_auto_big_o_n);
+ADD_COMPLEXITY_CASES(n_test_name, big_o_n_test_name, rms_o_n_test_name,
+                     enum_auto_big_o_n);
 
 // Add lambda tests
-ADD_COMPLEXITY_CASES(big_o_n_test_name, rms_o_n_test_name, lambda_big_o_n);
+ADD_COMPLEXITY_CASES(n_test_name, big_o_n_test_name, rms_o_n_test_name,
+                     lambda_big_o_n);
 
 // ========================================================================= //
 // ------------------------- Testing BigO O(N*lgN) ------------------------- //
@@ -142,24 +156,25 @@ BENCHMARK(BM_Complexity_O_N_log_N)
 BENCHMARK(BM_Complexity_O_N_log_N)
     ->RangeMultiplier(2)
     ->Range(1 << 10, 1 << 16)
-    ->Complexity([](int64_t n) { return kLog2E * n * log(n); });
+    ->Complexity([](int64_t n) { return kLog2E * n * log(static_cast<double>(n)); });
 BENCHMARK(BM_Complexity_O_N_log_N)
     ->RangeMultiplier(2)
     ->Range(1 << 10, 1 << 16)
     ->Complexity();
 
+const char *n_lg_n_test_name = "BM_Complexity_O_N_log_N";
 const char *big_o_n_lg_n_test_name = "BM_Complexity_O_N_log_N_BigO";
 const char *rms_o_n_lg_n_test_name = "BM_Complexity_O_N_log_N_RMS";
 const char *enum_auto_big_o_n_lg_n = "NlgN";
 const char *lambda_big_o_n_lg_n = "f\\(N\\)";
 
 // Add enum tests
-ADD_COMPLEXITY_CASES(big_o_n_lg_n_test_name, rms_o_n_lg_n_test_name,
-                     enum_auto_big_o_n_lg_n);
+ADD_COMPLEXITY_CASES(n_lg_n_test_name, big_o_n_lg_n_test_name,
+                     rms_o_n_lg_n_test_name, enum_auto_big_o_n_lg_n);
 
 // Add lambda tests
-ADD_COMPLEXITY_CASES(big_o_n_lg_n_test_name, rms_o_n_lg_n_test_name,
-                     lambda_big_o_n_lg_n);
+ADD_COMPLEXITY_CASES(n_lg_n_test_name, big_o_n_lg_n_test_name,
+                     rms_o_n_lg_n_test_name, lambda_big_o_n_lg_n);
 
 // ========================================================================= //
 // --------------------------- TEST CASES END ------------------------------ //
diff --git a/test/display_aggregates_only_test.cc b/test/display_aggregates_only_test.cc
new file mode 100644
index 0000000..3c36d3f
--- /dev/null
+++ b/test/display_aggregates_only_test.cc
@@ -0,0 +1,43 @@
+
+#undef NDEBUG
+#include <cstdio>
+#include <string>
+
+#include "benchmark/benchmark.h"
+#include "output_test.h"
+
+// Ok this test is super ugly. We want to check what happens with the file
+// reporter in the presence of DisplayAggregatesOnly().
+// We do not care about console output, the normal tests check that already.
+
+void BM_SummaryRepeat(benchmark::State& state) {
+  for (auto _ : state) {
+  }
+}
+BENCHMARK(BM_SummaryRepeat)->Repetitions(3)->DisplayAggregatesOnly();
+
+int main(int argc, char* argv[]) {
+  const std::string output = GetFileReporterOutput(argc, argv);
+
+  if (SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3") != 6 ||
+      SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3\"") != 3 ||
+      SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3_mean\"") != 1 ||
+      SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3_median\"") !=
+          1 ||
+      SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3_stddev\"") !=
+          1) {
+    std::cout << "Precondition mismatch. Expected to only find 6 "
+                 "occurrences of \"BM_SummaryRepeat/repeats:3\" substring:\n"
+                 "\"name\": \"BM_SummaryRepeat/repeats:3\", "
+                 "\"name\": \"BM_SummaryRepeat/repeats:3\", "
+                 "\"name\": \"BM_SummaryRepeat/repeats:3\", "
+                 "\"name\": \"BM_SummaryRepeat/repeats:3_mean\", "
+                 "\"name\": \"BM_SummaryRepeat/repeats:3_median\", "
+                 "\"name\": \"BM_SummaryRepeat/repeats:3_stddev\"\nThe entire "
+                 "output:\n";
+    std::cout << output;
+    return 1;
+  }
+
+  return 0;
+}
diff --git a/test/memory_manager_test.cc b/test/memory_manager_test.cc
new file mode 100644
index 0000000..94be608
--- /dev/null
+++ b/test/memory_manager_test.cc
@@ -0,0 +1,42 @@
+#include <memory>
+
+#include "../src/check.h"
+#include "benchmark/benchmark.h"
+#include "output_test.h"
+
+class TestMemoryManager : public benchmark::MemoryManager {
+  void Start() {}
+  void Stop(Result* result) {
+    result->num_allocs = 42;
+    result->max_bytes_used = 42000;
+  }
+};
+
+void BM_empty(benchmark::State& state) {
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(state.iterations());
+  }
+}
+BENCHMARK(BM_empty);
+
+ADD_CASES(TC_ConsoleOut, {{"^BM_empty %console_report$"}});
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_empty\",$"},
+                       {"\"run_name\": \"BM_empty\",$", MR_Next},
+                       {"\"run_type\": \"iteration\",$", MR_Next},
+                       {"\"iterations\": %int,$", MR_Next},
+                       {"\"real_time\": %float,$", MR_Next},
+                       {"\"cpu_time\": %float,$", MR_Next},
+                       {"\"time_unit\": \"ns\",$", MR_Next},
+                       {"\"allocs_per_iter\": %float,$", MR_Next},
+                       {"\"max_bytes_used\": 42000$", MR_Next},
+                       {"}", MR_Next}});
+ADD_CASES(TC_CSVOut, {{"^\"BM_empty\",%csv_report$"}});
+
+
+int main(int argc, char *argv[]) {
+  std::unique_ptr<benchmark::MemoryManager> mm(new TestMemoryManager());
+
+  benchmark::RegisterMemoryManager(mm.get());
+  RunOutputTests(argc, argv);
+  benchmark::RegisterMemoryManager(nullptr);
+}
diff --git a/test/output_test.h b/test/output_test.h
index 31a9199..9385761 100644
--- a/test/output_test.h
+++ b/test/output_test.h
@@ -60,6 +60,13 @@ int SetSubstitutions(
 // Run all output tests.
 void RunOutputTests(int argc, char* argv[]);
 
+// Count the number of 'pat' substrings in the 'haystack' string.
+int SubstrCnt(const std::string& haystack, const std::string& pat);
+
+// Run registered benchmarks with file reporter enabled, and return the content
+// outputted by the file reporter.
+std::string GetFileReporterOutput(int argc, char* argv[]);
+
 // ========================================================================= //
 // ------------------------- Results checking ------------------------------ //
 // ========================================================================= //
diff --git a/test/output_test_helper.cc b/test/output_test_helper.cc
index 394c4f5..f2da752 100644
--- a/test/output_test_helper.cc
+++ b/test/output_test_helper.cc
@@ -1,8 +1,11 @@
+#include <cstdio>
 #include <cstring>
+#include <fstream>
 #include <iostream>
 #include <map>
 #include <memory>
 #include <sstream>
+#include <streambuf>
 
 #include "../src/benchmark_api_internal.h"
 #include "../src/check.h"  // NOTE: check.h is for internal use only!
@@ -41,9 +44,11 @@ SubMap& GetSubstitutions() {
       {"%hrfloat", "[0-9]*[.]?[0-9]+([eE][-+][0-9]+)?[kMGTPEZYmunpfazy]?"},
       {"%int", "[ ]*[0-9]+"},
       {" %s ", "[ ]+"},
-      {"%time", "[ ]*[0-9]{1,6} ns"},
-      {"%console_report", "[ ]*[0-9]{1,6} ns [ ]*[0-9]{1,6} ns [ ]*[0-9]+"},
-      {"%console_us_report", "[ ]*[0-9] us [ ]*[0-9] us [ ]*[0-9]+"},
+      {"%time", "[ ]*[0-9]+ ns"},
+      {"%console_report", "[ ]*[0-9]+ ns [ ]*[0-9]+ ns [ ]*[0-9]+"},
+      {"%console_time_only_report", "[ ]*[0-9]+ ns [ ]*[0-9]+ ns"},
+      {"%console_us_report", "[ ]*[0-9]+ us [ ]*[0-9]+ us [ ]*[0-9]+"},
+      {"%console_us_time_only_report", "[ ]*[0-9]+ us [ ]*[0-9]+ us"},
       {"%csv_header",
        "name,iterations,real_time,cpu_time,time_unit,bytes_per_second,"
        "items_per_second,label,error_occurred,error_message"},
@@ -423,3 +428,37 @@ void RunOutputTests(int argc, char* argv[]) {
   CHECK(std::strcmp(csv.name, "CSVReporter") == 0);
   internal::GetResultsChecker().CheckResults(csv.out_stream);
 }
+
+int SubstrCnt(const std::string& haystack, const std::string& pat) {
+  if (pat.length() == 0) return 0;
+  int count = 0;
+  for (size_t offset = haystack.find(pat); offset != std::string::npos;
+       offset = haystack.find(pat, offset + pat.length()))
+    ++count;
+  return count;
+}
+
+std::string GetFileReporterOutput(int argc, char* argv[]) {
+  std::vector<char*> new_argv(argv, argv + argc);
+  assert(static_cast<decltype(new_argv)::size_type>(argc) == new_argv.size());
+
+  std::string tmp_file_name = std::tmpnam(nullptr);
+  std::cout << "Will be using this as the tmp file: " << tmp_file_name << '\n';
+
+  std::string tmp = "--benchmark_out=";
+  tmp += tmp_file_name;
+  new_argv.emplace_back(const_cast<char*>(tmp.c_str()));
+
+  argc = int(new_argv.size());
+
+  benchmark::Initialize(&argc, new_argv.data());
+  benchmark::RunSpecifiedBenchmarks();
+
+  // Read the output back from the file, and delete the file.
+  std::ifstream tmp_stream(tmp_file_name);
+  std::string output = std::string((std::istreambuf_iterator<char>(tmp_stream)),
+                                   std::istreambuf_iterator<char>());
+  std::remove(tmp_file_name.c_str());
+
+  return output;
+}
diff --git a/test/register_benchmark_test.cc b/test/register_benchmark_test.cc
index 18de6d6..3ac5b21 100644
--- a/test/register_benchmark_test.cc
+++ b/test/register_benchmark_test.cc
@@ -30,8 +30,8 @@ struct TestCase {
 
   void CheckRun(Run const& run) const {
     // clang-format off
-    CHECK(name == run.benchmark_name) << "expected " << name << " got "
-                                      << run.benchmark_name;
+    CHECK(name == run.benchmark_name()) << "expected " << name << " got "
+                                      << run.benchmark_name();
     if (label) {
       CHECK(run.report_label == label) << "expected " << label << " got "
                                        << run.report_label;
diff --git a/test/report_aggregates_only_test.cc b/test/report_aggregates_only_test.cc
new file mode 100644
index 0000000..9646b9b
--- /dev/null
+++ b/test/report_aggregates_only_test.cc
@@ -0,0 +1,39 @@
+
+#undef NDEBUG
+#include <cstdio>
+#include <string>
+
+#include "benchmark/benchmark.h"
+#include "output_test.h"
+
+// Ok this test is super ugly. We want to check what happens with the file
+// reporter in the presence of ReportAggregatesOnly().
+// We do not care about console output, the normal tests check that already.
+
+void BM_SummaryRepeat(benchmark::State& state) {
+  for (auto _ : state) {
+  }
+}
+BENCHMARK(BM_SummaryRepeat)->Repetitions(3)->ReportAggregatesOnly();
+
+int main(int argc, char* argv[]) {
+  const std::string output = GetFileReporterOutput(argc, argv);
+
+  if (SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3") != 3 ||
+      SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3_mean\"") != 1 ||
+      SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3_median\"") !=
+          1 ||
+      SubstrCnt(output, "\"name\": \"BM_SummaryRepeat/repeats:3_stddev\"") !=
+          1) {
+    std::cout << "Precondition mismatch. Expected to only find three "
+                 "occurrences of \"BM_SummaryRepeat/repeats:3\" substring:\n"
+                 "\"name\": \"BM_SummaryRepeat/repeats:3_mean\", "
+                 "\"name\": \"BM_SummaryRepeat/repeats:3_median\", "
+                 "\"name\": \"BM_SummaryRepeat/repeats:3_stddev\"\nThe entire "
+                 "output:\n";
+    std::cout << output;
+    return 1;
+  }
+
+  return 0;
+}
diff --git a/test/reporter_output_test.cc b/test/reporter_output_test.cc
index 1662fcb..80314b3 100644
--- a/test/reporter_output_test.cc
+++ b/test/reporter_output_test.cc
@@ -17,13 +17,14 @@ static int AddContextCases() {
            {
                {"%int[-/]%int[-/]%int %int:%int:%int$", MR_Default},
                {"Running .*/reporter_output_test(\\.exe)?$", MR_Next},
-               {"Run on \\(%int X %float MHz CPU s\\)", MR_Next},
+               {"Run on \\(%int X %float MHz CPU s?\\)", MR_Next},
            });
   AddCases(TC_JSONOut,
            {{"^\\{", MR_Default},
             {"\"context\":", MR_Next},
             {"\"date\": \"", MR_Next},
-            {"\"executable\": \".*/reporter_output_test(\\.exe)?\",", MR_Next},
+            {"\"executable\": \".*(/|\\\\)reporter_output_test(\\.exe)?\",",
+             MR_Next},
             {"\"num_cpus\": %int,$", MR_Next},
             {"\"mhz_per_cpu\": %float,$", MR_Next},
             {"\"cpu_scaling_enabled\": ", MR_Next},
@@ -64,6 +65,8 @@ BENCHMARK(BM_basic);
 
 ADD_CASES(TC_ConsoleOut, {{"^BM_basic %console_report$"}});
 ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_basic\",$"},
+                       {"\"run_name\": \"BM_basic\",$", MR_Next},
+                       {"\"run_type\": \"iteration\",$", MR_Next},
                        {"\"iterations\": %int,$", MR_Next},
                        {"\"real_time\": %float,$", MR_Next},
                        {"\"cpu_time\": %float,$", MR_Next},
@@ -82,9 +85,11 @@ void BM_bytes_per_second(benchmark::State& state) {
 }
 BENCHMARK(BM_bytes_per_second);
 
-ADD_CASES(TC_ConsoleOut,
-          {{"^BM_bytes_per_second %console_report +%float[kM]{0,1}B/s$"}});
+ADD_CASES(TC_ConsoleOut, {{"^BM_bytes_per_second %console_report "
+                           "bytes_per_second=%float[kM]{0,1}/s$"}});
 ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_bytes_per_second\",$"},
+                       {"\"run_name\": \"BM_bytes_per_second\",$", MR_Next},
+                       {"\"run_type\": \"iteration\",$", MR_Next},
                        {"\"iterations\": %int,$", MR_Next},
                        {"\"real_time\": %float,$", MR_Next},
                        {"\"cpu_time\": %float,$", MR_Next},
@@ -104,9 +109,11 @@ void BM_items_per_second(benchmark::State& state) {
 }
 BENCHMARK(BM_items_per_second);
 
-ADD_CASES(TC_ConsoleOut,
-          {{"^BM_items_per_second %console_report +%float[kM]{0,1} items/s$"}});
+ADD_CASES(TC_ConsoleOut, {{"^BM_items_per_second %console_report "
+                           "items_per_second=%float[kM]{0,1}/s$"}});
 ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_items_per_second\",$"},
+                       {"\"run_name\": \"BM_items_per_second\",$", MR_Next},
+                       {"\"run_type\": \"iteration\",$", MR_Next},
                        {"\"iterations\": %int,$", MR_Next},
                        {"\"real_time\": %float,$", MR_Next},
                        {"\"cpu_time\": %float,$", MR_Next},
@@ -128,6 +135,8 @@ BENCHMARK(BM_label);
 
 ADD_CASES(TC_ConsoleOut, {{"^BM_label %console_report some label$"}});
 ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_label\",$"},
+                       {"\"run_name\": \"BM_label\",$", MR_Next},
+                       {"\"run_type\": \"iteration\",$", MR_Next},
                        {"\"iterations\": %int,$", MR_Next},
                        {"\"real_time\": %float,$", MR_Next},
                        {"\"cpu_time\": %float,$", MR_Next},
@@ -149,6 +158,8 @@ void BM_error(benchmark::State& state) {
 BENCHMARK(BM_error);
 ADD_CASES(TC_ConsoleOut, {{"^BM_error[ ]+ERROR OCCURRED: 'message'$"}});
 ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_error\",$"},
+                       {"\"run_name\": \"BM_error\",$", MR_Next},
+                       {"\"run_type\": \"iteration\",$", MR_Next},
                        {"\"error_occurred\": true,$", MR_Next},
                        {"\"error_message\": \"message\",$", MR_Next}});
 
@@ -165,7 +176,9 @@ void BM_no_arg_name(benchmark::State& state) {
 }
 BENCHMARK(BM_no_arg_name)->Arg(3);
 ADD_CASES(TC_ConsoleOut, {{"^BM_no_arg_name/3 %console_report$"}});
-ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_no_arg_name/3\",$"}});
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_no_arg_name/3\",$"},
+                       {"\"run_name\": \"BM_no_arg_name/3\",$", MR_Next},
+                       {"\"run_type\": \"iteration\",$", MR_Next}});
 ADD_CASES(TC_CSVOut, {{"^\"BM_no_arg_name/3\",%csv_report$"}});
 
 // ========================================================================= //
@@ -178,7 +191,9 @@ void BM_arg_name(benchmark::State& state) {
 }
 BENCHMARK(BM_arg_name)->ArgName("first")->Arg(3);
 ADD_CASES(TC_ConsoleOut, {{"^BM_arg_name/first:3 %console_report$"}});
-ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_arg_name/first:3\",$"}});
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_arg_name/first:3\",$"},
+                       {"\"run_name\": \"BM_arg_name/first:3\",$", MR_Next},
+                       {"\"run_type\": \"iteration\",$", MR_Next}});
 ADD_CASES(TC_CSVOut, {{"^\"BM_arg_name/first:3\",%csv_report$"}});
 
 // ========================================================================= //
@@ -192,7 +207,10 @@ void BM_arg_names(benchmark::State& state) {
 BENCHMARK(BM_arg_names)->Args({2, 5, 4})->ArgNames({"first", "", "third"});
 ADD_CASES(TC_ConsoleOut,
           {{"^BM_arg_names/first:2/5/third:4 %console_report$"}});
-ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_arg_names/first:2/5/third:4\",$"}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_arg_names/first:2/5/third:4\",$"},
+           {"\"run_name\": \"BM_arg_names/first:2/5/third:4\",$", MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next}});
 ADD_CASES(TC_CSVOut, {{"^\"BM_arg_names/first:2/5/third:4\",%csv_report$"}});
 
 // ========================================================================= //
@@ -221,16 +239,33 @@ void BM_Repeat(benchmark::State& state) {
 }
 // need two repetitions min to be able to output any aggregate output
 BENCHMARK(BM_Repeat)->Repetitions(2);
-ADD_CASES(TC_ConsoleOut, {{"^BM_Repeat/repeats:2 %console_report$"},
-                          {"^BM_Repeat/repeats:2 %console_report$"},
-                          {"^BM_Repeat/repeats:2_mean %console_report$"},
-                          {"^BM_Repeat/repeats:2_median %console_report$"},
-                          {"^BM_Repeat/repeats:2_stddev %console_report$"}});
+ADD_CASES(TC_ConsoleOut,
+          {{"^BM_Repeat/repeats:2 %console_report$"},
+           {"^BM_Repeat/repeats:2 %console_report$"},
+           {"^BM_Repeat/repeats:2_mean %console_time_only_report [ ]*2$"},
+           {"^BM_Repeat/repeats:2_median %console_time_only_report [ ]*2$"},
+           {"^BM_Repeat/repeats:2_stddev %console_time_only_report [ ]*2$"}});
 ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Repeat/repeats:2\",$"},
+                       {"\"run_name\": \"BM_Repeat/repeats:2\"", MR_Next},
+                       {"\"run_type\": \"iteration\",$", MR_Next},
                        {"\"name\": \"BM_Repeat/repeats:2\",$"},
+                       {"\"run_name\": \"BM_Repeat/repeats:2\",$", MR_Next},
+                       {"\"run_type\": \"iteration\",$", MR_Next},
                        {"\"name\": \"BM_Repeat/repeats:2_mean\",$"},
+                       {"\"run_name\": \"BM_Repeat/repeats:2\",$", MR_Next},
+                       {"\"run_type\": \"aggregate\",$", MR_Next},
+                       {"\"aggregate_name\": \"mean\",$", MR_Next},
+                       {"\"iterations\": 2,$", MR_Next},
                        {"\"name\": \"BM_Repeat/repeats:2_median\",$"},
-                       {"\"name\": \"BM_Repeat/repeats:2_stddev\",$"}});
+                       {"\"run_name\": \"BM_Repeat/repeats:2\",$", MR_Next},
+                       {"\"run_type\": \"aggregate\",$", MR_Next},
+                       {"\"aggregate_name\": \"median\",$", MR_Next},
+                       {"\"iterations\": 2,$", MR_Next},
+                       {"\"name\": \"BM_Repeat/repeats:2_stddev\",$"},
+                       {"\"run_name\": \"BM_Repeat/repeats:2\",$", MR_Next},
+                       {"\"run_type\": \"aggregate\",$", MR_Next},
+                       {"\"aggregate_name\": \"stddev\",$", MR_Next},
+                       {"\"iterations\": 2,$", MR_Next}});
 ADD_CASES(TC_CSVOut, {{"^\"BM_Repeat/repeats:2\",%csv_report$"},
                       {"^\"BM_Repeat/repeats:2\",%csv_report$"},
                       {"^\"BM_Repeat/repeats:2_mean\",%csv_report$"},
@@ -238,18 +273,37 @@ ADD_CASES(TC_CSVOut, {{"^\"BM_Repeat/repeats:2\",%csv_report$"},
                       {"^\"BM_Repeat/repeats:2_stddev\",%csv_report$"}});
 // but for two repetitions, mean and median is the same, so let's repeat..
 BENCHMARK(BM_Repeat)->Repetitions(3);
-ADD_CASES(TC_ConsoleOut, {{"^BM_Repeat/repeats:3 %console_report$"},
-                          {"^BM_Repeat/repeats:3 %console_report$"},
-                          {"^BM_Repeat/repeats:3 %console_report$"},
-                          {"^BM_Repeat/repeats:3_mean %console_report$"},
-                          {"^BM_Repeat/repeats:3_median %console_report$"},
-                          {"^BM_Repeat/repeats:3_stddev %console_report$"}});
+ADD_CASES(TC_ConsoleOut,
+          {{"^BM_Repeat/repeats:3 %console_report$"},
+           {"^BM_Repeat/repeats:3 %console_report$"},
+           {"^BM_Repeat/repeats:3 %console_report$"},
+           {"^BM_Repeat/repeats:3_mean %console_time_only_report [ ]*3$"},
+           {"^BM_Repeat/repeats:3_median %console_time_only_report [ ]*3$"},
+           {"^BM_Repeat/repeats:3_stddev %console_time_only_report [ ]*3$"}});
 ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Repeat/repeats:3\",$"},
+                       {"\"run_name\": \"BM_Repeat/repeats:3\",$", MR_Next},
+                       {"\"run_type\": \"iteration\",$", MR_Next},
                        {"\"name\": \"BM_Repeat/repeats:3\",$"},
+                       {"\"run_name\": \"BM_Repeat/repeats:3\",$", MR_Next},
+                       {"\"run_type\": \"iteration\",$", MR_Next},
                        {"\"name\": \"BM_Repeat/repeats:3\",$"},
+                       {"\"run_name\": \"BM_Repeat/repeats:3\",$", MR_Next},
+                       {"\"run_type\": \"iteration\",$", MR_Next},
                        {"\"name\": \"BM_Repeat/repeats:3_mean\",$"},
+                       {"\"run_name\": \"BM_Repeat/repeats:3\",$", MR_Next},
+                       {"\"run_type\": \"aggregate\",$", MR_Next},
+                       {"\"aggregate_name\": \"mean\",$", MR_Next},
+                       {"\"iterations\": 3,$", MR_Next},
                        {"\"name\": \"BM_Repeat/repeats:3_median\",$"},
-                       {"\"name\": \"BM_Repeat/repeats:3_stddev\",$"}});
+                       {"\"run_name\": \"BM_Repeat/repeats:3\",$", MR_Next},
+                       {"\"run_type\": \"aggregate\",$", MR_Next},
+                       {"\"aggregate_name\": \"median\",$", MR_Next},
+                       {"\"iterations\": 3,$", MR_Next},
+                       {"\"name\": \"BM_Repeat/repeats:3_stddev\",$"},
+                       {"\"run_name\": \"BM_Repeat/repeats:3\",$", MR_Next},
+                       {"\"run_type\": \"aggregate\",$", MR_Next},
+                       {"\"aggregate_name\": \"stddev\",$", MR_Next},
+                       {"\"iterations\": 3,$", MR_Next}});
 ADD_CASES(TC_CSVOut, {{"^\"BM_Repeat/repeats:3\",%csv_report$"},
                       {"^\"BM_Repeat/repeats:3\",%csv_report$"},
                       {"^\"BM_Repeat/repeats:3\",%csv_report$"},
@@ -258,20 +312,41 @@ ADD_CASES(TC_CSVOut, {{"^\"BM_Repeat/repeats:3\",%csv_report$"},
                       {"^\"BM_Repeat/repeats:3_stddev\",%csv_report$"}});
 // median differs between even/odd number of repetitions, so just to be sure
 BENCHMARK(BM_Repeat)->Repetitions(4);
-ADD_CASES(TC_ConsoleOut, {{"^BM_Repeat/repeats:4 %console_report$"},
-                          {"^BM_Repeat/repeats:4 %console_report$"},
-                          {"^BM_Repeat/repeats:4 %console_report$"},
-                          {"^BM_Repeat/repeats:4 %console_report$"},
-                          {"^BM_Repeat/repeats:4_mean %console_report$"},
-                          {"^BM_Repeat/repeats:4_median %console_report$"},
-                          {"^BM_Repeat/repeats:4_stddev %console_report$"}});
+ADD_CASES(TC_ConsoleOut,
+          {{"^BM_Repeat/repeats:4 %console_report$"},
+           {"^BM_Repeat/repeats:4 %console_report$"},
+           {"^BM_Repeat/repeats:4 %console_report$"},
+           {"^BM_Repeat/repeats:4 %console_report$"},
+           {"^BM_Repeat/repeats:4_mean %console_time_only_report [ ]*4$"},
+           {"^BM_Repeat/repeats:4_median %console_time_only_report [ ]*4$"},
+           {"^BM_Repeat/repeats:4_stddev %console_time_only_report [ ]*4$"}});
 ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Repeat/repeats:4\",$"},
+                       {"\"run_name\": \"BM_Repeat/repeats:4\",$", MR_Next},
+                       {"\"run_type\": \"iteration\",$", MR_Next},
                        {"\"name\": \"BM_Repeat/repeats:4\",$"},
+                       {"\"run_name\": \"BM_Repeat/repeats:4\",$", MR_Next},
+                       {"\"run_type\": \"iteration\",$", MR_Next},
                        {"\"name\": \"BM_Repeat/repeats:4\",$"},
+                       {"\"run_name\": \"BM_Repeat/repeats:4\",$", MR_Next},
+                       {"\"run_type\": \"iteration\",$", MR_Next},
                        {"\"name\": \"BM_Repeat/repeats:4\",$"},
+                       {"\"run_name\": \"BM_Repeat/repeats:4\",$", MR_Next},
+                       {"\"run_type\": \"iteration\",$", MR_Next},
                        {"\"name\": \"BM_Repeat/repeats:4_mean\",$"},
+                       {"\"run_name\": \"BM_Repeat/repeats:4\",$", MR_Next},
+                       {"\"run_type\": \"aggregate\",$", MR_Next},
+                       {"\"aggregate_name\": \"mean\",$", MR_Next},
+                       {"\"iterations\": 4,$", MR_Next},
                        {"\"name\": \"BM_Repeat/repeats:4_median\",$"},
-                       {"\"name\": \"BM_Repeat/repeats:4_stddev\",$"}});
+                       {"\"run_name\": \"BM_Repeat/repeats:4\",$", MR_Next},
+                       {"\"run_type\": \"aggregate\",$", MR_Next},
+                       {"\"aggregate_name\": \"median\",$", MR_Next},
+                       {"\"iterations\": 4,$", MR_Next},
+                       {"\"name\": \"BM_Repeat/repeats:4_stddev\",$"},
+                       {"\"run_name\": \"BM_Repeat/repeats:4\",$", MR_Next},
+                       {"\"run_type\": \"aggregate\",$", MR_Next},
+                       {"\"aggregate_name\": \"stddev\",$", MR_Next},
+                       {"\"iterations\": 4,$", MR_Next}});
 ADD_CASES(TC_CSVOut, {{"^\"BM_Repeat/repeats:4\",%csv_report$"},
                       {"^\"BM_Repeat/repeats:4\",%csv_report$"},
                       {"^\"BM_Repeat/repeats:4\",%csv_report$"},
@@ -288,7 +363,9 @@ void BM_RepeatOnce(benchmark::State& state) {
 }
 BENCHMARK(BM_RepeatOnce)->Repetitions(1)->ReportAggregatesOnly();
 ADD_CASES(TC_ConsoleOut, {{"^BM_RepeatOnce/repeats:1 %console_report$"}});
-ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_RepeatOnce/repeats:1\",$"}});
+ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_RepeatOnce/repeats:1\",$"},
+                       {"\"run_name\": \"BM_RepeatOnce/repeats:1\",$", MR_Next},
+                       {"\"run_type\": \"iteration\",$", MR_Next}});
 ADD_CASES(TC_CSVOut, {{"^\"BM_RepeatOnce/repeats:1\",%csv_report$"}});
 
 // Test that non-aggregate data is not reported
@@ -297,20 +374,72 @@ void BM_SummaryRepeat(benchmark::State& state) {
   }
 }
 BENCHMARK(BM_SummaryRepeat)->Repetitions(3)->ReportAggregatesOnly();
-ADD_CASES(TC_ConsoleOut,
+ADD_CASES(
+    TC_ConsoleOut,
+    {{".*BM_SummaryRepeat/repeats:3 ", MR_Not},
+     {"^BM_SummaryRepeat/repeats:3_mean %console_time_only_report [ ]*3$"},
+     {"^BM_SummaryRepeat/repeats:3_median %console_time_only_report [ ]*3$"},
+     {"^BM_SummaryRepeat/repeats:3_stddev %console_time_only_report [ ]*3$"}});
+ADD_CASES(TC_JSONOut,
           {{".*BM_SummaryRepeat/repeats:3 ", MR_Not},
-           {"^BM_SummaryRepeat/repeats:3_mean %console_report$"},
-           {"^BM_SummaryRepeat/repeats:3_median %console_report$"},
-           {"^BM_SummaryRepeat/repeats:3_stddev %console_report$"}});
-ADD_CASES(TC_JSONOut, {{".*BM_SummaryRepeat/repeats:3 ", MR_Not},
-                       {"\"name\": \"BM_SummaryRepeat/repeats:3_mean\",$"},
-                       {"\"name\": \"BM_SummaryRepeat/repeats:3_median\",$"},
-                       {"\"name\": \"BM_SummaryRepeat/repeats:3_stddev\",$"}});
+           {"\"name\": \"BM_SummaryRepeat/repeats:3_mean\",$"},
+           {"\"run_name\": \"BM_SummaryRepeat/repeats:3\",$", MR_Next},
+           {"\"run_type\": \"aggregate\",$", MR_Next},
+           {"\"aggregate_name\": \"mean\",$", MR_Next},
+           {"\"iterations\": 3,$", MR_Next},
+           {"\"name\": \"BM_SummaryRepeat/repeats:3_median\",$"},
+           {"\"run_name\": \"BM_SummaryRepeat/repeats:3\",$", MR_Next},
+           {"\"run_type\": \"aggregate\",$", MR_Next},
+           {"\"aggregate_name\": \"median\",$", MR_Next},
+           {"\"iterations\": 3,$", MR_Next},
+           {"\"name\": \"BM_SummaryRepeat/repeats:3_stddev\",$"},
+           {"\"run_name\": \"BM_SummaryRepeat/repeats:3\",$", MR_Next},
+           {"\"run_type\": \"aggregate\",$", MR_Next},
+           {"\"aggregate_name\": \"stddev\",$", MR_Next},
+           {"\"iterations\": 3,$", MR_Next}});
 ADD_CASES(TC_CSVOut, {{".*BM_SummaryRepeat/repeats:3 ", MR_Not},
                       {"^\"BM_SummaryRepeat/repeats:3_mean\",%csv_report$"},
                       {"^\"BM_SummaryRepeat/repeats:3_median\",%csv_report$"},
                       {"^\"BM_SummaryRepeat/repeats:3_stddev\",%csv_report$"}});
 
+// Test that non-aggregate data is not displayed.
+// NOTE: this test is kinda bad. we are only testing the display output.
+//       But we don't check that the file output still contains everything...
+void BM_SummaryDisplay(benchmark::State& state) {
+  for (auto _ : state) {
+  }
+}
+BENCHMARK(BM_SummaryDisplay)->Repetitions(2)->DisplayAggregatesOnly();
+ADD_CASES(
+    TC_ConsoleOut,
+    {{".*BM_SummaryDisplay/repeats:2 ", MR_Not},
+     {"^BM_SummaryDisplay/repeats:2_mean %console_time_only_report [ ]*2$"},
+     {"^BM_SummaryDisplay/repeats:2_median %console_time_only_report [ ]*2$"},
+     {"^BM_SummaryDisplay/repeats:2_stddev %console_time_only_report [ ]*2$"}});
+ADD_CASES(TC_JSONOut,
+          {{".*BM_SummaryDisplay/repeats:2 ", MR_Not},
+           {"\"name\": \"BM_SummaryDisplay/repeats:2_mean\",$"},
+           {"\"run_name\": \"BM_SummaryDisplay/repeats:2\",$", MR_Next},
+           {"\"run_type\": \"aggregate\",$", MR_Next},
+           {"\"aggregate_name\": \"mean\",$", MR_Next},
+           {"\"iterations\": 2,$", MR_Next},
+           {"\"name\": \"BM_SummaryDisplay/repeats:2_median\",$"},
+           {"\"run_name\": \"BM_SummaryDisplay/repeats:2\",$", MR_Next},
+           {"\"run_type\": \"aggregate\",$", MR_Next},
+           {"\"aggregate_name\": \"median\",$", MR_Next},
+           {"\"iterations\": 2,$", MR_Next},
+           {"\"name\": \"BM_SummaryDisplay/repeats:2_stddev\",$"},
+           {"\"run_name\": \"BM_SummaryDisplay/repeats:2\",$", MR_Next},
+           {"\"run_type\": \"aggregate\",$", MR_Next},
+           {"\"aggregate_name\": \"stddev\",$", MR_Next},
+           {"\"iterations\": 2,$", MR_Next}});
+ADD_CASES(TC_CSVOut,
+          {{".*BM_SummaryDisplay/repeats:2 ", MR_Not},
+           {"^\"BM_SummaryDisplay/repeats:2_mean\",%csv_report$"},
+           {"^\"BM_SummaryDisplay/repeats:2_median\",%csv_report$"},
+           {"^\"BM_SummaryDisplay/repeats:2_stddev\",%csv_report$"}});
+
+// Test repeats with custom time unit.
 void BM_RepeatTimeUnit(benchmark::State& state) {
   for (auto _ : state) {
   }
@@ -319,18 +448,34 @@ BENCHMARK(BM_RepeatTimeUnit)
     ->Repetitions(3)
     ->ReportAggregatesOnly()
     ->Unit(benchmark::kMicrosecond);
-ADD_CASES(TC_ConsoleOut,
+ADD_CASES(
+    TC_ConsoleOut,
+    {{".*BM_RepeatTimeUnit/repeats:3 ", MR_Not},
+     {"^BM_RepeatTimeUnit/repeats:3_mean %console_us_time_only_report [ ]*3$"},
+     {"^BM_RepeatTimeUnit/repeats:3_median %console_us_time_only_report [ "
+      "]*3$"},
+     {"^BM_RepeatTimeUnit/repeats:3_stddev %console_us_time_only_report [ "
+      "]*3$"}});
+ADD_CASES(TC_JSONOut,
           {{".*BM_RepeatTimeUnit/repeats:3 ", MR_Not},
-           {"^BM_RepeatTimeUnit/repeats:3_mean %console_us_report$"},
-           {"^BM_RepeatTimeUnit/repeats:3_median %console_us_report$"},
-           {"^BM_RepeatTimeUnit/repeats:3_stddev %console_us_report$"}});
-ADD_CASES(TC_JSONOut, {{".*BM_RepeatTimeUnit/repeats:3 ", MR_Not},
-                       {"\"name\": \"BM_RepeatTimeUnit/repeats:3_mean\",$"},
-                       {"\"time_unit\": \"us\",?$"},
-                       {"\"name\": \"BM_RepeatTimeUnit/repeats:3_median\",$"},
-                       {"\"time_unit\": \"us\",?$"},
-                       {"\"name\": \"BM_RepeatTimeUnit/repeats:3_stddev\",$"},
-                       {"\"time_unit\": \"us\",?$"}});
+           {"\"name\": \"BM_RepeatTimeUnit/repeats:3_mean\",$"},
+           {"\"run_name\": \"BM_RepeatTimeUnit/repeats:3\",$", MR_Next},
+           {"\"run_type\": \"aggregate\",$", MR_Next},
+           {"\"aggregate_name\": \"mean\",$", MR_Next},
+           {"\"iterations\": 3,$", MR_Next},
+           {"\"time_unit\": \"us\",?$"},
+           {"\"name\": \"BM_RepeatTimeUnit/repeats:3_median\",$"},
+           {"\"run_name\": \"BM_RepeatTimeUnit/repeats:3\",$", MR_Next},
+           {"\"run_type\": \"aggregate\",$", MR_Next},
+           {"\"aggregate_name\": \"median\",$", MR_Next},
+           {"\"iterations\": 3,$", MR_Next},
+           {"\"time_unit\": \"us\",?$"},
+           {"\"name\": \"BM_RepeatTimeUnit/repeats:3_stddev\",$"},
+           {"\"run_name\": \"BM_RepeatTimeUnit/repeats:3\",$", MR_Next},
+           {"\"run_type\": \"aggregate\",$", MR_Next},
+           {"\"aggregate_name\": \"stddev\",$", MR_Next},
+           {"\"iterations\": 3,$", MR_Next},
+           {"\"time_unit\": \"us\",?$"}});
 ADD_CASES(TC_CSVOut,
           {{".*BM_RepeatTimeUnit/repeats:3 ", MR_Not},
            {"^\"BM_RepeatTimeUnit/repeats:3_mean\",%csv_us_report$"},
@@ -346,37 +491,92 @@ const auto UserStatistics = [](const std::vector<double>& v) {
 };
 void BM_UserStats(benchmark::State& state) {
   for (auto _ : state) {
+    state.SetIterationTime(150 / 10e8);
   }
 }
 // clang-format off
 BENCHMARK(BM_UserStats)
   ->Repetitions(3)
+  ->Iterations(5)
+  ->UseManualTime()
   ->ComputeStatistics("", UserStatistics);
 // clang-format on
 
 // check that user-provided stats is calculated, and is after the default-ones
 // empty string as name is intentional, it would sort before anything else
-ADD_CASES(TC_ConsoleOut, {{"^BM_UserStats/repeats:3 %console_report$"},
-                          {"^BM_UserStats/repeats:3 %console_report$"},
-                          {"^BM_UserStats/repeats:3 %console_report$"},
-                          {"^BM_UserStats/repeats:3_mean %console_report$"},
-                          {"^BM_UserStats/repeats:3_median %console_report$"},
-                          {"^BM_UserStats/repeats:3_stddev %console_report$"},
-                          {"^BM_UserStats/repeats:3_ %console_report$"}});
-ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_UserStats/repeats:3\",$"},
-                       {"\"name\": \"BM_UserStats/repeats:3\",$"},
-                       {"\"name\": \"BM_UserStats/repeats:3\",$"},
-                       {"\"name\": \"BM_UserStats/repeats:3_mean\",$"},
-                       {"\"name\": \"BM_UserStats/repeats:3_median\",$"},
-                       {"\"name\": \"BM_UserStats/repeats:3_stddev\",$"},
-                       {"\"name\": \"BM_UserStats/repeats:3_\",$"}});
-ADD_CASES(TC_CSVOut, {{"^\"BM_UserStats/repeats:3\",%csv_report$"},
-                      {"^\"BM_UserStats/repeats:3\",%csv_report$"},
-                      {"^\"BM_UserStats/repeats:3\",%csv_report$"},
-                      {"^\"BM_UserStats/repeats:3_mean\",%csv_report$"},
-                      {"^\"BM_UserStats/repeats:3_median\",%csv_report$"},
-                      {"^\"BM_UserStats/repeats:3_stddev\",%csv_report$"},
-                      {"^\"BM_UserStats/repeats:3_\",%csv_report$"}});
+ADD_CASES(TC_ConsoleOut, {{"^BM_UserStats/iterations:5/repeats:3/manual_time [ "
+                           "]* 150 ns %time [ ]*5$"},
+                          {"^BM_UserStats/iterations:5/repeats:3/manual_time [ "
+                           "]* 150 ns %time [ ]*5$"},
+                          {"^BM_UserStats/iterations:5/repeats:3/manual_time [ "
+                           "]* 150 ns %time [ ]*5$"},
+                          {"^BM_UserStats/iterations:5/repeats:3/"
+                           "manual_time_mean [ ]* 150 ns %time [ ]*3$"},
+                          {"^BM_UserStats/iterations:5/repeats:3/"
+                           "manual_time_median [ ]* 150 ns %time [ ]*3$"},
+                          {"^BM_UserStats/iterations:5/repeats:3/"
+                           "manual_time_stddev [ ]* 0 ns %time [ ]*3$"},
+                          {"^BM_UserStats/iterations:5/repeats:3/manual_time_ "
+                           "[ ]* 150 ns %time [ ]*3$"}});
+ADD_CASES(
+    TC_JSONOut,
+    {{"\"name\": \"BM_UserStats/iterations:5/repeats:3/manual_time\",$"},
+     {"\"run_name\": \"BM_UserStats/iterations:5/repeats:3/manual_time\",$",
+      MR_Next},
+     {"\"run_type\": \"iteration\",$", MR_Next},
+     {"\"iterations\": 5,$", MR_Next},
+     {"\"real_time\": 1\\.5(0)*e\\+(0)*2,$", MR_Next},
+     {"\"name\": \"BM_UserStats/iterations:5/repeats:3/manual_time\",$"},
+     {"\"run_name\": \"BM_UserStats/iterations:5/repeats:3/manual_time\",$",
+      MR_Next},
+     {"\"run_type\": \"iteration\",$", MR_Next},
+     {"\"iterations\": 5,$", MR_Next},
+     {"\"real_time\": 1\\.5(0)*e\\+(0)*2,$", MR_Next},
+     {"\"name\": \"BM_UserStats/iterations:5/repeats:3/manual_time\",$"},
+     {"\"run_name\": \"BM_UserStats/iterations:5/repeats:3/manual_time\",$",
+      MR_Next},
+     {"\"run_type\": \"iteration\",$", MR_Next},
+     {"\"iterations\": 5,$", MR_Next},
+     {"\"real_time\": 1\\.5(0)*e\\+(0)*2,$", MR_Next},
+     {"\"name\": \"BM_UserStats/iterations:5/repeats:3/manual_time_mean\",$"},
+     {"\"run_name\": \"BM_UserStats/iterations:5/repeats:3/manual_time\",$",
+      MR_Next},
+     {"\"run_type\": \"aggregate\",$", MR_Next},
+     {"\"aggregate_name\": \"mean\",$", MR_Next},
+     {"\"iterations\": 3,$", MR_Next},
+     {"\"real_time\": 1\\.5(0)*e\\+(0)*2,$", MR_Next},
+     {"\"name\": \"BM_UserStats/iterations:5/repeats:3/manual_time_median\",$"},
+     {"\"run_name\": \"BM_UserStats/iterations:5/repeats:3/manual_time\",$",
+      MR_Next},
+     {"\"run_type\": \"aggregate\",$", MR_Next},
+     {"\"aggregate_name\": \"median\",$", MR_Next},
+     {"\"iterations\": 3,$", MR_Next},
+     {"\"real_time\": 1\\.5(0)*e\\+(0)*2,$", MR_Next},
+     {"\"name\": \"BM_UserStats/iterations:5/repeats:3/manual_time_stddev\",$"},
+     {"\"run_name\": \"BM_UserStats/iterations:5/repeats:3/manual_time\",$",
+      MR_Next},
+     {"\"run_type\": \"aggregate\",$", MR_Next},
+     {"\"aggregate_name\": \"stddev\",$", MR_Next},
+     {"\"iterations\": 3,$", MR_Next},
+     {"\"real_time\": %float,$", MR_Next},
+     {"\"name\": \"BM_UserStats/iterations:5/repeats:3/manual_time_\",$"},
+     {"\"run_name\": \"BM_UserStats/iterations:5/repeats:3/manual_time\",$",
+      MR_Next},
+     {"\"run_type\": \"aggregate\",$", MR_Next},
+     {"\"aggregate_name\": \"\",$", MR_Next},
+     {"\"iterations\": 3,$", MR_Next},
+     {"\"real_time\": 1\\.5(0)*e\\+(0)*2,$", MR_Next}});
+ADD_CASES(
+    TC_CSVOut,
+    {{"^\"BM_UserStats/iterations:5/repeats:3/manual_time\",%csv_report$"},
+     {"^\"BM_UserStats/iterations:5/repeats:3/manual_time\",%csv_report$"},
+     {"^\"BM_UserStats/iterations:5/repeats:3/manual_time\",%csv_report$"},
+     {"^\"BM_UserStats/iterations:5/repeats:3/manual_time_mean\",%csv_report$"},
+     {"^\"BM_UserStats/iterations:5/repeats:3/"
+      "manual_time_median\",%csv_report$"},
+     {"^\"BM_UserStats/iterations:5/repeats:3/"
+      "manual_time_stddev\",%csv_report$"},
+     {"^\"BM_UserStats/iterations:5/repeats:3/manual_time_\",%csv_report$"}});
 
 // ========================================================================= //
 // --------------------------- TEST CASES END ------------------------------ //
diff --git a/test/skip_with_error_test.cc b/test/skip_with_error_test.cc
index 39785fb..0657977 100644
--- a/test/skip_with_error_test.cc
+++ b/test/skip_with_error_test.cc
@@ -33,8 +33,8 @@ struct TestCase {
   typedef benchmark::BenchmarkReporter::Run Run;
 
   void CheckRun(Run const& run) const {
-    CHECK(name == run.benchmark_name)
-        << "expected " << name << " got " << run.benchmark_name;
+    CHECK(name == run.benchmark_name())
+        << "expected " << name << " got " << run.benchmark_name();
     CHECK(error_occurred == run.error_occurred);
     CHECK(error_message == run.error_message);
     if (error_occurred) {
diff --git a/test/user_counters_tabular_test.cc b/test/user_counters_tabular_test.cc
index 4f126b6..030e989 100644
--- a/test/user_counters_tabular_test.cc
+++ b/test/user_counters_tabular_test.cc
@@ -69,18 +69,21 @@ void BM_Counters_Tabular(benchmark::State& state) {
   });
 }
 BENCHMARK(BM_Counters_Tabular)->ThreadRange(1, 16);
-ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Counters_Tabular/threads:%int\",$"},
-                       {"\"iterations\": %int,$", MR_Next},
-                       {"\"real_time\": %float,$", MR_Next},
-                       {"\"cpu_time\": %float,$", MR_Next},
-                       {"\"time_unit\": \"ns\",$", MR_Next},
-                       {"\"Bar\": %float,$", MR_Next},
-                       {"\"Bat\": %float,$", MR_Next},
-                       {"\"Baz\": %float,$", MR_Next},
-                       {"\"Foo\": %float,$", MR_Next},
-                       {"\"Frob\": %float,$", MR_Next},
-                       {"\"Lob\": %float$", MR_Next},
-                       {"}", MR_Next}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_Counters_Tabular/threads:%int\",$"},
+           {"\"run_name\": \"BM_Counters_Tabular/threads:%int\",$", MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"Bar\": %float,$", MR_Next},
+           {"\"Bat\": %float,$", MR_Next},
+           {"\"Baz\": %float,$", MR_Next},
+           {"\"Foo\": %float,$", MR_Next},
+           {"\"Frob\": %float,$", MR_Next},
+           {"\"Lob\": %float$", MR_Next},
+           {"}", MR_Next}});
 ADD_CASES(TC_CSVOut, {{"^\"BM_Counters_Tabular/threads:%int\",%csv_report,"
                        "%float,%float,%float,%float,%float,%float$"}});
 // VS2013 does not allow this function to be passed as a lambda argument
@@ -113,18 +116,22 @@ void BM_CounterRates_Tabular(benchmark::State& state) {
   });
 }
 BENCHMARK(BM_CounterRates_Tabular)->ThreadRange(1, 16);
-ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_CounterRates_Tabular/threads:%int\",$"},
-                       {"\"iterations\": %int,$", MR_Next},
-                       {"\"real_time\": %float,$", MR_Next},
-                       {"\"cpu_time\": %float,$", MR_Next},
-                       {"\"time_unit\": \"ns\",$", MR_Next},
-                       {"\"Bar\": %float,$", MR_Next},
-                       {"\"Bat\": %float,$", MR_Next},
-                       {"\"Baz\": %float,$", MR_Next},
-                       {"\"Foo\": %float,$", MR_Next},
-                       {"\"Frob\": %float,$", MR_Next},
-                       {"\"Lob\": %float$", MR_Next},
-                       {"}", MR_Next}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_CounterRates_Tabular/threads:%int\",$"},
+           {"\"run_name\": \"BM_CounterRates_Tabular/threads:%int\",$",
+            MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"Bar\": %float,$", MR_Next},
+           {"\"Bat\": %float,$", MR_Next},
+           {"\"Baz\": %float,$", MR_Next},
+           {"\"Foo\": %float,$", MR_Next},
+           {"\"Frob\": %float,$", MR_Next},
+           {"\"Lob\": %float$", MR_Next},
+           {"}", MR_Next}});
 ADD_CASES(TC_CSVOut, {{"^\"BM_CounterRates_Tabular/threads:%int\",%csv_report,"
                        "%float,%float,%float,%float,%float,%float$"}});
 // VS2013 does not allow this function to be passed as a lambda argument
@@ -157,15 +164,18 @@ void BM_CounterSet0_Tabular(benchmark::State& state) {
   });
 }
 BENCHMARK(BM_CounterSet0_Tabular)->ThreadRange(1, 16);
-ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_CounterSet0_Tabular/threads:%int\",$"},
-                       {"\"iterations\": %int,$", MR_Next},
-                       {"\"real_time\": %float,$", MR_Next},
-                       {"\"cpu_time\": %float,$", MR_Next},
-                       {"\"time_unit\": \"ns\",$", MR_Next},
-                       {"\"Bar\": %float,$", MR_Next},
-                       {"\"Baz\": %float,$", MR_Next},
-                       {"\"Foo\": %float$", MR_Next},
-                       {"}", MR_Next}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_CounterSet0_Tabular/threads:%int\",$"},
+           {"\"run_name\": \"BM_CounterSet0_Tabular/threads:%int\",$", MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"Bar\": %float,$", MR_Next},
+           {"\"Baz\": %float,$", MR_Next},
+           {"\"Foo\": %float$", MR_Next},
+           {"}", MR_Next}});
 ADD_CASES(TC_CSVOut, {{"^\"BM_CounterSet0_Tabular/threads:%int\",%csv_report,"
                        "%float,,%float,%float,,"}});
 // VS2013 does not allow this function to be passed as a lambda argument
@@ -189,15 +199,18 @@ void BM_CounterSet1_Tabular(benchmark::State& state) {
   });
 }
 BENCHMARK(BM_CounterSet1_Tabular)->ThreadRange(1, 16);
-ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_CounterSet1_Tabular/threads:%int\",$"},
-                       {"\"iterations\": %int,$", MR_Next},
-                       {"\"real_time\": %float,$", MR_Next},
-                       {"\"cpu_time\": %float,$", MR_Next},
-                       {"\"time_unit\": \"ns\",$", MR_Next},
-                       {"\"Bar\": %float,$", MR_Next},
-                       {"\"Baz\": %float,$", MR_Next},
-                       {"\"Foo\": %float$", MR_Next},
-                       {"}", MR_Next}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_CounterSet1_Tabular/threads:%int\",$"},
+           {"\"run_name\": \"BM_CounterSet1_Tabular/threads:%int\",$", MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"Bar\": %float,$", MR_Next},
+           {"\"Baz\": %float,$", MR_Next},
+           {"\"Foo\": %float$", MR_Next},
+           {"}", MR_Next}});
 ADD_CASES(TC_CSVOut, {{"^\"BM_CounterSet1_Tabular/threads:%int\",%csv_report,"
                        "%float,,%float,%float,,"}});
 // VS2013 does not allow this function to be passed as a lambda argument
@@ -225,15 +238,18 @@ void BM_CounterSet2_Tabular(benchmark::State& state) {
   });
 }
 BENCHMARK(BM_CounterSet2_Tabular)->ThreadRange(1, 16);
-ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_CounterSet2_Tabular/threads:%int\",$"},
-                       {"\"iterations\": %int,$", MR_Next},
-                       {"\"real_time\": %float,$", MR_Next},
-                       {"\"cpu_time\": %float,$", MR_Next},
-                       {"\"time_unit\": \"ns\",$", MR_Next},
-                       {"\"Bat\": %float,$", MR_Next},
-                       {"\"Baz\": %float,$", MR_Next},
-                       {"\"Foo\": %float$", MR_Next},
-                       {"}", MR_Next}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_CounterSet2_Tabular/threads:%int\",$"},
+           {"\"run_name\": \"BM_CounterSet2_Tabular/threads:%int\",$", MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"Bat\": %float,$", MR_Next},
+           {"\"Baz\": %float,$", MR_Next},
+           {"\"Foo\": %float$", MR_Next},
+           {"}", MR_Next}});
 ADD_CASES(TC_CSVOut, {{"^\"BM_CounterSet2_Tabular/threads:%int\",%csv_report,"
                        ",%float,%float,%float,,"}});
 // VS2013 does not allow this function to be passed as a lambda argument
diff --git a/test/user_counters_test.cc b/test/user_counters_test.cc
index 7f7ccb9..bb0d6b4 100644
--- a/test/user_counters_test.cc
+++ b/test/user_counters_test.cc
@@ -32,6 +32,8 @@ BENCHMARK(BM_Counters_Simple);
 ADD_CASES(TC_ConsoleOut,
           {{"^BM_Counters_Simple %console_report bar=%hrfloat foo=%hrfloat$"}});
 ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Counters_Simple\",$"},
+                       {"\"run_name\": \"BM_Counters_Simple\",$", MR_Next},
+                       {"\"run_type\": \"iteration\",$", MR_Next},
                        {"\"iterations\": %int,$", MR_Next},
                        {"\"real_time\": %float,$", MR_Next},
                        {"\"cpu_time\": %float,$", MR_Next},
@@ -66,19 +68,22 @@ void BM_Counters_WithBytesAndItemsPSec(benchmark::State& state) {
   state.SetItemsProcessed(150);
 }
 BENCHMARK(BM_Counters_WithBytesAndItemsPSec);
-ADD_CASES(TC_ConsoleOut,
-          {{"^BM_Counters_WithBytesAndItemsPSec %console_report "
-            "bar=%hrfloat foo=%hrfloat +%hrfloatB/s +%hrfloat items/s$"}});
-ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Counters_WithBytesAndItemsPSec\",$"},
-                       {"\"iterations\": %int,$", MR_Next},
-                       {"\"real_time\": %float,$", MR_Next},
-                       {"\"cpu_time\": %float,$", MR_Next},
-                       {"\"time_unit\": \"ns\",$", MR_Next},
-                       {"\"bytes_per_second\": %float,$", MR_Next},
-                       {"\"items_per_second\": %float,$", MR_Next},
-                       {"\"bar\": %float,$", MR_Next},
-                       {"\"foo\": %float$", MR_Next},
-                       {"}", MR_Next}});
+ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_WithBytesAndItemsPSec %console_report "
+                           "bar=%hrfloat bytes_per_second=%hrfloat/s "
+                           "foo=%hrfloat items_per_second=%hrfloat/s$"}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_Counters_WithBytesAndItemsPSec\",$"},
+           {"\"run_name\": \"BM_Counters_WithBytesAndItemsPSec\",$", MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"bar\": %float,$", MR_Next},
+           {"\"bytes_per_second\": %float,$", MR_Next},
+           {"\"foo\": %float,$", MR_Next},
+           {"\"items_per_second\": %float$", MR_Next},
+           {"}", MR_Next}});
 ADD_CASES(TC_CSVOut, {{"^\"BM_Counters_WithBytesAndItemsPSec\","
                        "%csv_bytes_items_report,%float,%float$"}});
 // VS2013 does not allow this function to be passed as a lambda argument
@@ -110,6 +115,8 @@ ADD_CASES(
     TC_ConsoleOut,
     {{"^BM_Counters_Rate %console_report bar=%hrfloat/s foo=%hrfloat/s$"}});
 ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Counters_Rate\",$"},
+                       {"\"run_name\": \"BM_Counters_Rate\",$", MR_Next},
+                       {"\"run_type\": \"iteration\",$", MR_Next},
                        {"\"iterations\": %int,$", MR_Next},
                        {"\"real_time\": %float,$", MR_Next},
                        {"\"cpu_time\": %float,$", MR_Next},
@@ -141,14 +148,17 @@ void BM_Counters_Threads(benchmark::State& state) {
 BENCHMARK(BM_Counters_Threads)->ThreadRange(1, 8);
 ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_Threads/threads:%int %console_report "
                            "bar=%hrfloat foo=%hrfloat$"}});
-ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Counters_Threads/threads:%int\",$"},
-                       {"\"iterations\": %int,$", MR_Next},
-                       {"\"real_time\": %float,$", MR_Next},
-                       {"\"cpu_time\": %float,$", MR_Next},
-                       {"\"time_unit\": \"ns\",$", MR_Next},
-                       {"\"bar\": %float,$", MR_Next},
-                       {"\"foo\": %float$", MR_Next},
-                       {"}", MR_Next}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_Counters_Threads/threads:%int\",$"},
+           {"\"run_name\": \"BM_Counters_Threads/threads:%int\",$", MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"bar\": %float,$", MR_Next},
+           {"\"foo\": %float$", MR_Next},
+           {"}", MR_Next}});
 ADD_CASES(
     TC_CSVOut,
     {{"^\"BM_Counters_Threads/threads:%int\",%csv_report,%float,%float$"}});
@@ -174,14 +184,17 @@ void BM_Counters_AvgThreads(benchmark::State& state) {
 BENCHMARK(BM_Counters_AvgThreads)->ThreadRange(1, 8);
 ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_AvgThreads/threads:%int "
                            "%console_report bar=%hrfloat foo=%hrfloat$"}});
-ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Counters_AvgThreads/threads:%int\",$"},
-                       {"\"iterations\": %int,$", MR_Next},
-                       {"\"real_time\": %float,$", MR_Next},
-                       {"\"cpu_time\": %float,$", MR_Next},
-                       {"\"time_unit\": \"ns\",$", MR_Next},
-                       {"\"bar\": %float,$", MR_Next},
-                       {"\"foo\": %float$", MR_Next},
-                       {"}", MR_Next}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_Counters_AvgThreads/threads:%int\",$"},
+           {"\"run_name\": \"BM_Counters_AvgThreads/threads:%int\",$", MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"bar\": %float,$", MR_Next},
+           {"\"foo\": %float$", MR_Next},
+           {"}", MR_Next}});
 ADD_CASES(
     TC_CSVOut,
     {{"^\"BM_Counters_AvgThreads/threads:%int\",%csv_report,%float,%float$"}});
@@ -210,6 +223,9 @@ ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_AvgThreadsRate/threads:%int "
                            "%console_report bar=%hrfloat/s foo=%hrfloat/s$"}});
 ADD_CASES(TC_JSONOut,
           {{"\"name\": \"BM_Counters_AvgThreadsRate/threads:%int\",$"},
+           {"\"run_name\": \"BM_Counters_AvgThreadsRate/threads:%int\",$",
+            MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
            {"\"iterations\": %int,$", MR_Next},
            {"\"real_time\": %float,$", MR_Next},
            {"\"cpu_time\": %float,$", MR_Next},
@@ -242,14 +258,17 @@ void BM_Counters_IterationInvariant(benchmark::State& state) {
 BENCHMARK(BM_Counters_IterationInvariant);
 ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_IterationInvariant %console_report "
                            "bar=%hrfloat foo=%hrfloat$"}});
-ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Counters_IterationInvariant\",$"},
-                       {"\"iterations\": %int,$", MR_Next},
-                       {"\"real_time\": %float,$", MR_Next},
-                       {"\"cpu_time\": %float,$", MR_Next},
-                       {"\"time_unit\": \"ns\",$", MR_Next},
-                       {"\"bar\": %float,$", MR_Next},
-                       {"\"foo\": %float$", MR_Next},
-                       {"}", MR_Next}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_Counters_IterationInvariant\",$"},
+           {"\"run_name\": \"BM_Counters_IterationInvariant\",$", MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"bar\": %float,$", MR_Next},
+           {"\"foo\": %float$", MR_Next},
+           {"}", MR_Next}});
 ADD_CASES(TC_CSVOut,
           {{"^\"BM_Counters_IterationInvariant\",%csv_report,%float,%float$"}});
 // VS2013 does not allow this function to be passed as a lambda argument
@@ -281,6 +300,9 @@ ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_kIsIterationInvariantRate "
                            "%console_report bar=%hrfloat/s foo=%hrfloat/s$"}});
 ADD_CASES(TC_JSONOut,
           {{"\"name\": \"BM_Counters_kIsIterationInvariantRate\",$"},
+           {"\"run_name\": \"BM_Counters_kIsIterationInvariantRate\",$",
+            MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
            {"\"iterations\": %int,$", MR_Next},
            {"\"real_time\": %float,$", MR_Next},
            {"\"cpu_time\": %float,$", MR_Next},
@@ -316,14 +338,17 @@ void BM_Counters_AvgIterations(benchmark::State& state) {
 BENCHMARK(BM_Counters_AvgIterations);
 ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_AvgIterations %console_report "
                            "bar=%hrfloat foo=%hrfloat$"}});
-ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Counters_AvgIterations\",$"},
-                       {"\"iterations\": %int,$", MR_Next},
-                       {"\"real_time\": %float,$", MR_Next},
-                       {"\"cpu_time\": %float,$", MR_Next},
-                       {"\"time_unit\": \"ns\",$", MR_Next},
-                       {"\"bar\": %float,$", MR_Next},
-                       {"\"foo\": %float$", MR_Next},
-                       {"}", MR_Next}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_Counters_AvgIterations\",$"},
+           {"\"run_name\": \"BM_Counters_AvgIterations\",$", MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"bar\": %float,$", MR_Next},
+           {"\"foo\": %float$", MR_Next},
+           {"}", MR_Next}});
 ADD_CASES(TC_CSVOut,
           {{"^\"BM_Counters_AvgIterations\",%csv_report,%float,%float$"}});
 // VS2013 does not allow this function to be passed as a lambda argument
@@ -351,14 +376,17 @@ void BM_Counters_kAvgIterationsRate(benchmark::State& state) {
 BENCHMARK(BM_Counters_kAvgIterationsRate);
 ADD_CASES(TC_ConsoleOut, {{"^BM_Counters_kAvgIterationsRate "
                            "%console_report bar=%hrfloat/s foo=%hrfloat/s$"}});
-ADD_CASES(TC_JSONOut, {{"\"name\": \"BM_Counters_kAvgIterationsRate\",$"},
-                       {"\"iterations\": %int,$", MR_Next},
-                       {"\"real_time\": %float,$", MR_Next},
-                       {"\"cpu_time\": %float,$", MR_Next},
-                       {"\"time_unit\": \"ns\",$", MR_Next},
-                       {"\"bar\": %float,$", MR_Next},
-                       {"\"foo\": %float$", MR_Next},
-                       {"}", MR_Next}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_Counters_kAvgIterationsRate\",$"},
+           {"\"run_name\": \"BM_Counters_kAvgIterationsRate\",$", MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"bar\": %float,$", MR_Next},
+           {"\"foo\": %float$", MR_Next},
+           {"}", MR_Next}});
 ADD_CASES(TC_CSVOut, {{"^\"BM_Counters_kAvgIterationsRate\",%csv_report,"
                        "%float,%float$"}});
 // VS2013 does not allow this function to be passed as a lambda argument
diff --git a/test/user_counters_thousands_test.cc b/test/user_counters_thousands_test.cc
new file mode 100644
index 0000000..fa0ef97
--- /dev/null
+++ b/test/user_counters_thousands_test.cc
@@ -0,0 +1,161 @@
+
+#undef NDEBUG
+
+#include "benchmark/benchmark.h"
+#include "output_test.h"
+
+// ========================================================================= //
+// ------------------------ Thousands Customisation ------------------------ //
+// ========================================================================= //
+
+void BM_Counters_Thousands(benchmark::State& state) {
+  for (auto _ : state) {
+  }
+  namespace bm = benchmark;
+  state.counters.insert({
+      {"t0_1000000DefaultBase",
+       bm::Counter(1000 * 1000, bm::Counter::kDefaults)},
+      {"t1_1000000Base1000", bm::Counter(1000 * 1000, bm::Counter::kDefaults,
+                                         benchmark::Counter::OneK::kIs1000)},
+      {"t2_1000000Base1024", bm::Counter(1000 * 1000, bm::Counter::kDefaults,
+                                         benchmark::Counter::OneK::kIs1024)},
+      {"t3_1048576Base1000", bm::Counter(1024 * 1024, bm::Counter::kDefaults,
+                                         benchmark::Counter::OneK::kIs1000)},
+      {"t4_1048576Base1024", bm::Counter(1024 * 1024, bm::Counter::kDefaults,
+                                         benchmark::Counter::OneK::kIs1024)},
+  });
+}
+BENCHMARK(BM_Counters_Thousands)->Repetitions(2);
+ADD_CASES(
+    TC_ConsoleOut,
+    {
+        {"^BM_Counters_Thousands/repeats:2 %console_report "
+         "t0_1000000DefaultBase=1000k "
+         "t1_1000000Base1000=1000k t2_1000000Base1024=976.56[23]k "
+         "t3_1048576Base1000=1048.58k t4_1048576Base1024=1024k$"},
+        {"^BM_Counters_Thousands/repeats:2 %console_report "
+         "t0_1000000DefaultBase=1000k "
+         "t1_1000000Base1000=1000k t2_1000000Base1024=976.56[23]k "
+         "t3_1048576Base1000=1048.58k t4_1048576Base1024=1024k$"},
+        {"^BM_Counters_Thousands/repeats:2_mean %console_report "
+         "t0_1000000DefaultBase=1000k t1_1000000Base1000=1000k "
+         "t2_1000000Base1024=976.56[23]k t3_1048576Base1000=1048.58k "
+         "t4_1048576Base1024=1024k$"},
+        {"^BM_Counters_Thousands/repeats:2_median %console_report "
+         "t0_1000000DefaultBase=1000k t1_1000000Base1000=1000k "
+         "t2_1000000Base1024=976.56[23]k t3_1048576Base1000=1048.58k "
+         "t4_1048576Base1024=1024k$"},
+        {"^BM_Counters_Thousands/repeats:2_stddev %console_time_only_report [ "
+         "]*2 t0_1000000DefaultBase=0 t1_1000000Base1000=0 "
+         "t2_1000000Base1024=0 t3_1048576Base1000=0 t4_1048576Base1024=0$"},
+    });
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_Counters_Thousands/repeats:2\",$"},
+           {"\"run_name\": \"BM_Counters_Thousands/repeats:2\",$", MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"t0_1000000DefaultBase\": 1\\.(0)*e\\+(0)*6,$", MR_Next},
+           {"\"t1_1000000Base1000\": 1\\.(0)*e\\+(0)*6,$", MR_Next},
+           {"\"t2_1000000Base1024\": 1\\.(0)*e\\+(0)*6,$", MR_Next},
+           {"\"t3_1048576Base1000\": 1\\.048576(0)*e\\+(0)*6,$", MR_Next},
+           {"\"t4_1048576Base1024\": 1\\.048576(0)*e\\+(0)*6$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_Counters_Thousands/repeats:2\",$"},
+           {"\"run_name\": \"BM_Counters_Thousands/repeats:2\",$", MR_Next},
+           {"\"run_type\": \"iteration\",$", MR_Next},
+           {"\"iterations\": %int,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"t0_1000000DefaultBase\": 1\\.(0)*e\\+(0)*6,$", MR_Next},
+           {"\"t1_1000000Base1000\": 1\\.(0)*e\\+(0)*6,$", MR_Next},
+           {"\"t2_1000000Base1024\": 1\\.(0)*e\\+(0)*6,$", MR_Next},
+           {"\"t3_1048576Base1000\": 1\\.048576(0)*e\\+(0)*6,$", MR_Next},
+           {"\"t4_1048576Base1024\": 1\\.048576(0)*e\\+(0)*6$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_Counters_Thousands/repeats:2_mean\",$"},
+           {"\"run_name\": \"BM_Counters_Thousands/repeats:2\",$", MR_Next},
+           {"\"run_type\": \"aggregate\",$", MR_Next},
+           {"\"aggregate_name\": \"mean\",$", MR_Next},
+           {"\"iterations\": 2,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"t0_1000000DefaultBase\": 1\\.(0)*e\\+(0)*6,$", MR_Next},
+           {"\"t1_1000000Base1000\": 1\\.(0)*e\\+(0)*6,$", MR_Next},
+           {"\"t2_1000000Base1024\": 1\\.(0)*e\\+(0)*6,$", MR_Next},
+           {"\"t3_1048576Base1000\": 1\\.048576(0)*e\\+(0)*6,$", MR_Next},
+           {"\"t4_1048576Base1024\": 1\\.048576(0)*e\\+(0)*6$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_Counters_Thousands/repeats:2_median\",$"},
+           {"\"run_name\": \"BM_Counters_Thousands/repeats:2\",$", MR_Next},
+           {"\"run_type\": \"aggregate\",$", MR_Next},
+           {"\"aggregate_name\": \"median\",$", MR_Next},
+           {"\"iterations\": 2,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"t0_1000000DefaultBase\": 1\\.(0)*e\\+(0)*6,$", MR_Next},
+           {"\"t1_1000000Base1000\": 1\\.(0)*e\\+(0)*6,$", MR_Next},
+           {"\"t2_1000000Base1024\": 1\\.(0)*e\\+(0)*6,$", MR_Next},
+           {"\"t3_1048576Base1000\": 1\\.048576(0)*e\\+(0)*6,$", MR_Next},
+           {"\"t4_1048576Base1024\": 1\\.048576(0)*e\\+(0)*6$", MR_Next},
+           {"}", MR_Next}});
+ADD_CASES(TC_JSONOut,
+          {{"\"name\": \"BM_Counters_Thousands/repeats:2_stddev\",$"},
+           {"\"run_name\": \"BM_Counters_Thousands/repeats:2\",$", MR_Next},
+           {"\"run_type\": \"aggregate\",$", MR_Next},
+           {"\"aggregate_name\": \"stddev\",$", MR_Next},
+           {"\"iterations\": 2,$", MR_Next},
+           {"\"real_time\": %float,$", MR_Next},
+           {"\"cpu_time\": %float,$", MR_Next},
+           {"\"time_unit\": \"ns\",$", MR_Next},
+           {"\"t0_1000000DefaultBase\": 0\\.(0)*e\\+(0)*,$", MR_Next},
+           {"\"t1_1000000Base1000\": 0\\.(0)*e\\+(0)*,$", MR_Next},
+           {"\"t2_1000000Base1024\": 0\\.(0)*e\\+(0)*,$", MR_Next},
+           {"\"t3_1048576Base1000\": 0\\.(0)*e\\+(0)*,$", MR_Next},
+           {"\"t4_1048576Base1024\": 0\\.(0)*e\\+(0)*$", MR_Next},
+           {"}", MR_Next}});
+
+ADD_CASES(
+    TC_CSVOut,
+    {{"^\"BM_Counters_Thousands/"
+      "repeats:2\",%csv_report,1e\\+(0)*6,1e\\+(0)*6,1e\\+(0)*6,1\\.04858e\\+("
+      "0)*6,1\\.04858e\\+(0)*6$"},
+     {"^\"BM_Counters_Thousands/"
+      "repeats:2\",%csv_report,1e\\+(0)*6,1e\\+(0)*6,1e\\+(0)*6,1\\.04858e\\+("
+      "0)*6,1\\.04858e\\+(0)*6$"},
+     {"^\"BM_Counters_Thousands/"
+      "repeats:2_mean\",%csv_report,1e\\+(0)*6,1e\\+(0)*6,1e\\+(0)*6,1\\."
+      "04858e\\+(0)*6,1\\.04858e\\+(0)*6$"},
+     {"^\"BM_Counters_Thousands/"
+      "repeats:2_median\",%csv_report,1e\\+(0)*6,1e\\+(0)*6,1e\\+(0)*6,1\\."
+      "04858e\\+(0)*6,1\\.04858e\\+(0)*6$"},
+     {"^\"BM_Counters_Thousands/repeats:2_stddev\",%csv_report,0,0,0,0,0$"}});
+// VS2013 does not allow this function to be passed as a lambda argument
+// to CHECK_BENCHMARK_RESULTS()
+void CheckThousands(Results const& e) {
+  if (e.name != "BM_Counters_Thousands/repeats:2")
+    return;  // Do not check the aggregates!
+
+  // check that the values are within 0.01% of the expected values
+  CHECK_FLOAT_COUNTER_VALUE(e, "t0_1000000DefaultBase", EQ, 1000 * 1000,
+                            0.0001);
+  CHECK_FLOAT_COUNTER_VALUE(e, "t1_1000000Base1000", EQ, 1000 * 1000, 0.0001);
+  CHECK_FLOAT_COUNTER_VALUE(e, "t2_1000000Base1024", EQ, 1000 * 1000, 0.0001);
+  CHECK_FLOAT_COUNTER_VALUE(e, "t3_1048576Base1000", EQ, 1024 * 1024, 0.0001);
+  CHECK_FLOAT_COUNTER_VALUE(e, "t4_1048576Base1024", EQ, 1024 * 1024, 0.0001);
+}
+CHECK_BENCHMARK_RESULTS("BM_Counters_Thousands", &CheckThousands);
+
+// ========================================================================= //
+// --------------------------- TEST CASES END ------------------------------ //
+// ========================================================================= //
+
+int main(int argc, char* argv[]) { RunOutputTests(argc, argv); }
diff --git a/tools/compare.py b/tools/compare.py
index d27e24b..9ff5c14 100755
--- a/tools/compare.py
+++ b/tools/compare.py
@@ -36,6 +36,17 @@ def create_parser():
     parser = ArgumentParser(
         description='versatile benchmark output compare tool')
 
+    parser.add_argument(
+        '-a',
+        '--display_aggregates_only',
+        dest='display_aggregates_only',
+        action="store_true",
+        help="If there are repetitions, by default, we display everything - the"
+             " actual runs, and the aggregates computed. Sometimes, it is "
+             "desirable to only view the aggregates. E.g. when there are a lot "
+             "of repetitions. Do note that only the display is affected. "
+             "Internally, all the actual runs are still used, e.g. for U test.")
+
     utest = parser.add_argument_group()
     utest.add_argument(
         '--no-utest',
@@ -200,6 +211,9 @@ def main():
 
     check_inputs(test_baseline, test_contender, benchmark_options)
 
+    if args.display_aggregates_only:
+        benchmark_options += ['--benchmark_display_aggregates_only=true']
+
     options_baseline = []
     options_contender = []
 
@@ -223,7 +237,8 @@ def main():
 
     # Diff and output
     output_lines = gbench.report.generate_difference_report(
-        json1, json2, args.utest, args.utest_alpha)
+        json1, json2, args.display_aggregates_only,
+        args.utest, args.utest_alpha)
     print(description)
     for ln in output_lines:
         print(ln)
@@ -246,6 +261,7 @@ class TestParser(unittest.TestCase):
     def test_benchmarks_basic(self):
         parsed = self.parser.parse_args(
             ['benchmarks', self.testInput0, self.testInput1])
+        self.assertFalse(parsed.display_aggregates_only)
         self.assertTrue(parsed.utest)
         self.assertEqual(parsed.mode, 'benchmarks')
         self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
@@ -255,6 +271,7 @@ class TestParser(unittest.TestCase):
     def test_benchmarks_basic_without_utest(self):
         parsed = self.parser.parse_args(
             ['--no-utest', 'benchmarks', self.testInput0, self.testInput1])
+        self.assertFalse(parsed.display_aggregates_only)
         self.assertFalse(parsed.utest)
         self.assertEqual(parsed.utest_alpha, 0.05)
         self.assertEqual(parsed.mode, 'benchmarks')
@@ -262,9 +279,20 @@ class TestParser(unittest.TestCase):
         self.assertEqual(parsed.test_contender[0].name, self.testInput1)
         self.assertFalse(parsed.benchmark_options)
 
+    def test_benchmarks_basic_display_aggregates_only(self):
+        parsed = self.parser.parse_args(
+            ['-a', 'benchmarks', self.testInput0, self.testInput1])
+        self.assertTrue(parsed.display_aggregates_only)
+        self.assertTrue(parsed.utest)
+        self.assertEqual(parsed.mode, 'benchmarks')
+        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
+        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
+        self.assertFalse(parsed.benchmark_options)
+
     def test_benchmarks_basic_with_utest_alpha(self):
         parsed = self.parser.parse_args(
             ['--alpha=0.314', 'benchmarks', self.testInput0, self.testInput1])
+        self.assertFalse(parsed.display_aggregates_only)
         self.assertTrue(parsed.utest)
         self.assertEqual(parsed.utest_alpha, 0.314)
         self.assertEqual(parsed.mode, 'benchmarks')
@@ -275,6 +303,7 @@ class TestParser(unittest.TestCase):
     def test_benchmarks_basic_without_utest_with_utest_alpha(self):
         parsed = self.parser.parse_args(
             ['--no-utest', '--alpha=0.314', 'benchmarks', self.testInput0, self.testInput1])
+        self.assertFalse(parsed.display_aggregates_only)
         self.assertFalse(parsed.utest)
         self.assertEqual(parsed.utest_alpha, 0.314)
         self.assertEqual(parsed.mode, 'benchmarks')
@@ -285,6 +314,7 @@ class TestParser(unittest.TestCase):
     def test_benchmarks_with_remainder(self):
         parsed = self.parser.parse_args(
             ['benchmarks', self.testInput0, self.testInput1, 'd'])
+        self.assertFalse(parsed.display_aggregates_only)
         self.assertTrue(parsed.utest)
         self.assertEqual(parsed.mode, 'benchmarks')
         self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
@@ -294,6 +324,7 @@ class TestParser(unittest.TestCase):
     def test_benchmarks_with_remainder_after_doubleminus(self):
         parsed = self.parser.parse_args(
             ['benchmarks', self.testInput0, self.testInput1, '--', 'e'])
+        self.assertFalse(parsed.display_aggregates_only)
         self.assertTrue(parsed.utest)
         self.assertEqual(parsed.mode, 'benchmarks')
         self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
@@ -303,6 +334,7 @@ class TestParser(unittest.TestCase):
     def test_filters_basic(self):
         parsed = self.parser.parse_args(
             ['filters', self.testInput0, 'c', 'd'])
+        self.assertFalse(parsed.display_aggregates_only)
         self.assertTrue(parsed.utest)
         self.assertEqual(parsed.mode, 'filters')
         self.assertEqual(parsed.test[0].name, self.testInput0)
@@ -313,6 +345,7 @@ class TestParser(unittest.TestCase):
     def test_filters_with_remainder(self):
         parsed = self.parser.parse_args(
             ['filters', self.testInput0, 'c', 'd', 'e'])
+        self.assertFalse(parsed.display_aggregates_only)
         self.assertTrue(parsed.utest)
         self.assertEqual(parsed.mode, 'filters')
         self.assertEqual(parsed.test[0].name, self.testInput0)
@@ -323,6 +356,7 @@ class TestParser(unittest.TestCase):
     def test_filters_with_remainder_after_doubleminus(self):
         parsed = self.parser.parse_args(
             ['filters', self.testInput0, 'c', 'd', '--', 'f'])
+        self.assertFalse(parsed.display_aggregates_only)
         self.assertTrue(parsed.utest)
         self.assertEqual(parsed.mode, 'filters')
         self.assertEqual(parsed.test[0].name, self.testInput0)
@@ -333,6 +367,7 @@ class TestParser(unittest.TestCase):
     def test_benchmarksfiltered_basic(self):
         parsed = self.parser.parse_args(
             ['benchmarksfiltered', self.testInput0, 'c', self.testInput1, 'e'])
+        self.assertFalse(parsed.display_aggregates_only)
         self.assertTrue(parsed.utest)
         self.assertEqual(parsed.mode, 'benchmarksfiltered')
         self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
@@ -344,6 +379,7 @@ class TestParser(unittest.TestCase):
     def test_benchmarksfiltered_with_remainder(self):
         parsed = self.parser.parse_args(
             ['benchmarksfiltered', self.testInput0, 'c', self.testInput1, 'e', 'f'])
+        self.assertFalse(parsed.display_aggregates_only)
         self.assertTrue(parsed.utest)
         self.assertEqual(parsed.mode, 'benchmarksfiltered')
         self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
@@ -355,6 +391,7 @@ class TestParser(unittest.TestCase):
     def test_benchmarksfiltered_with_remainder_after_doubleminus(self):
         parsed = self.parser.parse_args(
             ['benchmarksfiltered', self.testInput0, 'c', self.testInput1, 'e', '--', 'g'])
+        self.assertFalse(parsed.display_aggregates_only)
         self.assertTrue(parsed.utest)
         self.assertEqual(parsed.mode, 'benchmarksfiltered')
         self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
@@ -365,7 +402,7 @@ class TestParser(unittest.TestCase):
 
 
 if __name__ == '__main__':
-    # unittest.main()
+    #unittest.main()
     main()
 
 # vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4
diff --git a/tools/compare_bench.py b/tools/compare_bench.py
deleted file mode 100755
index 7bbf0d0..0000000
--- a/tools/compare_bench.py
+++ /dev/null
@@ -1,67 +0,0 @@
-#!/usr/bin/env python
-"""
-compare_bench.py - Compare two benchmarks or their results and report the
-                   difference.
-"""
-import argparse
-from argparse import ArgumentParser
-import sys
-import gbench
-from gbench import util, report
-from gbench.util import *
-
-def check_inputs(in1, in2, flags):
-    """
-    Perform checking on the user provided inputs and diagnose any abnormalities
-    """
-    in1_kind, in1_err = classify_input_file(in1)
-    in2_kind, in2_err = classify_input_file(in2)
-    output_file = find_benchmark_flag('--benchmark_out=', flags)
-    output_type = find_benchmark_flag('--benchmark_out_format=', flags)
-    if in1_kind == IT_Executable and in2_kind == IT_Executable and output_file:
-        print(("WARNING: '--benchmark_out=%s' will be passed to both "
-              "benchmarks causing it to be overwritten") % output_file)
-    if in1_kind == IT_JSON and in2_kind == IT_JSON and len(flags) > 0:
-        print("WARNING: passing --benchmark flags has no effect since both "
-              "inputs are JSON")
-    if output_type is not None and output_type != 'json':
-        print(("ERROR: passing '--benchmark_out_format=%s' to 'compare_bench.py`"
-              " is not supported.") % output_type)
-        sys.exit(1)
-
-
-def main():
-    parser = ArgumentParser(
-        description='compare the results of two benchmarks')
-    parser.add_argument(
-        'test1', metavar='test1', type=str, nargs=1,
-        help='A benchmark executable or JSON output file')
-    parser.add_argument(
-        'test2', metavar='test2', type=str, nargs=1,
-        help='A benchmark executable or JSON output file')
-    parser.add_argument(
-        'benchmark_options', metavar='benchmark_options', nargs=argparse.REMAINDER,
-        help='Arguments to pass when running benchmark executables'
-    )
-    args, unknown_args = parser.parse_known_args()
-    # Parse the command line flags
-    test1 = args.test1[0]
-    test2 = args.test2[0]
-    if unknown_args:
-        # should never happen
-        print("Unrecognized positional argument arguments: '%s'"
-              % unknown_args)
-        exit(1)
-    benchmark_options = args.benchmark_options
-    check_inputs(test1, test2, benchmark_options)
-    # Run the benchmarks and report the results
-    json1 = gbench.util.run_or_load_benchmark(test1, benchmark_options)
-    json2 = gbench.util.run_or_load_benchmark(test2, benchmark_options)
-    output_lines = gbench.report.generate_difference_report(json1, json2)
-    print('Comparing %s to %s' % (test1, test2))
-    for ln in output_lines:
-        print(ln)
-
-
-if __name__ == '__main__':
-    main()
diff --git a/tools/gbench/Inputs/test3_run0.json b/tools/gbench/Inputs/test3_run0.json
index ca793f3..49f8b06 100644
--- a/tools/gbench/Inputs/test3_run0.json
+++ b/tools/gbench/Inputs/test3_run0.json
@@ -9,6 +9,7 @@
   "benchmarks": [
     {
       "name": "BM_One",
+      "run_type": "aggregate",
       "iterations": 1000,
       "real_time": 10,
       "cpu_time": 100,
@@ -25,15 +26,40 @@
       "name": "BM_Two",
       "iterations": 1000,
       "real_time": 8,
+      "cpu_time": 86,
+      "time_unit": "ns"
+    },
+    {
+      "name": "short",
+      "run_type": "aggregate",
+      "iterations": 1000,
+      "real_time": 8,
       "cpu_time": 80,
       "time_unit": "ns"
     },
     {
       "name": "short",
+      "run_type": "aggregate",
+      "iterations": 1000,
+      "real_time": 8,
+      "cpu_time": 77,
+      "time_unit": "ns"
+    },
+    {
+      "name": "medium",
+      "run_type": "iteration",
       "iterations": 1000,
       "real_time": 8,
       "cpu_time": 80,
       "time_unit": "ns"
+    },
+    {
+      "name": "medium",
+      "run_type": "iteration",
+      "iterations": 1000,
+      "real_time": 9,
+      "cpu_time": 82,
+      "time_unit": "ns"
     }
   ]
 }
diff --git a/tools/gbench/Inputs/test3_run1.json b/tools/gbench/Inputs/test3_run1.json
index e5cf50c..acc5ba1 100644
--- a/tools/gbench/Inputs/test3_run1.json
+++ b/tools/gbench/Inputs/test3_run1.json
@@ -16,6 +16,7 @@
     },
     {
       "name": "BM_Two",
+      "run_type": "aggregate",
       "iterations": 1000,
       "real_time": 10,
       "cpu_time": 89,
@@ -25,14 +26,39 @@
       "name": "BM_Two",
       "iterations": 1000,
       "real_time": 7,
-      "cpu_time": 70,
+      "cpu_time": 72,
       "time_unit": "ns"
     },
     {
       "name": "short",
+      "run_type": "aggregate",
       "iterations": 1000,
-      "real_time": 8,
-      "cpu_time": 80,
+      "real_time": 7,
+      "cpu_time": 75,
+      "time_unit": "ns"
+    },
+    {
+      "name": "short",
+      "run_type": "aggregate",
+      "iterations": 762,
+      "real_time": 4.54,
+      "cpu_time": 66.6,
+      "time_unit": "ns"
+    },
+    {
+      "name": "short",
+      "run_type": "iteration",
+      "iterations": 1000,
+      "real_time": 800,
+      "cpu_time": 1,
+      "time_unit": "ns"
+    },
+    {
+      "name": "medium",
+      "run_type": "iteration",
+      "iterations": 1200,
+      "real_time": 5,
+      "cpu_time": 53,
       "time_unit": "ns"
     }
   ]
diff --git a/tools/gbench/report.py b/tools/gbench/report.py
index 4d03a54..28bab34 100644
--- a/tools/gbench/report.py
+++ b/tools/gbench/report.py
@@ -36,6 +36,7 @@ BC_UNDERLINE = BenchmarkColor('UNDERLINE', '\033[4m')
 
 UTEST_MIN_REPETITIONS = 2
 UTEST_OPTIMAL_REPETITIONS = 9  # Lowest reasonable number, More is better.
+UTEST_COL_NAME = "_pvalue"
 
 
 def color_format(use_color, fmt_str, *args, **kwargs):
@@ -93,9 +94,103 @@ def filter_benchmark(json_orig, family, replacement=""):
     return filtered
 
 
+def get_unique_benchmark_names(json):
+    """
+    While *keeping* the order, give all the unique 'names' used for benchmarks.
+    """
+    seen = set()
+    uniqued = [x['name'] for x in json['benchmarks']
+               if x['name'] not in seen and
+               (seen.add(x['name']) or True)]
+    return uniqued
+
+
+def intersect(list1, list2):
+    """
+    Given two lists, get a new list consisting of the elements only contained
+    in *both of the input lists*, while preserving the ordering.
+    """
+    return [x for x in list1 if x in list2]
+
+
+def partition_benchmarks(json1, json2):
+    """
+    While preserving the ordering, find benchmarks with the same names in
+    both of the inputs, and group them.
+    (i.e. partition/filter into groups with common name)
+    """
+    json1_unique_names = get_unique_benchmark_names(json1)
+    json2_unique_names = get_unique_benchmark_names(json2)
+    names = intersect(json1_unique_names, json2_unique_names)
+    partitions = []
+    for name in names:
+        # Pick the time unit from the first entry of the lhs benchmark.
+        time_unit = (x['time_unit']
+                     for x in json1['benchmarks'] if x['name'] == name).next()
+        # Filter by name and time unit.
+        lhs = [x for x in json1['benchmarks'] if x['name'] == name and
+               x['time_unit'] == time_unit]
+        rhs = [x for x in json2['benchmarks'] if x['name'] == name and
+               x['time_unit'] == time_unit]
+        partitions.append([lhs, rhs])
+    return partitions
+
+
+def extract_field(partition, field_name):
+    # The count of elements may be different. We want *all* of them.
+    lhs = [x[field_name] for x in partition[0]]
+    rhs = [x[field_name] for x in partition[1]]
+    return [lhs, rhs]
+
+
+def print_utest(partition, utest_alpha, first_col_width, use_color=True):
+    timings_time = extract_field(partition, 'real_time')
+    timings_cpu = extract_field(partition, 'cpu_time')
+
+    min_rep_cnt = min(len(timings_time[0]),
+                      len(timings_time[1]),
+                      len(timings_cpu[0]),
+                      len(timings_cpu[1]))
+
+    # Does *everything* has at least UTEST_MIN_REPETITIONS repetitions?
+    if min_rep_cnt < UTEST_MIN_REPETITIONS:
+        return []
+
+    def get_utest_color(pval):
+        return BC_FAIL if pval >= utest_alpha else BC_OKGREEN
+
+    time_pvalue = mannwhitneyu(
+        timings_time[0], timings_time[1], alternative='two-sided').pvalue
+    cpu_pvalue = mannwhitneyu(
+        timings_cpu[0], timings_cpu[1], alternative='two-sided').pvalue
+
+    dsc = "U Test, Repetitions: {} vs {}".format(
+        len(timings_cpu[0]), len(timings_cpu[1]))
+    dsc_color = BC_OKGREEN
+
+    if min_rep_cnt < UTEST_OPTIMAL_REPETITIONS:
+        dsc_color = BC_WARNING
+        dsc += ". WARNING: Results unreliable! {}+ repetitions recommended.".format(
+            UTEST_OPTIMAL_REPETITIONS)
+
+    special_str = "{}{:<{}s}{endc}{}{:16.4f}{endc}{}{:16.4f}{endc}{}      {}"
+
+    last_name = partition[0][0]['name']
+    return [color_format(use_color,
+                         special_str,
+                         BC_HEADER,
+                         "{}{}".format(last_name, UTEST_COL_NAME),
+                         first_col_width,
+                         get_utest_color(time_pvalue), time_pvalue,
+                         get_utest_color(cpu_pvalue), cpu_pvalue,
+                         dsc_color, dsc,
+                         endc=BC_ENDC)]
+
+
 def generate_difference_report(
         json1,
         json2,
+        display_aggregates_only=False,
         utest=False,
         utest_alpha=0.05,
         use_color=True):
@@ -112,103 +207,65 @@ def generate_difference_report(
                 return b
         return None
 
-    utest_col_name = "_pvalue"
     first_col_width = max(
         first_col_width,
         len('Benchmark'))
-    first_col_width += len(utest_col_name)
+    first_col_width += len(UTEST_COL_NAME)
     first_line = "{:<{}s}Time             CPU      Time Old      Time New       CPU Old       CPU New".format(
         'Benchmark', 12 + first_col_width)
     output_strs = [first_line, '-' * len(first_line)]
 
-    last_name = None
-    timings_time = [[], []]
-    timings_cpu = [[], []]
-
-    gen = (bn for bn in json1['benchmarks']
-           if 'real_time' in bn and 'cpu_time' in bn)
-    for bn in gen:
-        fmt_str = "{}{:<{}s}{endc}{}{:+16.4f}{endc}{}{:+16.4f}{endc}{:14.0f}{:14.0f}{endc}{:14.0f}{:14.0f}"
-        special_str = "{}{:<{}s}{endc}{}{:16.4f}{endc}{}{:16.4f}{endc}{}      {}"
-
-        if last_name is None:
-            last_name = bn['name']
-        if last_name != bn['name']:
-            if ((len(timings_time[0]) >= UTEST_MIN_REPETITIONS) and
-                (len(timings_time[1]) >= UTEST_MIN_REPETITIONS) and
-                (len(timings_cpu[0]) >= UTEST_MIN_REPETITIONS) and
-                    (len(timings_cpu[1]) >= UTEST_MIN_REPETITIONS)):
-                if utest:
-                    def get_utest_color(pval):
-                        if pval >= utest_alpha:
-                            return BC_FAIL
-                        else:
-                            return BC_OKGREEN
-                    time_pvalue = mannwhitneyu(
-                        timings_time[0], timings_time[1], alternative='two-sided').pvalue
-                    cpu_pvalue = mannwhitneyu(
-                        timings_cpu[0], timings_cpu[1], alternative='two-sided').pvalue
-                    dsc = "U Test, Repetitions: {}".format(len(timings_cpu[0]))
-                    dsc_color = BC_OKGREEN
-                    if len(timings_cpu[0]) < UTEST_OPTIMAL_REPETITIONS:
-                        dsc_color = BC_WARNING
-                        dsc += ". WARNING: Results unreliable! {}+ repetitions recommended.".format(
-                            UTEST_OPTIMAL_REPETITIONS)
-                    output_strs += [color_format(use_color,
-                                                 special_str,
-                                                 BC_HEADER,
-                                                 "{}{}".format(last_name,
-                                                               utest_col_name),
-                                                 first_col_width,
-                                                 get_utest_color(time_pvalue),
-                                                 time_pvalue,
-                                                 get_utest_color(cpu_pvalue),
-                                                 cpu_pvalue,
-                                                 dsc_color,
-                                                 dsc,
-                                                 endc=BC_ENDC)]
-            last_name = bn['name']
-            timings_time = [[], []]
-            timings_cpu = [[], []]
-
-        other_bench = find_test(bn['name'])
-        if not other_bench:
-            continue
-
-        if bn['time_unit'] != other_bench['time_unit']:
-            continue
+    partitions = partition_benchmarks(json1, json2)
+    for partition in partitions:
+        # Careful, we may have different repetition count.
+        for i in range(min(len(partition[0]), len(partition[1]))):
+            bn = partition[0][i]
+            other_bench = partition[1][i]
+
+            # *If* we were asked to only display aggregates,
+            # and if it is non-aggregate, then skip it.
+            if display_aggregates_only and 'run_type' in bn and 'run_type' in other_bench:
+                assert bn['run_type'] == other_bench['run_type']
+                if bn['run_type'] != 'aggregate':
+                    continue
+
+            fmt_str = "{}{:<{}s}{endc}{}{:+16.4f}{endc}{}{:+16.4f}{endc}{:14.0f}{:14.0f}{endc}{:14.0f}{:14.0f}"
+
+            def get_color(res):
+                if res > 0.05:
+                    return BC_FAIL
+                elif res > -0.07:
+                    return BC_WHITE
+                else:
+                    return BC_CYAN
+
+            tres = calculate_change(bn['real_time'], other_bench['real_time'])
+            cpures = calculate_change(bn['cpu_time'], other_bench['cpu_time'])
+            output_strs += [color_format(use_color,
+                                         fmt_str,
+                                         BC_HEADER,
+                                         bn['name'],
+                                         first_col_width,
+                                         get_color(tres),
+                                         tres,
+                                         get_color(cpures),
+                                         cpures,
+                                         bn['real_time'],
+                                         other_bench['real_time'],
+                                         bn['cpu_time'],
+                                         other_bench['cpu_time'],
+                                         endc=BC_ENDC)]
+
+        # After processing the whole partition, if requested, do the U test.
+        if utest:
+            output_strs += print_utest(partition,
+                                       utest_alpha=utest_alpha,
+                                       first_col_width=first_col_width,
+                                       use_color=use_color)
 
-        def get_color(res):
-            if res > 0.05:
-                return BC_FAIL
-            elif res > -0.07:
-                return BC_WHITE
-            else:
-                return BC_CYAN
-
-        timings_time[0].append(bn['real_time'])
-        timings_time[1].append(other_bench['real_time'])
-        timings_cpu[0].append(bn['cpu_time'])
-        timings_cpu[1].append(other_bench['cpu_time'])
-
-        tres = calculate_change(timings_time[0][-1], timings_time[1][-1])
-        cpures = calculate_change(timings_cpu[0][-1], timings_cpu[1][-1])
-        output_strs += [color_format(use_color,
-                                     fmt_str,
-                                     BC_HEADER,
-                                     bn['name'],
-                                     first_col_width,
-                                     get_color(tres),
-                                     tres,
-                                     get_color(cpures),
-                                     cpures,
-                                     timings_time[0][-1],
-                                     timings_time[1][-1],
-                                     timings_cpu[0][-1],
-                                     timings_cpu[1][-1],
-                                     endc=BC_ENDC)]
     return output_strs
 
+
 ###############################################################################
 # Unit tests
 
@@ -216,6 +273,33 @@ def generate_difference_report(
 import unittest
 
 
+class TestGetUniqueBenchmarkNames(unittest.TestCase):
+    def load_results(self):
+        import json
+        testInputs = os.path.join(
+            os.path.dirname(
+                os.path.realpath(__file__)),
+            'Inputs')
+        testOutput = os.path.join(testInputs, 'test3_run0.json')
+        with open(testOutput, 'r') as f:
+            json = json.load(f)
+        return json
+
+    def test_basic(self):
+        expect_lines = [
+            'BM_One',
+            'BM_Two',
+            'short',  # These two are not sorted
+            'medium', # These two are not sorted
+        ]
+        json = self.load_results()
+        output_lines = get_unique_benchmark_names(json)
+        print("\n")
+        print("\n".join(output_lines))
+        self.assertEqual(len(output_lines), len(expect_lines))
+        for i in range(0, len(output_lines)):
+            self.assertEqual(expect_lines[i], output_lines[i])
+
 class TestReportDifference(unittest.TestCase):
     def load_results(self):
         import json
@@ -259,7 +343,7 @@ class TestReportDifference(unittest.TestCase):
         for i in range(0, len(output_lines)):
             parts = [x for x in output_lines[i].split(' ') if x]
             self.assertEqual(len(parts), 7)
-            self.assertEqual(parts, expect_lines[i])
+            self.assertEqual(expect_lines[i], parts)
 
 
 class TestReportDifferenceBetweenFamilies(unittest.TestCase):
@@ -293,7 +377,7 @@ class TestReportDifferenceBetweenFamilies(unittest.TestCase):
         for i in range(0, len(output_lines)):
             parts = [x for x in output_lines[i].split(' ') if x]
             self.assertEqual(len(parts), 7)
-            self.assertEqual(parts, expect_lines[i])
+            self.assertEqual(expect_lines[i], parts)
 
 
 class TestReportDifferenceWithUTest(unittest.TestCase):
@@ -316,13 +400,83 @@ class TestReportDifferenceWithUTest(unittest.TestCase):
         expect_lines = [
             ['BM_One', '-0.1000', '+0.1000', '10', '9', '100', '110'],
             ['BM_Two', '+0.1111', '-0.0111', '9', '10', '90', '89'],
-            ['BM_Two', '+0.2500', '+0.1125', '8', '10', '80', '89'],
+            ['BM_Two', '-0.1250', '-0.1628', '8', '7', '86', '72'],
+            ['BM_Two_pvalue',
+             '0.6985',
+             '0.6985',
+             'U',
+             'Test,',
+             'Repetitions:',
+             '2',
+             'vs',
+             '2.',
+             'WARNING:',
+             'Results',
+             'unreliable!',
+             '9+',
+             'repetitions',
+             'recommended.'],
+            ['short', '-0.1250', '-0.0625', '8', '7', '80', '75'],
+            ['short', '-0.4325', '-0.1351', '8', '5', '77', '67'],
+            ['short_pvalue',
+             '0.7671',
+             '0.1489',
+             'U',
+             'Test,',
+             'Repetitions:',
+             '2',
+             'vs',
+             '3.',
+             'WARNING:',
+             'Results',
+             'unreliable!',
+             '9+',
+             'repetitions',
+             'recommended.'],
+            ['medium', '-0.3750', '-0.3375', '8', '5', '80', '53'],
+        ]
+        json1, json2 = self.load_results()
+        output_lines_with_header = generate_difference_report(
+            json1, json2, utest=True, utest_alpha=0.05, use_color=False)
+        output_lines = output_lines_with_header[2:]
+        print("\n")
+        print("\n".join(output_lines_with_header))
+        self.assertEqual(len(output_lines), len(expect_lines))
+        for i in range(0, len(output_lines)):
+            parts = [x for x in output_lines[i].split(' ') if x]
+            self.assertEqual(expect_lines[i], parts)
+
+
+class TestReportDifferenceWithUTestWhileDisplayingAggregatesOnly(
+        unittest.TestCase):
+    def load_results(self):
+        import json
+        testInputs = os.path.join(
+            os.path.dirname(
+                os.path.realpath(__file__)),
+            'Inputs')
+        testOutput1 = os.path.join(testInputs, 'test3_run0.json')
+        testOutput2 = os.path.join(testInputs, 'test3_run1.json')
+        with open(testOutput1, 'r') as f:
+            json1 = json.load(f)
+        with open(testOutput2, 'r') as f:
+            json2 = json.load(f)
+        return json1, json2
+
+    def test_utest(self):
+        expect_lines = []
+        expect_lines = [
+            ['BM_One', '-0.1000', '+0.1000', '10', '9', '100', '110'],
+            ['BM_Two', '+0.1111', '-0.0111', '9', '10', '90', '89'],
+            ['BM_Two', '-0.1250', '-0.1628', '8', '7', '86', '72'],
             ['BM_Two_pvalue',
-             '0.2207',
-             '0.6831',
+             '0.6985',
+             '0.6985',
              'U',
              'Test,',
              'Repetitions:',
+             '2',
+             'vs',
              '2.',
              'WARNING:',
              'Results',
@@ -330,18 +484,35 @@ class TestReportDifferenceWithUTest(unittest.TestCase):
              '9+',
              'repetitions',
              'recommended.'],
-            ['short', '+0.0000', '+0.0000', '8', '8', '80', '80'],
+            ['short', '-0.1250', '-0.0625', '8', '7', '80', '75'],
+            ['short', '-0.4325', '-0.1351', '8', '5', '77', '67'],
+            ['short_pvalue',
+             '0.7671',
+             '0.1489',
+             'U',
+             'Test,',
+             'Repetitions:',
+             '2',
+             'vs',
+             '3.',
+             'WARNING:',
+             'Results',
+             'unreliable!',
+             '9+',
+             'repetitions',
+             'recommended.'],
         ]
         json1, json2 = self.load_results()
         output_lines_with_header = generate_difference_report(
-            json1, json2, True, 0.05, use_color=False)
+            json1, json2, display_aggregates_only=True,
+            utest=True, utest_alpha=0.05, use_color=False)
         output_lines = output_lines_with_header[2:]
         print("\n")
         print("\n".join(output_lines_with_header))
         self.assertEqual(len(output_lines), len(expect_lines))
         for i in range(0, len(output_lines)):
             parts = [x for x in output_lines[i].split(' ') if x]
-            self.assertEqual(parts, expect_lines[i])
+            self.assertEqual(expect_lines[i], parts)
 
 
 if __name__ == '__main__':
author	android-build-team Robot <android-build-team-robot@google.com>	2018-11-04 03:05:48 +0000
committer	android-build-team Robot <android-build-team-robot@google.com>	2018-11-04 03:05:48 +0000
commit	d847f84a424ec4513a35dca61629f4b121e7e25c (patch)
tree	293b6b89a54667c8fd3a2895acbb6448475daee6
parent	c68bc3578dbd02bee8c8cedd23dcbb1223b0fc81 (diff)
parent	295f386099df283f36505edc41a8f656ac3bc11e (diff)
download	google-benchmark-d847f84a424ec4513a35dca61629f4b121e7e25c.tar.gz