author    | Sidath Senanayake <sidaths@google.com> | 2018-12-06 09:09:59 +0100
committer | Sidath Senanayake <sidaths@google.com> | 2018-12-06 09:09:59 +0100
commit    | a970431fa55f99aba31ea4263fdc8e70019a9ccd (patch)
tree      | 91bb7f49a4869c0385338fe144f53ac8b98468ea /mali_kbase
parent    | f10b3de5283d0c196459f18160161e48cfadae81 (diff)
download  | gpu-a970431fa55f99aba31ea4263fdc8e70019a9ccd.tar.gz
Mali Bifrost DDK r16p0 KMD
Provenance:
aa8b3ff0f (collaborate/EAC/b_r16p0)
BX304L01B-BU-00000-r16p0-01rel0
BX304L06A-BU-00000-r16p0-01rel0
BX304X07X-BU-00000-r16p0-01rel0
Signed-off-by: Sidath Senanayake <sidaths@google.com>
Change-Id: I96125862b7cf6596d1b7109853fb4ca39e851056
Diffstat (limited to 'mali_kbase')
99 files changed, 8921 insertions, 5224 deletions
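
Editorial orientation before the patch body: the largest functional change below replaces the old busy-wait cache flush (kbase_gpu_cacheclean()) with an interrupt-driven clean added in mali_kbase_device_hw.c. The following is a minimal caller-side sketch of the new helpers, using the function names introduced by this patch; the surrounding function is illustrative only, is not part of the patch, and assumes the kbase driver headers are available:

```c
/* Illustrative pattern only (not part of the patch). */
static void example_flush_after_job(struct kbase_device *kbdev)
{
	/* Issue a cache clean+invalidate: unmasks CLEAN_CACHES_COMPLETED and
	 * writes GPU_COMMAND_CLEAN_INV_CACHES. Takes hwaccess_lock itself;
	 * use kbase_gpu_start_cache_clean_nolock() if the lock is already held.
	 */
	kbase_gpu_start_cache_clean(kbdev);

	/* Sleep until the IRQ handler observes completion and clears
	 * cache_clean_in_progress. May sleep, so it must not be called
	 * from interrupt context.
	 */
	kbase_gpu_wait_cache_clean(kbdev);
}
```

This mirrors the new kbase_backend_cache_clean() path in mali_kbase_jm_rb.c, which starts the clean and then waits for it instead of polling GPU_IRQ_RAWSTAT.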
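
Similarly, the reset worker and protected-mode paths below drop kbase_vinstr_suspend()/kbase_vinstr_resume() in place of the new hardware-counter context API. A hedged sketch of that bracket follows; the wrapper name is hypothetical and the reset step is elided:

```c
/* Illustrative only: hwcnt bracket around a GPU reset, as done in
 * kbasep_reset_timeout_worker() by this patch.
 */
static void example_reset_with_hwcnt(struct kbase_device *kbdev)
{
	unsigned long flags;

	/* Blocks until GPU hardware counters are fully disabled. */
	kbase_hwcnt_context_disable(kbdev->hwcnt_gpu_ctx);

	/* ... soft-reset the GPU and restore its power state here ... */

	/* Re-enable counters under hwaccess_lock once the GPU is back up. */
	spin_lock_irqsave(&kbdev->hwaccess_lock, flags);
	kbase_hwcnt_context_enable(kbdev->hwcnt_gpu_ctx);
	spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags);
}
```

The raw patch body follows unmodified.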
diff --git a/mali_kbase/Kbuild b/mali_kbase/Kbuild index 9b3cb91..8e73e1f 100644 --- a/mali_kbase/Kbuild +++ b/mali_kbase/Kbuild @@ -21,7 +21,7 @@ # Driver version string which is returned to userspace via an ioctl -MALI_RELEASE_NAME ?= "r15p0-01rel0" +MALI_RELEASE_NAME ?= "r16p0-01rel0" # Paths required for build KBASE_PATH = $(src) @@ -33,7 +33,6 @@ MALI_CUSTOMER_RELEASE ?= 1 MALI_USE_CSF ?= 0 MALI_UNIT_TEST ?= 0 MALI_KERNEL_TEST_API ?= 0 -MALI_MOCK_TEST ?= 0 MALI_COVERAGE ?= 0 CONFIG_MALI_PLATFORM_NAME ?= "devicetree" @@ -43,7 +42,6 @@ DEFINES = \ -DMALI_USE_CSF=$(MALI_USE_CSF) \ -DMALI_KERNEL_TEST_API=$(MALI_KERNEL_TEST_API) \ -DMALI_UNIT_TEST=$(MALI_UNIT_TEST) \ - -DMALI_MOCK_TEST=$(MALI_MOCK_TEST) \ -DMALI_COVERAGE=$(MALI_COVERAGE) \ -DMALI_RELEASE_NAME=\"$(MALI_RELEASE_NAME)\" @@ -61,7 +59,7 @@ DEFINES += -DMALI_KBASE_BUILD # Use our defines when compiling ccflags-y += $(DEFINES) -I$(KBASE_PATH) -I$(KBASE_PLATFORM_PATH) -I$(UMP_PATH) -I$(srctree)/include/linux -subdir-ccflags-y += $(DEFINES) -I$(KBASE_PATH) -I$(KBASE_PLATFORM_PATH) -I$(OSK_PATH) -I$(UMP_PATH) -I$(srctree)/include/linux +subdir-ccflags-y += $(DEFINES) -I$(KBASE_PATH) -I$(KBASE_PLATFORM_PATH) -I$(UMP_PATH) -I$(srctree)/include/linux SRC := \ mali_kbase_device.c \ @@ -80,10 +78,15 @@ SRC := \ mali_kbase_pm.c \ mali_kbase_config.c \ mali_kbase_vinstr.c \ + mali_kbase_hwcnt.c \ + mali_kbase_hwcnt_backend_gpu.c \ + mali_kbase_hwcnt_gpu.c \ + mali_kbase_hwcnt_legacy.c \ + mali_kbase_hwcnt_types.c \ + mali_kbase_hwcnt_virtualizer.c \ mali_kbase_softjobs.c \ mali_kbase_10969_workaround.c \ mali_kbase_hw.c \ - mali_kbase_utility.c \ mali_kbase_debug.c \ mali_kbase_gpu_memory_debugfs.c \ mali_kbase_mem_linux.c \ @@ -154,11 +157,6 @@ mali_kbase-$(CONFIG_SYNC_FILE) += \ mali_kbase_sync_common.o \ mali_kbase_fence.o -ifeq ($(MALI_MOCK_TEST),1) -# Test functionality -mali_kbase-y += tests/internal/src/mock/mali_kbase_pm_driver_mock.o -endif - include $(src)/backend/gpu/Kbuild mali_kbase-y += $(BACKEND:.c=.o) diff --git a/mali_kbase/Kconfig b/mali_kbase/Kconfig index af2a5aa..7c10016 100644 --- a/mali_kbase/Kconfig +++ b/mali_kbase/Kconfig @@ -31,14 +31,12 @@ menuconfig MALI_MIDGARD this will generate a single module, called mali_kbase. config MALI_GATOR_SUPPORT - bool "Streamline support via Gator" + bool "Enable Streamline tracing support" depends on MALI_MIDGARD - default n + default y help - Adds diagnostic support for use with the ARM Streamline Performance Analyzer. - You will need the Gator device driver already loaded before loading this driver when enabling - Streamline debug support. - This is a legacy interface required by older versions of Streamline. + Enables kbase tracing used by the Arm Streamline Performance Analyzer. + The tracepoints are used to derive GPU activity charts in Streamline. config MALI_MIDGARD_DVFS bool "Enable legacy DVFS" diff --git a/mali_kbase/Makefile b/mali_kbase/Makefile index 13af9f4..08b2fa9 100644 --- a/mali_kbase/Makefile +++ b/mali_kbase/Makefile @@ -25,10 +25,6 @@ KDIR ?= /lib/modules/$(shell uname -r)/build BUSLOG_PATH_RELATIVE = $(CURDIR)/../../../.. 
KBASE_PATH_RELATIVE = $(CURDIR) -ifeq ($(MALI_UNIT_TEST), 1) - EXTRA_SYMBOLS += $(KBASE_PATH_RELATIVE)/tests/internal/src/kernel_assert_module/linux/Module.symvers -endif - ifeq ($(CONFIG_MALI_FPGA_BUS_LOGGER),y) #Add bus logger symbols EXTRA_SYMBOLS += $(BUSLOG_PATH_RELATIVE)/drivers/base/bus_logger/Module.symvers diff --git a/mali_kbase/Makefile.kbase b/mali_kbase/Makefile.kbase index d7898cb..6b0f81e 100644 --- a/mali_kbase/Makefile.kbase +++ b/mali_kbase/Makefile.kbase @@ -1,5 +1,5 @@ # -# (C) COPYRIGHT 2010 ARM Limited. All rights reserved. +# (C) COPYRIGHT 2010, 2013, 2018 ARM Limited. All rights reserved. # # This program is free software and is provided to you under the terms of the # GNU General Public License version 2 as published by the Free Software @@ -19,5 +19,5 @@ # # -EXTRA_CFLAGS += -I$(ROOT) -I$(KBASE_PATH) -I$(OSK_PATH)/src/linux/include -I$(KBASE_PATH)/platform_$(PLATFORM) +EXTRA_CFLAGS += -I$(ROOT) -I$(KBASE_PATH) -I$(KBASE_PATH)/platform_$(PLATFORM) diff --git a/mali_kbase/Mconfig b/mali_kbase/Mconfig index 1b6bffc..46dca14 100644 --- a/mali_kbase/Mconfig +++ b/mali_kbase/Mconfig @@ -23,15 +23,12 @@ menuconfig MALI_MIDGARD this will generate a single module, called mali_kbase. config MALI_GATOR_SUPPORT - bool "Streamline support via Gator" + bool "Enable Streamline tracing support" depends on MALI_MIDGARD && !BACKEND_USER - default y if INSTRUMENTATION_STREAMLINE_OLD - default n + default y help - Adds diagnostic support for use with the ARM Streamline Performance Analyzer. - You will need the Gator device driver already loaded before loading this driver when enabling - Streamline debug support. - This is a legacy interface required by older versions of Streamline. + Enables kbase tracing used by the Arm Streamline Performance Analyzer. + The tracepoints are used to derive GPU activity charts in Streamline. config MALI_MIDGARD_DVFS bool "Enable legacy DVFS" @@ -88,11 +85,6 @@ config MALI_PLATFORM_NAME When PLATFORM_CUSTOM is set, this needs to be set manually to pick up the desired platform files. -config MALI_MOCK_TEST - bool - depends on MALI_MIDGARD && !RELEASE - default y - # MALI_EXPERT configuration options menuconfig MALI_EXPERT diff --git a/mali_kbase/backend/gpu/Kbuild b/mali_kbase/backend/gpu/Kbuild index dcd8ca4..2dc1455 100644 --- a/mali_kbase/backend/gpu/Kbuild +++ b/mali_kbase/backend/gpu/Kbuild @@ -38,14 +38,12 @@ BACKEND += \ backend/gpu/mali_kbase_pm_ca.c \ backend/gpu/mali_kbase_pm_always_on.c \ backend/gpu/mali_kbase_pm_coarse_demand.c \ - backend/gpu/mali_kbase_pm_demand.c \ backend/gpu/mali_kbase_pm_policy.c \ backend/gpu/mali_kbase_time.c ifeq ($(MALI_CUSTOMER_RELEASE),0) BACKEND += \ - backend/gpu/mali_kbase_pm_demand_always_powered.c \ - backend/gpu/mali_kbase_pm_fast_start.c + backend/gpu/mali_kbase_pm_always_on_demand.c endif ifeq ($(CONFIG_MALI_DEVFREQ),y) diff --git a/mali_kbase/backend/gpu/mali_kbase_backend_config.h b/mali_kbase/backend/gpu/mali_kbase_backend_config.h index 196a776..4a61f96 100644 --- a/mali_kbase/backend/gpu/mali_kbase_backend_config.h +++ b/mali_kbase/backend/gpu/mali_kbase_backend_config.h @@ -1,6 +1,6 @@ /* * - * (C) COPYRIGHT 2014-2015 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2014-2018 ARM Limited. All rights reserved. 
* * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -27,8 +27,5 @@ #ifndef _KBASE_BACKEND_CONFIG_H_ #define _KBASE_BACKEND_CONFIG_H_ -/* Enable GPU reset API */ -#define KBASE_GPU_RESET_EN 1 - #endif /* _KBASE_BACKEND_CONFIG_H_ */ diff --git a/mali_kbase/backend/gpu/mali_kbase_devfreq.c b/mali_kbase/backend/gpu/mali_kbase_devfreq.c index 683a24c..5ade012 100644 --- a/mali_kbase/backend/gpu/mali_kbase_devfreq.c +++ b/mali_kbase/backend/gpu/mali_kbase_devfreq.c @@ -283,8 +283,11 @@ static int kbase_devfreq_init_core_mask_table(struct kbase_device *kbdev) real_freq = opp_freq; if (of_property_read_u64(node, "opp-core-mask", &core_mask)) core_mask = shader_present; - if (kbase_hw_has_issue(kbdev, BASE_HW_ISSUE_11056) && - core_mask != shader_present) { + if (core_mask != shader_present && + (kbase_hw_has_issue(kbdev, BASE_HW_ISSUE_11056) || + corestack_driver_control || + platform_power_down_only)) { + dev_warn(kbdev->dev, "Ignoring OPP %llu - Dynamic Core Scaling not supported on this GPU\n", opp_freq); continue; diff --git a/mali_kbase/backend/gpu/mali_kbase_device_hw.c b/mali_kbase/backend/gpu/mali_kbase_device_hw.c index ebc3022..5dd059f 100644 --- a/mali_kbase/backend/gpu/mali_kbase_device_hw.c +++ b/mali_kbase/backend/gpu/mali_kbase_device_hw.c @@ -29,6 +29,7 @@ #include <backend/gpu/mali_kbase_pm_internal.h> #include <backend/gpu/mali_kbase_device_internal.h> +#include <mali_kbase_config_defaults.h> #if !defined(CONFIG_MALI_NO_MALI) @@ -220,6 +221,84 @@ static void kbase_report_gpu_fault(struct kbase_device *kbdev, int multiple) dev_warn(kbdev->dev, "There were multiple GPU faults - some have not been reported\n"); } +void kbase_gpu_start_cache_clean_nolock(struct kbase_device *kbdev) +{ + u32 irq_mask; + + lockdep_assert_held(&kbdev->hwaccess_lock); + + if (kbdev->cache_clean_in_progress) { + /* If this is called while another clean is in progress, we + * can't rely on the current one to flush any new changes in + * the cache. Instead, trigger another cache clean immediately + * after this one finishes. 
+ */ + kbdev->cache_clean_queued = true; + return; + } + + /* Enable interrupt */ + irq_mask = kbase_reg_read(kbdev, GPU_CONTROL_REG(GPU_IRQ_MASK)); + kbase_reg_write(kbdev, GPU_CONTROL_REG(GPU_IRQ_MASK), + irq_mask | CLEAN_CACHES_COMPLETED); + + KBASE_TRACE_ADD(kbdev, CORE_GPU_CLEAN_INV_CACHES, NULL, NULL, 0u, 0); + kbase_reg_write(kbdev, GPU_CONTROL_REG(GPU_COMMAND), + GPU_COMMAND_CLEAN_INV_CACHES); + + kbdev->cache_clean_in_progress = true; +} + +void kbase_gpu_start_cache_clean(struct kbase_device *kbdev) +{ + unsigned long flags; + + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); + kbase_gpu_start_cache_clean_nolock(kbdev); + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); +} + +static void kbase_clean_caches_done(struct kbase_device *kbdev) +{ + u32 irq_mask; + unsigned long flags; + + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); + + if (kbdev->cache_clean_queued) { + kbdev->cache_clean_queued = false; + + KBASE_TRACE_ADD(kbdev, CORE_GPU_CLEAN_INV_CACHES, NULL, NULL, 0u, 0); + kbase_reg_write(kbdev, GPU_CONTROL_REG(GPU_COMMAND), + GPU_COMMAND_CLEAN_INV_CACHES); + } else { + /* Disable interrupt */ + irq_mask = kbase_reg_read(kbdev, GPU_CONTROL_REG(GPU_IRQ_MASK)); + kbase_reg_write(kbdev, GPU_CONTROL_REG(GPU_IRQ_MASK), + irq_mask & ~CLEAN_CACHES_COMPLETED); + + kbdev->cache_clean_in_progress = false; + + wake_up(&kbdev->cache_clean_wait); + } + + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); +} + +void kbase_gpu_wait_cache_clean(struct kbase_device *kbdev) +{ + unsigned long flags; + + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); + while (kbdev->cache_clean_in_progress) { + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + wait_event_interruptible(kbdev->cache_clean_wait, + !kbdev->cache_clean_in_progress); + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); + } + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); +} + void kbase_gpu_interrupt(struct kbase_device *kbdev, u32 val) { KBASE_TRACE_ADD(kbdev, CORE_GPU_IRQ, NULL, NULL, 0u, val); @@ -232,18 +311,29 @@ void kbase_gpu_interrupt(struct kbase_device *kbdev, u32 val) if (val & PRFCNT_SAMPLE_COMPLETED) kbase_instr_hwcnt_sample_done(kbdev); - if (val & CLEAN_CACHES_COMPLETED) - kbase_clean_caches_done(kbdev); - KBASE_TRACE_ADD(kbdev, CORE_GPU_IRQ_CLEAR, NULL, NULL, 0u, val); kbase_reg_write(kbdev, GPU_CONTROL_REG(GPU_IRQ_CLEAR), val); - /* kbase_pm_check_transitions must be called after the IRQ has been - * cleared. This is because it might trigger further power transitions - * and we don't want to miss the interrupt raised to notify us that - * these further transitions have finished. + /* kbase_pm_check_transitions (called by kbase_pm_power_changed) must + * be called after the IRQ has been cleared. This is because it might + * trigger further power transitions and we don't want to miss the + * interrupt raised to notify us that these further transitions have + * finished. The same applies to kbase_clean_caches_done() - if another + * clean was queued, it might trigger another clean, which might + * generate another interrupt which shouldn't be missed. + */ + + if (val & CLEAN_CACHES_COMPLETED) + kbase_clean_caches_done(kbdev); + + /* When 'platform_power_down_only' is enabled, the L2 cache is not + * powered down, but flushed before the GPU power down (which is done + * by the platform code). So the L2 state machine requests a cache + * flush. And when that flush completes, the L2 state machine needs to + * be re-invoked to proceed with the GPU power down. 
*/ - if (val & POWER_CHANGED_ALL) + if (val & POWER_CHANGED_ALL || + (platform_power_down_only && (val & CLEAN_CACHES_COMPLETED))) kbase_pm_power_changed(kbdev); KBASE_TRACE_ADD(kbdev, CORE_GPU_IRQ_DONE, NULL, NULL, 0u, val); diff --git a/mali_kbase/backend/gpu/mali_kbase_device_internal.h b/mali_kbase/backend/gpu/mali_kbase_device_internal.h index 928efe9..7886e96 100644 --- a/mali_kbase/backend/gpu/mali_kbase_device_internal.h +++ b/mali_kbase/backend/gpu/mali_kbase_device_internal.h @@ -50,6 +50,31 @@ void kbase_reg_write(struct kbase_device *kbdev, u32 offset, u32 value); */ u32 kbase_reg_read(struct kbase_device *kbdev, u32 offset); +/** + * kbase_gpu_start_cache_clean - Start a cache clean + * @kbdev: Kbase device + * + * Issue a cache clean and invalidate command to hardware. This function will + * take hwaccess_lock. + */ +void kbase_gpu_start_cache_clean(struct kbase_device *kbdev); + +/** + * kbase_gpu_start_cache_clean_nolock - Start a cache clean + * @kbdev: Kbase device + * + * Issue a cache clean and invalidate command to hardware. hwaccess_lock + * must be held by the caller. + */ +void kbase_gpu_start_cache_clean_nolock(struct kbase_device *kbdev); + +/** + * kbase_gpu_wait_cache_clean - Wait for cache cleaning to finish + * @kbdev: Kbase device + * + * This function will take hwaccess_lock, and may sleep. + */ +void kbase_gpu_wait_cache_clean(struct kbase_device *kbdev); /** * kbase_gpu_interrupt - GPU interrupt handler diff --git a/mali_kbase/backend/gpu/mali_kbase_gpu.c b/mali_kbase/backend/gpu/mali_kbase_gpu.c index 881d50c..995d34d 100644 --- a/mali_kbase/backend/gpu/mali_kbase_gpu.c +++ b/mali_kbase/backend/gpu/mali_kbase_gpu.c @@ -1,6 +1,6 @@ /* * - * (C) COPYRIGHT 2014-2017 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2014-2018 ARM Limited. All rights reserved. 
* * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -56,7 +56,7 @@ int kbase_backend_early_init(struct kbase_device *kbdev) if (err) goto fail_interrupts; - err = kbase_hwaccess_pm_init(kbdev); + err = kbase_hwaccess_pm_early_init(kbdev); if (err) goto fail_pm; @@ -74,7 +74,7 @@ fail_runtime_pm: void kbase_backend_early_term(struct kbase_device *kbdev) { - kbase_hwaccess_pm_term(kbdev); + kbase_hwaccess_pm_early_term(kbdev); kbase_release_interrupts(kbdev); kbase_pm_runtime_term(kbdev); kbasep_platform_device_term(kbdev); @@ -84,10 +84,14 @@ int kbase_backend_late_init(struct kbase_device *kbdev) { int err; - err = kbase_hwaccess_pm_powerup(kbdev, PM_HW_ISSUES_DETECT); + err = kbase_hwaccess_pm_late_init(kbdev); if (err) return err; + err = kbase_hwaccess_pm_powerup(kbdev, PM_HW_ISSUES_DETECT); + if (err) + goto fail_pm_powerup; + err = kbase_backend_timer_init(kbdev); if (err) goto fail_timer; @@ -121,6 +125,8 @@ fail_interrupt_test: kbase_backend_timer_term(kbdev); fail_timer: kbase_hwaccess_pm_halt(kbdev); +fail_pm_powerup: + kbase_hwaccess_pm_late_term(kbdev); return err; } @@ -131,5 +137,5 @@ void kbase_backend_late_term(struct kbase_device *kbdev) kbase_job_slot_term(kbdev); kbase_backend_timer_term(kbdev); kbase_hwaccess_pm_halt(kbdev); + kbase_hwaccess_pm_late_term(kbdev); } - diff --git a/mali_kbase/backend/gpu/mali_kbase_instr_backend.c b/mali_kbase/backend/gpu/mali_kbase_instr_backend.c index 6c69132..79c04d9 100644 --- a/mali_kbase/backend/gpu/mali_kbase_instr_backend.c +++ b/mali_kbase/backend/gpu/mali_kbase_instr_backend.c @@ -33,49 +33,17 @@ #include <backend/gpu/mali_kbase_pm_internal.h> #include <backend/gpu/mali_kbase_instr_internal.h> -/** - * kbasep_instr_hwcnt_cacheclean - Issue Cache Clean & Invalidate command to - * hardware - * - * @kbdev: Kbase device - */ -static void kbasep_instr_hwcnt_cacheclean(struct kbase_device *kbdev) -{ - unsigned long flags; - unsigned long pm_flags; - u32 irq_mask; - - spin_lock_irqsave(&kbdev->hwcnt.lock, flags); - KBASE_DEBUG_ASSERT(kbdev->hwcnt.backend.state == - KBASE_INSTR_STATE_REQUEST_CLEAN); - - /* Enable interrupt */ - spin_lock_irqsave(&kbdev->hwaccess_lock, pm_flags); - irq_mask = kbase_reg_read(kbdev, GPU_CONTROL_REG(GPU_IRQ_MASK)); - kbase_reg_write(kbdev, GPU_CONTROL_REG(GPU_IRQ_MASK), - irq_mask | CLEAN_CACHES_COMPLETED); - spin_unlock_irqrestore(&kbdev->hwaccess_lock, pm_flags); - - /* clean&invalidate the caches so we're sure the mmu tables for the dump - * buffer is valid */ - KBASE_TRACE_ADD(kbdev, CORE_GPU_CLEAN_INV_CACHES, NULL, NULL, 0u, 0); - kbase_reg_write(kbdev, GPU_CONTROL_REG(GPU_COMMAND), - GPU_COMMAND_CLEAN_INV_CACHES); - kbdev->hwcnt.backend.state = KBASE_INSTR_STATE_CLEANING; - - spin_unlock_irqrestore(&kbdev->hwcnt.lock, flags); -} - int kbase_instr_hwcnt_enable_internal(struct kbase_device *kbdev, struct kbase_context *kctx, - struct kbase_ioctl_hwcnt_enable *enable) + struct kbase_instr_hwcnt_enable *enable) { - unsigned long flags, pm_flags; + unsigned long flags; int err = -EINVAL; u32 irq_mask; - int ret; u32 prfcnt_config; + lockdep_assert_held(&kbdev->hwaccess_lock); + /* alignment failure */ if ((enable->dump_buffer == 0ULL) || (enable->dump_buffer & (2048 - 1))) goto out_err; @@ -84,53 +52,30 @@ int kbase_instr_hwcnt_enable_internal(struct kbase_device *kbdev, */ kbase_pm_ca_instr_enable(kbdev); - /* Request the cores early on synchronously - we'll release them on any - * errors (e.g. 
instrumentation already active) */ - kbase_pm_request_cores_sync(kbdev, true, true); - spin_lock_irqsave(&kbdev->hwcnt.lock, flags); if (kbdev->hwcnt.backend.state != KBASE_INSTR_STATE_DISABLED) { /* Instrumentation is already enabled */ spin_unlock_irqrestore(&kbdev->hwcnt.lock, flags); - goto out_unrequest_cores; + goto out_err; } /* Enable interrupt */ - spin_lock_irqsave(&kbdev->hwaccess_lock, pm_flags); irq_mask = kbase_reg_read(kbdev, GPU_CONTROL_REG(GPU_IRQ_MASK)); kbase_reg_write(kbdev, GPU_CONTROL_REG(GPU_IRQ_MASK), irq_mask | PRFCNT_SAMPLE_COMPLETED); - spin_unlock_irqrestore(&kbdev->hwaccess_lock, pm_flags); /* In use, this context is the owner */ kbdev->hwcnt.kctx = kctx; /* Remember the dump address so we can reprogram it later */ kbdev->hwcnt.addr = enable->dump_buffer; - - /* Request the clean */ - kbdev->hwcnt.backend.state = KBASE_INSTR_STATE_REQUEST_CLEAN; - kbdev->hwcnt.backend.triggered = 0; - /* Clean&invalidate the caches so we're sure the mmu tables for the dump - * buffer is valid */ - ret = queue_work(kbdev->hwcnt.backend.cache_clean_wq, - &kbdev->hwcnt.backend.cache_clean_work); - KBASE_DEBUG_ASSERT(ret); + kbdev->hwcnt.addr_bytes = enable->dump_buffer_bytes; spin_unlock_irqrestore(&kbdev->hwcnt.lock, flags); - /* Wait for cacheclean to complete */ - wait_event(kbdev->hwcnt.backend.wait, - kbdev->hwcnt.backend.triggered != 0); - - KBASE_DEBUG_ASSERT(kbdev->hwcnt.backend.state == - KBASE_INSTR_STATE_IDLE); - - kbase_pm_request_l2_caches(kbdev); - /* Configure */ prfcnt_config = kctx->as_nr << PRFCNT_CONFIG_AS_SHIFT; -#ifdef CONFIG_MALI_PRFCNT_SET_SECONDARY + if (enable->use_secondary) { u32 gpu_id = kbdev->gpu_props.props.raw_props.gpu_id; u32 product_id = (gpu_id & GPU_ID_VERSION_PRODUCT_ID) @@ -140,7 +85,6 @@ int kbase_instr_hwcnt_enable_internal(struct kbase_device *kbdev, if (arch_v6) prfcnt_config |= 1 << PRFCNT_CONFIG_SETSELECT_SHIFT; } -#endif kbase_reg_write(kbdev, GPU_CONTROL_REG(PRFCNT_CONFIG), prfcnt_config | PRFCNT_CONFIG_MODE_OFF); @@ -184,10 +128,6 @@ int kbase_instr_hwcnt_enable_internal(struct kbase_device *kbdev, dev_dbg(kbdev->dev, "HW counters dumping set-up for context %p", kctx); return err; - out_unrequest_cores: - spin_lock_irqsave(&kbdev->hwaccess_lock, flags); - kbase_pm_release_cores(kbdev, true, true); - spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); out_err: return err; } @@ -200,17 +140,20 @@ int kbase_instr_hwcnt_disable_internal(struct kbase_context *kctx) struct kbase_device *kbdev = kctx->kbdev; while (1) { + spin_lock_irqsave(&kbdev->hwaccess_lock, pm_flags); spin_lock_irqsave(&kbdev->hwcnt.lock, flags); if (kbdev->hwcnt.backend.state == KBASE_INSTR_STATE_DISABLED) { /* Instrumentation is not enabled */ spin_unlock_irqrestore(&kbdev->hwcnt.lock, flags); + spin_unlock_irqrestore(&kbdev->hwaccess_lock, pm_flags); goto out; } if (kbdev->hwcnt.kctx != kctx) { /* Instrumentation has been setup for another context */ spin_unlock_irqrestore(&kbdev->hwcnt.lock, flags); + spin_unlock_irqrestore(&kbdev->hwaccess_lock, pm_flags); goto out; } @@ -218,6 +161,7 @@ int kbase_instr_hwcnt_disable_internal(struct kbase_context *kctx) break; spin_unlock_irqrestore(&kbdev->hwcnt.lock, flags); + spin_unlock_irqrestore(&kbdev->hwaccess_lock, pm_flags); /* Ongoing dump/setup - wait for its completion */ wait_event(kbdev->hwcnt.backend.wait, @@ -228,7 +172,6 @@ int kbase_instr_hwcnt_disable_internal(struct kbase_context *kctx) kbdev->hwcnt.backend.triggered = 0; /* Disable interrupt */ - spin_lock_irqsave(&kbdev->hwaccess_lock, pm_flags); irq_mask 
= kbase_reg_read(kbdev, GPU_CONTROL_REG(GPU_IRQ_MASK)); kbase_reg_write(kbdev, GPU_CONTROL_REG(GPU_IRQ_MASK), irq_mask & ~PRFCNT_SAMPLE_COMPLETED); @@ -238,15 +181,12 @@ int kbase_instr_hwcnt_disable_internal(struct kbase_context *kctx) kbdev->hwcnt.kctx = NULL; kbdev->hwcnt.addr = 0ULL; + kbdev->hwcnt.addr_bytes = 0ULL; kbase_pm_ca_instr_disable(kbdev); - kbase_pm_release_cores(kbdev, true, true); - - kbase_pm_release_l2_caches(kbdev); - - spin_unlock_irqrestore(&kbdev->hwaccess_lock, pm_flags); spin_unlock_irqrestore(&kbdev->hwcnt.lock, flags); + spin_unlock_irqrestore(&kbdev->hwaccess_lock, pm_flags); dev_dbg(kbdev->dev, "HW counters dumping disabled for context %p", kctx); @@ -331,33 +271,34 @@ KBASE_EXPORT_SYMBOL(kbase_instr_hwcnt_dump_complete); void kbasep_cache_clean_worker(struct work_struct *data) { struct kbase_device *kbdev; - unsigned long flags; + unsigned long flags, pm_flags; kbdev = container_of(data, struct kbase_device, hwcnt.backend.cache_clean_work); - mutex_lock(&kbdev->cacheclean_lock); - kbasep_instr_hwcnt_cacheclean(kbdev); - + spin_lock_irqsave(&kbdev->hwaccess_lock, pm_flags); spin_lock_irqsave(&kbdev->hwcnt.lock, flags); - /* Wait for our condition, and any reset to complete */ - while (kbdev->hwcnt.backend.state == KBASE_INSTR_STATE_CLEANING) { - spin_unlock_irqrestore(&kbdev->hwcnt.lock, flags); - wait_event(kbdev->hwcnt.backend.cache_clean_wait, - kbdev->hwcnt.backend.state != - KBASE_INSTR_STATE_CLEANING); - spin_lock_irqsave(&kbdev->hwcnt.lock, flags); - } + + /* Clean and invalidate the caches so we're sure the mmu tables for the + * dump buffer is valid. + */ KBASE_DEBUG_ASSERT(kbdev->hwcnt.backend.state == - KBASE_INSTR_STATE_CLEANED); + KBASE_INSTR_STATE_REQUEST_CLEAN); + kbase_gpu_start_cache_clean_nolock(kbdev); + spin_unlock_irqrestore(&kbdev->hwcnt.lock, flags); + spin_unlock_irqrestore(&kbdev->hwaccess_lock, pm_flags); + + kbase_gpu_wait_cache_clean(kbdev); + spin_lock_irqsave(&kbdev->hwcnt.lock, flags); + KBASE_DEBUG_ASSERT(kbdev->hwcnt.backend.state == + KBASE_INSTR_STATE_REQUEST_CLEAN); /* All finished and idle */ kbdev->hwcnt.backend.state = KBASE_INSTR_STATE_IDLE; kbdev->hwcnt.backend.triggered = 1; wake_up(&kbdev->hwcnt.backend.wait); spin_unlock_irqrestore(&kbdev->hwcnt.lock, flags); - mutex_unlock(&kbdev->cacheclean_lock); } void kbase_instr_hwcnt_sample_done(struct kbase_device *kbdev) @@ -389,40 +330,13 @@ void kbase_instr_hwcnt_sample_done(struct kbase_device *kbdev) spin_unlock_irqrestore(&kbdev->hwcnt.lock, flags); } -void kbase_clean_caches_done(struct kbase_device *kbdev) -{ - u32 irq_mask; - - if (kbdev->hwcnt.backend.state != KBASE_INSTR_STATE_DISABLED) { - unsigned long flags; - unsigned long pm_flags; - - spin_lock_irqsave(&kbdev->hwcnt.lock, flags); - /* Disable interrupt */ - spin_lock_irqsave(&kbdev->hwaccess_lock, pm_flags); - irq_mask = kbase_reg_read(kbdev, GPU_CONTROL_REG(GPU_IRQ_MASK)); - kbase_reg_write(kbdev, GPU_CONTROL_REG(GPU_IRQ_MASK), - irq_mask & ~CLEAN_CACHES_COMPLETED); - spin_unlock_irqrestore(&kbdev->hwaccess_lock, pm_flags); - - /* Wakeup... 
*/ - if (kbdev->hwcnt.backend.state == KBASE_INSTR_STATE_CLEANING) { - /* Only wake if we weren't resetting */ - kbdev->hwcnt.backend.state = KBASE_INSTR_STATE_CLEANED; - wake_up(&kbdev->hwcnt.backend.cache_clean_wait); - } - - spin_unlock_irqrestore(&kbdev->hwcnt.lock, flags); - } -} - int kbase_instr_hwcnt_wait_for_dump(struct kbase_context *kctx) { struct kbase_device *kbdev = kctx->kbdev; unsigned long flags; int err; - /* Wait for dump & cacheclean to complete */ + /* Wait for dump & cache clean to complete */ wait_event(kbdev->hwcnt.backend.wait, kbdev->hwcnt.backend.triggered != 0); @@ -477,7 +391,6 @@ int kbase_instr_backend_init(struct kbase_device *kbdev) kbdev->hwcnt.backend.state = KBASE_INSTR_STATE_DISABLED; init_waitqueue_head(&kbdev->hwcnt.backend.wait); - init_waitqueue_head(&kbdev->hwcnt.backend.cache_clean_wait); INIT_WORK(&kbdev->hwcnt.backend.cache_clean_work, kbasep_cache_clean_worker); kbdev->hwcnt.backend.triggered = 0; @@ -494,4 +407,3 @@ void kbase_instr_backend_term(struct kbase_device *kbdev) { destroy_workqueue(kbdev->hwcnt.backend.cache_clean_wq); } - diff --git a/mali_kbase/backend/gpu/mali_kbase_instr_defs.h b/mali_kbase/backend/gpu/mali_kbase_instr_defs.h index fb55d2d..c9fb759 100644 --- a/mali_kbase/backend/gpu/mali_kbase_instr_defs.h +++ b/mali_kbase/backend/gpu/mali_kbase_instr_defs.h @@ -1,6 +1,6 @@ /* * - * (C) COPYRIGHT 2014, 2016 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2014, 2016, 2018 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -39,11 +39,6 @@ enum kbase_instr_state { KBASE_INSTR_STATE_DUMPING, /* We've requested a clean to occur on a workqueue */ KBASE_INSTR_STATE_REQUEST_CLEAN, - /* Hardware is currently cleaning and invalidating caches. */ - KBASE_INSTR_STATE_CLEANING, - /* Cache clean completed, and either a) a dump is complete, or - * b) instrumentation can now be setup. */ - KBASE_INSTR_STATE_CLEANED, /* An error has occured during DUMPING (page fault). */ KBASE_INSTR_STATE_FAULT }; @@ -54,7 +49,6 @@ struct kbase_instr_backend { int triggered; enum kbase_instr_state state; - wait_queue_head_t cache_clean_wait; struct workqueue_struct *cache_clean_wq; struct work_struct cache_clean_work; }; diff --git a/mali_kbase/backend/gpu/mali_kbase_instr_internal.h b/mali_kbase/backend/gpu/mali_kbase_instr_internal.h index 608379e..2254b9f 100644 --- a/mali_kbase/backend/gpu/mali_kbase_instr_internal.h +++ b/mali_kbase/backend/gpu/mali_kbase_instr_internal.h @@ -1,6 +1,6 @@ /* * - * (C) COPYRIGHT 2014 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2014, 2018 ARM Limited. All rights reserved. 
* * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -36,12 +36,6 @@ void kbasep_cache_clean_worker(struct work_struct *data); /** - * kbase_clean_caches_done() - Cache clean interrupt received - * @kbdev: Kbase device - */ -void kbase_clean_caches_done(struct kbase_device *kbdev); - -/** * kbase_instr_hwcnt_sample_done() - Dump complete interrupt received * @kbdev: Kbase device */ diff --git a/mali_kbase/backend/gpu/mali_kbase_jm_hw.c b/mali_kbase/backend/gpu/mali_kbase_jm_hw.c index fee19aa..acd4a5a 100644 --- a/mali_kbase/backend/gpu/mali_kbase_jm_hw.c +++ b/mali_kbase/backend/gpu/mali_kbase_jm_hw.c @@ -31,10 +31,10 @@ #include <mali_kbase_gator.h> #endif #include <mali_kbase_tlstream.h> -#include <mali_kbase_vinstr.h> #include <mali_kbase_hw.h> #include <mali_kbase_hwaccess_jm.h> #include <mali_kbase_ctx_sched.h> +#include <mali_kbase_hwcnt_context.h> #include <backend/gpu/mali_kbase_device_internal.h> #include <backend/gpu/mali_kbase_irq_internal.h> #include <backend/gpu/mali_kbase_jm_internal.h> @@ -42,11 +42,9 @@ #define beenthere(kctx, f, a...) \ dev_dbg(kctx->kbdev->dev, "%s:" f, __func__, ##a) -#if KBASE_GPU_RESET_EN static void kbasep_try_reset_gpu_early(struct kbase_device *kbdev); static void kbasep_reset_timeout_worker(struct work_struct *data); static enum hrtimer_restart kbasep_reset_timer_callback(struct hrtimer *timer); -#endif /* KBASE_GPU_RESET_EN */ static inline int kbasep_jm_is_js_free(struct kbase_device *kbdev, int js, struct kbase_context *kctx) @@ -77,7 +75,7 @@ static u64 kbase_job_write_affinity(struct kbase_device *kbdev, struct mali_base_gpu_coherent_group_info *coherency_info = &kbdev->gpu_props.props.coherency_info; - affinity = kbase_pm_ca_get_core_mask(kbdev) & + affinity = kbdev->pm.backend.shaders_avail & kbdev->pm.debug_core_mask[js]; /* JS2 on a dual core group system targets core group 1. All @@ -89,7 +87,7 @@ static u64 kbase_job_write_affinity(struct kbase_device *kbdev, affinity &= coherency_info->group[0].core_mask; } else { /* Use all cores */ - affinity = kbase_pm_ca_get_core_mask(kbdev) & + affinity = kbdev->pm.backend.shaders_avail & kbdev->pm.debug_core_mask[js]; } @@ -141,6 +139,8 @@ void kbase_job_hw_submit(struct kbase_device *kbdev, if (0 != (katom->core_req & BASE_JD_REQ_SKIP_CACHE_END) && !(kbdev->serialize_jobs & KBASE_SERIALIZE_RESET)) cfg |= JS_CONFIG_END_FLUSH_NO_ACTION; + else if (kbase_hw_has_feature(kbdev, BASE_HW_FEATURE_CLEAN_ONLY_SAFE)) + cfg |= JS_CONFIG_END_FLUSH_CLEAN; else cfg |= JS_CONFIG_END_FLUSH_CLEAN_INVALIDATE; @@ -465,7 +465,6 @@ void kbase_job_done(struct kbase_device *kbdev, u32 done) } spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); -#if KBASE_GPU_RESET_EN if (atomic_read(&kbdev->hwaccess.backend.reset_gpu) == KBASE_RESET_GPU_COMMITTED) { /* If we're trying to reset the GPU then we might be able to do @@ -474,7 +473,6 @@ void kbase_job_done(struct kbase_device *kbdev, u32 done) */ kbasep_try_reset_gpu_early(kbdev); } -#endif /* KBASE_GPU_RESET_EN */ KBASE_TRACE_ADD(kbdev, JM_IRQ_END, NULL, NULL, 0, count); } KBASE_EXPORT_TEST_API(kbase_job_done); @@ -800,7 +798,6 @@ void kbase_jm_wait_for_zero_jobs(struct kbase_context *kctx) if (timeout != 0) goto exit; -#if KBASE_GPU_RESET_EN if (kbase_prepare_to_reset_gpu(kbdev)) { dev_err(kbdev->dev, "Issueing GPU soft-reset because jobs failed to be killed (within %d ms) as part of context termination (e.g. 
process exit)\n", @@ -812,12 +809,6 @@ void kbase_jm_wait_for_zero_jobs(struct kbase_context *kctx) wait_event(kbdev->hwaccess.backend.reset_wait, atomic_read(&kbdev->hwaccess.backend.reset_gpu) == KBASE_RESET_GPU_NOT_PENDING); -#else - dev_warn(kbdev->dev, - "Jobs failed to be killed (within %d ms) as part of context termination (e.g. process exit)\n", - ZAP_TIMEOUT); - -#endif exit: dev_dbg(kbdev->dev, "Zap: Finished Context %p", kctx); @@ -845,7 +836,6 @@ u32 kbase_backend_get_current_flush_id(struct kbase_device *kbdev) int kbase_job_slot_init(struct kbase_device *kbdev) { -#if KBASE_GPU_RESET_EN kbdev->hwaccess.backend.reset_workq = alloc_workqueue( "Mali reset workqueue", 0, 1); if (NULL == kbdev->hwaccess.backend.reset_workq) @@ -858,7 +848,6 @@ int kbase_job_slot_init(struct kbase_device *kbdev) HRTIMER_MODE_REL); kbdev->hwaccess.backend.reset_timer.function = kbasep_reset_timer_callback; -#endif return 0; } @@ -871,13 +860,10 @@ void kbase_job_slot_halt(struct kbase_device *kbdev) void kbase_job_slot_term(struct kbase_device *kbdev) { -#if KBASE_GPU_RESET_EN destroy_workqueue(kbdev->hwaccess.backend.reset_workq); -#endif } KBASE_EXPORT_TEST_API(kbase_job_slot_term); -#if KBASE_GPU_RESET_EN /** * kbasep_check_for_afbc_on_slot() - Check whether AFBC is in use on this slot * @kbdev: kbase device pointer @@ -935,7 +921,6 @@ static bool kbasep_check_for_afbc_on_slot(struct kbase_device *kbdev, return ret; } -#endif /* KBASE_GPU_RESET_EN */ /** * kbase_job_slot_softstop_swflags - Soft-stop a job with flags @@ -992,7 +977,6 @@ void kbase_job_slot_hardstop(struct kbase_context *kctx, int js, { struct kbase_device *kbdev = kctx->kbdev; bool stopped; -#if KBASE_GPU_RESET_EN /* We make the check for AFBC before evicting/stopping atoms. Note * that no other thread can modify the slots whilst we have the * hwaccess_lock. */ @@ -1000,12 +984,10 @@ void kbase_job_slot_hardstop(struct kbase_context *kctx, int js, kbase_hw_has_issue(kbdev, BASE_HW_ISSUE_T76X_3542) && kbasep_check_for_afbc_on_slot(kbdev, kctx, js, target_katom); -#endif stopped = kbase_backend_soft_hard_stop_slot(kbdev, kctx, js, target_katom, JS_COMMAND_HARD_STOP); -#if KBASE_GPU_RESET_EN if (stopped && (kbase_hw_has_issue(kctx->kbdev, BASE_HW_ISSUE_8401) || kbase_hw_has_issue(kctx->kbdev, BASE_HW_ISSUE_9510) || needs_workaround_for_afbc)) { @@ -1020,7 +1002,6 @@ void kbase_job_slot_hardstop(struct kbase_context *kctx, int js, kbase_reset_gpu_locked(kbdev); } } -#endif } /** @@ -1085,8 +1066,6 @@ void kbase_job_check_leave_disjoint(struct kbase_device *kbdev, } } - -#if KBASE_GPU_RESET_EN static void kbase_debug_dump_registers(struct kbase_device *kbdev) { int i; @@ -1129,7 +1108,6 @@ static void kbasep_reset_timeout_worker(struct work_struct *data) struct kbase_device *kbdev; ktime_t end_timestamp = ktime_get(); struct kbasep_js_device_data *js_devdata; - bool try_schedule = false; bool silent = false; u32 max_loops = KBASE_CLEAN_CACHE_MAX_LOOPS; @@ -1147,9 +1125,10 @@ static void kbasep_reset_timeout_worker(struct work_struct *data) KBASE_TRACE_ADD(kbdev, JM_BEGIN_RESET_WORKER, NULL, NULL, 0u, 0); - /* Suspend vinstr. - * This call will block until vinstr is suspended. */ - kbase_vinstr_suspend(kbdev->vinstr_ctx); + /* Disable GPU hardware counters. + * This call will block until counters are disabled. 
+ */ + kbase_hwcnt_context_disable(kbdev->hwcnt_gpu_ctx); /* Make sure the timer has completed - this cannot be done from * interrupt context, so this cannot be done within @@ -1164,15 +1143,18 @@ static void kbasep_reset_timeout_worker(struct work_struct *data) KBASE_RESET_GPU_NOT_PENDING); kbase_disjoint_state_down(kbdev); wake_up(&kbdev->hwaccess.backend.reset_wait); - kbase_vinstr_resume(kbdev->vinstr_ctx); + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); + kbase_hwcnt_context_enable(kbdev->hwcnt_gpu_ctx); + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); return; } KBASE_DEBUG_ASSERT(kbdev->irq_reset_flush == false); - spin_lock_irqsave(&kbdev->hwcnt.lock, flags); - spin_lock(&kbdev->hwaccess_lock); + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); spin_lock(&kbdev->mmu_mask_change); + kbase_pm_reset_start_locked(kbdev); + /* We're about to flush out the IRQs and their bottom half's */ kbdev->irq_reset_flush = true; @@ -1181,8 +1163,7 @@ static void kbasep_reset_timeout_worker(struct work_struct *data) kbase_pm_disable_interrupts_nolock(kbdev); spin_unlock(&kbdev->mmu_mask_change); - spin_unlock(&kbdev->hwaccess_lock); - spin_unlock_irqrestore(&kbdev->hwcnt.lock, flags); + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); /* Ensure that any IRQ handlers have finished * Must be done without any locks IRQ handlers will take */ @@ -1244,37 +1225,33 @@ static void kbasep_reset_timeout_worker(struct work_struct *data) kbase_pm_enable_interrupts(kbdev); - atomic_set(&kbdev->hwaccess.backend.reset_gpu, - KBASE_RESET_GPU_NOT_PENDING); - kbase_disjoint_state_down(kbdev); - wake_up(&kbdev->hwaccess.backend.reset_wait); - if (!silent) - dev_err(kbdev->dev, "Reset complete"); - - if (js_devdata->nr_contexts_pullable > 0 && !kbdev->poweroff_pending) - try_schedule = true; - mutex_unlock(&js_devdata->runpool_mutex); mutex_lock(&kbdev->pm.lock); + kbase_pm_reset_complete(kbdev); + /* Find out what cores are required now */ kbase_pm_update_cores_state(kbdev); /* Synchronously request and wait for those cores, because if * instrumentation is enabled it would need them immediately. 
*/ - kbase_pm_check_transitions_sync(kbdev); + kbase_pm_wait_for_desired_state(kbdev); mutex_unlock(&kbdev->pm.lock); + atomic_set(&kbdev->hwaccess.backend.reset_gpu, + KBASE_RESET_GPU_NOT_PENDING); + + wake_up(&kbdev->hwaccess.backend.reset_wait); + if (!silent) + dev_err(kbdev->dev, "Reset complete"); + /* Try submitting some jobs to restart processing */ - if (try_schedule) { - KBASE_TRACE_ADD(kbdev, JM_SUBMIT_AFTER_RESET, NULL, NULL, 0u, - 0); - kbase_js_sched_all(kbdev); - } + KBASE_TRACE_ADD(kbdev, JM_SUBMIT_AFTER_RESET, NULL, NULL, 0u, 0); + kbase_js_sched_all(kbdev); /* Process any pending slot updates */ spin_lock_irqsave(&kbdev->hwaccess_lock, flags); @@ -1283,8 +1260,10 @@ static void kbasep_reset_timeout_worker(struct work_struct *data) kbase_pm_context_idle(kbdev); - /* Release vinstr */ - kbase_vinstr_resume(kbdev->vinstr_ctx); + /* Re-enable GPU hardware counters */ + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); + kbase_hwcnt_context_enable(kbdev->hwcnt_gpu_ctx); + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); KBASE_TRACE_ADD(kbdev, JM_END_RESET_WORKER, NULL, NULL, 0u, 0); } @@ -1458,20 +1437,22 @@ void kbase_reset_gpu_locked(struct kbase_device *kbdev) kbasep_try_reset_gpu_early_locked(kbdev); } -void kbase_reset_gpu_silent(struct kbase_device *kbdev) +int kbase_reset_gpu_silent(struct kbase_device *kbdev) { if (atomic_cmpxchg(&kbdev->hwaccess.backend.reset_gpu, KBASE_RESET_GPU_NOT_PENDING, KBASE_RESET_GPU_SILENT) != KBASE_RESET_GPU_NOT_PENDING) { /* Some other thread is already resetting the GPU */ - return; + return -EAGAIN; } kbase_disjoint_state_up(kbdev); queue_work(kbdev->hwaccess.backend.reset_workq, &kbdev->hwaccess.backend.reset_work); + + return 0; } bool kbase_reset_gpu_active(struct kbase_device *kbdev) @@ -1482,4 +1463,3 @@ bool kbase_reset_gpu_active(struct kbase_device *kbdev) return true; } -#endif /* KBASE_GPU_RESET_EN */ diff --git a/mali_kbase/backend/gpu/mali_kbase_jm_internal.h b/mali_kbase/backend/gpu/mali_kbase_jm_internal.h index 831491e..452ddee 100644 --- a/mali_kbase/backend/gpu/mali_kbase_jm_internal.h +++ b/mali_kbase/backend/gpu/mali_kbase_jm_internal.h @@ -159,31 +159,11 @@ void kbase_job_slot_halt(struct kbase_device *kbdev); void kbase_job_slot_term(struct kbase_device *kbdev); /** - * kbase_gpu_cacheclean - Cause a GPU cache clean & flush + * kbase_gpu_cache_clean - Cause a GPU cache clean & flush * @kbdev: Device pointer * * Caller must not be in IRQ context */ -void kbase_gpu_cacheclean(struct kbase_device *kbdev); - -static inline bool kbase_atom_needs_tiler(struct kbase_device *kbdev, - base_jd_core_req core_req) -{ - return core_req & BASE_JD_REQ_T; -} - -static inline bool kbase_atom_needs_shaders(struct kbase_device *kbdev, - base_jd_core_req core_req) -{ - if (!kbase_hw_has_feature(kbdev, BASE_HW_FEATURE_XAFFINITY)) - return true; - if ((core_req & (BASE_JD_REQ_FS | BASE_JD_REQ_CS | BASE_JD_REQ_T)) == - BASE_JD_REQ_T) { - /* Tiler only atom */ - return false; - } - - return true; -} +void kbase_gpu_cache_clean(struct kbase_device *kbdev); #endif /* _KBASE_JM_HWACCESS_H_ */ diff --git a/mali_kbase/backend/gpu/mali_kbase_jm_rb.c b/mali_kbase/backend/gpu/mali_kbase_jm_rb.c index bdb94be..c714582 100644 --- a/mali_kbase/backend/gpu/mali_kbase_jm_rb.c +++ b/mali_kbase/backend/gpu/mali_kbase_jm_rb.c @@ -30,6 +30,7 @@ #include <mali_kbase_jm.h> #include <mali_kbase_js.h> #include <mali_kbase_tlstream.h> +#include <mali_kbase_hwcnt_context.h> #include <mali_kbase_10969_workaround.h> #include 
<backend/gpu/mali_kbase_cache_policy_backend.h> #include <backend/gpu/mali_kbase_device_internal.h> @@ -296,143 +297,14 @@ int kbase_backend_slot_free(struct kbase_device *kbdev, int js) } -static void kbasep_js_job_check_deref_cores(struct kbase_device *kbdev, - struct kbase_jd_atom *katom); - -static bool kbasep_js_job_check_ref_cores(struct kbase_device *kbdev, - int js, - struct kbase_jd_atom *katom) -{ - base_jd_core_req core_req = katom->core_req; - - /* NOTE: The following uses a number of FALLTHROUGHs to optimize the - * calls to this function. Ending of the function is indicated by BREAK - * OUT. - */ - switch (katom->coreref_state) { - /* State when job is first attempted to be run */ - case KBASE_ATOM_COREREF_STATE_NO_CORES_REQUESTED: - /* Request the cores */ - kbase_pm_request_cores(kbdev, - kbase_atom_needs_tiler(kbdev, core_req), - kbase_atom_needs_shaders(kbdev, core_req)); - - /* Proceed to next state */ - katom->coreref_state = - KBASE_ATOM_COREREF_STATE_WAITING_FOR_REQUESTED_CORES; - - /* ***FALLTHROUGH: TRANSITION TO HIGHER STATE*** */ - - case KBASE_ATOM_COREREF_STATE_WAITING_FOR_REQUESTED_CORES: - { - bool cores_ready; - - cores_ready = kbase_pm_cores_requested(kbdev, - kbase_atom_needs_tiler(kbdev, core_req), - kbase_atom_needs_shaders(kbdev, core_req)); - - if (!cores_ready) { - /* Stay in this state and return, to retry at - * this state later. - */ - KBASE_TRACE_ADD_SLOT_INFO(kbdev, - JS_CORE_REF_REGISTER_INUSE_FAILED, - katom->kctx, katom, - katom->jc, js, - (u32) 0); - /* *** BREAK OUT: No state transition *** */ - break; - } - /* Proceed to next state */ - katom->coreref_state = KBASE_ATOM_COREREF_STATE_READY; - /* *** BREAK OUT: Cores Ready *** */ - break; - } - - default: - KBASE_DEBUG_ASSERT_MSG(false, - "Unhandled kbase_atom_coreref_state %d", - katom->coreref_state); - break; - } - - return (katom->coreref_state == KBASE_ATOM_COREREF_STATE_READY); -} - -static void kbasep_js_job_check_deref_cores(struct kbase_device *kbdev, - struct kbase_jd_atom *katom) -{ - base_jd_core_req core_req = katom->core_req; - - KBASE_DEBUG_ASSERT(kbdev != NULL); - KBASE_DEBUG_ASSERT(katom != NULL); - - switch (katom->coreref_state) { - case KBASE_ATOM_COREREF_STATE_READY: - /* State where atom was submitted to the HW - just proceed to - * power-down */ - - /* *** FALLTHROUGH *** */ - - case KBASE_ATOM_COREREF_STATE_WAITING_FOR_REQUESTED_CORES: - /* State where cores were requested */ - kbase_pm_release_cores(kbdev, - kbase_atom_needs_tiler(kbdev, core_req), - kbase_atom_needs_shaders(kbdev, core_req)); - break; - - case KBASE_ATOM_COREREF_STATE_NO_CORES_REQUESTED: - /* Initial state - nothing required */ - break; - - default: - KBASE_DEBUG_ASSERT_MSG(false, - "Unhandled coreref_state: %d", - katom->coreref_state); - break; - } - - katom->coreref_state = KBASE_ATOM_COREREF_STATE_NO_CORES_REQUESTED; -} - -static void kbasep_js_job_check_deref_cores_nokatom(struct kbase_device *kbdev, - base_jd_core_req core_req, - enum kbase_atom_coreref_state coreref_state) -{ - KBASE_DEBUG_ASSERT(kbdev != NULL); - - switch (coreref_state) { - case KBASE_ATOM_COREREF_STATE_READY: - /* State where atom was submitted to the HW - just proceed to - * power-down */ - - /* *** FALLTHROUGH *** */ - - case KBASE_ATOM_COREREF_STATE_WAITING_FOR_REQUESTED_CORES: - /* State where cores were requested */ - kbase_pm_release_cores(kbdev, - kbase_atom_needs_tiler(kbdev, core_req), - kbase_atom_needs_shaders(kbdev, core_req)); - break; - - case KBASE_ATOM_COREREF_STATE_NO_CORES_REQUESTED: - /* Initial state - 
nothing required */ - break; - - default: - KBASE_DEBUG_ASSERT_MSG(false, - "Unhandled coreref_state: %d", - coreref_state); - break; - } -} - static void kbase_gpu_release_atom(struct kbase_device *kbdev, struct kbase_jd_atom *katom, ktime_t *end_timestamp) { struct kbase_context *kctx = katom->kctx; + lockdep_assert_held(&kbdev->hwaccess_lock); + switch (katom->gpu_rb_state) { case KBASE_ATOM_GPU_RB_NOT_IN_SLOT_RB: /* Should be impossible */ @@ -468,26 +340,47 @@ static void kbase_gpu_release_atom(struct kbase_device *kbdev, break; case KBASE_ATOM_GPU_RB_WAITING_PROTECTED_MODE_TRANSITION: + if (kbase_jd_katom_is_protected(katom) && + (katom->protected_state.enter != + KBASE_ATOM_ENTER_PROTECTED_CHECK) && + (katom->protected_state.enter != + KBASE_ATOM_ENTER_PROTECTED_HWCNT)) + kbase_pm_protected_override_disable(kbdev); + if (!kbase_jd_katom_is_protected(katom) && + (katom->protected_state.exit != + KBASE_ATOM_EXIT_PROTECTED_CHECK) && + (katom->protected_state.exit != + KBASE_ATOM_EXIT_PROTECTED_RESET_WAIT)) + kbase_pm_protected_override_disable(kbdev); + if (katom->protected_state.enter != KBASE_ATOM_ENTER_PROTECTED_CHECK || katom->protected_state.exit != KBASE_ATOM_EXIT_PROTECTED_CHECK) kbdev->protected_mode_transition = false; - + /* If the atom has suspended hwcnt but has not yet entered + * protected mode, then resume hwcnt now. If the GPU is now in + * protected mode then hwcnt will be resumed by GPU reset so + * don't resume it here. + */ if (kbase_jd_katom_is_protected(katom) && ((katom->protected_state.enter == KBASE_ATOM_ENTER_PROTECTED_IDLE_L2) || (katom->protected_state.enter == - KBASE_ATOM_ENTER_PROTECTED_SET_COHERENCY) || - (katom->protected_state.enter == - KBASE_ATOM_ENTER_PROTECTED_FINISHED))) { - kbase_vinstr_resume(kbdev->vinstr_ctx); + KBASE_ATOM_ENTER_PROTECTED_SET_COHERENCY))) { + WARN_ON(!kbdev->protected_mode_hwcnt_disabled); + kbdev->protected_mode_hwcnt_desired = true; + if (kbdev->protected_mode_hwcnt_disabled) { + kbase_hwcnt_context_enable( + kbdev->hwcnt_gpu_ctx); + kbdev->protected_mode_hwcnt_disabled = false; + } } if (kbase_hw_has_issue(kbdev, BASE_HW_ISSUE_TGOX_R1_1234)) { if (katom->atom_flags & KBASE_KATOM_FLAG_HOLDING_L2_REF_PROT) { - kbdev->l2_users_count--; + kbase_pm_protected_l2_override(kbdev, false); katom->atom_flags &= ~KBASE_KATOM_FLAG_HOLDING_L2_REF_PROT; } @@ -512,6 +405,8 @@ static void kbase_gpu_release_atom(struct kbase_device *kbdev, static void kbase_gpu_mark_atom_for_return(struct kbase_device *kbdev, struct kbase_jd_atom *katom) { + lockdep_assert_held(&kbdev->hwaccess_lock); + kbase_gpu_release_atom(kbdev, katom, NULL); katom->gpu_rb_state = KBASE_ATOM_GPU_RB_RETURN_TO_JS; } @@ -630,9 +525,7 @@ static int kbase_gpu_protected_mode_reset(struct kbase_device *kbdev) /* The protected mode disable callback will be called as part of reset */ - kbase_reset_gpu_silent(kbdev); - - return 0; + return kbase_reset_gpu_silent(kbdev); } static int kbase_jm_protected_entry(struct kbase_device *kbdev, @@ -640,6 +533,8 @@ static int kbase_jm_protected_entry(struct kbase_device *kbdev, { int err = 0; + lockdep_assert_held(&kbdev->hwaccess_lock); + err = kbase_gpu_protected_mode_enter(kbdev); /* @@ -648,14 +543,23 @@ static int kbase_jm_protected_entry(struct kbase_device *kbdev, */ kbdev->protected_mode_transition = false; + kbase_pm_protected_override_disable(kbdev); + kbase_pm_update_cores_state_nolock(kbdev); KBASE_TLSTREAM_AUX_PROTECTED_ENTER_END(kbdev); if (err) { /* * Failed to switch into protected mode, resume - * vinstr core and fail atom. 
+ * GPU hwcnt and fail atom. */ - kbase_vinstr_resume(kbdev->vinstr_ctx); + WARN_ON(!kbdev->protected_mode_hwcnt_disabled); + kbdev->protected_mode_hwcnt_desired = true; + if (kbdev->protected_mode_hwcnt_disabled) { + kbase_hwcnt_context_enable( + kbdev->hwcnt_gpu_ctx); + kbdev->protected_mode_hwcnt_disabled = false; + } + katom[idx]->event_code = BASE_JD_EVENT_JOB_INVALID; kbase_gpu_mark_atom_for_return(kbdev, katom[idx]); /* @@ -692,6 +596,8 @@ static int kbase_jm_enter_protected_mode(struct kbase_device *kbdev, { int err = 0; + lockdep_assert_held(&kbdev->hwaccess_lock); + switch (katom[idx]->protected_state.enter) { case KBASE_ATOM_ENTER_PROTECTED_CHECK: KBASE_TLSTREAM_AUX_PROTECTED_ENTER_START(kbdev); @@ -700,25 +606,41 @@ static int kbase_jm_enter_protected_mode(struct kbase_device *kbdev, * there are no atoms currently on the GPU. */ WARN_ON(kbdev->protected_mode_transition); WARN_ON(kbase_gpu_atoms_submitted_any(kbdev)); + /* If hwcnt is disabled, it means we didn't clean up correctly + * during last exit from protected mode. + */ + WARN_ON(kbdev->protected_mode_hwcnt_disabled); - kbdev->protected_mode_transition = true; katom[idx]->protected_state.enter = - KBASE_ATOM_ENTER_PROTECTED_VINSTR; + KBASE_ATOM_ENTER_PROTECTED_HWCNT; + + kbdev->protected_mode_transition = true; /* ***FALLTHROUGH: TRANSITION TO HIGHER STATE*** */ - case KBASE_ATOM_ENTER_PROTECTED_VINSTR: - if (kbase_vinstr_try_suspend(kbdev->vinstr_ctx) < 0) { - /* - * We can't switch now because - * the vinstr core state switch - * is not done yet. - */ + case KBASE_ATOM_ENTER_PROTECTED_HWCNT: + /* See if we can get away with disabling hwcnt atomically */ + kbdev->protected_mode_hwcnt_desired = false; + if (!kbdev->protected_mode_hwcnt_disabled) { + if (kbase_hwcnt_context_disable_atomic( + kbdev->hwcnt_gpu_ctx)) + kbdev->protected_mode_hwcnt_disabled = true; + } + + /* We couldn't disable atomically, so kick off a worker */ + if (!kbdev->protected_mode_hwcnt_disabled) { +#if KERNEL_VERSION(3, 16, 0) > LINUX_VERSION_CODE + queue_work(system_wq, + &kbdev->protected_mode_hwcnt_disable_work); +#else + queue_work(system_highpri_wq, + &kbdev->protected_mode_hwcnt_disable_work); +#endif return -EAGAIN; } /* Once reaching this point GPU must be - * switched to protected mode or vinstr + * switched to protected mode or hwcnt * re-enabled. */ /* @@ -729,6 +651,7 @@ static int kbase_jm_enter_protected_mode(struct kbase_device *kbdev, katom[idx]->protected_state.enter = KBASE_ATOM_ENTER_PROTECTED_IDLE_L2; + kbase_pm_protected_override_enable(kbdev); kbase_pm_update_cores_state_nolock(kbdev); /* ***FALLTHROUGH: TRANSITION TO HIGHER STATE*** */ @@ -764,7 +687,8 @@ static int kbase_jm_enter_protected_mode(struct kbase_device *kbdev, * Power on L2 caches; this will also result in the * correct value written to coherency enable register. */ - kbase_pm_request_l2_caches_nolock(kbdev); + kbase_pm_protected_l2_override(kbdev, true); + /* * Set the flag on the atom that additional * L2 references are taken. @@ -787,14 +711,15 @@ static int kbase_jm_enter_protected_mode(struct kbase_device *kbdev, * Check that L2 caches are powered and, if so, * enter protected mode. */ - if (kbdev->pm.backend.l2_powered != 0) { + if (kbdev->pm.backend.l2_state == KBASE_L2_ON) { /* * Remove additional L2 reference and reset * the atom flag which denotes it. 
*/ if (katom[idx]->atom_flags & KBASE_KATOM_FLAG_HOLDING_L2_REF_PROT) { - kbdev->l2_users_count--; + kbase_pm_protected_l2_override(kbdev, + false); katom[idx]->atom_flags &= ~KBASE_KATOM_FLAG_HOLDING_L2_REF_PROT; } @@ -825,6 +750,7 @@ static int kbase_jm_exit_protected_mode(struct kbase_device *kbdev, { int err = 0; + lockdep_assert_held(&kbdev->hwaccess_lock); switch (katom[idx]->protected_state.exit) { case KBASE_ATOM_EXIT_PROTECTED_CHECK: @@ -844,6 +770,7 @@ static int kbase_jm_exit_protected_mode(struct kbase_device *kbdev, KBASE_ATOM_EXIT_PROTECTED_IDLE_L2; kbdev->protected_mode_transition = true; + kbase_pm_protected_override_enable(kbdev); kbase_pm_update_cores_state_nolock(kbdev); /* ***FALLTHROUGH: TRANSITION TO HIGHER STATE*** */ @@ -865,8 +792,12 @@ static int kbase_jm_exit_protected_mode(struct kbase_device *kbdev, /* Issue the reset to the GPU */ err = kbase_gpu_protected_mode_reset(kbdev); + if (err == -EAGAIN) + return -EAGAIN; + if (err) { kbdev->protected_mode_transition = false; + kbase_pm_protected_override_disable(kbdev); /* Failed to exit protected mode, fail atom */ katom[idx]->event_code = BASE_JD_EVENT_JOB_INVALID; @@ -880,7 +811,16 @@ static int kbase_jm_exit_protected_mode(struct kbase_device *kbdev, kbase_jm_return_atom_to_js(kbdev, katom[idx]); } - kbase_vinstr_resume(kbdev->vinstr_ctx); + /* If we're exiting from protected mode, hwcnt must have + * been disabled during entry. + */ + WARN_ON(!kbdev->protected_mode_hwcnt_disabled); + kbdev->protected_mode_hwcnt_desired = true; + if (kbdev->protected_mode_hwcnt_disabled) { + kbase_hwcnt_context_enable( + kbdev->hwcnt_gpu_ctx); + kbdev->protected_mode_hwcnt_disabled = false; + } return -EINVAL; } @@ -909,6 +849,9 @@ void kbase_backend_slot_update(struct kbase_device *kbdev) lockdep_assert_held(&kbdev->hwaccess_lock); + if (kbase_reset_gpu_active(kbdev)) + return; + for (js = 0; js < kbdev->gpu_props.num_job_slots; js++) { struct kbase_jd_atom *katom[2]; int idx; @@ -1014,9 +957,8 @@ void kbase_backend_slot_update(struct kbase_device *kbdev) break; } - cores_ready = - kbasep_js_job_check_ref_cores(kbdev, js, - katom[idx]); + cores_ready = kbase_pm_cores_requested(kbdev, + true); if (katom[idx]->event_code == BASE_JD_EVENT_PM_EVENT) { @@ -1204,19 +1146,11 @@ void kbase_gpu_complete_hw(struct kbase_device *kbdev, int js, * corruption we need to flush the cache manually before any * affected memory gets reused. 
*/ katom->need_cache_flush_cores_retained = true; - kbase_pm_request_cores(kbdev, - kbase_atom_needs_tiler(kbdev, katom->core_req), - kbase_atom_needs_shaders(kbdev, - katom->core_req)); } else if (kbase_hw_has_issue(kbdev, BASE_HW_ISSUE_10676)) { if (kbdev->gpu_props.num_core_groups > 1 && katom->device_nr >= 1) { dev_info(kbdev->dev, "JD: Flushing cache due to PRLAM-10676\n"); katom->need_cache_flush_cores_retained = true; - kbase_pm_request_cores(kbdev, - kbase_atom_needs_tiler(kbdev, katom->core_req), - kbase_atom_needs_shaders(kbdev, - katom->core_req)); } } @@ -1408,10 +1342,6 @@ void kbase_backend_reset(struct kbase_device *kbdev, ktime_t *end_timestamp) break; if (katom->protected_state.exit == KBASE_ATOM_EXIT_PROTECTED_RESET_WAIT) { - KBASE_TLSTREAM_AUX_PROTECTED_LEAVE_END(kbdev); - - kbase_vinstr_resume(kbdev->vinstr_ctx); - /* protected mode sanity checks */ KBASE_DEBUG_ASSERT_MSG( kbase_jd_katom_is_protected(katom) == kbase_gpu_in_protected_mode(kbdev), @@ -1434,8 +1364,6 @@ void kbase_backend_reset(struct kbase_device *kbdev, ktime_t *end_timestamp) * it will be processed again from the starting state. */ if (keep_in_jm_rb) { - kbasep_js_job_check_deref_cores(kbdev, katom); - katom->coreref_state = KBASE_ATOM_COREREF_STATE_NO_CORES_REQUESTED; katom->protected_state.exit = KBASE_ATOM_EXIT_PROTECTED_CHECK; /* As the atom was not removed, increment the * index so that we read the correct atom in the @@ -1454,7 +1382,19 @@ void kbase_backend_reset(struct kbase_device *kbdev, ktime_t *end_timestamp) } } + /* Re-enable GPU hardware counters if we're resetting from protected + * mode. + */ + kbdev->protected_mode_hwcnt_desired = true; + if (kbdev->protected_mode_hwcnt_disabled) { + kbase_hwcnt_context_enable(kbdev->hwcnt_gpu_ctx); + kbdev->protected_mode_hwcnt_disabled = false; + + KBASE_TLSTREAM_AUX_PROTECTED_LEAVE_END(kbdev); + } + kbdev->protected_mode_transition = false; + kbase_pm_protected_override_disable(kbdev); } static inline void kbase_gpu_stop_atom(struct kbase_device *kbdev, @@ -1475,6 +1415,8 @@ static inline void kbase_gpu_remove_atom(struct kbase_device *kbdev, u32 action, bool disjoint) { + lockdep_assert_held(&kbdev->hwaccess_lock); + katom->event_code = BASE_JD_EVENT_REMOVED_FROM_NEXT; kbase_gpu_mark_atom_for_return(kbdev, katom); katom->kctx->blocked_js[katom->slot_nr][katom->sched_priority] = true; @@ -1698,52 +1640,13 @@ bool kbase_backend_soft_hard_stop_slot(struct kbase_device *kbdev, return ret; } -void kbase_gpu_cacheclean(struct kbase_device *kbdev) -{ - /* Limit the number of loops to avoid a hang if the interrupt is missed - */ - u32 max_loops = KBASE_CLEAN_CACHE_MAX_LOOPS; - - mutex_lock(&kbdev->cacheclean_lock); - - /* use GPU_COMMAND completion solution */ - /* clean & invalidate the caches */ - KBASE_TRACE_ADD(kbdev, CORE_GPU_CLEAN_INV_CACHES, NULL, NULL, 0u, 0); - kbase_reg_write(kbdev, GPU_CONTROL_REG(GPU_COMMAND), - GPU_COMMAND_CLEAN_INV_CACHES); - - /* wait for cache flush to complete before continuing */ - while (--max_loops && - (kbase_reg_read(kbdev, GPU_CONTROL_REG(GPU_IRQ_RAWSTAT)) & - CLEAN_CACHES_COMPLETED) == 0) - ; - - /* clear the CLEAN_CACHES_COMPLETED irq */ - KBASE_TRACE_ADD(kbdev, CORE_GPU_IRQ_CLEAR, NULL, NULL, 0u, - CLEAN_CACHES_COMPLETED); - kbase_reg_write(kbdev, GPU_CONTROL_REG(GPU_IRQ_CLEAR), - CLEAN_CACHES_COMPLETED); - KBASE_DEBUG_ASSERT_MSG(kbdev->hwcnt.backend.state != - KBASE_INSTR_STATE_CLEANING, - "Instrumentation code was cleaning caches, but Job Management code cleared their IRQ - Instrumentation code will now hang."); 
- - mutex_unlock(&kbdev->cacheclean_lock); -} - -void kbase_backend_cacheclean(struct kbase_device *kbdev, +void kbase_backend_cache_clean(struct kbase_device *kbdev, struct kbase_jd_atom *katom) { if (katom->need_cache_flush_cores_retained) { - unsigned long flags; - - kbase_gpu_cacheclean(kbdev); + kbase_gpu_start_cache_clean(kbdev); + kbase_gpu_wait_cache_clean(kbdev); - spin_lock_irqsave(&kbdev->hwaccess_lock, flags); - kbase_pm_release_cores(kbdev, - kbase_atom_needs_tiler(kbdev, katom->core_req), - kbase_atom_needs_shaders(kbdev, - katom->core_req)); - spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); katom->need_cache_flush_cores_retained = false; } } @@ -1755,7 +1658,7 @@ void kbase_backend_complete_wq(struct kbase_device *kbdev, * If cache flush required due to HW workaround then perform the flush * now */ - kbase_backend_cacheclean(kbdev, katom); + kbase_backend_cache_clean(kbdev, katom); if (kbase_hw_has_issue(kbdev, BASE_HW_ISSUE_10969) && (katom->core_req & BASE_JD_REQ_FS) && @@ -1774,24 +1677,11 @@ void kbase_backend_complete_wq(struct kbase_device *kbdev, katom->atom_flags |= KBASE_KATOM_FLAGS_RERUN; } } - - /* Clear the coreref_state now - while check_deref_cores() may not have - * been called yet, the caller will have taken a copy of this field. If - * this is not done, then if the atom is re-scheduled (following a soft - * stop) then the core reference would not be retaken. */ - katom->coreref_state = KBASE_ATOM_COREREF_STATE_NO_CORES_REQUESTED; } void kbase_backend_complete_wq_post_sched(struct kbase_device *kbdev, - base_jd_core_req core_req, - enum kbase_atom_coreref_state coreref_state) + base_jd_core_req core_req) { - unsigned long flags; - - spin_lock_irqsave(&kbdev->hwaccess_lock, flags); - kbasep_js_job_check_deref_cores_nokatom(kbdev, core_req, coreref_state); - spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); - if (!kbdev->pm.active_count) { mutex_lock(&kbdev->js_data.runpool_mutex); mutex_lock(&kbdev->pm.lock); @@ -1830,6 +1720,3 @@ void kbase_gpu_dump_slots(struct kbase_device *kbdev) spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); } - - - diff --git a/mali_kbase/backend/gpu/mali_kbase_js_backend.c b/mali_kbase/backend/gpu/mali_kbase_js_backend.c index 205a31d..7307be4 100644 --- a/mali_kbase/backend/gpu/mali_kbase_js_backend.c +++ b/mali_kbase/backend/gpu/mali_kbase_js_backend.c @@ -250,14 +250,12 @@ static enum hrtimer_restart timer_callback(struct hrtimer *timer) } } } -#if KBASE_GPU_RESET_EN if (reset_needed) { dev_err(kbdev->dev, "JS: Job has been on the GPU for too long (JS_RESET_TICKS_SS/DUMPING timeout hit). 
Issueing GPU soft-reset to resolve."); if (kbase_prepare_to_reset_gpu_locked(kbdev)) kbase_reset_gpu_locked(kbdev); } -#endif /* KBASE_GPU_RESET_EN */ /* the timer is re-issued if there is contexts in the run-pool */ if (backend->timer_running) diff --git a/mali_kbase/backend/gpu/mali_kbase_mmu_hw_direct.c b/mali_kbase/backend/gpu/mali_kbase_mmu_hw_direct.c index f3487d9..ba5bf72 100644 --- a/mali_kbase/backend/gpu/mali_kbase_mmu_hw_direct.c +++ b/mali_kbase/backend/gpu/mali_kbase_mmu_hw_direct.c @@ -242,16 +242,20 @@ void kbase_mmu_interrupt(struct kbase_device *kbdev, u32 irq_stat) void kbase_mmu_hw_configure(struct kbase_device *kbdev, struct kbase_as *as) { struct kbase_mmu_setup *current_setup = &as->current_setup; - u32 transcfg = 0; + u64 transcfg = 0; if (kbase_hw_has_feature(kbdev, BASE_HW_FEATURE_AARCH64_MMU)) { - transcfg = current_setup->transcfg & 0xFFFFFFFFUL; + transcfg = current_setup->transcfg; /* Set flag AS_TRANSCFG_PTW_MEMATTR_WRITE_BACK */ /* Clear PTW_MEMATTR bits */ transcfg &= ~AS_TRANSCFG_PTW_MEMATTR_MASK; /* Enable correct PTW_MEMATTR bits */ transcfg |= AS_TRANSCFG_PTW_MEMATTR_WRITE_BACK; + /* Ensure page-tables reads use read-allocate cache-policy in + * the L2 + */ + transcfg |= AS_TRANSCFG_R_ALLOCATE; if (kbdev->system_coherency == COHERENCY_ACE) { /* Set flag AS_TRANSCFG_PTW_SH_OS (outer shareable) */ @@ -264,7 +268,7 @@ void kbase_mmu_hw_configure(struct kbase_device *kbdev, struct kbase_as *as) kbase_reg_write(kbdev, MMU_AS_REG(as->number, AS_TRANSCFG_LO), transcfg); kbase_reg_write(kbdev, MMU_AS_REG(as->number, AS_TRANSCFG_HI), - (current_setup->transcfg >> 32) & 0xFFFFFFFFUL); + (transcfg >> 32) & 0xFFFFFFFFUL); } else { if (kbdev->system_coherency == COHERENCY_ACE) current_setup->transtab |= AS_TRANSTAB_LPAE_SHARE_OUTER; diff --git a/mali_kbase/backend/gpu/mali_kbase_pm_backend.c b/mali_kbase/backend/gpu/mali_kbase_pm_backend.c index a448a3b..c19a0d1 100644 --- a/mali_kbase/backend/gpu/mali_kbase_pm_backend.c +++ b/mali_kbase/backend/gpu/mali_kbase_pm_backend.c @@ -31,11 +31,13 @@ #include <mali_kbase_pm.h> #include <mali_kbase_hwaccess_jm.h> +#include <mali_kbase_hwcnt_context.h> #include <backend/gpu/mali_kbase_js_internal.h> #include <backend/gpu/mali_kbase_pm_internal.h> #include <backend/gpu/mali_kbase_jm_internal.h> static void kbase_pm_gpu_poweroff_wait_wq(struct work_struct *data); +static void kbase_pm_hwcnt_disable_worker(struct work_struct *data); int kbase_pm_runtime_init(struct kbase_device *kbdev) { @@ -112,7 +114,7 @@ void kbase_pm_register_access_disable(struct kbase_device *kbdev) kbdev->pm.backend.gpu_powered = false; } -int kbase_hwaccess_pm_init(struct kbase_device *kbdev) +int kbase_hwaccess_pm_early_init(struct kbase_device *kbdev) { int ret = 0; @@ -128,12 +130,12 @@ int kbase_hwaccess_pm_init(struct kbase_device *kbdev) INIT_WORK(&kbdev->pm.backend.gpu_poweroff_wait_work, kbase_pm_gpu_poweroff_wait_wq); + kbdev->pm.backend.ca_cores_enabled = ~0ull; kbdev->pm.backend.gpu_powered = false; kbdev->pm.suspending = false; #ifdef CONFIG_MALI_DEBUG kbdev->pm.backend.driver_ready_for_irqs = false; #endif /* CONFIG_MALI_DEBUG */ - kbdev->pm.backend.gpu_in_desired_state = true; init_waitqueue_head(&kbdev->pm.backend.gpu_in_desired_state_wait); /* Initialise the metrics subsystem */ @@ -141,9 +143,6 @@ int kbase_hwaccess_pm_init(struct kbase_device *kbdev) if (ret) return ret; - init_waitqueue_head(&kbdev->pm.backend.l2_powered_wait); - kbdev->pm.backend.l2_powered = 0; - init_waitqueue_head(&kbdev->pm.backend.reset_done_wait); 
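In the mali_kbase_mmu_hw_direct.c hunk above, transcfg is widened to u64 so that the modified configuration (including the newly added AS_TRANSCFG_R_ALLOCATE bit) reaches the AS_TRANSCFG_HI write, which previously used the unmodified current_setup->transcfg. As a small illustration (the helper name is hypothetical; kbase_reg_write and the register macros are the ones already used above), a 64-bit value is programmed through a LO/HI pair like this:

/* Hypothetical helper for illustration: split a 64-bit value across a
 * 32-bit LO/HI register pair, as done for AS_TRANSCFG in the hunk above. */
static void example_write_reg64(struct kbase_device *kbdev,
		u32 lo_offset, u32 hi_offset, u64 value)
{
	kbase_reg_write(kbdev, lo_offset, value & 0xFFFFFFFFUL);
	kbase_reg_write(kbdev, hi_offset, (value >> 32) & 0xFFFFFFFFUL);
}

/* e.g. example_write_reg64(kbdev,
 *		MMU_AS_REG(as->number, AS_TRANSCFG_LO),
 *		MMU_AS_REG(as->number, AS_TRANSCFG_HI), transcfg); */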
kbdev->pm.backend.reset_done = false; @@ -161,8 +160,13 @@ int kbase_hwaccess_pm_init(struct kbase_device *kbdev) if (kbase_pm_policy_init(kbdev) != 0) goto pm_policy_fail; + if (kbase_pm_state_machine_init(kbdev) != 0) + goto pm_state_machine_fail; + return 0; +pm_state_machine_fail: + kbase_pm_policy_term(kbdev); pm_policy_fail: kbase_pm_ca_term(kbdev); workq_fail: @@ -170,6 +174,19 @@ workq_fail: return -EINVAL; } +int kbase_hwaccess_pm_late_init(struct kbase_device *kbdev) +{ + KBASE_DEBUG_ASSERT(kbdev != NULL); + + kbdev->pm.backend.hwcnt_desired = false; + kbdev->pm.backend.hwcnt_disabled = true; + INIT_WORK(&kbdev->pm.backend.hwcnt_disable_work, + kbase_pm_hwcnt_disable_worker); + kbase_hwcnt_context_disable(kbdev->hwcnt_gpu_ctx); + + return 0; +} + void kbase_pm_do_poweron(struct kbase_device *kbdev, bool is_resume) { lockdep_assert_held(&kbdev->pm.lock); @@ -178,6 +195,17 @@ void kbase_pm_do_poweron(struct kbase_device *kbdev, bool is_resume) * kbase_pm_clock_off() */ kbase_pm_clock_on(kbdev, is_resume); + if (!is_resume) { + unsigned long flags; + + /* Force update of L2 state - if we have abandoned a power off + * then this may be required to power the L2 back on. + */ + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); + kbase_pm_update_state(kbdev); + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + } + /* Update core status as required by the policy */ kbase_pm_update_cores_state(kbdev); @@ -194,36 +222,24 @@ static void kbase_pm_gpu_poweroff_wait_wq(struct work_struct *data) struct kbasep_js_device_data *js_devdata = &kbdev->js_data; unsigned long flags; -#if !PLATFORM_POWER_DOWN_ONLY - /* Wait for power transitions to complete. We do this with no locks held - * so that we don't deadlock with any pending workqueues */ - kbase_pm_check_transitions_sync(kbdev); -#endif /* !PLATFORM_POWER_DOWN_ONLY */ + if (!platform_power_down_only) + /* Wait for power transitions to complete. We do this with no locks held + * so that we don't deadlock with any pending workqueues. + */ + kbase_pm_wait_for_desired_state(kbdev); mutex_lock(&js_devdata->runpool_mutex); mutex_lock(&kbdev->pm.lock); -#if PLATFORM_POWER_DOWN_ONLY - if (kbdev->pm.backend.gpu_powered) { - if (kbase_pm_get_ready_cores(kbdev, KBASE_PM_CORE_L2)) { - /* If L2 cache is powered then we must flush it before - * we power off the GPU. Normally this would have been - * handled when the L2 was powered off. 
*/ - kbase_gpu_cacheclean(kbdev); - } - } -#endif /* PLATFORM_POWER_DOWN_ONLY */ - if (!backend->poweron_required) { -#if !PLATFORM_POWER_DOWN_ONLY - unsigned long flags; + if (!platform_power_down_only) { + unsigned long flags; - spin_lock_irqsave(&kbdev->hwaccess_lock, flags); - WARN_ON(kbdev->l2_available_bitmap || - kbdev->shader_available_bitmap || - kbdev->tiler_available_bitmap); - spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); -#endif /* !PLATFORM_POWER_DOWN_ONLY */ + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); + WARN_ON(backend->shaders_state != KBASE_SHADERS_OFF_CORESTACK_OFF || + backend->l2_state != KBASE_L2_OFF); + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + } /* Disable interrupts and turn the clock off */ if (!kbase_pm_clock_off(kbdev, backend->poweroff_is_suspend)) { @@ -256,6 +272,8 @@ static void kbase_pm_gpu_poweroff_wait_wq(struct work_struct *data) backend->poweroff_wait_in_progress = false; if (backend->poweron_required) { backend->poweron_required = false; + kbdev->pm.backend.l2_desired = true; + kbase_pm_update_state(kbdev); kbase_pm_update_cores_state_nolock(kbdev); kbase_backend_slot_update(kbdev); } @@ -267,6 +285,45 @@ static void kbase_pm_gpu_poweroff_wait_wq(struct work_struct *data) wake_up(&kbdev->pm.backend.poweroff_wait); } +static void kbase_pm_hwcnt_disable_worker(struct work_struct *data) +{ + struct kbase_device *kbdev = container_of(data, struct kbase_device, + pm.backend.hwcnt_disable_work); + struct kbase_pm_device_data *pm = &kbdev->pm; + struct kbase_pm_backend_data *backend = &pm->backend; + unsigned long flags; + + bool do_disable; + + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); + do_disable = !backend->hwcnt_desired && !backend->hwcnt_disabled; + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + + if (!do_disable) + return; + + kbase_hwcnt_context_disable(kbdev->hwcnt_gpu_ctx); + + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); + do_disable = !backend->hwcnt_desired && !backend->hwcnt_disabled; + + if (do_disable) { + /* PM state did not change while we were doing the disable, + * so commit the work we just performed and continue the state + * machine. + */ + backend->hwcnt_disabled = true; + kbase_pm_update_state(kbdev); + } else { + /* PM state was updated while we were doing the disable, + * so we need to undo the disable we just performed. + */ + kbase_hwcnt_context_enable(kbdev->hwcnt_gpu_ctx); + } + + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); +} + void kbase_pm_do_poweroff(struct kbase_device *kbdev, bool is_suspend) { unsigned long flags; @@ -274,29 +331,36 @@ void kbase_pm_do_poweroff(struct kbase_device *kbdev, bool is_suspend) lockdep_assert_held(&kbdev->pm.lock); spin_lock_irqsave(&kbdev->hwaccess_lock, flags); - if (!kbdev->pm.backend.poweroff_wait_in_progress) { - /* Force all cores off */ - kbdev->pm.backend.desired_shader_state = 0; - kbdev->pm.backend.desired_tiler_state = 0; - - /* Force all cores to be unavailable, in the situation where - * transitions are in progress for some cores but not others, - * and kbase_pm_check_transitions_nolock can not immediately - * power off the cores */ - kbdev->shader_available_bitmap = 0; - kbdev->tiler_available_bitmap = 0; - kbdev->l2_available_bitmap = 0; - - kbdev->pm.backend.poweroff_wait_in_progress = true; - kbdev->pm.backend.poweroff_is_suspend = is_suspend; - spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); - /*Kick off wq here. 
Callers will have to wait*/ - queue_work(kbdev->pm.backend.gpu_poweroff_wait_wq, - &kbdev->pm.backend.gpu_poweroff_wait_work); + spin_lock(&kbdev->pm.backend.gpu_powered_lock); + if (!kbdev->pm.backend.gpu_powered) { + spin_unlock(&kbdev->pm.backend.gpu_powered_lock); + goto unlock_hwaccess; } else { - spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + spin_unlock(&kbdev->pm.backend.gpu_powered_lock); } + + if (kbdev->pm.backend.poweroff_wait_in_progress) + goto unlock_hwaccess; + + /* Force all cores off */ + kbdev->pm.backend.shaders_desired = false; + kbdev->pm.backend.l2_desired = false; + + kbdev->pm.backend.poweroff_wait_in_progress = true; + kbdev->pm.backend.poweroff_is_suspend = is_suspend; + kbdev->pm.backend.invoke_poweroff_wait_wq_when_l2_off = true; + + /* l2_desired being false should cause the state machine to + * start powering off the L2. When it actually is powered off, + * the interrupt handler will call kbase_pm_l2_update_state() + * again, which will trigger the kbase_pm_gpu_poweroff_wait_wq. + * Callers of this function will need to wait on poweroff_wait. + */ + kbase_pm_update_state(kbdev); + +unlock_hwaccess: + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); } static bool is_poweroff_in_progress(struct kbase_device *kbdev) @@ -341,8 +405,6 @@ int kbase_hwaccess_pm_powerup(struct kbase_device *kbdev, return ret; } - kbasep_pm_init_core_use_bitmaps(kbdev); - kbdev->pm.debug_core_mask_all = kbdev->pm.debug_core_mask[0] = kbdev->pm.debug_core_mask[1] = kbdev->pm.debug_core_mask[2] = @@ -385,20 +447,20 @@ void kbase_hwaccess_pm_halt(struct kbase_device *kbdev) KBASE_DEBUG_ASSERT(kbdev != NULL); mutex_lock(&kbdev->pm.lock); - kbase_pm_cancel_deferred_poweroff(kbdev); kbase_pm_do_poweroff(kbdev, false); mutex_unlock(&kbdev->pm.lock); } KBASE_EXPORT_TEST_API(kbase_hwaccess_pm_halt); -void kbase_hwaccess_pm_term(struct kbase_device *kbdev) +void kbase_hwaccess_pm_early_term(struct kbase_device *kbdev) { KBASE_DEBUG_ASSERT(kbdev != NULL); KBASE_DEBUG_ASSERT(kbdev->pm.active_count == 0); KBASE_DEBUG_ASSERT(kbdev->pm.backend.gpu_cycle_counter_requests == 0); /* Free any resources the policy allocated */ + kbase_pm_state_machine_term(kbdev); kbase_pm_policy_term(kbdev); kbase_pm_ca_term(kbdev); @@ -408,16 +470,29 @@ void kbase_hwaccess_pm_term(struct kbase_device *kbdev) destroy_workqueue(kbdev->pm.backend.gpu_poweroff_wait_wq); } +void kbase_hwaccess_pm_late_term(struct kbase_device *kbdev) +{ + KBASE_DEBUG_ASSERT(kbdev != NULL); + + cancel_work_sync(&kbdev->pm.backend.hwcnt_disable_work); + + if (kbdev->pm.backend.hwcnt_disabled) { + unsigned long flags; + + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); + kbase_hwcnt_context_enable(kbdev->hwcnt_gpu_ctx); + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + } +} + void kbase_pm_power_changed(struct kbase_device *kbdev) { - bool cores_are_available; unsigned long flags; spin_lock_irqsave(&kbdev->hwaccess_lock, flags); - cores_are_available = kbase_pm_check_transitions_nolock(kbdev); + kbase_pm_update_state(kbdev); - if (cores_are_available) - kbase_backend_slot_update(kbdev); + kbase_backend_slot_update(kbdev); spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); } @@ -455,7 +530,6 @@ void kbase_hwaccess_pm_suspend(struct kbase_device *kbdev) mutex_lock(&js_devdata->runpool_mutex); mutex_lock(&kbdev->pm.lock); - kbase_pm_cancel_deferred_poweroff(kbdev); kbase_pm_do_poweroff(kbdev, true); kbase_backend_timer_suspend(kbdev); diff --git a/mali_kbase/backend/gpu/mali_kbase_pm_ca.c 
b/mali_kbase/backend/gpu/mali_kbase_pm_ca.c index d4e8e42..2cb9452 100644 --- a/mali_kbase/backend/gpu/mali_kbase_pm_ca.c +++ b/mali_kbase/backend/gpu/mali_kbase_pm_ca.c @@ -30,15 +30,15 @@ int kbase_pm_ca_init(struct kbase_device *kbdev) { - struct kbase_pm_backend_data *pm_backend = &kbdev->pm.backend; #ifdef CONFIG_MALI_DEVFREQ + struct kbase_pm_backend_data *pm_backend = &kbdev->pm.backend; + if (kbdev->current_core_mask) pm_backend->ca_cores_enabled = kbdev->current_core_mask; else pm_backend->ca_cores_enabled = kbdev->gpu_props.props.raw_props.shader_present; #endif - pm_backend->ca_in_transition = false; return 0; } @@ -55,10 +55,17 @@ void kbase_devfreq_set_core_mask(struct kbase_device *kbdev, u64 core_mask) spin_lock_irqsave(&kbdev->hwaccess_lock, flags); + if (!(core_mask & kbdev->pm.debug_core_mask_all)) { + dev_err(kbdev->dev, "OPP core mask 0x%llX does not intersect with debug mask 0x%llX\n", + core_mask, kbdev->pm.debug_core_mask_all); + goto unlock; + } + pm_backend->ca_cores_enabled = core_mask; - kbase_pm_update_cores_state_nolock(kbdev); + kbase_pm_update_state(kbdev); +unlock: spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); dev_dbg(kbdev->dev, "Devfreq policy : new core mask=%llX\n", @@ -89,19 +96,12 @@ KBASE_EXPORT_TEST_API(kbase_pm_ca_get_core_mask); void kbase_pm_ca_instr_enable(struct kbase_device *kbdev) { - unsigned long flags; - - spin_lock_irqsave(&kbdev->hwaccess_lock, flags); + lockdep_assert_held(&kbdev->hwaccess_lock); kbdev->pm.backend.instr_enabled = true; - - kbase_pm_update_cores_state_nolock(kbdev); - spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); } void kbase_pm_ca_instr_disable(struct kbase_device *kbdev) { lockdep_assert_held(&kbdev->hwaccess_lock); kbdev->pm.backend.instr_enabled = false; - - kbase_pm_update_cores_state_nolock(kbdev); } diff --git a/mali_kbase/backend/gpu/mali_kbase_pm_ca.h b/mali_kbase/backend/gpu/mali_kbase_pm_ca.h index 2b005c9..274581d 100644 --- a/mali_kbase/backend/gpu/mali_kbase_pm_ca.h +++ b/mali_kbase/backend/gpu/mali_kbase_pm_ca.h @@ -1,6 +1,6 @@ /* * - * (C) COPYRIGHT 2011-2015 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2011-2018 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software diff --git a/mali_kbase/backend/gpu/mali_kbase_pm_defs.h b/mali_kbase/backend/gpu/mali_kbase_pm_defs.h index 7fe8eb3..0cff22e 100644 --- a/mali_kbase/backend/gpu/mali_kbase_pm_defs.h +++ b/mali_kbase/backend/gpu/mali_kbase_pm_defs.h @@ -29,10 +29,8 @@ #include "mali_kbase_pm_always_on.h" #include "mali_kbase_pm_coarse_demand.h" -#include "mali_kbase_pm_demand.h" #if !MALI_CUSTOMER_RELEASE -#include "mali_kbase_pm_demand_always_powered.h" -#include "mali_kbase_pm_fast_start.h" +#include "mali_kbase_pm_always_on_demand.h" #endif /* Forward definition - see mali_kbase.h */ @@ -65,6 +63,70 @@ enum kbase_pm_core_type { }; /** + * enum kbase_l2_core_state - The states used for the L2 cache & tiler power + * state machine. 
+ * + * @KBASE_L2_OFF: The L2 cache and tiler are off + * @KBASE_L2_PEND_ON: The L2 cache and tiler are powering on + * @KBASE_L2_ON_HWCNT_ENABLE: The L2 cache and tiler are on, and hwcnt is being + * enabled + * @KBASE_L2_ON: The L2 cache and tiler are on, and hwcnt is enabled + * @KBASE_L2_ON_HWCNT_DISABLE: The L2 cache and tiler are on, and hwcnt is being + * disabled + * @KBASE_L2_POWER_DOWN: The L2 cache and tiler are about to be powered off + * @KBASE_L2_PEND_OFF: The L2 cache and tiler are powering off + * @KBASE_L2_RESET_WAIT: The GPU is resetting, L2 cache and tiler power state + * are unknown + */ +enum kbase_l2_core_state { + KBASE_L2_OFF = 0, + KBASE_L2_PEND_ON, + KBASE_L2_ON_HWCNT_ENABLE, + KBASE_L2_ON, + KBASE_L2_ON_HWCNT_DISABLE, + KBASE_L2_POWER_DOWN, + KBASE_L2_PEND_OFF, + KBASE_L2_RESET_WAIT +}; + +/** + * enum kbase_shader_core_state - The states used for the shaders' state machine. + * + * @KBASE_SHADERS_OFF_CORESTACK_OFF: The shaders and core stacks are off + * @KBASE_SHADERS_OFF_CORESTACK_PEND_ON: The shaders are off, core stacks have + * been requested to power on + * @KBASE_SHADERS_PEND_ON_CORESTACK_ON: Core stacks are on, shaders have been + * requested to power on + * @KBASE_SHADERS_ON_CORESTACK_ON: The shaders and core stacks are on + * @KBASE_SHADERS_WAIT_OFF_CORESTACK_ON: The shaders have been requested to + * power off, but they remain on for the + * duration of the hysteresis timer + * @KBASE_SHADERS_WAIT_FINISHED_CORESTACK_ON: The hysteresis timer has expired + * @KBASE_SHADERS_PEND_OFF_CORESTACK_ON: The core stacks are on, and the shaders + * have been requested to power off + * @KBASE_SHADERS_OFF_CORESTACK_PEND_OFF: The shaders are off, and the core stacks + * have been requested to power off + * @KBASE_SHADERS_OFF_CORESTACK_OFF_TIMER_PEND_OFF: Shaders and corestacks are + * off, but the tick timer + * cancellation is still + * pending. + * @KBASE_SHADERS_RESET_WAIT: The GPU is resetting, shader and core stack power + * states are unknown + */ +enum kbase_shader_core_state { + KBASE_SHADERS_OFF_CORESTACK_OFF = 0, + KBASE_SHADERS_OFF_CORESTACK_PEND_ON, + KBASE_SHADERS_PEND_ON_CORESTACK_ON, + KBASE_SHADERS_ON_CORESTACK_ON, + KBASE_SHADERS_WAIT_OFF_CORESTACK_ON, + KBASE_SHADERS_WAIT_FINISHED_CORESTACK_ON, + KBASE_SHADERS_PEND_OFF_CORESTACK_ON, + KBASE_SHADERS_OFF_CORESTACK_PEND_OFF, + KBASE_SHADERS_OFF_CORESTACK_OFF_TIMER_PEND_OFF, + KBASE_SHADERS_RESET_WAIT +}; + +/** * struct kbasep_pm_metrics - Metrics data collected for use by the power * management framework. * @@ -128,13 +190,39 @@ struct kbasep_pm_metrics_state { #endif }; +/** + * struct kbasep_pm_tick_timer_state - State for the shader hysteresis timer + * @wq: Work queue to wait for the timer to stopped + * @work: Work item which cancels the timer + * @timer: Timer for powering off the shader cores + * @configured_interval: Period of GPU poweroff timer + * @configured_ticks: User-configured number of ticks to wait after the shader + * power down request is received before turning off the cores + * @remaining_ticks: Number of remaining timer ticks until shaders are powered off + * @cancel_queued: True if the cancellation work item has been queued. This is + * required to ensure that it is not queued twice, e.g. after + * a reset, which could cause the timer to be incorrectly + * cancelled later by a delayed workitem. 
+ * @needed: Whether the timer should restart itself + */ +struct kbasep_pm_tick_timer_state { + struct workqueue_struct *wq; + struct work_struct work; + struct hrtimer timer; + + ktime_t configured_interval; + unsigned int configured_ticks; + unsigned int remaining_ticks; + + bool cancel_queued; + bool needed; +}; + union kbase_pm_policy_data { struct kbasep_pm_policy_always_on always_on; struct kbasep_pm_policy_coarse_demand coarse_demand; - struct kbasep_pm_policy_demand demand; #if !MALI_CUSTOMER_RELEASE - struct kbasep_pm_policy_demand_always_powered demand_always_powered; - struct kbasep_pm_policy_fast_start fast_start; + struct kbasep_pm_policy_always_on_demand always_on_demand; #endif }; @@ -147,39 +235,14 @@ union kbase_pm_policy_data { * @pm_current_policy: The policy that is currently actively controlling the * power state. * @pm_policy_data: Private data for current PM policy - * @ca_in_transition: Flag indicating when core availability policy is - * transitioning cores. The core availability policy must - * set this when a change in core availability is occurring. - * power_change_lock must be held when accessing this. * @reset_done: Flag when a reset is complete * @reset_done_wait: Wait queue to wait for changes to @reset_done - * @l2_powered_wait: Wait queue for whether the l2 cache has been powered as - * requested - * @l2_powered: State indicating whether all the l2 caches are powered. - * Non-zero indicates they're *all* powered - * Zero indicates that some (or all) are not powered * @gpu_cycle_counter_requests: The reference count of active gpu cycle counter * users * @gpu_cycle_counter_requests_lock: Lock to protect @gpu_cycle_counter_requests - * @desired_shader_state: A bit mask identifying the shader cores that the - * power policy would like to be on. The current state - * of the cores may be different, but there should be - * transitions in progress that will eventually achieve - * this state (assuming that the policy doesn't change - * its mind in the mean time). - * @powering_on_shader_state: A bit mask indicating which shader cores are - * currently in a power-on transition - * @desired_tiler_state: A bit mask identifying the tiler cores that the power - * policy would like to be on. 
See @desired_shader_state - * @powering_on_tiler_state: A bit mask indicating which tiler core are - * currently in a power-on transition - * @powering_on_l2_state: A bit mask indicating which l2-caches are currently - * in a power-on transition - * @powering_on_stack_state: A bit mask indicating which core stacks are - * currently in a power-on transition - * @gpu_in_desired_state: This flag is set if the GPU is powered as requested - * by the desired_xxx_state variables - * @gpu_in_desired_state_wait: Wait queue set when @gpu_in_desired_state != 0 + * @gpu_in_desired_state_wait: Wait queue set when the GPU is in the desired + * state according to the L2 and shader power state + * machines * @gpu_powered: Set to true when the GPU is powered and register * accesses are possible, false otherwise * @instr_enabled: Set to true when instrumentation is enabled, @@ -192,26 +255,12 @@ union kbase_pm_policy_data { * @gpu_powered_lock: Spinlock that must be held when writing @gpu_powered or * accessing @driver_ready_for_irqs * @metrics: Structure to hold metrics for the GPU - * @gpu_poweroff_pending: number of poweroff timer ticks until the GPU is - * powered off - * @shader_poweroff_pending_time: number of poweroff timer ticks until shaders - * and/or timers are powered off - * @gpu_poweroff_timer: Timer for powering off GPU - * @gpu_poweroff_wq: Workqueue to power off GPU on when timer fires - * @gpu_poweroff_work: Workitem used on @gpu_poweroff_wq - * @shader_poweroff_pending: Bit mask of shaders to be powered off on next - * timer callback - * @tiler_poweroff_pending: Bit mask of tilers to be powered off on next timer - * callback - * @poweroff_timer_needed: true if the poweroff timer is currently required, - * false otherwise - * @poweroff_timer_running: true if the poweroff timer is currently running, - * false otherwise - * power_change_lock should be held when accessing, - * unless there is no way the timer can be running (eg - * hrtimer_cancel() was called immediately before) + * @shader_tick_timer: Structure to hold the shader poweroff tick timer state * @poweroff_wait_in_progress: true if a wait for GPU power off is in progress. * hwaccess_lock must be held when accessing + * @invoke_poweroff_wait_wq_when_l2_off: flag indicating that the L2 power state + * machine should invoke the poweroff + * worker after the L2 has turned off. * @poweron_required: true if a GPU power on is required. Should only be set * when poweroff_wait_in_progress is true, and therefore the * GPU can not immediately be powered on. pm.lock must be @@ -236,35 +285,49 @@ union kbase_pm_policy_data { * @callback_power_runtime_idle: Optional callback when the GPU may be idle. See * &struct kbase_pm_callback_conf * @ca_cores_enabled: Cores that are currently available + * @l2_state: The current state of the L2 cache state machine. See + * &enum kbase_l2_core_state + * @l2_desired: True if the L2 cache should be powered on by the L2 cache state + * machine + * @shaders_state: The current state of the shader state machine. + * @shaders_avail: This is updated by the state machine when it is in a state + * where it can handle changes to the core availability. This + * is internal to the shader state machine and should *not* be + * modified elsewhere. + * @shaders_desired: True if the PM active count or power policy requires the + * shader cores to be on. This is used as an input to the + * shader power state machine. 
The current state of the + * cores may be different, but there should be transitions in + * progress that will eventually achieve this state (assuming + * that the policy doesn't change its mind in the mean time). + * @in_reset: True if a GPU is resetting and normal power manager operation is + * suspended + * @protected_transition_override : True if a protected mode transition is in + * progress and is overriding power manager + * behaviour. + * @protected_l2_override : Non-zero if the L2 cache is required during a + * protected mode transition. Has no effect if not + * transitioning. + * @hwcnt_desired: True if we want GPU hardware counters to be enabled. + * @hwcnt_disabled: True if GPU hardware counters are not enabled. + * @hwcnt_disable_work: Work item to disable GPU hardware counters, used if + * atomic disable is not possible. * * Note: * During an IRQ, @pm_current_policy can be NULL when the policy is being * changed with kbase_pm_set_policy(). The change is protected under - * kbase_device.pm.power_change_lock. Direct access to this from IRQ context + * kbase_device.pm.pcower_change_lock. Direct access to this from IRQ context * must therefore check for NULL. If NULL, then kbase_pm_set_policy() will * re-issue the policy functions that would have been done under IRQ. */ struct kbase_pm_backend_data { const struct kbase_pm_policy *pm_current_policy; union kbase_pm_policy_data pm_policy_data; - bool ca_in_transition; bool reset_done; wait_queue_head_t reset_done_wait; - wait_queue_head_t l2_powered_wait; - int l2_powered; int gpu_cycle_counter_requests; spinlock_t gpu_cycle_counter_requests_lock; - u64 desired_shader_state; - u64 powering_on_shader_state; - u64 desired_tiler_state; - u64 powering_on_tiler_state; - u64 powering_on_l2_state; -#ifdef CONFIG_MALI_CORESTACK - u64 powering_on_stack_state; -#endif /* CONFIG_MALI_CORESTACK */ - - bool gpu_in_desired_state; wait_queue_head_t gpu_in_desired_state_wait; bool gpu_powered; @@ -279,23 +342,12 @@ struct kbase_pm_backend_data { spinlock_t gpu_powered_lock; - struct kbasep_pm_metrics_state metrics; - int gpu_poweroff_pending; - int shader_poweroff_pending_time; - - struct hrtimer gpu_poweroff_timer; - struct workqueue_struct *gpu_poweroff_wq; - struct work_struct gpu_poweroff_work; - - u64 shader_poweroff_pending; - u64 tiler_poweroff_pending; - - bool poweroff_timer_needed; - bool poweroff_timer_running; + struct kbasep_pm_tick_timer_state shader_tick_timer; bool poweroff_wait_in_progress; + bool invoke_poweroff_wait_wq_when_l2_off; bool poweron_required; bool poweroff_is_suspend; @@ -312,25 +364,38 @@ struct kbase_pm_backend_data { void (*callback_power_runtime_off)(struct kbase_device *kbdev); int (*callback_power_runtime_idle)(struct kbase_device *kbdev); -#ifdef CONFIG_MALI_DEVFREQ u64 ca_cores_enabled; -#endif + + enum kbase_l2_core_state l2_state; + enum kbase_shader_core_state shaders_state; + u64 shaders_avail; + bool l2_desired; + bool shaders_desired; + + bool in_reset; + + bool protected_transition_override; + int protected_l2_override; + + bool hwcnt_desired; + bool hwcnt_disabled; + struct work_struct hwcnt_disable_work; }; /* List of policy IDs */ enum kbase_pm_policy_id { - KBASE_PM_POLICY_ID_DEMAND = 1, - KBASE_PM_POLICY_ID_ALWAYS_ON, KBASE_PM_POLICY_ID_COARSE_DEMAND, #if !MALI_CUSTOMER_RELEASE - KBASE_PM_POLICY_ID_DEMAND_ALWAYS_POWERED, - KBASE_PM_POLICY_ID_FAST_START + KBASE_PM_POLICY_ID_ALWAYS_ON_DEMAND, #endif + KBASE_PM_POLICY_ID_ALWAYS_ON }; typedef u32 kbase_pm_policy_flags; +#define 
KBASE_PM_POLICY_FLAG_DISABLED_WITH_POWER_DOWN_ONLY (1u) + /** * struct kbase_pm_policy - Power policy structure. * @@ -377,13 +442,8 @@ struct kbase_pm_policy { /** * Function called to find out if shader cores are needed * - * This needs to at least satisfy kbdev->shader_needed_cnt, and so must - * never return false when kbdev->shader_needed_cnt > 0. - * - * Note that kbdev->pm.active_count being 0 is not a good indicator - * that kbdev->shader_needed_cnt is also 0 - refer to the documentation - * on the active_count member in struct kbase_pm_device_data and - * kbase_pm_is_active(). + * This needs to at least satisfy kbdev->pm.backend.shaders_desired, + * and so must never return false when shaders_desired is true. * * @kbdev: The kbase device structure for the device (must be a * valid pointer) diff --git a/mali_kbase/backend/gpu/mali_kbase_pm_demand.c b/mali_kbase/backend/gpu/mali_kbase_pm_demand.c deleted file mode 100644 index 01727d6..0000000 --- a/mali_kbase/backend/gpu/mali_kbase_pm_demand.c +++ /dev/null @@ -1,68 +0,0 @@ -/* - * - * (C) COPYRIGHT 2010-2018 ARM Limited. All rights reserved. - * - * This program is free software and is provided to you under the terms of the - * GNU General Public License version 2 as published by the Free Software - * Foundation, and any use by you of this program is subject to the terms - * of such GNU licence. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * - * SPDX-License-Identifier: GPL-2.0 - * - */ - - - -/* - * A simple demand based power management policy - */ - -#include <mali_kbase.h> -#include <mali_kbase_pm.h> - -static bool demand_shaders_needed(struct kbase_device *kbdev) -{ - return (kbdev->shader_needed_cnt > 0); -} - -static bool demand_get_core_active(struct kbase_device *kbdev) -{ - return kbase_pm_is_active(kbdev); -} - -static void demand_init(struct kbase_device *kbdev) -{ - CSTD_UNUSED(kbdev); -} - -static void demand_term(struct kbase_device *kbdev) -{ - CSTD_UNUSED(kbdev); -} - -/* - * The struct kbase_pm_policy structure for the demand power policy. - * - * This is the static structure that defines the demand power policy's callback - * and name. - */ -const struct kbase_pm_policy kbase_pm_demand_policy_ops = { - "demand", /* name */ - demand_init, /* init */ - demand_term, /* term */ - demand_shaders_needed, /* shaders_needed */ - demand_get_core_active, /* get_core_active */ - 0u, /* flags */ - KBASE_PM_POLICY_ID_DEMAND, /* id */ -}; - -KBASE_EXPORT_TEST_API(kbase_pm_demand_policy_ops); diff --git a/mali_kbase/backend/gpu/mali_kbase_pm_demand.h b/mali_kbase/backend/gpu/mali_kbase_pm_demand.h deleted file mode 100644 index 4b05e6d..0000000 --- a/mali_kbase/backend/gpu/mali_kbase_pm_demand.h +++ /dev/null @@ -1,69 +0,0 @@ -/* - * - * (C) COPYRIGHT 2011-2015,2018 ARM Limited. All rights reserved. - * - * This program is free software and is provided to you under the terms of the - * GNU General Public License version 2 as published by the Free Software - * Foundation, and any use by you of this program is subject to the terms - * of such GNU licence. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * - * SPDX-License-Identifier: GPL-2.0 - * - */ - - - -/* - * A simple demand based power management policy - */ - -#ifndef MALI_KBASE_PM_DEMAND_H -#define MALI_KBASE_PM_DEMAND_H - -/** - * DOC: Demand power management policy - * - * The demand power management policy has the following characteristics: - * - When KBase indicates that the GPU will be powered up, but we don't yet - * know which Job Chains are to be run: - * - The Shader Cores are not powered up - * - * - When KBase indicates that Shader Cores are needed to submit the currently - * queued Job Chains: - * - Shader Cores are powered up - * - * - When KBase indicates that the GPU need not be powered: - * - The Shader Cores are powered off, and the GPU itself is powered off too. - * - * Note: - * - KBase indicates the GPU will be powered up when it has a User Process that - * has just started to submit Job Chains. - * - * - KBase indicates the GPU need not be powered when all the Job Chains from - * User Processes have finished, and it is waiting for a User Process to - * submit some more Job Chains. - */ - -/** - * struct kbasep_pm_policy_demand - Private structure for policy instance data - * - * @dummy: No state is needed, a dummy variable - * - * This contains data that is private to the demand power policy. - */ -struct kbasep_pm_policy_demand { - int dummy; -}; - -extern const struct kbase_pm_policy kbase_pm_demand_policy_ops; - -#endif /* MALI_KBASE_PM_DEMAND_H */ diff --git a/mali_kbase/backend/gpu/mali_kbase_pm_driver.c b/mali_kbase/backend/gpu/mali_kbase_pm_driver.c index cdd5cf7..2e6599a 100644 --- a/mali_kbase/backend/gpu/mali_kbase_pm_driver.c +++ b/mali_kbase/backend/gpu/mali_kbase_pm_driver.c @@ -29,15 +29,14 @@ #include <mali_kbase.h> #include <mali_kbase_config_defaults.h> #include <mali_midg_regmap.h> -#if defined(CONFIG_MALI_GATOR_SUPPORT) #include <mali_kbase_gator.h> -#endif #include <mali_kbase_tlstream.h> #include <mali_kbase_pm.h> #include <mali_kbase_config_defaults.h> #include <mali_kbase_smc.h> #include <mali_kbase_hwaccess_jm.h> #include <mali_kbase_ctx_sched.h> +#include <mali_kbase_hwcnt_context.h> #include <backend/gpu/mali_kbase_cache_policy_backend.h> #include <backend/gpu/mali_kbase_device_internal.h> #include <backend/gpu/mali_kbase_irq_internal.h> @@ -45,11 +44,23 @@ #include <linux/of.h> -#if MALI_MOCK_TEST -#define MOCKABLE(function) function##_original +#ifdef CONFIG_MALI_CORESTACK +bool corestack_driver_control = true; #else -#define MOCKABLE(function) function -#endif /* MALI_MOCK_TEST */ +bool corestack_driver_control; /* Default value of 0/false */ +#endif +module_param(corestack_driver_control, bool, 0000); +MODULE_PARM_DESC(corestack_driver_control, + "Let the driver power on/off the GPU core stack independently " + "without involving the Power Domain Controller. 
This should " + "only be enabled on platforms for which integration of the PDC " + "to the Mali GPU is known to be problematic."); +KBASE_EXPORT_TEST_API(corestack_driver_control); + +bool platform_power_down_only = PLATFORM_POWER_DOWN_ONLY; +module_param(platform_power_down_only, bool, 0000); +MODULE_PARM_DESC(platform_power_down_only, + "Disable power down of individual cores."); /** * enum kbasep_pm_action - Actions that can be performed on a core. @@ -79,6 +90,47 @@ static u64 kbase_pm_get_state( enum kbase_pm_core_type core_type, enum kbasep_pm_action action); +static bool kbase_pm_is_l2_desired(struct kbase_device *kbdev) +{ + if (kbdev->pm.backend.protected_transition_override && + kbdev->pm.backend.protected_l2_override) + return true; + + if (kbdev->pm.backend.protected_transition_override && + !kbdev->pm.backend.shaders_desired) + return false; + + return kbdev->pm.backend.l2_desired; +} + +void kbase_pm_protected_override_enable(struct kbase_device *kbdev) +{ + lockdep_assert_held(&kbdev->hwaccess_lock); + + kbdev->pm.backend.protected_transition_override = true; +} +void kbase_pm_protected_override_disable(struct kbase_device *kbdev) +{ + lockdep_assert_held(&kbdev->hwaccess_lock); + + kbdev->pm.backend.protected_transition_override = false; +} + +void kbase_pm_protected_l2_override(struct kbase_device *kbdev, bool override) +{ + lockdep_assert_held(&kbdev->hwaccess_lock); + + if (override) { + kbdev->pm.backend.protected_l2_override++; + WARN_ON(kbdev->pm.backend.protected_l2_override <= 0); + } else { + kbdev->pm.backend.protected_l2_override--; + WARN_ON(kbdev->pm.backend.protected_l2_override < 0); + } + + kbase_pm_update_state(kbdev); +} + /** * core_type_to_reg - Decode a core type and action to a register. * @@ -96,24 +148,24 @@ static u64 kbase_pm_get_state( static u32 core_type_to_reg(enum kbase_pm_core_type core_type, enum kbasep_pm_action action) { -#ifdef CONFIG_MALI_CORESTACK - if (core_type == KBASE_PM_CORE_STACK) { - switch (action) { - case ACTION_PRESENT: - return STACK_PRESENT_LO; - case ACTION_READY: - return STACK_READY_LO; - case ACTION_PWRON: - return STACK_PWRON_LO; - case ACTION_PWROFF: - return STACK_PWROFF_LO; - case ACTION_PWRTRANS: - return STACK_PWRTRANS_LO; - default: - BUG(); + if (corestack_driver_control) { + if (core_type == KBASE_PM_CORE_STACK) { + switch (action) { + case ACTION_PRESENT: + return STACK_PRESENT_LO; + case ACTION_READY: + return STACK_READY_LO; + case ACTION_PWRON: + return STACK_PWRON_LO; + case ACTION_PWROFF: + return STACK_PWROFF_LO; + case ACTION_PWRTRANS: + return STACK_PWRTRANS_LO; + default: + WARN(1, "Invalid action for core type\n"); + } } } -#endif /* CONFIG_MALI_CORESTACK */ return (u32)core_type + (u32)action; } @@ -170,6 +222,12 @@ static void kbase_pm_invoke(struct kbase_device *kbdev, u32 lo = cores & 0xFFFFFFFF; u32 hi = (cores >> 32) & 0xFFFFFFFF; + /* When 'platform_power_down_only' is enabled, no core type should be + * turned off individually. 
+ */ + KBASE_DEBUG_ASSERT(!(action == ACTION_PWROFF && + platform_power_down_only)); + lockdep_assert_held(&kbdev->hwaccess_lock); reg = core_type_to_reg(core_type, action); @@ -272,16 +330,6 @@ static u64 kbase_pm_get_state(struct kbase_device *kbdev, return (((u64) hi) << 32) | ((u64) lo); } -void kbasep_pm_init_core_use_bitmaps(struct kbase_device *kbdev) -{ - kbdev->shader_available_bitmap = 0; - kbdev->tiler_available_bitmap = 0; - kbdev->l2_users_count = 0; - kbdev->l2_available_bitmap = 0; - kbdev->tiler_needed_cnt = 0; - kbdev->shader_needed_cnt = 0; -} - /** * kbase_pm_get_present_cores - Get the cores that are present * @@ -385,525 +433,776 @@ u64 kbase_pm_get_ready_cores(struct kbase_device *kbdev, KBASE_EXPORT_TEST_API(kbase_pm_get_ready_cores); -/** - * kbase_pm_transition_core_type - Perform power transitions for a particular - * core type. - * - * This function will perform any available power transitions to make the actual - * hardware state closer to the desired state. If a core is currently - * transitioning then changes to the power state of that call cannot be made - * until the transition has finished. Cores which are not present in the - * hardware are ignored if they are specified in the desired_state bitmask, - * however the return value will always be 0 in this case. - * - * @kbdev: The kbase device - * @type: The core type to perform transitions for - * @desired_state: A bit mask of the desired state of the cores - * @in_use: A bit mask of the cores that are currently running - * jobs. These cores have to be kept powered up because - * there are jobs running (or about to run) on them. - * @available: Receives a bit mask of the cores that the job - * scheduler can use to submit jobs to. May be NULL if - * this is not needed. - * @powering_on: Bit mask to update with cores that are - * transitioning to a power-on state. - * - * Return: true if the desired state has been reached, false otherwise - */ -static bool kbase_pm_transition_core_type(struct kbase_device *kbdev, - enum kbase_pm_core_type type, - u64 desired_state, - u64 in_use, - u64 * const available, - u64 *powering_on) +static u64 kbase_pm_l2_update_state(struct kbase_device *kbdev) { - u64 present; - u64 ready; - u64 trans; - u64 powerup; - u64 powerdown; - u64 powering_on_trans; - u64 desired_state_in_use; + struct kbase_pm_backend_data *backend = &kbdev->pm.backend; + u64 l2_present = kbdev->gpu_props.props.raw_props.l2_present; + u64 tiler_present = kbdev->gpu_props.props.raw_props.tiler_present; + enum kbase_l2_core_state prev_state; lockdep_assert_held(&kbdev->hwaccess_lock); - /* Get current state */ - present = kbase_pm_get_present_cores(kbdev, type); - trans = kbase_pm_get_trans_cores(kbdev, type); - ready = kbase_pm_get_ready_cores(kbdev, type); - - /* mask off ready from trans in case transitions finished between the - * register reads */ - trans &= ~ready; + do { + /* Get current state */ + u64 l2_trans = kbase_pm_get_trans_cores(kbdev, + KBASE_PM_CORE_L2); + u64 l2_ready = kbase_pm_get_ready_cores(kbdev, + KBASE_PM_CORE_L2); + u64 tiler_trans = kbase_pm_get_trans_cores(kbdev, + KBASE_PM_CORE_TILER); + u64 tiler_ready = kbase_pm_get_ready_cores(kbdev, + KBASE_PM_CORE_TILER); + + /* mask off ready from trans in case transitions finished + * between the register reads + */ + l2_trans &= ~l2_ready; + tiler_trans &= ~tiler_ready; + + prev_state = backend->l2_state; + + switch (backend->l2_state) { + case KBASE_L2_OFF: + if (kbase_pm_is_l2_desired(kbdev)) { + /* L2 is required, power on. 
Powering on the + * tiler will also power the first L2 cache. + */ + kbase_pm_invoke(kbdev, KBASE_PM_CORE_TILER, + tiler_present, ACTION_PWRON); + + /* If we have more than one L2 cache then we + * must power them on explicitly. + */ + if (l2_present != 1) + kbase_pm_invoke(kbdev, KBASE_PM_CORE_L2, + l2_present & ~1, + ACTION_PWRON); + backend->l2_state = KBASE_L2_PEND_ON; + } + break; - powering_on_trans = trans & *powering_on; + case KBASE_L2_PEND_ON: + if (!l2_trans && l2_ready == l2_present && !tiler_trans + && tiler_ready == tiler_present) { + KBASE_TRACE_ADD(kbdev, + PM_CORES_CHANGE_AVAILABLE_TILER, + NULL, NULL, 0u, + (u32)tiler_ready); + /* + * Ensure snoops are enabled after L2 is powered + * up. Note that kbase keeps track of the snoop + * state, so safe to repeatedly call. + */ + kbase_pm_cache_snoop_enable(kbdev); + + /* With the L2 enabled, we can now enable + * hardware counters. + */ + backend->l2_state = KBASE_L2_ON_HWCNT_ENABLE; + + /* Now that the L2 is on, the shaders can start + * powering on if they're required. The obvious + * way to do this would be to call + * kbase_pm_shaders_update_state() here. + * However, that would make the two state + * machines mutually recursive, as the opposite + * would be needed for powering down. Instead, + * callers of this function should use the + * kbase_pm_update_state() wrapper, which will + * call the shader state machine immediately + * after the L2 (for power up), or + * automatically re-invoke the L2 state machine + * when the shaders power down. + */ + } + break; + + case KBASE_L2_ON_HWCNT_ENABLE: + backend->hwcnt_desired = true; + if (backend->hwcnt_disabled) { + kbase_hwcnt_context_enable( + kbdev->hwcnt_gpu_ctx); + backend->hwcnt_disabled = false; + } + backend->l2_state = KBASE_L2_ON; + break; + + case KBASE_L2_ON: + if (!kbase_pm_is_l2_desired(kbdev)) { + /* Do not power off L2 until the shaders and + * core stacks are off. + */ + if (backend->shaders_state != KBASE_SHADERS_OFF_CORESTACK_OFF) + break; + + /* We need to make sure hardware counters are + * disabled before powering down the L2, to + * prevent loss of data. + * + * We waited until after the cores were powered + * down to prevent ping-ponging between hwcnt + * enabled and disabled, which would have + * happened if userspace submitted more work + * while we were trying to power down. + */ + backend->l2_state = KBASE_L2_ON_HWCNT_DISABLE; + } + break; + + case KBASE_L2_ON_HWCNT_DISABLE: + /* If the L2 became desired while we were waiting on the + * worker to do the actual hwcnt disable (which might + * happen if some work was submitted immediately after + * the shaders powered off), then we need to early-out + * of this state and re-enable hwcnt. + * + * If we get lucky, the hwcnt disable might not have + * actually started yet, and the logic in the hwcnt + * enable state will prevent the worker from + * performing the disable entirely, preventing loss of + * any hardware counter data. + * + * If the hwcnt disable has started, then we'll lose + * a tiny amount of hardware counter data between the + * disable and the re-enable occurring. + * + * This loss of data is preferable to the alternative, + * which is to block the shader cores from doing any + * work until we're sure hwcnt has been re-enabled. 
+ */ + if (kbase_pm_is_l2_desired(kbdev)) { + backend->l2_state = KBASE_L2_ON_HWCNT_ENABLE; + break; + } - if (available != NULL) - *available = (ready | powering_on_trans) & desired_state; + /* See if we can get away with disabling hwcnt + * atomically, otherwise kick off a worker. + */ + backend->hwcnt_desired = false; + if (!backend->hwcnt_disabled) { + if (kbase_hwcnt_context_disable_atomic( + kbdev->hwcnt_gpu_ctx)) + backend->hwcnt_disabled = true; + else +#if KERNEL_VERSION(3, 16, 0) > LINUX_VERSION_CODE + queue_work(system_wq, + &backend->hwcnt_disable_work); +#else + queue_work(system_highpri_wq, + &backend->hwcnt_disable_work); +#endif + } - if (trans) /* Do not progress if any cores are transitioning */ - return false; + if (backend->hwcnt_disabled) + backend->l2_state = KBASE_L2_POWER_DOWN; + break; + + case KBASE_L2_POWER_DOWN: + if (!platform_power_down_only) + /* Powering off the L2 will also power off the + * tiler. + */ + kbase_pm_invoke(kbdev, KBASE_PM_CORE_L2, + l2_present, + ACTION_PWROFF); + else + /* If L2 cache is powered then we must flush it + * before we power off the GPU. Normally this + * would have been handled when the L2 was + * powered off. + */ + kbase_gpu_start_cache_clean_nolock( + kbdev); - *powering_on = powering_on_trans; + KBASE_TRACE_ADD(kbdev, PM_CORES_CHANGE_AVAILABLE_TILER, + NULL, NULL, 0u, 0u); + + backend->l2_state = KBASE_L2_PEND_OFF; + break; + + case KBASE_L2_PEND_OFF: + if (!platform_power_down_only) { + /* We only need to check the L2 here - if the L2 + * is off then the tiler is definitely also off. + */ + if (!l2_trans && !l2_ready) + /* L2 is now powered off */ + backend->l2_state = KBASE_L2_OFF; + } else { + if (!kbdev->cache_clean_in_progress) + backend->l2_state = KBASE_L2_OFF; + } + break; - /* Update desired state to include the in-use cores. These have to be - * kept powered up because there are jobs running or about to run on - * these cores - */ - desired_state_in_use = desired_state | in_use; - - /* Update state of whether l2 caches are powered */ - if (type == KBASE_PM_CORE_L2) { - if ((ready == present) && (desired_state_in_use == ready) && - (trans == 0)) { - /* All are ready, none will be turned off, and none are - * transitioning */ - kbdev->pm.backend.l2_powered = 1; - /* - * Ensure snoops are enabled after L2 is powered up, - * note that kbase keeps track of the snoop state, so - * safe to repeatedly call. 
- */ - kbase_pm_cache_snoop_enable(kbdev); - if (kbdev->l2_users_count > 0) { - /* Notify any registered l2 cache users - * (optimized out when no users waiting) */ - wake_up(&kbdev->pm.backend.l2_powered_wait); + case KBASE_L2_RESET_WAIT: + if (!backend->in_reset) { + /* Reset complete */ + backend->l2_state = KBASE_L2_OFF; } - } else - kbdev->pm.backend.l2_powered = 0; - } + break; - if (desired_state == ready && (trans == 0)) - return true; + default: + WARN(1, "Invalid state in l2_state: %d", + backend->l2_state); + } + } while (backend->l2_state != prev_state); - /* Restrict the cores to those that are actually present */ - powerup = desired_state_in_use & present; - powerdown = (~desired_state_in_use) & present; - - /* Restrict to cores that are not already in the desired state */ - powerup &= ~ready; - powerdown &= ready; - - /* Don't transition any cores that are already transitioning, except for - * Mali cores that support the following case: - * - * If the SHADER_PWRON or TILER_PWRON registers are written to turn on - * a core that is currently transitioning to power off, then this is - * remembered and the shader core is automatically powered up again once - * the original transition completes. Once the automatic power on is - * complete any job scheduled on the shader core should start. - */ - powerdown &= ~trans; + if (kbdev->pm.backend.invoke_poweroff_wait_wq_when_l2_off && + backend->l2_state == KBASE_L2_OFF) { + kbdev->pm.backend.invoke_poweroff_wait_wq_when_l2_off = false; + queue_work(kbdev->pm.backend.gpu_poweroff_wait_wq, + &kbdev->pm.backend.gpu_poweroff_wait_work); + } + + if (backend->l2_state == KBASE_L2_ON) + return l2_present; + return 0; +} - if (kbase_hw_has_feature(kbdev, - BASE_HW_FEATURE_PWRON_DURING_PWROFF_TRANS)) - if (KBASE_PM_CORE_SHADER == type || KBASE_PM_CORE_TILER == type) - trans = powering_on_trans; /* for exception cases, only - * mask off cores in power on - * transitions */ +static void shader_poweroff_timer_stop_callback(struct work_struct *data) +{ + unsigned long flags; + struct kbasep_pm_tick_timer_state *stt = container_of(data, + struct kbasep_pm_tick_timer_state, work); + struct kbase_device *kbdev = container_of(stt, struct kbase_device, + pm.backend.shader_tick_timer); - powerup &= ~trans; + hrtimer_cancel(&stt->timer); - /* Perform transitions if any */ - kbase_pm_invoke(kbdev, type, powerup, ACTION_PWRON); -#if !PLATFORM_POWER_DOWN_ONLY - kbase_pm_invoke(kbdev, type, powerdown, ACTION_PWROFF); -#endif + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); - /* Recalculate cores transitioning on, and re-evaluate our state */ - powering_on_trans |= powerup; - *powering_on = powering_on_trans; - if (available != NULL) - *available = (ready | powering_on_trans) & desired_state; + stt->cancel_queued = false; + if (kbdev->pm.backend.gpu_powered) + kbase_pm_update_state(kbdev); - return false; + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); } -KBASE_EXPORT_TEST_API(kbase_pm_transition_core_type); - /** - * get_desired_cache_status - Determine which caches should be on for a - * particular core state + * shader_poweroff_timer_queue_cancel - cancel the shader poweroff tick timer + * @kbdev: pointer to kbase device * - * This function takes a bit mask of the present caches and the cores (or - * caches) that are attached to the caches that will be powered. It then - * computes which caches should be turned on to allow the cores requested to be - * powered up. 
+ * Synchronization between the shader state machine and the timer thread is + * difficult. This is because situations may arise where the state machine + * wants to start the timer, but the callback is already running, and has + * already passed the point at which it checks whether it is required, and so + * cancels itself, even though the state machine may have just tried to call + * hrtimer_start. * - * @present: The bit mask of present caches - * @cores_powered: A bit mask of cores (or L2 caches) that are desired to - * be powered - * @tilers_powered: The bit mask of tilers that are desired to be powered + * This cannot be stopped by holding hwaccess_lock in the timer thread, + * because there are still infinitesimally small sections at the start and end + * of the callback where the lock is not held. * - * Return: A bit mask of the caches that should be turned on + * Instead, a new state is added to the shader state machine, + * KBASE_SHADERS_OFF_CORESTACK_OFF_TIMER_PEND_OFF. This is used to guarantee + * that when the shaders are switched off, the timer has definitely been + * cancelled. As a result, when KBASE_SHADERS_ON_CORESTACK_ON is left and the + * timer is started, it is guaranteed that either the timer is already running + * (from an availability change or cancelled timer), or hrtimer_start will + * succeed. It is critical to avoid ending up in + * KBASE_SHADERS_WAIT_OFF_CORESTACK_ON without the timer running, or it could + * hang there forever. */ -static u64 get_desired_cache_status(u64 present, u64 cores_powered, - u64 tilers_powered) +static void shader_poweroff_timer_queue_cancel(struct kbase_device *kbdev) { - u64 desired = 0; + struct kbasep_pm_tick_timer_state *stt = + &kbdev->pm.backend.shader_tick_timer; - while (present) { - /* Find out which is the highest set bit */ - u64 bit = fls64(present) - 1; - u64 bit_mask = 1ull << bit; - /* Create a mask which has all bits from 'bit' upwards set */ + lockdep_assert_held(&kbdev->hwaccess_lock); - u64 mask = ~(bit_mask - 1); + stt->needed = false; - /* If there are any cores powered at this bit or above (that - * haven't previously been processed) then we need this core on - */ - if (cores_powered & mask) - desired |= bit_mask; - - /* Remove bits from cores_powered and present */ - cores_powered &= ~mask; - present &= ~bit_mask; + if (hrtimer_active(&stt->timer) && !stt->cancel_queued) { + stt->cancel_queued = true; + queue_work(stt->wq, &stt->work); } +} - /* Power up the required L2(s) for the tiler */ - if (tilers_powered) - desired |= 1; +static void kbase_pm_shaders_update_state(struct kbase_device *kbdev) +{ + struct kbase_pm_backend_data *backend = &kbdev->pm.backend; + struct kbasep_pm_tick_timer_state *stt = + &kbdev->pm.backend.shader_tick_timer; + enum kbase_shader_core_state prev_state; + u64 stacks_avail = 0; - return desired; -} + lockdep_assert_held(&kbdev->hwaccess_lock); -KBASE_EXPORT_TEST_API(get_desired_cache_status); + if (corestack_driver_control) + /* Always power on all the corestacks. Disabling certain + * corestacks when their respective shaders are not in the + * available bitmap is not currently supported. 
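The hysteresis design described above depends on an hrtimer callback that is not shown in this extract. The following is an assumed sketch of the shape implied by the kbasep_pm_tick_timer_state fields (remaining_ticks, needed, configured_interval), not the driver's actual callback:

/* Hypothetical tick callback: each expiry consumes one remaining tick and
 * re-runs the PM state machine; the timer restarts itself while 'needed'
 * is still set. The real callback lives elsewhere in this file. */
static enum hrtimer_restart example_shader_tick_callback(struct hrtimer *timer)
{
	struct kbasep_pm_tick_timer_state *stt = container_of(timer,
			struct kbasep_pm_tick_timer_state, timer);
	struct kbase_device *kbdev = container_of(stt, struct kbase_device,
			pm.backend.shader_tick_timer);
	unsigned long flags;
	bool restart;

	spin_lock_irqsave(&kbdev->hwaccess_lock, flags);
	if (stt->remaining_ticks)
		stt->remaining_ticks--;
	kbase_pm_update_state(kbdev);
	restart = stt->needed;
	spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags);

	if (!restart)
		return HRTIMER_NORESTART;

	hrtimer_forward_now(timer, stt->configured_interval);
	return HRTIMER_RESTART;
}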
+ */ + stacks_avail = kbase_pm_get_present_cores(kbdev, KBASE_PM_CORE_STACK); -#ifdef CONFIG_MALI_CORESTACK -u64 kbase_pm_core_stack_mask(u64 cores) -{ - u64 stack_mask = 0; - size_t const MAX_CORE_ID = 31; - size_t const NUM_CORES_PER_STACK = 4; - size_t i; - - for (i = 0; i <= MAX_CORE_ID; ++i) { - if (test_bit(i, (unsigned long *)&cores)) { - /* Every core which ID >= 16 is filled to stacks 4-7 - * instead of 0-3 */ - size_t const stack_num = (i >= 16) ? - (i % NUM_CORES_PER_STACK) + 4 : - (i % NUM_CORES_PER_STACK); - set_bit(stack_num, (unsigned long *)&stack_mask); + do { + u64 shaders_trans = kbase_pm_get_trans_cores(kbdev, KBASE_PM_CORE_SHADER); + u64 shaders_ready = kbase_pm_get_ready_cores(kbdev, KBASE_PM_CORE_SHADER); + u64 stacks_trans = 0; + u64 stacks_ready = 0; + + if (corestack_driver_control) { + stacks_trans = kbase_pm_get_trans_cores(kbdev, KBASE_PM_CORE_STACK); + stacks_ready = kbase_pm_get_ready_cores(kbdev, KBASE_PM_CORE_STACK); } - } - return stack_mask; -} -#endif /* CONFIG_MALI_CORESTACK */ + /* mask off ready from trans in case transitions finished + * between the register reads + */ + shaders_trans &= ~shaders_ready; + stacks_trans &= ~stacks_ready; -bool -MOCKABLE(kbase_pm_check_transitions_nolock) (struct kbase_device *kbdev) -{ - bool cores_are_available = false; - bool in_desired_state = true; - u64 desired_l2_state; -#ifdef CONFIG_MALI_CORESTACK - u64 desired_stack_state; - u64 stacks_powered; -#endif /* CONFIG_MALI_CORESTACK */ - u64 cores_powered; - u64 tilers_powered; - u64 tiler_available_bitmap; - u64 tiler_transitioning_bitmap; - u64 shader_available_bitmap; - u64 shader_ready_bitmap; - u64 shader_transitioning_bitmap; - u64 l2_available_bitmap; - u64 prev_l2_available_bitmap; - u64 l2_inuse_bitmap; + prev_state = backend->shaders_state; - KBASE_DEBUG_ASSERT(NULL != kbdev); - lockdep_assert_held(&kbdev->hwaccess_lock); + switch (backend->shaders_state) { + case KBASE_SHADERS_OFF_CORESTACK_OFF: + /* Ignore changes to the shader core availability + * except at certain points where we can handle it, + * i.e. off and SHADERS_ON_CORESTACK_ON. 
+ */ + backend->shaders_avail = kbase_pm_ca_get_core_mask(kbdev); - spin_lock(&kbdev->pm.backend.gpu_powered_lock); - if (kbdev->pm.backend.gpu_powered == false) { - spin_unlock(&kbdev->pm.backend.gpu_powered_lock); - if (kbdev->pm.backend.desired_shader_state == 0 && - kbdev->pm.backend.desired_tiler_state == 0) - return true; - return false; - } + if (backend->shaders_desired && backend->l2_state == KBASE_L2_ON) { + if (corestack_driver_control) + kbase_pm_invoke(kbdev, KBASE_PM_CORE_STACK, + stacks_avail, ACTION_PWRON); - /* If any cores are already powered then, we must keep the caches on */ - shader_transitioning_bitmap = kbase_pm_get_trans_cores(kbdev, - KBASE_PM_CORE_SHADER); - cores_powered = kbase_pm_get_ready_cores(kbdev, KBASE_PM_CORE_SHADER); - cores_powered |= kbdev->pm.backend.desired_shader_state; + backend->shaders_state = KBASE_SHADERS_OFF_CORESTACK_PEND_ON; + } + break; -#ifdef CONFIG_MALI_CORESTACK - /* Work out which core stacks want to be powered */ - desired_stack_state = kbase_pm_core_stack_mask(cores_powered); - stacks_powered = kbase_pm_get_ready_cores(kbdev, KBASE_PM_CORE_STACK) | - desired_stack_state; -#endif /* CONFIG_MALI_CORESTACK */ - - /* Work out which tilers want to be powered */ - tiler_transitioning_bitmap = kbase_pm_get_trans_cores(kbdev, - KBASE_PM_CORE_TILER); - tilers_powered = kbase_pm_get_ready_cores(kbdev, KBASE_PM_CORE_TILER); - tilers_powered |= kbdev->pm.backend.desired_tiler_state; - - /* If there are l2 cache users registered, keep all l2s powered even if - * all other cores are off. */ - if (kbdev->l2_users_count > 0) - cores_powered |= kbdev->gpu_props.props.raw_props.l2_present; - - desired_l2_state = get_desired_cache_status( - kbdev->gpu_props.props.raw_props.l2_present, - cores_powered, tilers_powered); - - l2_inuse_bitmap = get_desired_cache_status( - kbdev->gpu_props.props.raw_props.l2_present, - cores_powered | shader_transitioning_bitmap, - tilers_powered | tiler_transitioning_bitmap); + case KBASE_SHADERS_OFF_CORESTACK_PEND_ON: + if (!stacks_trans && stacks_ready == stacks_avail) { + kbase_pm_invoke(kbdev, KBASE_PM_CORE_SHADER, + backend->shaders_avail, ACTION_PWRON); -#ifdef CONFIG_MALI_CORESTACK - if (stacks_powered) - desired_l2_state |= 1; -#endif /* CONFIG_MALI_CORESTACK */ + backend->shaders_state = KBASE_SHADERS_PEND_ON_CORESTACK_ON; - /* If any l2 cache is on, then enable l2 #0, for use by job manager */ - if (0 != desired_l2_state) - desired_l2_state |= 1; + } + break; + + case KBASE_SHADERS_PEND_ON_CORESTACK_ON: + if (!shaders_trans && shaders_ready == backend->shaders_avail) { + KBASE_TRACE_ADD(kbdev, + PM_CORES_CHANGE_AVAILABLE, + NULL, NULL, 0u, (u32)shaders_ready); + backend->shaders_state = KBASE_SHADERS_ON_CORESTACK_ON; + } + break; + + case KBASE_SHADERS_ON_CORESTACK_ON: + backend->shaders_avail = kbase_pm_ca_get_core_mask(kbdev); + + if (!backend->shaders_desired) { + if (kbdev->pm.backend.protected_transition_override || + !stt->configured_ticks || + WARN_ON(stt->cancel_queued)) { + backend->shaders_state = KBASE_SHADERS_WAIT_FINISHED_CORESTACK_ON; + } else { + stt->remaining_ticks = stt->configured_ticks; + stt->needed = true; + + /* The shader hysteresis timer is not + * done the obvious way, which would be + * to start an hrtimer when the shader + * power off is requested. Instead, + * use a 'tick' timer, and set the + * remaining number of ticks on a power + * off request. 
This avoids the + * latency of starting, then + * immediately cancelling an hrtimer + * when the shaders are re-requested + * before the timeout expires. + */ + if (!hrtimer_active(&stt->timer)) + hrtimer_start(&stt->timer, + stt->configured_interval, + HRTIMER_MODE_REL); + + backend->shaders_state = KBASE_SHADERS_WAIT_OFF_CORESTACK_ON; + } + } else if (!platform_power_down_only) { + if (backend->shaders_avail & ~shaders_ready) { + backend->shaders_avail |= shaders_ready; + + kbase_pm_invoke(kbdev, KBASE_PM_CORE_SHADER, + backend->shaders_avail & ~shaders_ready, + ACTION_PWRON); + backend->shaders_state = KBASE_SHADERS_PEND_ON_CORESTACK_ON; + + } + } + break; - prev_l2_available_bitmap = kbdev->l2_available_bitmap; - in_desired_state &= kbase_pm_transition_core_type(kbdev, - KBASE_PM_CORE_L2, desired_l2_state, l2_inuse_bitmap, - &l2_available_bitmap, - &kbdev->pm.backend.powering_on_l2_state); + case KBASE_SHADERS_WAIT_OFF_CORESTACK_ON: + if (WARN_ON(!hrtimer_active(&stt->timer))) { + stt->remaining_ticks = 0; + backend->shaders_state = KBASE_SHADERS_WAIT_FINISHED_CORESTACK_ON; + } - kbdev->l2_available_bitmap = l2_available_bitmap; + if (backend->shaders_desired) { + stt->remaining_ticks = 0; + backend->shaders_state = KBASE_SHADERS_ON_CORESTACK_ON; + } else if (stt->remaining_ticks == 0) { + backend->shaders_state = KBASE_SHADERS_WAIT_FINISHED_CORESTACK_ON; + } + break; + case KBASE_SHADERS_WAIT_FINISHED_CORESTACK_ON: + shader_poweroff_timer_queue_cancel(kbdev); -#ifdef CONFIG_MALI_CORESTACK - if (in_desired_state) { - in_desired_state &= kbase_pm_transition_core_type(kbdev, - KBASE_PM_CORE_STACK, desired_stack_state, 0, - &kbdev->stack_available_bitmap, - &kbdev->pm.backend.powering_on_stack_state); - } -#endif /* CONFIG_MALI_CORESTACK */ - - if (in_desired_state) { - in_desired_state &= kbase_pm_transition_core_type(kbdev, - KBASE_PM_CORE_TILER, - kbdev->pm.backend.desired_tiler_state, - 0, &tiler_available_bitmap, - &kbdev->pm.backend.powering_on_tiler_state); - in_desired_state &= kbase_pm_transition_core_type(kbdev, - KBASE_PM_CORE_SHADER, - kbdev->pm.backend.desired_shader_state, - 0, &shader_available_bitmap, - &kbdev->pm.backend.powering_on_shader_state); - - if (kbdev->shader_available_bitmap != shader_available_bitmap) - KBASE_TRACE_ADD(kbdev, PM_CORES_CHANGE_AVAILABLE, NULL, - NULL, 0u, - (u32) shader_available_bitmap); - - kbdev->shader_available_bitmap = shader_available_bitmap; - - if (kbdev->tiler_available_bitmap != tiler_available_bitmap) - KBASE_TRACE_ADD(kbdev, PM_CORES_CHANGE_AVAILABLE_TILER, - NULL, NULL, 0u, - (u32) tiler_available_bitmap); + if (!platform_power_down_only) + kbase_pm_invoke(kbdev, KBASE_PM_CORE_SHADER, + shaders_ready, ACTION_PWROFF); - kbdev->tiler_available_bitmap = tiler_available_bitmap; + KBASE_TRACE_ADD(kbdev, + PM_CORES_CHANGE_AVAILABLE, + NULL, NULL, 0u, 0u); - } else if ((l2_available_bitmap & - kbdev->gpu_props.props.raw_props.tiler_present) != - kbdev->gpu_props.props.raw_props.tiler_present) { - tiler_available_bitmap = 0; + backend->shaders_state = KBASE_SHADERS_PEND_OFF_CORESTACK_ON; + break; - kbdev->tiler_available_bitmap = tiler_available_bitmap; - } + case KBASE_SHADERS_PEND_OFF_CORESTACK_ON: + if ((!shaders_trans && !shaders_ready) || platform_power_down_only) { + if (corestack_driver_control && !platform_power_down_only) + kbase_pm_invoke(kbdev, KBASE_PM_CORE_STACK, + stacks_avail, ACTION_PWROFF); - /* State updated for slow-path waiters */ - kbdev->pm.backend.gpu_in_desired_state = in_desired_state; - - shader_ready_bitmap = 
kbase_pm_get_ready_cores(kbdev, - KBASE_PM_CORE_SHADER); - shader_transitioning_bitmap = kbase_pm_get_trans_cores(kbdev, - KBASE_PM_CORE_SHADER); - - /* Determine whether the cores are now available (even if the set of - * available cores is empty). Note that they can be available even if - * we've not finished transitioning to the desired state */ - if ((kbdev->shader_available_bitmap & - kbdev->pm.backend.desired_shader_state) - == kbdev->pm.backend.desired_shader_state && - (kbdev->tiler_available_bitmap & - kbdev->pm.backend.desired_tiler_state) - == kbdev->pm.backend.desired_tiler_state) { - cores_are_available = true; - - KBASE_TRACE_ADD(kbdev, PM_CORES_AVAILABLE, NULL, NULL, 0u, - (u32)(kbdev->shader_available_bitmap & - kbdev->pm.backend.desired_shader_state)); - KBASE_TRACE_ADD(kbdev, PM_CORES_AVAILABLE_TILER, NULL, NULL, 0u, - (u32)(kbdev->tiler_available_bitmap & - kbdev->pm.backend.desired_tiler_state)); - } + backend->shaders_state = KBASE_SHADERS_OFF_CORESTACK_PEND_OFF; + } + break; + + case KBASE_SHADERS_OFF_CORESTACK_PEND_OFF: + if ((!stacks_trans && !stacks_ready) || platform_power_down_only) + backend->shaders_state = KBASE_SHADERS_OFF_CORESTACK_OFF_TIMER_PEND_OFF; + break; + + case KBASE_SHADERS_OFF_CORESTACK_OFF_TIMER_PEND_OFF: + if (!hrtimer_active(&stt->timer) && !stt->cancel_queued) + backend->shaders_state = KBASE_SHADERS_OFF_CORESTACK_OFF; + break; + + case KBASE_SHADERS_RESET_WAIT: + /* Reset complete */ + if (!backend->in_reset) + backend->shaders_state = KBASE_SHADERS_OFF_CORESTACK_OFF_TIMER_PEND_OFF; + break; + } + } while (backend->shaders_state != prev_state); +} - if (in_desired_state) { - KBASE_DEBUG_ASSERT(cores_are_available); +static bool kbase_pm_is_in_desired_state_nolock(struct kbase_device *kbdev) +{ + bool in_desired_state = true; + + lockdep_assert_held(&kbdev->hwaccess_lock); + + if (kbase_pm_is_l2_desired(kbdev) && + kbdev->pm.backend.l2_state != KBASE_L2_ON) + in_desired_state = false; + else if (!kbase_pm_is_l2_desired(kbdev) && + kbdev->pm.backend.l2_state != KBASE_L2_OFF) + in_desired_state = false; + + if (kbdev->pm.backend.shaders_desired && + kbdev->pm.backend.shaders_state != KBASE_SHADERS_ON_CORESTACK_ON) + in_desired_state = false; + else if (!kbdev->pm.backend.shaders_desired && + kbdev->pm.backend.shaders_state != KBASE_SHADERS_OFF_CORESTACK_OFF) + in_desired_state = false; + + return in_desired_state; +} + +static bool kbase_pm_is_in_desired_state(struct kbase_device *kbdev) +{ + bool in_desired_state; + unsigned long flags; + + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); + in_desired_state = kbase_pm_is_in_desired_state_nolock(kbdev); + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + + return in_desired_state; +} + +static bool kbase_pm_is_in_desired_state_with_l2_powered( + struct kbase_device *kbdev) +{ + bool in_desired_state = false; + unsigned long flags; + + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); + if (kbase_pm_is_in_desired_state_nolock(kbdev) && + (kbdev->pm.backend.l2_state == KBASE_L2_ON)) + in_desired_state = true; + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + + return in_desired_state; +} + +static void kbase_pm_trace_power_state(struct kbase_device *kbdev) +{ + lockdep_assert_held(&kbdev->hwaccess_lock); #if defined(CONFIG_MALI_GATOR_SUPPORT) - kbase_trace_mali_pm_status(KBASE_PM_CORE_L2, - kbase_pm_get_ready_cores(kbdev, - KBASE_PM_CORE_L2)); - kbase_trace_mali_pm_status(KBASE_PM_CORE_SHADER, - kbase_pm_get_ready_cores(kbdev, - KBASE_PM_CORE_SHADER)); - 
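
The new kbase_pm_is_in_desired_state*() helpers above follow the usual *_nolock pairing: the _nolock variant assumes hwaccess_lock is already held so the state-machine code can call it directly, while the plain variant takes the lock itself and is suitable as a wait_event() predicate. A rough userspace illustration of that split, using a pthread mutex in place of the spinlock and purely illustrative names:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t hw_lock = PTHREAD_MUTEX_INITIALIZER;
static bool l2_on, l2_desired, shaders_on, shaders_desired;

/* Caller must hold hw_lock; cheap enough to call from the state machine. */
static bool in_desired_state_nolock(void)
{
        return l2_on == l2_desired && shaders_on == shaders_desired;
}

/* Locking wrapper, usable as a standalone query or wait predicate. */
static bool in_desired_state(void)
{
        bool ret;

        pthread_mutex_lock(&hw_lock);
        ret = in_desired_state_nolock();
        pthread_mutex_unlock(&hw_lock);
        return ret;
}

int main(void)
{
        l2_desired = shaders_desired = true;
        l2_on = shaders_on = true;
        printf("in desired state: %d\n", in_desired_state());
        return 0;
}
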
kbase_trace_mali_pm_status(KBASE_PM_CORE_TILER, - kbase_pm_get_ready_cores(kbdev, - KBASE_PM_CORE_TILER)); -#ifdef CONFIG_MALI_CORESTACK + kbase_trace_mali_pm_status(KBASE_PM_CORE_L2, + kbase_pm_get_ready_cores(kbdev, + KBASE_PM_CORE_L2)); + kbase_trace_mali_pm_status(KBASE_PM_CORE_SHADER, + kbase_pm_get_ready_cores(kbdev, + KBASE_PM_CORE_SHADER)); + kbase_trace_mali_pm_status(KBASE_PM_CORE_TILER, + kbase_pm_get_ready_cores(kbdev, + KBASE_PM_CORE_TILER)); + if (corestack_driver_control) kbase_trace_mali_pm_status(KBASE_PM_CORE_STACK, kbase_pm_get_ready_cores(kbdev, KBASE_PM_CORE_STACK)); -#endif /* CONFIG_MALI_CORESTACK */ #endif - KBASE_TLSTREAM_AUX_PM_STATE( - KBASE_PM_CORE_L2, - kbase_pm_get_ready_cores( - kbdev, KBASE_PM_CORE_L2)); - KBASE_TLSTREAM_AUX_PM_STATE( - KBASE_PM_CORE_SHADER, - kbase_pm_get_ready_cores( - kbdev, KBASE_PM_CORE_SHADER)); - KBASE_TLSTREAM_AUX_PM_STATE( - KBASE_PM_CORE_TILER, - kbase_pm_get_ready_cores( - kbdev, - KBASE_PM_CORE_TILER)); -#ifdef CONFIG_MALI_CORESTACK + KBASE_TLSTREAM_AUX_PM_STATE( + KBASE_PM_CORE_L2, + kbase_pm_get_ready_cores( + kbdev, KBASE_PM_CORE_L2)); + KBASE_TLSTREAM_AUX_PM_STATE( + KBASE_PM_CORE_SHADER, + kbase_pm_get_ready_cores( + kbdev, KBASE_PM_CORE_SHADER)); + KBASE_TLSTREAM_AUX_PM_STATE( + KBASE_PM_CORE_TILER, + kbase_pm_get_ready_cores( + kbdev, + KBASE_PM_CORE_TILER)); + + if (corestack_driver_control) KBASE_TLSTREAM_AUX_PM_STATE( KBASE_PM_CORE_STACK, kbase_pm_get_ready_cores( kbdev, KBASE_PM_CORE_STACK)); -#endif /* CONFIG_MALI_CORESTACK */ +} + +void kbase_pm_update_state(struct kbase_device *kbdev) +{ + enum kbase_shader_core_state prev_shaders_state = + kbdev->pm.backend.shaders_state; + lockdep_assert_held(&kbdev->hwaccess_lock); + + if (!kbdev->pm.backend.gpu_powered) + return; /* Do nothing if the GPU is off */ + + kbase_pm_l2_update_state(kbdev); + kbase_pm_shaders_update_state(kbdev); + + /* If the shaders just turned off, re-invoke the L2 state machine, in + * case it was waiting for the shaders to turn off before powering down + * the L2. + */ + if (prev_shaders_state != KBASE_SHADERS_OFF_CORESTACK_OFF && + kbdev->pm.backend.shaders_state == KBASE_SHADERS_OFF_CORESTACK_OFF) + kbase_pm_l2_update_state(kbdev); + + if (kbase_pm_is_in_desired_state_nolock(kbdev)) { KBASE_TRACE_ADD(kbdev, PM_DESIRED_REACHED, NULL, NULL, - kbdev->pm.backend.gpu_in_desired_state, - (u32)kbdev->pm.backend.desired_shader_state); - KBASE_TRACE_ADD(kbdev, PM_DESIRED_REACHED_TILER, NULL, NULL, 0u, - (u32)kbdev->pm.backend.desired_tiler_state); + true, kbdev->pm.backend.shaders_avail); - /* Wake slow-path waiters. Job scheduler does not use this. 
*/ - KBASE_TRACE_ADD(kbdev, PM_WAKE_WAITERS, NULL, NULL, 0u, 0); + kbase_pm_trace_power_state(kbdev); + KBASE_TRACE_ADD(kbdev, PM_WAKE_WAITERS, NULL, NULL, 0u, 0); wake_up(&kbdev->pm.backend.gpu_in_desired_state_wait); } +} - spin_unlock(&kbdev->pm.backend.gpu_powered_lock); - - kbdev->shader_ready_bitmap = shader_ready_bitmap; - kbdev->shader_transitioning_bitmap = shader_transitioning_bitmap; - - /* The core availability policy is not allowed to keep core group 0 - * turned off (unless it was changing the l2 power state) */ - if (!((shader_ready_bitmap | shader_transitioning_bitmap) & - kbdev->gpu_props.props.coherency_info.group[0].core_mask) && - (prev_l2_available_bitmap == desired_l2_state) && - !(kbase_pm_ca_get_core_mask(kbdev) & - kbdev->gpu_props.props.coherency_info.group[0].core_mask)) - BUG(); - - /* The core availability policy is allowed to keep core group 1 off, - * but all jobs specifically targeting CG1 must fail */ - if (!((shader_ready_bitmap | shader_transitioning_bitmap) & - kbdev->gpu_props.props.coherency_info.group[1].core_mask) && - !(kbase_pm_ca_get_core_mask(kbdev) & - kbdev->gpu_props.props.coherency_info.group[1].core_mask)) - kbdev->pm.backend.cg1_disabled = true; - else - kbdev->pm.backend.cg1_disabled = false; +static enum hrtimer_restart +shader_tick_timer_callback(struct hrtimer *timer) +{ + struct kbasep_pm_tick_timer_state *stt = container_of(timer, + struct kbasep_pm_tick_timer_state, timer); + struct kbase_device *kbdev = container_of(stt, struct kbase_device, + pm.backend.shader_tick_timer); + struct kbase_pm_backend_data *backend = &kbdev->pm.backend; + unsigned long flags; + enum hrtimer_restart restart = HRTIMER_NORESTART; - return cores_are_available; + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); + + if (stt->remaining_ticks && + backend->shaders_state == KBASE_SHADERS_WAIT_OFF_CORESTACK_ON) { + stt->remaining_ticks--; + + /* If the remaining ticks just changed from 1 to 0, invoke the + * PM state machine to power off the shader cores. 
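
The hysteresis described in the comments above is a countdown driven by a free-running tick timer rather than an hrtimer that is started and cancelled on every request: a power-off request arms remaining_ticks, each tick decrements it, and the shaders are only powered off if the countdown reaches zero while they are still unwanted. A small self-contained model of that behaviour (all names are stand-ins for the driver's):

#include <stdbool.h>
#include <stdio.h>

struct toy_hysteresis {
        int configured_ticks;
        int remaining_ticks;
        bool shaders_desired;
        bool shaders_powered;
};

static void toy_request_poweroff(struct toy_hysteresis *h)
{
        h->shaders_desired = false;
        h->remaining_ticks = h->configured_ticks;       /* arm the countdown */
}

static void toy_tick(struct toy_hysteresis *h)
{
        if (h->shaders_desired || !h->shaders_powered)
                return;                                 /* nothing to do */

        if (h->remaining_ticks && --h->remaining_ticks == 0)
                h->shaders_powered = false;             /* hysteresis expired */
}

int main(void)
{
        struct toy_hysteresis h = {
                .configured_ticks = 2,
                .shaders_desired = true,
                .shaders_powered = true,
        };

        toy_request_poweroff(&h);
        toy_tick(&h);                   /* 2 -> 1, still powered */
        h.shaders_desired = true;       /* re-requested before expiry: stays on */
        toy_tick(&h);                   /* ignored while desired */
        toy_request_poweroff(&h);       /* countdown re-armed */
        toy_tick(&h);
        toy_tick(&h);
        printf("powered=%d\n", h.shaders_powered);      /* 0: powered off */
        return 0;
}
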
+ */ + if (!stt->remaining_ticks && !backend->shaders_desired) + kbase_pm_update_state(kbdev); + } + + if (stt->needed) { + hrtimer_forward_now(timer, stt->configured_interval); + restart = HRTIMER_RESTART; + } + + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + + return restart; } -KBASE_EXPORT_TEST_API(kbase_pm_check_transitions_nolock); -/* Timeout for kbase_pm_check_transitions_sync when wait_event_killable has +int kbase_pm_state_machine_init(struct kbase_device *kbdev) +{ + struct kbasep_pm_tick_timer_state *stt = &kbdev->pm.backend.shader_tick_timer; + + stt->wq = alloc_workqueue("kbase_pm_shader_poweroff", WQ_HIGHPRI | WQ_UNBOUND, 1); + if (!stt->wq) + return -ENOMEM; + + INIT_WORK(&stt->work, shader_poweroff_timer_stop_callback); + + stt->needed = false; + hrtimer_init(&stt->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + stt->timer.function = shader_tick_timer_callback; + stt->configured_interval = HR_TIMER_DELAY_NSEC(DEFAULT_PM_GPU_POWEROFF_TICK_NS); + stt->configured_ticks = DEFAULT_PM_POWEROFF_TICK_SHADER; + + return 0; +} + +void kbase_pm_state_machine_term(struct kbase_device *kbdev) +{ + hrtimer_cancel(&kbdev->pm.backend.shader_tick_timer.timer); + destroy_workqueue(kbdev->pm.backend.shader_tick_timer.wq); +} + +void kbase_pm_reset_start_locked(struct kbase_device *kbdev) +{ + struct kbase_pm_backend_data *backend = &kbdev->pm.backend; + + lockdep_assert_held(&kbdev->hwaccess_lock); + + backend->in_reset = true; + backend->l2_state = KBASE_L2_RESET_WAIT; + backend->shaders_state = KBASE_SHADERS_RESET_WAIT; + + /* We're in a reset, so hwcnt will have been synchronously disabled by + * this function's caller as part of the reset process. We therefore + * know that any call to kbase_hwcnt_context_disable_atomic, if + * required to sync the hwcnt refcount with our internal state, is + * guaranteed to succeed. + */ + backend->hwcnt_desired = false; + if (!backend->hwcnt_disabled) { + WARN_ON(!kbase_hwcnt_context_disable_atomic( + kbdev->hwcnt_gpu_ctx)); + backend->hwcnt_disabled = true; + } + + shader_poweroff_timer_queue_cancel(kbdev); +} + +void kbase_pm_reset_complete(struct kbase_device *kbdev) +{ + struct kbase_pm_backend_data *backend = &kbdev->pm.backend; + unsigned long flags; + + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); + + backend->in_reset = false; + kbase_pm_update_state(kbdev); + + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); +} + +/* Timeout for kbase_pm_wait_for_desired_state when wait_event_killable has * aborted due to a fatal signal. If the time spent waiting has exceeded this * threshold then there is most likely a hardware issue. */ #define PM_TIMEOUT (5*HZ) /* 5s */ -void kbase_pm_check_transitions_sync(struct kbase_device *kbdev) +static void kbase_pm_timed_out(struct kbase_device *kbdev) +{ + dev_err(kbdev->dev, "Power transition timed out unexpectedly\n"); + dev_err(kbdev->dev, "Desired state :\n"); + dev_err(kbdev->dev, "\tShader=%016llx\n", + kbdev->pm.backend.shaders_desired ? 
kbdev->pm.backend.shaders_avail : 0); + dev_err(kbdev->dev, "Current state :\n"); + dev_err(kbdev->dev, "\tShader=%08x%08x\n", + kbase_reg_read(kbdev, + GPU_CONTROL_REG(SHADER_READY_HI)), + kbase_reg_read(kbdev, + GPU_CONTROL_REG(SHADER_READY_LO))); + dev_err(kbdev->dev, "\tTiler =%08x%08x\n", + kbase_reg_read(kbdev, + GPU_CONTROL_REG(TILER_READY_HI)), + kbase_reg_read(kbdev, + GPU_CONTROL_REG(TILER_READY_LO))); + dev_err(kbdev->dev, "\tL2 =%08x%08x\n", + kbase_reg_read(kbdev, + GPU_CONTROL_REG(L2_READY_HI)), + kbase_reg_read(kbdev, + GPU_CONTROL_REG(L2_READY_LO))); + dev_err(kbdev->dev, "Cores transitioning :\n"); + dev_err(kbdev->dev, "\tShader=%08x%08x\n", + kbase_reg_read(kbdev, GPU_CONTROL_REG( + SHADER_PWRTRANS_HI)), + kbase_reg_read(kbdev, GPU_CONTROL_REG( + SHADER_PWRTRANS_LO))); + dev_err(kbdev->dev, "\tTiler =%08x%08x\n", + kbase_reg_read(kbdev, GPU_CONTROL_REG( + TILER_PWRTRANS_HI)), + kbase_reg_read(kbdev, GPU_CONTROL_REG( + TILER_PWRTRANS_LO))); + dev_err(kbdev->dev, "\tL2 =%08x%08x\n", + kbase_reg_read(kbdev, GPU_CONTROL_REG( + L2_PWRTRANS_HI)), + kbase_reg_read(kbdev, GPU_CONTROL_REG( + L2_PWRTRANS_LO))); + + dev_err(kbdev->dev, "Sending reset to GPU - all running jobs will be lost\n"); + if (kbase_prepare_to_reset_gpu(kbdev)) + kbase_reset_gpu(kbdev); +} + +void kbase_pm_wait_for_l2_powered(struct kbase_device *kbdev) { unsigned long flags; unsigned long timeout; - bool cores_are_available; - int ret; + int err; - /* Force the transition to be checked and reported - the cores may be - * 'available' (for job submission) but not fully powered up. */ spin_lock_irqsave(&kbdev->hwaccess_lock, flags); + kbase_pm_update_state(kbdev); + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + + timeout = jiffies + PM_TIMEOUT; - cores_are_available = kbase_pm_check_transitions_nolock(kbdev); + /* Wait for cores */ + err = wait_event_killable(kbdev->pm.backend.gpu_in_desired_state_wait, + kbase_pm_is_in_desired_state_with_l2_powered(kbdev)); + + if (err < 0 && time_after(jiffies, timeout)) + kbase_pm_timed_out(kbdev); +} - /* Don't need 'cores_are_available', because we don't return anything */ - CSTD_UNUSED(cores_are_available); +void kbase_pm_wait_for_desired_state(struct kbase_device *kbdev) +{ + unsigned long flags; + unsigned long timeout; + int err; + + /* Let the state machine latch the most recent desired state. 
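
Both new wait helpers (kbase_pm_wait_for_l2_powered above, and kbase_pm_wait_for_desired_state whose body continues below) share the same timeout convention: wait_event_killable() only returns an error when the waiting task receives a fatal signal, and the PM_TIMEOUT deadline is then used purely as a diagnostic, so the register dump and GPU reset in kbase_pm_timed_out() fire only when a killed waiter had also been blocked for longer than the expected worst case. A stripped-down sketch of the pattern; the wait queue and condition helper are placeholders, not real kbase symbols:

/* Sketch only: assumes a wait queue 'my_wq' and a helper 'my_condition()'
 * exist; not an actual kbase function.
 */
static void my_wait_with_diagnostic(struct kbase_device *kbdev)
{
        unsigned long timeout = jiffies + PM_TIMEOUT;
        int err;

        /* Returns 0 once the condition is true, or -ERESTARTSYS if a fatal
         * signal arrived first.
         */
        err = wait_event_killable(my_wq, my_condition(kbdev));

        /* Only treat this as a hardware problem if we were killed *and* had
         * already waited longer than PM_TIMEOUT.
         */
        if (err < 0 && time_after(jiffies, timeout))
                kbase_pm_timed_out(kbdev);
}
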
*/ + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); + kbase_pm_update_state(kbdev); spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); timeout = jiffies + PM_TIMEOUT; /* Wait for cores */ - ret = wait_event_killable(kbdev->pm.backend.gpu_in_desired_state_wait, - kbdev->pm.backend.gpu_in_desired_state); - - if (ret < 0 && time_after(jiffies, timeout)) { - dev_err(kbdev->dev, "Power transition timed out unexpectedly\n"); - dev_err(kbdev->dev, "Desired state :\n"); - dev_err(kbdev->dev, "\tShader=%016llx\n", - kbdev->pm.backend.desired_shader_state); - dev_err(kbdev->dev, "\tTiler =%016llx\n", - kbdev->pm.backend.desired_tiler_state); - dev_err(kbdev->dev, "Current state :\n"); - dev_err(kbdev->dev, "\tShader=%08x%08x\n", - kbase_reg_read(kbdev, - GPU_CONTROL_REG(SHADER_READY_HI)), - kbase_reg_read(kbdev, - GPU_CONTROL_REG(SHADER_READY_LO))); - dev_err(kbdev->dev, "\tTiler =%08x%08x\n", - kbase_reg_read(kbdev, - GPU_CONTROL_REG(TILER_READY_HI)), - kbase_reg_read(kbdev, - GPU_CONTROL_REG(TILER_READY_LO))); - dev_err(kbdev->dev, "\tL2 =%08x%08x\n", - kbase_reg_read(kbdev, - GPU_CONTROL_REG(L2_READY_HI)), - kbase_reg_read(kbdev, - GPU_CONTROL_REG(L2_READY_LO))); - dev_err(kbdev->dev, "Cores transitioning :\n"); - dev_err(kbdev->dev, "\tShader=%08x%08x\n", - kbase_reg_read(kbdev, GPU_CONTROL_REG( - SHADER_PWRTRANS_HI)), - kbase_reg_read(kbdev, GPU_CONTROL_REG( - SHADER_PWRTRANS_LO))); - dev_err(kbdev->dev, "\tTiler =%08x%08x\n", - kbase_reg_read(kbdev, GPU_CONTROL_REG( - TILER_PWRTRANS_HI)), - kbase_reg_read(kbdev, GPU_CONTROL_REG( - TILER_PWRTRANS_LO))); - dev_err(kbdev->dev, "\tL2 =%08x%08x\n", - kbase_reg_read(kbdev, GPU_CONTROL_REG( - L2_PWRTRANS_HI)), - kbase_reg_read(kbdev, GPU_CONTROL_REG( - L2_PWRTRANS_LO))); -#if KBASE_GPU_RESET_EN - dev_err(kbdev->dev, "Sending reset to GPU - all running jobs will be lost\n"); - if (kbase_prepare_to_reset_gpu(kbdev)) - kbase_reset_gpu(kbdev); -#endif /* KBASE_GPU_RESET_EN */ - } + err = wait_event_killable(kbdev->pm.backend.gpu_in_desired_state_wait, + kbase_pm_is_in_desired_state(kbdev)); + + if (err < 0 && time_after(jiffies, timeout)) + kbase_pm_timed_out(kbdev); } -KBASE_EXPORT_TEST_API(kbase_pm_check_transitions_sync); +KBASE_EXPORT_TEST_API(kbase_pm_wait_for_desired_state); void kbase_pm_enable_interrupts(struct kbase_device *kbdev) { @@ -957,7 +1256,6 @@ void kbase_pm_disable_interrupts(struct kbase_device *kbdev) KBASE_EXPORT_TEST_API(kbase_pm_disable_interrupts); - /* * pmu layout: * 0x0000: PMU TAG (RO) (0xCAFECAFE) @@ -990,12 +1288,7 @@ void kbase_pm_clock_on(struct kbase_device *kbdev, bool is_resume) kbdev->pm.backend.callback_power_resume(kbdev); return; } else if (kbdev->pm.backend.callback_power_on) { - kbdev->pm.backend.callback_power_on(kbdev); - /* If your platform properly keeps the GPU state you may use the - * return value of the callback_power_on function to - * conditionally reset the GPU on power up. Currently we are - * conservative and always reset the GPU. 
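
In the kbase_pm_clock_on() change that follows, the driver stops unconditionally resetting the GPU and instead lets the platform's power-on callback report whether a reset is needed. A hypothetical platform callback is sketched below; the function name and the state-retention check are made up for illustration, only the return-value meaning is taken from the hunk itself:

/* Hypothetical platform glue: return nonzero if GPU state may have been
 * lost while powered down (kbase will then reset it), or 0 if the platform
 * retained state and the reset can be skipped.
 */
static int example_platform_power_on(struct kbase_device *kbdev)
{
        bool state_retained;

        /* ... enable regulators and clocks for the GPU here ... */

        state_retained = example_platform_kept_gpu_state();    /* made-up helper */

        return state_retained ? 0 : 1;
}
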
*/ - reset_required = true; + reset_required = kbdev->pm.backend.callback_power_on(kbdev); } spin_lock_irqsave(&kbdev->pm.backend.gpu_powered_lock, flags); @@ -1014,8 +1307,14 @@ void kbase_pm_clock_on(struct kbase_device *kbdev, bool is_resume) spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); mutex_unlock(&kbdev->mmu_hw_mutex); - /* Lastly, enable the interrupts */ + /* Enable the interrupts */ kbase_pm_enable_interrupts(kbdev); + + /* Turn on the L2 caches */ + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); + kbdev->pm.backend.l2_desired = true; + kbase_pm_update_state(kbdev); + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); } KBASE_EXPORT_TEST_API(kbase_pm_clock_on); @@ -1028,7 +1327,7 @@ bool kbase_pm_clock_off(struct kbase_device *kbdev, bool is_suspend) lockdep_assert_held(&kbdev->pm.lock); /* ASSERT that the cores should now be unavailable. No lock needed. */ - KBASE_DEBUG_ASSERT(kbdev->shader_available_bitmap == 0u); + WARN_ON(kbdev->pm.backend.shaders_state != KBASE_SHADERS_OFF_CORESTACK_OFF); kbdev->poweroff_pending = true; @@ -1252,14 +1551,31 @@ static void kbase_pm_hw_issues_detect(struct kbase_device *kbdev) if (kbase_hw_has_feature(kbdev, BASE_HW_FEATURE_TLS_HASHING)) kbdev->hw_quirks_sc |= SC_TLS_HASH_ENABLE; + if (kbase_hw_has_feature(kbdev, BASE_HW_FEATURE_IDVS_GROUP_SIZE)) { + int default_idvs_group_size = 0xF; + u32 tmp; + + if (of_property_read_u32(kbdev->dev->of_node, + "idvs-group-size", &tmp)) + tmp = default_idvs_group_size; + + if (tmp > JM_MAX_IDVS_GROUP_SIZE) { + dev_err(kbdev->dev, + "idvs-group-size of %d is too large. Maximum value is %d", + tmp, JM_MAX_IDVS_GROUP_SIZE); + tmp = default_idvs_group_size; + } + + kbdev->hw_quirks_jm |= tmp << JM_IDVS_GROUP_SIZE_SHIFT; + } + if (!kbdev->hw_quirks_jm) kbdev->hw_quirks_jm = kbase_reg_read(kbdev, GPU_CONTROL_REG(JM_CONFIG)); -#ifdef CONFIG_MALI_CORESTACK #define MANUAL_POWER_CONTROL ((u32)(1 << 8)) - kbdev->hw_quirks_jm |= MANUAL_POWER_CONTROL; -#endif /* CONFIG_MALI_CORESTACK */ + if (corestack_driver_control) + kbdev->hw_quirks_jm |= MANUAL_POWER_CONTROL; } static void kbase_pm_hw_issues_apply(struct kbase_device *kbdev) @@ -1411,7 +1727,6 @@ int kbase_pm_init_hw(struct kbase_device *kbdev, unsigned int flags) { unsigned long irq_flags; int err; - bool resume_vinstr = false; KBASE_DEBUG_ASSERT(NULL != kbdev); lockdep_assert_held(&kbdev->pm.lock); @@ -1438,15 +1753,9 @@ int kbase_pm_init_hw(struct kbase_device *kbdev, unsigned int flags) /* The cores should be made unavailable due to the reset */ spin_lock_irqsave(&kbdev->hwaccess_lock, irq_flags); - if (kbdev->shader_available_bitmap != 0u) - KBASE_TRACE_ADD(kbdev, PM_CORES_CHANGE_AVAILABLE, NULL, - NULL, 0u, (u32)0u); - if (kbdev->tiler_available_bitmap != 0u) - KBASE_TRACE_ADD(kbdev, PM_CORES_CHANGE_AVAILABLE_TILER, - NULL, NULL, 0u, (u32)0u); - kbdev->shader_available_bitmap = 0u; - kbdev->tiler_available_bitmap = 0u; - kbdev->l2_available_bitmap = 0u; + if (kbdev->pm.backend.shaders_state != KBASE_SHADERS_OFF_CORESTACK_OFF) + KBASE_TRACE_ADD(kbdev, PM_CORES_CHANGE_AVAILABLE, NULL, + NULL, 0u, (u32)0u); spin_unlock_irqrestore(&kbdev->hwaccess_lock, irq_flags); /* Soft reset the GPU */ @@ -1457,10 +1766,7 @@ int kbase_pm_init_hw(struct kbase_device *kbdev, unsigned int flags) err = kbase_pm_do_reset(kbdev); spin_lock_irqsave(&kbdev->hwaccess_lock, irq_flags); - if (kbdev->protected_mode) - resume_vinstr = true; kbdev->protected_mode = false; - spin_unlock_irqrestore(&kbdev->hwaccess_lock, irq_flags); if (err) @@ -1484,9 +1790,7 @@ int 
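
The idvs-group-size handling added above reads an optional devicetree property, falls back to a default of 0xF when the property is absent or out of range, and ORs the value into the JM_CONFIG quirks word. A compilable model of that clamp-and-pack step, using placeholder shift and limit values rather than the real register layout:

#include <stdint.h>
#include <stdio.h>

#define TOY_IDVS_GROUP_SIZE_SHIFT 16    /* placeholder, not the real field */
#define TOY_MAX_IDVS_GROUP_SIZE   0x3F  /* placeholder limit */

static uint32_t toy_pack_idvs_group_size(int property_present, uint32_t value,
                                         uint32_t hw_quirks_jm)
{
        uint32_t group_size = 0xF;      /* default when absent or invalid */

        if (property_present) {
                if (value > TOY_MAX_IDVS_GROUP_SIZE)
                        fprintf(stderr, "idvs-group-size %u too large, using default\n",
                                (unsigned)value);
                else
                        group_size = value;
        }

        return hw_quirks_jm | (group_size << TOY_IDVS_GROUP_SIZE_SHIFT);
}

int main(void)
{
        printf("quirks=0x%08x\n", (unsigned)toy_pack_idvs_group_size(1, 8, 0));
        printf("quirks=0x%08x\n", (unsigned)toy_pack_idvs_group_size(0, 0, 0));
        return 0;
}
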
kbase_pm_init_hw(struct kbase_device *kbdev, unsigned int flags) * false when called from kbase_pm_powerup */ if (kbdev->pm.backend.gpu_cycle_counter_requests && (flags & PM_ENABLE_IRQS)) { - /* enable interrupts as the L2 may have to be powered on */ kbase_pm_enable_interrupts(kbdev); - kbase_pm_request_l2_caches(kbdev); /* Re-enable the counters if we need to */ spin_lock_irqsave( @@ -1499,10 +1803,6 @@ int kbase_pm_init_hw(struct kbase_device *kbdev, unsigned int flags) &kbdev->pm.backend.gpu_cycle_counter_requests_lock, irq_flags); - spin_lock_irqsave(&kbdev->hwaccess_lock, irq_flags); - kbase_pm_release_l2_caches(kbdev); - spin_unlock_irqrestore(&kbdev->hwaccess_lock, irq_flags); - kbase_pm_disable_interrupts(kbdev); } @@ -1510,10 +1810,16 @@ int kbase_pm_init_hw(struct kbase_device *kbdev, unsigned int flags) kbase_pm_enable_interrupts(kbdev); exit: - /* If GPU is leaving protected mode resume vinstr operation. */ - if (kbdev->vinstr_ctx && resume_vinstr) - kbase_vinstr_resume(kbdev->vinstr_ctx); - + /* Re-enable GPU hardware counters if we're resetting from protected + * mode. + */ + spin_lock_irqsave(&kbdev->hwaccess_lock, irq_flags); + kbdev->protected_mode_hwcnt_desired = true; + if (kbdev->protected_mode_hwcnt_disabled) { + kbase_hwcnt_context_enable(kbdev->hwcnt_gpu_ctx); + kbdev->protected_mode_hwcnt_disabled = false; + } + spin_unlock_irqrestore(&kbdev->hwaccess_lock, irq_flags); return err; } @@ -1527,9 +1833,8 @@ exit: * kbase_pm_request_gpu_cycle_counter() or * kbase_pm_request_gpu_cycle_counter_l2_is_on() only * - * When this function is called the l2 cache must be on and the l2 cache users - * count must have been incremented by a call to ( - * kbase_pm_request_l2_caches() or kbase_pm_request_l2_caches_l2_on() ) + * When this function is called the l2 cache must be on - i.e., the GPU must be + * on. * * @kbdev: The kbase device structure of the device */ @@ -1561,8 +1866,6 @@ void kbase_pm_request_gpu_cycle_counter(struct kbase_device *kbdev) KBASE_DEBUG_ASSERT(kbdev->pm.backend.gpu_cycle_counter_requests < INT_MAX); - kbase_pm_request_l2_caches(kbdev); - kbase_pm_request_gpu_cycle_counter_do_request(kbdev); } @@ -1577,8 +1880,6 @@ void kbase_pm_request_gpu_cycle_counter_l2_is_on(struct kbase_device *kbdev) KBASE_DEBUG_ASSERT(kbdev->pm.backend.gpu_cycle_counter_requests < INT_MAX); - kbase_pm_request_l2_caches_l2_is_on(kbdev); - kbase_pm_request_gpu_cycle_counter_do_request(kbdev); } @@ -1606,8 +1907,6 @@ void kbase_pm_release_gpu_cycle_counter_nolock(struct kbase_device *kbdev) spin_unlock_irqrestore( &kbdev->pm.backend.gpu_cycle_counter_requests_lock, flags); - - kbase_pm_release_l2_caches(kbdev); } void kbase_pm_release_gpu_cycle_counter(struct kbase_device *kbdev) diff --git a/mali_kbase/backend/gpu/mali_kbase_pm_internal.h b/mali_kbase/backend/gpu/mali_kbase_pm_internal.h index 0d3599a..e88b3a8 100644 --- a/mali_kbase/backend/gpu/mali_kbase_pm_internal.h +++ b/mali_kbase/backend/gpu/mali_kbase_pm_internal.h @@ -163,7 +163,7 @@ void kbase_pm_enable_interrupts(struct kbase_device *kbdev); * kbase_pm_disable_interrupts - Disable interrupts on the device. * * This prevents delivery of Power Management interrupts to the CPU so that - * kbase_pm_check_transitions_nolock() will not be called from the IRQ handler + * kbase_pm_update_state() will not be called from the IRQ handler * until kbase_pm_enable_interrupts() or kbase_pm_clock_on() is called. * * Interrupts are also disabled after a call to kbase_pm_clock_off(). 
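
The reset path and the kbase_pm_init_hw() exit path above keep a pair of flags for the GPU hardware counters: one recording what the driver wants (desired) and one recording what has actually been applied (disabled), because the atomic variant of the disable can be refused when it cannot complete without blocking and the two then have to be re-synced later. A toy model of that bookkeeping; the names are illustrative and this is not the kbase_hwcnt API:

#include <stdbool.h>
#include <stdio.h>

struct toy_hwcnt {
        bool desired;           /* the driver wants counters enabled */
        bool disabled;          /* counters are actually disabled */
        bool would_block;       /* stand-in for "cannot disable atomically now" */
};

static bool toy_disable_atomic(struct toy_hwcnt *h)
{
        if (h->would_block)
                return false;   /* refused; caller must retry or disable later */
        h->disabled = true;
        return true;
}

static void toy_sync(struct toy_hwcnt *h)
{
        if (!h->desired && !h->disabled)
                toy_disable_atomic(h);
        else if (h->desired && h->disabled)
                h->disabled = false;    /* re-enable */
}

int main(void)
{
        struct toy_hwcnt h = { .desired = true };

        h.desired = false;      /* e.g. a reset or protected-mode entry starts */
        toy_sync(&h);
        printf("disabled=%d\n", h.disabled);    /* 1 */

        h.desired = true;       /* e.g. the reset has finished */
        toy_sync(&h);
        printf("disabled=%d\n", h.disabled);    /* 0 */
        return 0;
}
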
@@ -206,58 +206,38 @@ int kbase_pm_init_hw(struct kbase_device *kbdev, unsigned int flags); */ void kbase_pm_reset_done(struct kbase_device *kbdev); - /** - * kbase_pm_check_transitions_nolock - Check if there are any power transitions - * to make, and if so start them. - * - * This function will check the desired_xx_state members of - * struct kbase_pm_device_data and the actual status of the hardware to see if - * any power transitions can be made at this time to make the hardware state - * closer to the state desired by the power policy. + * kbase_pm_wait_for_desired_state - Wait for the desired power state to be + * reached * - * The return value can be used to check whether all the desired cores are - * available, and so whether it's worth submitting a job (e.g. from a Power - * Management IRQ). + * Wait for the L2 and shader power state machines to reach the states + * corresponding to the values of 'l2_desired' and 'shaders_desired'. * - * Note that this still returns true when desired_xx_state has no - * cores. That is: of the no cores desired, none were *un*available. In - * this case, the caller may still need to try submitting jobs. This is because - * the Core Availability Policy might have taken us to an intermediate state - * where no cores are powered, before powering on more cores (e.g. for core - * rotation) + * The usual use-case for this is to ensure cores are 'READY' after performing + * a GPU Reset. * - * The caller must hold kbase_device.pm.power_change_lock + * Unlike kbase_pm_update_state(), the caller must not hold hwaccess_lock, + * because this function will take that lock itself. * * @kbdev: The kbase device structure for the device (must be a valid pointer) - * - * Return: non-zero when all desired cores are available. That is, - * it's worthwhile for the caller to submit a job. - * false otherwise */ -bool kbase_pm_check_transitions_nolock(struct kbase_device *kbdev); +void kbase_pm_wait_for_desired_state(struct kbase_device *kbdev); /** - * kbase_pm_check_transitions_sync - Synchronous and locking variant of - * kbase_pm_check_transitions_nolock() - * - * On returning, the desired state at the time of the call will have been met. + * kbase_pm_wait_for_l2_powered - Wait for the L2 cache to be powered on * - * There is nothing to stop the core being switched off by calls to - * kbase_pm_release_cores() or kbase_pm_unrequest_cores(). Therefore, the - * caller must have already made a call to - * kbase_pm_request_cores()/kbase_pm_request_cores_sync() previously. + * Wait for the L2 to be powered on, and for the L2 and shader state machines to + * stabilise by reaching the states corresponding to the values of 'l2_desired' + * and 'shaders_desired'. * - * The usual use-case for this is to ensure cores are 'READY' after performing - * a GPU Reset. + * kbdev->pm.active_count must be non-zero when calling this function. * - * Unlike kbase_pm_check_transitions_nolock(), the caller must not hold - * kbase_device.pm.power_change_lock, because this function will take that - * lock itself. + * Unlike kbase_pm_update_state(), the caller must not hold hwaccess_lock, + * because this function will take that lock itself. 
* * @kbdev: The kbase device structure for the device (must be a valid pointer) */ -void kbase_pm_check_transitions_sync(struct kbase_device *kbdev); +void kbase_pm_wait_for_l2_powered(struct kbase_device *kbdev); /** * kbase_pm_update_cores_state_nolock - Variant of kbase_pm_update_cores_state() @@ -269,6 +249,25 @@ void kbase_pm_check_transitions_sync(struct kbase_device *kbdev); void kbase_pm_update_cores_state_nolock(struct kbase_device *kbdev); /** + * kbase_pm_update_state - Update the L2 and shader power state machines + * @kbdev: Device pointer + */ +void kbase_pm_update_state(struct kbase_device *kbdev); + +/** + * kbase_pm_state_machine_init - Initialize the state machines, primarily the + * shader poweroff timer + * @kbdev: Device pointer + */ +int kbase_pm_state_machine_init(struct kbase_device *kbdev); + +/** + * kbase_pm_state_machine_term - Clean up the PM state machines' data + * @kbdev: Device pointer + */ +void kbase_pm_state_machine_term(struct kbase_device *kbdev); + +/** * kbase_pm_update_cores_state - Update the desired state of shader cores from * the Power Policy, and begin any power * transitions. @@ -283,24 +282,6 @@ void kbase_pm_update_cores_state_nolock(struct kbase_device *kbdev); void kbase_pm_update_cores_state(struct kbase_device *kbdev); /** - * kbase_pm_cancel_deferred_poweroff - Cancel any pending requests to power off - * the GPU and/or shader cores. - * - * This should be called by any functions which directly power off the GPU. - * - * @kbdev: The kbase device structure for the device (must be a valid pointer) - */ -void kbase_pm_cancel_deferred_poweroff(struct kbase_device *kbdev); - -/** - * kbasep_pm_init_core_use_bitmaps - Initialise data tracking the required - * and used cores. - * - * @kbdev: The kbase device structure for the device (must be a valid pointer) - */ -void kbasep_pm_init_core_use_bitmaps(struct kbase_device *kbdev); - -/** * kbasep_pm_metrics_init - Initialize the metrics gathering framework. * * This must be called before other metric gathering APIs are called. @@ -577,4 +558,67 @@ void kbase_pm_cache_snoop_disable(struct kbase_device *kbdev); void kbase_devfreq_set_core_mask(struct kbase_device *kbdev, u64 core_mask); #endif +/** + * kbase_pm_reset_start_locked - Signal that GPU reset has started + * @kbdev: Device pointer + * + * Normal power management operation will be suspended until the reset has + * completed. + * + * Caller must hold hwaccess_lock. + */ +void kbase_pm_reset_start_locked(struct kbase_device *kbdev); + +/** + * kbase_pm_reset_complete - Signal that GPU reset has completed + * @kbdev: Device pointer + * + * Normal power management operation will be resumed. The power manager will + * re-evaluate what cores are needed and power on or off as required. + */ +void kbase_pm_reset_complete(struct kbase_device *kbdev); + +/** + * kbase_pm_protected_override_enable - Enable the protected mode override + * @kbdev: Device pointer + * + * When the protected mode override is enabled, all shader cores are requested + * to power down, and the L2 power state can be controlled by + * kbase_pm_protected_l2_override(). + * + * Caller must hold hwaccess_lock. + */ +void kbase_pm_protected_override_enable(struct kbase_device *kbdev); + +/** + * kbase_pm_protected_override_disable - Disable the protected mode override + * @kbdev: Device pointer + * + * Caller must hold hwaccess_lock. 
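
For context, the two reset hooks documented above (their implementations appear earlier in this patch) are intended to bracket a GPU reset roughly as sketched below; the reset step itself is a placeholder, not the driver's actual reset sequence:

static void example_gpu_reset(struct kbase_device *kbdev)
{
        unsigned long flags;

        spin_lock_irqsave(&kbdev->hwaccess_lock, flags);
        kbase_pm_reset_start_locked(kbdev);     /* park the PM state machines */
        spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags);

        example_do_soft_reset(kbdev);           /* placeholder for the real reset */

        kbase_pm_reset_complete(kbdev);         /* PM re-evaluates and repowers cores */
}
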
+ */ +void kbase_pm_protected_override_disable(struct kbase_device *kbdev); + +/** + * kbase_pm_protected_l2_override - Control the protected mode L2 override + * @kbdev: Device pointer + * @override: true to enable the override, false to disable + * + * When the driver is transitioning in or out of protected mode, the L2 cache is + * forced to power off. This can be overridden to force the L2 cache to power + * on. This is required to change coherency settings on some GPUs. + */ +void kbase_pm_protected_l2_override(struct kbase_device *kbdev, bool override); + +/* If true, the driver should explicitly control corestack power management, + * instead of relying on the Power Domain Controller. + */ +extern bool corestack_driver_control; + +/* If true, disable powering-down of individual cores, and just power-down at + * the top-level using platform-specific code. + * If false, use the expected behaviour of controlling the individual cores + * from within the driver. + */ +extern bool platform_power_down_only; + #endif /* _KBASE_BACKEND_PM_INTERNAL_H_ */ diff --git a/mali_kbase/backend/gpu/mali_kbase_pm_policy.c b/mali_kbase/backend/gpu/mali_kbase_pm_policy.c index 6dd00a9..2f06a0a 100644 --- a/mali_kbase/backend/gpu/mali_kbase_pm_policy.c +++ b/mali_kbase/backend/gpu/mali_kbase_pm_policy.c @@ -30,215 +30,51 @@ #include <mali_kbase_config_defaults.h> #include <backend/gpu/mali_kbase_pm_internal.h> -static const struct kbase_pm_policy *const policy_list[] = { +static const struct kbase_pm_policy *const all_policy_list[] = { #ifdef CONFIG_MALI_NO_MALI &kbase_pm_always_on_policy_ops, - &kbase_pm_demand_policy_ops, &kbase_pm_coarse_demand_policy_ops, #if !MALI_CUSTOMER_RELEASE - &kbase_pm_demand_always_powered_policy_ops, - &kbase_pm_fast_start_policy_ops, + &kbase_pm_always_on_demand_policy_ops, #endif #else /* CONFIG_MALI_NO_MALI */ -#if !PLATFORM_POWER_DOWN_ONLY - &kbase_pm_demand_policy_ops, -#endif /* !PLATFORM_POWER_DOWN_ONLY */ &kbase_pm_coarse_demand_policy_ops, - &kbase_pm_always_on_policy_ops, #if !MALI_CUSTOMER_RELEASE -#if !PLATFORM_POWER_DOWN_ONLY - &kbase_pm_demand_always_powered_policy_ops, - &kbase_pm_fast_start_policy_ops, -#endif /* !PLATFORM_POWER_DOWN_ONLY */ + &kbase_pm_always_on_demand_policy_ops, #endif + &kbase_pm_always_on_policy_ops #endif /* CONFIG_MALI_NO_MALI */ }; -/* The number of policies available in the system. - * This is derived from the number of functions listed in policy_get_functions. +/* A filtered list of policies available in the system, calculated by filtering + * all_policy_list based on the flags provided by each policy. */ -#define POLICY_COUNT (sizeof(policy_list)/sizeof(*policy_list)) - - -/* Function IDs for looking up Timeline Trace codes in - * kbase_pm_change_state_trace_code */ -enum kbase_pm_func_id { - KBASE_PM_FUNC_ID_REQUEST_CORES_START, - KBASE_PM_FUNC_ID_REQUEST_CORES_END, - KBASE_PM_FUNC_ID_RELEASE_CORES_START, - KBASE_PM_FUNC_ID_RELEASE_CORES_END, - /* Note: kbase_pm_unrequest_cores() is on the slow path, and we neither - * expect to hit it nor tend to hit it very much anyway. We can detect - * whether we need more instrumentation by a difference between - * PM_CHECKTRANS events and PM_SEND/HANDLE_EVENT. 
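
Likewise, a hypothetical sketch of how the protected-mode overrides declared above could be sequenced; this is not the driver's actual protected-mode entry code, and the coherency/mode-switch step is a placeholder:

static void example_enter_protected_mode(struct kbase_device *kbdev)
{
        unsigned long flags;

        spin_lock_irqsave(&kbdev->hwaccess_lock, flags);
        kbase_pm_protected_override_enable(kbdev);      /* request all shaders off */
        spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags);

        kbase_pm_wait_for_desired_state(kbdev);         /* wait for them to power down */

        /* Some GPUs need the L2 powered while coherency is reprogrammed. */
        kbase_pm_protected_l2_override(kbdev, true);
        /* ... change coherency settings / switch into protected mode ... */
        kbase_pm_protected_l2_override(kbdev, false);

        spin_lock_irqsave(&kbdev->hwaccess_lock, flags);
        kbase_pm_protected_override_disable(kbdev);
        spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags);
}
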
*/ - - /* Must be the last */ - KBASE_PM_FUNC_ID_COUNT -}; - - -/* State changes during request/unrequest/release-ing cores */ -enum { - KBASE_PM_CHANGE_STATE_SHADER = (1u << 0), - KBASE_PM_CHANGE_STATE_TILER = (1u << 1), - - /* These two must be last */ - KBASE_PM_CHANGE_STATE_MASK = (KBASE_PM_CHANGE_STATE_TILER | - KBASE_PM_CHANGE_STATE_SHADER), - KBASE_PM_CHANGE_STATE_COUNT = KBASE_PM_CHANGE_STATE_MASK + 1 -}; -typedef u32 kbase_pm_change_state; +static const struct kbase_pm_policy *enabled_policy_list[ARRAY_SIZE(all_policy_list)]; +static size_t enabled_policy_count; -/** - * kbasep_pm_do_poweroff_cores - Process a poweroff request and power down any - * requested shader cores - * @kbdev: Device pointer - */ -static void kbasep_pm_do_poweroff_cores(struct kbase_device *kbdev) +static void generate_filtered_policy_list(void) { - u64 prev_shader_state = kbdev->pm.backend.desired_shader_state; - u64 prev_tiler_state = kbdev->pm.backend.desired_tiler_state; - - lockdep_assert_held(&kbdev->hwaccess_lock); - - kbdev->pm.backend.desired_shader_state &= - ~kbdev->pm.backend.shader_poweroff_pending; - kbdev->pm.backend.desired_tiler_state &= - ~kbdev->pm.backend.tiler_poweroff_pending; - - kbdev->pm.backend.shader_poweroff_pending = 0; - kbdev->pm.backend.tiler_poweroff_pending = 0; - - if (prev_shader_state != kbdev->pm.backend.desired_shader_state || - prev_tiler_state != - kbdev->pm.backend.desired_tiler_state || - kbdev->pm.backend.ca_in_transition) { - bool cores_are_available; - - cores_are_available = kbase_pm_check_transitions_nolock(kbdev); - - /* Don't need 'cores_are_available', - * because we don't return anything */ - CSTD_UNUSED(cores_are_available); - } -} - -static enum hrtimer_restart -kbasep_pm_do_gpu_poweroff_callback(struct hrtimer *timer) -{ - struct kbase_device *kbdev; - unsigned long flags; - - kbdev = container_of(timer, struct kbase_device, - pm.backend.gpu_poweroff_timer); - - spin_lock_irqsave(&kbdev->hwaccess_lock, flags); - - /* It is safe for this call to do nothing if the work item is already - * queued. The worker function will read the must up-to-date state of - * kbdev->pm.backend.gpu_poweroff_pending under lock. - * - * If a state change occurs while the worker function is processing, - * this call will succeed as a work item can be requeued once it has - * started processing. 
- */ - if (kbdev->pm.backend.gpu_poweroff_pending) - queue_work(kbdev->pm.backend.gpu_poweroff_wq, - &kbdev->pm.backend.gpu_poweroff_work); - - if (kbdev->pm.backend.shader_poweroff_pending || - kbdev->pm.backend.tiler_poweroff_pending) { - kbdev->pm.backend.shader_poweroff_pending_time--; - - KBASE_DEBUG_ASSERT( - kbdev->pm.backend.shader_poweroff_pending_time - >= 0); - - if (!kbdev->pm.backend.shader_poweroff_pending_time) - kbasep_pm_do_poweroff_cores(kbdev); - } + size_t i; - if (kbdev->pm.backend.poweroff_timer_needed) { - spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + for (i = 0; i < ARRAY_SIZE(all_policy_list); ++i) { + const struct kbase_pm_policy *pol = all_policy_list[i]; - hrtimer_add_expires(timer, kbdev->pm.gpu_poweroff_time); + if (platform_power_down_only && + (pol->flags & KBASE_PM_POLICY_FLAG_DISABLED_WITH_POWER_DOWN_ONLY)) + continue; - return HRTIMER_RESTART; + enabled_policy_list[enabled_policy_count++] = pol; } - - kbdev->pm.backend.poweroff_timer_running = false; - spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); - - return HRTIMER_NORESTART; -} - -static void kbasep_pm_do_gpu_poweroff_wq(struct work_struct *data) -{ - unsigned long flags; - struct kbase_device *kbdev; - bool do_poweroff = false; - - kbdev = container_of(data, struct kbase_device, - pm.backend.gpu_poweroff_work); - - mutex_lock(&kbdev->pm.lock); - - if (kbdev->pm.backend.gpu_poweroff_pending == 0) { - mutex_unlock(&kbdev->pm.lock); - return; - } - - kbdev->pm.backend.gpu_poweroff_pending--; - - if (kbdev->pm.backend.gpu_poweroff_pending > 0) { - mutex_unlock(&kbdev->pm.lock); - return; - } - - KBASE_DEBUG_ASSERT(kbdev->pm.backend.gpu_poweroff_pending == 0); - - spin_lock_irqsave(&kbdev->hwaccess_lock, flags); - - /* Only power off the GPU if a request is still pending */ - if (!kbdev->pm.backend.pm_current_policy->get_core_active(kbdev)) - do_poweroff = true; - - spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); - - if (do_poweroff) { - kbdev->pm.backend.poweroff_timer_needed = false; - hrtimer_cancel(&kbdev->pm.backend.gpu_poweroff_timer); - kbdev->pm.backend.poweroff_timer_running = false; - - /* Power off the GPU */ - kbase_pm_do_poweroff(kbdev, false); - } - - mutex_unlock(&kbdev->pm.lock); } int kbase_pm_policy_init(struct kbase_device *kbdev) { - struct workqueue_struct *wq; - - wq = alloc_workqueue("kbase_pm_do_poweroff", - WQ_HIGHPRI | WQ_UNBOUND, 1); - if (!wq) - return -ENOMEM; - - kbdev->pm.backend.gpu_poweroff_wq = wq; - INIT_WORK(&kbdev->pm.backend.gpu_poweroff_work, - kbasep_pm_do_gpu_poweroff_wq); - hrtimer_init(&kbdev->pm.backend.gpu_poweroff_timer, - CLOCK_MONOTONIC, HRTIMER_MODE_REL); - kbdev->pm.backend.gpu_poweroff_timer.function = - kbasep_pm_do_gpu_poweroff_callback; - kbdev->pm.backend.pm_current_policy = policy_list[0]; + generate_filtered_policy_list(); + if (enabled_policy_count == 0) + return -EINVAL; + + kbdev->pm.backend.pm_current_policy = enabled_policy_list[0]; kbdev->pm.backend.pm_current_policy->init(kbdev); - kbdev->pm.gpu_poweroff_time = - HR_TIMER_DELAY_NSEC(DEFAULT_PM_GPU_POWEROFF_TICK_NS); - kbdev->pm.poweroff_shader_ticks = DEFAULT_PM_POWEROFF_TICK_SHADER; - kbdev->pm.poweroff_gpu_ticks = DEFAULT_PM_POWEROFF_TICK_GPU; return 0; } @@ -246,29 +82,6 @@ int kbase_pm_policy_init(struct kbase_device *kbdev) void kbase_pm_policy_term(struct kbase_device *kbdev) { kbdev->pm.backend.pm_current_policy->term(kbdev); - destroy_workqueue(kbdev->pm.backend.gpu_poweroff_wq); -} - -void kbase_pm_cancel_deferred_poweroff(struct kbase_device *kbdev) -{ - 
unsigned long flags; - - lockdep_assert_held(&kbdev->pm.lock); - - kbdev->pm.backend.poweroff_timer_needed = false; - hrtimer_cancel(&kbdev->pm.backend.gpu_poweroff_timer); - spin_lock_irqsave(&kbdev->hwaccess_lock, flags); - kbdev->pm.backend.poweroff_timer_running = false; - - /* If wq is already running but is held off by pm.lock, make sure it has - * no effect */ - kbdev->pm.backend.gpu_poweroff_pending = 0; - - kbdev->pm.backend.shader_poweroff_pending = 0; - kbdev->pm.backend.tiler_poweroff_pending = 0; - kbdev->pm.backend.shader_poweroff_pending_time = 0; - - spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); } void kbase_pm_update_active(struct kbase_device *kbdev) @@ -291,35 +104,24 @@ void kbase_pm_update_active(struct kbase_device *kbdev) kbdev->pm.backend.pm_current_policy->name); if (active) { - if (backend->gpu_poweroff_pending) { - /* Cancel any pending power off request */ - backend->gpu_poweroff_pending = 0; - - /* If a request was pending then the GPU was still - * powered, so no need to continue */ - if (!kbdev->poweroff_pending) { - spin_unlock_irqrestore(&kbdev->hwaccess_lock, - flags); - return; - } - } - - if (!backend->poweroff_timer_running && !backend->gpu_powered && - (pm->poweroff_gpu_ticks || - pm->poweroff_shader_ticks)) { - backend->poweroff_timer_needed = true; - backend->poweroff_timer_running = true; - hrtimer_start(&backend->gpu_poweroff_timer, - pm->gpu_poweroff_time, - HRTIMER_MODE_REL); - } - /* Power on the GPU and any cores requested by the policy */ - if (pm->backend.poweroff_wait_in_progress) { + if (!pm->backend.invoke_poweroff_wait_wq_when_l2_off && + pm->backend.poweroff_wait_in_progress) { KBASE_DEBUG_ASSERT(kbdev->pm.backend.gpu_powered); pm->backend.poweron_required = true; spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); } else { + /* Cancel the the invocation of + * kbase_pm_gpu_poweroff_wait_wq() from the L2 state + * machine. This is safe - it + * invoke_poweroff_wait_wq_when_l2_off is true, then + * the poweroff work hasn't even been queued yet, + * meaning we can go straight to powering on. + */ + pm->backend.invoke_poweroff_wait_wq_when_l2_off = false; + pm->backend.poweroff_wait_in_progress = false; + pm->backend.l2_desired = true; + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); kbase_pm_do_poweron(kbdev, false); } @@ -328,89 +130,21 @@ void kbase_pm_update_active(struct kbase_device *kbdev) * when there are contexts active */ KBASE_DEBUG_ASSERT(pm->active_count == 0); - if (backend->shader_poweroff_pending || - backend->tiler_poweroff_pending) { - backend->shader_poweroff_pending = 0; - backend->tiler_poweroff_pending = 0; - backend->shader_poweroff_pending_time = 0; - } - /* Request power off */ if (pm->backend.gpu_powered) { - if (pm->poweroff_gpu_ticks) { - backend->gpu_poweroff_pending = - pm->poweroff_gpu_ticks; - backend->poweroff_timer_needed = true; - if (!backend->poweroff_timer_running) { - /* Start timer if not running (eg if - * power policy has been changed from - * always_on to something else). 
This - * will ensure the GPU is actually - * powered off */ - backend->poweroff_timer_running - = true; - hrtimer_start( - &backend->gpu_poweroff_timer, - pm->gpu_poweroff_time, - HRTIMER_MODE_REL); - } - spin_unlock_irqrestore(&kbdev->hwaccess_lock, - flags); - } else { - spin_unlock_irqrestore(&kbdev->hwaccess_lock, - flags); - - /* Power off the GPU immediately */ - kbase_pm_do_poweroff(kbdev, false); - } + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + + /* Power off the GPU immediately */ + kbase_pm_do_poweroff(kbdev, false); } else { spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); } } } -/** - * get_desired_shader_bitmap - Get the desired shader bitmap, based on the - * current power policy - * - * @kbdev: The kbase device structure for the device - * - * Queries the current power policy to determine if shader cores will be - * required in the current state, and apply any HW workarounds. - * - * Return: bitmap of desired shader cores - */ - -static u64 get_desired_shader_bitmap(struct kbase_device *kbdev) -{ - u64 desired_bitmap = 0u; - - lockdep_assert_held(&kbdev->hwaccess_lock); - - if (kbdev->pm.backend.pm_current_policy->shaders_needed(kbdev)) - desired_bitmap = kbase_pm_ca_get_core_mask(kbdev); - - WARN(!desired_bitmap && kbdev->shader_needed_cnt, - "Shader cores are needed but policy '%s' did not make them needed", - kbdev->pm.backend.pm_current_policy->name); - - if (!kbase_hw_has_feature(kbdev, BASE_HW_FEATURE_XAFFINITY)) { - /* Unless XAFFINITY is supported, enable core 0 if tiler - * required, regardless of core availability - */ - if (kbdev->tiler_needed_cnt > 0) - desired_bitmap |= 1; - } - - return desired_bitmap; -} - void kbase_pm_update_cores_state_nolock(struct kbase_device *kbdev) { - u64 desired_bitmap; - u64 desired_tiler_bitmap; - bool cores_are_available; - bool do_poweroff = false; + bool shaders_desired; lockdep_assert_held(&kbdev->hwaccess_lock); @@ -419,105 +153,20 @@ void kbase_pm_update_cores_state_nolock(struct kbase_device *kbdev) if (kbdev->pm.backend.poweroff_wait_in_progress) return; - if (kbdev->protected_mode_transition && !kbdev->shader_needed_cnt && - !kbdev->tiler_needed_cnt) { + if (kbdev->pm.backend.protected_transition_override) /* We are trying to change in/out of protected mode - force all * cores off so that the L2 powers down */ - desired_bitmap = 0; - desired_tiler_bitmap = 0; - } else { - desired_bitmap = get_desired_shader_bitmap(kbdev); - - if (kbdev->tiler_needed_cnt > 0) - desired_tiler_bitmap = 1; - else - desired_tiler_bitmap = 0; - } + shaders_desired = false; + else + shaders_desired = kbdev->pm.backend.pm_current_policy->shaders_needed(kbdev); - if (kbdev->pm.backend.desired_shader_state != desired_bitmap) + if (kbdev->pm.backend.shaders_desired != shaders_desired) { KBASE_TRACE_ADD(kbdev, PM_CORES_CHANGE_DESIRED, NULL, NULL, 0u, - (u32)desired_bitmap); - /* Are any cores being powered on? 
*/ - if (~kbdev->pm.backend.desired_shader_state & desired_bitmap || - ~kbdev->pm.backend.desired_tiler_state & desired_tiler_bitmap || - kbdev->pm.backend.ca_in_transition) { - /* Check if we are powering off any cores before updating shader - * state */ - if (kbdev->pm.backend.desired_shader_state & ~desired_bitmap || - kbdev->pm.backend.desired_tiler_state & - ~desired_tiler_bitmap) { - /* Start timer to power off cores */ - kbdev->pm.backend.shader_poweroff_pending |= - (kbdev->pm.backend.desired_shader_state & - ~desired_bitmap); - kbdev->pm.backend.tiler_poweroff_pending |= - (kbdev->pm.backend.desired_tiler_state & - ~desired_tiler_bitmap); - - if (kbdev->pm.poweroff_shader_ticks && - !kbdev->protected_mode_transition) - kbdev->pm.backend.shader_poweroff_pending_time = - kbdev->pm.poweroff_shader_ticks; - else - do_poweroff = true; - } - - kbdev->pm.backend.desired_shader_state = desired_bitmap; - kbdev->pm.backend.desired_tiler_state = desired_tiler_bitmap; - - /* If any cores are being powered on, transition immediately */ - cores_are_available = kbase_pm_check_transitions_nolock(kbdev); - } else if (kbdev->pm.backend.desired_shader_state & ~desired_bitmap || - kbdev->pm.backend.desired_tiler_state & - ~desired_tiler_bitmap) { - /* Start timer to power off cores */ - kbdev->pm.backend.shader_poweroff_pending |= - (kbdev->pm.backend.desired_shader_state & - ~desired_bitmap); - kbdev->pm.backend.tiler_poweroff_pending |= - (kbdev->pm.backend.desired_tiler_state & - ~desired_tiler_bitmap); - if (kbdev->pm.poweroff_shader_ticks && - !kbdev->protected_mode_transition) - kbdev->pm.backend.shader_poweroff_pending_time = - kbdev->pm.poweroff_shader_ticks; - else - kbasep_pm_do_poweroff_cores(kbdev); - } else if (kbdev->pm.active_count == 0 && desired_bitmap != 0 && - desired_tiler_bitmap != 0 && - kbdev->pm.backend.poweroff_timer_needed) { - /* If power policy is keeping cores on despite there being no - * active contexts then disable poweroff timer as it isn't - * required. 
- * Only reset poweroff_timer_needed if we're not in the middle - * of the power off callback */ - kbdev->pm.backend.poweroff_timer_needed = false; - } + (u32)kbdev->pm.backend.shaders_desired); - /* Ensure timer does not power off wanted cores and make sure to power - * off unwanted cores */ - if (kbdev->pm.backend.shader_poweroff_pending || - kbdev->pm.backend.tiler_poweroff_pending) { - kbdev->pm.backend.shader_poweroff_pending &= - ~(kbdev->pm.backend.desired_shader_state & - desired_bitmap); - kbdev->pm.backend.tiler_poweroff_pending &= - ~(kbdev->pm.backend.desired_tiler_state & - desired_tiler_bitmap); - - if (!kbdev->pm.backend.shader_poweroff_pending && - !kbdev->pm.backend.tiler_poweroff_pending) - kbdev->pm.backend.shader_poweroff_pending_time = 0; + kbdev->pm.backend.shaders_desired = shaders_desired; + kbase_pm_update_state(kbdev); } - - /* Shader poweroff is deferred to the end of the function, to eliminate - * issues caused by the core availability policy recursing into this - * function */ - if (do_poweroff) - kbasep_pm_do_poweroff_cores(kbdev); - - /* Don't need 'cores_are_available', because we don't return anything */ - CSTD_UNUSED(cores_are_available); } void kbase_pm_update_cores_state(struct kbase_device *kbdev) @@ -533,12 +182,11 @@ void kbase_pm_update_cores_state(struct kbase_device *kbdev) int kbase_pm_list_policies(const struct kbase_pm_policy * const **list) { - if (!list) - return POLICY_COUNT; - - *list = policy_list; + WARN_ON(enabled_policy_count == 0); + if (list) + *list = enabled_policy_list; - return POLICY_COUNT; + return enabled_policy_count; } KBASE_EXPORT_TEST_API(kbase_pm_list_policies); @@ -607,171 +255,3 @@ void kbase_pm_set_policy(struct kbase_device *kbdev, } KBASE_EXPORT_TEST_API(kbase_pm_set_policy); - -void kbase_pm_request_cores(struct kbase_device *kbdev, - bool tiler_required, bool shader_required) -{ - kbase_pm_change_state change_gpu_state = 0u; - - KBASE_DEBUG_ASSERT(kbdev != NULL); - - lockdep_assert_held(&kbdev->hwaccess_lock); - - if (shader_required) { - int cnt = ++kbdev->shader_needed_cnt; - - if (cnt == 1) - change_gpu_state |= KBASE_PM_CHANGE_STATE_SHADER; - - KBASE_DEBUG_ASSERT(kbdev->shader_needed_cnt != 0); - } - - if (tiler_required) { - int cnt = ++kbdev->tiler_needed_cnt; - - if (cnt == 1) - change_gpu_state |= KBASE_PM_CHANGE_STATE_TILER; - - KBASE_DEBUG_ASSERT(kbdev->tiler_needed_cnt != 0); - } - - if (change_gpu_state) { - KBASE_TRACE_ADD(kbdev, PM_REQUEST_CHANGE_SHADER_NEEDED, NULL, - NULL, 0u, kbdev->shader_needed_cnt); - KBASE_TRACE_ADD(kbdev, PM_REQUEST_CHANGE_TILER_NEEDED, NULL, - NULL, 0u, kbdev->tiler_needed_cnt); - - kbase_pm_update_cores_state_nolock(kbdev); - } -} - -KBASE_EXPORT_TEST_API(kbase_pm_request_cores); - -void kbase_pm_release_cores(struct kbase_device *kbdev, - bool tiler_required, bool shader_required) -{ - kbase_pm_change_state change_gpu_state = 0u; - - KBASE_DEBUG_ASSERT(kbdev != NULL); - - lockdep_assert_held(&kbdev->hwaccess_lock); - - if (shader_required) { - int cnt; - - KBASE_DEBUG_ASSERT(kbdev->shader_needed_cnt > 0); - - cnt = --kbdev->shader_needed_cnt; - - if (0 == cnt) { - change_gpu_state |= KBASE_PM_CHANGE_STATE_SHADER; - } - } - - if (tiler_required) { - int cnt; - - KBASE_DEBUG_ASSERT(kbdev->tiler_needed_cnt > 0); - - cnt = --kbdev->tiler_needed_cnt; - - if (0 == cnt) - change_gpu_state |= KBASE_PM_CHANGE_STATE_TILER; - } - - if (change_gpu_state) { - KBASE_TRACE_ADD(kbdev, PM_RELEASE_CHANGE_SHADER_NEEDED, NULL, - NULL, 0u, kbdev->shader_needed_cnt); - KBASE_TRACE_ADD(kbdev, 
PM_RELEASE_CHANGE_TILER_NEEDED, NULL, - NULL, 0u, kbdev->tiler_needed_cnt); - - kbase_pm_update_cores_state_nolock(kbdev); - } -} - -KBASE_EXPORT_TEST_API(kbase_pm_release_cores); - -void kbase_pm_request_cores_sync(struct kbase_device *kbdev, - bool tiler_required, bool shader_required) -{ - unsigned long flags; - - kbase_pm_wait_for_poweroff_complete(kbdev); - - spin_lock_irqsave(&kbdev->hwaccess_lock, flags); - kbase_pm_request_cores(kbdev, tiler_required, shader_required); - spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); - - kbase_pm_check_transitions_sync(kbdev); -} - -KBASE_EXPORT_TEST_API(kbase_pm_request_cores_sync); - -static void kbase_pm_l2_caches_ref(struct kbase_device *kbdev) -{ - lockdep_assert_held(&kbdev->hwaccess_lock); - - kbdev->l2_users_count++; - - KBASE_DEBUG_ASSERT(kbdev->l2_users_count != 0); - - /* Check for the required L2 transitions. - * Caller would block here for the L2 caches of all core groups to be - * powered on, so need to inform the Hw to power up all the L2 caches. - * Can't rely on the l2_users_count value being non-zero previously to - * avoid checking for the transition, as the count could be non-zero - * even if not all the instances of L2 cache are powered up since - * currently the power status of L2 is not tracked separately for each - * core group. Also if the GPU is reset while the L2 is on, L2 will be - * off but the count will be non-zero. - */ - kbase_pm_check_transitions_nolock(kbdev); -} - -void kbase_pm_request_l2_caches(struct kbase_device *kbdev) -{ - unsigned long flags; - - spin_lock_irqsave(&kbdev->hwaccess_lock, flags); - - /* Take the reference on l2_users_count and check core transitions. - */ - kbase_pm_l2_caches_ref(kbdev); - - spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); - - wait_event(kbdev->pm.backend.l2_powered_wait, - kbdev->pm.backend.l2_powered == 1); -} - -KBASE_EXPORT_TEST_API(kbase_pm_request_l2_caches); - -void kbase_pm_request_l2_caches_nolock(struct kbase_device *kbdev) -{ - /* Take the reference on l2_users_count and check core transitions. - */ - kbase_pm_l2_caches_ref(kbdev); -} - -void kbase_pm_request_l2_caches_l2_is_on(struct kbase_device *kbdev) -{ - lockdep_assert_held(&kbdev->hwaccess_lock); - - kbdev->l2_users_count++; -} - -KBASE_EXPORT_TEST_API(kbase_pm_request_l2_caches_l2_is_on); - -void kbase_pm_release_l2_caches(struct kbase_device *kbdev) -{ - lockdep_assert_held(&kbdev->hwaccess_lock); - - KBASE_DEBUG_ASSERT(kbdev->l2_users_count > 0); - - --kbdev->l2_users_count; - - if (!kbdev->l2_users_count) - kbase_pm_check_transitions_nolock(kbdev); -} - -KBASE_EXPORT_TEST_API(kbase_pm_release_l2_caches); diff --git a/mali_kbase/backend/gpu/mali_kbase_pm_policy.h b/mali_kbase/backend/gpu/mali_kbase_pm_policy.h index 2e86929..28d258f 100644 --- a/mali_kbase/backend/gpu/mali_kbase_pm_policy.h +++ b/mali_kbase/backend/gpu/mali_kbase_pm_policy.h @@ -64,61 +64,10 @@ void kbase_pm_update_active(struct kbase_device *kbdev); */ void kbase_pm_update_cores(struct kbase_device *kbdev); - -enum kbase_pm_cores_ready { - KBASE_CORES_NOT_READY = 0, - KBASE_NEW_AFFINITY = 1, - KBASE_CORES_READY = 2 -}; - - -/** - * kbase_pm_request_cores - Request the desired cores to be powered up. - * @kbdev: Kbase device - * @tiler_required: true if tiler is required - * @shader_required: true if shaders are required - * - * Called by the scheduler to request power to the desired cores. - * - * There is no guarantee that the HW will be powered up on return. 
Use - * kbase_pm_cores_requested()/kbase_pm_cores_ready() to verify that cores are - * now powered, or instead call kbase_pm_request_cores_sync(). - */ -void kbase_pm_request_cores(struct kbase_device *kbdev, bool tiler_required, - bool shader_required); - -/** - * kbase_pm_request_cores_sync - Synchronous variant of kbase_pm_request_cores() - * @kbdev: Kbase device - * @tiler_required: true if tiler is required - * @shader_required: true if shaders are required - * - * When this function returns, the @shader_cores will be in the READY state. - * - * This is safe variant of kbase_pm_check_transitions_sync(): it handles the - * work of ensuring the requested cores will remain powered until a matching - * call to kbase_pm_unrequest_cores()/kbase_pm_release_cores() (as appropriate) - * is made. - */ -void kbase_pm_request_cores_sync(struct kbase_device *kbdev, - bool tiler_required, bool shader_required); - -/** - * kbase_pm_release_cores - Request the desired cores to be powered down. - * @kbdev: Kbase device - * @tiler_required: true if tiler is required - * @shader_required: true if shaders are required - * - * Called by the scheduler to release its power reference on the desired cores. - */ -void kbase_pm_release_cores(struct kbase_device *kbdev, bool tiler_required, - bool shader_required); - /** * kbase_pm_cores_requested - Check that a power request has been locked into * the HW. * @kbdev: Kbase device - * @tiler_required: true if tiler is required * @shader_required: true if shaders are required * * Called by the scheduler to check if a power on request has been locked into @@ -136,112 +85,23 @@ void kbase_pm_release_cores(struct kbase_device *kbdev, bool tiler_required, * request is still pending. */ static inline bool kbase_pm_cores_requested(struct kbase_device *kbdev, - bool tiler_required, bool shader_required) + bool shader_required) { lockdep_assert_held(&kbdev->hwaccess_lock); - if ((shader_required && !kbdev->shader_available_bitmap) || - (tiler_required && !kbdev->tiler_available_bitmap)) + /* If the L2 & tiler are not on or pending, then the tiler is not yet + * available, and shaders are definitely not powered. + */ + if (kbdev->pm.backend.l2_state != KBASE_L2_PEND_ON && + kbdev->pm.backend.l2_state != KBASE_L2_ON) return false; - return true; -} - -/** - * kbase_pm_cores_ready - Check that the required cores have been powered on by - * the HW. - * @kbdev: Kbase device - * @tiler_required: true if tiler is required - * @shader_required: true if shaders are required - * - * Called by the scheduler to check if cores are ready. - * - * Note that the caller should ensure that they have first requested cores - * before calling this function. - * - * Caller must hold the hwaccess_lock. - * - * Return: true if the cores are ready. - */ -static inline bool kbase_pm_cores_ready(struct kbase_device *kbdev, - bool tiler_required, bool shader_required) -{ - lockdep_assert_held(&kbdev->hwaccess_lock); - - if ((shader_required && !kbdev->shader_ready_bitmap) || - (tiler_required && !kbdev->tiler_available_bitmap)) + if (shader_required && + kbdev->pm.backend.shaders_state != KBASE_SHADERS_PEND_ON_CORESTACK_ON && + kbdev->pm.backend.shaders_state != KBASE_SHADERS_ON_CORESTACK_ON) return false; return true; } -/** - * kbase_pm_request_l2_caches - Request l2 caches - * - * @kbdev: The kbase device structure for the device (must be a valid pointer) - * - * Request the use of l2 caches for all core groups, power up, wait and prevent - * the power manager from powering down the l2 caches. 
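For illustration, the reworked kbase_pm_cores_requested() above can be read as a pure predicate over the new L2 and shader state machines. The standalone C sketch below models only that decision with stand-in enum values; the KBASE_L2_* / KBASE_SHADERS_* names and the surrounding driver structures are simplified here, so this is an illustrative model rather than driver code:

#include <stdbool.h>
#include <stdio.h>

/* Stand-in state values modelled on the kbase L2/shader state machines. */
enum l2_state { L2_OFF, L2_PEND_ON, L2_ON };
enum shaders_state {
	SHADERS_OFF_CORESTACK_OFF,
	SHADERS_PEND_ON_CORESTACK_ON,
	SHADERS_ON_CORESTACK_ON
};

struct pm_backend {
	enum l2_state l2_state;
	enum shaders_state shaders_state;
};

/* Mirrors kbase_pm_cores_requested(): a request is "locked in" once the L2
 * (and therefore the tiler) is on or pending on, and, when shaders were
 * asked for, once the shader cores are on or pending on as well.
 */
static bool cores_requested(const struct pm_backend *pm, bool shader_required)
{
	if (pm->l2_state != L2_PEND_ON && pm->l2_state != L2_ON)
		return false;

	if (shader_required &&
	    pm->shaders_state != SHADERS_PEND_ON_CORESTACK_ON &&
	    pm->shaders_state != SHADERS_ON_CORESTACK_ON)
		return false;

	return true;
}

int main(void)
{
	struct pm_backend pm = { L2_PEND_ON, SHADERS_OFF_CORESTACK_OFF };

	printf("tiler-only request locked in: %d\n", cores_requested(&pm, false));
	printf("shader request locked in:     %d\n", cores_requested(&pm, true));
	return 0;
}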
- * - * This tells the power management that the caches should be powered up, and - * they should remain powered, irrespective of the usage of shader cores. This - * does not return until the l2 caches are powered up. - * - * The caller must call kbase_pm_release_l2_caches() when they are finished - * to allow normal power management of the l2 caches to resume. - * - * This should only be used when power management is active. - */ -void kbase_pm_request_l2_caches(struct kbase_device *kbdev); - -/** - * kbase_pm_request_l2_caches_nolock - Request l2 caches, nolock version - * - * @kbdev: The kbase device structure for the device (must be a valid pointer) - * - * Request the use of l2 caches for all core groups and power up without - * waiting for power manager to actually power up the cores. This is done - * because the call to this function is done from within the atomic context - * and the actual l2 caches being powered up is checked at a later stage. - * The reference taken on l2 caches is removed when the protected mode atom - * is released so there is no need to make a call to a matching - * release_l2_caches(). - * - * This function is used specifically for the case when l2 caches are - * to be powered up as part of the sequence for entering protected mode. - * - * This should only be used when power management is active. - */ -void kbase_pm_request_l2_caches_nolock(struct kbase_device *kbdev); - -/** - * kbase_pm_request_l2_caches_l2_is_on - Request l2 caches but don't power on - * - * @kbdev: The kbase device structure for the device (must be a valid pointer) - * - * Increment the count of l2 users but do not attempt to power on the l2 - * - * It is the callers responsibility to ensure that the l2 is already powered up - * and to eventually call kbase_pm_release_l2_caches() - */ -void kbase_pm_request_l2_caches_l2_is_on(struct kbase_device *kbdev); - -/** - * kbase_pm_release_l2_caches - Release l2 caches - * - * @kbdev: The kbase device structure for the device (must be a valid pointer) - * - * Release the use of l2 caches for all core groups and allow the power manager - * to power them down when necessary. - * - * This tells the power management that the caches can be powered down if - * necessary, with respect to the usage of shader cores. - * - * The caller must have called kbase_pm_request_l2_caches() prior to a call - * to this. - * - * This should only be used when power management is active. - */ -void kbase_pm_release_l2_caches(struct kbase_device *kbdev); - #endif /* _KBASE_PM_POLICY_H_ */ diff --git a/mali_kbase/backend/gpu/mali_kbase_time.h b/mali_kbase/backend/gpu/mali_kbase_time.h deleted file mode 100644 index ece7009..0000000 --- a/mali_kbase/backend/gpu/mali_kbase_time.h +++ /dev/null @@ -1,57 +0,0 @@ -/* - * - * (C) COPYRIGHT 2014-2015,2018 ARM Limited. All rights reserved. - * - * This program is free software and is provided to you under the terms of the - * GNU General Public License version 2 as published by the Free Software - * Foundation, and any use by you of this program is subject to the terms - * of such GNU licence. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. 
- * - * SPDX-License-Identifier: GPL-2.0 - * - */ - -#ifndef _KBASE_BACKEND_TIME_H_ -#define _KBASE_BACKEND_TIME_H_ - -/** - * kbase_backend_get_gpu_time() - Get current GPU time - * @kbdev: Device pointer - * @cycle_counter: Pointer to u64 to store cycle counter in - * @system_time: Pointer to u64 to store system time in - * @ts: Pointer to struct timespec to store current monotonic - * time in - */ -void kbase_backend_get_gpu_time(struct kbase_device *kbdev, u64 *cycle_counter, - u64 *system_time, struct timespec *ts); - -/** - * kbase_wait_write_flush() - Wait for GPU write flush - * @kbdev: Kbase device - * - * Wait 1000 GPU clock cycles. This delay is known to give the GPU time to flush - * its write buffer. - * - * If GPU resets occur then the counters are reset to zero, the delay may not be - * as expected. - * - * This function is only in use for BASE_HW_ISSUE_6367 - */ -#ifdef CONFIG_MALI_NO_MALI -static inline void kbase_wait_write_flush(struct kbase_device *kbdev) -{ -} -#else -void kbase_wait_write_flush(struct kbase_device *kbdev); -#endif - -#endif /* _KBASE_BACKEND_TIME_H_ */ diff --git a/mali_kbase/build.bp b/mali_kbase/build.bp index ba3a25c..2cf685c 100644 --- a/mali_kbase/build.bp +++ b/mali_kbase/build.bp @@ -19,9 +19,6 @@ bob_defaults { no_mali: { kbuild_options: ["CONFIG_MALI_NO_MALI=y"], }, - mali_corestack: { - kbuild_options: ["CONFIG_MALI_CORESTACK=y"], - }, mali_devfreq: { kbuild_options: ["CONFIG_MALI_DEVFREQ=y"], }, @@ -84,8 +81,15 @@ bob_kernel_module { "CONFIG_MALI_MIDGARD=m", "CONFIG_MALI_NO_MALI_DEFAULT_GPU={{.gpu}}", "CONFIG_MALI_PLATFORM_NAME={{.mali_platform_name}}", - "MALI_MOCK_TEST={{.mali_mock_test}}", ], + mali_fpga_bus_logger: { + extra_symbols: [ + "bus_logger", + ], + }, + mali_corestack: { + kbuild_options: ["CONFIG_MALI_CORESTACK=y"], + }, mali_error_inject: { kbuild_options: ["CONFIG_MALI_ERROR_INJECT=y"], }, @@ -98,9 +102,6 @@ bob_kernel_module { mali_2mb_alloc: { kbuild_options: ["CONFIG_MALI_2MB_ALLOC=y"], }, - mali_mock_test: { - srcs: ["tests/internal/src/mock/mali_kbase_pm_driver_mock.c"], - }, gpu_has_csf: { srcs: [ "csf/*.c", diff --git a/mali_kbase/ipa/mali_kbase_ipa.c b/mali_kbase/ipa/mali_kbase_ipa.c index 520f8fc..9da2878 100644 --- a/mali_kbase/ipa/mali_kbase_ipa.c +++ b/mali_kbase/ipa/mali_kbase_ipa.c @@ -38,16 +38,15 @@ #endif #define KBASE_IPA_FALLBACK_MODEL_NAME "mali-simple-power-model" -#define KBASE_IPA_G71_MODEL_NAME "mali-g71-power-model" -#define KBASE_IPA_G72_MODEL_NAME "mali-g72-power-model" -#define KBASE_IPA_TNOX_MODEL_NAME "mali-tnox-power-model" -#define KBASE_IPA_TGOX_R1_MODEL_NAME "mali-tgox_r1-power-model" -static struct kbase_ipa_model_ops *kbase_ipa_all_model_ops[] = { +static const struct kbase_ipa_model_ops *kbase_ipa_all_model_ops[] = { &kbase_simple_ipa_model_ops, &kbase_g71_ipa_model_ops, &kbase_g72_ipa_model_ops, - &kbase_tnox_ipa_model_ops + &kbase_g76_ipa_model_ops, + &kbase_g52_ipa_model_ops, + &kbase_g52_r1_ipa_model_ops, + &kbase_g51_ipa_model_ops }; int kbase_ipa_model_recalculate(struct kbase_ipa_model *model) @@ -68,13 +67,13 @@ int kbase_ipa_model_recalculate(struct kbase_ipa_model *model) return err; } -static struct kbase_ipa_model_ops *kbase_ipa_model_ops_find(struct kbase_device *kbdev, +const struct kbase_ipa_model_ops *kbase_ipa_model_ops_find(struct kbase_device *kbdev, const char *name) { int i; for (i = 0; i < ARRAY_SIZE(kbase_ipa_all_model_ops); ++i) { - struct kbase_ipa_model_ops *ops = kbase_ipa_all_model_ops[i]; + const struct kbase_ipa_model_ops *ops = kbase_ipa_all_model_ops[i]; 
if (!strcmp(ops->name, name)) return ops; @@ -84,6 +83,7 @@ static struct kbase_ipa_model_ops *kbase_ipa_model_ops_find(struct kbase_device return NULL; } +KBASE_EXPORT_TEST_API(kbase_ipa_model_ops_find); const char *kbase_ipa_model_name_from_id(u32 gpu_id) { @@ -93,18 +93,20 @@ const char *kbase_ipa_model_name_from_id(u32 gpu_id) if (GPU_ID_IS_NEW_FORMAT(prod_id)) { switch (GPU_ID2_MODEL_MATCH_VALUE(prod_id)) { case GPU_ID2_PRODUCT_TMIX: - return KBASE_IPA_G71_MODEL_NAME; + return "mali-g71-power-model"; case GPU_ID2_PRODUCT_THEX: - return KBASE_IPA_G72_MODEL_NAME; + return "mali-g72-power-model"; case GPU_ID2_PRODUCT_TNOX: - return KBASE_IPA_TNOX_MODEL_NAME; + return "mali-g76-power-model"; + case GPU_ID2_PRODUCT_TSIX: + return "mali-g51-power-model"; case GPU_ID2_PRODUCT_TGOX: if ((gpu_id & GPU_ID2_VERSION_MAJOR) == (0 << GPU_ID2_VERSION_MAJOR_SHIFT)) - /* TGOX r0 shares a power model with TNOX */ - return KBASE_IPA_TNOX_MODEL_NAME; + /* g52 aliased to g76 power-model's ops */ + return "mali-g52-power-model"; else - return KBASE_IPA_TGOX_R1_MODEL_NAME; + return "mali-g52_r1-power-model"; default: return KBASE_IPA_FALLBACK_MODEL_NAME; } @@ -112,6 +114,7 @@ const char *kbase_ipa_model_name_from_id(u32 gpu_id) return KBASE_IPA_FALLBACK_MODEL_NAME; } +KBASE_EXPORT_TEST_API(kbase_ipa_model_name_from_id); static struct device_node *get_model_dt_node(struct kbase_ipa_model *model) { @@ -244,7 +247,7 @@ void kbase_ipa_term_model(struct kbase_ipa_model *model) KBASE_EXPORT_TEST_API(kbase_ipa_term_model); struct kbase_ipa_model *kbase_ipa_init_model(struct kbase_device *kbdev, - struct kbase_ipa_model_ops *ops) + const struct kbase_ipa_model_ops *ops) { struct kbase_ipa_model *model; int err; @@ -298,7 +301,7 @@ int kbase_ipa_init(struct kbase_device *kbdev) { const char *model_name; - struct kbase_ipa_model_ops *ops; + const struct kbase_ipa_model_ops *ops; struct kbase_ipa_model *default_model = NULL; int err; @@ -371,6 +374,8 @@ void kbase_ipa_term(struct kbase_device *kbdev) mutex_lock(&kbdev->ipa.lock); kbase_ipa_term_locked(kbdev); mutex_unlock(&kbdev->ipa.lock); + + mutex_destroy(&kbdev->ipa.lock); } KBASE_EXPORT_TEST_API(kbase_ipa_term); @@ -517,6 +522,9 @@ static unsigned long kbase_get_static_power(unsigned long voltage) struct kbase_device *kbdev = kbase_find_device(-1); #endif + if (!kbdev) + return 0ul; + mutex_lock(&kbdev->ipa.lock); model = get_current_model(kbdev); @@ -552,6 +560,9 @@ static unsigned long kbase_get_dynamic_power(unsigned long freq, struct kbase_device *kbdev = kbase_find_device(-1); #endif + if (!kbdev) + return 0ul; + mutex_lock(&kbdev->ipa.lock); model = kbdev->ipa.fallback_model; @@ -627,6 +638,9 @@ int kbase_get_real_power(struct devfreq *df, u32 *power, int ret; struct kbase_device *kbdev = dev_get_drvdata(&df->dev); + if (!kbdev) + return -ENODEV; + mutex_lock(&kbdev->ipa.lock); ret = kbase_get_real_power_locked(kbdev, power, freq, voltage); mutex_unlock(&kbdev->ipa.lock); diff --git a/mali_kbase/ipa/mali_kbase_ipa.h b/mali_kbase/ipa/mali_kbase_ipa.h index 4656ded..7462048 100644 --- a/mali_kbase/ipa/mali_kbase_ipa.h +++ b/mali_kbase/ipa/mali_kbase_ipa.h @@ -40,7 +40,7 @@ struct devfreq; struct kbase_ipa_model { struct kbase_device *kbdev; void *model_data; - struct kbase_ipa_model_ops *ops; + const struct kbase_ipa_model_ops *ops; struct list_head params; bool missing_dt_node_warning; }; @@ -154,6 +154,25 @@ void kbase_ipa_term(struct kbase_device *kbdev); int kbase_ipa_model_recalculate(struct kbase_ipa_model *model); /** + * kbase_ipa_model_ops_find - 
Lookup an IPA model using its name + * @kbdev: pointer to kbase device + * @name: name of model to lookup + * + * Return: Pointer to model's 'ops' structure, or NULL if the lookup failed. + */ +const struct kbase_ipa_model_ops *kbase_ipa_model_ops_find(struct kbase_device *kbdev, + const char *name); + +/** + * kbase_ipa_model_name_from_id - Find the best model for a given GPU ID + * @gpu_id: GPU ID of GPU the model will be used for + * + * Return: The name of the appropriate counter-based model, or the name of the + * fallback model if no counter model exists. + */ +const char *kbase_ipa_model_name_from_id(u32 gpu_id); + +/** * kbase_ipa_init_model - Initilaize the particular IPA model * @kbdev: pointer to kbase device * @ops: pointer to object containing model specific methods. @@ -164,7 +183,7 @@ int kbase_ipa_model_recalculate(struct kbase_ipa_model *model); * Return: pointer to kbase_ipa_model on success, NULL on error */ struct kbase_ipa_model *kbase_ipa_init_model(struct kbase_device *kbdev, - struct kbase_ipa_model_ops *ops); + const struct kbase_ipa_model_ops *ops); /** * kbase_ipa_term_model - Terminate the particular IPA model * @model: pointer to the IPA model object, already initialized @@ -183,10 +202,12 @@ void kbase_ipa_term_model(struct kbase_ipa_model *model); */ void kbase_ipa_protection_mode_switch_event(struct kbase_device *kbdev); -extern struct kbase_ipa_model_ops kbase_g71_ipa_model_ops; -extern struct kbase_ipa_model_ops kbase_g72_ipa_model_ops; -extern struct kbase_ipa_model_ops kbase_tnox_ipa_model_ops; -extern struct kbase_ipa_model_ops kbase_tgox_r1_ipa_model_ops; +extern const struct kbase_ipa_model_ops kbase_g71_ipa_model_ops; +extern const struct kbase_ipa_model_ops kbase_g72_ipa_model_ops; +extern const struct kbase_ipa_model_ops kbase_g76_ipa_model_ops; +extern const struct kbase_ipa_model_ops kbase_g52_ipa_model_ops; +extern const struct kbase_ipa_model_ops kbase_g52_r1_ipa_model_ops; +extern const struct kbase_ipa_model_ops kbase_g51_ipa_model_ops; /** * kbase_get_real_power() - get the real power consumption of the GPU diff --git a/mali_kbase/ipa/mali_kbase_ipa_simple.c b/mali_kbase/ipa/mali_kbase_ipa_simple.c index e684df4..c8399ab 100644 --- a/mali_kbase/ipa/mali_kbase_ipa_simple.c +++ b/mali_kbase/ipa/mali_kbase_ipa_simple.c @@ -268,8 +268,9 @@ static int kbase_simple_power_model_init(struct kbase_ipa_model *model) (void *) model_data, "mali-simple-power-model-temp-poll"); if (IS_ERR(model_data->poll_temperature_thread)) { + err = PTR_ERR(model_data->poll_temperature_thread); kfree(model_data); - return PTR_ERR(model_data->poll_temperature_thread); + return err; } err = add_params(model); diff --git a/mali_kbase/ipa/mali_kbase_ipa_vinstr_common.c b/mali_kbase/ipa/mali_kbase_ipa_vinstr_common.c index 69c3230..1a6ba01 100644 --- a/mali_kbase/ipa/mali_kbase_ipa_vinstr_common.c +++ b/mali_kbase/ipa/mali_kbase_ipa_vinstr_common.c @@ -44,7 +44,7 @@ static inline u32 kbase_ipa_read_hwcnt( struct kbase_ipa_model_vinstr_data *model_data, u32 offset) { - u8 *p = model_data->vinstr_buffer; + u8 *p = (u8 *)model_data->dump_buf.dump_buf; return *(u32 *)&p[offset]; } @@ -118,125 +118,69 @@ s64 kbase_ipa_single_counter( return counter_value * (s64) coeff; } -#ifndef CONFIG_MALI_NO_MALI -/** - * kbase_ipa_gpu_active - Inform IPA that GPU is now active - * @model_data: Pointer to model data - * - * This function may cause vinstr to become active. 
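For illustration, IPA model selection after this patch is a two-step lookup: kbase_ipa_model_name_from_id() maps the GPU ID to a model name string, and kbase_ipa_model_ops_find() resolves that name against the registered ops table, with unknown parts falling back to the simple model. The self-contained sketch below mirrors that flow; the registry contents, the ID values and the init stub are placeholders, not the driver's real tables:

#include <stdio.h>
#include <string.h>

struct ipa_model_ops {
	const char *name;
	int (*init)(void);   /* placeholder for the real init/term/coeff hooks */
};

static int dummy_init(void) { return 0; }

/* Illustrative registry, analogous to kbase_ipa_all_model_ops[]. */
static const struct ipa_model_ops all_ops[] = {
	{ "mali-simple-power-model", dummy_init },
	{ "mali-g71-power-model",    dummy_init },
	{ "mali-g76-power-model",    dummy_init },
};

/* Analogous to kbase_ipa_model_ops_find(): match by name, NULL if unknown. */
static const struct ipa_model_ops *ops_find(const char *name)
{
	size_t i;

	for (i = 0; i < sizeof(all_ops) / sizeof(all_ops[0]); ++i)
		if (!strcmp(all_ops[i].name, name))
			return &all_ops[i];
	return NULL;
}

/* Analogous to kbase_ipa_model_name_from_id(): unknown IDs fall back to the
 * simple model. The ID values used here are invented for the example.
 */
static const char *model_name_from_id(unsigned int product_id)
{
	switch (product_id) {
	case 0x6000: return "mali-g71-power-model";
	case 0x7001: return "mali-g76-power-model";
	default:     return "mali-simple-power-model";
	}
}

int main(void)
{
	const char *name = model_name_from_id(0x7001);
	const struct ipa_model_ops *ops = ops_find(name);

	printf("selected %s (%sfound)\n", name, ops ? "" : "not ");
	return 0;
}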
- */ -static void kbase_ipa_gpu_active(struct kbase_ipa_model_vinstr_data *model_data) +int kbase_ipa_attach_vinstr(struct kbase_ipa_model_vinstr_data *model_data) { + int errcode; struct kbase_device *kbdev = model_data->kbdev; + struct kbase_hwcnt_virtualizer *hvirt = kbdev->hwcnt_gpu_virt; + struct kbase_hwcnt_enable_map enable_map; + const struct kbase_hwcnt_metadata *metadata = + kbase_hwcnt_virtualizer_metadata(hvirt); - lockdep_assert_held(&kbdev->pm.lock); + if (!metadata) + return -1; - if (!kbdev->ipa.vinstr_active) { - kbdev->ipa.vinstr_active = true; - kbase_vinstr_resume_client(model_data->vinstr_cli); + errcode = kbase_hwcnt_enable_map_alloc(metadata, &enable_map); + if (errcode) { + dev_err(kbdev->dev, "Failed to allocate IPA enable map"); + return errcode; } -} -/** - * kbase_ipa_gpu_idle - Inform IPA that GPU is now idle - * @model_data: Pointer to model data - * - * This function may cause vinstr to become idle. - */ -static void kbase_ipa_gpu_idle(struct kbase_ipa_model_vinstr_data *model_data) -{ - struct kbase_device *kbdev = model_data->kbdev; + kbase_hwcnt_enable_map_enable_all(&enable_map); - lockdep_assert_held(&kbdev->pm.lock); - - if (kbdev->ipa.vinstr_active) { - kbase_vinstr_suspend_client(model_data->vinstr_cli); - kbdev->ipa.vinstr_active = false; + errcode = kbase_hwcnt_virtualizer_client_create( + hvirt, &enable_map, &model_data->hvirt_cli); + kbase_hwcnt_enable_map_free(&enable_map); + if (errcode) { + dev_err(kbdev->dev, "Failed to register IPA with virtualizer"); + model_data->hvirt_cli = NULL; + return errcode; } -} -#endif - -int kbase_ipa_attach_vinstr(struct kbase_ipa_model_vinstr_data *model_data) -{ - struct kbase_device *kbdev = model_data->kbdev; - struct kbase_ioctl_hwcnt_reader_setup setup; - size_t dump_size; - dump_size = kbase_vinstr_dump_size(kbdev); - model_data->vinstr_buffer = kzalloc(dump_size, GFP_KERNEL); - if (!model_data->vinstr_buffer) { + errcode = kbase_hwcnt_dump_buffer_alloc( + metadata, &model_data->dump_buf); + if (errcode) { dev_err(kbdev->dev, "Failed to allocate IPA dump buffer"); - return -1; - } - - setup.jm_bm = ~0u; - setup.shader_bm = ~0u; - setup.tiler_bm = ~0u; - setup.mmu_l2_bm = ~0u; - model_data->vinstr_cli = kbase_vinstr_hwcnt_kernel_setup(kbdev->vinstr_ctx, - &setup, model_data->vinstr_buffer); - if (!model_data->vinstr_cli) { - dev_err(kbdev->dev, "Failed to register IPA with vinstr core"); - kfree(model_data->vinstr_buffer); - model_data->vinstr_buffer = NULL; - return -1; + kbase_hwcnt_virtualizer_client_destroy(model_data->hvirt_cli); + model_data->hvirt_cli = NULL; + return errcode; } - kbase_vinstr_hwc_clear(model_data->vinstr_cli); - -#ifndef CONFIG_MALI_NO_MALI - kbdev->ipa.gpu_active_callback = kbase_ipa_gpu_active; - kbdev->ipa.gpu_idle_callback = kbase_ipa_gpu_idle; - kbdev->ipa.model_data = model_data; - kbdev->ipa.vinstr_active = false; - /* Suspend vinstr, to ensure that the GPU is powered off until there is - * something to execute. 
- */ - kbase_vinstr_suspend_client(model_data->vinstr_cli); -#else - kbdev->ipa.gpu_active_callback = NULL; - kbdev->ipa.gpu_idle_callback = NULL; - kbdev->ipa.vinstr_active = true; -#endif - return 0; } void kbase_ipa_detach_vinstr(struct kbase_ipa_model_vinstr_data *model_data) { - struct kbase_device *kbdev = model_data->kbdev; - - kbdev->ipa.gpu_active_callback = NULL; - kbdev->ipa.gpu_idle_callback = NULL; - kbdev->ipa.model_data = NULL; - kbdev->ipa.vinstr_active = false; - - if (model_data->vinstr_cli) - kbase_vinstr_detach_client(model_data->vinstr_cli); - - model_data->vinstr_cli = NULL; - kfree(model_data->vinstr_buffer); - model_data->vinstr_buffer = NULL; + if (model_data->hvirt_cli) { + kbase_hwcnt_virtualizer_client_destroy(model_data->hvirt_cli); + kbase_hwcnt_dump_buffer_free(&model_data->dump_buf); + model_data->hvirt_cli = NULL; + } } int kbase_ipa_vinstr_dynamic_coeff(struct kbase_ipa_model *model, u32 *coeffp) { struct kbase_ipa_model_vinstr_data *model_data = (struct kbase_ipa_model_vinstr_data *)model->model_data; - struct kbase_device *kbdev = model_data->kbdev; s64 energy = 0; size_t i; u64 coeff = 0, coeff_mul = 0; + u64 start_ts_ns, end_ts_ns; u32 active_cycles; int err = 0; - if (!kbdev->ipa.vinstr_active) { - err = -ENODATA; - goto err0; /* GPU powered off - no counters to collect */ - } - - err = kbase_vinstr_hwc_dump(model_data->vinstr_cli, - BASE_HWCNT_READER_EVENT_MANUAL); + err = kbase_hwcnt_virtualizer_client_dump(model_data->hvirt_cli, + &start_ts_ns, &end_ts_ns, &model_data->dump_buf); if (err) goto err0; diff --git a/mali_kbase/ipa/mali_kbase_ipa_vinstr_common.h b/mali_kbase/ipa/mali_kbase_ipa_vinstr_common.h index 0deafae..46e3cd4 100644 --- a/mali_kbase/ipa/mali_kbase_ipa_vinstr_common.h +++ b/mali_kbase/ipa/mali_kbase_ipa_vinstr_common.h @@ -24,6 +24,8 @@ #define _KBASE_IPA_VINSTR_COMMON_H_ #include "mali_kbase.h" +#include "mali_kbase_hwcnt_virtualizer.h" +#include "mali_kbase_hwcnt_types.h" /* Maximum number of IPA groups for an IPA model. */ #define KBASE_IPA_MAX_GROUP_DEF_NUM 16 @@ -49,8 +51,8 @@ typedef u32 (*kbase_ipa_get_active_cycles_callback)(struct kbase_ipa_model_vinst * @groups_def_num: Number of elements in the array of IPA groups. * @get_active_cycles: Callback to return number of active cycles during * counter sample period - * @vinstr_cli: vinstr client handle - * @vinstr_buffer: buffer to dump hardware counters onto + * @hvirt_cli: hardware counter virtualizer client handle + * @dump_buf: buffer to dump hardware counters onto * @reference_voltage: voltage, in mV, of the operating point used when * deriving the power model coefficients. 
Range approx * 0.1V - 5V (~= 8V): 2^7 <= reference_voltage <= 2^13 @@ -72,8 +74,8 @@ struct kbase_ipa_model_vinstr_data { const struct kbase_ipa_group *groups_def; size_t groups_def_num; kbase_ipa_get_active_cycles_callback get_active_cycles; - struct kbase_vinstr_client *vinstr_cli; - void *vinstr_buffer; + struct kbase_hwcnt_virtualizer_client *hvirt_cli; + struct kbase_hwcnt_dump_buffer dump_buf; s32 reference_voltage; s32 scaling_factor; s32 min_sample_cycles; diff --git a/mali_kbase/ipa/mali_kbase_ipa_vinstr_g7x.c b/mali_kbase/ipa/mali_kbase_ipa_vinstr_g7x.c index 8366033..6365d2f 100644 --- a/mali_kbase/ipa/mali_kbase_ipa_vinstr_g7x.c +++ b/mali_kbase/ipa/mali_kbase_ipa_vinstr_g7x.c @@ -248,7 +248,7 @@ static const struct kbase_ipa_group ipa_groups_def_g72[] = { }, }; -static const struct kbase_ipa_group ipa_groups_def_tnox[] = { +static const struct kbase_ipa_group ipa_groups_def_g76[] = { { .name = "gpu_active", .default_value = 122000, @@ -281,7 +281,7 @@ static const struct kbase_ipa_group ipa_groups_def_tnox[] = { }, }; -static const struct kbase_ipa_group ipa_groups_def_tgox_r1[] = { +static const struct kbase_ipa_group ipa_groups_def_g52_r1[] = { { .name = "gpu_active", .default_value = 224200, @@ -314,6 +314,48 @@ static const struct kbase_ipa_group ipa_groups_def_tgox_r1[] = { }, }; +static const struct kbase_ipa_group ipa_groups_def_g51[] = { + { + .name = "gpu_active", + .default_value = 201400, + .op = kbase_g7x_jm_single_counter, + .counter_block_offset = JM_GPU_ACTIVE, + }, + { + .name = "exec_instr_count", + .default_value = 392700, + .op = kbase_g7x_sum_all_shader_cores, + .counter_block_offset = SC_EXEC_INSTR_COUNT, + }, + { + .name = "vary_instr", + .default_value = 274000, + .op = kbase_g7x_sum_all_shader_cores, + .counter_block_offset = SC_VARY_INSTR, + }, + { + .name = "tex_tfch_num_operations", + .default_value = 528000, + .op = kbase_g7x_sum_all_shader_cores, + .counter_block_offset = SC_TEX_TFCH_NUM_OPERATIONS, + }, + { + .name = "l2_access", + .default_value = 506400, + .op = kbase_g7x_sum_all_memsys_blocks, + .counter_block_offset = MEMSYS_L2_ANY_LOOKUP, + }, +}; + +#define IPA_POWER_MODEL_OPS(gpu, init_token) \ + const struct kbase_ipa_model_ops kbase_ ## gpu ## _ipa_model_ops = { \ + .name = "mali-" #gpu "-power-model", \ + .init = kbase_ ## init_token ## _power_model_init, \ + .term = kbase_ipa_vinstr_common_model_term, \ + .get_dynamic_coeff = kbase_ipa_vinstr_dynamic_coeff, \ + }; \ + KBASE_EXPORT_TEST_API(kbase_ ## gpu ## _ipa_model_ops) + #define STANDARD_POWER_MODEL(gpu, reference_voltage) \ static int kbase_ ## gpu ## _power_model_init(\ struct kbase_ipa_model *model) \ @@ -326,15 +368,16 @@ static const struct kbase_ipa_group ipa_groups_def_tgox_r1[] = { kbase_g7x_get_active_cycles, \ (reference_voltage)); \ } \ - struct kbase_ipa_model_ops kbase_ ## gpu ## _ipa_model_ops = { \ - .name = "mali-" #gpu "-power-model", \ - .init = kbase_ ## gpu ## _power_model_init, \ - .term = kbase_ipa_vinstr_common_model_term, \ - .get_dynamic_coeff = kbase_ipa_vinstr_dynamic_coeff, \ - }; \ - KBASE_EXPORT_TEST_API(kbase_ ## gpu ## _ipa_model_ops) + IPA_POWER_MODEL_OPS(gpu, gpu) + +#define ALIAS_POWER_MODEL(gpu, as_gpu) \ + IPA_POWER_MODEL_OPS(gpu, as_gpu) STANDARD_POWER_MODEL(g71, 800); STANDARD_POWER_MODEL(g72, 800); -STANDARD_POWER_MODEL(tnox, 800); -STANDARD_POWER_MODEL(tgox_r1, 1000); +STANDARD_POWER_MODEL(g76, 800); +STANDARD_POWER_MODEL(g52_r1, 1000); +STANDARD_POWER_MODEL(g51, 1000); + +/* g52 is an alias of g76 (TNOX) for IPA */ +ALIAS_POWER_MODEL(g52, 
g76); diff --git a/mali_kbase/mali_base_hwconfig_features.h b/mali_kbase/mali_base_hwconfig_features.h index 69c22f2..5571f84 100644 --- a/mali_kbase/mali_base_hwconfig_features.h +++ b/mali_kbase/mali_base_hwconfig_features.h @@ -59,6 +59,8 @@ enum base_hw_feature { BASE_HW_FEATURE_TLS_HASHING, BASE_HW_FEATURE_THREAD_GROUP_SPLIT, BASE_HW_FEATURE_3BIT_EXT_RW_L2_MMU_CONFIG, + BASE_HW_FEATURE_CLEAN_ONLY_SAFE, + BASE_HW_FEATURE_IDVS_GROUP_SIZE, BASE_HW_FEATURE_END }; @@ -203,6 +205,7 @@ static const enum base_hw_feature base_hw_features_tMIx[] = { BASE_HW_FEATURE_FLUSH_REDUCTION, BASE_HW_FEATURE_PROTECTED_MODE, BASE_HW_FEATURE_COHERENCY_REG, + BASE_HW_FEATURE_AARCH64_MMU, BASE_HW_FEATURE_END }; @@ -230,6 +233,7 @@ static const enum base_hw_feature base_hw_features_tHEx[] = { BASE_HW_FEATURE_PROTECTED_MODE, BASE_HW_FEATURE_PROTECTED_DEBUG_MODE, BASE_HW_FEATURE_COHERENCY_REG, + BASE_HW_FEATURE_AARCH64_MMU, BASE_HW_FEATURE_END }; @@ -257,6 +261,7 @@ static const enum base_hw_feature base_hw_features_tSIx[] = { BASE_HW_FEATURE_PROTECTED_MODE, BASE_HW_FEATURE_PROTECTED_DEBUG_MODE, BASE_HW_FEATURE_COHERENCY_REG, + BASE_HW_FEATURE_AARCH64_MMU, BASE_HW_FEATURE_END }; @@ -284,6 +289,7 @@ static const enum base_hw_feature base_hw_features_tDVx[] = { BASE_HW_FEATURE_PROTECTED_MODE, BASE_HW_FEATURE_PROTECTED_DEBUG_MODE, BASE_HW_FEATURE_COHERENCY_REG, + BASE_HW_FEATURE_AARCH64_MMU, BASE_HW_FEATURE_END }; @@ -314,6 +320,7 @@ static const enum base_hw_feature base_hw_features_tNOx[] = { BASE_HW_FEATURE_AARCH64_MMU, BASE_HW_FEATURE_TLS_HASHING, BASE_HW_FEATURE_3BIT_EXT_RW_L2_MMU_CONFIG, + BASE_HW_FEATURE_IDVS_GROUP_SIZE, BASE_HW_FEATURE_END }; @@ -344,6 +351,7 @@ static const enum base_hw_feature base_hw_features_tGOx[] = { BASE_HW_FEATURE_AARCH64_MMU, BASE_HW_FEATURE_TLS_HASHING, BASE_HW_FEATURE_3BIT_EXT_RW_L2_MMU_CONFIG, + BASE_HW_FEATURE_IDVS_GROUP_SIZE, BASE_HW_FEATURE_END }; @@ -398,6 +406,8 @@ static const enum base_hw_feature base_hw_features_tTRx[] = { BASE_HW_FEATURE_PROTECTED_DEBUG_MODE, BASE_HW_FEATURE_COHERENCY_REG, BASE_HW_FEATURE_AARCH64_MMU, + BASE_HW_FEATURE_CLEAN_ONLY_SAFE, + BASE_HW_FEATURE_IDVS_GROUP_SIZE, BASE_HW_FEATURE_END }; @@ -425,10 +435,12 @@ static const enum base_hw_feature base_hw_features_tNAx[] = { BASE_HW_FEATURE_PROTECTED_DEBUG_MODE, BASE_HW_FEATURE_COHERENCY_REG, BASE_HW_FEATURE_AARCH64_MMU, + BASE_HW_FEATURE_CLEAN_ONLY_SAFE, + BASE_HW_FEATURE_IDVS_GROUP_SIZE, BASE_HW_FEATURE_END }; -static const enum base_hw_feature base_hw_features_tULx[] = { +static const enum base_hw_feature base_hw_features_tBEx[] = { BASE_HW_FEATURE_JOBCHAIN_DISAMBIGUATION, BASE_HW_FEATURE_PWRON_DURING_PWROFF_TRANS, BASE_HW_FEATURE_XAFFINITY, @@ -452,10 +464,12 @@ static const enum base_hw_feature base_hw_features_tULx[] = { BASE_HW_FEATURE_PROTECTED_DEBUG_MODE, BASE_HW_FEATURE_COHERENCY_REG, BASE_HW_FEATURE_AARCH64_MMU, + BASE_HW_FEATURE_CLEAN_ONLY_SAFE, + BASE_HW_FEATURE_IDVS_GROUP_SIZE, BASE_HW_FEATURE_END }; -static const enum base_hw_feature base_hw_features_tDUx[] = { +static const enum base_hw_feature base_hw_features_tULx[] = { BASE_HW_FEATURE_JOBCHAIN_DISAMBIGUATION, BASE_HW_FEATURE_PWRON_DURING_PWROFF_TRANS, BASE_HW_FEATURE_XAFFINITY, @@ -479,6 +493,7 @@ static const enum base_hw_feature base_hw_features_tDUx[] = { BASE_HW_FEATURE_PROTECTED_DEBUG_MODE, BASE_HW_FEATURE_COHERENCY_REG, BASE_HW_FEATURE_AARCH64_MMU, + BASE_HW_FEATURE_IDVS_GROUP_SIZE, BASE_HW_FEATURE_END }; @@ -506,6 +521,7 @@ static const enum base_hw_feature base_hw_features_tBOx[] = { BASE_HW_FEATURE_PROTECTED_DEBUG_MODE, 
BASE_HW_FEATURE_COHERENCY_REG, BASE_HW_FEATURE_AARCH64_MMU, + BASE_HW_FEATURE_IDVS_GROUP_SIZE, BASE_HW_FEATURE_END }; @@ -533,6 +549,7 @@ static const enum base_hw_feature base_hw_features_tIDx[] = { BASE_HW_FEATURE_PROTECTED_DEBUG_MODE, BASE_HW_FEATURE_COHERENCY_REG, BASE_HW_FEATURE_AARCH64_MMU, + BASE_HW_FEATURE_IDVS_GROUP_SIZE, BASE_HW_FEATURE_END }; @@ -560,6 +577,7 @@ static const enum base_hw_feature base_hw_features_tVAx[] = { BASE_HW_FEATURE_PROTECTED_DEBUG_MODE, BASE_HW_FEATURE_COHERENCY_REG, BASE_HW_FEATURE_AARCH64_MMU, + BASE_HW_FEATURE_IDVS_GROUP_SIZE, BASE_HW_FEATURE_END }; diff --git a/mali_kbase/mali_base_hwconfig_issues.h b/mali_kbase/mali_base_hwconfig_issues.h index b8bd3d0..d7c40ef 100644 --- a/mali_kbase/mali_base_hwconfig_issues.h +++ b/mali_kbase/mali_base_hwconfig_issues.h @@ -1222,7 +1222,6 @@ static const enum base_hw_issue base_hw_issues_model_tKAx[] = { static const enum base_hw_issue base_hw_issues_tTRx_r0p0[] = { BASE_HW_ISSUE_9435, BASE_HW_ISSUE_TMIX_8133, - BASE_HW_ISSUE_TSIX_1116, BASE_HW_ISSUE_TSIX_2033, BASE_HW_ISSUE_TTRX_1337, BASE_HW_ISSUE_END @@ -1232,7 +1231,6 @@ static const enum base_hw_issue base_hw_issues_model_tTRx[] = { BASE_HW_ISSUE_5736, BASE_HW_ISSUE_9435, BASE_HW_ISSUE_TMIX_8133, - BASE_HW_ISSUE_TSIX_1116, BASE_HW_ISSUE_TSIX_2033, BASE_HW_ISSUE_TTRX_1337, BASE_HW_ISSUE_END @@ -1241,7 +1239,6 @@ static const enum base_hw_issue base_hw_issues_model_tTRx[] = { static const enum base_hw_issue base_hw_issues_tNAx_r0p0[] = { BASE_HW_ISSUE_9435, BASE_HW_ISSUE_TMIX_8133, - BASE_HW_ISSUE_TSIX_1116, BASE_HW_ISSUE_TSIX_2033, BASE_HW_ISSUE_TTRX_1337, BASE_HW_ISSUE_END @@ -1251,45 +1248,40 @@ static const enum base_hw_issue base_hw_issues_model_tNAx[] = { BASE_HW_ISSUE_5736, BASE_HW_ISSUE_9435, BASE_HW_ISSUE_TMIX_8133, - BASE_HW_ISSUE_TSIX_1116, BASE_HW_ISSUE_TSIX_2033, BASE_HW_ISSUE_TTRX_1337, BASE_HW_ISSUE_END }; -static const enum base_hw_issue base_hw_issues_tULx_r0p0[] = { +static const enum base_hw_issue base_hw_issues_tBEx_r0p0[] = { BASE_HW_ISSUE_9435, BASE_HW_ISSUE_TMIX_8133, - BASE_HW_ISSUE_TSIX_1116, BASE_HW_ISSUE_TSIX_2033, BASE_HW_ISSUE_TTRX_1337, BASE_HW_ISSUE_END }; -static const enum base_hw_issue base_hw_issues_model_tULx[] = { +static const enum base_hw_issue base_hw_issues_model_tBEx[] = { BASE_HW_ISSUE_5736, BASE_HW_ISSUE_9435, BASE_HW_ISSUE_TMIX_8133, - BASE_HW_ISSUE_TSIX_1116, BASE_HW_ISSUE_TSIX_2033, BASE_HW_ISSUE_TTRX_1337, BASE_HW_ISSUE_END }; -static const enum base_hw_issue base_hw_issues_tDUx_r0p0[] = { +static const enum base_hw_issue base_hw_issues_tULx_r0p0[] = { BASE_HW_ISSUE_9435, BASE_HW_ISSUE_TMIX_8133, - BASE_HW_ISSUE_TSIX_1116, BASE_HW_ISSUE_TSIX_2033, BASE_HW_ISSUE_TTRX_1337, BASE_HW_ISSUE_END }; -static const enum base_hw_issue base_hw_issues_model_tDUx[] = { +static const enum base_hw_issue base_hw_issues_model_tULx[] = { BASE_HW_ISSUE_5736, BASE_HW_ISSUE_9435, BASE_HW_ISSUE_TMIX_8133, - BASE_HW_ISSUE_TSIX_1116, BASE_HW_ISSUE_TSIX_2033, BASE_HW_ISSUE_TTRX_1337, BASE_HW_ISSUE_END @@ -1317,7 +1309,6 @@ static const enum base_hw_issue base_hw_issues_model_tBOx[] = { static const enum base_hw_issue base_hw_issues_tIDx_r0p0[] = { BASE_HW_ISSUE_9435, BASE_HW_ISSUE_TMIX_8133, - BASE_HW_ISSUE_TSIX_1116, BASE_HW_ISSUE_TSIX_2033, BASE_HW_ISSUE_TTRX_1337, BASE_HW_ISSUE_END @@ -1327,7 +1318,6 @@ static const enum base_hw_issue base_hw_issues_model_tIDx[] = { BASE_HW_ISSUE_5736, BASE_HW_ISSUE_9435, BASE_HW_ISSUE_TMIX_8133, - BASE_HW_ISSUE_TSIX_1116, BASE_HW_ISSUE_TSIX_2033, BASE_HW_ISSUE_TTRX_1337, BASE_HW_ISSUE_END @@ -1336,7 
+1326,6 @@ static const enum base_hw_issue base_hw_issues_model_tIDx[] = { static const enum base_hw_issue base_hw_issues_tVAx_r0p0[] = { BASE_HW_ISSUE_9435, BASE_HW_ISSUE_TMIX_8133, - BASE_HW_ISSUE_TSIX_1116, BASE_HW_ISSUE_TSIX_2033, BASE_HW_ISSUE_TTRX_1337, BASE_HW_ISSUE_END @@ -1346,7 +1335,6 @@ static const enum base_hw_issue base_hw_issues_model_tVAx[] = { BASE_HW_ISSUE_5736, BASE_HW_ISSUE_9435, BASE_HW_ISSUE_TMIX_8133, - BASE_HW_ISSUE_TSIX_1116, BASE_HW_ISSUE_TSIX_2033, BASE_HW_ISSUE_TTRX_1337, BASE_HW_ISSUE_END diff --git a/mali_kbase/mali_base_kernel.h b/mali_kbase/mali_base_kernel.h index 297df8b..70dc3c5 100644 --- a/mali_kbase/mali_base_kernel.h +++ b/mali_kbase/mali_base_kernel.h @@ -348,15 +348,6 @@ struct base_mem_import_user_buffer { /** - * @brief Result codes of changing the size of the backing store allocated to a tmem region - */ -typedef enum base_backing_threshold_status { - BASE_BACKING_THRESHOLD_OK = 0, /**< Resize successful */ - BASE_BACKING_THRESHOLD_ERROR_OOM = -2, /**< Increase failed due to an out-of-memory condition */ - BASE_BACKING_THRESHOLD_ERROR_INVALID_ARGUMENTS = -4 /**< Invalid arguments (not tmem, illegal size request, etc.) */ -} base_backing_threshold_status; - -/** * @addtogroup base_user_api_memory_defered User-side Base Defered Memory Coherency APIs * @{ */ @@ -797,24 +788,6 @@ typedef u32 base_jd_core_req; ((core_req & BASE_JD_REQ_SOFT_JOB) || \ (core_req & BASE_JD_REQ_ATOM_TYPE) == BASE_JD_REQ_DEP) -/** - * enum kbase_atom_coreref_state - States to model state machine processed by - * kbasep_js_job_check_ref_cores(), which handles retaining cores for power - * management. - * - * @KBASE_ATOM_COREREF_STATE_NO_CORES_REQUESTED: Starting state: Cores must be - * requested. - * @KBASE_ATOM_COREREF_STATE_WAITING_FOR_REQUESTED_CORES: Cores requested, but - * waiting for them to be powered - * @KBASE_ATOM_COREREF_STATE_READY: Cores are powered, atom can be submitted to - * HW - */ -enum kbase_atom_coreref_state { - KBASE_ATOM_COREREF_STATE_NO_CORES_REQUESTED, - KBASE_ATOM_COREREF_STATE_WAITING_FOR_REQUESTED_CORES, - KBASE_ATOM_COREREF_STATE_READY -}; - /* * Base Atom priority * @@ -822,15 +795,16 @@ enum kbase_atom_coreref_state { * BASE_JD_PRIO_<...> definitions below. It is undefined to use a priority * level that is not one of those defined below. * - * Priority levels only affect scheduling between atoms of the same type within - * a base context, and only after the atoms have had dependencies resolved. - * Fragment atoms does not affect non-frament atoms with lower priorities, and - * the other way around. For example, a low priority atom that has had its - * dependencies resolved might run before a higher priority atom that has not - * had its dependencies resolved. + * Priority levels only affect scheduling after the atoms have had dependencies + * resolved. For example, a low priority atom that has had its dependencies + * resolved might run before a higher priority atom that has not had its + * dependencies resolved. * - * The scheduling between base contexts/processes and between atoms from - * different base contexts/processes is unaffected by atom priority. + * In general, fragment atoms do not affect non-fragment atoms with + * lower priorities, and vice versa. One exception is that there is only one + * priority value for each context. So a high-priority (e.g.) fragment atom + * could increase its context priority, causing its non-fragment atoms to also + * be scheduled sooner. 
* * The atoms are scheduled as follows with respect to their priorities: * - Let atoms 'X' and 'Y' be for the same job slot who have dependencies @@ -842,6 +816,14 @@ enum kbase_atom_coreref_state { * - Any two atoms that have the same priority could run in any order with * respect to each other. That is, there is no ordering constraint between * atoms of the same priority. + * + * The sysfs file 'js_ctx_scheduling_mode' is used to control how atoms are + * scheduled between contexts. The default value, 0, will cause higher-priority + * atoms to be scheduled first, regardless of their context. The value 1 will + * use a round-robin algorithm when deciding which context's atoms to schedule + * next, so higher-priority atoms can only preempt lower priority atoms within + * the same context. See KBASE_JS_SYSTEM_PRIORITY_MODE and + * KBASE_JS_PROCESS_LOCAL_PRIORITY_MODE for more details. */ typedef u8 base_jd_prio; diff --git a/mali_kbase/mali_base_vendor_specific_func.h b/mali_kbase/mali_base_vendor_specific_func.h deleted file mode 100644 index 5e8add8..0000000 --- a/mali_kbase/mali_base_vendor_specific_func.h +++ /dev/null @@ -1,29 +0,0 @@ -/* - * - * (C) COPYRIGHT 2010, 2012-2013, 2015 ARM Limited. All rights reserved. - * - * This program is free software and is provided to you under the terms of the - * GNU General Public License version 2 as published by the Free Software - * Foundation, and any use by you of this program is subject to the terms - * of such GNU licence. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * - * SPDX-License-Identifier: GPL-2.0 - * - */ - - -#ifndef _BASE_VENDOR_SPEC_FUNC_H_ -#define _BASE_VENDOR_SPEC_FUNC_H_ - -int kbase_get_vendor_specific_cpu_clock_speed(u32 * const); - -#endif /*_BASE_VENDOR_SPEC_FUNC_H_*/ diff --git a/mali_kbase/mali_kbase.h b/mali_kbase/mali_kbase.h index cdd9ecc..24a021d 100644 --- a/mali_kbase/mali_kbase.h +++ b/mali_kbase/mali_kbase.h @@ -68,7 +68,7 @@ #include "mali_kbase_jd_debugfs.h" #include "mali_kbase_gpuprops.h" #include "mali_kbase_jm.h" -#include "mali_kbase_vinstr.h" +#include "mali_kbase_ioctl.h" #include "ipa/mali_kbase_ipa.h" @@ -353,22 +353,13 @@ static inline bool kbase_pm_is_suspending(struct kbase_device *kbdev) * * @kbdev: The kbase device structure for the device (must be a valid pointer) * - * This takes into account the following - * - * - whether there is an active context reference - * - * - whether any of the shader cores or the tiler are needed - * - * It should generally be preferred against checking just - * kbdev->pm.active_count on its own, because some code paths drop their - * reference on this whilst still having the shader cores/tiler in use. + * This takes into account whether there is an active context reference. 
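The base_jd_prio documentation above mentions the js_ctx_scheduling_mode sysfs file for choosing between system-wide priority scheduling (0) and per-context round-robin scheduling (1). A minimal userspace write is sketched below; the sysfs path shown is an assumption and varies with how the Mali device node is exposed on a given platform:

#include <stdio.h>

int main(void)
{
	/* Hypothetical path; on other systems the kbase attributes may sit
	 * under the GPU's platform device directory instead.
	 */
	const char *path =
		"/sys/class/misc/mali0/device/js_ctx_scheduling_mode";
	FILE *f = fopen(path, "w");

	if (!f) {
		perror("js_ctx_scheduling_mode");
		return 1;
	}
	/* 1 selects the round-robin (process-local priority) mode. */
	if (fputs("1", f) < 0)
		perror("write");
	fclose(f);
	return 0;
}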
* * Return: true if the GPU is active, false otherwise */ static inline bool kbase_pm_is_active(struct kbase_device *kbdev) { - return (kbdev->pm.active_count > 0 || kbdev->shader_needed_cnt || - kbdev->tiler_needed_cnt); + return kbdev->pm.active_count > 0; } /** @@ -713,6 +704,3 @@ int kbase_io_history_resize(struct kbase_io_history *h, u16 new_size); #endif - - - diff --git a/mali_kbase/mali_kbase_config_defaults.h b/mali_kbase/mali_kbase_config_defaults.h index 9d918a8..bb2ab53 100644 --- a/mali_kbase/mali_kbase_config_defaults.h +++ b/mali_kbase/mali_kbase_config_defaults.h @@ -171,11 +171,6 @@ enum { #define DEFAULT_PM_POWEROFF_TICK_SHADER (2) /* 400-800us */ /** - * Power Manager number of ticks before GPU is powered off - */ -#define DEFAULT_PM_POWEROFF_TICK_GPU (2) /* 400-800us */ - -/** * Default scheduling tick granuality */ #define DEFAULT_JS_SCHEDULING_PERIOD_NS (100000000u) /* 100ms */ diff --git a/mali_kbase/mali_kbase_context.c b/mali_kbase/mali_kbase_context.c index 628f89a..59609d7 100644 --- a/mali_kbase/mali_kbase_context.c +++ b/mali_kbase/mali_kbase_context.c @@ -149,7 +149,7 @@ kbase_create_context(struct kbase_device *kbdev, bool is_compat) kctx->id = atomic_add_return(1, &(kbdev->ctx_num)) - 1; - mutex_init(&kctx->vinstr_cli_lock); + mutex_init(&kctx->legacy_hwcnt_lock); kbase_timer_setup(&kctx->soft_job_timeout, kbasep_soft_job_timeout_worker); @@ -325,9 +325,6 @@ int kbase_context_set_create_flags(struct kbase_context *kctx, u32 flags) if ((flags & BASE_CONTEXT_SYSTEM_MONITOR_SUBMIT_DISABLED) == 0) kbase_ctx_flag_clear(kctx, KCTX_SUBMIT_DISABLED); - /* Latch the initial attributes into the Job Scheduler */ - kbasep_js_ctx_attr_set_initial_attrs(kctx->kbdev, kctx); - spin_unlock_irqrestore(&kctx->kbdev->hwaccess_lock, irq_flags); mutex_unlock(&js_kctx_info->ctx.jsctx_mutex); out: diff --git a/mali_kbase/mali_kbase_core_linux.c b/mali_kbase/mali_kbase_core_linux.c index d101d97..382285f 100644 --- a/mali_kbase/mali_kbase_core_linux.c +++ b/mali_kbase/mali_kbase_core_linux.c @@ -48,7 +48,12 @@ #include <mali_kbase_hwaccess_jm.h> #include <mali_kbase_ctx_sched.h> #include <backend/gpu/mali_kbase_device_internal.h> +#include <backend/gpu/mali_kbase_pm_internal.h> #include "mali_kbase_ioctl.h" +#include "mali_kbase_hwcnt_context.h" +#include "mali_kbase_hwcnt_virtualizer.h" +#include "mali_kbase_hwcnt_legacy.h" +#include "mali_kbase_vinstr.h" #ifdef CONFIG_MALI_CINSTR_GWT #include "mali_kbase_gwt.h" @@ -161,22 +166,25 @@ enum { #endif /* CONFIG_MALI_DEVFREQ */ inited_tlstream = (1u << 4), inited_backend_early = (1u << 5), - inited_backend_late = (1u << 6), - inited_device = (1u << 7), - inited_vinstr = (1u << 8), - inited_job_fault = (1u << 10), - inited_sysfs_group = (1u << 11), - inited_misc_register = (1u << 12), - inited_get_device = (1u << 13), - inited_dev_list = (1u << 14), - inited_debugfs = (1u << 15), - inited_gpu_device = (1u << 16), - inited_registers_map = (1u << 17), - inited_io_history = (1u << 18), - inited_power_control = (1u << 19), - inited_buslogger = (1u << 20), - inited_protected = (1u << 21), - inited_ctx_sched = (1u << 22) + inited_hwcnt_gpu_iface = (1u << 6), + inited_hwcnt_gpu_ctx = (1u << 7), + inited_hwcnt_gpu_virt = (1u << 8), + inited_vinstr = (1u << 9), + inited_backend_late = (1u << 10), + inited_device = (1u << 11), + inited_job_fault = (1u << 13), + inited_sysfs_group = (1u << 14), + inited_misc_register = (1u << 15), + inited_get_device = (1u << 16), + inited_dev_list = (1u << 17), + inited_debugfs = (1u << 18), + inited_gpu_device = 
(1u << 19), + inited_registers_map = (1u << 20), + inited_io_history = (1u << 21), + inited_power_control = (1u << 22), + inited_buslogger = (1u << 23), + inited_protected = (1u << 24), + inited_ctx_sched = (1u << 25) }; static struct kbase_device *to_kbase_device(struct device *dev) @@ -494,17 +502,13 @@ static int kbase_release(struct inode *inode, struct file *filp) filp->private_data = NULL; - mutex_lock(&kctx->vinstr_cli_lock); + mutex_lock(&kctx->legacy_hwcnt_lock); /* If this client was performing hwcnt dumping and did not explicitly - * detach itself, remove it from the vinstr core now */ - if (kctx->vinstr_cli) { - struct kbase_ioctl_hwcnt_enable enable; - - enable.dump_buffer = 0llu; - kbase_vinstr_legacy_hwc_setup( - kbdev->vinstr_ctx, &kctx->vinstr_cli, &enable); - } - mutex_unlock(&kctx->vinstr_cli_lock); + * detach itself, destroy it now + */ + kbase_hwcnt_legacy_client_destroy(kctx->legacy_hwcnt_cli); + kctx->legacy_hwcnt_cli = NULL; + mutex_unlock(&kctx->legacy_hwcnt_lock); kbase_destroy_context(kctx); @@ -592,10 +596,15 @@ static int kbase_api_mem_alloc(struct kbase_context *kctx, if (flags & BASE_MEM_FLAGS_KERNEL_ONLY) return -ENOMEM; + /* Force SAME_VA if a 64-bit client. + * The only exception is GPU-executable memory if an EXEC_VA zone + * has been initialized. In that case, GPU-executable memory may + * or may not be SAME_VA. + */ if ((!kbase_ctx_flag(kctx, KCTX_COMPAT)) && kbase_ctx_flag(kctx, KCTX_FORCE_SAME_VA)) { - /* force SAME_VA if a 64-bit client */ - flags |= BASE_MEM_SAME_VA; + if (!(flags & BASE_MEM_PROT_GPU_EX) || !kbase_has_exec_va_zone(kctx)) + flags |= BASE_MEM_SAME_VA; } @@ -629,13 +638,7 @@ static int kbase_api_mem_free(struct kbase_context *kctx, static int kbase_api_hwcnt_reader_setup(struct kbase_context *kctx, struct kbase_ioctl_hwcnt_reader_setup *setup) { - int ret; - - mutex_lock(&kctx->vinstr_cli_lock); - ret = kbase_vinstr_hwcnt_reader_setup(kctx->kbdev->vinstr_ctx, setup); - mutex_unlock(&kctx->vinstr_cli_lock); - - return ret; + return kbase_vinstr_hwcnt_reader_setup(kctx->kbdev->vinstr_ctx, setup); } static int kbase_api_hwcnt_enable(struct kbase_context *kctx, @@ -643,10 +646,31 @@ static int kbase_api_hwcnt_enable(struct kbase_context *kctx, { int ret; - mutex_lock(&kctx->vinstr_cli_lock); - ret = kbase_vinstr_legacy_hwc_setup(kctx->kbdev->vinstr_ctx, - &kctx->vinstr_cli, enable); - mutex_unlock(&kctx->vinstr_cli_lock); + mutex_lock(&kctx->legacy_hwcnt_lock); + if (enable->dump_buffer != 0) { + /* Non-zero dump buffer, so user wants to create the client */ + if (kctx->legacy_hwcnt_cli == NULL) { + ret = kbase_hwcnt_legacy_client_create( + kctx->kbdev->hwcnt_gpu_virt, + enable, + &kctx->legacy_hwcnt_cli); + } else { + /* This context already has a client */ + ret = -EBUSY; + } + } else { + /* Zero dump buffer, so user wants to destroy the client */ + if (kctx->legacy_hwcnt_cli != NULL) { + kbase_hwcnt_legacy_client_destroy( + kctx->legacy_hwcnt_cli); + kctx->legacy_hwcnt_cli = NULL; + ret = 0; + } else { + /* This context has no client to destroy */ + ret = -EINVAL; + } + } + mutex_unlock(&kctx->legacy_hwcnt_lock); return ret; } @@ -655,10 +679,9 @@ static int kbase_api_hwcnt_dump(struct kbase_context *kctx) { int ret; - mutex_lock(&kctx->vinstr_cli_lock); - ret = kbase_vinstr_hwc_dump(kctx->vinstr_cli, - BASE_HWCNT_READER_EVENT_MANUAL); - mutex_unlock(&kctx->vinstr_cli_lock); + mutex_lock(&kctx->legacy_hwcnt_lock); + ret = kbase_hwcnt_legacy_client_dump(kctx->legacy_hwcnt_cli); + mutex_unlock(&kctx->legacy_hwcnt_lock); return ret; } @@ 
-667,9 +690,9 @@ static int kbase_api_hwcnt_clear(struct kbase_context *kctx) { int ret; - mutex_lock(&kctx->vinstr_cli_lock); - ret = kbase_vinstr_hwc_clear(kctx->vinstr_cli); - mutex_unlock(&kctx->vinstr_cli_lock); + mutex_lock(&kctx->legacy_hwcnt_lock); + ret = kbase_hwcnt_legacy_client_clear(kctx->legacy_hwcnt_cli); + mutex_unlock(&kctx->legacy_hwcnt_lock); return ret; } @@ -749,6 +772,12 @@ static int kbase_api_mem_jit_init(struct kbase_context *kctx, jit_init->max_allocations, jit_init->trim_level); } +static int kbase_api_mem_exec_init(struct kbase_context *kctx, + struct kbase_ioctl_mem_exec_init *exec_init) +{ + return kbase_region_tracker_init_exec(kctx, exec_init->va_pages); +} + static int kbase_api_mem_sync(struct kbase_context *kctx, struct kbase_ioctl_mem_sync *sync) { @@ -1169,6 +1198,11 @@ static long kbase_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) kbase_api_mem_jit_init, struct kbase_ioctl_mem_jit_init); break; + case KBASE_IOCTL_MEM_EXEC_INIT: + KBASE_HANDLE_IOCTL_IN(KBASE_IOCTL_MEM_EXEC_INIT, + kbase_api_mem_exec_init, + struct kbase_ioctl_mem_exec_init); + break; case KBASE_IOCTL_MEM_SYNC: KBASE_HANDLE_IOCTL_IN(KBASE_IOCTL_MEM_SYNC, kbase_api_mem_sync, @@ -1550,7 +1584,10 @@ static ssize_t set_core_mask(struct device *dev, struct device_attribute *attr, { struct kbase_device *kbdev; u64 new_core_mask[3]; - int items; + int items, i; + ssize_t err = count; + unsigned long flags; + u64 shader_present, group0_core_mask; kbdev = to_kbase_device(dev); @@ -1561,50 +1598,59 @@ static ssize_t set_core_mask(struct device *dev, struct device_attribute *attr, &new_core_mask[0], &new_core_mask[1], &new_core_mask[2]); + if (items != 1 && items != 3) { + dev_err(kbdev->dev, "Couldn't process core mask write operation.\n" + "Use format <core_mask>\n" + "or <core_mask_js0> <core_mask_js1> <core_mask_js2>\n"); + err = -EINVAL; + goto end; + } + if (items == 1) new_core_mask[1] = new_core_mask[2] = new_core_mask[0]; - if (items == 1 || items == 3) { - u64 shader_present = - kbdev->gpu_props.props.raw_props.shader_present; - u64 group0_core_mask = - kbdev->gpu_props.props.coherency_info.group[0]. 
- core_mask; - - if ((new_core_mask[0] & shader_present) != new_core_mask[0] || - !(new_core_mask[0] & group0_core_mask) || - (new_core_mask[1] & shader_present) != - new_core_mask[1] || - !(new_core_mask[1] & group0_core_mask) || - (new_core_mask[2] & shader_present) != - new_core_mask[2] || - !(new_core_mask[2] & group0_core_mask)) { - dev_err(dev, "power_policy: invalid core specification\n"); - return -EINVAL; - } - - if (kbdev->pm.debug_core_mask[0] != new_core_mask[0] || - kbdev->pm.debug_core_mask[1] != - new_core_mask[1] || - kbdev->pm.debug_core_mask[2] != - new_core_mask[2]) { - unsigned long flags; - - spin_lock_irqsave(&kbdev->hwaccess_lock, flags); - - kbase_pm_set_debug_core_mask(kbdev, new_core_mask[0], - new_core_mask[1], new_core_mask[2]); + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); - spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + shader_present = kbdev->gpu_props.props.raw_props.shader_present; + group0_core_mask = kbdev->gpu_props.props.coherency_info.group[0].core_mask; + + for (i = 0; i < 3; ++i) { + if ((new_core_mask[i] & shader_present) != new_core_mask[i]) { + dev_err(dev, "Invalid core mask 0x%llX for JS %d: Includes non-existent cores (present = 0x%llX)", + new_core_mask[i], i, shader_present); + err = -EINVAL; + goto unlock; + + } else if (!(new_core_mask[i] & shader_present & kbdev->pm.backend.ca_cores_enabled)) { + dev_err(dev, "Invalid core mask 0x%llX for JS %d: No intersection with currently available cores (present = 0x%llX, CA enabled = 0x%llX\n", + new_core_mask[i], i, + kbdev->gpu_props.props.raw_props.shader_present, + kbdev->pm.backend.ca_cores_enabled); + err = -EINVAL; + goto unlock; + + } else if (!(new_core_mask[i] & group0_core_mask)) { + dev_err(dev, "Invalid core mask 0x%llX for JS %d: No intersection with group 0 core mask 0x%llX\n", + new_core_mask[i], i, group0_core_mask); + err = -EINVAL; + goto unlock; } + } - return count; + if (kbdev->pm.debug_core_mask[0] != new_core_mask[0] || + kbdev->pm.debug_core_mask[1] != + new_core_mask[1] || + kbdev->pm.debug_core_mask[2] != + new_core_mask[2]) { + + kbase_pm_set_debug_core_mask(kbdev, new_core_mask[0], + new_core_mask[1], new_core_mask[2]); } - dev_err(kbdev->dev, "Couldn't process set_core_mask write operation.\n" - "Use format <core_mask>\n" - "or <core_mask_js0> <core_mask_js1> <core_mask_js2>\n"); - return -EINVAL; +unlock: + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); +end: + return err; } /* @@ -2438,9 +2484,11 @@ static ssize_t set_pm_poweroff(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct kbase_device *kbdev; + struct kbasep_pm_tick_timer_state *stt; int items; - s64 gpu_poweroff_time; - int poweroff_shader_ticks, poweroff_gpu_ticks; + u64 gpu_poweroff_time; + unsigned int poweroff_shader_ticks, poweroff_gpu_ticks; + unsigned long flags; kbdev = to_kbase_device(dev); if (!kbdev) @@ -2455,9 +2503,16 @@ static ssize_t set_pm_poweroff(struct device *dev, return -EINVAL; } - kbdev->pm.gpu_poweroff_time = HR_TIMER_DELAY_NSEC(gpu_poweroff_time); - kbdev->pm.poweroff_shader_ticks = poweroff_shader_ticks; - kbdev->pm.poweroff_gpu_ticks = poweroff_gpu_ticks; + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); + + stt = &kbdev->pm.backend.shader_tick_timer; + stt->configured_interval = HR_TIMER_DELAY_NSEC(gpu_poweroff_time); + stt->configured_ticks = poweroff_shader_ticks; + + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + + if (poweroff_gpu_ticks != 0) + dev_warn(kbdev->dev, "Separate GPU poweroff delay no longer 
supported.\n"); return count; } @@ -2477,16 +2532,22 @@ static ssize_t show_pm_poweroff(struct device *dev, struct device_attribute *attr, char * const buf) { struct kbase_device *kbdev; + struct kbasep_pm_tick_timer_state *stt; ssize_t ret; + unsigned long flags; kbdev = to_kbase_device(dev); if (!kbdev) return -ENODEV; - ret = scnprintf(buf, PAGE_SIZE, "%llu %u %u\n", - ktime_to_ns(kbdev->pm.gpu_poweroff_time), - kbdev->pm.poweroff_shader_ticks, - kbdev->pm.poweroff_gpu_ticks); + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); + + stt = &kbdev->pm.backend.shader_tick_timer; + ret = scnprintf(buf, PAGE_SIZE, "%llu %u 0\n", + ktime_to_ns(stt->configured_interval), + stt->configured_ticks); + + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); return ret; } @@ -2958,6 +3019,45 @@ static const struct file_operations kbasep_serialize_jobs_debugfs_fops = { #endif /* CONFIG_DEBUG_FS */ +static void kbasep_protected_mode_hwcnt_disable_worker(struct work_struct *data) +{ + struct kbase_device *kbdev = container_of(data, struct kbase_device, + protected_mode_hwcnt_disable_work); + unsigned long flags; + + bool do_disable; + + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); + do_disable = !kbdev->protected_mode_hwcnt_desired && + !kbdev->protected_mode_hwcnt_disabled; + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + + if (!do_disable) + return; + + kbase_hwcnt_context_disable(kbdev->hwcnt_gpu_ctx); + + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); + do_disable = !kbdev->protected_mode_hwcnt_desired && + !kbdev->protected_mode_hwcnt_disabled; + + if (do_disable) { + /* Protected mode state did not change while we were doing the + * disable, so commit the work we just performed and continue + * the state machine. + */ + kbdev->protected_mode_hwcnt_disabled = true; + kbase_backend_slot_update(kbdev); + } else { + /* Protected mode state was updated while we were doing the + * disable, so we need to undo the disable we just performed. 
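The rewritten set_core_mask() handler above accepts a per-slot mask only if it passes three checks: it is a subset of shader_present, it intersects the currently enabled core-availability mask, and it intersects coherency group 0. The standalone sketch below reproduces just that validation; the bitmaps describe an imaginary 4-core GPU and are purely illustrative:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Mirrors the three checks performed on each new_core_mask[i]. */
static bool core_mask_is_valid(uint64_t mask, uint64_t shader_present,
			       uint64_t ca_enabled, uint64_t group0_mask)
{
	if ((mask & shader_present) != mask)
		return false;            /* includes non-existent cores */
	if (!(mask & shader_present & ca_enabled))
		return false;            /* no currently available core */
	if (!(mask & group0_mask))
		return false;            /* no core from coherency group 0 */
	return true;
}

int main(void)
{
	/* Example bitmaps for an imaginary 4-core GPU, group 0 = cores 0-1. */
	const uint64_t shader_present = 0xF, ca_enabled = 0xF, group0 = 0x3;

	printf("mask 0x3 valid: %d\n",
	       core_mask_is_valid(0x3, shader_present, ca_enabled, group0));
	printf("mask 0xC valid: %d\n",
	       core_mask_is_valid(0xC, shader_present, ca_enabled, group0));
	return 0;
}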
+ */ + kbase_hwcnt_context_enable(kbdev->hwcnt_gpu_ctx); + } + + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); +} + static int kbasep_protected_mode_init(struct kbase_device *kbdev) { #ifdef CONFIG_OF @@ -2975,6 +3075,10 @@ static int kbasep_protected_mode_init(struct kbase_device *kbdev) kbdev->protected_dev->data = kbdev; kbdev->protected_ops = &kbase_native_protected_ops; kbdev->protected_mode_support = true; + INIT_WORK(&kbdev->protected_mode_hwcnt_disable_work, + kbasep_protected_mode_hwcnt_disable_worker); + kbdev->protected_mode_hwcnt_desired = true; + kbdev->protected_mode_hwcnt_disabled = false; return 0; } @@ -3024,8 +3128,10 @@ static int kbasep_protected_mode_init(struct kbase_device *kbdev) static void kbasep_protected_mode_term(struct kbase_device *kbdev) { - if (kbase_hw_has_feature(kbdev, BASE_HW_FEATURE_PROTECTED_MODE)) + if (kbase_hw_has_feature(kbdev, BASE_HW_FEATURE_PROTECTED_MODE)) { + cancel_work_sync(&kbdev->protected_mode_hwcnt_disable_work); kfree(kbdev->protected_dev); + } } #ifdef CONFIG_MALI_NO_MALI @@ -3211,7 +3317,6 @@ static void power_control_term(struct kbase_device *kbdev) #ifdef MALI_KBASE_BUILD #ifdef CONFIG_DEBUG_FS -#if KBASE_GPU_RESET_EN #include <mali_kbase_hwaccess_jm.h> static void trigger_quirks_reload(struct kbase_device *kbdev) @@ -3247,7 +3352,6 @@ MAKE_QUIRK_ACCESSORS(tiler); MAKE_QUIRK_ACCESSORS(mmu); MAKE_QUIRK_ACCESSORS(jm); -#endif /* KBASE_GPU_RESET_EN */ /** * debugfs_protected_debug_mode_read - "protected_debug_mode" debugfs read @@ -3328,7 +3432,6 @@ static int kbase_device_debugfs_init(struct kbase_device *kbdev) kbase_debug_job_fault_debugfs_init(kbdev); kbasep_gpu_memory_debugfs_init(kbdev); kbase_as_fault_debugfs_init(kbdev); -#if KBASE_GPU_RESET_EN /* fops_* variables created by invocations of macro * MAKE_QUIRK_ACCESSORS() above. */ debugfs_create_file("quirks_sc", 0644, @@ -3343,7 +3446,6 @@ static int kbase_device_debugfs_init(struct kbase_device *kbdev) debugfs_create_file("quirks_jm", 0644, kbdev->mali_debugfs_directory, kbdev, &fops_jm_quirks); -#endif /* KBASE_GPU_RESET_EN */ debugfs_create_bool("infinite_cache", 0644, debugfs_ctx_defaults_directory, @@ -3558,14 +3660,29 @@ static int kbase_platform_device_remove(struct platform_device *pdev) #endif + if (kbdev->inited_subsys & inited_backend_late) { + kbase_backend_late_term(kbdev); + kbdev->inited_subsys &= ~inited_backend_late; + } + if (kbdev->inited_subsys & inited_vinstr) { kbase_vinstr_term(kbdev->vinstr_ctx); kbdev->inited_subsys &= ~inited_vinstr; } - if (kbdev->inited_subsys & inited_backend_late) { - kbase_backend_late_term(kbdev); - kbdev->inited_subsys &= ~inited_backend_late; + if (kbdev->inited_subsys & inited_hwcnt_gpu_virt) { + kbase_hwcnt_virtualizer_term(kbdev->hwcnt_gpu_virt); + kbdev->inited_subsys &= ~inited_hwcnt_gpu_virt; + } + + if (kbdev->inited_subsys & inited_hwcnt_gpu_ctx) { + kbase_hwcnt_context_term(kbdev->hwcnt_gpu_ctx); + kbdev->inited_subsys &= ~inited_hwcnt_gpu_ctx; + } + + if (kbdev->inited_subsys & inited_hwcnt_gpu_iface) { + kbase_hwcnt_backend_gpu_destroy(&kbdev->hwcnt_gpu_iface); + kbdev->inited_subsys &= ~inited_hwcnt_gpu_iface; } if (kbdev->inited_subsys & inited_tlstream) { @@ -3790,20 +3907,40 @@ static int kbase_platform_device_probe(struct platform_device *pdev) } kbdev->inited_subsys |= inited_tlstream; - err = kbase_backend_late_init(kbdev); + /* Initialize the kctx list. This is used by vinstr. 
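kbasep_protected_mode_hwcnt_disable_worker() above follows a check, unlocked work, re-check pattern: it samples the desired/disabled flags under hwaccess_lock, drops the lock for the potentially sleeping counter disable, then re-checks before committing, re-enabling if the desired state changed in the meantime. The sketch below models that shape with an ordinary mutex and stub enable/disable calls; it is a simplified illustration, not the driver code:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static bool hwcnt_desired = false;   /* cleared when entering protected mode */
static bool hwcnt_disabled = false;  /* committed state */

/* Stand-ins for kbase_hwcnt_context_disable()/_enable(), which may sleep. */
static void slow_disable(void) { }
static void slow_enable(void)  { }

static void disable_worker(void)
{
	bool do_disable;

	pthread_mutex_lock(&lock);
	do_disable = !hwcnt_desired && !hwcnt_disabled;
	pthread_mutex_unlock(&lock);

	if (!do_disable)
		return;

	slow_disable();                  /* done outside the lock */

	pthread_mutex_lock(&lock);
	do_disable = !hwcnt_desired && !hwcnt_disabled;
	if (do_disable)
		hwcnt_disabled = true;   /* state unchanged: commit */
	else
		slow_enable();           /* state changed underneath us: undo */
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	disable_worker();
	printf("hwcnt disabled: %d\n", hwcnt_disabled);
	return 0;
}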
*/ + mutex_init(&kbdev->kctx_list_lock); + INIT_LIST_HEAD(&kbdev->kctx_list); + + err = kbase_hwcnt_backend_gpu_create(kbdev, &kbdev->hwcnt_gpu_iface); if (err) { - dev_err(kbdev->dev, "Late backend initialization failed\n"); + dev_err(kbdev->dev, "GPU hwcnt backend creation failed\n"); kbase_platform_device_remove(pdev); return err; } - kbdev->inited_subsys |= inited_backend_late; + kbdev->inited_subsys |= inited_hwcnt_gpu_iface; - /* Initialize the kctx list. This is used by vinstr. */ - mutex_init(&kbdev->kctx_list_lock); - INIT_LIST_HEAD(&kbdev->kctx_list); + err = kbase_hwcnt_context_init(&kbdev->hwcnt_gpu_iface, + &kbdev->hwcnt_gpu_ctx); + if (err) { + dev_err(kbdev->dev, + "GPU hwcnt context initialization failed\n"); + kbase_platform_device_remove(pdev); + return err; + } + kbdev->inited_subsys |= inited_hwcnt_gpu_ctx; - kbdev->vinstr_ctx = kbase_vinstr_init(kbdev); - if (!kbdev->vinstr_ctx) { + err = kbase_hwcnt_virtualizer_init( + kbdev->hwcnt_gpu_ctx, &kbdev->hwcnt_gpu_virt); + if (err) { + dev_err(kbdev->dev, + "GPU hwcnt virtualizer initialization failed\n"); + kbase_platform_device_remove(pdev); + return err; + } + kbdev->inited_subsys |= inited_hwcnt_gpu_virt; + + err = kbase_vinstr_init(kbdev->hwcnt_gpu_virt, &kbdev->vinstr_ctx); + if (err) { dev_err(kbdev->dev, "Virtual instrumentation initialization failed\n"); kbase_platform_device_remove(pdev); @@ -3811,9 +3948,18 @@ static int kbase_platform_device_probe(struct platform_device *pdev) } kbdev->inited_subsys |= inited_vinstr; + err = kbase_backend_late_init(kbdev); + if (err) { + dev_err(kbdev->dev, "Late backend initialization failed\n"); + kbase_platform_device_remove(pdev); + return err; + } + kbdev->inited_subsys |= inited_backend_late; + + #ifdef CONFIG_MALI_DEVFREQ - /* Devfreq uses vinstr, so must be initialized after it. */ + /* Devfreq uses hardware counters, so must be initialized after it. */ err = kbase_devfreq_init(kbdev); if (!err) kbdev->inited_subsys |= inited_devfreq; diff --git a/mali_kbase/mali_kbase_debug_job_fault.c b/mali_kbase/mali_kbase_debug_job_fault.c index 0029fe3..88bb0d3 100644 --- a/mali_kbase/mali_kbase_debug_job_fault.c +++ b/mali_kbase/mali_kbase_debug_job_fault.c @@ -364,7 +364,7 @@ static void *debug_job_fault_start(struct seq_file *m, loff_t *pos) * job done but we delayed it. Now we should clean cache * earlier. Then the GPU memory dump should be correct. */ - kbase_backend_cacheclean(kbdev, event->katom); + kbase_backend_cache_clean(kbdev, event->katom); } else return NULL; diff --git a/mali_kbase/mali_kbase_defs.h b/mali_kbase/mali_kbase_defs.h index 07ef140..a135742 100644 --- a/mali_kbase/mali_kbase_defs.h +++ b/mali_kbase/mali_kbase_defs.h @@ -40,6 +40,7 @@ #include <mali_kbase_instr_defs.h> #include <mali_kbase_pm.h> #include <mali_kbase_gpuprops_types.h> +#include <mali_kbase_hwcnt_backend_gpu.h> #include <protected_mode_switcher.h> @@ -143,8 +144,6 @@ #define BASE_MAX_NR_AS 16 /* mmu */ -#define MIDGARD_MMU_VA_BITS 48 - #define MIDGARD_MMU_LEVEL(x) (x) #define MIDGARD_MMU_TOPLEVEL MIDGARD_MMU_LEVEL(0) @@ -425,8 +424,8 @@ enum kbase_atom_gpu_rb_state { * @KBASE_ATOM_ENTER_PROTECTED_CHECK: Starting state. Check if there are any atoms * currently submitted to GPU and protected mode transition is * not already in progress. - * @KBASE_ATOM_ENTER_PROTECTED_VINSTR: Wait for vinstr to suspend before entry into - * protected mode. + * @KBASE_ATOM_ENTER_PROTECTED_HWCNT: Wait for hardware counter context to + * become disabled before entry into protected mode. 
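A summary of the layering that the reordered probe and remove paths above encode (a sketch, not code from the patch):

/*
 * hwcnt_gpu_iface    - raw GPU counter backend, created first
 *   hwcnt_gpu_ctx    - counter context/accumulator built on the backend
 *     hwcnt_gpu_virt - virtualizer that multiplexes multiple clients
 *       vinstr_ctx   - userspace instrumentation clients
 *       devfreq/IPA  - in-kernel counter users, initialised last
 *
 * kbase_platform_device_remove() tears the layers down in reverse order.
 */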
* @KBASE_ATOM_ENTER_PROTECTED_IDLE_L2: Wait for the L2 to become idle in preparation * for the coherency change. L2 shall be powered down and GPU shall * come out of fully coherent mode before entering protected mode. @@ -442,7 +441,7 @@ enum kbase_atom_enter_protected_state { * NOTE: The integer value of this must match KBASE_ATOM_EXIT_PROTECTED_CHECK. */ KBASE_ATOM_ENTER_PROTECTED_CHECK = 0, - KBASE_ATOM_ENTER_PROTECTED_VINSTR, + KBASE_ATOM_ENTER_PROTECTED_HWCNT, KBASE_ATOM_ENTER_PROTECTED_IDLE_L2, KBASE_ATOM_ENTER_PROTECTED_SET_COHERENCY, KBASE_ATOM_ENTER_PROTECTED_FINISHED, @@ -513,8 +512,6 @@ struct kbase_ext_res { * @jc: GPU address of the job-chain. * @softjob_data: Copy of data read from the user space buffer that @jc * points to. - * @coreref_state: state of the atom with respect to retention of shader - * cores for affinity & power management. * @fence: Stores either an input or output sync fence, depending * on soft-job type * @sync_waiter: Pointer to the sync fence waiter structure passed to the @@ -607,7 +604,6 @@ struct kbase_jd_atom { u32 device_nr; u64 jc; void *softjob_data; - enum kbase_atom_coreref_state coreref_state; #if defined(CONFIG_SYNC) struct sync_fence *fence; struct sync_fence_waiter sync_waiter; @@ -1073,15 +1069,6 @@ struct kbase_pm_device_data { /* Time in milliseconds between each dvfs sample */ u32 dvfs_period; - /* Period of GPU poweroff timer */ - ktime_t gpu_poweroff_time; - - /* Number of ticks of GPU poweroff timer before shader is powered off */ - int poweroff_shader_ticks; - - /* Number of ticks of GPU poweroff timer before GPU is powered off */ - int poweroff_gpu_ticks; - struct kbase_pm_backend_data backend; }; @@ -1254,34 +1241,19 @@ struct kbase_mmu_mode const *kbase_mmu_mode_get_aarch64(void); * configuration/properties of GPU HW device in use. * @hw_issues_mask: List of SW workarounds for HW issues * @hw_features_mask: List of available HW features. - * @shader_needed_cnt: Count for the 64 shader cores, incremented when - * shaders are requested for use and decremented later - * when they are no longer required. - * @tiler_needed_cnt: Count for the Tiler block shader cores, incremented - * when Tiler is requested for use and decremented - * later when the Tiler is no longer required. * @disjoint_event: struct for keeping track of the disjoint information, * that whether the GPU is in a disjoint state and the * number of disjoint events that have occurred on GPU. - * @l2_users_count: Refcount for tracking users of the l2 cache, e.g. - * when using hardware counter instrumentation. - * @shader_available_bitmap: Bitmap of shader cores that are currently available, - * powered up and the power policy is happy for jobs - * to be submitted to these cores. These are updated - * by the power management code. The job scheduler - * should avoid submitting new jobs to any cores - * that are not marked as available. - * @tiler_available_bitmap: Bitmap of tiler units that are currently available. - * @l2_available_bitmap: Bitmap of the currently available Level 2 caches. - * @stack_available_bitmap: Bitmap of the currently available Core stacks. - * @shader_ready_bitmap: Bitmap of shader cores that are ready (powered on) - * @shader_transitioning_bitmap: Bitmap of shader cores that are currently changing - * power state. * @nr_hw_address_spaces: Number of address spaces actually available in the * GPU, remains constant after driver initialisation. 
* @nr_user_address_spaces: Number of address spaces available to user contexts * @hwcnt: Structure used for instrumentation and HW counters * dumping + * @hwcnt_gpu_iface: Backend interface for GPU hardware counter access. + * @hwcnt_gpu_ctx: Context for GPU hardware counter access. + * @hwaccess_lock must be held when calling + * kbase_hwcnt_context_enable() with @hwcnt_gpu_ctx. + * @hwcnt_gpu_virt: Virtualizer for GPU hardware counters. * @vinstr_ctx: vinstr context created per device * @trace_lock: Lock to serialize the access to trace buffer. * @trace_first_out: Index/offset in the trace buffer at which the first @@ -1294,8 +1266,14 @@ struct kbase_mmu_mode const *kbase_mmu_mode_get_aarch64(void); * @reset_timeout_ms: Number of milliseconds to wait for the soft stop to * complete for the GPU jobs before proceeding with the * GPU reset. - * @cacheclean_lock: Lock to serialize the clean & invalidation of GPU caches, - * between Job Manager backend & Instrumentation code. + * @cache_clean_in_progress: Set when a cache clean has been started, and + * cleared when it has finished. This prevents multiple + * cache cleans being done simultaneously. + * @cache_clean_queued: Set if a cache clean is invoked while another is in + * progress. If this happens, another cache clean needs + * to be triggered immediately after completion of the + * current one. + * @cache_clean_wait: Signalled when a cache clean has finished. * @platform_context: Platform specific private data to be accessed by * platform specific config files only. * @kctx_list: List of kbase_contexts created for the device, including @@ -1398,6 +1376,13 @@ struct kbase_mmu_mode const *kbase_mmu_mode_get_aarch64(void); * @protected_mode: set to TRUE when GPU is put into protected mode * @protected_mode_transition: set to TRUE when GPU is transitioning into or * out of protected mode. + * @protected_mode_hwcnt_desired: True if we want GPU hardware counters to be + * enabled. Counters must be disabled before transition + * into protected mode. + * @protected_mode_hwcnt_disabled: True if GPU hardware counters are not + * enabled. + * @protected_mode_hwcnt_disable_work: Work item to disable GPU hardware + * counters, used if atomic disable is not possible. * @protected_mode_support: set to true if protected mode is supported. 
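The three cache_clean_* fields documented above replace the old cacheclean_lock mutex; a minimal sketch, using a hypothetical helper name, of how a caller could wait for an in-flight clean to finish (the real wait and completion paths live in the job manager backend):

static void kbase_gpu_wait_cache_clean_sketch(struct kbase_device *kbdev)
{
	/* cache_clean_in_progress is cleared, and cache_clean_wait signalled,
	 * by the GPU IRQ handler once the cache-clean-completed interrupt is
	 * raised; a queued follow-up clean (cache_clean_queued) is started
	 * before the flag is finally cleared.
	 */
	wait_event(kbdev->cache_clean_wait,
		   !kbdev->cache_clean_in_progress);
}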
* @buslogger: Pointer to the structure required for interfacing * with the bus logger module to set the size of buffer @@ -1471,24 +1456,11 @@ struct kbase_device { unsigned long hw_issues_mask[(BASE_HW_ISSUE_END + BITS_PER_LONG - 1) / BITS_PER_LONG]; unsigned long hw_features_mask[(BASE_HW_FEATURE_END + BITS_PER_LONG - 1) / BITS_PER_LONG]; - u32 tiler_needed_cnt; - u32 shader_needed_cnt; - struct { atomic_t count; atomic_t state; } disjoint_event; - u32 l2_users_count; - - u64 shader_available_bitmap; - u64 tiler_available_bitmap; - u64 l2_available_bitmap; - u64 stack_available_bitmap; - - u64 shader_ready_bitmap; - u64 shader_transitioning_bitmap; - s8 nr_hw_address_spaces; s8 nr_user_address_spaces; @@ -1498,10 +1470,14 @@ struct kbase_device { struct kbase_context *kctx; u64 addr; + u64 addr_bytes; struct kbase_instr_backend backend; } hwcnt; + struct kbase_hwcnt_backend_interface hwcnt_gpu_iface; + struct kbase_hwcnt_context *hwcnt_gpu_ctx; + struct kbase_hwcnt_virtualizer *hwcnt_gpu_virt; struct kbase_vinstr_context *vinstr_ctx; #if KBASE_TRACE_ENABLE @@ -1513,7 +1489,9 @@ struct kbase_device { u32 reset_timeout_ms; - struct mutex cacheclean_lock; + bool cache_clean_in_progress; + bool cache_clean_queued; + wait_queue_head_t cache_clean_wait; void *platform_context; @@ -1548,27 +1526,9 @@ struct kbase_device { * the difference between last_metrics and the current values. */ struct kbasep_pm_metrics last_metrics; - - /* - * gpu_active_callback - Inform IPA that GPU is now active - * @model_data: Pointer to model data - */ - void (*gpu_active_callback)( - struct kbase_ipa_model_vinstr_data *model_data); - - /* - * gpu_idle_callback - Inform IPA that GPU is now idle - * @model_data: Pointer to model data - */ - void (*gpu_idle_callback)( - struct kbase_ipa_model_vinstr_data *model_data); - /* Model data to pass to ipa_gpu_active/idle() */ struct kbase_ipa_model_vinstr_data *model_data; - /* true if IPA is currently using vinstr */ - bool vinstr_active; - /* true if use of fallback model has been forced by the User */ bool force_fallback_model; } ipa; @@ -1642,6 +1602,12 @@ struct kbase_device { bool protected_mode_transition; + bool protected_mode_hwcnt_desired; + + bool protected_mode_hwcnt_disabled; + + struct work_struct protected_mode_hwcnt_disable_work; + bool protected_mode_support; #ifdef CONFIG_MALI_FPGA_BUS_LOGGER @@ -1824,6 +1790,9 @@ struct kbase_sub_alloc { * having the same value for GPU & CPU virtual address. * @reg_rbtree_custom: RB tree of the memory regions allocated from the CUSTOM_VA * zone of the GPU virtual address space. + * @reg_rbtree_exec: RB tree of the memory regions allocated from the EXEC_VA + * zone of the GPU virtual address space. Used for GPU-executable + * allocations which don't need the SAME_VA property. * @cookies: Bitmask containing of BITS_PER_LONG bits, used mainly for * SAME_VA allocations to defer the reservation of memory region * (from the GPU virtual address space) from base_mem_alloc @@ -1896,6 +1865,10 @@ struct kbase_sub_alloc { * pages used for GPU allocations, done for the context, * to the memory consumed by the process. * @same_va_end: End address of the SAME_VA zone (in 4KB page units) + * @exec_va_start: Start address of the EXEC_VA zone (in 4KB page units) + * or U64_MAX if the EXEC_VA zone is uninitialized. + * @gpu_va_end: End address of the GPU va space (in 4KB page units) + * @jit_va: Indicates if a JIT_VA zone has been created. 
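Per the exec_va_start description above, a context only gains an EXEC_VA zone on demand; a hypothetical helper illustrating the sentinel check (not from the patch):

static bool kctx_has_exec_va_zone_sketch(struct kbase_context *kctx)
{
	/* Callers would normally hold kctx->reg_lock; exec_va_start keeps
	 * its U64_MAX sentinel until an EXEC_VA zone is actually created.
	 */
	return kctx->exec_va_start != U64_MAX;
}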
* @timeline: Object tracking the number of atoms currently in flight for * the context and thread group id of the process, i.e. @tgid. * @mem_profile_data: Buffer containing the profiling information provided by @@ -1930,9 +1903,11 @@ struct kbase_sub_alloc { * @slots_pullable: Bitmask of slots, indicating the slots for which the * context has pullable atoms in the runnable tree. * @work: Work structure used for deferred ASID assignment. - * @vinstr_cli: Pointer to the legacy userspace vinstr client, there can - * be only such client per kbase context. - * @vinstr_cli_lock: Lock used for the vinstr ioctl calls made for @vinstr_cli. + * @legacy_hwcnt_cli: Pointer to the legacy userspace hardware counters + * client, there can be only such client per kbase + * context. + * @legacy_hwcnt_lock: Lock used to prevent concurrent access to + * @legacy_hwcnt_cli. * @completed_jobs: List containing completed atoms for which base_jd_event is * to be posted. * @work_count: Number of work items, corresponding to atoms, currently @@ -2017,6 +1992,7 @@ struct kbase_context { struct mutex reg_lock; struct rb_root reg_rbtree_same; struct rb_root reg_rbtree_custom; + struct rb_root reg_rbtree_exec; unsigned long cookies; @@ -2060,6 +2036,9 @@ struct kbase_context { spinlock_t mm_update_lock; struct mm_struct __rcu *process_mm; u64 same_va_end; + u64 exec_va_start; + u64 gpu_va_end; + bool jit_va; #ifdef CONFIG_DEBUG_FS char *mem_profile_data; @@ -2087,8 +2066,8 @@ struct kbase_context { struct work_struct work; - struct kbase_vinstr_client *vinstr_cli; - struct mutex vinstr_cli_lock; + struct kbase_hwcnt_legacy_client *legacy_hwcnt_cli; + struct mutex legacy_hwcnt_lock; struct list_head completed_jobs; atomic_t work_count; diff --git a/mali_kbase/mali_kbase_device.c b/mali_kbase/mali_kbase_device.c index 44d16a7..530bb45 100644 --- a/mali_kbase/mali_kbase_device.c +++ b/mali_kbase/mali_kbase_device.c @@ -222,7 +222,7 @@ int kbase_device_init(struct kbase_device * const kbdev) if (err) goto term_as; - mutex_init(&kbdev->cacheclean_lock); + init_waitqueue_head(&kbdev->cache_clean_wait); kbase_debug_assert_register_hook(&kbasep_trace_hook_wrapper, kbdev); diff --git a/mali_kbase/mali_kbase_gator_api.c b/mali_kbase/mali_kbase_gator_api.c index 7077c3a..1719edf 100644 --- a/mali_kbase/mali_kbase_gator_api.c +++ b/mali_kbase/mali_kbase_gator_api.c @@ -25,6 +25,9 @@ #include "mali_kbase_mem_linux.h" #include "mali_kbase_gator_api.h" #include "mali_kbase_gator_hwcnt_names.h" +#include "mali_kbase_hwcnt_types.h" +#include "mali_kbase_hwcnt_gpu.h" +#include "mali_kbase_hwcnt_virtualizer.h" #define MALI_MAX_CORES_PER_GROUP 4 #define MALI_MAX_NUM_BLOCKS_PER_GROUP 8 @@ -33,8 +36,9 @@ struct kbase_gator_hwcnt_handles { struct kbase_device *kbdev; - struct kbase_vinstr_client *vinstr_cli; - void *vinstr_buffer; + struct kbase_hwcnt_virtualizer_client *hvcli; + struct kbase_hwcnt_enable_map enable_map; + struct kbase_hwcnt_dump_buffer dump_buf; struct work_struct dump_work; int dump_complete; spinlock_t dump_lock; @@ -173,8 +177,10 @@ KBASE_EXPORT_SYMBOL(kbase_gator_hwcnt_term_names); struct kbase_gator_hwcnt_handles *kbase_gator_hwcnt_init(struct kbase_gator_hwcnt_info *in_out_info) { + int errcode; struct kbase_gator_hwcnt_handles *hand; - struct kbase_ioctl_hwcnt_reader_setup setup; + const struct kbase_hwcnt_metadata *metadata; + struct kbase_hwcnt_physical_enable_map phys_map; uint32_t dump_size = 0, i = 0; if (!in_out_info) @@ -192,11 +198,20 @@ struct kbase_gator_hwcnt_handles *kbase_gator_hwcnt_init(struct 
kbase_gator_hwcn if (!hand->kbdev) goto free_hand; - dump_size = kbase_vinstr_dump_size(hand->kbdev); - hand->vinstr_buffer = kzalloc(dump_size, GFP_KERNEL); - if (!hand->vinstr_buffer) + metadata = kbase_hwcnt_virtualizer_metadata( + hand->kbdev->hwcnt_gpu_virt); + if (!metadata) goto release_device; - in_out_info->kernel_dump_buffer = hand->vinstr_buffer; + + errcode = kbase_hwcnt_enable_map_alloc(metadata, &hand->enable_map); + if (errcode) + goto release_device; + + errcode = kbase_hwcnt_dump_buffer_alloc(metadata, &hand->dump_buf); + if (errcode) + goto free_enable_map; + + in_out_info->kernel_dump_buffer = hand->dump_buf.dump_buf; in_out_info->nr_cores = hand->kbdev->gpu_props.num_cores; in_out_info->nr_core_groups = hand->kbdev->gpu_props.num_core_groups; @@ -213,7 +228,7 @@ struct kbase_gator_hwcnt_handles *kbase_gator_hwcnt_init(struct kbase_gator_hwcn in_out_info->nr_core_groups, GFP_KERNEL); if (!in_out_info->hwc_layout) - goto free_vinstr_buffer; + goto free_dump_buf; dump_size = in_out_info->nr_core_groups * MALI_MAX_NUM_BLOCKS_PER_GROUP * @@ -256,7 +271,7 @@ struct kbase_gator_hwcnt_handles *kbase_gator_hwcnt_init(struct kbase_gator_hwcn in_out_info->hwc_layout = kmalloc(sizeof(enum hwc_type) * (2 + nr_sc_bits + nr_l2), GFP_KERNEL); if (!in_out_info->hwc_layout) - goto free_vinstr_buffer; + goto free_dump_buf; dump_size = (2 + nr_sc_bits + nr_l2) * MALI_COUNTERS_PER_BLOCK * MALI_BYTES_PER_COUNTER; @@ -275,17 +290,23 @@ struct kbase_gator_hwcnt_handles *kbase_gator_hwcnt_init(struct kbase_gator_hwcn } } + /* Calculated dump size must be the same as real dump size */ + if (WARN_ON(dump_size != metadata->dump_buf_bytes)) + goto free_layout; + in_out_info->nr_hwc_blocks = i; in_out_info->size = dump_size; - setup.jm_bm = in_out_info->bitmask[0]; - setup.tiler_bm = in_out_info->bitmask[1]; - setup.shader_bm = in_out_info->bitmask[2]; - setup.mmu_l2_bm = in_out_info->bitmask[3]; - hand->vinstr_cli = kbase_vinstr_hwcnt_kernel_setup(hand->kbdev->vinstr_ctx, - &setup, hand->vinstr_buffer); - if (!hand->vinstr_cli) { - dev_err(hand->kbdev->dev, "Failed to register gator with vinstr core"); + phys_map.jm_bm = in_out_info->bitmask[JM_BLOCK]; + phys_map.tiler_bm = in_out_info->bitmask[TILER_BLOCK]; + phys_map.shader_bm = in_out_info->bitmask[SHADER_BLOCK]; + phys_map.mmu_l2_bm = in_out_info->bitmask[MMU_L2_BLOCK]; + kbase_hwcnt_gpu_enable_map_from_physical(&hand->enable_map, &phys_map); + errcode = kbase_hwcnt_virtualizer_client_create( + hand->kbdev->hwcnt_gpu_virt, &hand->enable_map, &hand->hvcli); + if (errcode) { + dev_err(hand->kbdev->dev, + "Failed to register gator with hwcnt virtualizer core"); goto free_layout; } @@ -293,13 +314,12 @@ struct kbase_gator_hwcnt_handles *kbase_gator_hwcnt_init(struct kbase_gator_hwcn free_layout: kfree(in_out_info->hwc_layout); - -free_vinstr_buffer: - kfree(hand->vinstr_buffer); - +free_dump_buf: + kbase_hwcnt_dump_buffer_free(&hand->dump_buf); +free_enable_map: + kbase_hwcnt_enable_map_free(&hand->enable_map); release_device: kbase_release_device(hand->kbdev); - free_hand: kfree(hand); return NULL; @@ -313,8 +333,9 @@ void kbase_gator_hwcnt_term(struct kbase_gator_hwcnt_info *in_out_info, struct k if (opaque_handles) { cancel_work_sync(&opaque_handles->dump_work); - kbase_vinstr_detach_client(opaque_handles->vinstr_cli); - kfree(opaque_handles->vinstr_buffer); + kbase_hwcnt_virtualizer_client_destroy(opaque_handles->hvcli); + kbase_hwcnt_dump_buffer_free(&opaque_handles->dump_buf); + kbase_hwcnt_enable_map_free(&opaque_handles->enable_map); 
kbase_release_device(opaque_handles->kbdev); kfree(opaque_handles); } @@ -323,11 +344,21 @@ KBASE_EXPORT_SYMBOL(kbase_gator_hwcnt_term); static void dump_worker(struct work_struct *work) { + int errcode; + u64 ts_start_ns; + u64 ts_end_ns; struct kbase_gator_hwcnt_handles *hand; hand = container_of(work, struct kbase_gator_hwcnt_handles, dump_work); - if (!kbase_vinstr_hwc_dump(hand->vinstr_cli, - BASE_HWCNT_READER_EVENT_MANUAL)) { + errcode = kbase_hwcnt_virtualizer_client_dump( + hand->hvcli, &ts_start_ns, &ts_end_ns, &hand->dump_buf); + if (!errcode) { + /* Patch the header to hide other client's counter choices */ + kbase_hwcnt_gpu_patch_dump_headers( + &hand->dump_buf, &hand->enable_map); + /* Zero all non-enabled counters (currently undefined values) */ + kbase_hwcnt_dump_buffer_zero_non_enabled( + &hand->dump_buf, &hand->enable_map); spin_lock_bh(&hand->dump_lock); hand->dump_complete = 1; spin_unlock_bh(&hand->dump_lock); diff --git a/mali_kbase/mali_kbase_gpu_id.h b/mali_kbase/mali_kbase_gpu_id.h index 5f84ba9..d432f8e 100644 --- a/mali_kbase/mali_kbase_gpu_id.h +++ b/mali_kbase/mali_kbase_gpu_id.h @@ -114,8 +114,8 @@ #define GPU_ID2_PRODUCT_TEGX GPU_ID2_MODEL_MAKE(8, 3) #define GPU_ID2_PRODUCT_TTRX GPU_ID2_MODEL_MAKE(9, 0) #define GPU_ID2_PRODUCT_TNAX GPU_ID2_MODEL_MAKE(9, 1) +#define GPU_ID2_PRODUCT_TBEX GPU_ID2_MODEL_MAKE(9, 2) #define GPU_ID2_PRODUCT_TULX GPU_ID2_MODEL_MAKE(10, 0) -#define GPU_ID2_PRODUCT_TDUX GPU_ID2_MODEL_MAKE(10, 1) #define GPU_ID2_PRODUCT_TIDX GPU_ID2_MODEL_MAKE(10, 3) #define GPU_ID2_PRODUCT_TVAX GPU_ID2_MODEL_MAKE(10, 4) diff --git a/mali_kbase/mali_kbase_hw.c b/mali_kbase/mali_kbase_hw.c index fc6b644..450926c 100644 --- a/mali_kbase/mali_kbase_hw.c +++ b/mali_kbase/mali_kbase_hw.c @@ -74,12 +74,12 @@ void kbase_hw_set_features_mask(struct kbase_device *kbdev) case GPU_ID2_PRODUCT_TNAX: features = base_hw_features_tNAx; break; + case GPU_ID2_PRODUCT_TBEX: + features = base_hw_features_tBEx; + break; case GPU_ID2_PRODUCT_TULX: features = base_hw_features_tULx; break; - case GPU_ID2_PRODUCT_TDUX: - features = base_hw_features_tDUx; - break; case GPU_ID2_PRODUCT_TBOX: features = base_hw_features_tBOx; break; @@ -213,12 +213,12 @@ static const enum base_hw_issue *kbase_hw_get_issues_for_new_id( {{GPU_ID2_VERSION_MAKE(0, 0, 0), base_hw_issues_tNAx_r0p0}, {U32_MAX, NULL} } }, - {GPU_ID2_PRODUCT_TULX, - {{GPU_ID2_VERSION_MAKE(0, 0, 0), base_hw_issues_tULx_r0p0}, + {GPU_ID2_PRODUCT_TBEX, + {{GPU_ID2_VERSION_MAKE(0, 0, 0), base_hw_issues_tBEx_r0p0}, {U32_MAX, NULL} } }, - {GPU_ID2_PRODUCT_TDUX, - {{GPU_ID2_VERSION_MAKE(0, 0, 0), base_hw_issues_tDUx_r0p0}, + {GPU_ID2_PRODUCT_TULX, + {{GPU_ID2_VERSION_MAKE(0, 0, 0), base_hw_issues_tULx_r0p0}, {U32_MAX, NULL} } }, {GPU_ID2_PRODUCT_TBOX, @@ -250,10 +250,8 @@ static const enum base_hw_issue *kbase_hw_get_issues_for_new_id( if (product != NULL) { /* Found a matching product. */ const u32 version = gpu_id & GPU_ID2_VERSION; -#if !MALI_CUSTOMER_RELEASE u32 fallback_version = 0; const enum base_hw_issue *fallback_issues = NULL; -#endif size_t v; /* Stop when we reach the end of the map. */ @@ -265,25 +263,34 @@ static const enum base_hw_issue *kbase_hw_get_issues_for_new_id( break; } -#if !MALI_CUSTOMER_RELEASE /* Check whether this is a candidate for most recent known version not later than the actual version. 
*/ if ((version > product->map[v].version) && (product->map[v].version >= fallback_version)) { - fallback_version = product->map[v].version; - fallback_issues = product->map[v].issues; - } +#if MALI_CUSTOMER_RELEASE + /* Match on version's major and minor fields */ + if (((version ^ product->map[v].version) >> + GPU_ID2_VERSION_MINOR_SHIFT) == 0) #endif + { + fallback_version = product->map[v].version; + fallback_issues = product->map[v].issues; + } + } } -#if !MALI_CUSTOMER_RELEASE if ((issues == NULL) && (fallback_issues != NULL)) { /* Fall back to the issue set of the most recent known version not later than the actual version. */ issues = fallback_issues; +#if MALI_CUSTOMER_RELEASE + dev_warn(kbdev->dev, + "GPU hardware issue table may need updating:\n" +#else dev_info(kbdev->dev, +#endif "r%dp%d status %d is unknown; treating as r%dp%d status %d", (gpu_id & GPU_ID2_VERSION_MAJOR) >> GPU_ID2_VERSION_MAJOR_SHIFT, @@ -305,7 +312,6 @@ static const enum base_hw_issue *kbase_hw_get_issues_for_new_id( kbase_gpuprops_update_core_props_gpu_id( &kbdev->gpu_props.props); } -#endif } return issues; } @@ -467,12 +473,12 @@ int kbase_hw_set_issues_mask(struct kbase_device *kbdev) case GPU_ID2_PRODUCT_TNAX: issues = base_hw_issues_model_tNAx; break; + case GPU_ID2_PRODUCT_TBEX: + issues = base_hw_issues_model_tBEx; + break; case GPU_ID2_PRODUCT_TULX: issues = base_hw_issues_model_tULx; break; - case GPU_ID2_PRODUCT_TDUX: - issues = base_hw_issues_model_tDUx; - break; case GPU_ID2_PRODUCT_TBOX: issues = base_hw_issues_model_tBOx; break; diff --git a/mali_kbase/mali_kbase_hwaccess_instr.h b/mali_kbase/mali_kbase_hwaccess_instr.h index 0c5ceff..d5b9099 100644 --- a/mali_kbase/mali_kbase_hwaccess_instr.h +++ b/mali_kbase/mali_kbase_hwaccess_instr.h @@ -1,6 +1,6 @@ /* * - * (C) COPYRIGHT 2014-2015, 2017 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2014-2015, 2017-2018 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -32,7 +32,28 @@ #include <mali_kbase_instr_defs.h> /** - * kbase_instr_hwcnt_enable_internal - Enable HW counters collection + * struct kbase_instr_hwcnt_enable - Enable hardware counter collection. + * @dump_buffer: GPU address to write counters to. + * @dump_buffer_bytes: Size in bytes of the buffer pointed to by dump_buffer. + * @jm_bm: counters selection bitmask (JM). + * @shader_bm: counters selection bitmask (Shader). + * @tiler_bm: counters selection bitmask (Tiler). + * @mmu_l2_bm: counters selection bitmask (MMU_L2). + * @use_secondary: use secondary performance counters set for applicable + * counter blocks. 
+ */ +struct kbase_instr_hwcnt_enable { + u64 dump_buffer; + u64 dump_buffer_bytes; + u32 jm_bm; + u32 shader_bm; + u32 tiler_bm; + u32 mmu_l2_bm; + bool use_secondary; +}; + +/** + * kbase_instr_hwcnt_enable_internal() - Enable HW counters collection * @kbdev: Kbase device * @kctx: Kbase context * @enable: HW counter setup parameters @@ -43,10 +64,10 @@ */ int kbase_instr_hwcnt_enable_internal(struct kbase_device *kbdev, struct kbase_context *kctx, - struct kbase_ioctl_hwcnt_enable *enable); + struct kbase_instr_hwcnt_enable *enable); /** - * kbase_instr_hwcnt_disable_internal - Disable HW counters collection + * kbase_instr_hwcnt_disable_internal() - Disable HW counters collection * @kctx: Kbase context * * Context: might sleep, waiting for an ongoing dump to complete diff --git a/mali_kbase/mali_kbase_hwaccess_jm.h b/mali_kbase/mali_kbase_hwaccess_jm.h index 580ac98..e2798eb 100644 --- a/mali_kbase/mali_kbase_hwaccess_jm.h +++ b/mali_kbase/mali_kbase_hwaccess_jm.h @@ -128,7 +128,7 @@ void kbase_backend_release_ctx_noirq(struct kbase_device *kbdev, struct kbase_context *kctx); /** - * kbase_backend_cacheclean - Perform a cache clean if the given atom requires + * kbase_backend_cache_clean - Perform a cache clean if the given atom requires * one * @kbdev: Device pointer * @katom: Pointer to the failed atom @@ -136,7 +136,7 @@ void kbase_backend_release_ctx_noirq(struct kbase_device *kbdev, * On some GPUs, the GPU cache must be cleaned following a failed atom. This * function performs a clean if it is required by @katom. */ -void kbase_backend_cacheclean(struct kbase_device *kbdev, +void kbase_backend_cache_clean(struct kbase_device *kbdev, struct kbase_jd_atom *katom); @@ -160,14 +160,12 @@ void kbase_backend_complete_wq(struct kbase_device *kbdev, * any scheduling has taken place. * @kbdev: Device pointer * @core_req: Core requirements of atom - * @coreref_state: Coreref state of atom * * This function should only be called from kbase_jd_done_worker() or * js_return_worker(). */ void kbase_backend_complete_wq_post_sched(struct kbase_device *kbdev, - base_jd_core_req core_req, - enum kbase_atom_coreref_state coreref_state); + base_jd_core_req core_req); /** * kbase_backend_reset() - The GPU is being reset. Cancel all jobs on the GPU @@ -277,7 +275,6 @@ void kbase_jm_wait_for_zero_jobs(struct kbase_context *kctx); */ u32 kbase_backend_get_current_flush_id(struct kbase_device *kbdev); -#if KBASE_GPU_RESET_EN /** * kbase_prepare_to_reset_gpu - Prepare for resetting the GPU. * @kbdev: Device pointer @@ -345,8 +342,11 @@ void kbase_reset_gpu_locked(struct kbase_device *kbdev); * of the GPU as part of normal processing (e.g. exiting protected mode) where * the driver will have ensured the scheduler has been idled and all other * users of the GPU (e.g. instrumentation) have been suspended. + * + * Return: 0 if the reset was started successfully + * -EAGAIN if another reset is currently in progress */ -void kbase_reset_gpu_silent(struct kbase_device *kbdev); +int kbase_reset_gpu_silent(struct kbase_device *kbdev); /** * kbase_reset_gpu_active - Reports if the GPU is being reset @@ -355,7 +355,6 @@ void kbase_reset_gpu_silent(struct kbase_device *kbdev); * Return: True if the GPU is in the process of being reset. 
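kbase_instr_hwcnt_enable_internal() now takes the in-kernel structure above instead of the ioctl argument struct; a short sketch of a caller populating it, with the buffer address, size, kbdev and kctx as placeholders:

struct kbase_instr_hwcnt_enable enable = {
	.dump_buffer       = dump_gpu_va,   /* GPU VA the counters are written to */
	.dump_buffer_bytes = dump_size,     /* size of that GPU buffer */
	.jm_bm             = 0xffffffff,    /* all Job Manager counters */
	.tiler_bm          = 0xffffffff,
	.shader_bm         = 0xffffffff,
	.mmu_l2_bm         = 0xffffffff,
	.use_secondary     = false,         /* primary counter set */
};
int err = kbase_instr_hwcnt_enable_internal(kbdev, kctx, &enable);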
*/ bool kbase_reset_gpu_active(struct kbase_device *kbdev); -#endif /** * kbase_job_slot_hardstop - Hard-stop the specified job slot diff --git a/mali_kbase/mali_kbase_hwaccess_pm.h b/mali_kbase/mali_kbase_hwaccess_pm.h index 4598d80..5bb3887 100644 --- a/mali_kbase/mali_kbase_hwaccess_pm.h +++ b/mali_kbase/mali_kbase_hwaccess_pm.h @@ -1,6 +1,6 @@ /* * - * (C) COPYRIGHT 2014-2015 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2014-2015, 2018 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -44,13 +44,23 @@ struct kbase_device; * * Must be called before any other power management function * - * @param kbdev The kbase device structure for the device (must be a valid - * pointer) + * @kbdev: The kbase device structure for the device (must be a valid pointer) + * + * Return: 0 if the power management framework was successfully initialized. + */ +int kbase_hwaccess_pm_early_init(struct kbase_device *kbdev); + +/** + * Initialize the power management framework. + * + * Must be called before any other power management function (except + * @ref kbase_hwaccess_pm_early_init) + * + * @kbdev: The kbase device structure for the device (must be a valid pointer) * - * @return 0 if the power management framework was successfully - * initialized. + * Return: 0 if the power management framework was successfully initialized. */ -int kbase_hwaccess_pm_init(struct kbase_device *kbdev); +int kbase_hwaccess_pm_late_init(struct kbase_device *kbdev); /** * Terminate the power management framework. @@ -58,10 +68,19 @@ int kbase_hwaccess_pm_init(struct kbase_device *kbdev); * No power management functions may be called after this (except * @ref kbase_pm_init) * - * @param kbdev The kbase device structure for the device (must be a valid - * pointer) + * @kbdev: The kbase device structure for the device (must be a valid pointer) + */ +void kbase_hwaccess_pm_early_term(struct kbase_device *kbdev); + +/** + * Terminate the power management framework. + * + * No power management functions may be called after this (except + * @ref kbase_hwaccess_pm_early_term or @ref kbase_hwaccess_pm_late_init) + * + * @kbdev: The kbase device structure for the device (must be a valid pointer) */ -void kbase_hwaccess_pm_term(struct kbase_device *kbdev); +void kbase_hwaccess_pm_late_term(struct kbase_device *kbdev); /** * kbase_hwaccess_pm_powerup - Power up the GPU. diff --git a/mali_kbase/mali_kbase_hwaccess_time.h b/mali_kbase/mali_kbase_hwaccess_time.h index 9b86b51..f7539f5 100644 --- a/mali_kbase/mali_kbase_hwaccess_time.h +++ b/mali_kbase/mali_kbase_hwaccess_time.h @@ -51,7 +51,11 @@ void kbase_backend_get_gpu_time(struct kbase_device *kbdev, u64 *cycle_counter, * * This function is only in use for BASE_HW_ISSUE_6367 */ -#ifndef CONFIG_MALI_NO_MALI +#ifdef CONFIG_MALI_NO_MALI +static inline void kbase_wait_write_flush(struct kbase_device *kbdev) +{ +} +#else void kbase_wait_write_flush(struct kbase_device *kbdev); #endif diff --git a/mali_kbase/mali_kbase_hwcnt.c b/mali_kbase/mali_kbase_hwcnt.c new file mode 100644 index 0000000..efbac6f --- /dev/null +++ b/mali_kbase/mali_kbase_hwcnt.c @@ -0,0 +1,796 @@ +/* + * + * (C) COPYRIGHT 2018 ARM Limited. All rights reserved. 
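The old single-stage kbase_hwaccess_pm_init()/term() pair is split into early and late stages; a sketch of the expected bracketing, with the surrounding backend setup elided and the exact call sites assumed:

int err;

err = kbase_hwaccess_pm_early_init(kbdev);
if (err)
	return err;

/* ... remaining backend initialisation ... */

err = kbase_hwaccess_pm_late_init(kbdev);
if (err) {
	kbase_hwaccess_pm_early_term(kbdev);
	return err;
}

/* Teardown mirrors the init order: */
kbase_hwaccess_pm_late_term(kbdev);
/* ... */
kbase_hwaccess_pm_early_term(kbdev);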
+ * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU licence. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * SPDX-License-Identifier: GPL-2.0 + * + */ + +/* + * Implementation of hardware counter context and accumulator APIs. + */ + +#include "mali_kbase_hwcnt_context.h" +#include "mali_kbase_hwcnt_accumulator.h" +#include "mali_kbase_hwcnt_backend.h" +#include "mali_kbase_hwcnt_types.h" +#include "mali_malisw.h" +#include "mali_kbase_debug.h" +#include "mali_kbase_linux.h" + +#include <linux/mutex.h> +#include <linux/spinlock.h> +#include <linux/slab.h> + +/** + * enum kbase_hwcnt_accum_state - Hardware counter accumulator states. + * @ACCUM_STATE_ERROR: Error state, where all accumulator operations fail. + * @ACCUM_STATE_DISABLED: Disabled state, where dumping is always disabled. + * @ACCUM_STATE_ENABLED: Enabled state, where dumping is enabled if there are + * any enabled counters. + */ +enum kbase_hwcnt_accum_state { + ACCUM_STATE_ERROR, + ACCUM_STATE_DISABLED, + ACCUM_STATE_ENABLED +}; + +/** + * struct kbase_hwcnt_accumulator - Hardware counter accumulator structure. + * @backend: Pointer to created counter backend. + * @state: The current state of the accumulator. + * - State transition from disabled->enabled or + * disabled->error requires state_lock. + * - State transition from enabled->disabled or + * enabled->error requires both accum_lock and + * state_lock. + * - Error state persists until next disable. + * @enable_map: The current set of enabled counters. + * - Must only be modified while holding both + * accum_lock and state_lock. + * - Can be read while holding either lock. + * - Must stay in sync with enable_map_any_enabled. + * @enable_map_any_enabled: True if any counters in the map are enabled, else + * false. If true, and state is ACCUM_STATE_ENABLED, + * then the counter backend will be enabled. + * - Must only be modified while holding both + * accum_lock and state_lock. + * - Can be read while holding either lock. + * - Must stay in sync with enable_map. + * @scratch_map: Scratch enable map, used as temporary enable map + * storage during dumps. + * - Must only be read or modified while holding + * accum_lock. + * @accum_buf: Accumulation buffer, where dumps will be accumulated + * into on transition to a disable state. + * - Must only be read or modified while holding + * accum_lock. + * @accumulated: True if the accumulation buffer has been accumulated + * into and not subsequently read from yet, else false. + * - Must only be read or modified while holding + * accum_lock. + * @ts_last_dump_ns: Timestamp (ns) of the end time of the most recent + * dump that was requested by the user. + * - Must only be read or modified while holding + * accum_lock. 
+ */ +struct kbase_hwcnt_accumulator { + struct kbase_hwcnt_backend *backend; + enum kbase_hwcnt_accum_state state; + struct kbase_hwcnt_enable_map enable_map; + bool enable_map_any_enabled; + struct kbase_hwcnt_enable_map scratch_map; + struct kbase_hwcnt_dump_buffer accum_buf; + bool accumulated; + u64 ts_last_dump_ns; +}; + +/** + * struct kbase_hwcnt_context - Hardware counter context structure. + * @iface: Pointer to hardware counter backend interface. + * @state_lock: Spinlock protecting state. + * @disable_count: Disable count of the context. Initialised to 1. + * Decremented when the accumulator is acquired, and incremented + * on release. Incremented on calls to + * kbase_hwcnt_context_disable[_atomic], and decremented on + * calls to kbase_hwcnt_context_enable. + * - Must only be read or modified while holding state_lock. + * @accum_lock: Mutex protecting accumulator. + * @accum_inited: Flag to prevent concurrent accumulator initialisation and/or + * termination. Set to true before accumulator initialisation, + * and false after accumulator termination. + * - Must only be modified while holding both accum_lock and + * state_lock. + * - Can be read while holding either lock. + * @accum: Hardware counter accumulator structure. + */ +struct kbase_hwcnt_context { + const struct kbase_hwcnt_backend_interface *iface; + spinlock_t state_lock; + size_t disable_count; + struct mutex accum_lock; + bool accum_inited; + struct kbase_hwcnt_accumulator accum; +}; + +int kbase_hwcnt_context_init( + const struct kbase_hwcnt_backend_interface *iface, + struct kbase_hwcnt_context **out_hctx) +{ + struct kbase_hwcnt_context *hctx = NULL; + + if (!iface || !out_hctx) + return -EINVAL; + + hctx = kzalloc(sizeof(*hctx), GFP_KERNEL); + if (!hctx) + return -ENOMEM; + + hctx->iface = iface; + spin_lock_init(&hctx->state_lock); + hctx->disable_count = 1; + mutex_init(&hctx->accum_lock); + hctx->accum_inited = false; + + *out_hctx = hctx; + + return 0; +} +KBASE_EXPORT_TEST_API(kbase_hwcnt_context_init); + +void kbase_hwcnt_context_term(struct kbase_hwcnt_context *hctx) +{ + if (!hctx) + return; + + /* Make sure we didn't leak the accumulator */ + WARN_ON(hctx->accum_inited); + kfree(hctx); +} +KBASE_EXPORT_TEST_API(kbase_hwcnt_context_term); + +/** + * kbasep_hwcnt_accumulator_term() - Terminate the accumulator for the context. + * @hctx: Non-NULL pointer to hardware counter context. + */ +static void kbasep_hwcnt_accumulator_term(struct kbase_hwcnt_context *hctx) +{ + WARN_ON(!hctx); + WARN_ON(!hctx->accum_inited); + + kbase_hwcnt_enable_map_free(&hctx->accum.scratch_map); + kbase_hwcnt_dump_buffer_free(&hctx->accum.accum_buf); + kbase_hwcnt_enable_map_free(&hctx->accum.enable_map); + hctx->iface->term(hctx->accum.backend); + memset(&hctx->accum, 0, sizeof(hctx->accum)); +} + +/** + * kbasep_hwcnt_accumulator_init() - Initialise the accumulator for the context. + * @hctx: Non-NULL pointer to hardware counter context. + * + * Return: 0 on success, else error code. 
+ */ +static int kbasep_hwcnt_accumulator_init(struct kbase_hwcnt_context *hctx) +{ + int errcode; + + WARN_ON(!hctx); + WARN_ON(!hctx->accum_inited); + + errcode = hctx->iface->init( + hctx->iface->info, &hctx->accum.backend); + if (errcode) + goto error; + + hctx->accum.state = ACCUM_STATE_ERROR; + + errcode = kbase_hwcnt_enable_map_alloc( + hctx->iface->metadata, &hctx->accum.enable_map); + if (errcode) + goto error; + + hctx->accum.enable_map_any_enabled = false; + + errcode = kbase_hwcnt_dump_buffer_alloc( + hctx->iface->metadata, &hctx->accum.accum_buf); + if (errcode) + goto error; + + errcode = kbase_hwcnt_enable_map_alloc( + hctx->iface->metadata, &hctx->accum.scratch_map); + if (errcode) + goto error; + + hctx->accum.accumulated = false; + + hctx->accum.ts_last_dump_ns = + hctx->iface->timestamp_ns(hctx->accum.backend); + + return 0; + +error: + kbasep_hwcnt_accumulator_term(hctx); + return errcode; +} + +/** + * kbasep_hwcnt_accumulator_disable() - Transition the accumulator into the + * disabled state, from the enabled or + * error states. + * @hctx: Non-NULL pointer to hardware counter context. + * @accumulate: True if we should accumulate before disabling, else false. + */ +static void kbasep_hwcnt_accumulator_disable( + struct kbase_hwcnt_context *hctx, bool accumulate) +{ + int errcode = 0; + bool backend_enabled = false; + struct kbase_hwcnt_accumulator *accum; + unsigned long flags; + + WARN_ON(!hctx); + lockdep_assert_held(&hctx->accum_lock); + WARN_ON(!hctx->accum_inited); + + accum = &hctx->accum; + + spin_lock_irqsave(&hctx->state_lock, flags); + + WARN_ON(hctx->disable_count != 0); + WARN_ON(hctx->accum.state == ACCUM_STATE_DISABLED); + + if ((hctx->accum.state == ACCUM_STATE_ENABLED) && + (accum->enable_map_any_enabled)) + backend_enabled = true; + + if (!backend_enabled) + hctx->accum.state = ACCUM_STATE_DISABLED; + + spin_unlock_irqrestore(&hctx->state_lock, flags); + + /* Early out if the backend is not already enabled */ + if (!backend_enabled) + return; + + if (!accumulate) + goto disable; + + /* Try and accumulate before disabling */ + errcode = hctx->iface->dump_request(accum->backend); + if (errcode) + goto disable; + + errcode = hctx->iface->dump_wait(accum->backend); + if (errcode) + goto disable; + + errcode = hctx->iface->dump_get(accum->backend, + &accum->accum_buf, &accum->enable_map, accum->accumulated); + if (errcode) + goto disable; + + accum->accumulated = true; + +disable: + hctx->iface->dump_disable(accum->backend); + + /* Regardless of any errors during the accumulate, put the accumulator + * in the disabled state. + */ + spin_lock_irqsave(&hctx->state_lock, flags); + + hctx->accum.state = ACCUM_STATE_DISABLED; + + spin_unlock_irqrestore(&hctx->state_lock, flags); +} + +/** + * kbasep_hwcnt_accumulator_enable() - Transition the accumulator into the + * enabled state, from the disabled state. + * @hctx: Non-NULL pointer to hardware counter context. 
+ */ +static void kbasep_hwcnt_accumulator_enable(struct kbase_hwcnt_context *hctx) +{ + int errcode = 0; + struct kbase_hwcnt_accumulator *accum; + + WARN_ON(!hctx); + lockdep_assert_held(&hctx->state_lock); + WARN_ON(!hctx->accum_inited); + WARN_ON(hctx->accum.state != ACCUM_STATE_DISABLED); + + accum = &hctx->accum; + + /* The backend only needs enabling if any counters are enabled */ + if (accum->enable_map_any_enabled) + errcode = hctx->iface->dump_enable_nolock( + accum->backend, &accum->enable_map); + + if (!errcode) + accum->state = ACCUM_STATE_ENABLED; + else + accum->state = ACCUM_STATE_ERROR; +} + +/** + * kbasep_hwcnt_accumulator_dump() - Perform a dump with the most up-to-date + * values of enabled counters possible, and + * optionally update the set of enabled + * counters. + * @hctx : Non-NULL pointer to the hardware counter context + * @ts_start_ns: Non-NULL pointer where the start timestamp of the dump will + * be written out to on success + * @ts_end_ns: Non-NULL pointer where the end timestamp of the dump will + * be written out to on success + * @dump_buf: Pointer to the buffer where the dump will be written out to on + * success. If non-NULL, must have the same metadata as the + * accumulator. If NULL, the dump will be discarded. + * @new_map: Pointer to the new counter enable map. If non-NULL, must have + * the same metadata as the accumulator. If NULL, the set of + * enabled counters will be unchanged. + */ +static int kbasep_hwcnt_accumulator_dump( + struct kbase_hwcnt_context *hctx, + u64 *ts_start_ns, + u64 *ts_end_ns, + struct kbase_hwcnt_dump_buffer *dump_buf, + const struct kbase_hwcnt_enable_map *new_map) +{ + int errcode = 0; + unsigned long flags; + enum kbase_hwcnt_accum_state state; + bool dump_requested = false; + bool dump_written = false; + bool cur_map_any_enabled; + struct kbase_hwcnt_enable_map *cur_map; + bool new_map_any_enabled = false; + u64 dump_time_ns; + struct kbase_hwcnt_accumulator *accum; + + WARN_ON(!hctx); + WARN_ON(!ts_start_ns); + WARN_ON(!ts_end_ns); + WARN_ON(dump_buf && (dump_buf->metadata != hctx->iface->metadata)); + WARN_ON(new_map && (new_map->metadata != hctx->iface->metadata)); + WARN_ON(!hctx->accum_inited); + lockdep_assert_held(&hctx->accum_lock); + + accum = &hctx->accum; + cur_map = &accum->scratch_map; + + /* Save out info about the current enable map */ + cur_map_any_enabled = accum->enable_map_any_enabled; + kbase_hwcnt_enable_map_copy(cur_map, &accum->enable_map); + + if (new_map) + new_map_any_enabled = + kbase_hwcnt_enable_map_any_enabled(new_map); + + /* + * We're holding accum_lock, so the accumulator state might transition + * from disabled to enabled during this function (as enabling is lock + * free), but it will never disable (as disabling needs to hold the + * accum_lock), nor will it ever transition from enabled to error (as + * an enable while we're already enabled is impossible). + * + * If we're already disabled, we'll only look at the accumulation buffer + * rather than do a real dump, so a concurrent enable does not affect + * us. + * + * If a concurrent enable fails, we might transition to the error + * state, but again, as we're only looking at the accumulation buffer, + * it's not an issue. + */ + spin_lock_irqsave(&hctx->state_lock, flags); + + state = accum->state; + + /* + * Update the new map now, such that if an enable occurs during this + * dump then that enable will set the new map. If we're already enabled, + * then we'll do it ourselves after the dump. 
+ */ + if (new_map) { + kbase_hwcnt_enable_map_copy( + &accum->enable_map, new_map); + accum->enable_map_any_enabled = new_map_any_enabled; + } + + spin_unlock_irqrestore(&hctx->state_lock, flags); + + /* Error state, so early out. No need to roll back any map updates */ + if (state == ACCUM_STATE_ERROR) + return -EIO; + + /* Initiate the dump if the backend is enabled. */ + if ((state == ACCUM_STATE_ENABLED) && cur_map_any_enabled) { + /* Disable pre-emption, to make the timestamp as accurate as + * possible. + */ + preempt_disable(); + { + dump_time_ns = hctx->iface->timestamp_ns( + accum->backend); + if (dump_buf) { + errcode = hctx->iface->dump_request( + accum->backend); + dump_requested = true; + } else { + errcode = hctx->iface->dump_clear( + accum->backend); + } + } + preempt_enable(); + if (errcode) + goto error; + } else { + dump_time_ns = hctx->iface->timestamp_ns(accum->backend); + } + + /* Copy any accumulation into the dest buffer */ + if (accum->accumulated && dump_buf) { + kbase_hwcnt_dump_buffer_copy( + dump_buf, &accum->accum_buf, cur_map); + dump_written = true; + } + + /* Wait for any requested dumps to complete */ + if (dump_requested) { + WARN_ON(state != ACCUM_STATE_ENABLED); + errcode = hctx->iface->dump_wait(accum->backend); + if (errcode) + goto error; + } + + /* If we're enabled and there's a new enable map, change the enabled set + * as soon after the dump has completed as possible. + */ + if ((state == ACCUM_STATE_ENABLED) && new_map) { + /* Backend is only enabled if there were any enabled counters */ + if (cur_map_any_enabled) + hctx->iface->dump_disable(accum->backend); + + /* (Re-)enable the backend if the new map has enabled counters. + * No need to acquire the spinlock, as concurrent enable while + * we're already enabled and holding accum_lock is impossible. + */ + if (new_map_any_enabled) { + errcode = hctx->iface->dump_enable( + accum->backend, new_map); + if (errcode) + goto error; + } + } + + /* Copy, accumulate, or zero into the dest buffer to finish */ + if (dump_buf) { + /* If we dumped, copy or accumulate it into the destination */ + if (dump_requested) { + WARN_ON(state != ACCUM_STATE_ENABLED); + errcode = hctx->iface->dump_get( + accum->backend, + dump_buf, + cur_map, + dump_written); + if (errcode) + goto error; + dump_written = true; + } + + /* If we've not written anything into the dump buffer so far, it + * means there was nothing to write. Zero any enabled counters. + */ + if (!dump_written) + kbase_hwcnt_dump_buffer_zero(dump_buf, cur_map); + } + + /* Write out timestamps */ + *ts_start_ns = accum->ts_last_dump_ns; + *ts_end_ns = dump_time_ns; + + accum->accumulated = false; + accum->ts_last_dump_ns = dump_time_ns; + + return 0; +error: + /* An error was only physically possible if the backend was enabled */ + WARN_ON(state != ACCUM_STATE_ENABLED); + + /* Disable the backend, and transition to the error state */ + hctx->iface->dump_disable(accum->backend); + spin_lock_irqsave(&hctx->state_lock, flags); + + accum->state = ACCUM_STATE_ERROR; + + spin_unlock_irqrestore(&hctx->state_lock, flags); + + return errcode; +} + +/** + * kbasep_hwcnt_context_disable() - Increment the disable count of the context. + * @hctx: Non-NULL pointer to hardware counter context. + * @accumulate: True if we should accumulate before disabling, else false. 
+ */ +static void kbasep_hwcnt_context_disable( + struct kbase_hwcnt_context *hctx, bool accumulate) +{ + unsigned long flags; + + WARN_ON(!hctx); + lockdep_assert_held(&hctx->accum_lock); + + if (!kbase_hwcnt_context_disable_atomic(hctx)) { + kbasep_hwcnt_accumulator_disable(hctx, accumulate); + + spin_lock_irqsave(&hctx->state_lock, flags); + + /* Atomic disable failed and we're holding the mutex, so current + * disable count must be 0. + */ + WARN_ON(hctx->disable_count != 0); + hctx->disable_count++; + + spin_unlock_irqrestore(&hctx->state_lock, flags); + } +} + +int kbase_hwcnt_accumulator_acquire( + struct kbase_hwcnt_context *hctx, + struct kbase_hwcnt_accumulator **accum) +{ + int errcode = 0; + unsigned long flags; + + if (!hctx || !accum) + return -EINVAL; + + mutex_lock(&hctx->accum_lock); + spin_lock_irqsave(&hctx->state_lock, flags); + + if (!hctx->accum_inited) + /* Set accum initing now to prevent concurrent init */ + hctx->accum_inited = true; + else + /* Already have an accum, or already being inited */ + errcode = -EBUSY; + + spin_unlock_irqrestore(&hctx->state_lock, flags); + mutex_unlock(&hctx->accum_lock); + + if (errcode) + return errcode; + + errcode = kbasep_hwcnt_accumulator_init(hctx); + + if (errcode) { + mutex_lock(&hctx->accum_lock); + spin_lock_irqsave(&hctx->state_lock, flags); + + hctx->accum_inited = false; + + spin_unlock_irqrestore(&hctx->state_lock, flags); + mutex_unlock(&hctx->accum_lock); + + return errcode; + } + + spin_lock_irqsave(&hctx->state_lock, flags); + + WARN_ON(hctx->disable_count == 0); + WARN_ON(hctx->accum.enable_map_any_enabled); + + /* Decrement the disable count to allow the accumulator to be accessible + * now that it's fully constructed. + */ + hctx->disable_count--; + + /* + * Make sure the accumulator is initialised to the correct state. + * Regardless of initial state, counters don't need to be enabled via + * the backend, as the initial enable map has no enabled counters. + */ + hctx->accum.state = (hctx->disable_count == 0) ? + ACCUM_STATE_ENABLED : + ACCUM_STATE_DISABLED; + + spin_unlock_irqrestore(&hctx->state_lock, flags); + + *accum = &hctx->accum; + + return 0; +} +KBASE_EXPORT_TEST_API(kbase_hwcnt_accumulator_acquire); + +void kbase_hwcnt_accumulator_release(struct kbase_hwcnt_accumulator *accum) +{ + unsigned long flags; + struct kbase_hwcnt_context *hctx; + + if (!accum) + return; + + hctx = container_of(accum, struct kbase_hwcnt_context, accum); + + mutex_lock(&hctx->accum_lock); + + /* Double release is a programming error */ + WARN_ON(!hctx->accum_inited); + + /* Disable the context to ensure the accumulator is inaccesible while + * we're destroying it. This performs the corresponding disable count + * increment to the decrement done during acquisition. + */ + kbasep_hwcnt_context_disable(hctx, false); + + mutex_unlock(&hctx->accum_lock); + + kbasep_hwcnt_accumulator_term(hctx); + + mutex_lock(&hctx->accum_lock); + spin_lock_irqsave(&hctx->state_lock, flags); + + hctx->accum_inited = false; + + spin_unlock_irqrestore(&hctx->state_lock, flags); + mutex_unlock(&hctx->accum_lock); +} +KBASE_EXPORT_TEST_API(kbase_hwcnt_accumulator_release); + +void kbase_hwcnt_context_disable(struct kbase_hwcnt_context *hctx) +{ + if (WARN_ON(!hctx)) + return; + + /* Try and atomically disable first, so we can avoid locking the mutex + * if we don't need to. 
+ */ + if (kbase_hwcnt_context_disable_atomic(hctx)) + return; + + mutex_lock(&hctx->accum_lock); + + kbasep_hwcnt_context_disable(hctx, true); + + mutex_unlock(&hctx->accum_lock); +} +KBASE_EXPORT_TEST_API(kbase_hwcnt_context_disable); + +bool kbase_hwcnt_context_disable_atomic(struct kbase_hwcnt_context *hctx) +{ + unsigned long flags; + bool atomic_disabled = false; + + if (WARN_ON(!hctx)) + return false; + + spin_lock_irqsave(&hctx->state_lock, flags); + + if (!WARN_ON(hctx->disable_count == SIZE_MAX)) { + /* + * If disable count is non-zero or no counters are enabled, we + * can just bump the disable count. + * + * Otherwise, we can't disable in an atomic context. + */ + if (hctx->disable_count != 0) { + hctx->disable_count++; + atomic_disabled = true; + } else { + WARN_ON(!hctx->accum_inited); + if (!hctx->accum.enable_map_any_enabled) { + hctx->disable_count++; + hctx->accum.state = ACCUM_STATE_DISABLED; + atomic_disabled = true; + } + } + } + + spin_unlock_irqrestore(&hctx->state_lock, flags); + + return atomic_disabled; +} +KBASE_EXPORT_TEST_API(kbase_hwcnt_context_disable_atomic); + +void kbase_hwcnt_context_enable(struct kbase_hwcnt_context *hctx) +{ + unsigned long flags; + + if (WARN_ON(!hctx)) + return; + + spin_lock_irqsave(&hctx->state_lock, flags); + + if (!WARN_ON(hctx->disable_count == 0)) { + if (hctx->disable_count == 1) + kbasep_hwcnt_accumulator_enable(hctx); + + hctx->disable_count--; + } + + spin_unlock_irqrestore(&hctx->state_lock, flags); +} +KBASE_EXPORT_TEST_API(kbase_hwcnt_context_enable); + +const struct kbase_hwcnt_metadata *kbase_hwcnt_context_metadata( + struct kbase_hwcnt_context *hctx) +{ + if (!hctx) + return NULL; + + return hctx->iface->metadata; +} +KBASE_EXPORT_TEST_API(kbase_hwcnt_context_metadata); + +int kbase_hwcnt_accumulator_set_counters( + struct kbase_hwcnt_accumulator *accum, + const struct kbase_hwcnt_enable_map *new_map, + u64 *ts_start_ns, + u64 *ts_end_ns, + struct kbase_hwcnt_dump_buffer *dump_buf) +{ + int errcode; + struct kbase_hwcnt_context *hctx; + + if (!accum || !new_map || !ts_start_ns || !ts_end_ns) + return -EINVAL; + + hctx = container_of(accum, struct kbase_hwcnt_context, accum); + + if ((new_map->metadata != hctx->iface->metadata) || + (dump_buf && (dump_buf->metadata != hctx->iface->metadata))) + return -EINVAL; + + mutex_lock(&hctx->accum_lock); + + errcode = kbasep_hwcnt_accumulator_dump( + hctx, ts_start_ns, ts_end_ns, dump_buf, new_map); + + mutex_unlock(&hctx->accum_lock); + + return errcode; +} +KBASE_EXPORT_TEST_API(kbase_hwcnt_accumulator_set_counters); + +int kbase_hwcnt_accumulator_dump( + struct kbase_hwcnt_accumulator *accum, + u64 *ts_start_ns, + u64 *ts_end_ns, + struct kbase_hwcnt_dump_buffer *dump_buf) +{ + int errcode; + struct kbase_hwcnt_context *hctx; + + if (!accum || !ts_start_ns || !ts_end_ns) + return -EINVAL; + + hctx = container_of(accum, struct kbase_hwcnt_context, accum); + + if (dump_buf && (dump_buf->metadata != hctx->iface->metadata)) + return -EINVAL; + + mutex_lock(&hctx->accum_lock); + + errcode = kbasep_hwcnt_accumulator_dump( + hctx, ts_start_ns, ts_end_ns, dump_buf, NULL); + + mutex_unlock(&hctx->accum_lock); + + return errcode; +} +KBASE_EXPORT_TEST_API(kbase_hwcnt_accumulator_dump); diff --git a/mali_kbase/mali_kbase_hwcnt_accumulator.h b/mali_kbase/mali_kbase_hwcnt_accumulator.h new file mode 100644 index 0000000..fc45743 --- /dev/null +++ b/mali_kbase/mali_kbase_hwcnt_accumulator.h @@ -0,0 +1,137 @@ +/* + * + * (C) COPYRIGHT 2018 ARM Limited. All rights reserved. 
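A sketch of how a caller might bracket a critical section with the context disable/enable pair above; per the kbase_device documentation earlier in this patch, re-enabling kbdev->hwcnt_gpu_ctx is done under the hwaccess_lock (the caller shown here is an assumption):

unsigned long flags;

/* May sleep if a dump has to be accumulated before disabling. */
kbase_hwcnt_context_disable(kbdev->hwcnt_gpu_ctx);

/* ... work that must not race with counter collection ... */

spin_lock_irqsave(&kbdev->hwaccess_lock, flags);
kbase_hwcnt_context_enable(kbdev->hwcnt_gpu_ctx);
spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags);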
+ * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU licence. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * SPDX-License-Identifier: GPL-2.0 + * + */ + +/** + * Hardware counter accumulator API. + */ + +#ifndef _KBASE_HWCNT_ACCUMULATOR_H_ +#define _KBASE_HWCNT_ACCUMULATOR_H_ + +#include <linux/types.h> + +struct kbase_hwcnt_context; +struct kbase_hwcnt_accumulator; +struct kbase_hwcnt_enable_map; +struct kbase_hwcnt_dump_buffer; + +/** + * kbase_hwcnt_accumulator_acquire() - Acquire the hardware counter accumulator + * for a hardware counter context. + * @hctx: Non-NULL pointer to a hardware counter context. + * @accum: Non-NULL pointer to where the pointer to the created accumulator + * will be stored on success. + * + * There can exist at most one instance of the hardware counter accumulator per + * context at a time. + * + * If multiple clients need access to the hardware counters at the same time, + * then an abstraction built on top of the single instance to the hardware + * counter accumulator is required. + * + * No counters will be enabled with the returned accumulator. A subsequent call + * to kbase_hwcnt_accumulator_set_counters must be used to turn them on. + * + * There are four components to a hardware counter dump: + * - A set of enabled counters + * - A start time + * - An end time + * - A dump buffer containing the accumulated counter values for all enabled + * counters between the start and end times. + * + * For each dump, it is guaranteed that all enabled counters were active for the + * entirety of the period between the start and end times. + * + * It is also guaranteed that the start time of dump "n" is always equal to the + * end time of dump "n - 1". + * + * For all dumps, the values of any counters that were not enabled is undefined. + * + * Return: 0 on success or error code. + */ +int kbase_hwcnt_accumulator_acquire( + struct kbase_hwcnt_context *hctx, + struct kbase_hwcnt_accumulator **accum); + +/** + * kbase_hwcnt_accumulator_release() - Release a hardware counter accumulator. + * @accum: Non-NULL pointer to the hardware counter accumulator. + * + * The accumulator must be released before the context the accumulator was + * created from is terminated. + */ +void kbase_hwcnt_accumulator_release(struct kbase_hwcnt_accumulator *accum); + +/** + * kbase_hwcnt_accumulator_set_counters() - Perform a dump of the currently + * enabled counters, and enable a new + * set of counters that will be used + * for subsequent dumps. + * @accum: Non-NULL pointer to the hardware counter accumulator. + * @new_map: Non-NULL pointer to the new counter enable map. Must have the + * same metadata as the accumulator. + * @ts_start_ns: Non-NULL pointer where the start timestamp of the dump will + * be written out to on success. + * @ts_end_ns: Non-NULL pointer where the end timestamp of the dump will + * be written out to on success. 
+ * @dump_buf: Pointer to the buffer where the dump will be written out to on + * success. If non-NULL, must have the same metadata as the + * accumulator. If NULL, the dump will be discarded. + * + * If this function fails for some unexpected reason (i.e. anything other than + * invalid args), then the accumulator will be put into the error state until + * the parent context is next disabled. + * + * Return: 0 on success or error code. + */ +int kbase_hwcnt_accumulator_set_counters( + struct kbase_hwcnt_accumulator *accum, + const struct kbase_hwcnt_enable_map *new_map, + u64 *ts_start_ns, + u64 *ts_end_ns, + struct kbase_hwcnt_dump_buffer *dump_buf); + +/** + * kbase_hwcnt_accumulator_dump() - Perform a dump of the currently enabled + * counters. + * @accum: Non-NULL pointer to the hardware counter accumulator. + * @ts_start_ns: Non-NULL pointer where the start timestamp of the dump will + * be written out to on success. + * @ts_end_ns: Non-NULL pointer where the end timestamp of the dump will + * be written out to on success. + * @dump_buf: Pointer to the buffer where the dump will be written out to on + * success. If non-NULL, must have the same metadata as the + * accumulator. If NULL, the dump will be discarded. + * + * If this function fails for some unexpected reason (i.e. anything other than + * invalid args), then the accumulator will be put into the error state until + * the parent context is next disabled. + * + * Return: 0 on success or error code. + */ +int kbase_hwcnt_accumulator_dump( + struct kbase_hwcnt_accumulator *accum, + u64 *ts_start_ns, + u64 *ts_end_ns, + struct kbase_hwcnt_dump_buffer *dump_buf); + +#endif /* _KBASE_HWCNT_ACCUMULATOR_H_ */ diff --git a/mali_kbase/mali_kbase_hwcnt_backend.h b/mali_kbase/mali_kbase_hwcnt_backend.h new file mode 100644 index 0000000..b7aa0e1 --- /dev/null +++ b/mali_kbase/mali_kbase_hwcnt_backend.h @@ -0,0 +1,217 @@ +/* + * + * (C) COPYRIGHT 2018 ARM Limited. All rights reserved. + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU licence. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * SPDX-License-Identifier: GPL-2.0 + * + */ + +/* + * Virtual interface for hardware counter backends. + */ + +#ifndef _KBASE_HWCNT_BACKEND_H_ +#define _KBASE_HWCNT_BACKEND_H_ + +#include <linux/types.h> + +struct kbase_hwcnt_metadata; +struct kbase_hwcnt_enable_map; +struct kbase_hwcnt_dump_buffer; + +/* + * struct kbase_hwcnt_backend_info - Opaque pointer to information used to + * create an instance of a hardware counter + * backend. + */ +struct kbase_hwcnt_backend_info; + +/* + * struct kbase_hwcnt_backend_info - Opaque pointer to a hardware counter + * backend, used to perform dumps. + */ +struct kbase_hwcnt_backend; + +/** + * typedef kbase_hwcnt_backend_init_fn - Initialise a counter backend. + * @info: Non-NULL pointer to backend info. + * @out_backend: Non-NULL pointer to where backend is stored on success. 
+ * + * All uses of the created hardware counter backend must be externally + * synchronised. + * + * Return: 0 on success, else error code. + */ +typedef int (*kbase_hwcnt_backend_init_fn)( + const struct kbase_hwcnt_backend_info *info, + struct kbase_hwcnt_backend **out_backend); + +/** + * typedef kbase_hwcnt_backend_term_fn - Terminate a counter backend. + * @backend: Pointer to backend to be terminated. + */ +typedef void (*kbase_hwcnt_backend_term_fn)( + struct kbase_hwcnt_backend *backend); + +/** + * typedef kbase_hwcnt_backend_timestamp_ns_fn - Get the current backend + * timestamp. + * @backend: Non-NULL pointer to backend. + * + * Return: Backend timestamp in nanoseconds. + */ +typedef u64 (*kbase_hwcnt_backend_timestamp_ns_fn)( + struct kbase_hwcnt_backend *backend); + +/** + * typedef kbase_hwcnt_backend_dump_enable_fn - Start counter dumping with the + * backend. + * @backend: Non-NULL pointer to backend. + * @enable_map: Non-NULL pointer to enable map specifying enabled counters. + * + * The enable_map must have been created using the interface's metadata. + * If the backend has already been enabled, an error is returned. + * + * May be called in an atomic context. + * + * Return: 0 on success, else error code. + */ +typedef int (*kbase_hwcnt_backend_dump_enable_fn)( + struct kbase_hwcnt_backend *backend, + const struct kbase_hwcnt_enable_map *enable_map); + +/** + * typedef kbase_hwcnt_backend_dump_enable_nolock_fn - Start counter dumping + * with the backend. + * @backend: Non-NULL pointer to backend. + * @enable_map: Non-NULL pointer to enable map specifying enabled counters. + * + * Exactly the same as kbase_hwcnt_backend_dump_enable_fn(), except must be + * called in an atomic context with the spinlock documented by the specific + * backend interface held. + * + * Return: 0 on success, else error code. + */ +typedef int (*kbase_hwcnt_backend_dump_enable_nolock_fn)( + struct kbase_hwcnt_backend *backend, + const struct kbase_hwcnt_enable_map *enable_map); + +/** + * typedef kbase_hwcnt_backend_dump_disable_fn - Disable counter dumping with + * the backend. + * @backend: Non-NULL pointer to backend. + * + * If the backend is already disabled, does nothing. + * Any undumped counter values since the last dump get will be lost. + */ +typedef void (*kbase_hwcnt_backend_dump_disable_fn)( + struct kbase_hwcnt_backend *backend); + +/** + * typedef kbase_hwcnt_backend_dump_clear_fn - Reset all the current undumped + * counters. + * @backend: Non-NULL pointer to backend. + * + * If the backend is not enabled, returns an error. + * + * Return: 0 on success, else error code. + */ +typedef int (*kbase_hwcnt_backend_dump_clear_fn)( + struct kbase_hwcnt_backend *backend); + +/** + * typedef kbase_hwcnt_backend_dump_request_fn - Request an asynchronous counter + * dump. + * @backend: Non-NULL pointer to backend. + * + * If the backend is not enabled or another dump is already in progress, + * returns an error. + * + * Return: 0 on success, else error code. + */ +typedef int (*kbase_hwcnt_backend_dump_request_fn)( + struct kbase_hwcnt_backend *backend); + +/** + * typedef kbase_hwcnt_backend_dump_wait_fn - Wait until the last requested + * counter dump has completed. + * @backend: Non-NULL pointer to backend. + * + * If the backend is not enabled, returns an error. + * + * Return: 0 on success, else error code. 
+ */ +typedef int (*kbase_hwcnt_backend_dump_wait_fn)( + struct kbase_hwcnt_backend *backend); + +/** + * typedef kbase_hwcnt_backend_dump_get_fn - Copy or accumulate enable the + * counters dumped after the last dump + * request into the dump buffer. + * @backend: Non-NULL pointer to backend. + * @dump_buffer: Non-NULL pointer to destination dump buffer. + * @enable_map: Non-NULL pointer to enable map specifying enabled values. + * @accumulate: True if counters should be accumulated into dump_buffer, rather + * than copied. + * + * If the backend is not enabled, returns an error. + * If a dump is in progress (i.e. dump_wait has not yet returned successfully) + * then the resultant contents of the dump buffer will be undefined. + * + * Return: 0 on success, else error code. + */ +typedef int (*kbase_hwcnt_backend_dump_get_fn)( + struct kbase_hwcnt_backend *backend, + struct kbase_hwcnt_dump_buffer *dump_buffer, + const struct kbase_hwcnt_enable_map *enable_map, + bool accumulate); + +/** + * struct kbase_hwcnt_backend_interface - Hardware counter backend virtual + * interface. + * @metadata: Immutable hardware counter metadata. + * @info: Immutable info used to initialise an instance of the + * backend. + * @init: Function ptr to initialise an instance of the backend. + * @term: Function ptr to terminate an instance of the backend. + * @timestamp_ns: Function ptr to get the current backend timestamp. + * @dump_enable: Function ptr to enable dumping. + * @dump_enable_nolock: Function ptr to enable dumping while the + * backend-specific spinlock is already held. + * @dump_disable: Function ptr to disable dumping. + * @dump_clear: Function ptr to clear counters. + * @dump_request: Function ptr to request a dump. + * @dump_wait: Function ptr to wait until dump to complete. + * @dump_get: Function ptr to copy or accumulate dump into a dump + * buffer. + */ +struct kbase_hwcnt_backend_interface { + const struct kbase_hwcnt_metadata *metadata; + const struct kbase_hwcnt_backend_info *info; + kbase_hwcnt_backend_init_fn init; + kbase_hwcnt_backend_term_fn term; + kbase_hwcnt_backend_timestamp_ns_fn timestamp_ns; + kbase_hwcnt_backend_dump_enable_fn dump_enable; + kbase_hwcnt_backend_dump_enable_nolock_fn dump_enable_nolock; + kbase_hwcnt_backend_dump_disable_fn dump_disable; + kbase_hwcnt_backend_dump_clear_fn dump_clear; + kbase_hwcnt_backend_dump_request_fn dump_request; + kbase_hwcnt_backend_dump_wait_fn dump_wait; + kbase_hwcnt_backend_dump_get_fn dump_get; +}; + +#endif /* _KBASE_HWCNT_BACKEND_H_ */ diff --git a/mali_kbase/mali_kbase_hwcnt_backend_gpu.c b/mali_kbase/mali_kbase_hwcnt_backend_gpu.c new file mode 100644 index 0000000..4bc8916 --- /dev/null +++ b/mali_kbase/mali_kbase_hwcnt_backend_gpu.c @@ -0,0 +1,538 @@ +/* + * + * (C) COPYRIGHT 2018 ARM Limited. All rights reserved. + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU licence. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. 
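 *
 * As an illustrative sketch (not taken from this patch) of how a user of the
 * virtual interface above is expected to drive a backend, where iface, map
 * and buf are assumed to already exist and error handling is omitted:
 *
 *   struct kbase_hwcnt_backend *backend;
 *
 *   iface->init(iface->info, &backend);
 *   iface->dump_enable(backend, map);
 *   iface->dump_request(backend);
 *   iface->dump_wait(backend);
 *   iface->dump_get(backend, buf, map, false);
 *   iface->dump_disable(backend);
 *   iface->term(backend);
 *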
+ * + * SPDX-License-Identifier: GPL-2.0 + * + */ + +#include "mali_kbase_hwcnt_backend_gpu.h" +#include "mali_kbase_hwcnt_gpu.h" +#include "mali_kbase_hwcnt_types.h" +#include "mali_kbase.h" +#include "mali_kbase_pm_policy.h" +#include "mali_kbase_hwaccess_instr.h" +#include "mali_kbase_tlstream.h" +#ifdef CONFIG_MALI_NO_MALI +#include "backend/gpu/mali_kbase_model_dummy.h" +#endif + +/** + * struct kbase_hwcnt_backend_gpu_info - Information used to create an instance + * of a GPU hardware counter backend. + * @kbdev: KBase device. + * @use_secondary: True if secondary performance counters should be used, + * else false. Ignored if secondary counters are not supported. + * @metadata: Hardware counter metadata. + * @dump_bytes: Bytes of GPU memory required to perform a + * hardware counter dump. + */ +struct kbase_hwcnt_backend_gpu_info { + struct kbase_device *kbdev; + bool use_secondary; + const struct kbase_hwcnt_metadata *metadata; + size_t dump_bytes; +}; + +/** + * struct kbase_hwcnt_backend_gpu - Instance of a GPU hardware counter backend. + * @info: Info used to create the backend. + * @kctx: KBase context used for GPU memory allocation and + * counter dumping. + * @kctx_element: List element used to add kctx to device context list. + * @gpu_dump_va: GPU hardware counter dump buffer virtual address. + * @cpu_dump_va: CPU mapping of gpu_dump_va. + * @vmap: Dump buffer vmap. + * @enabled: True if dumping has been enabled, else false. + */ +struct kbase_hwcnt_backend_gpu { + const struct kbase_hwcnt_backend_gpu_info *info; + struct kbase_context *kctx; + struct kbasep_kctx_list_element *kctx_element; + u64 gpu_dump_va; + void *cpu_dump_va; + struct kbase_vmap_struct *vmap; + bool enabled; +}; + +/* GPU backend implementation of kbase_hwcnt_backend_timestamp_ns_fn */ +static u64 kbasep_hwcnt_backend_gpu_timestamp_ns( + struct kbase_hwcnt_backend *backend) +{ + struct timespec ts; + + (void)backend; + getrawmonotonic(&ts); + return (u64)ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec; +} + +/* GPU backend implementation of kbase_hwcnt_backend_dump_enable_nolock_fn */ +static int kbasep_hwcnt_backend_gpu_dump_enable_nolock( + struct kbase_hwcnt_backend *backend, + const struct kbase_hwcnt_enable_map *enable_map) +{ + int errcode; + struct kbase_hwcnt_backend_gpu *backend_gpu = + (struct kbase_hwcnt_backend_gpu *)backend; + struct kbase_context *kctx; + struct kbase_device *kbdev; + struct kbase_hwcnt_physical_enable_map phys; + struct kbase_instr_hwcnt_enable enable; + + if (!backend_gpu || !enable_map || backend_gpu->enabled || + (enable_map->metadata != backend_gpu->info->metadata)) + return -EINVAL; + + kctx = backend_gpu->kctx; + kbdev = backend_gpu->kctx->kbdev; + + lockdep_assert_held(&kbdev->hwaccess_lock); + + kbase_hwcnt_gpu_enable_map_to_physical(&phys, enable_map); + + enable.jm_bm = phys.jm_bm; + enable.shader_bm = phys.shader_bm; + enable.tiler_bm = phys.tiler_bm; + enable.mmu_l2_bm = phys.mmu_l2_bm; + enable.use_secondary = backend_gpu->info->use_secondary; + enable.dump_buffer = backend_gpu->gpu_dump_va; + enable.dump_buffer_bytes = backend_gpu->info->dump_bytes; + + errcode = kbase_instr_hwcnt_enable_internal(kbdev, kctx, &enable); + if (errcode) + goto error; + + backend_gpu->enabled = true; + + return 0; +error: + return errcode; +} + +/* GPU backend implementation of kbase_hwcnt_backend_dump_enable_fn */ +static int kbasep_hwcnt_backend_gpu_dump_enable( + struct kbase_hwcnt_backend *backend, + const struct kbase_hwcnt_enable_map *enable_map) +{ + unsigned long flags; + int 
errcode; + struct kbase_hwcnt_backend_gpu *backend_gpu = + (struct kbase_hwcnt_backend_gpu *)backend; + struct kbase_device *kbdev; + + if (!backend_gpu) + return -EINVAL; + + kbdev = backend_gpu->kctx->kbdev; + + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); + + errcode = kbasep_hwcnt_backend_gpu_dump_enable_nolock( + backend, enable_map); + + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + + return errcode; +} + +/* GPU backend implementation of kbase_hwcnt_backend_dump_disable_fn */ +static void kbasep_hwcnt_backend_gpu_dump_disable( + struct kbase_hwcnt_backend *backend) +{ + int errcode; + struct kbase_hwcnt_backend_gpu *backend_gpu = + (struct kbase_hwcnt_backend_gpu *)backend; + + if (WARN_ON(!backend_gpu) || !backend_gpu->enabled) + return; + + errcode = kbase_instr_hwcnt_disable_internal(backend_gpu->kctx); + WARN_ON(errcode); + + backend_gpu->enabled = false; +} + +/* GPU backend implementation of kbase_hwcnt_backend_dump_clear_fn */ +static int kbasep_hwcnt_backend_gpu_dump_clear( + struct kbase_hwcnt_backend *backend) +{ + struct kbase_hwcnt_backend_gpu *backend_gpu = + (struct kbase_hwcnt_backend_gpu *)backend; + + if (!backend_gpu || !backend_gpu->enabled) + return -EINVAL; + + return kbase_instr_hwcnt_clear(backend_gpu->kctx); +} + +/* GPU backend implementation of kbase_hwcnt_backend_dump_request_fn */ +static int kbasep_hwcnt_backend_gpu_dump_request( + struct kbase_hwcnt_backend *backend) +{ + struct kbase_hwcnt_backend_gpu *backend_gpu = + (struct kbase_hwcnt_backend_gpu *)backend; + + if (!backend_gpu || !backend_gpu->enabled) + return -EINVAL; + + return kbase_instr_hwcnt_request_dump(backend_gpu->kctx); +} + +/* GPU backend implementation of kbase_hwcnt_backend_dump_wait_fn */ +static int kbasep_hwcnt_backend_gpu_dump_wait( + struct kbase_hwcnt_backend *backend) +{ + struct kbase_hwcnt_backend_gpu *backend_gpu = + (struct kbase_hwcnt_backend_gpu *)backend; + + if (!backend_gpu || !backend_gpu->enabled) + return -EINVAL; + + return kbase_instr_hwcnt_wait_for_dump(backend_gpu->kctx); +} + +/* GPU backend implementation of kbase_hwcnt_backend_dump_get_fn */ +static int kbasep_hwcnt_backend_gpu_dump_get( + struct kbase_hwcnt_backend *backend, + struct kbase_hwcnt_dump_buffer *dst, + const struct kbase_hwcnt_enable_map *dst_enable_map, + bool accumulate) +{ + struct kbase_hwcnt_backend_gpu *backend_gpu = + (struct kbase_hwcnt_backend_gpu *)backend; + + if (!backend_gpu || !dst || !dst_enable_map || + (backend_gpu->info->metadata != dst->metadata) || + (dst_enable_map->metadata != dst->metadata)) + return -EINVAL; + + /* Invalidate the kernel buffer before reading from it. */ + kbase_sync_mem_regions( + backend_gpu->kctx, backend_gpu->vmap, KBASE_SYNC_TO_CPU); + + return kbase_hwcnt_gpu_dump_get( + dst, backend_gpu->cpu_dump_va, dst_enable_map, accumulate); +} + +/** + * kbasep_hwcnt_backend_gpu_dump_alloc() - Allocate a GPU dump buffer. + * @info: Non-NULL pointer to GPU backend info. + * @kctx: Non-NULL pointer to kbase context. + * @gpu_dump_va: Non-NULL pointer to where GPU dump buffer virtual address + * is stored on success. + * + * Return: 0 on success, else error code. 
+ */ +static int kbasep_hwcnt_backend_gpu_dump_alloc( + const struct kbase_hwcnt_backend_gpu_info *info, + struct kbase_context *kctx, + u64 *gpu_dump_va) +{ + struct kbase_va_region *reg; + u64 flags; + u64 nr_pages; + + WARN_ON(!info); + WARN_ON(!kctx); + WARN_ON(!gpu_dump_va); + + flags = BASE_MEM_PROT_CPU_RD | + BASE_MEM_PROT_GPU_WR | + BASE_MEM_PERMANENT_KERNEL_MAPPING | + BASE_MEM_CACHED_CPU; + + if (kctx->kbdev->mmu_mode->flags & KBASE_MMU_MODE_HAS_NON_CACHEABLE) + flags |= BASE_MEM_UNCACHED_GPU; + + nr_pages = PFN_UP(info->dump_bytes); + + reg = kbase_mem_alloc(kctx, nr_pages, nr_pages, 0, &flags, gpu_dump_va); + + if (!reg) + return -ENOMEM; + + return 0; +} + +/** + * kbasep_hwcnt_backend_gpu_dump_free() - Free an allocated GPU dump buffer. + * @kctx: Non-NULL pointer to kbase context. + * @gpu_dump_va: GPU dump buffer virtual address. + */ +static void kbasep_hwcnt_backend_gpu_dump_free( + struct kbase_context *kctx, + u64 gpu_dump_va) +{ + WARN_ON(!kctx); + if (gpu_dump_va) + kbase_mem_free(kctx, gpu_dump_va); +} + +/** + * kbasep_hwcnt_backend_gpu_destroy() - Destroy a GPU backend. + * @backend: Pointer to GPU backend to destroy. + * + * Can be safely called on a backend in any state of partial construction. + */ +static void kbasep_hwcnt_backend_gpu_destroy( + struct kbase_hwcnt_backend_gpu *backend) +{ + if (!backend) + return; + + if (backend->kctx) { + struct kbase_context *kctx = backend->kctx; + struct kbase_device *kbdev = kctx->kbdev; + + if (backend->cpu_dump_va) + kbase_phy_alloc_mapping_put(kctx, backend->vmap); + + if (backend->gpu_dump_va) + kbasep_hwcnt_backend_gpu_dump_free( + kctx, backend->gpu_dump_va); + + if (backend->kctx_element) { + mutex_lock(&kbdev->kctx_list_lock); + + KBASE_TLSTREAM_TL_DEL_CTX(kctx); + list_del(&backend->kctx_element->link); + + mutex_unlock(&kbdev->kctx_list_lock); + kfree(backend->kctx_element); + } + + kbasep_js_release_privileged_ctx(kbdev, kctx); + kbase_destroy_context(kctx); + } + + kfree(backend); +} + +/** + * kbasep_hwcnt_backend_gpu_create() - Create a GPU backend. + * @info: Non-NULL pointer to backend info. + * @out_backend: Non-NULL pointer to where backend is stored on success. + * + * Return: 0 on success, else error code. + */ +static int kbasep_hwcnt_backend_gpu_create( + const struct kbase_hwcnt_backend_gpu_info *info, + struct kbase_hwcnt_backend_gpu **out_backend) +{ + int errcode; + struct kbase_device *kbdev; + struct kbase_hwcnt_backend_gpu *backend = NULL; + + WARN_ON(!info); + WARN_ON(!out_backend); + + kbdev = info->kbdev; + + backend = kzalloc(sizeof(*backend), GFP_KERNEL); + if (!backend) + goto alloc_error; + + backend->info = info; + + backend->kctx = kbase_create_context(kbdev, true); + if (!backend->kctx) + goto alloc_error; + + kbasep_js_schedule_privileged_ctx(kbdev, backend->kctx); + + backend->kctx_element = kzalloc( + sizeof(*backend->kctx_element), GFP_KERNEL); + if (!backend->kctx_element) + goto alloc_error; + + backend->kctx_element->kctx = backend->kctx; + + /* Add kernel context to list of contexts associated with device. 
*/ + mutex_lock(&kbdev->kctx_list_lock); + + list_add(&backend->kctx_element->link, &kbdev->kctx_list); + /* Fire tracepoint while lock is held, to ensure tracepoint is not + * created in both body and summary stream + */ + KBASE_TLSTREAM_TL_NEW_CTX( + backend->kctx, backend->kctx->id, (u32)(backend->kctx->tgid)); + + mutex_unlock(&kbdev->kctx_list_lock); + + errcode = kbasep_hwcnt_backend_gpu_dump_alloc( + info, backend->kctx, &backend->gpu_dump_va); + if (errcode) + goto error; + + backend->cpu_dump_va = kbase_phy_alloc_mapping_get(backend->kctx, + backend->gpu_dump_va, &backend->vmap); + if (!backend->cpu_dump_va) + goto alloc_error; + +#ifdef CONFIG_MALI_NO_MALI + /* The dummy model needs the CPU mapping. */ + gpu_model_set_dummy_prfcnt_base_cpu(backend->cpu_dump_va); +#endif + + *out_backend = backend; + return 0; + +alloc_error: + errcode = -ENOMEM; +error: + kbasep_hwcnt_backend_gpu_destroy(backend); + return errcode; +} + +/* GPU backend implementation of kbase_hwcnt_backend_init_fn */ +static int kbasep_hwcnt_backend_gpu_init( + const struct kbase_hwcnt_backend_info *info, + struct kbase_hwcnt_backend **out_backend) +{ + int errcode; + struct kbase_hwcnt_backend_gpu *backend = NULL; + + if (!info || !out_backend) + return -EINVAL; + + errcode = kbasep_hwcnt_backend_gpu_create( + (const struct kbase_hwcnt_backend_gpu_info *) info, &backend); + if (errcode) + return errcode; + + *out_backend = (struct kbase_hwcnt_backend *)backend; + + return 0; +} + +/* GPU backend implementation of kbase_hwcnt_backend_term_fn */ +static void kbasep_hwcnt_backend_gpu_term(struct kbase_hwcnt_backend *backend) +{ + if (!backend) + return; + + kbasep_hwcnt_backend_gpu_dump_disable(backend); + kbasep_hwcnt_backend_gpu_destroy( + (struct kbase_hwcnt_backend_gpu *)backend); +} + +/** + * kbasep_hwcnt_backend_gpu_info_destroy() - Destroy a GPU backend info. + * @info: Pointer to info to destroy. + * + * Can be safely called on a backend info in any state of partial construction. + */ +static void kbasep_hwcnt_backend_gpu_info_destroy( + const struct kbase_hwcnt_backend_gpu_info *info) +{ + if (!info) + return; + + kbase_hwcnt_gpu_metadata_destroy(info->metadata); + kfree(info); +} + +/** + * kbasep_hwcnt_backend_gpu_info_create() - Create a GPU backend info. + * @kbdev: Non_NULL pointer to kbase device. + * @out_info: Non-NULL pointer to where info is stored on success. + * + * Return 0 on success, else error code. 
+ */ +static int kbasep_hwcnt_backend_gpu_info_create( + struct kbase_device *kbdev, + const struct kbase_hwcnt_backend_gpu_info **out_info) +{ + int errcode = -ENOMEM; + struct kbase_hwcnt_gpu_info hwcnt_gpu_info; + struct kbase_hwcnt_backend_gpu_info *info = NULL; + + WARN_ON(!kbdev); + WARN_ON(!out_info); + + errcode = kbase_hwcnt_gpu_info_init(kbdev, &hwcnt_gpu_info); + if (errcode) + return errcode; + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) + goto error; + + info->kbdev = kbdev; + +#ifdef CONFIG_MALI_PRFCNT_SET_SECONDARY + info->use_secondary = true; +#else + info->use_secondary = false; +#endif + + errcode = kbase_hwcnt_gpu_metadata_create( + &hwcnt_gpu_info, info->use_secondary, + &info->metadata, + &info->dump_bytes); + if (errcode) + goto error; + + *out_info = info; + + return 0; +error: + kbasep_hwcnt_backend_gpu_info_destroy(info); + return errcode; +} + +int kbase_hwcnt_backend_gpu_create( + struct kbase_device *kbdev, + struct kbase_hwcnt_backend_interface *iface) +{ + int errcode; + const struct kbase_hwcnt_backend_gpu_info *info = NULL; + + if (!kbdev || !iface) + return -EINVAL; + + errcode = kbasep_hwcnt_backend_gpu_info_create(kbdev, &info); + + if (errcode) + return errcode; + + iface->metadata = info->metadata; + iface->info = (struct kbase_hwcnt_backend_info *)info; + iface->init = kbasep_hwcnt_backend_gpu_init; + iface->term = kbasep_hwcnt_backend_gpu_term; + iface->timestamp_ns = kbasep_hwcnt_backend_gpu_timestamp_ns; + iface->dump_enable = kbasep_hwcnt_backend_gpu_dump_enable; + iface->dump_enable_nolock = kbasep_hwcnt_backend_gpu_dump_enable_nolock; + iface->dump_disable = kbasep_hwcnt_backend_gpu_dump_disable; + iface->dump_clear = kbasep_hwcnt_backend_gpu_dump_clear; + iface->dump_request = kbasep_hwcnt_backend_gpu_dump_request; + iface->dump_wait = kbasep_hwcnt_backend_gpu_dump_wait; + iface->dump_get = kbasep_hwcnt_backend_gpu_dump_get; + + return 0; +} + +void kbase_hwcnt_backend_gpu_destroy( + struct kbase_hwcnt_backend_interface *iface) +{ + if (!iface) + return; + + kbasep_hwcnt_backend_gpu_info_destroy( + (const struct kbase_hwcnt_backend_gpu_info *)iface->info); + memset(iface, 0, sizeof(*iface)); +} diff --git a/mali_kbase/mali_kbase_hwcnt_backend_gpu.h b/mali_kbase/mali_kbase_hwcnt_backend_gpu.h new file mode 100644 index 0000000..7712f14 --- /dev/null +++ b/mali_kbase/mali_kbase_hwcnt_backend_gpu.h @@ -0,0 +1,61 @@ +/* + * + * (C) COPYRIGHT 2018 ARM Limited. All rights reserved. + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU licence. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * SPDX-License-Identifier: GPL-2.0 + * + */ + +/** + * Concrete implementation of mali_kbase_hwcnt_backend interface for GPU + * backend. 
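 *
 * A minimal creation and teardown sketch (illustrative only; kbdev is an
 * assumed, already probed device pointer and error handling is omitted):
 *
 *   struct kbase_hwcnt_backend_interface iface;
 *
 *   kbase_hwcnt_backend_gpu_create(kbdev, &iface);
 *   (hand iface to a hardware counter context, see the context API)
 *   kbase_hwcnt_backend_gpu_destroy(&iface);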
+ */ + +#ifndef _KBASE_HWCNT_BACKEND_GPU_H_ +#define _KBASE_HWCNT_BACKEND_GPU_H_ + +#include "mali_kbase_hwcnt_backend.h" + +struct kbase_device; + +/** + * kbase_hwcnt_backend_gpu_create() - Create a GPU hardware counter backend + * interface. + * @kbdev: Non-NULL pointer to kbase device. + * @iface: Non-NULL pointer to backend interface structure that is filled in + * on creation success. + * + * Calls to iface->dump_enable_nolock() require kbdev->hwaccess_lock held. + * + * Return: 0 on success, else error code. + */ +int kbase_hwcnt_backend_gpu_create( + struct kbase_device *kbdev, + struct kbase_hwcnt_backend_interface *iface); + +/** + * kbase_hwcnt_backend_gpu_destroy() - Destroy a GPU hardware counter backend + * interface. + * @iface: Pointer to interface to destroy. + * + * Can be safely called on an all-zeroed interface, or on an already destroyed + * interface. + */ +void kbase_hwcnt_backend_gpu_destroy( + struct kbase_hwcnt_backend_interface *iface); + +#endif /* _KBASE_HWCNT_BACKEND_GPU_H_ */ diff --git a/mali_kbase/mali_kbase_hwcnt_context.h b/mali_kbase/mali_kbase_hwcnt_context.h new file mode 100644 index 0000000..bc50ad1 --- /dev/null +++ b/mali_kbase/mali_kbase_hwcnt_context.h @@ -0,0 +1,119 @@ +/* + * + * (C) COPYRIGHT 2018 ARM Limited. All rights reserved. + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU licence. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * SPDX-License-Identifier: GPL-2.0 + * + */ + +/** + * Hardware counter context API. + */ + +#ifndef _KBASE_HWCNT_CONTEXT_H_ +#define _KBASE_HWCNT_CONTEXT_H_ + +#include <linux/types.h> + +struct kbase_hwcnt_backend_interface; +struct kbase_hwcnt_context; + +/** + * kbase_hwcnt_context_init() - Initialise a hardware counter context. + * @iface: Non-NULL pointer to a hardware counter backend interface. + * @out_hctx: Non-NULL pointer to where the pointer to the created context will + * be stored on success. + * + * On creation, the disable count of the context will be 0. + * A hardware counter accumulator can be acquired using a created context. + * + * Return: 0 on success, else error code. + */ +int kbase_hwcnt_context_init( + const struct kbase_hwcnt_backend_interface *iface, + struct kbase_hwcnt_context **out_hctx); + +/** + * kbase_hwcnt_context_term() - Terminate a hardware counter context. + * @hctx: Pointer to context to be terminated. + */ +void kbase_hwcnt_context_term(struct kbase_hwcnt_context *hctx); + +/** + * kbase_hwcnt_context_metadata() - Get the hardware counter metadata used by + * the context, so related counter data + * structures can be created. + * @hctx: Non-NULL pointer to the hardware counter context. + * + * Return: Non-NULL pointer to metadata, or NULL on error. + */ +const struct kbase_hwcnt_metadata *kbase_hwcnt_context_metadata( + struct kbase_hwcnt_context *hctx); + +/** + * kbase_hwcnt_context_disable() - Increment the disable count of the context. 
+ * @hctx: Pointer to the hardware counter context. + * + * If a call to this function increments the disable count from 0 to 1, and + * an accumulator has been acquired, then a counter dump will be performed + * before counters are disabled via the backend interface. + * + * Subsequent dumps via the accumulator while counters are disabled will first + * return the accumulated dump, then will return dumps with zeroed counters. + * + * After this function call returns, it is guaranteed that counters will not be + * enabled via the backend interface. + */ +void kbase_hwcnt_context_disable(struct kbase_hwcnt_context *hctx); + +/** + * kbase_hwcnt_context_disable_atomic() - Increment the disable count of the + * context if possible in an atomic + * context. + * @hctx: Pointer to the hardware counter context. + * + * This function will only succeed if hardware counters are effectively already + * disabled, i.e. there is no accumulator, the disable count is already + * non-zero, or the accumulator has no counters set. + * + * After this function call returns true, it is guaranteed that counters will + * not be enabled via the backend interface. + * + * Return: True if the disable count was incremented, else False. + */ +bool kbase_hwcnt_context_disable_atomic(struct kbase_hwcnt_context *hctx); + +/** + * kbase_hwcnt_context_enable() - Decrement the disable count of the context. + * @hctx: Pointer to the hardware counter context. + * + * If a call to this function decrements the disable count from 1 to 0, and + * an accumulator has been acquired, then counters will be re-enabled via the + * backend interface. + * + * If an accumulator has been acquired and enabling counters fails for some + * reason, the accumulator will be placed into an error state. + * + * It is only valid to call this function one time for each prior returned call + * to kbase_hwcnt_context_disable. + * + * The spinlock documented in the backend interface that was passed in to + * kbase_hwcnt_context_init() must be held before calling this function. + */ +void kbase_hwcnt_context_enable(struct kbase_hwcnt_context *hctx); + +#endif /* _KBASE_HWCNT_CONTEXT_H_ */ diff --git a/mali_kbase/mali_kbase_hwcnt_gpu.c b/mali_kbase/mali_kbase_hwcnt_gpu.c new file mode 100644 index 0000000..647d3ec --- /dev/null +++ b/mali_kbase/mali_kbase_hwcnt_gpu.c @@ -0,0 +1,716 @@ +/* + * + * (C) COPYRIGHT 2018 ARM Limited. All rights reserved. + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU licence. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. 
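 *
 * Putting the context, accumulator and GPU backend pieces above together, a
 * plausible end-to-end setup is sketched below (illustrative only; kbdev is
 * an assumed device pointer, error handling is omitted, and counters would
 * be configured via kbase_hwcnt_accumulator_set_counters() after acquire):
 *
 *   struct kbase_hwcnt_backend_interface iface;
 *   struct kbase_hwcnt_context *hctx;
 *   struct kbase_hwcnt_accumulator *accum;
 *
 *   kbase_hwcnt_backend_gpu_create(kbdev, &iface);
 *   kbase_hwcnt_context_init(&iface, &hctx);
 *   kbase_hwcnt_accumulator_acquire(hctx, &accum);
 *   (dumps are taken here; kbase_hwcnt_context_disable()/enable() can be
 *    nested around periods when counters must be off, with enable called
 *    under the backend's documented spinlock)
 *   kbase_hwcnt_accumulator_release(accum);
 *   kbase_hwcnt_context_term(hctx);
 *   kbase_hwcnt_backend_gpu_destroy(&iface);
 *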
+ * + * SPDX-License-Identifier: GPL-2.0 + * + */ + +#include "mali_kbase_hwcnt_gpu.h" +#include "mali_kbase_hwcnt_types.h" +#include "mali_kbase.h" +#ifdef CONFIG_MALI_NO_MALI +#include "backend/gpu/mali_kbase_model_dummy.h" +#endif + +#define KBASE_HWCNT_V4_BLOCKS_PER_GROUP 8 +#define KBASE_HWCNT_V4_SC_BLOCKS_PER_GROUP 4 +#define KBASE_HWCNT_V4_MAX_GROUPS \ + (KBASE_HWCNT_AVAIL_MASK_BITS / KBASE_HWCNT_V4_BLOCKS_PER_GROUP) +#define KBASE_HWCNT_V4_HEADERS_PER_BLOCK 4 +#define KBASE_HWCNT_V4_COUNTERS_PER_BLOCK 60 +#define KBASE_HWCNT_V4_VALUES_PER_BLOCK \ + (KBASE_HWCNT_V4_HEADERS_PER_BLOCK + KBASE_HWCNT_V4_COUNTERS_PER_BLOCK) +/* Index of the PRFCNT_EN header into a V4 counter block */ +#define KBASE_HWCNT_V4_PRFCNT_EN_HEADER 2 + +#define KBASE_HWCNT_V5_BLOCK_TYPE_COUNT 4 +#define KBASE_HWCNT_V5_HEADERS_PER_BLOCK 4 +#define KBASE_HWCNT_V5_COUNTERS_PER_BLOCK 60 +#define KBASE_HWCNT_V5_VALUES_PER_BLOCK \ + (KBASE_HWCNT_V5_HEADERS_PER_BLOCK + KBASE_HWCNT_V5_COUNTERS_PER_BLOCK) +/* Index of the PRFCNT_EN header into a V5 counter block */ +#define KBASE_HWCNT_V5_PRFCNT_EN_HEADER 2 + +/** + * kbasep_hwcnt_backend_gpu_metadata_v4_create() - Create hardware counter + * metadata for a v4 GPU. + * @v4_info: Non-NULL pointer to hwcnt info for a v4 GPU. + * @metadata: Non-NULL pointer to where created metadata is stored on success. + * + * Return: 0 on success, else error code. + */ +static int kbasep_hwcnt_backend_gpu_metadata_v4_create( + const struct kbase_hwcnt_gpu_v4_info *v4_info, + const struct kbase_hwcnt_metadata **metadata) +{ + size_t grp; + int errcode = -ENOMEM; + struct kbase_hwcnt_description desc; + struct kbase_hwcnt_group_description *grps; + size_t avail_mask_bit; + + WARN_ON(!v4_info); + WARN_ON(!metadata); + + /* Check if there are enough bits in the availability mask to represent + * all the hardware counter blocks in the system. + */ + if (v4_info->cg_count > KBASE_HWCNT_V4_MAX_GROUPS) + return -EINVAL; + + grps = kcalloc(v4_info->cg_count, sizeof(*grps), GFP_KERNEL); + if (!grps) + goto clean_up; + + desc.grp_cnt = v4_info->cg_count; + desc.grps = grps; + + for (grp = 0; grp < v4_info->cg_count; grp++) { + size_t blk; + size_t sc; + const u64 core_mask = v4_info->cgs[grp].core_mask; + struct kbase_hwcnt_block_description *blks = kcalloc( + KBASE_HWCNT_V4_BLOCKS_PER_GROUP, + sizeof(*blks), + GFP_KERNEL); + + if (!blks) + goto clean_up; + + grps[grp].type = KBASE_HWCNT_GPU_GROUP_TYPE_V4; + grps[grp].blk_cnt = KBASE_HWCNT_V4_BLOCKS_PER_GROUP; + grps[grp].blks = blks; + + for (blk = 0; blk < KBASE_HWCNT_V4_BLOCKS_PER_GROUP; blk++) { + blks[blk].inst_cnt = 1; + blks[blk].hdr_cnt = + KBASE_HWCNT_V4_HEADERS_PER_BLOCK; + blks[blk].ctr_cnt = + KBASE_HWCNT_V4_COUNTERS_PER_BLOCK; + } + + for (sc = 0; sc < KBASE_HWCNT_V4_SC_BLOCKS_PER_GROUP; sc++) { + blks[sc].type = core_mask & (1ull << sc) ? + KBASE_HWCNT_GPU_V4_BLOCK_TYPE_SHADER : + KBASE_HWCNT_GPU_V4_BLOCK_TYPE_RESERVED; + } + + blks[4].type = KBASE_HWCNT_GPU_V4_BLOCK_TYPE_TILER; + blks[5].type = KBASE_HWCNT_GPU_V4_BLOCK_TYPE_MMU_L2; + blks[6].type = KBASE_HWCNT_GPU_V4_BLOCK_TYPE_RESERVED; + blks[7].type = (grp == 0) ? 
+ KBASE_HWCNT_GPU_V4_BLOCK_TYPE_JM : + KBASE_HWCNT_GPU_V4_BLOCK_TYPE_RESERVED; + + WARN_ON(KBASE_HWCNT_V4_BLOCKS_PER_GROUP != 8); + } + + /* Initialise the availability mask */ + desc.avail_mask = 0; + avail_mask_bit = 0; + + for (grp = 0; grp < desc.grp_cnt; grp++) { + size_t blk; + const struct kbase_hwcnt_block_description *blks = + desc.grps[grp].blks; + for (blk = 0; blk < desc.grps[grp].blk_cnt; blk++) { + WARN_ON(blks[blk].inst_cnt != 1); + if (blks[blk].type != + KBASE_HWCNT_GPU_V4_BLOCK_TYPE_RESERVED) + desc.avail_mask |= (1ull << avail_mask_bit); + + avail_mask_bit++; + } + } + + errcode = kbase_hwcnt_metadata_create(&desc, metadata); + + /* Always clean up, as metadata will make a copy of the input args */ +clean_up: + if (grps) { + for (grp = 0; grp < v4_info->cg_count; grp++) + kfree(grps[grp].blks); + kfree(grps); + } + return errcode; +} + +/** + * kbasep_hwcnt_backend_gpu_v4_dump_bytes() - Get the raw dump buffer size for a + * V4 GPU. + * @v4_info: Non-NULL pointer to hwcnt info for a v4 GPU. + * + * Return: Size of buffer the V4 GPU needs to perform a counter dump. + */ +static size_t kbasep_hwcnt_backend_gpu_v4_dump_bytes( + const struct kbase_hwcnt_gpu_v4_info *v4_info) +{ + return v4_info->cg_count * + KBASE_HWCNT_V4_BLOCKS_PER_GROUP * + KBASE_HWCNT_V4_VALUES_PER_BLOCK * + KBASE_HWCNT_VALUE_BYTES; +} + +/** + * kbasep_hwcnt_backend_gpu_metadata_v5_create() - Create hardware counter + * metadata for a v5 GPU. + * @v5_info: Non-NULL pointer to hwcnt info for a v5 GPU. + * @use_secondary: True if secondary performance counters should be used, else + * false. Ignored if secondary counters are not supported. + * @metadata: Non-NULL pointer to where created metadata is stored + * on success. + * + * Return: 0 on success, else error code. + */ +static int kbasep_hwcnt_backend_gpu_metadata_v5_create( + const struct kbase_hwcnt_gpu_v5_info *v5_info, + bool use_secondary, + const struct kbase_hwcnt_metadata **metadata) +{ + struct kbase_hwcnt_description desc; + struct kbase_hwcnt_group_description group; + struct kbase_hwcnt_block_description + blks[KBASE_HWCNT_V5_BLOCK_TYPE_COUNT]; + size_t non_sc_block_count; + size_t sc_block_count; + + WARN_ON(!v5_info); + WARN_ON(!metadata); + + /* Calculate number of block instances that aren't shader cores */ + non_sc_block_count = 2 + v5_info->l2_count; + /* Calculate number of block instances that are shader cores */ + sc_block_count = fls64(v5_info->core_mask); + + /* + * A system can have up to 64 shader cores, but the 64-bit + * availability mask can't physically represent that many cores as well + * as the other hardware blocks. + * Error out if there are more blocks than our implementation can + * support. + */ + if ((sc_block_count + non_sc_block_count) > KBASE_HWCNT_AVAIL_MASK_BITS) + return -EINVAL; + + /* One Job Manager block */ + blks[0].type = KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_JM; + blks[0].inst_cnt = 1; + blks[0].hdr_cnt = KBASE_HWCNT_V5_HEADERS_PER_BLOCK; + blks[0].ctr_cnt = KBASE_HWCNT_V5_COUNTERS_PER_BLOCK; + + /* One Tiler block */ + blks[1].type = KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_TILER; + blks[1].inst_cnt = 1; + blks[1].hdr_cnt = KBASE_HWCNT_V5_HEADERS_PER_BLOCK; + blks[1].ctr_cnt = KBASE_HWCNT_V5_COUNTERS_PER_BLOCK; + + /* l2_count memsys blks */ + blks[2].type = use_secondary ? 
+ KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_MEMSYS2 : + KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_MEMSYS; + blks[2].inst_cnt = v5_info->l2_count; + blks[2].hdr_cnt = KBASE_HWCNT_V5_HEADERS_PER_BLOCK; + blks[2].ctr_cnt = KBASE_HWCNT_V5_COUNTERS_PER_BLOCK; + + /* + * There are as many shader cores in the system as there are bits set in + * the core mask. However, the dump buffer memory requirements need to + * take into account the fact that the core mask may be non-contiguous. + * + * For example, a system with a core mask of 0b1011 has the same dump + * buffer memory requirements as a system with 0b1111, but requires more + * memory than a system with 0b0111. However, core 2 of the system with + * 0b1011 doesn't physically exist, and the dump buffer memory that + * accounts for that core will never be written to when we do a counter + * dump. + * + * We find the core mask's last set bit to determine the memory + * requirements, and embed the core mask into the availability mask so + * we can determine later which shader cores physically exist. + */ + blks[3].type = use_secondary ? + KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_SC2 : + KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_SC; + blks[3].inst_cnt = sc_block_count; + blks[3].hdr_cnt = KBASE_HWCNT_V5_HEADERS_PER_BLOCK; + blks[3].ctr_cnt = KBASE_HWCNT_V5_COUNTERS_PER_BLOCK; + + WARN_ON(KBASE_HWCNT_V5_BLOCK_TYPE_COUNT != 4); + + group.type = KBASE_HWCNT_GPU_GROUP_TYPE_V5; + group.blk_cnt = KBASE_HWCNT_V5_BLOCK_TYPE_COUNT; + group.blks = blks; + + desc.grp_cnt = 1; + desc.grps = &group; + + /* The JM, Tiler, and L2s are always available, and are before cores */ + desc.avail_mask = (1ull << non_sc_block_count) - 1; + /* Embed the core mask directly in the availability mask */ + desc.avail_mask |= (v5_info->core_mask << non_sc_block_count); + + return kbase_hwcnt_metadata_create(&desc, metadata); +} + +/** + * kbasep_hwcnt_backend_gpu_v5_dump_bytes() - Get the raw dump buffer size for a + * V5 GPU. + * @v5_info: Non-NULL pointer to hwcnt info for a v5 GPU. + * + * Return: Size of buffer the V5 GPU needs to perform a counter dump. + */ +static size_t kbasep_hwcnt_backend_gpu_v5_dump_bytes( + const struct kbase_hwcnt_gpu_v5_info *v5_info) +{ + WARN_ON(!v5_info); + return (2 + v5_info->l2_count + fls64(v5_info->core_mask)) * + KBASE_HWCNT_V5_VALUES_PER_BLOCK * + KBASE_HWCNT_VALUE_BYTES; +} + +int kbase_hwcnt_gpu_info_init( + struct kbase_device *kbdev, + struct kbase_hwcnt_gpu_info *info) +{ + if (!kbdev || !info) + return -EINVAL; + +#ifdef CONFIG_MALI_NO_MALI + /* NO_MALI uses V5 layout, regardless of the underlying platform. 
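	 *
	 * Worked sizing example for the V5 layout described above (figures
	 * assumed for illustration, not taken from this patch): with
	 * l2_count = 2 and a sparse core_mask of 0b1011, sc_block_count =
	 * fls64(0b1011) = 4 and non_sc_block_count = 2 + 2 = 4, so the V5
	 * metadata describes 1 JM + 1 Tiler + 2 Memsys + 4 SC = 8 block
	 * instances. Assuming KBASE_HWCNT_VALUE_BYTES is 4 (one u32 per
	 * value, matching the u32 dump buffers used above), the raw dump
	 * buffer is 8 * 64 * 4 = 2048 bytes, and the availability mask is
	 * ((1 << 4) - 1) | (0b1011 << 4) = 0b10111111, with bit 6 clear for
	 * the non-existent shader core 2.
	 *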
*/ + info->type = KBASE_HWCNT_GPU_GROUP_TYPE_V5; + info->v5.l2_count = KBASE_DUMMY_MODEL_MAX_MEMSYS_BLOCKS; + info->v5.core_mask = (1ull << KBASE_DUMMY_MODEL_MAX_SHADER_CORES) - 1; +#else + if (kbase_hw_has_feature(kbdev, BASE_HW_FEATURE_V4)) { + info->type = KBASE_HWCNT_GPU_GROUP_TYPE_V4; + info->v4.cg_count = kbdev->gpu_props.num_core_groups; + info->v4.cgs = kbdev->gpu_props.props.coherency_info.group; + } else { + const struct base_gpu_props *props = &kbdev->gpu_props.props; + const size_t l2_count = props->l2_props.num_l2_slices; + const size_t core_mask = + props->coherency_info.group[0].core_mask; + + info->type = KBASE_HWCNT_GPU_GROUP_TYPE_V5; + info->v5.l2_count = l2_count; + info->v5.core_mask = core_mask; + } +#endif + return 0; +} + +int kbase_hwcnt_gpu_metadata_create( + const struct kbase_hwcnt_gpu_info *info, + bool use_secondary, + const struct kbase_hwcnt_metadata **out_metadata, + size_t *out_dump_bytes) +{ + int errcode; + const struct kbase_hwcnt_metadata *metadata; + size_t dump_bytes; + + if (!info || !out_metadata || !out_dump_bytes) + return -EINVAL; + + switch (info->type) { + case KBASE_HWCNT_GPU_GROUP_TYPE_V4: + dump_bytes = kbasep_hwcnt_backend_gpu_v4_dump_bytes(&info->v4); + errcode = kbasep_hwcnt_backend_gpu_metadata_v4_create( + &info->v4, &metadata); + break; + case KBASE_HWCNT_GPU_GROUP_TYPE_V5: + dump_bytes = kbasep_hwcnt_backend_gpu_v5_dump_bytes(&info->v5); + errcode = kbasep_hwcnt_backend_gpu_metadata_v5_create( + &info->v5, use_secondary, &metadata); + break; + default: + return -EINVAL; + } + if (errcode) + return errcode; + + /* + * Dump abstraction size should be exactly the same size and layout as + * the physical dump size, for backwards compatibility. + */ + WARN_ON(dump_bytes != metadata->dump_buf_bytes); + + *out_metadata = metadata; + *out_dump_bytes = dump_bytes; + + return 0; +} +KBASE_EXPORT_TEST_API(kbase_hwcnt_gpu_metadata_create); + +void kbase_hwcnt_gpu_metadata_destroy( + const struct kbase_hwcnt_metadata *metadata) +{ + if (!metadata) + return; + + kbase_hwcnt_metadata_destroy(metadata); +} +KBASE_EXPORT_TEST_API(kbase_hwcnt_gpu_metadata_destroy); + +int kbase_hwcnt_gpu_dump_get( + struct kbase_hwcnt_dump_buffer *dst, + void *src, + const struct kbase_hwcnt_enable_map *dst_enable_map, + bool accumulate) +{ + const struct kbase_hwcnt_metadata *metadata; + const u32 *dump_src; + size_t src_offset, grp, blk, blk_inst; + + if (!dst || !src || !dst_enable_map || + (dst_enable_map->metadata != dst->metadata)) + return -EINVAL; + + metadata = dst->metadata; + dump_src = (const u32 *)src; + src_offset = 0; + + kbase_hwcnt_metadata_for_each_block( + metadata, grp, blk, blk_inst) { + const size_t hdr_cnt = + kbase_hwcnt_metadata_block_headers_count( + metadata, grp, blk); + const size_t ctr_cnt = + kbase_hwcnt_metadata_block_counters_count( + metadata, grp, blk); + + /* Early out if no values in the dest block are enabled */ + if (kbase_hwcnt_enable_map_block_enabled( + dst_enable_map, grp, blk, blk_inst)) { + u32 *dst_blk = kbase_hwcnt_dump_buffer_block_instance( + dst, grp, blk, blk_inst); + const u32 *src_blk = dump_src + src_offset; + + if (accumulate) { + kbase_hwcnt_dump_buffer_block_accumulate( + dst_blk, src_blk, hdr_cnt, ctr_cnt); + } else { + kbase_hwcnt_dump_buffer_block_copy( + dst_blk, src_blk, (hdr_cnt + ctr_cnt)); + } + } + + src_offset += (hdr_cnt + ctr_cnt); + } + + return 0; +} +KBASE_EXPORT_TEST_API(kbase_hwcnt_gpu_dump_get); + +/** + * kbasep_hwcnt_backend_gpu_block_map_to_physical() - Convert from a block + * enable map 
abstraction to + * a physical block enable + * map. + * @lo: Low 64 bits of block enable map abstraction. + * @hi: High 64 bits of block enable map abstraction. + * + * The abstraction uses 128 bits to enable 128 block values, whereas the + * physical uses just 32 bits, as bit n enables values [n*4, n*4+3]. + * Therefore, this conversion is lossy. + * + * Return: 32-bit physical block enable map. + */ +static inline u32 kbasep_hwcnt_backend_gpu_block_map_to_physical( + u64 lo, + u64 hi) +{ + u32 phys = 0; + u64 dwords[2] = {lo, hi}; + size_t dword_idx; + + for (dword_idx = 0; dword_idx < 2; dword_idx++) { + const u64 dword = dwords[dword_idx]; + u16 packed = 0; + + size_t hword_bit; + + for (hword_bit = 0; hword_bit < 16; hword_bit++) { + const size_t dword_bit = hword_bit * 4; + const u16 mask = + ((dword >> (dword_bit + 0)) & 0x1) | + ((dword >> (dword_bit + 1)) & 0x1) | + ((dword >> (dword_bit + 2)) & 0x1) | + ((dword >> (dword_bit + 3)) & 0x1); + packed |= (mask << hword_bit); + } + phys |= ((u32)packed) << (16 * dword_idx); + } + return phys; +} + +/** + * kbasep_hwcnt_backend_gpu_block_map_from_physical() - Convert from a physical + * block enable map to a + * block enable map + * abstraction. + * @phys: Physical 32-bit block enable map + * @lo: Non-NULL pointer to where low 64 bits of block enable map abstraction + * will be stored. + * @hi: Non-NULL pointer to where high 64 bits of block enable map abstraction + * will be stored. + */ +static inline void kbasep_hwcnt_backend_gpu_block_map_from_physical( + u32 phys, + u64 *lo, + u64 *hi) +{ + u64 dwords[2] = {0, 0}; + + size_t dword_idx; + + for (dword_idx = 0; dword_idx < 2; dword_idx++) { + const u16 packed = phys >> (16 * dword_idx); + u64 dword = 0; + + size_t hword_bit; + + for (hword_bit = 0; hword_bit < 16; hword_bit++) { + const size_t dword_bit = hword_bit * 4; + const u64 mask = (packed >> (hword_bit)) & 0x1; + + dword |= mask << (dword_bit + 0); + dword |= mask << (dword_bit + 1); + dword |= mask << (dword_bit + 2); + dword |= mask << (dword_bit + 3); + } + dwords[dword_idx] = dword; + } + *lo = dwords[0]; + *hi = dwords[1]; +} + +void kbase_hwcnt_gpu_enable_map_to_physical( + struct kbase_hwcnt_physical_enable_map *dst, + const struct kbase_hwcnt_enable_map *src) +{ + const struct kbase_hwcnt_metadata *metadata; + + u64 jm_bm = 0; + u64 shader_bm = 0; + u64 tiler_bm = 0; + u64 mmu_l2_bm = 0; + + size_t grp, blk, blk_inst; + + if (WARN_ON(!src) || WARN_ON(!dst)) + return; + + metadata = src->metadata; + + kbase_hwcnt_metadata_for_each_block( + metadata, grp, blk, blk_inst) { + const u64 grp_type = kbase_hwcnt_metadata_group_type( + metadata, grp); + const u64 blk_type = kbase_hwcnt_metadata_block_type( + metadata, grp, blk); + const size_t blk_val_cnt = + kbase_hwcnt_metadata_block_values_count( + metadata, grp, blk); + const u64 *blk_map = kbase_hwcnt_enable_map_block_instance( + src, grp, blk, blk_inst); + + switch ((enum kbase_hwcnt_gpu_group_type)grp_type) { + case KBASE_HWCNT_GPU_GROUP_TYPE_V4: + WARN_ON(blk_val_cnt != KBASE_HWCNT_V4_VALUES_PER_BLOCK); + switch ((enum kbase_hwcnt_gpu_v4_block_type)blk_type) { + case KBASE_HWCNT_GPU_V4_BLOCK_TYPE_SHADER: + shader_bm |= *blk_map; + break; + case KBASE_HWCNT_GPU_V4_BLOCK_TYPE_TILER: + tiler_bm |= *blk_map; + break; + case KBASE_HWCNT_GPU_V4_BLOCK_TYPE_MMU_L2: + mmu_l2_bm |= *blk_map; + break; + case KBASE_HWCNT_GPU_V4_BLOCK_TYPE_JM: + jm_bm |= *blk_map; + break; + case KBASE_HWCNT_GPU_V4_BLOCK_TYPE_RESERVED: + break; + default: + WARN_ON(true); + } + break; + case 
KBASE_HWCNT_GPU_GROUP_TYPE_V5: + WARN_ON(blk_val_cnt != KBASE_HWCNT_V5_VALUES_PER_BLOCK); + switch ((enum kbase_hwcnt_gpu_v5_block_type)blk_type) { + case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_JM: + jm_bm |= *blk_map; + break; + case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_TILER: + tiler_bm |= *blk_map; + break; + case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_SC: + case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_SC2: + shader_bm |= *blk_map; + break; + case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_MEMSYS: + case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_MEMSYS2: + mmu_l2_bm |= *blk_map; + break; + default: + WARN_ON(true); + } + break; + default: + WARN_ON(true); + } + } + + dst->jm_bm = + kbasep_hwcnt_backend_gpu_block_map_to_physical(jm_bm, 0); + dst->shader_bm = + kbasep_hwcnt_backend_gpu_block_map_to_physical(shader_bm, 0); + dst->tiler_bm = + kbasep_hwcnt_backend_gpu_block_map_to_physical(tiler_bm, 0); + dst->mmu_l2_bm = + kbasep_hwcnt_backend_gpu_block_map_to_physical(mmu_l2_bm, 0); +} +KBASE_EXPORT_TEST_API(kbase_hwcnt_gpu_enable_map_to_physical); + +void kbase_hwcnt_gpu_enable_map_from_physical( + struct kbase_hwcnt_enable_map *dst, + const struct kbase_hwcnt_physical_enable_map *src) +{ + const struct kbase_hwcnt_metadata *metadata; + + u64 ignored_hi; + u64 jm_bm; + u64 shader_bm; + u64 tiler_bm; + u64 mmu_l2_bm; + size_t grp, blk, blk_inst; + + if (WARN_ON(!src) || WARN_ON(!dst)) + return; + + metadata = dst->metadata; + + kbasep_hwcnt_backend_gpu_block_map_from_physical( + src->jm_bm, &jm_bm, &ignored_hi); + kbasep_hwcnt_backend_gpu_block_map_from_physical( + src->shader_bm, &shader_bm, &ignored_hi); + kbasep_hwcnt_backend_gpu_block_map_from_physical( + src->tiler_bm, &tiler_bm, &ignored_hi); + kbasep_hwcnt_backend_gpu_block_map_from_physical( + src->mmu_l2_bm, &mmu_l2_bm, &ignored_hi); + + kbase_hwcnt_metadata_for_each_block(metadata, grp, blk, blk_inst) { + const u64 grp_type = kbase_hwcnt_metadata_group_type( + metadata, grp); + const u64 blk_type = kbase_hwcnt_metadata_block_type( + metadata, grp, blk); + const size_t blk_val_cnt = + kbase_hwcnt_metadata_block_values_count( + metadata, grp, blk); + u64 *blk_map = kbase_hwcnt_enable_map_block_instance( + dst, grp, blk, blk_inst); + + switch ((enum kbase_hwcnt_gpu_group_type)grp_type) { + case KBASE_HWCNT_GPU_GROUP_TYPE_V4: + WARN_ON(blk_val_cnt != KBASE_HWCNT_V4_VALUES_PER_BLOCK); + switch ((enum kbase_hwcnt_gpu_v4_block_type)blk_type) { + case KBASE_HWCNT_GPU_V4_BLOCK_TYPE_SHADER: + *blk_map = shader_bm; + break; + case KBASE_HWCNT_GPU_V4_BLOCK_TYPE_TILER: + *blk_map = tiler_bm; + break; + case KBASE_HWCNT_GPU_V4_BLOCK_TYPE_MMU_L2: + *blk_map = mmu_l2_bm; + break; + case KBASE_HWCNT_GPU_V4_BLOCK_TYPE_JM: + *blk_map = jm_bm; + break; + case KBASE_HWCNT_GPU_V4_BLOCK_TYPE_RESERVED: + break; + default: + WARN_ON(true); + } + break; + case KBASE_HWCNT_GPU_GROUP_TYPE_V5: + WARN_ON(blk_val_cnt != KBASE_HWCNT_V5_VALUES_PER_BLOCK); + switch ((enum kbase_hwcnt_gpu_v5_block_type)blk_type) { + case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_JM: + *blk_map = jm_bm; + break; + case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_TILER: + *blk_map = tiler_bm; + break; + case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_SC: + case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_SC2: + *blk_map = shader_bm; + break; + case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_MEMSYS: + case KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_MEMSYS2: + *blk_map = mmu_l2_bm; + break; + default: + WARN_ON(true); + } + break; + default: + WARN_ON(true); + } + } +} +KBASE_EXPORT_TEST_API(kbase_hwcnt_gpu_enable_map_from_physical); + +void 
kbase_hwcnt_gpu_patch_dump_headers( + struct kbase_hwcnt_dump_buffer *buf, + const struct kbase_hwcnt_enable_map *enable_map) +{ + const struct kbase_hwcnt_metadata *metadata; + size_t grp, blk, blk_inst; + + if (WARN_ON(!buf) || WARN_ON(!enable_map) || + WARN_ON(buf->metadata != enable_map->metadata)) + return; + + metadata = buf->metadata; + + kbase_hwcnt_metadata_for_each_block(metadata, grp, blk, blk_inst) { + const u64 grp_type = + kbase_hwcnt_metadata_group_type(metadata, grp); + u32 *buf_blk = kbase_hwcnt_dump_buffer_block_instance( + buf, grp, blk, blk_inst); + const u64 *blk_map = kbase_hwcnt_enable_map_block_instance( + enable_map, grp, blk, blk_inst); + const u32 prfcnt_en = + kbasep_hwcnt_backend_gpu_block_map_to_physical( + blk_map[0], 0); + + switch ((enum kbase_hwcnt_gpu_group_type)grp_type) { + case KBASE_HWCNT_GPU_GROUP_TYPE_V4: + buf_blk[KBASE_HWCNT_V4_PRFCNT_EN_HEADER] = prfcnt_en; + break; + case KBASE_HWCNT_GPU_GROUP_TYPE_V5: + buf_blk[KBASE_HWCNT_V5_PRFCNT_EN_HEADER] = prfcnt_en; + break; + default: + WARN_ON(true); + } + } +} +KBASE_EXPORT_TEST_API(kbase_hwcnt_gpu_patch_dump_headers); diff --git a/mali_kbase/mali_kbase_hwcnt_gpu.h b/mali_kbase/mali_kbase_hwcnt_gpu.h new file mode 100644 index 0000000..509608a --- /dev/null +++ b/mali_kbase/mali_kbase_hwcnt_gpu.h @@ -0,0 +1,249 @@ +/* + * + * (C) COPYRIGHT 2018 ARM Limited. All rights reserved. + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU licence. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * SPDX-License-Identifier: GPL-2.0 + * + */ + +#ifndef _KBASE_HWCNT_GPU_H_ +#define _KBASE_HWCNT_GPU_H_ + +#include <linux/types.h> + +struct kbase_device; +struct kbase_hwcnt_metadata; +struct kbase_hwcnt_enable_map; +struct kbase_hwcnt_dump_buffer; + +/** + * enum kbase_hwcnt_gpu_group_type - GPU hardware counter group types, used to + * identify metadata groups. + * @KBASE_HWCNT_GPU_GROUP_TYPE_V4: GPU V4 group type. + * @KBASE_HWCNT_GPU_GROUP_TYPE_V5: GPU V5 group type. + */ +enum kbase_hwcnt_gpu_group_type { + KBASE_HWCNT_GPU_GROUP_TYPE_V4 = 0x10, + KBASE_HWCNT_GPU_GROUP_TYPE_V5, +}; + +/** + * enum kbase_hwcnt_gpu_v4_block_type - GPU V4 hardware counter block types, + * used to identify metadata blocks. + * @KBASE_HWCNT_GPU_V4_BLOCK_TYPE_SHADER: Shader block. + * @KBASE_HWCNT_GPU_V4_BLOCK_TYPE_TILER: Tiler block. + * @KBASE_HWCNT_GPU_V4_BLOCK_TYPE_MMU_L2: MMU/L2 block. + * @KBASE_HWCNT_GPU_V4_BLOCK_TYPE_JM: Job Manager block. + * @KBASE_HWCNT_GPU_V4_BLOCK_TYPE_RESERVED: Reserved block. + */ +enum kbase_hwcnt_gpu_v4_block_type { + KBASE_HWCNT_GPU_V4_BLOCK_TYPE_SHADER = 0x20, + KBASE_HWCNT_GPU_V4_BLOCK_TYPE_TILER, + KBASE_HWCNT_GPU_V4_BLOCK_TYPE_MMU_L2, + KBASE_HWCNT_GPU_V4_BLOCK_TYPE_JM, + KBASE_HWCNT_GPU_V4_BLOCK_TYPE_RESERVED, +}; + +/** + * enum kbase_hwcnt_gpu_v5_block_type - GPU V5 hardware counter block types, + * used to identify metadata blocks. + * @KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_JM: Job Manager block. 
+ * @KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_TILER: Tiler block. + * @KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_SC: Shader Core block. + * @KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_SC2: Secondary Shader Core block. + * @KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_MEMSYS: Memsys block. + * @KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_MEMSYS2: Secondary Memsys block. + */ +enum kbase_hwcnt_gpu_v5_block_type { + KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_JM = 0x40, + KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_TILER, + KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_SC, + KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_SC2, + KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_MEMSYS, + KBASE_HWCNT_GPU_V5_BLOCK_TYPE_PERF_MEMSYS2, +}; + +/** + * struct kbase_hwcnt_physical_enable_map - Representation of enable map + * directly used by GPU. + * @jm_bm: Job Manager counters selection bitmask. + * @shader_bm: Shader counters selection bitmask. + * @tiler_bm: Tiler counters selection bitmask. + * @mmu_l2_bm: MMU_L2 counters selection bitmask. + */ +struct kbase_hwcnt_physical_enable_map { + u32 jm_bm; + u32 shader_bm; + u32 tiler_bm; + u32 mmu_l2_bm; +}; + +/** + * struct kbase_hwcnt_gpu_v4_info - Information about hwcnt blocks on v4 GPUs. + * @cg_count: Core group count. + * @cgs: Non-NULL pointer to array of cg_count coherent group structures. + * + * V4 devices are Mali-T6xx or Mali-T72x, and have one or more core groups, + * where each core group may have a physically different layout. + */ +struct kbase_hwcnt_gpu_v4_info { + size_t cg_count; + const struct mali_base_gpu_coherent_group *cgs; +}; + +/** + * struct kbase_hwcnt_gpu_v5_info - Information about hwcnt blocks on v5 GPUs. + * @l2_count: L2 cache count. + * @core_mask: Shader core mask. May be sparse. + */ +struct kbase_hwcnt_gpu_v5_info { + size_t l2_count; + u64 core_mask; +}; + +/** + * struct kbase_hwcnt_gpu_info - Tagged union with information about the current + * GPU's hwcnt blocks. + * @type: GPU type. + * @v4: Info filled in if a v4 GPU. + * @v5: Info filled in if a v5 GPU. + */ +struct kbase_hwcnt_gpu_info { + enum kbase_hwcnt_gpu_group_type type; + union { + struct kbase_hwcnt_gpu_v4_info v4; + struct kbase_hwcnt_gpu_v5_info v5; + }; +}; + +/** + * kbase_hwcnt_gpu_info_init() - Initialise an info structure used to create the + * hwcnt metadata. + * @kbdev: Non-NULL pointer to kbase device. + * @info: Non-NULL pointer to data structure to be filled in. + * + * The initialised info struct will only be valid for use while kbdev is valid. + */ +int kbase_hwcnt_gpu_info_init( + struct kbase_device *kbdev, + struct kbase_hwcnt_gpu_info *info); + +/** + * kbase_hwcnt_gpu_metadata_create() - Create hardware counter metadata for the + * current GPU. + * @info: Non-NULL pointer to info struct initialised by + * kbase_hwcnt_gpu_info_init. + * @use_secondary: True if secondary performance counters should be used, else + * false. Ignored if secondary counters are not supported. + * @out_metadata: Non-NULL pointer to where created metadata is stored on + * success. + * @out_dump_bytes: Non-NULL pointer to where the size of the GPU counter dump + * buffer is stored on success. + * + * Return: 0 on success, else error code. + */ +int kbase_hwcnt_gpu_metadata_create( + const struct kbase_hwcnt_gpu_info *info, + bool use_secondary, + const struct kbase_hwcnt_metadata **out_metadata, + size_t *out_dump_bytes); + +/** + * kbase_hwcnt_gpu_metadata_destroy() - Destroy GPU hardware counter metadata. + * @metadata: Pointer to metadata to destroy. 
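 *
 * A small usage sketch for the create/destroy pair (illustrative only; kbdev
 * is an assumed device pointer and error handling is omitted):
 *
 *   struct kbase_hwcnt_gpu_info info;
 *   const struct kbase_hwcnt_metadata *md;
 *   size_t dump_bytes;
 *
 *   kbase_hwcnt_gpu_info_init(kbdev, &info);
 *   kbase_hwcnt_gpu_metadata_create(&info, false, &md, &dump_bytes);
 *   (allocate a dump_bytes sized raw buffer, build enable maps from md)
 *   kbase_hwcnt_gpu_metadata_destroy(md);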
+ */ +void kbase_hwcnt_gpu_metadata_destroy( + const struct kbase_hwcnt_metadata *metadata); + +/** + * kbase_hwcnt_gpu_dump_get() - Copy or accumulate enabled counters from the raw + * dump buffer in src into the dump buffer + * abstraction in dst. + * @dst: Non-NULL pointer to dst dump buffer. + * @src: Non-NULL pointer to src raw dump buffer, of same length + * as returned in out_dump_bytes parameter of + * kbase_hwcnt_gpu_metadata_create. + * @dst_enable_map: Non-NULL pointer to enable map specifying enabled values. + * @accumulate: True if counters in src should be accumulated into dst, + * rather than copied. + * + * The dst and dst_enable_map MUST have been created from the same metadata as + * returned from the call to kbase_hwcnt_gpu_metadata_create as was used to get + * the length of src. + * + * Return: 0 on success, else error code. + */ +int kbase_hwcnt_gpu_dump_get( + struct kbase_hwcnt_dump_buffer *dst, + void *src, + const struct kbase_hwcnt_enable_map *dst_enable_map, + bool accumulate); + +/** + * kbase_hwcnt_gpu_enable_map_to_physical() - Convert an enable map abstraction + * into a physical enable map. + * @dst: Non-NULL pointer to dst physical enable map. + * @src: Non-NULL pointer to src enable map abstraction. + * + * The src must have been created from a metadata returned from a call to + * kbase_hwcnt_gpu_metadata_create. + * + * This is a lossy conversion, as the enable map abstraction has one bit per + * individual counter block value, but the physical enable map uses 1 bit for + * every 4 counters, shared over all instances of a block. + */ +void kbase_hwcnt_gpu_enable_map_to_physical( + struct kbase_hwcnt_physical_enable_map *dst, + const struct kbase_hwcnt_enable_map *src); + +/** + * kbase_hwcnt_gpu_enable_map_from_physical() - Convert a physical enable map to + * an enable map abstraction. + * @dst: Non-NULL pointer to dst enable map abstraction. + * @src: Non-NULL pointer to src physical enable map. + * + * The dst must have been created from a metadata returned from a call to + * kbase_hwcnt_gpu_metadata_create. + * + * This is a lossy conversion, as the physical enable map can technically + * support counter blocks with 128 counters each, but no hardware actually uses + * more than 64, so the enable map abstraction has nowhere to store the enable + * information for the 64 non-existent counters. + */ +void kbase_hwcnt_gpu_enable_map_from_physical( + struct kbase_hwcnt_enable_map *dst, + const struct kbase_hwcnt_physical_enable_map *src); + +/** + * kbase_hwcnt_gpu_patch_dump_headers() - Patch all the performance counter + * enable headers in a dump buffer to + * reflect the specified enable map. + * @buf: Non-NULL pointer to dump buffer to patch. + * @enable_map: Non-NULL pointer to enable map. + * + * The buf and enable_map must have been created from a metadata returned from + * a call to kbase_hwcnt_gpu_metadata_create. + * + * This function should be used before handing off a dump buffer over the + * kernel-user boundary, to ensure the header is accurate for the enable map + * used by the user. + */ +void kbase_hwcnt_gpu_patch_dump_headers( + struct kbase_hwcnt_dump_buffer *buf, + const struct kbase_hwcnt_enable_map *enable_map); + +#endif /* _KBASE_HWCNT_GPU_H_ */ diff --git a/mali_kbase/mali_kbase_hwcnt_legacy.c b/mali_kbase/mali_kbase_hwcnt_legacy.c new file mode 100644 index 0000000..b0e6aee --- /dev/null +++ b/mali_kbase/mali_kbase_hwcnt_legacy.c @@ -0,0 +1,152 @@ +/* + * + * (C) COPYRIGHT 2018 ARM Limited. All rights reserved. 
+ * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU licence. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * SPDX-License-Identifier: GPL-2.0 + * + */ + +#include "mali_kbase_hwcnt_legacy.h" +#include "mali_kbase_hwcnt_virtualizer.h" +#include "mali_kbase_hwcnt_types.h" +#include "mali_kbase_hwcnt_gpu.h" +#include "mali_kbase_ioctl.h" + +#include <linux/slab.h> +#include <linux/uaccess.h> + +/** + * struct kbase_hwcnt_legacy_client - Legacy hardware counter client. + * @user_dump_buf: Pointer to a non-NULL user buffer, where dumps are returned. + * @enable_map: Counter enable map. + * @dump_buf: Dump buffer used to manipulate dumps before copied to user. + * @hvcli: Hardware counter virtualizer client. + */ +struct kbase_hwcnt_legacy_client { + void __user *user_dump_buf; + struct kbase_hwcnt_enable_map enable_map; + struct kbase_hwcnt_dump_buffer dump_buf; + struct kbase_hwcnt_virtualizer_client *hvcli; +}; + +int kbase_hwcnt_legacy_client_create( + struct kbase_hwcnt_virtualizer *hvirt, + struct kbase_ioctl_hwcnt_enable *enable, + struct kbase_hwcnt_legacy_client **out_hlcli) +{ + int errcode; + struct kbase_hwcnt_legacy_client *hlcli; + const struct kbase_hwcnt_metadata *metadata; + struct kbase_hwcnt_physical_enable_map phys_em; + + if (!hvirt || !enable || !enable->dump_buffer || !out_hlcli) + return -EINVAL; + + metadata = kbase_hwcnt_virtualizer_metadata(hvirt); + + hlcli = kzalloc(sizeof(*hlcli), GFP_KERNEL); + if (!hlcli) + return -ENOMEM; + + hlcli->user_dump_buf = (void __user *)(uintptr_t)enable->dump_buffer; + + errcode = kbase_hwcnt_enable_map_alloc(metadata, &hlcli->enable_map); + if (errcode) + goto error; + + /* Translate from the ioctl enable map to the internal one */ + phys_em.jm_bm = enable->jm_bm; + phys_em.shader_bm = enable->shader_bm; + phys_em.tiler_bm = enable->tiler_bm; + phys_em.mmu_l2_bm = enable->mmu_l2_bm; + kbase_hwcnt_gpu_enable_map_from_physical(&hlcli->enable_map, &phys_em); + + errcode = kbase_hwcnt_dump_buffer_alloc(metadata, &hlcli->dump_buf); + if (errcode) + goto error; + + errcode = kbase_hwcnt_virtualizer_client_create( + hvirt, &hlcli->enable_map, &hlcli->hvcli); + if (errcode) + goto error; + + *out_hlcli = hlcli; + return 0; + +error: + kbase_hwcnt_legacy_client_destroy(hlcli); + return errcode; +} + +void kbase_hwcnt_legacy_client_destroy(struct kbase_hwcnt_legacy_client *hlcli) +{ + if (!hlcli) + return; + + kbase_hwcnt_virtualizer_client_destroy(hlcli->hvcli); + kbase_hwcnt_dump_buffer_free(&hlcli->dump_buf); + kbase_hwcnt_enable_map_free(&hlcli->enable_map); + kfree(hlcli); +} + +int kbase_hwcnt_legacy_client_dump(struct kbase_hwcnt_legacy_client *hlcli) +{ + int errcode; + u64 ts_start_ns; + u64 ts_end_ns; + + if (!hlcli) + return -EINVAL; + + /* Dump into the kernel buffer */ + errcode = kbase_hwcnt_virtualizer_client_dump(hlcli->hvcli, + &ts_start_ns, &ts_end_ns, &hlcli->dump_buf); + if (errcode) + return errcode; + + /* Patch the dump buf headers, to 
hide the counters that other hwcnt
+ * clients are using.
+ */
+ kbase_hwcnt_gpu_patch_dump_headers(
+ &hlcli->dump_buf, &hlcli->enable_map);
+
+ /* Zero all non-enabled counters (current values are undefined) */
+ kbase_hwcnt_dump_buffer_zero_non_enabled(
+ &hlcli->dump_buf, &hlcli->enable_map);
+
+ /* Copy into the user's buffer */
+ errcode = copy_to_user(hlcli->user_dump_buf, hlcli->dump_buf.dump_buf,
+ hlcli->dump_buf.metadata->dump_buf_bytes);
+ /* Non-zero errcode implies user buf was invalid or too small */
+ if (errcode)
+ return -EFAULT;
+
+ return 0;
+}
+
+int kbase_hwcnt_legacy_client_clear(struct kbase_hwcnt_legacy_client *hlcli)
+{
+ u64 ts_start_ns;
+ u64 ts_end_ns;
+
+ if (!hlcli)
+ return -EINVAL;
+
+ /* Dump with a NULL buffer to clear this client's counters */
+ return kbase_hwcnt_virtualizer_client_dump(hlcli->hvcli,
+ &ts_start_ns, &ts_end_ns, NULL);
+}
diff --git a/mali_kbase/mali_kbase_hwcnt_legacy.h b/mali_kbase/mali_kbase_hwcnt_legacy.h
new file mode 100644
index 0000000..7a610ae
--- /dev/null
+++ b/mali_kbase/mali_kbase_hwcnt_legacy.h
@@ -0,0 +1,94 @@
+/*
+ *
+ * (C) COPYRIGHT 2018 ARM Limited. All rights reserved.
+ *
+ * This program is free software and is provided to you under the terms of the
+ * GNU General Public License version 2 as published by the Free Software
+ * Foundation, and any use by you of this program is subject to the terms
+ * of such GNU licence.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
+ *
+ * SPDX-License-Identifier: GPL-2.0
+ *
+ */
+
+/**
+ * Legacy hardware counter interface, giving userspace clients simple,
+ * synchronous access to hardware counters.
+ *
+ * Any functions operating on a single legacy hardware counter client instance
+ * must be externally synchronised.
+ * Different clients may safely be used concurrently.
+ */
+
+#ifndef _KBASE_HWCNT_LEGACY_H_
+#define _KBASE_HWCNT_LEGACY_H_
+
+struct kbase_hwcnt_legacy_client;
+struct kbase_ioctl_hwcnt_enable;
+struct kbase_hwcnt_virtualizer;
+
+/**
+ * kbase_hwcnt_legacy_client_create() - Create a legacy hardware counter client.
+ * @hvirt: Non-NULL pointer to hardware counter virtualizer the client
+ * should be attached to.
+ * @enable: Non-NULL pointer to hwcnt_enable structure, containing a valid
+ * pointer to a user dump buffer large enough to hold a dump, and
+ * the counters that should be enabled.
+ * @out_hlcli: Non-NULL pointer to where the pointer to the created client will
+ * be stored on success.
+ *
+ * Return: 0 on success, else error code.
+ */
+int kbase_hwcnt_legacy_client_create(
+ struct kbase_hwcnt_virtualizer *hvirt,
+ struct kbase_ioctl_hwcnt_enable *enable,
+ struct kbase_hwcnt_legacy_client **out_hlcli);
+
+/**
+ * kbase_hwcnt_legacy_client_destroy() - Destroy a legacy hardware counter
+ * client.
+ * @hlcli: Pointer to the legacy hardware counter client.
+ *
+ * Will safely destroy a client in any partial state of construction.
+ */
+void kbase_hwcnt_legacy_client_destroy(struct kbase_hwcnt_legacy_client *hlcli);
+
+/**
+ * kbase_hwcnt_legacy_client_dump() - Perform a hardware counter dump into the
+ * client's user buffer.
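Taken together, the legacy interface reduces to a create/dump/destroy cycle around the virtualizer. A minimal sketch follows; example_legacy_dump() is a hypothetical caller, and the virtualizer handle plus the ioctl enable structure are assumed to be supplied by the surrounding driver code.

#include "mali_kbase_hwcnt_legacy.h"

/* Hypothetical helper: one synchronous counter dump via the legacy API. */
static int example_legacy_dump(struct kbase_hwcnt_virtualizer *hvirt,
	struct kbase_ioctl_hwcnt_enable *enable)
{
	struct kbase_hwcnt_legacy_client *hlcli;
	int errcode;

	errcode = kbase_hwcnt_legacy_client_create(hvirt, enable, &hlcli);
	if (errcode)
		return errcode;

	/* Counters accumulate from creation (or the last clear) until here,
	 * then land in the user buffer named in the enable structure.
	 */
	errcode = kbase_hwcnt_legacy_client_dump(hlcli);

	kbase_hwcnt_legacy_client_destroy(hlcli);
	return errcode;
}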
+ * @hlcli: Non-NULL pointer to the legacy hardware counter client. + * + * This function will synchronously dump hardware counters into the user buffer + * specified on client creation, with the counters specified on client creation. + * + * The counters are automatically cleared after each dump, such that the next + * dump performed will return the counter values accumulated between the time of + * this function call and the next dump. + * + * Return: 0 on success, else error code. + */ +int kbase_hwcnt_legacy_client_dump(struct kbase_hwcnt_legacy_client *hlcli); + +/** + * kbase_hwcnt_legacy_client_clear() - Perform and discard a hardware counter + * dump. + * @hlcli: Non-NULL pointer to the legacy hardware counter client. + * + * This function will synchronously clear the hardware counters, such that the + * next dump performed will return the counter values accumulated between the + * time of this function call and the next dump. + * + * Return: 0 on success, else error code. + */ +int kbase_hwcnt_legacy_client_clear(struct kbase_hwcnt_legacy_client *hlcli); + +#endif /* _KBASE_HWCNT_LEGACY_H_ */ diff --git a/mali_kbase/mali_kbase_hwcnt_types.c b/mali_kbase/mali_kbase_hwcnt_types.c new file mode 100644 index 0000000..1e9efde --- /dev/null +++ b/mali_kbase/mali_kbase_hwcnt_types.c @@ -0,0 +1,538 @@ +/* + * + * (C) COPYRIGHT 2018 ARM Limited. All rights reserved. + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU licence. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * SPDX-License-Identifier: GPL-2.0 + * + */ + +#include "mali_kbase_hwcnt_types.h" +#include "mali_kbase.h" + +/* Minimum alignment of each block of hardware counters */ +#define KBASE_HWCNT_BLOCK_BYTE_ALIGNMENT \ + (KBASE_HWCNT_BITFIELD_BITS * KBASE_HWCNT_VALUE_BYTES) + +/** + * KBASE_HWCNT_ALIGN_UPWARDS() - Align a value to an alignment. + * @value: The value to align upwards. + * @alignment: The alignment. + * + * Return: A number greater than or equal to value that is aligned to alignment. 
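For concreteness, a worked example of the macro using the constants defined by this patch, where blocks are aligned to 64 u32 values:

/*
 * KBASE_HWCNT_BLOCK_BYTE_ALIGNMENT / KBASE_HWCNT_VALUE_BYTES
 *     = (64 * 4) / 4 = 64 values per block
 *
 * KBASE_HWCNT_ALIGN_UPWARDS(60, 64) = 60 + ((64 - (60 % 64)) % 64) = 64
 * KBASE_HWCNT_ALIGN_UPWARDS(64, 64) = 64 + ((64 - (64 % 64)) % 64) = 64
 */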
+ */ +#define KBASE_HWCNT_ALIGN_UPWARDS(value, alignment) \ + (value + ((alignment - (value % alignment)) % alignment)) + +int kbase_hwcnt_metadata_create( + const struct kbase_hwcnt_description *desc, + const struct kbase_hwcnt_metadata **out_metadata) +{ + char *buf; + struct kbase_hwcnt_metadata *metadata; + struct kbase_hwcnt_group_metadata *grp_mds; + size_t grp; + size_t enable_map_count; /* Number of u64 bitfields (inc padding) */ + size_t dump_buf_count; /* Number of u32 values (inc padding) */ + size_t avail_mask_bits; /* Number of availability mask bits */ + + size_t size; + size_t offset; + + if (!desc || !out_metadata) + return -EINVAL; + + /* Calculate the bytes needed to tightly pack the metadata */ + + /* Top level metadata */ + size = 0; + size += sizeof(struct kbase_hwcnt_metadata); + + /* Group metadata */ + size += sizeof(struct kbase_hwcnt_group_metadata) * desc->grp_cnt; + + /* Block metadata */ + for (grp = 0; grp < desc->grp_cnt; grp++) { + size += sizeof(struct kbase_hwcnt_block_metadata) * + desc->grps[grp].blk_cnt; + } + + /* Single allocation for the entire metadata */ + buf = kmalloc(size, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + /* Use the allocated memory for the metadata and its members */ + + /* Bump allocate the top level metadata */ + offset = 0; + metadata = (struct kbase_hwcnt_metadata *)(buf + offset); + offset += sizeof(struct kbase_hwcnt_metadata); + + /* Bump allocate the group metadata */ + grp_mds = (struct kbase_hwcnt_group_metadata *)(buf + offset); + offset += sizeof(struct kbase_hwcnt_group_metadata) * desc->grp_cnt; + + enable_map_count = 0; + dump_buf_count = 0; + avail_mask_bits = 0; + + for (grp = 0; grp < desc->grp_cnt; grp++) { + size_t blk; + + const struct kbase_hwcnt_group_description *grp_desc = + desc->grps + grp; + struct kbase_hwcnt_group_metadata *grp_md = grp_mds + grp; + + size_t group_enable_map_count = 0; + size_t group_dump_buffer_count = 0; + size_t group_avail_mask_bits = 0; + + /* Bump allocate this group's block metadata */ + struct kbase_hwcnt_block_metadata *blk_mds = + (struct kbase_hwcnt_block_metadata *)(buf + offset); + offset += sizeof(struct kbase_hwcnt_block_metadata) * + grp_desc->blk_cnt; + + /* Fill in each block in the group's information */ + for (blk = 0; blk < grp_desc->blk_cnt; blk++) { + const struct kbase_hwcnt_block_description *blk_desc = + grp_desc->blks + blk; + struct kbase_hwcnt_block_metadata *blk_md = + blk_mds + blk; + const size_t n_values = + blk_desc->hdr_cnt + blk_desc->ctr_cnt; + + blk_md->type = blk_desc->type; + blk_md->inst_cnt = blk_desc->inst_cnt; + blk_md->hdr_cnt = blk_desc->hdr_cnt; + blk_md->ctr_cnt = blk_desc->ctr_cnt; + blk_md->enable_map_index = group_enable_map_count; + blk_md->enable_map_stride = + kbase_hwcnt_bitfield_count(n_values); + blk_md->dump_buf_index = group_dump_buffer_count; + blk_md->dump_buf_stride = + KBASE_HWCNT_ALIGN_UPWARDS( + n_values, + (KBASE_HWCNT_BLOCK_BYTE_ALIGNMENT / + KBASE_HWCNT_VALUE_BYTES)); + blk_md->avail_mask_index = group_avail_mask_bits; + + group_enable_map_count += + blk_md->enable_map_stride * blk_md->inst_cnt; + group_dump_buffer_count += + blk_md->dump_buf_stride * blk_md->inst_cnt; + group_avail_mask_bits += blk_md->inst_cnt; + } + + /* Fill in the group's information */ + grp_md->type = grp_desc->type; + grp_md->blk_cnt = grp_desc->blk_cnt; + grp_md->blk_metadata = blk_mds; + grp_md->enable_map_index = enable_map_count; + grp_md->dump_buf_index = dump_buf_count; + grp_md->avail_mask_index = avail_mask_bits; + + enable_map_count 
+= group_enable_map_count; + dump_buf_count += group_dump_buffer_count; + avail_mask_bits += group_avail_mask_bits; + } + + /* Fill in the top level metadata's information */ + metadata->grp_cnt = desc->grp_cnt; + metadata->grp_metadata = grp_mds; + metadata->enable_map_bytes = + enable_map_count * KBASE_HWCNT_BITFIELD_BYTES; + metadata->dump_buf_bytes = dump_buf_count * KBASE_HWCNT_VALUE_BYTES; + metadata->avail_mask = desc->avail_mask; + + WARN_ON(size != offset); + /* Due to the block alignment, there should be exactly one enable map + * bit per 4 bytes in the dump buffer. + */ + WARN_ON(metadata->dump_buf_bytes != + (metadata->enable_map_bytes * + BITS_PER_BYTE * KBASE_HWCNT_VALUE_BYTES)); + + *out_metadata = metadata; + return 0; +} +KBASE_EXPORT_TEST_API(kbase_hwcnt_metadata_create); + +void kbase_hwcnt_metadata_destroy(const struct kbase_hwcnt_metadata *metadata) +{ + kfree(metadata); +} +KBASE_EXPORT_TEST_API(kbase_hwcnt_metadata_destroy); + +int kbase_hwcnt_enable_map_alloc( + const struct kbase_hwcnt_metadata *metadata, + struct kbase_hwcnt_enable_map *enable_map) +{ + u64 *enable_map_buf; + + if (!metadata || !enable_map) + return -EINVAL; + + enable_map_buf = kzalloc(metadata->enable_map_bytes, GFP_KERNEL); + if (!enable_map_buf) + return -ENOMEM; + + enable_map->metadata = metadata; + enable_map->enable_map = enable_map_buf; + return 0; +} +KBASE_EXPORT_TEST_API(kbase_hwcnt_enable_map_alloc); + +void kbase_hwcnt_enable_map_free(struct kbase_hwcnt_enable_map *enable_map) +{ + if (!enable_map) + return; + + kfree(enable_map->enable_map); + enable_map->enable_map = NULL; + enable_map->metadata = NULL; +} +KBASE_EXPORT_TEST_API(kbase_hwcnt_enable_map_free); + +int kbase_hwcnt_dump_buffer_alloc( + const struct kbase_hwcnt_metadata *metadata, + struct kbase_hwcnt_dump_buffer *dump_buf) +{ + u32 *buf; + + if (!metadata || !dump_buf) + return -EINVAL; + + buf = kmalloc(metadata->dump_buf_bytes, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + dump_buf->metadata = metadata; + dump_buf->dump_buf = buf; + return 0; +} +KBASE_EXPORT_TEST_API(kbase_hwcnt_dump_buffer_alloc); + +void kbase_hwcnt_dump_buffer_free(struct kbase_hwcnt_dump_buffer *dump_buf) +{ + if (!dump_buf) + return; + + kfree(dump_buf->dump_buf); + memset(dump_buf, 0, sizeof(*dump_buf)); +} +KBASE_EXPORT_TEST_API(kbase_hwcnt_dump_buffer_free); + +int kbase_hwcnt_dump_buffer_array_alloc( + const struct kbase_hwcnt_metadata *metadata, + size_t n, + struct kbase_hwcnt_dump_buffer_array *dump_bufs) +{ + struct kbase_hwcnt_dump_buffer *buffers; + size_t buf_idx; + unsigned int order; + unsigned long addr; + + if (!metadata || !dump_bufs) + return -EINVAL; + + /* Allocate memory for the dump buffer struct array */ + buffers = kmalloc_array(n, sizeof(*buffers), GFP_KERNEL); + if (!buffers) + return -ENOMEM; + + /* Allocate pages for the actual dump buffers, as they tend to be fairly + * large. 
+ */ + order = get_order(metadata->dump_buf_bytes * n); + addr = __get_free_pages(GFP_KERNEL, order); + + if (!addr) { + kfree(buffers); + return -ENOMEM; + } + + dump_bufs->page_addr = addr; + dump_bufs->page_order = order; + dump_bufs->buf_cnt = n; + dump_bufs->bufs = buffers; + + /* Set the buffer of each dump buf */ + for (buf_idx = 0; buf_idx < n; buf_idx++) { + const size_t offset = metadata->dump_buf_bytes * buf_idx; + + buffers[buf_idx].metadata = metadata; + buffers[buf_idx].dump_buf = (u32 *)(addr + offset); + } + + return 0; +} +KBASE_EXPORT_TEST_API(kbase_hwcnt_dump_buffer_array_alloc); + +void kbase_hwcnt_dump_buffer_array_free( + struct kbase_hwcnt_dump_buffer_array *dump_bufs) +{ + if (!dump_bufs) + return; + + kfree(dump_bufs->bufs); + free_pages(dump_bufs->page_addr, dump_bufs->page_order); + memset(dump_bufs, 0, sizeof(*dump_bufs)); +} +KBASE_EXPORT_TEST_API(kbase_hwcnt_dump_buffer_array_free); + +void kbase_hwcnt_dump_buffer_zero( + struct kbase_hwcnt_dump_buffer *dst, + const struct kbase_hwcnt_enable_map *dst_enable_map) +{ + const struct kbase_hwcnt_metadata *metadata; + size_t grp, blk, blk_inst; + + if (WARN_ON(!dst) || + WARN_ON(!dst_enable_map) || + WARN_ON(dst->metadata != dst_enable_map->metadata)) + return; + + metadata = dst->metadata; + + kbase_hwcnt_metadata_for_each_block(metadata, grp, blk, blk_inst) { + u32 *dst_blk; + size_t val_cnt; + + if (!kbase_hwcnt_enable_map_block_enabled( + dst_enable_map, grp, blk, blk_inst)) + continue; + + dst_blk = kbase_hwcnt_dump_buffer_block_instance( + dst, grp, blk, blk_inst); + val_cnt = kbase_hwcnt_metadata_block_values_count( + metadata, grp, blk); + + kbase_hwcnt_dump_buffer_block_zero(dst_blk, val_cnt); + } +} +KBASE_EXPORT_TEST_API(kbase_hwcnt_dump_buffer_zero); + +void kbase_hwcnt_dump_buffer_zero_strict( + struct kbase_hwcnt_dump_buffer *dst) +{ + if (WARN_ON(!dst)) + return; + + memset(dst->dump_buf, 0, dst->metadata->dump_buf_bytes); +} +KBASE_EXPORT_TEST_API(kbase_hwcnt_dump_buffer_zero_strict); + +void kbase_hwcnt_dump_buffer_zero_non_enabled( + struct kbase_hwcnt_dump_buffer *dst, + const struct kbase_hwcnt_enable_map *dst_enable_map) +{ + const struct kbase_hwcnt_metadata *metadata; + size_t grp, blk, blk_inst; + + if (WARN_ON(!dst) || + WARN_ON(!dst_enable_map) || + WARN_ON(dst->metadata != dst_enable_map->metadata)) + return; + + metadata = dst->metadata; + + kbase_hwcnt_metadata_for_each_block(metadata, grp, blk, blk_inst) { + u32 *dst_blk = kbase_hwcnt_dump_buffer_block_instance( + dst, grp, blk, blk_inst); + const u64 *blk_em = kbase_hwcnt_enable_map_block_instance( + dst_enable_map, grp, blk, blk_inst); + size_t val_cnt = kbase_hwcnt_metadata_block_values_count( + metadata, grp, blk); + + /* Align upwards to include padding bytes */ + val_cnt = KBASE_HWCNT_ALIGN_UPWARDS(val_cnt, + (KBASE_HWCNT_BLOCK_BYTE_ALIGNMENT / + KBASE_HWCNT_VALUE_BYTES)); + + if (kbase_hwcnt_metadata_block_instance_avail( + metadata, grp, blk, blk_inst)) { + /* Block available, so only zero non-enabled values */ + kbase_hwcnt_dump_buffer_block_zero_non_enabled( + dst_blk, blk_em, val_cnt); + } else { + /* Block not available, so zero the entire thing */ + kbase_hwcnt_dump_buffer_block_zero(dst_blk, val_cnt); + } + } +} +KBASE_EXPORT_TEST_API(kbase_hwcnt_dump_buffer_zero_non_enabled); + +void kbase_hwcnt_dump_buffer_copy( + struct kbase_hwcnt_dump_buffer *dst, + const struct kbase_hwcnt_dump_buffer *src, + const struct kbase_hwcnt_enable_map *dst_enable_map) +{ + const struct kbase_hwcnt_metadata *metadata; + size_t grp, blk, 
blk_inst; + + if (WARN_ON(!dst) || + WARN_ON(!src) || + WARN_ON(!dst_enable_map) || + WARN_ON(dst == src) || + WARN_ON(dst->metadata != src->metadata) || + WARN_ON(dst->metadata != dst_enable_map->metadata)) + return; + + metadata = dst->metadata; + + kbase_hwcnt_metadata_for_each_block(metadata, grp, blk, blk_inst) { + u32 *dst_blk; + const u32 *src_blk; + size_t val_cnt; + + if (!kbase_hwcnt_enable_map_block_enabled( + dst_enable_map, grp, blk, blk_inst)) + continue; + + dst_blk = kbase_hwcnt_dump_buffer_block_instance( + dst, grp, blk, blk_inst); + src_blk = kbase_hwcnt_dump_buffer_block_instance( + src, grp, blk, blk_inst); + val_cnt = kbase_hwcnt_metadata_block_values_count( + metadata, grp, blk); + + kbase_hwcnt_dump_buffer_block_copy(dst_blk, src_blk, val_cnt); + } +} +KBASE_EXPORT_TEST_API(kbase_hwcnt_dump_buffer_copy); + +void kbase_hwcnt_dump_buffer_copy_strict( + struct kbase_hwcnt_dump_buffer *dst, + const struct kbase_hwcnt_dump_buffer *src, + const struct kbase_hwcnt_enable_map *dst_enable_map) +{ + const struct kbase_hwcnt_metadata *metadata; + size_t grp, blk, blk_inst; + + if (WARN_ON(!dst) || + WARN_ON(!src) || + WARN_ON(!dst_enable_map) || + WARN_ON(dst == src) || + WARN_ON(dst->metadata != src->metadata) || + WARN_ON(dst->metadata != dst_enable_map->metadata)) + return; + + metadata = dst->metadata; + + kbase_hwcnt_metadata_for_each_block(metadata, grp, blk, blk_inst) { + u32 *dst_blk = kbase_hwcnt_dump_buffer_block_instance( + dst, grp, blk, blk_inst); + const u32 *src_blk = kbase_hwcnt_dump_buffer_block_instance( + src, grp, blk, blk_inst); + const u64 *blk_em = kbase_hwcnt_enable_map_block_instance( + dst_enable_map, grp, blk, blk_inst); + size_t val_cnt = kbase_hwcnt_metadata_block_values_count( + metadata, grp, blk); + /* Align upwards to include padding bytes */ + val_cnt = KBASE_HWCNT_ALIGN_UPWARDS(val_cnt, + (KBASE_HWCNT_BLOCK_BYTE_ALIGNMENT / + KBASE_HWCNT_VALUE_BYTES)); + + kbase_hwcnt_dump_buffer_block_copy_strict( + dst_blk, src_blk, blk_em, val_cnt); + } +} +KBASE_EXPORT_TEST_API(kbase_hwcnt_dump_buffer_copy_strict); + +void kbase_hwcnt_dump_buffer_accumulate( + struct kbase_hwcnt_dump_buffer *dst, + const struct kbase_hwcnt_dump_buffer *src, + const struct kbase_hwcnt_enable_map *dst_enable_map) +{ + const struct kbase_hwcnt_metadata *metadata; + size_t grp, blk, blk_inst; + + if (WARN_ON(!dst) || + WARN_ON(!src) || + WARN_ON(!dst_enable_map) || + WARN_ON(dst == src) || + WARN_ON(dst->metadata != src->metadata) || + WARN_ON(dst->metadata != dst_enable_map->metadata)) + return; + + metadata = dst->metadata; + + kbase_hwcnt_metadata_for_each_block(metadata, grp, blk, blk_inst) { + u32 *dst_blk; + const u32 *src_blk; + size_t hdr_cnt; + size_t ctr_cnt; + + if (!kbase_hwcnt_enable_map_block_enabled( + dst_enable_map, grp, blk, blk_inst)) + continue; + + dst_blk = kbase_hwcnt_dump_buffer_block_instance( + dst, grp, blk, blk_inst); + src_blk = kbase_hwcnt_dump_buffer_block_instance( + src, grp, blk, blk_inst); + hdr_cnt = kbase_hwcnt_metadata_block_headers_count( + metadata, grp, blk); + ctr_cnt = kbase_hwcnt_metadata_block_counters_count( + metadata, grp, blk); + + kbase_hwcnt_dump_buffer_block_accumulate( + dst_blk, src_blk, hdr_cnt, ctr_cnt); + } +} +KBASE_EXPORT_TEST_API(kbase_hwcnt_dump_buffer_accumulate); + +void kbase_hwcnt_dump_buffer_accumulate_strict( + struct kbase_hwcnt_dump_buffer *dst, + const struct kbase_hwcnt_dump_buffer *src, + const struct kbase_hwcnt_enable_map *dst_enable_map) +{ + const struct kbase_hwcnt_metadata *metadata; + size_t grp, 
blk, blk_inst; + + if (WARN_ON(!dst) || + WARN_ON(!src) || + WARN_ON(!dst_enable_map) || + WARN_ON(dst == src) || + WARN_ON(dst->metadata != src->metadata) || + WARN_ON(dst->metadata != dst_enable_map->metadata)) + return; + + metadata = dst->metadata; + + kbase_hwcnt_metadata_for_each_block(metadata, grp, blk, blk_inst) { + u32 *dst_blk = kbase_hwcnt_dump_buffer_block_instance( + dst, grp, blk, blk_inst); + const u32 *src_blk = kbase_hwcnt_dump_buffer_block_instance( + src, grp, blk, blk_inst); + const u64 *blk_em = kbase_hwcnt_enable_map_block_instance( + dst_enable_map, grp, blk, blk_inst); + size_t hdr_cnt = kbase_hwcnt_metadata_block_headers_count( + metadata, grp, blk); + size_t ctr_cnt = kbase_hwcnt_metadata_block_counters_count( + metadata, grp, blk); + /* Align upwards to include padding bytes */ + ctr_cnt = KBASE_HWCNT_ALIGN_UPWARDS(hdr_cnt + ctr_cnt, + (KBASE_HWCNT_BLOCK_BYTE_ALIGNMENT / + KBASE_HWCNT_VALUE_BYTES) - hdr_cnt); + + kbase_hwcnt_dump_buffer_block_accumulate_strict( + dst_blk, src_blk, blk_em, hdr_cnt, ctr_cnt); + } +} +KBASE_EXPORT_TEST_API(kbase_hwcnt_dump_buffer_accumulate_strict); diff --git a/mali_kbase/mali_kbase_hwcnt_types.h b/mali_kbase/mali_kbase_hwcnt_types.h new file mode 100644 index 0000000..4d78c84 --- /dev/null +++ b/mali_kbase/mali_kbase_hwcnt_types.h @@ -0,0 +1,1087 @@ +/* + * + * (C) COPYRIGHT 2018 ARM Limited. All rights reserved. + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU licence. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * SPDX-License-Identifier: GPL-2.0 + * + */ + +/** + * Hardware counter types. + * Contains structures for describing the physical layout of hardware counter + * dump buffers and enable maps within a system. + * + * Also contains helper functions for manipulation of these dump buffers and + * enable maps. + * + * Through use of these structures and functions, hardware counters can be + * enabled, copied, accumulated, and generally manipulated in a generic way, + * regardless of the physical counter dump layout. + * + * Terminology: + * + * Hardware Counter System: + * A collection of hardware counter groups, making a full hardware counter + * system. + * Hardware Counter Group: + * A group of Hardware Counter Blocks (e.g. a t62x might have more than one + * core group, so has one counter group per core group, where each group + * may have a different number and layout of counter blocks). + * Hardware Counter Block: + * A block of hardware counters (e.g. shader block, tiler block). + * Hardware Counter Block Instance: + * An instance of a Hardware Counter Block (e.g. an MP4 GPU might have + * 4 shader block instances). + * + * Block Header: + * A header value inside a counter block. Headers don't count anything, + * so it is only valid to copy or zero them. Headers are always the first + * values in the block. + * Block Counter: + * A counter value inside a counter block. Counters can be zeroed, copied, + * or accumulated. 
Counters are always immediately after the headers in the + * block. + * Block Value: + * A catch-all term for block headers and block counters. + * + * Enable Map: + * An array of u64 bitfields, where each bit either enables exactly one + * block value, or is unused (padding). + * Dump Buffer: + * An array of u32 values, where each u32 corresponds either to one block + * value, or is unused (padding). + * Availability Mask: + * A bitfield, where each bit corresponds to whether a block instance is + * physically available (e.g. an MP3 GPU may have a sparse core mask of + * 0b1011, meaning it only has 3 cores but for hardware counter dumps has the + * same dump buffer layout as an MP4 GPU with a core mask of 0b1111. In this + * case, the availability mask might be 0b1011111 (the exact layout will + * depend on the specific hardware architecture), with the 3 extra early bits + * corresponding to other block instances in the hardware counter system). + * Metadata: + * Structure describing the physical layout of the enable map and dump buffers + * for a specific hardware counter system. + * + */ + +#ifndef _KBASE_HWCNT_TYPES_H_ +#define _KBASE_HWCNT_TYPES_H_ + +#include <linux/bitops.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/types.h> +#include "mali_malisw.h" + +/* Number of bytes in each bitfield */ +#define KBASE_HWCNT_BITFIELD_BYTES (sizeof(u64)) + +/* Number of bits in each bitfield */ +#define KBASE_HWCNT_BITFIELD_BITS (KBASE_HWCNT_BITFIELD_BYTES * BITS_PER_BYTE) + +/* Number of bytes for each counter value */ +#define KBASE_HWCNT_VALUE_BYTES (sizeof(u32)) + +/* Number of bits in an availability mask (i.e. max total number of block + * instances supported in a Hardware Counter System) + */ +#define KBASE_HWCNT_AVAIL_MASK_BITS (sizeof(u64) * BITS_PER_BYTE) + +/** + * struct kbase_hwcnt_block_description - Description of one or more identical, + * contiguous, Hardware Counter Blocks. + * @type: The arbitrary identifier used to identify the type of the block. + * @inst_cnt: The number of Instances of the block. + * @hdr_cnt: The number of 32-bit Block Headers in the block. + * @ctr_cnt: The number of 32-bit Block Counters in the block. + */ +struct kbase_hwcnt_block_description { + u64 type; + size_t inst_cnt; + size_t hdr_cnt; + size_t ctr_cnt; +}; + +/** + * struct kbase_hwcnt_group_description - Description of one or more identical, + * contiguous Hardware Counter Groups. + * @type: The arbitrary identifier used to identify the type of the group. + * @blk_cnt: The number of types of Hardware Counter Block in the group. + * @blks: Non-NULL pointer to an array of blk_cnt block descriptions, + * describing each type of Hardware Counter Block in the group. + */ +struct kbase_hwcnt_group_description { + u64 type; + size_t blk_cnt; + const struct kbase_hwcnt_block_description *blks; +}; + +/** + * struct kbase_hwcnt_description - Description of a Hardware Counter System. + * @grp_cnt: The number of Hardware Counter Groups. + * @grps: Non-NULL pointer to an array of grp_cnt group descriptions, + * describing each Hardware Counter Group in the system. + * @avail_mask: Flat Availability Mask for all block instances in the system. + */ +struct kbase_hwcnt_description { + size_t grp_cnt; + const struct kbase_hwcnt_group_description *grps; + u64 avail_mask; +}; + +/** + * struct kbase_hwcnt_block_metadata - Metadata describing the physical layout + * of a block in a Hardware Counter System's + * Dump Buffers and Enable Maps. 
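To make the description structures concrete, a hypothetical single-group system with one front-end block and four shader-style blocks could be described as below. The type IDs, counts and availability mask are illustrative only, not values taken from this patch; kbase_hwcnt_metadata_create() would then derive the packed layout from such a description.

#include "mali_kbase_hwcnt_types.h"

/* Illustrative only: one group holding two block types, each block having
 * 4 headers and 60 counters (64 values, matching the block alignment).
 */
static const struct kbase_hwcnt_block_description example_blks[] = {
	{ .type = 0x1, .inst_cnt = 1, .hdr_cnt = 4, .ctr_cnt = 60 },
	{ .type = 0x2, .inst_cnt = 4, .hdr_cnt = 4, .ctr_cnt = 60 },
};

static const struct kbase_hwcnt_group_description example_grp = {
	.type = 0x10,
	.blk_cnt = ARRAY_SIZE(example_blks),
	.blks = example_blks,
};

static const struct kbase_hwcnt_description example_desc = {
	.grp_cnt = 1,
	.grps = &example_grp,
	/* Five block instances in total, all physically present. */
	.avail_mask = 0x1f,
};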
+ * @type: The arbitrary identifier used to identify the type of the + * block. + * @inst_cnt: The number of Instances of the block. + * @hdr_cnt: The number of 32-bit Block Headers in the block. + * @ctr_cnt: The number of 32-bit Block Counters in the block. + * @enable_map_index: Index in u64s into the parent's Enable Map where the + * Enable Map bitfields of the Block Instances described by + * this metadata start. + * @enable_map_stride: Stride in u64s between the Enable Maps of each of the + * Block Instances described by this metadata. + * @dump_buf_index: Index in u32s into the parent's Dump Buffer where the + * Dump Buffers of the Block Instances described by this + * metadata start. + * @dump_buf_stride: Stride in u32s between the Dump Buffers of each of the + * Block Instances described by this metadata. + * @avail_mask_index: Index in bits into the parent's Availability Mask where + * the Availability Masks of the Block Instances described + * by this metadata start. + */ +struct kbase_hwcnt_block_metadata { + u64 type; + size_t inst_cnt; + size_t hdr_cnt; + size_t ctr_cnt; + size_t enable_map_index; + size_t enable_map_stride; + size_t dump_buf_index; + size_t dump_buf_stride; + size_t avail_mask_index; +}; + +/** + * struct kbase_hwcnt_group_metadata - Metadata describing the physical layout + * of a group of blocks in a Hardware + * Counter System's Dump Buffers and Enable + * Maps. + * @type: The arbitrary identifier used to identify the type of the + * group. + * @blk_cnt: The number of types of Hardware Counter Block in the + * group. + * @blk_metadata: Non-NULL pointer to an array of blk_cnt block metadata, + * describing the physical layout of each type of Hardware + * Counter Block in the group. + * @enable_map_index: Index in u64s into the parent's Enable Map where the + * Enable Maps of the blocks within the group described by + * this metadata start. + * @dump_buf_index: Index in u32s into the parent's Dump Buffer where the + * Dump Buffers of the blocks within the group described by + * metadata start. + * @avail_mask_index: Index in bits into the parent's Availability Mask where + * the Availability Masks of the blocks within the group + * described by this metadata start. + */ +struct kbase_hwcnt_group_metadata { + u64 type; + size_t blk_cnt; + const struct kbase_hwcnt_block_metadata *blk_metadata; + size_t enable_map_index; + size_t dump_buf_index; + size_t avail_mask_index; +}; + +/** + * struct kbase_hwcnt_metadata - Metadata describing the physical layout + * of Dump Buffers and Enable Maps within a + * Hardware Counter System. + * @grp_cnt: The number of Hardware Counter Groups. + * @grp_metadata: Non-NULL pointer to an array of grp_cnt group metadata, + * describing the physical layout of each Hardware Counter + * Group in the system. + * @enable_map_bytes: The size in bytes of an Enable Map needed for the system. + * @dump_buf_bytes: The size in bytes of a Dump Buffer needed for the system. + * @avail_mask: The Availability Mask for the system. + */ +struct kbase_hwcnt_metadata { + size_t grp_cnt; + const struct kbase_hwcnt_group_metadata *grp_metadata; + size_t enable_map_bytes; + size_t dump_buf_bytes; + u64 avail_mask; +}; + +/** + * struct kbase_hwcnt_enable_map - Hardware Counter Enable Map. Array of u64 + * bitfields. + * @metadata: Non-NULL pointer to metadata used to identify, and to describe + * the layout of the enable map. 
+ * @enable_map: Non-NULL pointer of size metadata->enable_map_bytes to an array + * of u64 bitfields, each bit of which enables one hardware + * counter. + */ +struct kbase_hwcnt_enable_map { + const struct kbase_hwcnt_metadata *metadata; + u64 *enable_map; +}; + +/** + * struct kbase_hwcnt_dump_buffer - Hardware Counter Dump Buffer. Array of u32 + * values. + * @metadata: Non-NULL pointer to metadata used to identify, and to describe + * the layout of the Dump Buffer. + * @dump_buf: Non-NULL pointer of size metadata->dump_buf_bytes to an array + * of u32 values. + */ +struct kbase_hwcnt_dump_buffer { + const struct kbase_hwcnt_metadata *metadata; + u32 *dump_buf; +}; + +/** + * struct kbase_hwcnt_dump_buffer_array - Hardware Counter Dump Buffer array. + * @page_addr: Address of allocated pages. A single allocation is used for all + * Dump Buffers in the array. + * @page_order: The allocation order of the pages. + * @buf_cnt: The number of allocated Dump Buffers. + * @bufs: Non-NULL pointer to the array of Dump Buffers. + */ +struct kbase_hwcnt_dump_buffer_array { + unsigned long page_addr; + unsigned int page_order; + size_t buf_cnt; + struct kbase_hwcnt_dump_buffer *bufs; +}; + +/** + * kbase_hwcnt_metadata_create() - Create a hardware counter metadata object + * from a description. + * @desc: Non-NULL pointer to a hardware counter description. + * @metadata: Non-NULL pointer to where created metadata will be stored on + * success. + * + * Return: 0 on success, else error code. + */ +int kbase_hwcnt_metadata_create( + const struct kbase_hwcnt_description *desc, + const struct kbase_hwcnt_metadata **metadata); + +/** + * kbase_hwcnt_metadata_destroy() - Destroy a hardware counter metadata object. + * @metadata: Pointer to hardware counter metadata + */ +void kbase_hwcnt_metadata_destroy(const struct kbase_hwcnt_metadata *metadata); + +/** + * kbase_hwcnt_metadata_group_count() - Get the number of groups. + * @metadata: Non-NULL pointer to metadata. + * + * Return: Number of hardware counter groups described by metadata. + */ +#define kbase_hwcnt_metadata_group_count(metadata) \ + ((metadata)->grp_cnt) + +/** + * kbase_hwcnt_metadata_group_type() - Get the arbitrary type of a group. + * @metadata: Non-NULL pointer to metadata. + * @grp: Index of the group in the metadata. + * + * Return: Type of the group grp. + */ +#define kbase_hwcnt_metadata_group_type(metadata, grp) \ + ((metadata)->grp_metadata[(grp)].type) + +/** + * kbase_hwcnt_metadata_block_count() - Get the number of blocks in a group. + * @metadata: Non-NULL pointer to metadata. + * @grp: Index of the group in the metadata. + * + * Return: Number of blocks in group grp. + */ +#define kbase_hwcnt_metadata_block_count(metadata, grp) \ + ((metadata)->grp_metadata[(grp)].blk_cnt) + +/** + * kbase_hwcnt_metadata_block_type() - Get the arbitrary type of a block. + * @metadata: Non-NULL pointer to metadata. + * @grp: Index of the group in the metadata. + * @blk: Index of the block in the group. + * + * Return: Type of the block blk in group grp. + */ +#define kbase_hwcnt_metadata_block_type(metadata, grp, blk) \ + ((metadata)->grp_metadata[(grp)].blk_metadata[(blk)].type) + +/** + * kbase_hwcnt_metadata_block_instance_count() - Get the number of instances of + * a block. + * @metadata: Non-NULL pointer to metadata. + * @grp: Index of the group in the metadata. + * @blk: Index of the block in the group. + * + * Return: Number of instances of block blk in group grp. 
+ */ +#define kbase_hwcnt_metadata_block_instance_count(metadata, grp, blk) \ + ((metadata)->grp_metadata[(grp)].blk_metadata[(blk)].inst_cnt) + +/** + * kbase_hwcnt_metadata_block_headers_count() - Get the number of counter + * headers. + * @metadata: Non-NULL pointer to metadata. + * @grp: Index of the group in the metadata. + * @blk: Index of the block in the group. + * + * Return: Number of u32 counter headers in each instance of block blk in + * group grp. + */ +#define kbase_hwcnt_metadata_block_headers_count(metadata, grp, blk) \ + ((metadata)->grp_metadata[(grp)].blk_metadata[(blk)].hdr_cnt) + +/** + * kbase_hwcnt_metadata_block_counters_count() - Get the number of counters. + * @metadata: Non-NULL pointer to metadata. + * @grp: Index of the group in the metadata. + * @blk: Index of the block in the group. + * + * Return: Number of u32 counters in each instance of block blk in group + * grp. + */ +#define kbase_hwcnt_metadata_block_counters_count(metadata, grp, blk) \ + ((metadata)->grp_metadata[(grp)].blk_metadata[(blk)].ctr_cnt) + +/** + * kbase_hwcnt_metadata_block_values_count() - Get the number of values. + * @metadata: Non-NULL pointer to metadata. + * @grp: Index of the group in the metadata. + * @blk: Index of the block in the group. + * + * Return: Number of u32 headers plus counters in each instance of block blk + * in group grp. + */ +#define kbase_hwcnt_metadata_block_values_count(metadata, grp, blk) \ + (kbase_hwcnt_metadata_block_counters_count((metadata), (grp), (blk)) \ + + kbase_hwcnt_metadata_block_headers_count((metadata), (grp), (blk))) + +/** + * kbase_hwcnt_metadata_for_each_block() - Iterate over each block instance in + * the metadata. + * @md: Non-NULL pointer to metadata. + * @grp: size_t variable used as group iterator. + * @blk: size_t variable used as block iterator. + * @blk_inst: size_t variable used as block instance iterator. + * + * Iteration order is group, then block, then block instance (i.e. linearly + * through memory). + */ +#define kbase_hwcnt_metadata_for_each_block(md, grp, blk, blk_inst) \ + for ((grp) = 0; (grp) < kbase_hwcnt_metadata_group_count((md)); (grp)++) \ + for ((blk) = 0; (blk) < kbase_hwcnt_metadata_block_count((md), (grp)); (blk)++) \ + for ((blk_inst) = 0; (blk_inst) < kbase_hwcnt_metadata_block_instance_count((md), (grp), (blk)); (blk_inst)++) + +/** + * kbase_hwcnt_metadata_block_avail_bit() - Get the bit index into the avail + * mask corresponding to the block. + * @metadata: Non-NULL pointer to metadata. + * @grp: Index of the group in the metadata. + * @blk: Index of the block in the group. + * + * Return: The bit index into the avail mask for the block. + */ +static inline size_t kbase_hwcnt_metadata_block_avail_bit( + const struct kbase_hwcnt_metadata *metadata, + size_t grp, + size_t blk) +{ + const size_t bit = + metadata->grp_metadata[grp].avail_mask_index + + metadata->grp_metadata[grp].blk_metadata[blk].avail_mask_index; + + return bit; +} + +/** + * kbase_hwcnt_metadata_block_instance_avail() - Check if a block instance is + * available. + * @metadata: Non-NULL pointer to metadata. + * @grp: Index of the group in the metadata. + * @blk: Index of the block in the group. + * @blk_inst: Index of the block instance in the block. + * + * Return: true if the block instance is available, else false. 
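A small sketch of how the iterator macro pairs with the availability helper documented here; example_count_avail() is hypothetical, everything else comes from this header.

#include "mali_kbase_hwcnt_types.h"

/* Hypothetical helper: count the block instances that are physically
 * present according to the availability mask.
 */
static size_t example_count_avail(const struct kbase_hwcnt_metadata *md)
{
	size_t grp, blk, blk_inst;
	size_t avail = 0;

	kbase_hwcnt_metadata_for_each_block(md, grp, blk, blk_inst) {
		if (kbase_hwcnt_metadata_block_instance_avail(
				md, grp, blk, blk_inst))
			avail++;
	}

	return avail;
}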
+ */ +static inline bool kbase_hwcnt_metadata_block_instance_avail( + const struct kbase_hwcnt_metadata *metadata, + size_t grp, + size_t blk, + size_t blk_inst) +{ + const size_t bit = kbase_hwcnt_metadata_block_avail_bit( + metadata, grp, blk) + blk_inst; + const u64 mask = 1ull << bit; + + return (metadata->avail_mask & mask) != 0; +} + +/** + * kbase_hwcnt_enable_map_alloc() - Allocate an enable map. + * @metadata: Non-NULL pointer to metadata describing the system. + * @enable_map: Non-NULL pointer to enable map to be initialised. Will be + * initialised to all zeroes (i.e. all counters disabled). + * + * Return: 0 on success, else error code. + */ +int kbase_hwcnt_enable_map_alloc( + const struct kbase_hwcnt_metadata *metadata, + struct kbase_hwcnt_enable_map *enable_map); + +/** + * kbase_hwcnt_enable_map_free() - Free an enable map. + * @enable_map: Enable map to be freed. + * + * Can be safely called on an all-zeroed enable map structure, or on an already + * freed enable map. + */ +void kbase_hwcnt_enable_map_free(struct kbase_hwcnt_enable_map *enable_map); + +/** + * kbase_hwcnt_enable_map_block_instance() - Get the pointer to a block + * instance's enable map. + * @map: Non-NULL pointer to (const) enable map. + * @grp: Index of the group in the metadata. + * @blk: Index of the block in the group. + * @blk_inst: Index of the block instance in the block. + * + * Return: (const) u64* to the bitfield(s) used as the enable map for the + * block instance. + */ +#define kbase_hwcnt_enable_map_block_instance(map, grp, blk, blk_inst) \ + ((map)->enable_map + \ + (map)->metadata->grp_metadata[(grp)].enable_map_index + \ + (map)->metadata->grp_metadata[(grp)].blk_metadata[(blk)].enable_map_index + \ + (map)->metadata->grp_metadata[(grp)].blk_metadata[(blk)].enable_map_stride * (blk_inst)) + +/** + * kbase_hwcnt_bitfield_count() - Calculate the number of u64 bitfields required + * to have at minimum one bit per value. + * @val_cnt: Number of values. + * + * Return: Number of required bitfields. + */ +static inline size_t kbase_hwcnt_bitfield_count(size_t val_cnt) +{ + return (val_cnt + KBASE_HWCNT_BITFIELD_BITS - 1) / + KBASE_HWCNT_BITFIELD_BITS; +} + +/** + * kbase_hwcnt_enable_map_block_disable_all() - Disable all values in a block. + * @dst: Non-NULL pointer to enable map. + * @grp: Index of the group in the metadata. + * @blk: Index of the block in the group. + * @blk_inst: Index of the block instance in the block. + */ +static inline void kbase_hwcnt_enable_map_block_disable_all( + struct kbase_hwcnt_enable_map *dst, + size_t grp, + size_t blk, + size_t blk_inst) +{ + const size_t val_cnt = kbase_hwcnt_metadata_block_values_count( + dst->metadata, grp, blk); + const size_t bitfld_cnt = kbase_hwcnt_bitfield_count(val_cnt); + u64 *block_enable_map = kbase_hwcnt_enable_map_block_instance( + dst, grp, blk, blk_inst); + + memset(block_enable_map, 0, bitfld_cnt * KBASE_HWCNT_BITFIELD_BYTES); +} + +/** + * kbase_hwcnt_enable_map_disable_all() - Disable all values in the enable map. + * @dst: Non-NULL pointer to enable map to zero. + */ +static inline void kbase_hwcnt_enable_map_disable_all( + struct kbase_hwcnt_enable_map *dst) +{ + memset(dst->enable_map, 0, dst->metadata->enable_map_bytes); +} + +/** + * kbase_hwcnt_enable_map_block_enable_all() - Enable all values in a block. + * @dst: Non-NULL pointer to enable map. + * @grp: Index of the group in the metadata. + * @blk: Index of the block in the group. + * @blk_inst: Index of the block instance in the block. 
+ */
+static inline void kbase_hwcnt_enable_map_block_enable_all(
+ struct kbase_hwcnt_enable_map *dst,
+ size_t grp,
+ size_t blk,
+ size_t blk_inst)
+{
+ const size_t val_cnt = kbase_hwcnt_metadata_block_values_count(
+ dst->metadata, grp, blk);
+ const size_t bitfld_cnt = kbase_hwcnt_bitfield_count(val_cnt);
+ u64 *block_enable_map = kbase_hwcnt_enable_map_block_instance(
+ dst, grp, blk, blk_inst);
+
+ size_t bitfld_idx;
+
+ for (bitfld_idx = 0; bitfld_idx < bitfld_cnt; bitfld_idx++) {
+ const u64 remaining_values = val_cnt -
+ (bitfld_idx * KBASE_HWCNT_BITFIELD_BITS);
+ u64 block_enable_map_mask = U64_MAX;
+
+ if (remaining_values < KBASE_HWCNT_BITFIELD_BITS)
+ block_enable_map_mask = (1ull << remaining_values) - 1;
+
+ block_enable_map[bitfld_idx] = block_enable_map_mask;
+ }
+}
+
+/**
+ * kbase_hwcnt_enable_map_enable_all() - Enable all values in an enable
+ * map.
+ * @dst: Non-NULL pointer to enable map.
+ */
+static inline void kbase_hwcnt_enable_map_enable_all(
+ struct kbase_hwcnt_enable_map *dst)
+{
+ size_t grp, blk, blk_inst;
+
+ kbase_hwcnt_metadata_for_each_block(dst->metadata, grp, blk, blk_inst)
+ kbase_hwcnt_enable_map_block_enable_all(
+ dst, grp, blk, blk_inst);
+}
+
+/**
+ * kbase_hwcnt_enable_map_copy() - Copy an enable map to another.
+ * @dst: Non-NULL pointer to destination enable map.
+ * @src: Non-NULL pointer to source enable map.
+ *
+ * The dst and src MUST have been created from the same metadata.
+ */
+static inline void kbase_hwcnt_enable_map_copy(
+ struct kbase_hwcnt_enable_map *dst,
+ const struct kbase_hwcnt_enable_map *src)
+{
+ memcpy(dst->enable_map,
+ src->enable_map,
+ dst->metadata->enable_map_bytes);
+}
+
+/**
+ * kbase_hwcnt_enable_map_union() - Union dst and src enable maps into dst.
+ * @dst: Non-NULL pointer to destination enable map.
+ * @src: Non-NULL pointer to source enable map.
+ *
+ * The dst and src MUST have been created from the same metadata.
+ */
+static inline void kbase_hwcnt_enable_map_union(
+ struct kbase_hwcnt_enable_map *dst,
+ const struct kbase_hwcnt_enable_map *src)
+{
+ const size_t bitfld_count =
+ dst->metadata->enable_map_bytes / KBASE_HWCNT_BITFIELD_BYTES;
+ size_t i;
+
+ for (i = 0; i < bitfld_count; i++)
+ dst->enable_map[i] |= src->enable_map[i];
+}
+
+/**
+ * kbase_hwcnt_enable_map_block_enabled() - Check if any values in a block
+ * instance are enabled.
+ * @enable_map: Non-NULL pointer to enable map.
+ * @grp: Index of the group in the metadata.
+ * @blk: Index of the block in the group.
+ * @blk_inst: Index of the block instance in the block.
+ *
+ * Return: true if any values in the block are enabled, else false.
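As a brief sketch of the copy and union helpers working together, a hypothetical example_combine_requests() merges two clients' requests into one map that could then drive the hardware; all three maps are assumed to share the same metadata.

#include "mali_kbase_hwcnt_types.h"

/* Hypothetical helper: derive the combined enable map for two clients. */
static void example_combine_requests(struct kbase_hwcnt_enable_map *dst,
	const struct kbase_hwcnt_enable_map *client_a,
	const struct kbase_hwcnt_enable_map *client_b)
{
	kbase_hwcnt_enable_map_copy(dst, client_a);
	kbase_hwcnt_enable_map_union(dst, client_b);
}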
+ */ +static inline bool kbase_hwcnt_enable_map_block_enabled( + const struct kbase_hwcnt_enable_map *enable_map, + size_t grp, + size_t blk, + size_t blk_inst) +{ + bool any_enabled = false; + const size_t val_cnt = kbase_hwcnt_metadata_block_values_count( + enable_map->metadata, grp, blk); + const size_t bitfld_cnt = kbase_hwcnt_bitfield_count(val_cnt); + const u64 *block_enable_map = kbase_hwcnt_enable_map_block_instance( + enable_map, grp, blk, blk_inst); + + size_t bitfld_idx; + + for (bitfld_idx = 0; bitfld_idx < bitfld_cnt; bitfld_idx++) { + const u64 remaining_values = val_cnt - + (bitfld_idx * KBASE_HWCNT_BITFIELD_BITS); + u64 block_enable_map_mask = U64_MAX; + + if (remaining_values < KBASE_HWCNT_BITFIELD_BITS) + block_enable_map_mask = (1ull << remaining_values) - 1; + + any_enabled = any_enabled || + (block_enable_map[bitfld_idx] & block_enable_map_mask); + } + + return any_enabled; +} + +/** + * kbase_hwcnt_enable_map_any_enabled() - Check if any values are enabled. + * @enable_map: Non-NULL pointer to enable map. + * + * Return: true if any values are enabled, else false. + */ +static inline bool kbase_hwcnt_enable_map_any_enabled( + const struct kbase_hwcnt_enable_map *enable_map) +{ + size_t grp, blk, blk_inst; + + kbase_hwcnt_metadata_for_each_block( + enable_map->metadata, grp, blk, blk_inst) { + if (kbase_hwcnt_enable_map_block_enabled( + enable_map, grp, blk, blk_inst)) + return true; + } + + return false; +} + +/** + * kbase_hwcnt_enable_map_block_value_enabled() - Check if a value in a block + * instance is enabled. + * @bitfld: Non-NULL pointer to the block bitfield(s) obtained from a call to + * kbase_hwcnt_enable_map_block_instance. + * @val_idx: Index of the value to check in the block instance. + * + * Return: true if the value was enabled, else false. + */ +static inline bool kbase_hwcnt_enable_map_block_value_enabled( + const u64 *bitfld, + size_t val_idx) +{ + const size_t idx = val_idx / KBASE_HWCNT_BITFIELD_BITS; + const size_t bit = val_idx % KBASE_HWCNT_BITFIELD_BITS; + const u64 mask = 1ull << bit; + + return (bitfld[idx] & mask) != 0; +} + +/** + * kbase_hwcnt_enable_map_block_enable_value() - Enable a value in a block + * instance. + * @bitfld: Non-NULL pointer to the block bitfield(s) obtained from a call to + * kbase_hwcnt_enable_map_block_instance. + * @val_idx: Index of the value to enable in the block instance. + */ +static inline void kbase_hwcnt_enable_map_block_enable_value( + u64 *bitfld, + size_t val_idx) +{ + const size_t idx = val_idx / KBASE_HWCNT_BITFIELD_BITS; + const size_t bit = val_idx % KBASE_HWCNT_BITFIELD_BITS; + const u64 mask = 1ull << bit; + + bitfld[idx] |= mask; +} + +/** + * kbase_hwcnt_enable_map_block_disable_value() - Disable a value in a block + * instance. + * @bitfld: Non-NULL pointer to the block bitfield(s) obtained from a call to + * kbase_hwcnt_enable_map_block_instance. + * @val_idx: Index of the value to disable in the block instance. + */ +static inline void kbase_hwcnt_enable_map_block_disable_value( + u64 *bitfld, + size_t val_idx) +{ + const size_t idx = val_idx / KBASE_HWCNT_BITFIELD_BITS; + const size_t bit = val_idx % KBASE_HWCNT_BITFIELD_BITS; + const u64 mask = 1ull << bit; + + bitfld[idx] &= ~mask; +} + +/** + * kbase_hwcnt_dump_buffer_alloc() - Allocate a dump buffer. + * @metadata: Non-NULL pointer to metadata describing the system. + * @dump_buf: Non-NULL pointer to dump buffer to be initialised. Will be + * initialised to undefined values, so must be used as a copy dest, + * or cleared before use. 
+ * + * Return: 0 on success, else error code. + */ +int kbase_hwcnt_dump_buffer_alloc( + const struct kbase_hwcnt_metadata *metadata, + struct kbase_hwcnt_dump_buffer *dump_buf); + +/** + * kbase_hwcnt_dump_buffer_free() - Free a dump buffer. + * @dump_buf: Dump buffer to be freed. + * + * Can be safely called on an all-zeroed dump buffer structure, or on an already + * freed dump buffer. + */ +void kbase_hwcnt_dump_buffer_free(struct kbase_hwcnt_dump_buffer *dump_buf); + +/** + * kbase_hwcnt_dump_buffer_array_alloc() - Allocate an array of dump buffers. + * @metadata: Non-NULL pointer to metadata describing the system. + * @n: Number of dump buffers to allocate + * @dump_bufs: Non-NULL pointer to dump buffer array to be initialised. Each + * dump buffer in the array will be initialised to undefined values, + * so must be used as a copy dest, or cleared before use. + * + * A single contiguous page allocation will be used for all of the buffers + * inside the array, where: + * dump_bufs[n].dump_buf == page_addr + n * metadata.dump_buf_bytes + * + * Return: 0 on success, else error code. + */ +int kbase_hwcnt_dump_buffer_array_alloc( + const struct kbase_hwcnt_metadata *metadata, + size_t n, + struct kbase_hwcnt_dump_buffer_array *dump_bufs); + +/** + * kbase_hwcnt_dump_buffer_array_free() - Free a dump buffer array. + * @dump_bufs: Dump buffer array to be freed. + * + * Can be safely called on an all-zeroed dump buffer array structure, or on an + * already freed dump buffer array. + */ +void kbase_hwcnt_dump_buffer_array_free( + struct kbase_hwcnt_dump_buffer_array *dump_bufs); + +/** + * kbase_hwcnt_dump_buffer_block_instance() - Get the pointer to a block + * instance's dump buffer. + * @buf: Non-NULL pointer to (const) dump buffer. + * @grp: Index of the group in the metadata. + * @blk: Index of the block in the group. + * @blk_inst: Index of the block instance in the block. + * + * Return: (const) u32* to the dump buffer for the block instance. + */ +#define kbase_hwcnt_dump_buffer_block_instance(buf, grp, blk, blk_inst) \ + ((buf)->dump_buf + \ + (buf)->metadata->grp_metadata[(grp)].dump_buf_index + \ + (buf)->metadata->grp_metadata[(grp)].blk_metadata[(blk)].dump_buf_index + \ + (buf)->metadata->grp_metadata[(grp)].blk_metadata[(blk)].dump_buf_stride * (blk_inst)) + +/** + * kbase_hwcnt_dump_buffer_zero() - Zero all enabled values in dst. + * After the operation, all non-enabled values + * will be undefined. + * @dst: Non-NULL pointer to dump buffer. + * @dst_enable_map: Non-NULL pointer to enable map specifying enabled values. + * + * The dst and dst_enable_map MUST have been created from the same metadata. + */ +void kbase_hwcnt_dump_buffer_zero( + struct kbase_hwcnt_dump_buffer *dst, + const struct kbase_hwcnt_enable_map *dst_enable_map); + +/** + * kbase_hwcnt_dump_buffer_block_zero() - Zero all values in a block. + * @dst_blk: Non-NULL pointer to dst block obtained from a call to + * kbase_hwcnt_dump_buffer_block_instance. + * @val_cnt: Number of values in the block. + */ +static inline void kbase_hwcnt_dump_buffer_block_zero( + u32 *dst_blk, + size_t val_cnt) +{ + memset(dst_blk, 0, (val_cnt * KBASE_HWCNT_VALUE_BYTES)); +} + +/** + * kbase_hwcnt_dump_buffer_zero_strict() - Zero all values in dst. + * After the operation, all values + * (including padding bytes) will be + * zero. + * Slower than the non-strict variant. + * @dst: Non-NULL pointer to dump buffer. 
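A usage sketch of the two allocators; example_alloc_pair() is hypothetical and assumes the metadata came from kbase_hwcnt_metadata_create() or the GPU-specific helper.

#include "mali_kbase_hwcnt_types.h"

/* Hypothetical helper: allocate a matching enable map and dump buffer. */
static int example_alloc_pair(const struct kbase_hwcnt_metadata *md,
	struct kbase_hwcnt_enable_map *em,
	struct kbase_hwcnt_dump_buffer *db)
{
	int errcode = kbase_hwcnt_enable_map_alloc(md, em);

	if (errcode)
		return errcode;

	/* Dump buffer contents are undefined until zeroed or written. */
	errcode = kbase_hwcnt_dump_buffer_alloc(md, db);
	if (errcode)
		kbase_hwcnt_enable_map_free(em);

	return errcode;
}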
+ */ +void kbase_hwcnt_dump_buffer_zero_strict( + struct kbase_hwcnt_dump_buffer *dst); + +/** + * kbase_hwcnt_dump_buffer_zero_non_enabled() - Zero all non-enabled values in + * dst (including padding bytes and + * unavailable blocks). + * After the operation, all enabled + * values will be unchanged. + * @dst: Non-NULL pointer to dump buffer. + * @dst_enable_map: Non-NULL pointer to enable map specifying enabled values. + * + * The dst and dst_enable_map MUST have been created from the same metadata. + */ +void kbase_hwcnt_dump_buffer_zero_non_enabled( + struct kbase_hwcnt_dump_buffer *dst, + const struct kbase_hwcnt_enable_map *dst_enable_map); + +/** + * kbase_hwcnt_dump_buffer_block_zero_non_enabled() - Zero all non-enabled + * values in a block. + * After the operation, all + * enabled values will be + * unchanged. + * @dst_blk: Non-NULL pointer to dst block obtained from a call to + * kbase_hwcnt_dump_buffer_block_instance. + * @blk_em: Non-NULL pointer to the block bitfield(s) obtained from a call to + * kbase_hwcnt_enable_map_block_instance. + * @val_cnt: Number of values in the block. + */ +static inline void kbase_hwcnt_dump_buffer_block_zero_non_enabled( + u32 *dst_blk, + const u64 *blk_em, + size_t val_cnt) +{ + size_t val; + + for (val = 0; val < val_cnt; val++) { + if (!kbase_hwcnt_enable_map_block_value_enabled(blk_em, val)) + dst_blk[val] = 0; + } +} + +/** + * kbase_hwcnt_dump_buffer_copy() - Copy all enabled values from src to dst. + * After the operation, all non-enabled values + * will be undefined. + * @dst: Non-NULL pointer to dst dump buffer. + * @src: Non-NULL pointer to src dump buffer. + * @dst_enable_map: Non-NULL pointer to enable map specifying enabled values. + * + * The dst, src, and dst_enable_map MUST have been created from the same + * metadata. + */ +void kbase_hwcnt_dump_buffer_copy( + struct kbase_hwcnt_dump_buffer *dst, + const struct kbase_hwcnt_dump_buffer *src, + const struct kbase_hwcnt_enable_map *dst_enable_map); + +/** + * kbase_hwcnt_dump_buffer_block_copy() - Copy all block values from src to dst. + * @dst_blk: Non-NULL pointer to dst block obtained from a call to + * kbase_hwcnt_dump_buffer_block_instance. + * @src_blk: Non-NULL pointer to src block obtained from a call to + * kbase_hwcnt_dump_buffer_block_instance. + * @val_cnt: Number of values in the block. + */ +static inline void kbase_hwcnt_dump_buffer_block_copy( + u32 *dst_blk, + const u32 *src_blk, + size_t val_cnt) +{ + /* Copy all the counters in the block instance. + * Values of non-enabled counters are undefined. + */ + memcpy(dst_blk, src_blk, (val_cnt * KBASE_HWCNT_VALUE_BYTES)); +} + +/** + * kbase_hwcnt_dump_buffer_copy_strict() - Copy all enabled values from src to + * dst. + * After the operation, all non-enabled + * values (including padding bytes) will + * be zero. + * Slower than the non-strict variant. + * @dst: Non-NULL pointer to dst dump buffer. + * @src: Non-NULL pointer to src dump buffer. + * @dst_enable_map: Non-NULL pointer to enable map specifying enabled values. + * + * The dst, src, and dst_enable_map MUST have been created from the same + * metadata. + */ +void kbase_hwcnt_dump_buffer_copy_strict( + struct kbase_hwcnt_dump_buffer *dst, + const struct kbase_hwcnt_dump_buffer *src, + const struct kbase_hwcnt_enable_map *dst_enable_map); + +/** + * kbase_hwcnt_dump_buffer_block_copy_strict() - Copy all enabled block values + * from src to dst. + * After the operation, all + * non-enabled values will be + * zero. 
+ * @dst_blk: Non-NULL pointer to dst block obtained from a call to + * kbase_hwcnt_dump_buffer_block_instance. + * @src_blk: Non-NULL pointer to src block obtained from a call to + * kbase_hwcnt_dump_buffer_block_instance. + * @blk_em: Non-NULL pointer to the block bitfield(s) obtained from a call to + * kbase_hwcnt_enable_map_block_instance. + * @val_cnt: Number of values in the block. + * + * After the copy, any disabled values in dst will be zero. + */ +static inline void kbase_hwcnt_dump_buffer_block_copy_strict( + u32 *dst_blk, + const u32 *src_blk, + const u64 *blk_em, + size_t val_cnt) +{ + size_t val; + + for (val = 0; val < val_cnt; val++) { + bool val_enabled = kbase_hwcnt_enable_map_block_value_enabled( + blk_em, val); + + dst_blk[val] = val_enabled ? src_blk[val] : 0; + } +} + +/** + * kbase_hwcnt_dump_buffer_accumulate() - Copy all enabled headers and + * accumulate all enabled counters from + * src to dst. + * After the operation, all non-enabled + * values will be undefined. + * @dst: Non-NULL pointer to dst dump buffer. + * @src: Non-NULL pointer to src dump buffer. + * @dst_enable_map: Non-NULL pointer to enable map specifying enabled values. + * + * The dst, src, and dst_enable_map MUST have been created from the same + * metadata. + */ +void kbase_hwcnt_dump_buffer_accumulate( + struct kbase_hwcnt_dump_buffer *dst, + const struct kbase_hwcnt_dump_buffer *src, + const struct kbase_hwcnt_enable_map *dst_enable_map); + +/** + * kbase_hwcnt_dump_buffer_block_accumulate() - Copy all block headers and + * accumulate all block counters + * from src to dst. + * @dst_blk: Non-NULL pointer to dst block obtained from a call to + * kbase_hwcnt_dump_buffer_block_instance. + * @src_blk: Non-NULL pointer to src block obtained from a call to + * kbase_hwcnt_dump_buffer_block_instance. + * @hdr_cnt: Number of headers in the block. + * @ctr_cnt: Number of counters in the block. + */ +static inline void kbase_hwcnt_dump_buffer_block_accumulate( + u32 *dst_blk, + const u32 *src_blk, + size_t hdr_cnt, + size_t ctr_cnt) +{ + size_t ctr; + /* Copy all the headers in the block instance. + * Values of non-enabled headers are undefined. + */ + memcpy(dst_blk, src_blk, hdr_cnt * KBASE_HWCNT_VALUE_BYTES); + + /* Accumulate all the counters in the block instance. + * Values of non-enabled counters are undefined. + */ + for (ctr = hdr_cnt; ctr < ctr_cnt + hdr_cnt; ctr++) { + u32 *dst_ctr = dst_blk + ctr; + const u32 *src_ctr = src_blk + ctr; + + const u32 src_counter = *src_ctr; + const u32 dst_counter = *dst_ctr; + + /* Saturating add */ + u32 accumulated = src_counter + dst_counter; + + if (accumulated < src_counter) + accumulated = U32_MAX; + + *dst_ctr = accumulated; + } +} + +/** + * kbase_hwcnt_dump_buffer_accumulate_strict() - Copy all enabled headers and + * accumulate all enabled counters + * from src to dst. + * After the operation, all + * non-enabled values (including + * padding bytes) will be zero. + * Slower than the non-strict + * variant. + * @dst: Non-NULL pointer to dst dump buffer. + * @src: Non-NULL pointer to src dump buffer. + * @dst_enable_map: Non-NULL pointer to enable map specifying enabled values. + * + * The dst, src, and dst_enable_map MUST have been created from the same + * metadata. 
+ */ +void kbase_hwcnt_dump_buffer_accumulate_strict( + struct kbase_hwcnt_dump_buffer *dst, + const struct kbase_hwcnt_dump_buffer *src, + const struct kbase_hwcnt_enable_map *dst_enable_map); + +/** + * kbase_hwcnt_dump_buffer_block_accumulate_strict() - Copy all enabled block + * headers and accumulate + * all block counters from + * src to dst. + * After the operation, all + * non-enabled values will + * be zero. + * @dst_blk: Non-NULL pointer to dst block obtained from a call to + * kbase_hwcnt_dump_buffer_block_instance. + * @src_blk: Non-NULL pointer to src block obtained from a call to + * kbase_hwcnt_dump_buffer_block_instance. + * @blk_em: Non-NULL pointer to the block bitfield(s) obtained from a call to + * kbase_hwcnt_enable_map_block_instance. + * @hdr_cnt: Number of headers in the block. + * @ctr_cnt: Number of counters in the block. + */ +static inline void kbase_hwcnt_dump_buffer_block_accumulate_strict( + u32 *dst_blk, + const u32 *src_blk, + const u64 *blk_em, + size_t hdr_cnt, + size_t ctr_cnt) +{ + size_t ctr; + + kbase_hwcnt_dump_buffer_block_copy_strict( + dst_blk, src_blk, blk_em, hdr_cnt); + + for (ctr = hdr_cnt; ctr < ctr_cnt + hdr_cnt; ctr++) { + bool ctr_enabled = kbase_hwcnt_enable_map_block_value_enabled( + blk_em, ctr); + + u32 *dst_ctr = dst_blk + ctr; + const u32 *src_ctr = src_blk + ctr; + + const u32 src_counter = *src_ctr; + const u32 dst_counter = *dst_ctr; + + /* Saturating add */ + u32 accumulated = src_counter + dst_counter; + + if (accumulated < src_counter) + accumulated = U32_MAX; + + *dst_ctr = ctr_enabled ? accumulated : 0; + } +} + +#endif /* _KBASE_HWCNT_TYPES_H_ */ diff --git a/mali_kbase/mali_kbase_hwcnt_virtualizer.c b/mali_kbase/mali_kbase_hwcnt_virtualizer.c new file mode 100644 index 0000000..26e9852 --- /dev/null +++ b/mali_kbase/mali_kbase_hwcnt_virtualizer.c @@ -0,0 +1,688 @@ +/* + * + * (C) COPYRIGHT 2018 ARM Limited. All rights reserved. + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU licence. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * SPDX-License-Identifier: GPL-2.0 + * + */ + +#include "mali_kbase_hwcnt_virtualizer.h" +#include "mali_kbase_hwcnt_accumulator.h" +#include "mali_kbase_hwcnt_context.h" +#include "mali_kbase_hwcnt_types.h" +#include "mali_malisw.h" +#include "mali_kbase_debug.h" +#include "mali_kbase_linux.h" + +#include <linux/mutex.h> +#include <linux/slab.h> + +/** + * struct kbase_hwcnt_virtualizer - Hardware counter virtualizer structure. + * @hctx: Hardware counter context being virtualized. + * @metadata: Hardware counter metadata. + * @lock: Lock acquired at all entrypoints, to protect mutable state. + * @client_count: Current number of virtualizer clients. + * @clients: List of virtualizer clients. + * @accum: Hardware counter accumulator. NULL if no clients. + * @scratch_map: Enable map used as scratch space during counter changes. + * @scratch_buf: Dump buffer used as scratch space during dumps. 
+ */ +struct kbase_hwcnt_virtualizer { + struct kbase_hwcnt_context *hctx; + const struct kbase_hwcnt_metadata *metadata; + struct mutex lock; + size_t client_count; + struct list_head clients; + struct kbase_hwcnt_accumulator *accum; + struct kbase_hwcnt_enable_map scratch_map; + struct kbase_hwcnt_dump_buffer scratch_buf; +}; + +/** + * struct kbase_hwcnt_virtualizer_client - Virtualizer client structure. + * @node: List node used for virtualizer client list. + * @hvirt: Hardware counter virtualizer. + * @enable_map: Enable map with client's current enabled counters. + * @accum_buf: Dump buffer with client's current accumulated counters. + * @has_accum: True if accum_buf contains any accumulated counters. + * @ts_start_ns: Counter collection start time of current dump. + */ +struct kbase_hwcnt_virtualizer_client { + struct list_head node; + struct kbase_hwcnt_virtualizer *hvirt; + struct kbase_hwcnt_enable_map enable_map; + struct kbase_hwcnt_dump_buffer accum_buf; + bool has_accum; + u64 ts_start_ns; +}; + +const struct kbase_hwcnt_metadata *kbase_hwcnt_virtualizer_metadata( + struct kbase_hwcnt_virtualizer *hvirt) +{ + if (!hvirt) + return NULL; + + return hvirt->metadata; +} +KBASE_EXPORT_TEST_API(kbase_hwcnt_virtualizer_metadata); + +/** + * kbasep_hwcnt_virtualizer_client_free - Free a virtualizer client's memory. + * @hvcli: Pointer to virtualizer client. + * + * Will safely free a client in any partial state of construction. + */ +static void kbasep_hwcnt_virtualizer_client_free( + struct kbase_hwcnt_virtualizer_client *hvcli) +{ + if (!hvcli) + return; + + kbase_hwcnt_dump_buffer_free(&hvcli->accum_buf); + kbase_hwcnt_enable_map_free(&hvcli->enable_map); + kfree(hvcli); +} + +/** + * kbasep_hwcnt_virtualizer_client_alloc - Allocate memory for a virtualizer + * client. + * @metadata: Non-NULL pointer to counter metadata. + * @out_hvcli: Non-NULL pointer to where created client will be stored on + * success. + * + * Return: 0 on success, else error code. + */ +static int kbasep_hwcnt_virtualizer_client_alloc( + const struct kbase_hwcnt_metadata *metadata, + struct kbase_hwcnt_virtualizer_client **out_hvcli) +{ + int errcode; + struct kbase_hwcnt_virtualizer_client *hvcli = NULL; + + WARN_ON(!metadata); + WARN_ON(!out_hvcli); + + hvcli = kzalloc(sizeof(*hvcli), GFP_KERNEL); + if (!hvcli) + return -ENOMEM; + + errcode = kbase_hwcnt_enable_map_alloc(metadata, &hvcli->enable_map); + if (errcode) + goto error; + + errcode = kbase_hwcnt_dump_buffer_alloc(metadata, &hvcli->accum_buf); + if (errcode) + goto error; + + *out_hvcli = hvcli; + return 0; +error: + kbasep_hwcnt_virtualizer_client_free(hvcli); + return errcode; +} + +/** + * kbasep_hwcnt_virtualizer_client_accumulate - Accumulate a dump buffer into a + * client's accumulation buffer. + * @hvcli: Non-NULL pointer to virtualizer client. + * @dump_buf: Non-NULL pointer to dump buffer to accumulate from. 
+ */ +static void kbasep_hwcnt_virtualizer_client_accumulate( + struct kbase_hwcnt_virtualizer_client *hvcli, + const struct kbase_hwcnt_dump_buffer *dump_buf) +{ + WARN_ON(!hvcli); + WARN_ON(!dump_buf); + lockdep_assert_held(&hvcli->hvirt->lock); + + if (hvcli->has_accum) { + /* If already some accumulation, accumulate */ + kbase_hwcnt_dump_buffer_accumulate( + &hvcli->accum_buf, dump_buf, &hvcli->enable_map); + } else { + /* If no accumulation, copy */ + kbase_hwcnt_dump_buffer_copy( + &hvcli->accum_buf, dump_buf, &hvcli->enable_map); + } + hvcli->has_accum = true; +} + +/** + * kbasep_hwcnt_virtualizer_accumulator_term - Terminate the hardware counter + * accumulator after final client + * removal. + * @hvirt: Non-NULL pointer to the hardware counter virtualizer. + * + * Will safely terminate the accumulator in any partial state of initialisation. + */ +static void kbasep_hwcnt_virtualizer_accumulator_term( + struct kbase_hwcnt_virtualizer *hvirt) +{ + WARN_ON(!hvirt); + lockdep_assert_held(&hvirt->lock); + WARN_ON(hvirt->client_count); + + kbase_hwcnt_dump_buffer_free(&hvirt->scratch_buf); + kbase_hwcnt_enable_map_free(&hvirt->scratch_map); + kbase_hwcnt_accumulator_release(hvirt->accum); + hvirt->accum = NULL; +} + +/** + * kbasep_hwcnt_virtualizer_accumulator_init - Initialise the hardware counter + * accumulator before first client + * addition. + * @hvirt: Non-NULL pointer to the hardware counter virtualizer. + * + * Return: 0 on success, else error code. + */ +static int kbasep_hwcnt_virtualizer_accumulator_init( + struct kbase_hwcnt_virtualizer *hvirt) +{ + int errcode; + + WARN_ON(!hvirt); + lockdep_assert_held(&hvirt->lock); + WARN_ON(hvirt->client_count); + WARN_ON(hvirt->accum); + + errcode = kbase_hwcnt_accumulator_acquire( + hvirt->hctx, &hvirt->accum); + if (errcode) + goto error; + + errcode = kbase_hwcnt_enable_map_alloc( + hvirt->metadata, &hvirt->scratch_map); + if (errcode) + goto error; + + errcode = kbase_hwcnt_dump_buffer_alloc( + hvirt->metadata, &hvirt->scratch_buf); + if (errcode) + goto error; + + return 0; +error: + kbasep_hwcnt_virtualizer_accumulator_term(hvirt); + return errcode; +} + +/** + * kbasep_hwcnt_virtualizer_client_add - Add a newly allocated client to the + * virtualizer. + * @hvirt: Non-NULL pointer to the hardware counter virtualizer. + * @hvcli: Non-NULL pointer to the virtualizer client to add. + * @enable_map: Non-NULL pointer to client's initial enable map. + * + * Return: 0 on success, else error code. 
+ */ +static int kbasep_hwcnt_virtualizer_client_add( + struct kbase_hwcnt_virtualizer *hvirt, + struct kbase_hwcnt_virtualizer_client *hvcli, + const struct kbase_hwcnt_enable_map *enable_map) +{ + int errcode = 0; + u64 ts_start_ns; + u64 ts_end_ns; + + WARN_ON(!hvirt); + WARN_ON(!hvcli); + WARN_ON(!enable_map); + lockdep_assert_held(&hvirt->lock); + + if (hvirt->client_count == 0) + /* First client added, so initialise the accumulator */ + errcode = kbasep_hwcnt_virtualizer_accumulator_init(hvirt); + if (errcode) + return errcode; + + hvirt->client_count += 1; + + if (hvirt->client_count == 1) { + /* First client, so just pass the enable map onwards as is */ + errcode = kbase_hwcnt_accumulator_set_counters(hvirt->accum, + enable_map, &ts_start_ns, &ts_end_ns, NULL); + } else { + struct kbase_hwcnt_virtualizer_client *pos; + + /* Make the scratch enable map the union of all enable maps */ + kbase_hwcnt_enable_map_copy( + &hvirt->scratch_map, enable_map); + list_for_each_entry(pos, &hvirt->clients, node) + kbase_hwcnt_enable_map_union( + &hvirt->scratch_map, &pos->enable_map); + + /* Set the counters with the new union enable map */ + errcode = kbase_hwcnt_accumulator_set_counters(hvirt->accum, + &hvirt->scratch_map, + &ts_start_ns, &ts_end_ns, + &hvirt->scratch_buf); + /* Accumulate into only existing clients' accumulation bufs */ + if (!errcode) + list_for_each_entry(pos, &hvirt->clients, node) + kbasep_hwcnt_virtualizer_client_accumulate( + pos, &hvirt->scratch_buf); + } + if (errcode) + goto error; + + list_add(&hvcli->node, &hvirt->clients); + hvcli->hvirt = hvirt; + kbase_hwcnt_enable_map_copy(&hvcli->enable_map, enable_map); + hvcli->has_accum = false; + hvcli->ts_start_ns = ts_end_ns; + + return 0; +error: + hvirt->client_count -= 1; + if (hvirt->client_count == 0) + kbasep_hwcnt_virtualizer_accumulator_term(hvirt); + return errcode; +} + +/** + * kbasep_hwcnt_virtualizer_client_remove - Remove a client from the + * virtualizer. + * @hvirt: Non-NULL pointer to the hardware counter virtualizer. + * @hvcli: Non-NULL pointer to the virtualizer client to remove. + */ +static void kbasep_hwcnt_virtualizer_client_remove( + struct kbase_hwcnt_virtualizer *hvirt, + struct kbase_hwcnt_virtualizer_client *hvcli) +{ + int errcode = 0; + u64 ts_start_ns; + u64 ts_end_ns; + + WARN_ON(!hvirt); + WARN_ON(!hvcli); + lockdep_assert_held(&hvirt->lock); + + list_del(&hvcli->node); + hvirt->client_count -= 1; + + if (hvirt->client_count == 0) { + /* Last client removed, so terminate the accumulator */ + kbasep_hwcnt_virtualizer_accumulator_term(hvirt); + } else { + struct kbase_hwcnt_virtualizer_client *pos; + /* Make the scratch enable map the union of all enable maps */ + kbase_hwcnt_enable_map_disable_all(&hvirt->scratch_map); + list_for_each_entry(pos, &hvirt->clients, node) + kbase_hwcnt_enable_map_union( + &hvirt->scratch_map, &pos->enable_map); + /* Set the counters with the new union enable map */ + errcode = kbase_hwcnt_accumulator_set_counters(hvirt->accum, + &hvirt->scratch_map, + &ts_start_ns, &ts_end_ns, + &hvirt->scratch_buf); + /* Accumulate into remaining clients' accumulation bufs */ + if (!errcode) + list_for_each_entry(pos, &hvirt->clients, node) + kbasep_hwcnt_virtualizer_client_accumulate( + pos, &hvirt->scratch_buf); + } + WARN_ON(errcode); +} + +/** + * kbasep_hwcnt_virtualizer_client_set_counters - Perform a dump of the client's + * currently enabled counters, + * and enable a new set of + * counters that will be used for + * subsequent dumps. 
+ * @hvirt: Non-NULL pointer to the hardware counter virtualizer. + * @hvcli: Non-NULL pointer to the virtualizer client. + * @enable_map: Non-NULL pointer to the new counter enable map for the client. + * Must have the same metadata as the virtualizer. + * @ts_start_ns: Non-NULL pointer where the start timestamp of the dump will + * be written out to on success. + * @ts_end_ns: Non-NULL pointer where the end timestamp of the dump will + * be written out to on success. + * @dump_buf: Pointer to the buffer where the dump will be written out to on + * success. If non-NULL, must have the same metadata as the + * accumulator. If NULL, the dump will be discarded. + * + * Return: 0 on success or error code. + */ +static int kbasep_hwcnt_virtualizer_client_set_counters( + struct kbase_hwcnt_virtualizer *hvirt, + struct kbase_hwcnt_virtualizer_client *hvcli, + const struct kbase_hwcnt_enable_map *enable_map, + u64 *ts_start_ns, + u64 *ts_end_ns, + struct kbase_hwcnt_dump_buffer *dump_buf) +{ + int errcode; + struct kbase_hwcnt_virtualizer_client *pos; + + WARN_ON(!hvirt); + WARN_ON(!hvcli); + WARN_ON(!enable_map); + WARN_ON(!ts_start_ns); + WARN_ON(!ts_end_ns); + WARN_ON(enable_map->metadata != hvirt->metadata); + WARN_ON(dump_buf && (dump_buf->metadata != hvirt->metadata)); + lockdep_assert_held(&hvirt->lock); + + /* Make the scratch enable map the union of all enable maps */ + kbase_hwcnt_enable_map_copy(&hvirt->scratch_map, enable_map); + list_for_each_entry(pos, &hvirt->clients, node) + /* Ignore the enable map of the selected client */ + if (pos != hvcli) + kbase_hwcnt_enable_map_union( + &hvirt->scratch_map, &pos->enable_map); + + /* Set the counters with the new union enable map */ + errcode = kbase_hwcnt_accumulator_set_counters(hvirt->accum, + &hvirt->scratch_map, ts_start_ns, ts_end_ns, + &hvirt->scratch_buf); + if (errcode) + return errcode; + + /* Accumulate into all accumulation bufs except the selected client's */ + list_for_each_entry(pos, &hvirt->clients, node) + if (pos != hvcli) + kbasep_hwcnt_virtualizer_client_accumulate( + pos, &hvirt->scratch_buf); + + /* Finally, write into the dump buf */ + if (dump_buf) { + const struct kbase_hwcnt_dump_buffer *src = &hvirt->scratch_buf; + + if (hvcli->has_accum) { + kbase_hwcnt_dump_buffer_accumulate( + &hvcli->accum_buf, src, &hvcli->enable_map); + src = &hvcli->accum_buf; + } + kbase_hwcnt_dump_buffer_copy(dump_buf, src, &hvcli->enable_map); + } + hvcli->has_accum = false; + + /* Update the selected client's enable map */ + kbase_hwcnt_enable_map_copy(&hvcli->enable_map, enable_map); + + /* Fix up the timestamps */ + *ts_start_ns = hvcli->ts_start_ns; + hvcli->ts_start_ns = *ts_end_ns; + + return errcode; +} + +int kbase_hwcnt_virtualizer_client_set_counters( + struct kbase_hwcnt_virtualizer_client *hvcli, + const struct kbase_hwcnt_enable_map *enable_map, + u64 *ts_start_ns, + u64 *ts_end_ns, + struct kbase_hwcnt_dump_buffer *dump_buf) +{ + int errcode; + struct kbase_hwcnt_virtualizer *hvirt; + + if (!hvcli || !enable_map || !ts_start_ns || !ts_end_ns) + return -EINVAL; + + hvirt = hvcli->hvirt; + + if ((enable_map->metadata != hvirt->metadata) || + (dump_buf && (dump_buf->metadata != hvirt->metadata))) + return -EINVAL; + + mutex_lock(&hvirt->lock); + + if ((hvirt->client_count == 1) && (!hvcli->has_accum)) { + /* + * If there's only one client with no prior accumulation, we can + * completely skip the virtualize and just pass through the call + * to the accumulator, saving a fair few copies and + * accumulations. 
+ */ + errcode = kbase_hwcnt_accumulator_set_counters( + hvirt->accum, enable_map, + ts_start_ns, ts_end_ns, dump_buf); + + if (!errcode) { + /* Update the selected client's enable map */ + kbase_hwcnt_enable_map_copy( + &hvcli->enable_map, enable_map); + + /* Fix up the timestamps */ + *ts_start_ns = hvcli->ts_start_ns; + hvcli->ts_start_ns = *ts_end_ns; + } + } else { + /* Otherwise, do the full virtualize */ + errcode = kbasep_hwcnt_virtualizer_client_set_counters( + hvirt, hvcli, enable_map, + ts_start_ns, ts_end_ns, dump_buf); + } + + mutex_unlock(&hvirt->lock); + + return errcode; +} +KBASE_EXPORT_TEST_API(kbase_hwcnt_virtualizer_client_set_counters); + +/** + * kbasep_hwcnt_virtualizer_client_dump - Perform a dump of the client's + * currently enabled counters. + * @hvirt: Non-NULL pointer to the hardware counter virtualizer. + * @hvcli: Non-NULL pointer to the virtualizer client. + * @ts_start_ns: Non-NULL pointer where the start timestamp of the dump will + * be written out to on success. + * @ts_end_ns: Non-NULL pointer where the end timestamp of the dump will + * be written out to on success. + * @dump_buf: Pointer to the buffer where the dump will be written out to on + * success. If non-NULL, must have the same metadata as the + * accumulator. If NULL, the dump will be discarded. + * + * Return: 0 on success or error code. + */ +static int kbasep_hwcnt_virtualizer_client_dump( + struct kbase_hwcnt_virtualizer *hvirt, + struct kbase_hwcnt_virtualizer_client *hvcli, + u64 *ts_start_ns, + u64 *ts_end_ns, + struct kbase_hwcnt_dump_buffer *dump_buf) +{ + int errcode; + struct kbase_hwcnt_virtualizer_client *pos; + + WARN_ON(!hvirt); + WARN_ON(!hvcli); + WARN_ON(!ts_start_ns); + WARN_ON(!ts_end_ns); + WARN_ON(dump_buf && (dump_buf->metadata != hvirt->metadata)); + lockdep_assert_held(&hvirt->lock); + + /* Perform the dump */ + errcode = kbase_hwcnt_accumulator_dump(hvirt->accum, + ts_start_ns, ts_end_ns, &hvirt->scratch_buf); + if (errcode) + return errcode; + + /* Accumulate into all accumulation bufs except the selected client's */ + list_for_each_entry(pos, &hvirt->clients, node) + if (pos != hvcli) + kbasep_hwcnt_virtualizer_client_accumulate( + pos, &hvirt->scratch_buf); + + /* Finally, write into the dump buf */ + if (dump_buf) { + const struct kbase_hwcnt_dump_buffer *src = &hvirt->scratch_buf; + + if (hvcli->has_accum) { + kbase_hwcnt_dump_buffer_accumulate( + &hvcli->accum_buf, src, &hvcli->enable_map); + src = &hvcli->accum_buf; + } + kbase_hwcnt_dump_buffer_copy(dump_buf, src, &hvcli->enable_map); + } + hvcli->has_accum = false; + + /* Fix up the timestamps */ + *ts_start_ns = hvcli->ts_start_ns; + hvcli->ts_start_ns = *ts_end_ns; + + return errcode; +} + +int kbase_hwcnt_virtualizer_client_dump( + struct kbase_hwcnt_virtualizer_client *hvcli, + u64 *ts_start_ns, + u64 *ts_end_ns, + struct kbase_hwcnt_dump_buffer *dump_buf) +{ + int errcode; + struct kbase_hwcnt_virtualizer *hvirt; + + if (!hvcli || !ts_start_ns || !ts_end_ns) + return -EINVAL; + + hvirt = hvcli->hvirt; + + if (dump_buf && (dump_buf->metadata != hvirt->metadata)) + return -EINVAL; + + mutex_lock(&hvirt->lock); + + if ((hvirt->client_count == 1) && (!hvcli->has_accum)) { + /* + * If there's only one client with no prior accumulation, we can + * completely skip the virtualize and just pass through the call + * to the accumulator, saving a fair few copies and + * accumulations. 
+ */ + errcode = kbase_hwcnt_accumulator_dump( + hvirt->accum, ts_start_ns, ts_end_ns, dump_buf); + + if (!errcode) { + /* Fix up the timestamps */ + *ts_start_ns = hvcli->ts_start_ns; + hvcli->ts_start_ns = *ts_end_ns; + } + } else { + /* Otherwise, do the full virtualize */ + errcode = kbasep_hwcnt_virtualizer_client_dump( + hvirt, hvcli, ts_start_ns, ts_end_ns, dump_buf); + } + + mutex_unlock(&hvirt->lock); + + return errcode; +} +KBASE_EXPORT_TEST_API(kbase_hwcnt_virtualizer_client_dump); + +int kbase_hwcnt_virtualizer_client_create( + struct kbase_hwcnt_virtualizer *hvirt, + const struct kbase_hwcnt_enable_map *enable_map, + struct kbase_hwcnt_virtualizer_client **out_hvcli) +{ + int errcode; + struct kbase_hwcnt_virtualizer_client *hvcli; + + if (!hvirt || !enable_map || !out_hvcli || + (enable_map->metadata != hvirt->metadata)) + return -EINVAL; + + errcode = kbasep_hwcnt_virtualizer_client_alloc( + hvirt->metadata, &hvcli); + if (errcode) + return errcode; + + mutex_lock(&hvirt->lock); + + errcode = kbasep_hwcnt_virtualizer_client_add(hvirt, hvcli, enable_map); + + mutex_unlock(&hvirt->lock); + + if (errcode) { + kbasep_hwcnt_virtualizer_client_free(hvcli); + return errcode; + } + + *out_hvcli = hvcli; + return 0; +} +KBASE_EXPORT_TEST_API(kbase_hwcnt_virtualizer_client_create); + +void kbase_hwcnt_virtualizer_client_destroy( + struct kbase_hwcnt_virtualizer_client *hvcli) +{ + if (!hvcli) + return; + + mutex_lock(&hvcli->hvirt->lock); + + kbasep_hwcnt_virtualizer_client_remove(hvcli->hvirt, hvcli); + + mutex_unlock(&hvcli->hvirt->lock); + + kbasep_hwcnt_virtualizer_client_free(hvcli); +} +KBASE_EXPORT_TEST_API(kbase_hwcnt_virtualizer_client_destroy); + +int kbase_hwcnt_virtualizer_init( + struct kbase_hwcnt_context *hctx, + struct kbase_hwcnt_virtualizer **out_hvirt) +{ + struct kbase_hwcnt_virtualizer *virt; + const struct kbase_hwcnt_metadata *metadata; + + if (!hctx || !out_hvirt) + return -EINVAL; + + metadata = kbase_hwcnt_context_metadata(hctx); + if (!metadata) + return -EINVAL; + + virt = kzalloc(sizeof(*virt), GFP_KERNEL); + if (!virt) + return -ENOMEM; + + virt->hctx = hctx; + virt->metadata = metadata; + + mutex_init(&virt->lock); + INIT_LIST_HEAD(&virt->clients); + + *out_hvirt = virt; + return 0; +} +KBASE_EXPORT_TEST_API(kbase_hwcnt_virtualizer_init); + +void kbase_hwcnt_virtualizer_term( + struct kbase_hwcnt_virtualizer *hvirt) +{ + if (!hvirt) + return; + + /* Non-zero client count implies client leak */ + if (WARN_ON(hvirt->client_count != 0)) { + struct kbase_hwcnt_virtualizer_client *pos, *n; + + list_for_each_entry_safe(pos, n, &hvirt->clients, node) + kbase_hwcnt_virtualizer_client_destroy(pos); + } + + WARN_ON(hvirt->client_count != 0); + WARN_ON(hvirt->accum); + + kfree(hvirt); +} +KBASE_EXPORT_TEST_API(kbase_hwcnt_virtualizer_term); diff --git a/mali_kbase/mali_kbase_hwcnt_virtualizer.h b/mali_kbase/mali_kbase_hwcnt_virtualizer.h new file mode 100644 index 0000000..1efa81d --- /dev/null +++ b/mali_kbase/mali_kbase_hwcnt_virtualizer.h @@ -0,0 +1,139 @@ +/* + * + * (C) COPYRIGHT 2018 ARM Limited. All rights reserved. + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU licence. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * SPDX-License-Identifier: GPL-2.0 + * + */ + +/** + * Hardware counter virtualizer API. + * + * Virtualizes a hardware counter context, so multiple clients can access + * a single hardware counter resource as though each was the exclusive user. + */ + +#ifndef _KBASE_HWCNT_VIRTUALIZER_H_ +#define _KBASE_HWCNT_VIRTUALIZER_H_ + +#include <linux/types.h> + +struct kbase_hwcnt_context; +struct kbase_hwcnt_virtualizer; +struct kbase_hwcnt_virtualizer_client; +struct kbase_hwcnt_enable_map; +struct kbase_hwcnt_dump_buffer; + +/** + * kbase_hwcnt_virtualizer_init - Initialise a hardware counter virtualizer. + * @hctx: Non-NULL pointer to the hardware counter context to virtualize. + * @out_hvirt: Non-NULL pointer to where the pointer to the created virtualizer + * will be stored on success. + * + * Return: 0 on success, else error code. + */ +int kbase_hwcnt_virtualizer_init( + struct kbase_hwcnt_context *hctx, + struct kbase_hwcnt_virtualizer **out_hvirt); + +/** + * kbase_hwcnt_virtualizer_term - Terminate a hardware counter virtualizer. + * @hvirt: Pointer to virtualizer to be terminated. + */ +void kbase_hwcnt_virtualizer_term( + struct kbase_hwcnt_virtualizer *hvirt); + +/** + * kbase_hwcnt_virtualizer_metadata - Get the hardware counter metadata used by + * the virtualizer, so related counter data + * structures can be created. + * @hvirt: Non-NULL pointer to the hardware counter virtualizer. + * + * Return: Non-NULL pointer to metadata, or NULL on error. + */ +const struct kbase_hwcnt_metadata *kbase_hwcnt_virtualizer_metadata( + struct kbase_hwcnt_virtualizer *hvirt); + +/** + * kbase_hwcnt_virtualizer_client_create - Create a new virtualizer client. + * @hvirt: Non-NULL pointer to the hardware counter virtualizer. + * @enable_map: Non-NULL pointer to the enable map for the client. Must have the + * same metadata as the virtualizer. + * @out_hvcli: Non-NULL pointer to where the pointer to the created client will + * be stored on success. + * + * Return: 0 on success, else error code. + */ +int kbase_hwcnt_virtualizer_client_create( + struct kbase_hwcnt_virtualizer *hvirt, + const struct kbase_hwcnt_enable_map *enable_map, + struct kbase_hwcnt_virtualizer_client **out_hvcli); + +/** + * kbase_hwcnt_virtualizer_client_destroy() - Destroy a virtualizer client. + * @hvcli: Pointer to the hardware counter client. + */ +void kbase_hwcnt_virtualizer_client_destroy( + struct kbase_hwcnt_virtualizer_client *hvcli); + +/** + * kbase_hwcnt_virtualizer_client_set_counters - Perform a dump of the client's + * currently enabled counters, and + * enable a new set of counters + * that will be used for + * subsequent dumps. + * @hvcli: Non-NULL pointer to the virtualizer client. + * @enable_map: Non-NULL pointer to the new counter enable map for the client. + * Must have the same metadata as the virtualizer. + * @ts_start_ns: Non-NULL pointer where the start timestamp of the dump will + * be written out to on success. + * @ts_end_ns: Non-NULL pointer where the end timestamp of the dump will + * be written out to on success. 
+ * @dump_buf: Pointer to the buffer where the dump will be written out to on + * success. If non-NULL, must have the same metadata as the + * accumulator. If NULL, the dump will be discarded. + * + * Return: 0 on success or error code. + */ +int kbase_hwcnt_virtualizer_client_set_counters( + struct kbase_hwcnt_virtualizer_client *hvcli, + const struct kbase_hwcnt_enable_map *enable_map, + u64 *ts_start_ns, + u64 *ts_end_ns, + struct kbase_hwcnt_dump_buffer *dump_buf); + +/** + * kbase_hwcnt_virtualizer_client_dump - Perform a dump of the client's + * currently enabled counters. + * @hvcli: Non-NULL pointer to the virtualizer client. + * @ts_start_ns: Non-NULL pointer where the start timestamp of the dump will + * be written out to on success. + * @ts_end_ns: Non-NULL pointer where the end timestamp of the dump will + * be written out to on success. + * @dump_buf: Pointer to the buffer where the dump will be written out to on + * success. If non-NULL, must have the same metadata as the + * accumulator. If NULL, the dump will be discarded. + * + * Return: 0 on success or error code. + */ +int kbase_hwcnt_virtualizer_client_dump( + struct kbase_hwcnt_virtualizer_client *hvcli, + u64 *ts_start_ns, + u64 *ts_end_ns, + struct kbase_hwcnt_dump_buffer *dump_buf); + +#endif /* _KBASE_HWCNT_VIRTUALIZER_H_ */ diff --git a/mali_kbase/mali_kbase_ioctl.h b/mali_kbase/mali_kbase_ioctl.h index ffc30d8..ccf67df 100644 --- a/mali_kbase/mali_kbase_ioctl.h +++ b/mali_kbase/mali_kbase_ioctl.h @@ -64,9 +64,11 @@ extern "C" { * - Added BASE_MEM_GPU_VA_SAME_4GB_PAGE under base_mem_alloc_flags * 11.12: * - Removed ioctl: KBASE_IOCTL_GET_PROFILING_CONTROLS + * 11.13: + * - New ioctl: KBASE_IOCTL_MEM_EXEC_INIT */ #define BASE_UK_VERSION_MAJOR 11 -#define BASE_UK_VERSION_MINOR 12 +#define BASE_UK_VERSION_MINOR 13 /** * struct kbase_ioctl_version_check - Check version compatibility with kernel @@ -673,6 +675,19 @@ union kbase_ioctl_cinstr_gwt_dump { _IOWR(KBASE_IOCTL_TYPE, 35, union kbase_ioctl_cinstr_gwt_dump) +/** + * struct kbase_ioctl_mem_exec_init - Initialise the EXEC_VA memory zone + * + * @va_pages: Number of VA pages to reserve for EXEC_VA + */ +struct kbase_ioctl_mem_exec_init { + __u64 va_pages; +}; + +#define KBASE_IOCTL_MEM_EXEC_INIT \ + _IOW(KBASE_IOCTL_TYPE, 38, struct kbase_ioctl_mem_exec_init) + + /*************** * test ioctls * ***************/ @@ -747,6 +762,21 @@ union kbase_ioctl_cs_event_memory_read { #endif +/* Customer extension range */ +#define KBASE_IOCTL_EXTRA_TYPE (KBASE_IOCTL_TYPE + 2) + +/* If the integration needs extra ioctl add them there + * like this: + * + * struct my_ioctl_args { + * .... 
+ * } + * + * #define KBASE_IOCTL_MY_IOCTL \ + * _IOWR(KBASE_IOCTL_EXTRA_TYPE, 0, struct my_ioctl_args) + */ + + /********************************** * Definitions for GPU properties * **********************************/ diff --git a/mali_kbase/mali_kbase_jd.c b/mali_kbase/mali_kbase_jd.c index 29cf193..97d7b43 100644 --- a/mali_kbase/mali_kbase_jd.c +++ b/mali_kbase/mali_kbase_jd.c @@ -804,7 +804,6 @@ bool jd_submit_atom(struct kbase_context *kctx, const struct base_jd_atom_v2 *us katom->extres = NULL; katom->device_nr = user_atom->device_nr; katom->jc = user_atom->jc; - katom->coreref_state = KBASE_ATOM_COREREF_STATE_NO_CORES_REQUESTED; katom->core_req = user_atom->core_req; katom->atom_flags = 0; katom->retry_count = 0; @@ -1219,7 +1218,6 @@ void kbase_jd_done_worker(struct work_struct *data) struct kbasep_js_atom_retained_state katom_retained_state; bool context_idle; base_jd_core_req core_req = katom->core_req; - enum kbase_atom_coreref_state coreref_state = katom->coreref_state; /* Soft jobs should never reach this function */ KBASE_DEBUG_ASSERT((katom->core_req & BASE_JD_REQ_SOFT_JOB) == 0); @@ -1365,7 +1363,7 @@ void kbase_jd_done_worker(struct work_struct *data) mutex_unlock(&jctx->lock); } - kbase_backend_complete_wq_post_sched(kbdev, core_req, coreref_state); + kbase_backend_complete_wq_post_sched(kbdev, core_req); if (context_idle) kbase_pm_context_idle(kbdev); diff --git a/mali_kbase/mali_kbase_jd_debugfs.c b/mali_kbase/mali_kbase_jd_debugfs.c index 271daef..7b15d8a 100644 --- a/mali_kbase/mali_kbase_jd_debugfs.c +++ b/mali_kbase/mali_kbase_jd_debugfs.c @@ -1,6 +1,6 @@ /* * - * (C) COPYRIGHT 2014-2017 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2014-2018 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -190,9 +190,8 @@ static int kbasep_jd_debugfs_atoms_show(struct seq_file *sfile, void *data) kbasep_jd_debugfs_atom_deps(deps, atom); seq_printf(sfile, - "%3u, %8x, %2u, %2u, %c%3u %c%3u, %20lld, ", + "%3u, %8x, %2u, %c%3u %c%3u, %20lld, ", i, atom->core_req, atom->status, - atom->coreref_state, deps[0].type, deps[0].id, deps[1].type, deps[1].id, start_timestamp); diff --git a/mali_kbase/mali_kbase_jd_debugfs.h b/mali_kbase/mali_kbase_jd_debugfs.h index ce0cb61..697bdef 100644 --- a/mali_kbase/mali_kbase_jd_debugfs.h +++ b/mali_kbase/mali_kbase_jd_debugfs.h @@ -1,6 +1,6 @@ /* * - * (C) COPYRIGHT 2014-2017 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2014-2018 ARM Limited. All rights reserved. 
* * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -30,7 +30,7 @@ #include <linux/debugfs.h> -#define MALI_JD_DEBUGFS_VERSION 2 +#define MALI_JD_DEBUGFS_VERSION 3 /* Forward declarations */ struct kbase_context; diff --git a/mali_kbase/mali_kbase_js.c b/mali_kbase/mali_kbase_js.c index 58a1b4b..80b6d77 100644 --- a/mali_kbase/mali_kbase_js.c +++ b/mali_kbase/mali_kbase_js.c @@ -2259,7 +2259,6 @@ static void js_return_worker(struct work_struct *data) bool context_idle = false; unsigned long flags; base_jd_core_req core_req = katom->core_req; - enum kbase_atom_coreref_state coreref_state = katom->coreref_state; KBASE_TLSTREAM_TL_EVENT_ATOM_SOFTSTOP_EX(katom); @@ -2349,7 +2348,7 @@ static void js_return_worker(struct work_struct *data) kbase_js_sched_all(kbdev); - kbase_backend_complete_wq_post_sched(kbdev, core_req, coreref_state); + kbase_backend_complete_wq_post_sched(kbdev, core_req); } void kbase_js_unpull(struct kbase_context *kctx, struct kbase_jd_atom *katom) diff --git a/mali_kbase/mali_kbase_js_ctx_attr.c b/mali_kbase/mali_kbase_js_ctx_attr.c index 6fd908a..1ff230c 100644 --- a/mali_kbase/mali_kbase_js_ctx_attr.c +++ b/mali_kbase/mali_kbase_js_ctx_attr.c @@ -1,6 +1,6 @@ /* * - * (C) COPYRIGHT 2012-2016 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2012-2016, 2018 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -198,29 +198,6 @@ static bool kbasep_js_ctx_attr_ctx_release_attr(struct kbase_device *kbdev, stru * More commonly used public functions */ -void kbasep_js_ctx_attr_set_initial_attrs(struct kbase_device *kbdev, struct kbase_context *kctx) -{ - bool runpool_state_changed = false; - - KBASE_DEBUG_ASSERT(kbdev != NULL); - KBASE_DEBUG_ASSERT(kctx != NULL); - - if (kbase_ctx_flag(kctx, KCTX_SUBMIT_DISABLED)) { - /* This context never submits, so don't track any scheduling attributes */ - return; - } - - /* Transfer attributes held in the context flags for contexts that have submit enabled */ - - /* ... More attributes can be added here ... */ - - /* The context should not have been scheduled yet, so ASSERT if this caused - * runpool state changes (note that other threads *can't* affect the value - * of runpool_state_changed, due to how it's calculated) */ - KBASE_DEBUG_ASSERT(runpool_state_changed == false); - CSTD_UNUSED(runpool_state_changed); -} - void kbasep_js_ctx_attr_runpool_retain_ctx(struct kbase_device *kbdev, struct kbase_context *kctx) { bool runpool_state_changed; diff --git a/mali_kbase/mali_kbase_js_ctx_attr.h b/mali_kbase/mali_kbase_js_ctx_attr.h index be781e6..25fd397 100644 --- a/mali_kbase/mali_kbase_js_ctx_attr.h +++ b/mali_kbase/mali_kbase_js_ctx_attr.h @@ -1,6 +1,6 @@ /* * - * (C) COPYRIGHT 2012-2015 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2012-2015, 2018 ARM Limited. All rights reserved. 
* * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -46,14 +46,6 @@ */ /** - * Set the initial attributes of a context (when context create flags are set) - * - * Requires: - * - Hold the jsctx_mutex - */ -void kbasep_js_ctx_attr_set_initial_attrs(struct kbase_device *kbdev, struct kbase_context *kctx); - -/** * Retain all attributes of a context * * This occurs on scheduling in the context on the runpool (but after diff --git a/mali_kbase/mali_kbase_js_defs.h b/mali_kbase/mali_kbase_js_defs.h index 7385daa..052a0b3 100644 --- a/mali_kbase/mali_kbase_js_defs.h +++ b/mali_kbase/mali_kbase_js_defs.h @@ -1,6 +1,6 @@ /* * - * (C) COPYRIGHT 2011-2017 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2011-2018 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -151,18 +151,19 @@ typedef u32 kbasep_js_atom_done_code; */ enum { /* - * In this mode, the context containing higher priority atoms will be - * scheduled first and also the new runnable higher priority atoms can - * preempt lower priority atoms currently running on the GPU, even if - * they belong to a different context. + * In this mode, higher priority atoms will be scheduled first, + * regardless of the context they belong to. Newly-runnable higher + * priority atoms can preempt lower priority atoms currently running on + * the GPU, even if they belong to a different context. */ KBASE_JS_SYSTEM_PRIORITY_MODE = 0, /* - * In this mode, the contexts are scheduled in round-robin fashion and - * the new runnable higher priority atoms can preempt the lower priority - * atoms currently running on the GPU, only if they belong to the same - * context. + * In this mode, the highest-priority atom will be chosen from each + * context in turn using a round-robin algorithm, so priority only has + * an effect within the context an atom belongs to. Newly-runnable + * higher priority atoms can preempt the lower priority atoms currently + * running on the GPU, but only if they belong to the same context. */ KBASE_JS_PROCESS_LOCAL_PRIORITY_MODE, diff --git a/mali_kbase/mali_kbase_mem.c b/mali_kbase/mali_kbase_mem.c index 3940024..3d0de90 100644 --- a/mali_kbase/mali_kbase_mem.c +++ b/mali_kbase/mali_kbase_mem.c @@ -79,21 +79,28 @@ static struct rb_root *kbase_gpu_va_to_rbtree(struct kbase_context *kctx, { struct rb_root *rbtree = NULL; + /* The gpu_pfn can only be greater than the starting pfn of the EXEC_VA + * zone if this has been initialized. 
+ */ + if (gpu_pfn >= kctx->exec_va_start) + rbtree = &kctx->reg_rbtree_exec; + else { + u64 same_va_end; + #ifdef CONFIG_64BIT - if (kbase_ctx_flag(kctx, KCTX_COMPAT)) { + if (kbase_ctx_flag(kctx, KCTX_COMPAT)) #endif /* CONFIG_64BIT */ - if (gpu_pfn >= KBASE_REG_ZONE_CUSTOM_VA_BASE) - rbtree = &kctx->reg_rbtree_custom; - else - rbtree = &kctx->reg_rbtree_same; + same_va_end = KBASE_REG_ZONE_CUSTOM_VA_BASE; #ifdef CONFIG_64BIT - } else { - if (gpu_pfn >= kctx->same_va_end) + else + same_va_end = kctx->same_va_end; +#endif /* CONFIG_64BIT */ + + if (gpu_pfn >= same_va_end) rbtree = &kctx->reg_rbtree_custom; else rbtree = &kctx->reg_rbtree_same; } -#endif /* CONFIG_64BIT */ return rbtree; } @@ -224,7 +231,6 @@ struct kbase_va_region *kbase_find_region_base_address( rbnode = rbnode->rb_right; else return reg; - } return NULL; @@ -615,11 +621,15 @@ static void kbase_region_tracker_ds_init(struct kbase_context *kctx, kctx->reg_rbtree_same = RB_ROOT; kbase_region_tracker_insert(same_va_reg); - /* Although custom_va_reg doesn't always exist, + /* Although custom_va_reg and exec_va_reg don't always exist, * initialize unconditionally because of the mem_view debugfs - * implementation which relies on this being empty. + * implementation which relies on them being empty. + * + * The difference between the two is that the EXEC_VA region + * is never initialized at this stage. */ kctx->reg_rbtree_custom = RB_ROOT; + kctx->reg_rbtree_exec = RB_ROOT; if (custom_va_reg) kbase_region_tracker_insert(custom_va_reg); @@ -644,6 +654,7 @@ void kbase_region_tracker_term(struct kbase_context *kctx) { kbase_region_tracker_erase_rbtree(&kctx->reg_rbtree_same); kbase_region_tracker_erase_rbtree(&kctx->reg_rbtree_custom); + kbase_region_tracker_erase_rbtree(&kctx->reg_rbtree_exec); } void kbase_region_tracker_term_rbtree(struct rb_root *rbtree) @@ -657,9 +668,6 @@ static size_t kbase_get_same_va_bits(struct kbase_context *kctx) (size_t) kctx->kbdev->gpu_props.mmu.va_bits); } -/** - * Initialize the region tracker data structure. - */ int kbase_region_tracker_init(struct kbase_context *kctx) { struct kbase_va_region *same_va_reg; @@ -709,12 +717,17 @@ int kbase_region_tracker_init(struct kbase_context *kctx) goto fail_free_same_va; } #ifdef CONFIG_64BIT + } else { + custom_va_size = 0; } #endif kbase_region_tracker_ds_init(kctx, same_va_reg, custom_va_reg); kctx->same_va_end = same_va_pages + 1; + kctx->gpu_va_end = kctx->same_va_end + custom_va_size; + kctx->exec_va_start = U64_MAX; + kctx->jit_va = false; kbase_gpu_vm_unlock(kctx); @@ -735,11 +748,12 @@ static int kbase_region_tracker_init_jit_64(struct kbase_context *kctx, struct kbase_va_region *custom_va_reg; u64 same_va_bits = kbase_get_same_va_bits(kctx); u64 total_va_size; - int err; total_va_size = (1ULL << (same_va_bits - PAGE_SHIFT)) - 1; - kbase_gpu_vm_lock(kctx); + /* First verify that a JIT_VA zone has not been created already. */ + if (kctx->jit_va) + return -EINVAL; /* * Modify the same VA free region after creation. Be careful to ensure @@ -748,23 +762,11 @@ static int kbase_region_tracker_init_jit_64(struct kbase_context *kctx, */ same_va = kbase_region_tracker_find_region_base_address(kctx, PAGE_SIZE); - if (!same_va) { - err = -ENOMEM; - goto fail_unlock; - } - - /* The region flag or region size has changed since creation so bail. 
*/ - if ((!(same_va->flags & KBASE_REG_FREE)) || - (same_va->nr_pages != total_va_size)) { - err = -ENOMEM; - goto fail_unlock; - } + if (!same_va) + return -ENOMEM; - if (same_va->nr_pages < jit_va_pages || - kctx->same_va_end < jit_va_pages) { - err = -ENOMEM; - goto fail_unlock; - } + if (same_va->nr_pages < jit_va_pages || kctx->same_va_end < jit_va_pages) + return -ENOMEM; /* It's safe to adjust the same VA zone now */ same_va->nr_pages -= jit_va_pages; @@ -779,44 +781,121 @@ static int kbase_region_tracker_init_jit_64(struct kbase_context *kctx, jit_va_pages, KBASE_REG_ZONE_CUSTOM_VA); - if (!custom_va_reg) { - /* - * The context will be destroyed if we fail here so no point - * reverting the change we made to same_va. - */ - err = -ENOMEM; - goto fail_unlock; - } + /* + * The context will be destroyed if we fail here so no point + * reverting the change we made to same_va. + */ + if (!custom_va_reg) + return -ENOMEM; kbase_region_tracker_insert(custom_va_reg); - - kbase_gpu_vm_unlock(kctx); return 0; - -fail_unlock: - kbase_gpu_vm_unlock(kctx); - return err; } #endif int kbase_region_tracker_init_jit(struct kbase_context *kctx, u64 jit_va_pages, u8 max_allocations, u8 trim_level) { + int err = 0; + if (trim_level > 100) return -EINVAL; - kctx->jit_max_allocations = max_allocations; - kctx->trim_level = trim_level; + kbase_gpu_vm_lock(kctx); #ifdef CONFIG_64BIT if (!kbase_ctx_flag(kctx, KCTX_COMPAT)) - return kbase_region_tracker_init_jit_64(kctx, jit_va_pages); + err = kbase_region_tracker_init_jit_64(kctx, jit_va_pages); #endif /* * Nothing to do for 32-bit clients, JIT uses the existing * custom VA zone. */ - return 0; + + if (!err) { + kctx->jit_max_allocations = max_allocations; + kctx->trim_level = trim_level; + kctx->jit_va = true; + } + + kbase_gpu_vm_unlock(kctx); + + return err; +} + +int kbase_region_tracker_init_exec(struct kbase_context *kctx, u64 exec_va_pages) +{ + struct kbase_va_region *shrinking_va_reg; + struct kbase_va_region *exec_va_reg; + u64 exec_va_start, exec_va_base_addr; + int err; + + /* The EXEC_VA zone shall be created by making space at the end of the + * address space. Firstly, verify that the number of EXEC_VA pages + * requested by the client is reasonable and then make sure that it is + * not greater than the address space itself before calculating the base + * address of the new zone. + */ + if (exec_va_pages == 0 || exec_va_pages > KBASE_REG_ZONE_EXEC_VA_MAX_PAGES) + return -EINVAL; + + kbase_gpu_vm_lock(kctx); + + /* First verify that a JIT_VA zone has not been created already. 
*/ + if (kctx->jit_va) { + err = -EPERM; + goto exit_unlock; + } + + if (exec_va_pages > kctx->gpu_va_end) { + err = -ENOMEM; + goto exit_unlock; + } + + exec_va_start = kctx->gpu_va_end - exec_va_pages; + exec_va_base_addr = exec_va_start << PAGE_SHIFT; + + shrinking_va_reg = kbase_region_tracker_find_region_enclosing_address(kctx, + exec_va_base_addr); + if (!shrinking_va_reg) { + err = -ENOMEM; + goto exit_unlock; + } + + /* Make sure that the EXEC_VA region is still uninitialized */ + if ((shrinking_va_reg->flags & KBASE_REG_ZONE_MASK) == + KBASE_REG_ZONE_EXEC_VA) { + err = -EPERM; + goto exit_unlock; + } + + if (shrinking_va_reg->nr_pages <= exec_va_pages) { + err = -ENOMEM; + goto exit_unlock; + } + + exec_va_reg = kbase_alloc_free_region(&kctx->reg_rbtree_exec, + exec_va_start, + exec_va_pages, + KBASE_REG_ZONE_EXEC_VA); + if (!exec_va_reg) { + err = -ENOMEM; + goto exit_unlock; + } + + shrinking_va_reg->nr_pages -= exec_va_pages; +#ifdef CONFIG_64BIT + if (!kbase_ctx_flag(kctx, KCTX_COMPAT)) + kctx->same_va_end -= exec_va_pages; +#endif + kctx->exec_va_start = exec_va_start; + + kbase_region_tracker_insert(exec_va_reg); + err = 0; + +exit_unlock: + kbase_gpu_vm_unlock(kctx); + return err; } @@ -938,6 +1017,10 @@ static struct kbase_context *kbase_reg_flags_to_kctx( kctx = container_of(rbtree, struct kbase_context, reg_rbtree_same); break; + case KBASE_REG_ZONE_EXEC_VA: + kctx = container_of(rbtree, struct kbase_context, + reg_rbtree_exec); + break; default: WARN(1, "Unknown zone in region: flags=0x%lx\n", reg->flags); break; @@ -2917,6 +3000,30 @@ update_failed_unlocked: return ret; } +static void trace_jit_stats(struct kbase_context *kctx, + u32 bin_id, u32 max_allocations) +{ + const u32 alloc_count = + kctx->jit_current_allocations_per_bin[bin_id]; + + struct kbase_va_region *walker; + u32 va_pages = 0; + u32 ph_pages = 0; + + mutex_lock(&kctx->jit_evict_lock); + list_for_each_entry(walker, &kctx->jit_active_head, jit_node) { + if (walker->jit_bin_id != bin_id) + continue; + + va_pages += walker->nr_pages; + ph_pages += walker->gpu_alloc->nents; + } + mutex_unlock(&kctx->jit_evict_lock); + + KBASE_TLSTREAM_AUX_JIT_STATS(kctx->id, bin_id, max_allocations, + alloc_count, va_pages, ph_pages); +} + struct kbase_va_region *kbase_jit_allocate(struct kbase_context *kctx, struct base_jit_alloc_info *info) { @@ -3069,6 +3176,8 @@ struct kbase_va_region *kbase_jit_allocate(struct kbase_context *kctx, kctx->jit_current_allocations++; kctx->jit_current_allocations_per_bin[info->bin_id]++; + trace_jit_stats(kctx, info->bin_id, info->max_allocations); + reg->jit_usage_id = info->usage_id; reg->jit_bin_id = info->bin_id; @@ -3112,6 +3221,8 @@ void kbase_jit_free(struct kbase_context *kctx, struct kbase_va_region *reg) kctx->jit_current_allocations--; kctx->jit_current_allocations_per_bin[reg->jit_bin_id]--; + trace_jit_stats(kctx, reg->jit_bin_id, UINT_MAX); + kbase_mem_evictable_mark_reclaim(reg->gpu_alloc); kbase_gpu_vm_lock(kctx); @@ -3225,6 +3336,17 @@ void kbase_jit_term(struct kbase_context *kctx) cancel_work_sync(&kctx->jit_work); } +bool kbase_has_exec_va_zone(struct kbase_context *kctx) +{ + bool has_exec_va_zone; + + kbase_gpu_vm_lock(kctx); + has_exec_va_zone = (kctx->exec_va_start != U64_MAX); + kbase_gpu_vm_unlock(kctx); + + return has_exec_va_zone; +} + static int kbase_jd_user_buf_map(struct kbase_context *kctx, struct kbase_va_region *reg) { diff --git a/mali_kbase/mali_kbase_mem.h b/mali_kbase/mali_kbase_mem.h index 5958cf4..a873bb1 100644 --- a/mali_kbase/mali_kbase_mem.h +++ 
b/mali_kbase/mali_kbase_mem.h @@ -328,6 +328,13 @@ struct kbase_va_region { #define KBASE_REG_ZONE_CUSTOM_VA_SIZE (((1ULL << 44) >> PAGE_SHIFT) - KBASE_REG_ZONE_CUSTOM_VA_BASE) /* end 32-bit clients only */ +/* The starting address and size of the GPU-executable zone are dynamic + * and depend on the platform and the number of pages requested by the + * user process, with an upper limit of 4 GB. + */ +#define KBASE_REG_ZONE_EXEC_VA KBASE_REG_ZONE(2) +#define KBASE_REG_ZONE_EXEC_VA_MAX_PAGES ((1ULL << 32) >> PAGE_SHIFT) /* 4 GB */ + unsigned long flags; @@ -792,9 +799,40 @@ void kbase_mem_pool_mark_dying(struct kbase_mem_pool *pool); */ struct page *kbase_mem_alloc_page(struct kbase_mem_pool *pool); +/** + * kbase_region_tracker_init - Initialize the region tracker data structure + * @kctx: kbase context + * + * Return: 0 if success, negative error code otherwise. + */ int kbase_region_tracker_init(struct kbase_context *kctx); + +/** + * kbase_region_tracker_init_jit - Initialize the JIT region + * @kctx: kbase context + * @jit_va_pages: Size of the JIT region in pages + * @max_allocations: Maximum number of allocations allowed for the JIT region + * @trim_level: Trim level for the JIT region + * + * Return: 0 if success, negative error code otherwise. + */ int kbase_region_tracker_init_jit(struct kbase_context *kctx, u64 jit_va_pages, u8 max_allocations, u8 trim_level); + +/** + * kbase_region_tracker_init_exec - Initialize the EXEC_VA region + * @kctx: kbase context + * @exec_va_pages: Size of the JIT region in pages. + * It must not be greater than 4 GB. + * + * Return: 0 if success, negative error code otherwise. + */ +int kbase_region_tracker_init_exec(struct kbase_context *kctx, u64 exec_va_pages); + +/** + * kbase_region_tracker_term - Terminate the JIT region + * @kctx: kbase context + */ void kbase_region_tracker_term(struct kbase_context *kctx); /** @@ -1349,6 +1387,18 @@ bool kbase_jit_evict(struct kbase_context *kctx); void kbase_jit_term(struct kbase_context *kctx); /** + * kbase_has_exec_va_zone - EXEC_VA zone predicate + * + * Determine whether an EXEC_VA zone has been created for the GPU address space + * of the given kbase context. + * + * @kctx: kbase context + * + * Return: True if the kbase context has an EXEC_VA zone. + */ +bool kbase_has_exec_va_zone(struct kbase_context *kctx); + +/** * kbase_map_external_resource - Map an external resource to the GPU. * @kctx: kbase context. * @reg: The region to map. diff --git a/mali_kbase/mali_kbase_mem_linux.c b/mali_kbase/mali_kbase_mem_linux.c index 1299353..c70112d 100644 --- a/mali_kbase/mali_kbase_mem_linux.c +++ b/mali_kbase/mali_kbase_mem_linux.c @@ -250,6 +250,16 @@ struct kbase_va_region *kbase_mem_alloc(struct kbase_context *kctx, goto bad_flags; } +#ifdef CONFIG_DEBUG_FS + if (unlikely(kbase_ctx_flag(kctx, KCTX_INFINITE_CACHE))) { + /* Mask coherency flags if infinite cache is enabled to prevent + * the skipping of syncs from BASE side. 
+ */ + *flags &= ~(BASE_MEM_COHERENT_SYSTEM_REQUIRED | + BASE_MEM_COHERENT_SYSTEM); + } +#endif + if ((*flags & BASE_MEM_UNCACHED_GPU) != 0 && (*flags & BASE_MEM_COHERENT_SYSTEM_REQUIRED) != 0) { /* Remove COHERENT_SYSTEM_REQUIRED flag if uncached GPU mapping is requested */ @@ -273,6 +283,9 @@ struct kbase_va_region *kbase_mem_alloc(struct kbase_context *kctx, if (*flags & BASE_MEM_SAME_VA) { rbtree = &kctx->reg_rbtree_same; zone = KBASE_REG_ZONE_SAME_VA; + } else if ((*flags & BASE_MEM_PROT_GPU_EX) && kbase_has_exec_va_zone(kctx)) { + rbtree = &kctx->reg_rbtree_exec; + zone = KBASE_REG_ZONE_EXEC_VA; } else { rbtree = &kctx->reg_rbtree_custom; zone = KBASE_REG_ZONE_CUSTOM_VA; @@ -914,6 +927,9 @@ static struct kbase_va_region *kbase_mem_from_umm(struct kbase_context *kctx, if (!reg) goto no_region; + if (kbase_update_region_flags(kctx, reg, *flags) != 0) + goto invalid_flags; + reg->gpu_alloc = kbase_alloc_create(kctx, *va_pages, KBASE_MEM_TYPE_IMPORTED_UMM); if (IS_ERR_OR_NULL(reg->gpu_alloc)) @@ -924,9 +940,6 @@ static struct kbase_va_region *kbase_mem_from_umm(struct kbase_context *kctx, /* No pages to map yet */ reg->gpu_alloc->nents = 0; - if (kbase_update_region_flags(kctx, reg, *flags) != 0) - goto invalid_flags; - reg->flags &= ~KBASE_REG_FREE; reg->flags |= KBASE_REG_GPU_NX; /* UMM is always No eXecute */ reg->flags &= ~KBASE_REG_GROWABLE; /* UMM cannot be grown */ @@ -946,10 +959,8 @@ static struct kbase_va_region *kbase_mem_from_umm(struct kbase_context *kctx, return reg; -invalid_flags: - kbase_mem_phy_alloc_put(reg->gpu_alloc); - kbase_mem_phy_alloc_put(reg->cpu_alloc); no_alloc_obj: +invalid_flags: kfree(reg); no_region: bad_size: @@ -1186,7 +1197,7 @@ u64 kbase_mem_alias(struct kbase_context *kctx, u64 *flags, u64 stride, /* mask to only allowed flags */ *flags &= (BASE_MEM_PROT_GPU_RD | BASE_MEM_PROT_GPU_WR | BASE_MEM_COHERENT_SYSTEM | BASE_MEM_COHERENT_LOCAL | - BASE_MEM_COHERENT_SYSTEM_REQUIRED); + BASE_MEM_PROT_CPU_RD | BASE_MEM_COHERENT_SYSTEM_REQUIRED); if (!(*flags & (BASE_MEM_PROT_GPU_RD | BASE_MEM_PROT_GPU_WR))) { dev_warn(kctx->kbdev->dev, @@ -1787,6 +1798,7 @@ static int kbase_cpu_mmap(struct kbase_context *kctx, struct tagged_addr *page_array; int err = 0; int i; + u64 start_off; map = kzalloc(sizeof(*map), GFP_KERNEL); @@ -1819,6 +1831,38 @@ static int kbase_cpu_mmap(struct kbase_context *kctx, vma->vm_private_data = map; page_array = kbase_get_cpu_phy_pages(reg); + start_off = vma->vm_pgoff - reg->start_pfn + + (aligned_offset >> PAGE_SHIFT); + if (reg->cpu_alloc->type == KBASE_MEM_TYPE_ALIAS && nr_pages) { + struct kbase_aliased *aliased = + reg->cpu_alloc->imported.alias.aliased; + + if (!reg->cpu_alloc->imported.alias.stride || + reg->nr_pages < (start_off + nr_pages)) { + err = -EINVAL; + goto out; + } + + while (start_off >= reg->cpu_alloc->imported.alias.stride) { + aliased++; + start_off -= reg->cpu_alloc->imported.alias.stride; + } + + if (!aliased->alloc) { + /* sink page not available for dumping map */ + err = -EINVAL; + goto out; + } + + if ((start_off + nr_pages) > aliased->length) { + /* not fully backed by physical pages */ + err = -EINVAL; + goto out; + } + + /* ready the pages for dumping map */ + page_array = aliased->alloc->pages + aliased->offset; + } if (!(reg->flags & KBASE_REG_CPU_CACHED) && (reg->flags & (KBASE_REG_CPU_WR|KBASE_REG_CPU_RD))) { @@ -1833,8 +1877,6 @@ static int kbase_cpu_mmap(struct kbase_context *kctx, if (!kaddr) { unsigned long addr = vma->vm_start + aligned_offset; - u64 start_off = vma->vm_pgoff - reg->start_pfn + - 
(aligned_offset>>PAGE_SHIFT); vma->vm_flags |= VM_PFNMAP; for (i = 0; i < nr_pages; i++) { @@ -2127,8 +2169,19 @@ int kbase_mmap(struct file *file, struct vm_area_struct *vma) } #endif /* CONFIG_DMA_SHARED_BUFFER */ - /* limit what we map to the amount currently backed */ - if (reg->cpu_alloc->nents < (vma->vm_pgoff - reg->start_pfn + nr_pages)) { + if (reg->cpu_alloc->type == KBASE_MEM_TYPE_ALIAS) { + /* initial params check for aliased dumping map */ + if (nr_pages > reg->gpu_alloc->imported.alias.stride || + !reg->gpu_alloc->imported.alias.stride || + !nr_pages) { + err = -EINVAL; + dev_warn(dev, "mmap aliased: invalid params!\n"); + goto out_unlock; + } + } + else if (reg->cpu_alloc->nents < + (vma->vm_pgoff - reg->start_pfn + nr_pages)) { + /* limit what we map to the amount currently backed */ if ((vma->vm_pgoff - reg->start_pfn) >= reg->cpu_alloc->nents) nr_pages = 0; else @@ -2431,134 +2484,4 @@ static int kbase_tracking_page_setup(struct kbase_context *kctx, struct vm_area_ return 0; } -void *kbase_va_alloc(struct kbase_context *kctx, u32 size, struct kbase_hwc_dma_mapping *handle) -{ - int res; - void *va; - dma_addr_t dma_pa; - struct kbase_va_region *reg; - struct tagged_addr *page_array; -#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 8, 0)) - unsigned long attrs = DMA_ATTR_WRITE_COMBINE; -#elif (LINUX_VERSION_CODE >= KERNEL_VERSION(3, 5, 0)) - DEFINE_DMA_ATTRS(attrs); -#endif - - u32 pages = ((size - 1) >> PAGE_SHIFT) + 1; - u32 flags = BASE_MEM_PROT_CPU_RD | BASE_MEM_PROT_CPU_WR | - BASE_MEM_PROT_GPU_RD | BASE_MEM_PROT_GPU_WR; - u32 i; - - KBASE_DEBUG_ASSERT(kctx != NULL); - KBASE_DEBUG_ASSERT(0 != size); - KBASE_DEBUG_ASSERT(0 != pages); - - if (size == 0) - goto err; - - /* All the alloc calls return zeroed memory */ -#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 8, 0)) - va = dma_alloc_attrs(kctx->kbdev->dev, size, &dma_pa, GFP_KERNEL, - attrs); -#elif (LINUX_VERSION_CODE >= KERNEL_VERSION(3, 5, 0)) - dma_set_attr(DMA_ATTR_WRITE_COMBINE, &attrs); - va = dma_alloc_attrs(kctx->kbdev->dev, size, &dma_pa, GFP_KERNEL, - &attrs); -#else - va = dma_alloc_writecombine(kctx->kbdev->dev, size, &dma_pa, GFP_KERNEL); -#endif - if (!va) - goto err; - - /* Store the state so we can free it later. 
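The aliased dumping-map path added to kbase_cpu_mmap() and kbase_mmap() above walks the aliased array stride by stride to find which chunk backs a given page offset. The standalone sketch below mirrors that loop with made-up stride and offset values; the driver's struct and field names are not reproduced.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
    /* Hypothetical aliased allocation: chunks laid out every 'stride' pages. */
    uint64_t stride = 256;
    uint64_t start_off = 700;   /* page offset into the aliased region */
    unsigned int chunk = 0;

    while (start_off >= stride) {
        chunk++;
        start_off -= stride;
    }
    /* 700 = 2 * 256 + 188 -> chunk 2, offset 188 */
    printf("chunk %u, offset %llu\n", chunk, (unsigned long long)start_off);
    return 0;
}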
*/ - handle->cpu_va = va; - handle->dma_pa = dma_pa; - handle->size = size; - - - reg = kbase_alloc_free_region(&kctx->reg_rbtree_same, 0, pages, - KBASE_REG_ZONE_SAME_VA); - if (!reg) - goto no_reg; - - reg->flags &= ~KBASE_REG_FREE; - if (kbase_update_region_flags(kctx, reg, flags) != 0) - goto invalid_flags; - - reg->cpu_alloc = kbase_alloc_create(kctx, pages, KBASE_MEM_TYPE_RAW); - if (IS_ERR_OR_NULL(reg->cpu_alloc)) - goto no_alloc; - - reg->gpu_alloc = kbase_mem_phy_alloc_get(reg->cpu_alloc); - - page_array = kbase_get_cpu_phy_pages(reg); - - for (i = 0; i < pages; i++) - page_array[i] = as_tagged(dma_pa + ((dma_addr_t)i << PAGE_SHIFT)); - - reg->cpu_alloc->nents = pages; - - kbase_gpu_vm_lock(kctx); - res = kbase_gpu_mmap(kctx, reg, (uintptr_t) va, pages, 1); - kbase_gpu_vm_unlock(kctx); - if (res) - goto no_mmap; - - return va; - -no_mmap: - kbase_mem_phy_alloc_put(reg->cpu_alloc); - kbase_mem_phy_alloc_put(reg->gpu_alloc); -no_alloc: -invalid_flags: - kfree(reg); -no_reg: -#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 8, 0)) - dma_free_attrs(kctx->kbdev->dev, size, va, dma_pa, attrs); -#elif (LINUX_VERSION_CODE >= KERNEL_VERSION(3, 5, 0)) - dma_free_attrs(kctx->kbdev->dev, size, va, dma_pa, &attrs); -#else - dma_free_writecombine(kctx->kbdev->dev, size, va, dma_pa); -#endif -err: - return NULL; -} -KBASE_EXPORT_SYMBOL(kbase_va_alloc); - -void kbase_va_free(struct kbase_context *kctx, struct kbase_hwc_dma_mapping *handle) -{ - struct kbase_va_region *reg; - int err; -#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3, 5, 0)) && \ - (LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0)) - DEFINE_DMA_ATTRS(attrs); -#endif - - KBASE_DEBUG_ASSERT(kctx != NULL); - KBASE_DEBUG_ASSERT(handle->cpu_va != NULL); - - kbase_gpu_vm_lock(kctx); - reg = kbase_region_tracker_find_region_base_address(kctx, (uintptr_t)handle->cpu_va); - KBASE_DEBUG_ASSERT(reg); - err = kbase_gpu_munmap(kctx, reg); - kbase_gpu_vm_unlock(kctx); - KBASE_DEBUG_ASSERT(!err); - - kbase_mem_phy_alloc_put(reg->cpu_alloc); - kbase_mem_phy_alloc_put(reg->gpu_alloc); - kfree(reg); - -#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 8, 0)) - dma_free_attrs(kctx->kbdev->dev, handle->size, - handle->cpu_va, handle->dma_pa, DMA_ATTR_WRITE_COMBINE); -#elif (LINUX_VERSION_CODE >= KERNEL_VERSION(3, 5, 0)) - dma_set_attr(DMA_ATTR_WRITE_COMBINE, &attrs); - dma_free_attrs(kctx->kbdev->dev, handle->size, - handle->cpu_va, handle->dma_pa, &attrs); -#else - dma_free_writecombine(kctx->kbdev->dev, handle->size, - handle->cpu_va, handle->dma_pa); -#endif -} -KBASE_EXPORT_SYMBOL(kbase_va_free); diff --git a/mali_kbase/mali_kbase_mem_linux.h b/mali_kbase/mali_kbase_mem_linux.h index 0a03bee..5cb88d1 100644 --- a/mali_kbase/mali_kbase_mem_linux.h +++ b/mali_kbase/mali_kbase_mem_linux.h @@ -303,22 +303,6 @@ void *kbase_vmap(struct kbase_context *kctx, u64 gpu_addr, size_t size, */ void kbase_vunmap(struct kbase_context *kctx, struct kbase_vmap_struct *map); -/** @brief Allocate memory from kernel space and map it onto the GPU - * - * @param kctx The context used for the allocation/mapping - * @param size The size of the allocation in bytes - * @param handle An opaque structure used to contain the state needed to free the memory - * @return the VA for kernel space and GPU MMU - */ -void *kbase_va_alloc(struct kbase_context *kctx, u32 size, struct kbase_hwc_dma_mapping *handle); - -/** @brief Free/unmap memory allocated by kbase_va_alloc - * - * @param kctx The context used for the allocation/mapping - * @param handle An opaque structure returned by the 
kbase_va_alloc function. - */ -void kbase_va_free(struct kbase_context *kctx, struct kbase_hwc_dma_mapping *handle); - extern const struct vm_operations_struct kbase_vm_ops; /** diff --git a/mali_kbase/mali_kbase_mmu.c b/mali_kbase/mali_kbase_mmu.c index 5e6732a..84341ca 100644 --- a/mali_kbase/mali_kbase_mmu.c +++ b/mali_kbase/mali_kbase_mmu.c @@ -45,7 +45,7 @@ #include <mali_kbase_hw.h> #include <mali_kbase_mmu_hw.h> #include <mali_kbase_hwaccess_jm.h> -#include <mali_kbase_time.h> +#include <mali_kbase_hwaccess_time.h> #include <mali_kbase_mem.h> #define KBASE_MMU_PAGE_ENTRIES 512 @@ -1404,7 +1404,6 @@ static void kbase_mmu_flush_invalidate_noretain(struct kbase_context *kctx, err = kbase_mmu_hw_do_operation(kbdev, &kbdev->as[kctx->as_nr], vpfn, nr, op, 0); -#if KBASE_GPU_RESET_EN if (err) { /* Flush failed to complete, assume the * GPU has hung and perform a reset to @@ -1414,7 +1413,6 @@ static void kbase_mmu_flush_invalidate_noretain(struct kbase_context *kctx, if (kbase_prepare_to_reset_gpu_locked(kbdev)) kbase_reset_gpu_locked(kbdev); } -#endif /* KBASE_GPU_RESET_EN */ #ifndef CONFIG_MALI_NO_MALI /* @@ -1454,7 +1452,6 @@ static void kbase_mmu_flush_invalidate_as(struct kbase_device *kbdev, err = kbase_mmu_hw_do_operation(kbdev, as, vpfn, nr, op, 0); -#if KBASE_GPU_RESET_EN if (err) { /* Flush failed to complete, assume the GPU has hung and * perform a reset to recover @@ -1464,7 +1461,6 @@ static void kbase_mmu_flush_invalidate_as(struct kbase_device *kbdev, if (kbase_prepare_to_reset_gpu(kbdev)) kbase_reset_gpu(kbdev); } -#endif /* KBASE_GPU_RESET_EN */ mutex_unlock(&kbdev->mmu_hw_mutex); /* AS transaction end */ @@ -2054,9 +2050,7 @@ void bus_fault_worker(struct work_struct *data) struct kbase_context *kctx; struct kbase_device *kbdev; struct kbase_fault *fault; -#if KBASE_GPU_RESET_EN bool reset_status = false; -#endif /* KBASE_GPU_RESET_EN */ faulting_as = container_of(data, struct kbase_as, work_busfault); fault = &faulting_as->bf_data; @@ -2088,7 +2082,6 @@ void bus_fault_worker(struct work_struct *data) } -#if KBASE_GPU_RESET_EN if (kbase_hw_has_issue(kbdev, BASE_HW_ISSUE_8245)) { /* Due to H/W issue 8245 we need to reset the GPU after using UNMAPPED mode. * We start the reset before switching to UNMAPPED to ensure that unrelated jobs @@ -2097,7 +2090,6 @@ void bus_fault_worker(struct work_struct *data) dev_err(kbdev->dev, "GPU bus error occurred. 
For this GPU version we now soft-reset as part of bus error recovery\n"); reset_status = kbase_prepare_to_reset_gpu(kbdev); } -#endif /* KBASE_GPU_RESET_EN */ /* NOTE: If GPU already powered off for suspend, we don't need to switch to unmapped */ if (!kbase_pm_context_active_handle_suspend(kbdev, KBASE_PM_SUSPEND_HANDLER_DONT_REACTIVATE)) { unsigned long flags; @@ -2122,10 +2114,8 @@ void bus_fault_worker(struct work_struct *data) kbase_pm_context_idle(kbdev); } -#if KBASE_GPU_RESET_EN if (kbase_hw_has_issue(kbdev, BASE_HW_ISSUE_8245) && reset_status) kbase_reset_gpu(kbdev); -#endif /* KBASE_GPU_RESET_EN */ kbasep_js_runpool_release_ctx(kbdev, kctx); @@ -2336,9 +2326,7 @@ static void kbase_mmu_report_fault_and_kill(struct kbase_context *kctx, struct kbase_device *kbdev; struct kbasep_js_device_data *js_devdata; -#if KBASE_GPU_RESET_EN bool reset_status = false; -#endif as_no = as->number; kbdev = kctx->kbdev; @@ -2375,11 +2363,9 @@ static void kbase_mmu_report_fault_and_kill(struct kbase_context *kctx, if ((kbdev->hwcnt.kctx) && (kbdev->hwcnt.kctx->as_nr == as_no) && (kbdev->hwcnt.backend.state == KBASE_INSTR_STATE_DUMPING)) { - unsigned int num_core_groups = kbdev->gpu_props.num_core_groups; - if ((fault->addr >= kbdev->hwcnt.addr) && (fault->addr < (kbdev->hwcnt.addr + - (num_core_groups * 2048)))) + kbdev->hwcnt.addr_bytes))) kbdev->hwcnt.backend.state = KBASE_INSTR_STATE_FAULT; } @@ -2394,7 +2380,6 @@ static void kbase_mmu_report_fault_and_kill(struct kbase_context *kctx, kbase_backend_jm_kill_jobs_from_kctx(kctx); /* AS transaction begin */ mutex_lock(&kbdev->mmu_hw_mutex); -#if KBASE_GPU_RESET_EN if (kbase_hw_has_issue(kbdev, BASE_HW_ISSUE_8245)) { /* Due to H/W issue 8245 we need to reset the GPU after using UNMAPPED mode. * We start the reset before switching to UNMAPPED to ensure that unrelated jobs @@ -2403,7 +2388,6 @@ static void kbase_mmu_report_fault_and_kill(struct kbase_context *kctx, dev_err(kbdev->dev, "Unhandled page fault. 
For this GPU version we now soft-reset the GPU as part of page fault recovery."); reset_status = kbase_prepare_to_reset_gpu(kbdev); } -#endif /* KBASE_GPU_RESET_EN */ /* switch to UNMAPPED mode, will abort all jobs and stop any hw counter dumping */ spin_lock_irqsave(&kbdev->hwaccess_lock, flags); kbase_mmu_disable(kctx); @@ -2417,10 +2401,8 @@ static void kbase_mmu_report_fault_and_kill(struct kbase_context *kctx, kbase_mmu_hw_enable_fault(kbdev, as, KBASE_MMU_FAULT_TYPE_PAGE_UNEXPECTED); -#if KBASE_GPU_RESET_EN if (kbase_hw_has_issue(kbdev, BASE_HW_ISSUE_8245) && reset_status) kbase_reset_gpu(kbdev); -#endif /* KBASE_GPU_RESET_EN */ } void kbasep_as_do_poke(struct work_struct *work) @@ -2608,7 +2590,6 @@ void kbase_mmu_interrupt_process(struct kbase_device *kbdev, KBASE_MMU_FAULT_TYPE_PAGE_UNEXPECTED); } -#if KBASE_GPU_RESET_EN if (kbase_as_has_bus_fault(as) && kbase_hw_has_issue(kbdev, BASE_HW_ISSUE_8245)) { bool reset_status; @@ -2622,7 +2603,6 @@ void kbase_mmu_interrupt_process(struct kbase_device *kbdev, if (reset_status) kbase_reset_gpu_locked(kbdev); } -#endif /* KBASE_GPU_RESET_EN */ return; } diff --git a/mali_kbase/mali_kbase_pm.c b/mali_kbase/mali_kbase_pm.c index d5b8c77..5699eb8 100644 --- a/mali_kbase/mali_kbase_pm.c +++ b/mali_kbase/mali_kbase_pm.c @@ -30,6 +30,7 @@ #include <mali_kbase.h> #include <mali_midg_regmap.h> #include <mali_kbase_vinstr.h> +#include <mali_kbase_hwcnt_context.h> #include <mali_kbase_pm.h> @@ -83,10 +84,6 @@ int kbase_pm_context_active_handle_suspend(struct kbase_device *kbdev, enum kbas * the policy */ kbase_hwaccess_pm_gpu_active(kbdev); } -#if defined(CONFIG_DEVFREQ_THERMAL) && defined(CONFIG_MALI_DEVFREQ) - if (kbdev->ipa.gpu_active_callback) - kbdev->ipa.gpu_active_callback(kbdev->ipa.model_data); -#endif mutex_unlock(&kbdev->pm.lock); mutex_unlock(&js_devdata->runpool_mutex); @@ -118,25 +115,11 @@ void kbase_pm_context_idle(struct kbase_device *kbdev) /* Wake up anyone waiting for this to become 0 (e.g. suspend). The * waiters must synchronize with us by locking the pm.lock after - * waiting */ + * waiting. + */ wake_up(&kbdev->pm.zero_active_count_wait); } -#if defined(CONFIG_DEVFREQ_THERMAL) && defined(CONFIG_MALI_DEVFREQ) - /* IPA may be using vinstr, in which case there may be one PM reference - * still held when all other contexts have left the GPU. Inform IPA that - * the GPU is now idle so that vinstr can drop it's reference. - * - * If the GPU was only briefly active then it might have gone idle - * before vinstr has taken a PM reference, meaning that active_count is - * zero. We still need to inform IPA in this case, so that vinstr can - * drop the PM reference and avoid keeping the GPU powered - * unnecessarily. - */ - if (c <= 1 && kbdev->ipa.gpu_idle_callback) - kbdev->ipa.gpu_idle_callback(kbdev->ipa.model_data); -#endif - mutex_unlock(&kbdev->pm.lock); mutex_unlock(&js_devdata->runpool_mutex); } @@ -147,10 +130,16 @@ void kbase_pm_suspend(struct kbase_device *kbdev) { KBASE_DEBUG_ASSERT(kbdev); - /* Suspend vinstr. - * This call will block until vinstr is suspended. */ + /* Suspend vinstr. This blocks until the vinstr worker and timer are + * no longer running. + */ kbase_vinstr_suspend(kbdev->vinstr_ctx); + /* Disable GPU hardware counters. + * This call will block until counters are disabled. 
+ */ + kbase_hwcnt_context_disable(kbdev->hwcnt_gpu_ctx); + mutex_lock(&kbdev->pm.lock); KBASE_DEBUG_ASSERT(!kbase_pm_is_suspending(kbdev)); kbdev->pm.suspending = true; @@ -177,6 +166,8 @@ void kbase_pm_suspend(struct kbase_device *kbdev) void kbase_pm_resume(struct kbase_device *kbdev) { + unsigned long flags; + /* MUST happen before any pm_context_active calls occur */ kbase_hwaccess_pm_resume(kbdev); @@ -195,7 +186,11 @@ void kbase_pm_resume(struct kbase_device *kbdev) * need it and the policy doesn't want it on */ kbase_pm_context_idle(kbdev); - /* Resume vinstr operation */ + /* Re-enable GPU hardware counters */ + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); + kbase_hwcnt_context_enable(kbdev->hwcnt_gpu_ctx); + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + + /* Resume vinstr */ kbase_vinstr_resume(kbdev->vinstr_ctx); } - diff --git a/mali_kbase/mali_kbase_softjobs.c b/mali_kbase/mali_kbase_softjobs.c index a3090c1..e762af4 100644 --- a/mali_kbase/mali_kbase_softjobs.c +++ b/mali_kbase/mali_kbase_softjobs.c @@ -1129,8 +1129,9 @@ static int kbase_jit_allocate_process(struct kbase_jd_atom *katom) reg = kctx->jit_alloc[info->id]; new_addr = reg->start_pfn << PAGE_SHIFT; *ptr = new_addr; - KBASE_TLSTREAM_TL_ATTRIB_ATOM_JIT( - katom, info->gpu_alloc_addr, new_addr); + KBASE_TLSTREAM_TL_ATTRIB_ATOM_JIT(katom, + info->gpu_alloc_addr, + new_addr, info->va_pages); kbase_vunmap(kctx, &mapping); } diff --git a/mali_kbase/mali_kbase_tlstream.c b/mali_kbase/mali_kbase_tlstream.c index aaf5782..10e3889 100644 --- a/mali_kbase/mali_kbase_tlstream.c +++ b/mali_kbase/mali_kbase_tlstream.c @@ -170,7 +170,8 @@ enum tl_msg_id_aux { KBASE_AUX_PROTECTED_ENTER_START, KBASE_AUX_PROTECTED_ENTER_END, KBASE_AUX_PROTECTED_LEAVE_START, - KBASE_AUX_PROTECTED_LEAVE_END + KBASE_AUX_PROTECTED_LEAVE_END, + KBASE_AUX_JIT_STATS, }; /*****************************************************************************/ @@ -448,8 +449,8 @@ static const struct tp_desc tp_desc_obj[] = { KBASE_TL_ATTRIB_ATOM_JIT, __stringify(KBASE_TL_ATTRIB_ATOM_JIT), "jit done for atom", - "@pLL", - "atom,edit_addr,new_addr" + "@pLLL", + "atom,edit_addr,new_addr,va_pages" }, { KBASE_TL_ATTRIB_ATOM_JITALLOCINFO, @@ -573,6 +574,13 @@ static const struct tp_desc tp_desc_aux[] = { "leave protected mode end", "@p", "gpu" + }, + { + KBASE_AUX_JIT_STATS, + __stringify(KBASE_AUX_JIT_STATS), + "per-bin JIT statistics", + "@IIIIII", + "ctx_nr,bid,max_allocs,allocs,va_pages,ph_pages" } }; @@ -2165,12 +2173,12 @@ void __kbase_tlstream_tl_attrib_atom_prioritized(void *atom) } void __kbase_tlstream_tl_attrib_atom_jit( - void *atom, u64 edit_addr, u64 new_addr) + void *atom, u64 edit_addr, u64 new_addr, u64 va_pages) { const u32 msg_id = KBASE_TL_ATTRIB_ATOM_JIT; const size_t msg_size = sizeof(msg_id) + sizeof(u64) + sizeof(atom) - + sizeof(edit_addr) + sizeof(new_addr); + + sizeof(edit_addr) + sizeof(new_addr) + sizeof(va_pages); unsigned long flags; char *buffer; size_t pos = 0; @@ -2188,6 +2196,9 @@ void __kbase_tlstream_tl_attrib_atom_jit( buffer, pos, &edit_addr, sizeof(edit_addr)); pos = kbasep_tlstream_write_bytes( buffer, pos, &new_addr, sizeof(new_addr)); + pos = kbasep_tlstream_write_bytes( + buffer, pos, &va_pages, sizeof(va_pages)); + KBASE_DEBUG_ASSERT(msg_size == pos); kbasep_tlstream_msgbuf_release(TL_STREAM_TYPE_OBJ, flags); @@ -2624,3 +2635,40 @@ void __kbase_tlstream_aux_protected_leave_end(void *gpu) kbasep_tlstream_msgbuf_release(TL_STREAM_TYPE_AUX, flags); } + +void __kbase_tlstream_aux_jit_stats(u32 ctx_nr, u32 bid, + u32 
max_allocs, u32 allocs, + u32 va_pages, u32 ph_pages) +{ + const u32 msg_id = KBASE_AUX_JIT_STATS; + const size_t msg_size = sizeof(msg_id) + sizeof(u64) + + sizeof(ctx_nr) + sizeof(bid) + + sizeof(max_allocs) + sizeof(allocs) + + sizeof(va_pages) + sizeof(ph_pages); + unsigned long flags; + char *buffer; + size_t pos = 0; + + buffer = kbasep_tlstream_msgbuf_acquire( + TL_STREAM_TYPE_AUX, + msg_size, &flags); + KBASE_DEBUG_ASSERT(buffer); + + pos = kbasep_tlstream_write_bytes(buffer, pos, &msg_id, sizeof(msg_id)); + pos = kbasep_tlstream_write_timestamp(buffer, pos); + pos = kbasep_tlstream_write_bytes( + buffer, pos, &ctx_nr, sizeof(ctx_nr)); + pos = kbasep_tlstream_write_bytes( + buffer, pos, &bid, sizeof(bid)); + pos = kbasep_tlstream_write_bytes( + buffer, pos, &max_allocs, sizeof(max_allocs)); + pos = kbasep_tlstream_write_bytes( + buffer, pos, &allocs, sizeof(allocs)); + pos = kbasep_tlstream_write_bytes( + buffer, pos, &va_pages, sizeof(va_pages)); + pos = kbasep_tlstream_write_bytes( + buffer, pos, &ph_pages, sizeof(ph_pages)); + KBASE_DEBUG_ASSERT(msg_size == pos); + + kbasep_tlstream_msgbuf_release(TL_STREAM_TYPE_AUX, flags); +} diff --git a/mali_kbase/mali_kbase_tlstream.h b/mali_kbase/mali_kbase_tlstream.h index 6f9656f..e2a3ea4 100644 --- a/mali_kbase/mali_kbase_tlstream.h +++ b/mali_kbase/mali_kbase_tlstream.h @@ -141,7 +141,7 @@ void __kbase_tlstream_tl_attrib_atom_priority(void *atom, u32 prio); void __kbase_tlstream_tl_attrib_atom_state(void *atom, u32 state); void __kbase_tlstream_tl_attrib_atom_prioritized(void *atom); void __kbase_tlstream_tl_attrib_atom_jit( - void *atom, u64 edit_addr, u64 new_addr); + void *atom, u64 edit_addr, u64 new_addr, u64 va_pages); void __kbase_tlstream_tl_attrib_atom_jitallocinfo( void *atom, u64 va_pages, u64 commit_pages, u64 extent, u32 jit_id, u32 bin_id, u32 max_allocations, u32 flags, @@ -163,6 +163,9 @@ void __kbase_tlstream_aux_protected_enter_start(void *gpu); void __kbase_tlstream_aux_protected_enter_end(void *gpu); void __kbase_tlstream_aux_protected_leave_start(void *gpu); void __kbase_tlstream_aux_protected_leave_end(void *gpu); +void __kbase_tlstream_aux_jit_stats(u32 ctx_nr, u32 bin_id, + u32 max_allocations, u32 allocations, + u32 va_pages_nr, u32 ph_pages_nr); #define TLSTREAM_ENABLED (1 << 31) @@ -472,9 +475,11 @@ extern atomic_t kbase_tlstream_enabled; * @atom: atom identifier * @edit_addr: address edited by jit * @new_addr: address placed into the edited location + * @va_pages: maximum number of pages this jit can allocate */ -#define KBASE_TLSTREAM_TL_ATTRIB_ATOM_JIT(atom, edit_addr, new_addr) \ - __TRACE_IF_ENABLED_JD(tl_attrib_atom_jit, atom, edit_addr, new_addr) +#define KBASE_TLSTREAM_TL_ATTRIB_ATOM_JIT(atom, edit_addr, new_addr, va_pages) \ + __TRACE_IF_ENABLED_JD(tl_attrib_atom_jit, atom, edit_addr, \ + new_addr, va_pages) /** * Information about the JIT allocation atom. @@ -652,5 +657,24 @@ extern atomic_t kbase_tlstream_enabled; #define KBASE_TLSTREAM_AUX_PROTECTED_LEAVE_END(gpu) \ __TRACE_IF_ENABLED_LATENCY(aux_protected_leave_end, gpu) +/** + * KBASE_TLSTREAM_AUX_JIT_STATS - JIT allocations per bin statistics + * + * @ctx_nr: kernel context number + * @bid: JIT bin id + * @max_allocs: maximum allocations allowed in this bin. + * UINT_MAX is a special value. It denotes that + * the parameter was not changed since the last time. 
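The __kbase_tlstream_aux_jit_stats() writer above packs a message id, a timestamp and six u32 fields into the stream buffer, asserting that the running position matches the precomputed size. Below is a minimal userspace sketch of that bookkeeping; the msg_id value and helper name are invented (the real id is the KBASE_AUX_JIT_STATS enum value).

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Sketch of the size/position accounting: header (id + u64 timestamp)
 * followed by six u32 payload fields. */
static size_t write_bytes(char *buf, size_t pos, const void *data, size_t len)
{
    memcpy(buf + pos, data, len);
    return pos + len;
}

int main(void)
{
    uint32_t msg_id = 42, fields[6] = { 1, 2, 3, 4, 5, 6 };
    uint64_t timestamp = 123456789ULL;
    const size_t msg_size = sizeof(msg_id) + sizeof(timestamp) + sizeof(fields);
    char buf[64];
    size_t pos = 0;
    int i;

    pos = write_bytes(buf, pos, &msg_id, sizeof(msg_id));
    pos = write_bytes(buf, pos, &timestamp, sizeof(timestamp));
    for (i = 0; i < 6; i++)
        pos = write_bytes(buf, pos, &fields[i], sizeof(fields[i]));

    assert(pos == msg_size);            /* mirrors the KBASE_DEBUG_ASSERT above */
    printf("packed %zu bytes\n", pos);  /* 4 + 8 + 6*4 = 36 */
    return 0;
}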
+ * @allocs: number of active allocations in this bin + * @va_pages: number of virtual pages allocated in this bin + * @ph_pages: number of physical pages allocated in this bin + * + * Function emits a timeline message indicating the JIT statistics + * for a given bin have chaned. + */ +#define KBASE_TLSTREAM_AUX_JIT_STATS(ctx_nr, bid, max_allocs, allocs, va_pages, ph_pages) \ + __TRACE_IF_ENABLED(aux_jit_stats, ctx_nr, bid, \ + max_allocs, allocs, \ + va_pages, ph_pages) #endif /* _KBASE_TLSTREAM_H */ diff --git a/mali_kbase/mali_kbase_trace_defs.h b/mali_kbase/mali_kbase_trace_defs.h index d7364d5..77fb818 100644 --- a/mali_kbase/mali_kbase_trace_defs.h +++ b/mali_kbase/mali_kbase_trace_defs.h @@ -172,8 +172,6 @@ int dummy_array[] = { KBASE_TRACE_CODE_MAKE_CODE(JS_JOB_DONE_TRY_RUN_NEXT_JOB), /* gpu_addr==value to write into JS_HEAD */ KBASE_TRACE_CODE_MAKE_CODE(JS_JOB_DONE_RETRY_NEEDED), - /* kctx is the one being evicted, info_val == kctx to put in */ - KBASE_TRACE_CODE_MAKE_CODE(JS_FAST_START_EVICTS_CTX), KBASE_TRACE_CODE_MAKE_CODE(JS_AFFINITY_SUBMIT_TO_BLOCKED), /* info_val == lower 32 bits of affinity */ KBASE_TRACE_CODE_MAKE_CODE(JS_AFFINITY_CURRENT), diff --git a/mali_kbase/mali_kbase_utility.c b/mali_kbase/mali_kbase_utility.c deleted file mode 100644 index 3ea234a..0000000 --- a/mali_kbase/mali_kbase_utility.c +++ /dev/null @@ -1,38 +0,0 @@ -/* - * - * (C) COPYRIGHT 2012-2013, 2015 ARM Limited. All rights reserved. - * - * This program is free software and is provided to you under the terms of the - * GNU General Public License version 2 as published by the Free Software - * Foundation, and any use by you of this program is subject to the terms - * of such GNU licence. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * - * SPDX-License-Identifier: GPL-2.0 - * - */ - - - -#include <mali_kbase.h> - -bool kbasep_list_member_of(const struct list_head *base, struct list_head *entry) -{ - struct list_head *pos = base->next; - - while (pos != base) { - if (pos == entry) - return true; - - pos = pos->next; - } - return false; -} diff --git a/mali_kbase/mali_kbase_utility.h b/mali_kbase/mali_kbase_utility.h index f2e5a33..8d4f044 100644 --- a/mali_kbase/mali_kbase_utility.h +++ b/mali_kbase/mali_kbase_utility.h @@ -29,17 +29,6 @@ #error "Don't include this file directly, use mali_kbase.h instead" #endif -/** Test whether the given list entry is a member of the given list. 
- * - * @param base The head of the list to be tested - * @param entry The list entry to be tested - * - * @return true if entry is a member of base - * false otherwise - */ -bool kbasep_list_member_of(const struct list_head *base, struct list_head *entry); - - static inline void kbase_timer_setup(struct timer_list *timer, void (*callback)(struct timer_list *timer)) { diff --git a/mali_kbase/mali_kbase_vinstr.c b/mali_kbase/mali_kbase_vinstr.c index df936cf..51cb365 100644 --- a/mali_kbase/mali_kbase_vinstr.c +++ b/mali_kbase/mali_kbase_vinstr.c @@ -20,221 +20,109 @@ * */ +#include "mali_kbase_vinstr.h" +#include "mali_kbase_hwcnt_virtualizer.h" +#include "mali_kbase_hwcnt_types.h" +#include "mali_kbase_hwcnt_reader.h" +#include "mali_kbase_hwcnt_gpu.h" +#include "mali_kbase_ioctl.h" +#include "mali_malisw.h" +#include "mali_kbase_debug.h" + #include <linux/anon_inodes.h> -#include <linux/atomic.h> +#include <linux/fcntl.h> +#include <linux/fs.h> #include <linux/hrtimer.h> -#include <linux/jiffies.h> -#include <linux/kthread.h> -#include <linux/list.h> #include <linux/mm.h> +#include <linux/mutex.h> #include <linux/poll.h> -#include <linux/preempt.h> #include <linux/slab.h> -#include <linux/wait.h> - -#include <mali_kbase.h> -#include <mali_kbase_hwaccess_instr.h> -#include <mali_kbase_hwaccess_jm.h> -#include <mali_kbase_hwcnt_reader.h> -#include <mali_kbase_mem_linux.h> -#include <mali_kbase_tlstream.h> -#ifdef CONFIG_MALI_NO_MALI -#include <backend/gpu/mali_kbase_model_dummy.h> -#endif - -/*****************************************************************************/ +#include <linux/workqueue.h> /* Hwcnt reader API version */ -#define HWCNT_READER_API 1 - -/* The number of nanoseconds in a second. */ -#define NSECS_IN_SEC 1000000000ull /* ns */ - -/* The time resolution of dumping service. */ -#define DUMPING_RESOLUTION 500000ull /* ns */ +#define HWCNT_READER_API 1 -/* The maximal supported number of dumping buffers. */ -#define MAX_BUFFER_COUNT 32 +/* The minimum allowed interval between dumps (equivalent to 10KHz) */ +#define DUMP_INTERVAL_MIN_NS (100 * NSEC_PER_USEC) -/* Size and number of hw counters blocks. */ -#define NR_CNT_BLOCKS_PER_GROUP 8 -#define NR_CNT_PER_BLOCK 64 -#define NR_BYTES_PER_CNT 4 -#define NR_BYTES_PER_HDR 16 -#define PRFCNT_EN_MASK_OFFSET 0x8 - -/*****************************************************************************/ - -enum { - SHADER_HWCNT_BM, - TILER_HWCNT_BM, - MMU_L2_HWCNT_BM, - JM_HWCNT_BM -}; - -enum vinstr_state { - VINSTR_IDLE, - VINSTR_DUMPING, - VINSTR_SUSPENDING, - VINSTR_SUSPENDED, - VINSTR_RESUMING -}; +/* The maximum allowed buffers per client */ +#define MAX_BUFFER_COUNT 32 /** - * struct kbase_vinstr_context - vinstr context per device - * @lock: protects the entire vinstr context, but the list of - * vinstr clients can be updated outside the lock using - * @state_lock. - * @kbdev: pointer to kbase device - * @kctx: pointer to kbase context - * @vmap: vinstr vmap for mapping hwcnt dump buffer - * @gpu_va: GPU hwcnt dump buffer address - * @cpu_va: the CPU side mapping of the hwcnt dump buffer - * @dump_size: size of the dump buffer in bytes - * @bitmap: current set of counters monitored, not always in sync - * with hardware - * @reprogram: when true, reprogram hwcnt block with the new set of - * counters - * @state: vinstr state - * @state_lock: protects information about vinstr state and list of - * clients. 
- * @suspend_waitq: notification queue to trigger state re-validation - * @suspend_cnt: reference counter of vinstr's suspend state - * @suspend_work: worker to execute on entering suspended state - * @resume_work: worker to execute on leaving suspended state - * @nclients: number of attached clients, pending or idle - * @nclients_suspended: number of attached but suspended clients - * @waiting_clients: head of list of clients being periodically sampled - * @idle_clients: head of list of clients being idle - * @suspended_clients: head of list of clients being suspended - * @thread: periodic sampling thread - * @waitq: notification queue of sampling thread - * @request_pending: request for action for sampling thread - * @clients_present: when true, we have at least one client - * Note: this variable is in sync. with nclients and is - * present to preserve simplicity. Protected by state_lock. - * @need_suspend: when true, a suspend has been requested while a resume is - * in progress. Resume worker should queue a suspend. - * @need_resume: when true, a resume has been requested while a suspend is - * in progress. Suspend worker should queue a resume. - * @forced_suspend: when true, the suspend of vinstr needs to take place - * regardless of the kernel/user space clients attached - * to it. In particular, this flag is set when the suspend - * of vinstr is requested on entering protected mode or at - * the time of device suspend. + * struct kbase_vinstr_context - IOCTL interface for userspace hardware + * counters. + * @hvirt: Hardware counter virtualizer used by vinstr. + * @metadata: Hardware counter metadata provided by virtualizer. + * @lock: Lock protecting all vinstr state. + * @suspend_count: Suspend reference count. If non-zero, timer and worker are + * prevented from being re-scheduled. + * @client_count: Number of vinstr clients. + * @clients: List of vinstr clients. + * @dump_timer: Timer that enqueues dump_work to a workqueue. + * @dump_work: Worker for performing periodic counter dumps. 
*/ struct kbase_vinstr_context { - struct mutex lock; - struct kbase_device *kbdev; - struct kbase_context *kctx; - - struct kbase_vmap_struct *vmap; - u64 gpu_va; - void *cpu_va; - size_t dump_size; - u32 bitmap[4]; - bool reprogram; - - enum vinstr_state state; - struct spinlock state_lock; - wait_queue_head_t suspend_waitq; - unsigned int suspend_cnt; - struct work_struct suspend_work; - struct work_struct resume_work; - - u32 nclients; - u32 nclients_suspended; - struct list_head waiting_clients; - struct list_head idle_clients; - struct list_head suspended_clients; - - struct task_struct *thread; - wait_queue_head_t waitq; - atomic_t request_pending; - - bool clients_present; - - bool need_suspend; - bool need_resume; - bool forced_suspend; + struct kbase_hwcnt_virtualizer *hvirt; + const struct kbase_hwcnt_metadata *metadata; + struct mutex lock; + size_t suspend_count; + size_t client_count; + struct list_head clients; + struct hrtimer dump_timer; + struct work_struct dump_work; }; /** - * struct kbase_vinstr_client - a vinstr client attached to a vinstr context - * @vinstr_ctx: vinstr context client is attached to - * @list: node used to attach this client to list in vinstr context - * @buffer_count: number of buffers this client is using - * @event_mask: events this client reacts to - * @dump_size: size of one dump buffer in bytes - * @bitmap: bitmap request for JM, TILER, SHADER and MMU counters - * @legacy_buffer: userspace hwcnt dump buffer (legacy interface) - * @kernel_buffer: kernel hwcnt dump buffer (kernel client interface) - * @accum_buffer: temporary accumulation buffer for preserving counters - * @dump_time: next time this clients shall request hwcnt dump - * @dump_interval: interval between periodic hwcnt dumps - * @dump_buffers: kernel hwcnt dump buffers allocated by this client - * @dump_buffers_meta: metadata of dump buffers - * @meta_idx: index of metadata being accessed by userspace - * @read_idx: index of buffer read by userspace - * @write_idx: index of buffer being written by dumping service - * @waitq: client's notification queue - * @pending: when true, client has attached but hwcnt not yet updated - * @suspended: when true, client is suspended + * struct kbase_vinstr_client - A vinstr client attached to a vinstr context. + * @vctx: Vinstr context client is attached to. + * @hvcli: Hardware counter virtualizer client. + * @node: Node used to attach this client to list in vinstr + * context. + * @dump_interval_ns: Interval between periodic dumps. If 0, not a periodic + * client. + * @next_dump_time_ns: Time in ns when this client's next periodic dump must + * occur. If 0, not a periodic client. + * @enable_map: Counters enable map. + * @dump_bufs: Array of dump buffers allocated by this client. + * @dump_bufs_meta: Metadata of dump buffers. + * @meta_idx: Index of metadata being accessed by userspace. + * @read_idx: Index of buffer read by userspace. + * @write_idx: Index of buffer being written by dump worker. + * @waitq: Client's notification queue. 
*/ struct kbase_vinstr_client { - struct kbase_vinstr_context *vinstr_ctx; - struct list_head list; - unsigned int buffer_count; - u32 event_mask; - size_t dump_size; - u32 bitmap[4]; - void __user *legacy_buffer; - void *kernel_buffer; - void *accum_buffer; - u64 dump_time; - u32 dump_interval; - char *dump_buffers; - struct kbase_hwcnt_reader_metadata *dump_buffers_meta; - atomic_t meta_idx; - atomic_t read_idx; - atomic_t write_idx; - wait_queue_head_t waitq; - bool pending; - bool suspended; -}; - -/** - * struct kbasep_vinstr_wake_up_timer - vinstr service thread wake up timer - * @hrtimer: high resolution timer - * @vinstr_ctx: vinstr context - */ -struct kbasep_vinstr_wake_up_timer { - struct hrtimer hrtimer; - struct kbase_vinstr_context *vinstr_ctx; + struct kbase_vinstr_context *vctx; + struct kbase_hwcnt_virtualizer_client *hvcli; + struct list_head node; + u64 next_dump_time_ns; + u32 dump_interval_ns; + struct kbase_hwcnt_enable_map enable_map; + struct kbase_hwcnt_dump_buffer_array dump_bufs; + struct kbase_hwcnt_reader_metadata *dump_bufs_meta; + atomic_t meta_idx; + atomic_t read_idx; + atomic_t write_idx; + wait_queue_head_t waitq; }; -/*****************************************************************************/ - -static void kbase_vinstr_update_suspend( - struct kbase_vinstr_context *vinstr_ctx); - -static int kbasep_vinstr_service_task(void *data); - static unsigned int kbasep_vinstr_hwcnt_reader_poll( - struct file *filp, - poll_table *wait); + struct file *filp, + poll_table *wait); + static long kbasep_vinstr_hwcnt_reader_ioctl( - struct file *filp, - unsigned int cmd, - unsigned long arg); + struct file *filp, + unsigned int cmd, + unsigned long arg); + static int kbasep_vinstr_hwcnt_reader_mmap( - struct file *filp, - struct vm_area_struct *vma); + struct file *filp, + struct vm_area_struct *vma); + static int kbasep_vinstr_hwcnt_reader_release( - struct inode *inode, - struct file *filp); + struct inode *inode, + struct file *filp); -/* The timeline stream file operations structure. */ +/* Vinstr client file operations */ static const struct file_operations vinstr_client_fops = { .poll = kbasep_vinstr_hwcnt_reader_poll, .unlocked_ioctl = kbasep_vinstr_hwcnt_reader_ioctl, @@ -243,1211 +131,546 @@ static const struct file_operations vinstr_client_fops = { .release = kbasep_vinstr_hwcnt_reader_release, }; -/*****************************************************************************/ - -static int enable_hwcnt(struct kbase_vinstr_context *vinstr_ctx) +/** + * kbasep_vinstr_timestamp_ns() - Get the current time in nanoseconds. + * + * Return: Current time in nanoseconds. + */ +static u64 kbasep_vinstr_timestamp_ns(void) { - struct kbase_context *kctx = vinstr_ctx->kctx; - struct kbase_device *kbdev = kctx->kbdev; - struct kbase_ioctl_hwcnt_enable enable; - int err; - - enable.dump_buffer = vinstr_ctx->gpu_va; - enable.jm_bm = vinstr_ctx->bitmap[JM_HWCNT_BM]; - enable.tiler_bm = vinstr_ctx->bitmap[TILER_HWCNT_BM]; - enable.shader_bm = vinstr_ctx->bitmap[SHADER_HWCNT_BM]; - enable.mmu_l2_bm = vinstr_ctx->bitmap[MMU_L2_HWCNT_BM]; - - /* Mark the context as active so the GPU is kept turned on */ - /* A suspend won't happen here, because we're in a syscall from a - * userspace thread. */ - kbase_pm_context_active(kbdev); - - /* Schedule the context in */ - kbasep_js_schedule_privileged_ctx(kbdev, kctx); - err = kbase_instr_hwcnt_enable_internal(kbdev, kctx, &enable); - if (err) { - /* Release the context. 
This had its own Power Manager Active - * reference */ - kbasep_js_release_privileged_ctx(kbdev, kctx); - - /* Also release our Power Manager Active reference */ - kbase_pm_context_idle(kbdev); - } + struct timespec ts; - return err; + getrawmonotonic(&ts); + return (u64)ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec; } -static void disable_hwcnt(struct kbase_vinstr_context *vinstr_ctx) +/** + * kbasep_vinstr_next_dump_time_ns() - Calculate the next periodic dump time. + * @cur_ts_ns: Current time in nanoseconds. + * @interval: Interval between dumps in nanoseconds. + * + * Return: 0 if interval is 0 (i.e. a non-periodic client), or the next dump + * time that occurs after cur_ts_ns. + */ +static u64 kbasep_vinstr_next_dump_time_ns(u64 cur_ts_ns, u32 interval) { - struct kbase_context *kctx = vinstr_ctx->kctx; - struct kbase_device *kbdev = kctx->kbdev; - int err; - - err = kbase_instr_hwcnt_disable_internal(kctx); - if (err) { - dev_warn(kbdev->dev, "Failed to disable HW counters (ctx:%p)", - kctx); - return; - } - - /* Release the context. This had its own Power Manager Active reference. */ - kbasep_js_release_privileged_ctx(kbdev, kctx); + /* Non-periodic client */ + if (interval == 0) + return 0; - /* Also release our Power Manager Active reference. */ - kbase_pm_context_idle(kbdev); - - dev_dbg(kbdev->dev, "HW counters dumping disabled for context %p", kctx); + /* + * Return the next interval after the current time relative to t=0. + * This means multiple clients with the same period will synchronise, + * regardless of when they were started, allowing the worker to be + * scheduled less frequently. + */ + do_div(cur_ts_ns, interval); + return (cur_ts_ns + 1) * interval; } -static int reprogram_hwcnt(struct kbase_vinstr_context *vinstr_ctx) -{ - disable_hwcnt(vinstr_ctx); - return enable_hwcnt(vinstr_ctx); -} +/** + * kbasep_vinstr_client_dump() - Perform a dump for a client. + * @vcli: Non-NULL pointer to a vinstr client. + * @event_id: Event type that triggered the dump. + * + * Return: 0 on success, else error code. 
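The interval alignment done by kbasep_vinstr_next_dump_time_ns() above means clients sharing a period wake the dump worker together, regardless of when each client started. A standalone sketch of the same arithmetic, with plain division standing in for do_div and made-up timestamps:

#include <stdio.h>
#include <stdint.h>

static uint64_t next_dump_time_ns(uint64_t cur_ts_ns, uint32_t interval)
{
    if (interval == 0)      /* non-periodic client */
        return 0;
    return (cur_ts_ns / interval + 1) * interval;
}

int main(void)
{
    /* Two clients with a 1 ms period, polled at different times, both get
     * the same next dump time. */
    printf("%llu\n", (unsigned long long)next_dump_time_ns(2300000, 1000000)); /* 3000000 */
    printf("%llu\n", (unsigned long long)next_dump_time_ns(2999999, 1000000)); /* 3000000 */
    return 0;
}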
+ */ +static int kbasep_vinstr_client_dump( + struct kbase_vinstr_client *vcli, + enum base_hwcnt_reader_event event_id) +{ + int errcode; + u64 ts_start_ns; + u64 ts_end_ns; + unsigned int write_idx; + unsigned int read_idx; + struct kbase_hwcnt_dump_buffer *dump_buf; + struct kbase_hwcnt_reader_metadata *meta; -static void hwcnt_bitmap_set(u32 dst[4], u32 src[4]) -{ - dst[JM_HWCNT_BM] = src[JM_HWCNT_BM]; - dst[TILER_HWCNT_BM] = src[TILER_HWCNT_BM]; - dst[SHADER_HWCNT_BM] = src[SHADER_HWCNT_BM]; - dst[MMU_L2_HWCNT_BM] = src[MMU_L2_HWCNT_BM]; -} + WARN_ON(!vcli); + lockdep_assert_held(&vcli->vctx->lock); -static void hwcnt_bitmap_union(u32 dst[4], u32 src[4]) -{ - dst[JM_HWCNT_BM] |= src[JM_HWCNT_BM]; - dst[TILER_HWCNT_BM] |= src[TILER_HWCNT_BM]; - dst[SHADER_HWCNT_BM] |= src[SHADER_HWCNT_BM]; - dst[MMU_L2_HWCNT_BM] |= src[MMU_L2_HWCNT_BM]; -} + write_idx = atomic_read(&vcli->write_idx); + read_idx = atomic_read(&vcli->read_idx); -size_t kbase_vinstr_dump_size(struct kbase_device *kbdev) -{ - size_t dump_size; - -#ifndef CONFIG_MALI_NO_MALI - if (kbase_hw_has_feature(kbdev, BASE_HW_FEATURE_V4)) { - u32 nr_cg; - - nr_cg = kbdev->gpu_props.num_core_groups; - dump_size = nr_cg * NR_CNT_BLOCKS_PER_GROUP * - NR_CNT_PER_BLOCK * - NR_BYTES_PER_CNT; - } else -#endif /* CONFIG_MALI_NO_MALI */ - { - /* assume v5 for now */ -#ifdef CONFIG_MALI_NO_MALI - u32 nr_l2 = KBASE_DUMMY_MODEL_MAX_MEMSYS_BLOCKS; - u64 core_mask = - (1ULL << KBASE_DUMMY_MODEL_MAX_SHADER_CORES) - 1; -#else - base_gpu_props *props = &kbdev->gpu_props.props; - u32 nr_l2 = props->l2_props.num_l2_slices; - u64 core_mask = props->coherency_info.group[0].core_mask; -#endif - u32 nr_blocks = fls64(core_mask); + /* Check if there is a place to copy HWC block into. */ + if (write_idx - read_idx == vcli->dump_bufs.buf_cnt) + return -EBUSY; + write_idx %= vcli->dump_bufs.buf_cnt; - /* JM and tiler counter blocks are always present */ - dump_size = (2 + nr_l2 + nr_blocks) * - NR_CNT_PER_BLOCK * - NR_BYTES_PER_CNT; - } - return dump_size; -} -KBASE_EXPORT_TEST_API(kbase_vinstr_dump_size); + dump_buf = &vcli->dump_bufs.bufs[write_idx]; + meta = &vcli->dump_bufs_meta[write_idx]; -static size_t kbasep_vinstr_dump_size_ctx( - struct kbase_vinstr_context *vinstr_ctx) -{ - return kbase_vinstr_dump_size(vinstr_ctx->kctx->kbdev); -} + errcode = kbase_hwcnt_virtualizer_client_dump( + vcli->hvcli, &ts_start_ns, &ts_end_ns, dump_buf); + if (errcode) + return errcode; -static int kbasep_vinstr_map_kernel_dump_buffer( - struct kbase_vinstr_context *vinstr_ctx) -{ - struct kbase_va_region *reg; - struct kbase_context *kctx = vinstr_ctx->kctx; - u64 flags, nr_pages; - - flags = BASE_MEM_PROT_CPU_RD | BASE_MEM_PROT_GPU_WR | - BASE_MEM_PERMANENT_KERNEL_MAPPING | BASE_MEM_CACHED_CPU; - if (kctx->kbdev->mmu_mode->flags & - KBASE_MMU_MODE_HAS_NON_CACHEABLE) - flags |= BASE_MEM_UNCACHED_GPU; - vinstr_ctx->dump_size = kbasep_vinstr_dump_size_ctx(vinstr_ctx); - nr_pages = PFN_UP(vinstr_ctx->dump_size); - - reg = kbase_mem_alloc(kctx, nr_pages, nr_pages, 0, &flags, - &vinstr_ctx->gpu_va); - if (!reg) - return -ENOMEM; + /* Patch the dump buf headers, to hide the counters that other hwcnt + * clients are using. 
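kbasep_vinstr_client_dump() above treats write_idx and read_idx as free-running counters: their difference is the number of dumps in flight, and the slot actually written is the index modulo the buffer count. A small sketch of that occupancy check with made-up index values:

#include <stdio.h>

int main(void)
{
    unsigned int buf_cnt = 4;          /* buffer_count from the reader setup */
    unsigned int read_idx = 3;
    unsigned int write_idx;

    for (write_idx = 3; write_idx <= 7; write_idx++) {
        if (write_idx - read_idx == buf_cnt)
            printf("write_idx=%u: ring full, dump returns -EBUSY\n", write_idx);
        else
            printf("write_idx=%u: write into slot %u\n",
                   write_idx, write_idx % buf_cnt);
    }
    return 0;
}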
+ */ + kbase_hwcnt_gpu_patch_dump_headers(dump_buf, &vcli->enable_map); - vinstr_ctx->cpu_va = kbase_phy_alloc_mapping_get(kctx, - vinstr_ctx->gpu_va, &vinstr_ctx->vmap); + /* Zero all non-enabled counters (current values are undefined) */ + kbase_hwcnt_dump_buffer_zero_non_enabled(dump_buf, &vcli->enable_map); - if (!vinstr_ctx->cpu_va) { - kbase_mem_free(kctx, vinstr_ctx->gpu_va); - return -ENOMEM; - } + meta->timestamp = ts_end_ns; + meta->event_id = event_id; + meta->buffer_idx = write_idx; + /* Notify client. Make sure all changes to memory are visible. */ + wmb(); + atomic_inc(&vcli->write_idx); + wake_up_interruptible(&vcli->waitq); return 0; } -static void kbasep_vinstr_unmap_kernel_dump_buffer( - struct kbase_vinstr_context *vinstr_ctx) +/** + * kbasep_vinstr_client_clear() - Reset all the client's counters to zero. + * @vcli: Non-NULL pointer to a vinstr client. + * + * Return: 0 on success, else error code. + */ +static int kbasep_vinstr_client_clear(struct kbase_vinstr_client *vcli) { - struct kbase_context *kctx = vinstr_ctx->kctx; + u64 ts_start_ns; + u64 ts_end_ns; - kbase_phy_alloc_mapping_put(kctx, vinstr_ctx->vmap); - kbase_mem_free(kctx, vinstr_ctx->gpu_va); + WARN_ON(!vcli); + lockdep_assert_held(&vcli->vctx->lock); + + /* A virtualizer dump with a NULL buffer will just clear the virtualizer + * client's buffer. + */ + return kbase_hwcnt_virtualizer_client_dump( + vcli->hvcli, &ts_start_ns, &ts_end_ns, NULL); } /** - * kbasep_vinstr_create_kctx - create kernel context for vinstr - * @vinstr_ctx: vinstr context - * Return: zero on success + * kbasep_vinstr_reschedule_worker() - Update next dump times for all periodic + * vinstr clients, then reschedule the dump + * worker appropriately. + * @vctx: Non-NULL pointer to the vinstr context. + * + * If there are no periodic clients, then the dump worker will not be + * rescheduled. Else, the dump worker will be rescheduled for the next periodic + * client dump. */ -static int kbasep_vinstr_create_kctx(struct kbase_vinstr_context *vinstr_ctx) +static void kbasep_vinstr_reschedule_worker(struct kbase_vinstr_context *vctx) { - struct kbase_device *kbdev = vinstr_ctx->kbdev; - struct kbasep_kctx_list_element *element = NULL; - unsigned long flags; - bool enable_backend = false; - int err; - - vinstr_ctx->kctx = kbase_create_context(vinstr_ctx->kbdev, true); - if (!vinstr_ctx->kctx) - return -ENOMEM; + u64 cur_ts_ns; + u64 earliest_next_ns = U64_MAX; + struct kbase_vinstr_client *pos; - /* Map the master kernel dump buffer. The HW dumps the counters - * into this memory region. */ - err = kbasep_vinstr_map_kernel_dump_buffer(vinstr_ctx); - if (err) - goto failed_map; - - /* Add kernel context to list of contexts associated with device. */ - element = kzalloc(sizeof(*element), GFP_KERNEL); - if (element) { - element->kctx = vinstr_ctx->kctx; - mutex_lock(&kbdev->kctx_list_lock); - list_add(&element->link, &kbdev->kctx_list); - - /* Inform timeline client about new context. - * Do this while holding the lock to avoid tracepoint - * being created in both body and summary stream. */ - KBASE_TLSTREAM_TL_NEW_CTX( - vinstr_ctx->kctx, - vinstr_ctx->kctx->id, - (u32)(vinstr_ctx->kctx->tgid)); - - mutex_unlock(&kbdev->kctx_list_lock); - } else { - /* Don't treat this as a fail - just warn about it. */ - dev_warn(kbdev->dev, - "couldn't add kctx to kctx_list\n"); - } + WARN_ON(!vctx); + lockdep_assert_held(&vctx->lock); - /* Don't enable hardware counters if vinstr is suspended. 
- * Note that vinstr resume code is run under vinstr context lock, - * lower layer will be enabled as needed on resume. */ - spin_lock_irqsave(&vinstr_ctx->state_lock, flags); - if (VINSTR_IDLE == vinstr_ctx->state) - enable_backend = true; - vinstr_ctx->clients_present = true; - spin_unlock_irqrestore(&vinstr_ctx->state_lock, flags); - if (enable_backend) - err = enable_hwcnt(vinstr_ctx); - if (err) - goto failed_enable; - - vinstr_ctx->thread = kthread_run( - kbasep_vinstr_service_task, - vinstr_ctx, - "mali_vinstr_service"); - if (IS_ERR(vinstr_ctx->thread)) { - err = PTR_ERR(vinstr_ctx->thread); - goto failed_kthread; - } + cur_ts_ns = kbasep_vinstr_timestamp_ns(); - return 0; + /* + * Update each client's next dump time, and find the earliest next + * dump time if any of the clients have a non-zero interval. + */ + list_for_each_entry(pos, &vctx->clients, node) { + const u64 cli_next_ns = + kbasep_vinstr_next_dump_time_ns( + cur_ts_ns, pos->dump_interval_ns); -failed_kthread: - disable_hwcnt(vinstr_ctx); -failed_enable: - spin_lock_irqsave(&vinstr_ctx->state_lock, flags); - vinstr_ctx->clients_present = false; - spin_unlock_irqrestore(&vinstr_ctx->state_lock, flags); - kbasep_vinstr_unmap_kernel_dump_buffer(vinstr_ctx); - if (element) { - mutex_lock(&kbdev->kctx_list_lock); - list_del(&element->link); - kfree(element); - mutex_unlock(&kbdev->kctx_list_lock); - KBASE_TLSTREAM_TL_DEL_CTX(vinstr_ctx->kctx); + /* Non-zero next dump time implies a periodic client */ + if ((cli_next_ns != 0) && (cli_next_ns < earliest_next_ns)) + earliest_next_ns = cli_next_ns; + + pos->next_dump_time_ns = cli_next_ns; } -failed_map: - kbase_destroy_context(vinstr_ctx->kctx); - vinstr_ctx->kctx = NULL; - return err; + + /* Cancel the timer if it is already pending */ + hrtimer_cancel(&vctx->dump_timer); + + /* Start the timer if there are periodic clients and vinstr is not + * suspended. + */ + if ((earliest_next_ns != U64_MAX) && + (vctx->suspend_count == 0) && + !WARN_ON(earliest_next_ns < cur_ts_ns)) + hrtimer_start( + &vctx->dump_timer, + ns_to_ktime(earliest_next_ns - cur_ts_ns), + HRTIMER_MODE_REL); } /** - * kbasep_vinstr_destroy_kctx - destroy vinstr's kernel context - * @vinstr_ctx: vinstr context + * kbasep_vinstr_dump_worker()- Dump worker, that dumps all periodic clients + * that need to be dumped, then reschedules itself. + * @work: Work structure. */ -static void kbasep_vinstr_destroy_kctx(struct kbase_vinstr_context *vinstr_ctx) +static void kbasep_vinstr_dump_worker(struct work_struct *work) { - struct kbase_device *kbdev = vinstr_ctx->kbdev; - struct kbasep_kctx_list_element *element; - struct kbasep_kctx_list_element *tmp; - bool found = false; - bool hwcnt_disabled = false; - unsigned long flags; - - /* Release hw counters dumping resources. */ - vinstr_ctx->thread = NULL; - - /* Simplify state transitions by specifying that we have no clients. */ - spin_lock_irqsave(&vinstr_ctx->state_lock, flags); - vinstr_ctx->clients_present = false; - if ((VINSTR_SUSPENDED == vinstr_ctx->state) || (VINSTR_RESUMING == vinstr_ctx->state)) - hwcnt_disabled = true; - spin_unlock_irqrestore(&vinstr_ctx->state_lock, flags); - - if (!hwcnt_disabled) - disable_hwcnt(vinstr_ctx); - - kbasep_vinstr_unmap_kernel_dump_buffer(vinstr_ctx); - - /* Remove kernel context from the device's contexts list. 
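Putting the pieces together, kbasep_vinstr_reschedule_worker() above aligns every periodic client to its own interval and then arms the single hrtimer for the earliest of those times. A standalone sketch with made-up client intervals (the helper reimplements the alignment from the previous example):

#include <stdio.h>
#include <stdint.h>

static uint64_t next_dump_time_ns(uint64_t now, uint32_t interval)
{
    return interval ? (now / interval + 1) * interval : 0;
}

int main(void)
{
    uint32_t intervals[] = { 0, 1000000, 400000 };  /* one manual, two periodic */
    uint64_t now = 2345678, earliest = UINT64_MAX;
    size_t i;

    for (i = 0; i < sizeof(intervals) / sizeof(intervals[0]); i++) {
        uint64_t next = next_dump_time_ns(now, intervals[i]);

        if (next != 0 && next < earliest)
            earliest = next;
    }
    /* Earliest next dump is 2400000, so the timer fires in 54322 ns. */
    printf("timer fires in %llu ns\n", (unsigned long long)(earliest - now));
    return 0;
}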
*/ - mutex_lock(&kbdev->kctx_list_lock); - list_for_each_entry_safe(element, tmp, &kbdev->kctx_list, link) { - if (element->kctx == vinstr_ctx->kctx) { - list_del(&element->link); - kfree(element); - found = true; - } - } - mutex_unlock(&kbdev->kctx_list_lock); + struct kbase_vinstr_context *vctx = + container_of(work, struct kbase_vinstr_context, dump_work); + struct kbase_vinstr_client *pos; + u64 cur_time_ns; - if (!found) - dev_warn(kbdev->dev, "kctx not in kctx_list\n"); + mutex_lock(&vctx->lock); - /* Destroy context. */ - kbase_destroy_context(vinstr_ctx->kctx); + cur_time_ns = kbasep_vinstr_timestamp_ns(); - /* Inform timeline client about context destruction. */ - KBASE_TLSTREAM_TL_DEL_CTX(vinstr_ctx->kctx); + /* Dump all periodic clients whose next dump time is before the current + * time. + */ + list_for_each_entry(pos, &vctx->clients, node) { + if ((pos->next_dump_time_ns != 0) && + (pos->next_dump_time_ns < cur_time_ns)) + kbasep_vinstr_client_dump( + pos, BASE_HWCNT_READER_EVENT_PERIODIC); + } - vinstr_ctx->kctx = NULL; + /* Update the next dump times of all periodic clients, then reschedule + * this worker at the earliest next dump time. + */ + kbasep_vinstr_reschedule_worker(vctx); + + mutex_unlock(&vctx->lock); } /** - * kbasep_vinstr_attach_client - Attach a client to the vinstr core - * @vinstr_ctx: vinstr context - * @buffer_count: requested number of dump buffers - * @bitmap: bitmaps describing which counters should be enabled - * @argp: pointer where notification descriptor shall be stored - * @kernel_buffer: pointer to kernel side buffer - * - * Return: vinstr opaque client handle or NULL on failure + * kbasep_vinstr_dump_timer() - Dump timer that schedules the dump worker for + * execution as soon as possible. + * @timer: Timer structure. */ -static struct kbase_vinstr_client *kbasep_vinstr_attach_client( - struct kbase_vinstr_context *vinstr_ctx, u32 buffer_count, - u32 bitmap[4], void *argp, void *kernel_buffer) +static enum hrtimer_restart kbasep_vinstr_dump_timer(struct hrtimer *timer) { - struct task_struct *thread = NULL; - struct kbase_vinstr_client *cli; - unsigned long flags; - bool clients_present = false; - - KBASE_DEBUG_ASSERT(vinstr_ctx); + struct kbase_vinstr_context *vctx = + container_of(timer, struct kbase_vinstr_context, dump_timer); - if (buffer_count > MAX_BUFFER_COUNT - || (buffer_count & (buffer_count - 1))) - return NULL; - - cli = kzalloc(sizeof(*cli), GFP_KERNEL); - if (!cli) - return NULL; - - cli->vinstr_ctx = vinstr_ctx; - cli->buffer_count = buffer_count; - cli->event_mask = - (1 << BASE_HWCNT_READER_EVENT_MANUAL) | - (1 << BASE_HWCNT_READER_EVENT_PERIODIC); - cli->pending = true; + /* We don't need to check vctx->suspend_count here, as the suspend + * function will ensure that any worker enqueued here is immediately + * cancelled, and the worker itself won't reschedule this timer if + * suspend_count != 0. + */ +#if KERNEL_VERSION(3, 16, 0) > LINUX_VERSION_CODE + queue_work(system_wq, &vctx->dump_work); +#else + queue_work(system_highpri_wq, &vctx->dump_work); +#endif + return HRTIMER_NORESTART; +} - hwcnt_bitmap_set(cli->bitmap, bitmap); +/** + * kbasep_vinstr_client_destroy() - Destroy a vinstr client. + * @vcli: vinstr client. Must not be attached to a vinstr context. 
+ */ +static void kbasep_vinstr_client_destroy(struct kbase_vinstr_client *vcli) +{ + if (!vcli) + return; - mutex_lock(&vinstr_ctx->lock); + kbase_hwcnt_virtualizer_client_destroy(vcli->hvcli); + kfree(vcli->dump_bufs_meta); + kbase_hwcnt_dump_buffer_array_free(&vcli->dump_bufs); + kbase_hwcnt_enable_map_free(&vcli->enable_map); + kfree(vcli); +} - hwcnt_bitmap_union(vinstr_ctx->bitmap, cli->bitmap); - vinstr_ctx->reprogram = true; +/** + * kbasep_vinstr_client_create() - Create a vinstr client. Does not attach to + * the vinstr context. + * @vctx: Non-NULL pointer to vinstr context. + * @setup: Non-NULL pointer to hardware counter ioctl setup structure. + * setup->buffer_count must not be 0. + * @out_vcli: Non-NULL pointer to where created client will be stored on + * success. + * + * Return: 0 on success, else error code. + */ +static int kbasep_vinstr_client_create( + struct kbase_vinstr_context *vctx, + struct kbase_ioctl_hwcnt_reader_setup *setup, + struct kbase_vinstr_client **out_vcli) +{ + int errcode; + struct kbase_vinstr_client *vcli; + struct kbase_hwcnt_physical_enable_map phys_em; - spin_lock_irqsave(&vinstr_ctx->state_lock, flags); - clients_present = (vinstr_ctx->nclients || vinstr_ctx->nclients_suspended); - spin_unlock_irqrestore(&vinstr_ctx->state_lock, flags); + WARN_ON(!vctx); + WARN_ON(!setup); + WARN_ON(setup->buffer_count == 0); - /* If this is the first client, create the vinstr kbase - * context. This context is permanently resident until the - * last client exits. */ - if (!clients_present) { - hwcnt_bitmap_set(vinstr_ctx->bitmap, cli->bitmap); - if (kbasep_vinstr_create_kctx(vinstr_ctx) < 0) - goto error; + vcli = kzalloc(sizeof(*vcli), GFP_KERNEL); + if (!vcli) + return -ENOMEM; - vinstr_ctx->reprogram = false; - cli->pending = false; - } + vcli->vctx = vctx; - /* The GPU resets the counter block every time there is a request - * to dump it. We need a per client kernel buffer for accumulating - * the counters. */ - cli->dump_size = kbasep_vinstr_dump_size_ctx(vinstr_ctx); - cli->accum_buffer = kzalloc(cli->dump_size, GFP_KERNEL); - if (!cli->accum_buffer) + errcode = kbase_hwcnt_enable_map_alloc( + vctx->metadata, &vcli->enable_map); + if (errcode) goto error; - /* Prepare buffers. */ - if (cli->buffer_count) { - int *fd = (int *)argp; - size_t tmp; - - /* Allocate area for buffers metadata storage. */ - tmp = sizeof(struct kbase_hwcnt_reader_metadata) * - cli->buffer_count; - cli->dump_buffers_meta = kmalloc(tmp, GFP_KERNEL); - if (!cli->dump_buffers_meta) - goto error; - - /* Allocate required number of dumping buffers. */ - cli->dump_buffers = (char *)__get_free_pages( - GFP_KERNEL | __GFP_ZERO, - get_order(cli->dump_size * cli->buffer_count)); - if (!cli->dump_buffers) - goto error; - - /* Create descriptor for user-kernel data exchange. 
*/ - *fd = anon_inode_getfd( - "[mali_vinstr_desc]", - &vinstr_client_fops, - cli, - O_RDONLY | O_CLOEXEC); - if (0 > *fd) - goto error; - } else if (kernel_buffer) { - cli->kernel_buffer = kernel_buffer; - } else { - cli->legacy_buffer = (void __user *)argp; - } + phys_em.jm_bm = setup->jm_bm; + phys_em.shader_bm = setup->shader_bm; + phys_em.tiler_bm = setup->tiler_bm; + phys_em.mmu_l2_bm = setup->mmu_l2_bm; + kbase_hwcnt_gpu_enable_map_from_physical(&vcli->enable_map, &phys_em); - atomic_set(&cli->read_idx, 0); - atomic_set(&cli->meta_idx, 0); - atomic_set(&cli->write_idx, 0); - init_waitqueue_head(&cli->waitq); + errcode = kbase_hwcnt_dump_buffer_array_alloc( + vctx->metadata, setup->buffer_count, &vcli->dump_bufs); + if (errcode) + goto error; - spin_lock_irqsave(&vinstr_ctx->state_lock, flags); - vinstr_ctx->nclients++; - list_add(&cli->list, &vinstr_ctx->idle_clients); - kbase_vinstr_update_suspend(vinstr_ctx); - spin_unlock_irqrestore(&vinstr_ctx->state_lock, flags); + errcode = -ENOMEM; + vcli->dump_bufs_meta = kmalloc_array( + setup->buffer_count, sizeof(*vcli->dump_bufs_meta), GFP_KERNEL); + if (!vcli->dump_bufs_meta) + goto error; - mutex_unlock(&vinstr_ctx->lock); + errcode = kbase_hwcnt_virtualizer_client_create( + vctx->hvirt, &vcli->enable_map, &vcli->hvcli); + if (errcode) + goto error; - return cli; + init_waitqueue_head(&vcli->waitq); + *out_vcli = vcli; + return 0; error: - kfree(cli->dump_buffers_meta); - if (cli->dump_buffers) - free_pages( - (unsigned long)cli->dump_buffers, - get_order(cli->dump_size * cli->buffer_count)); - kfree(cli->accum_buffer); - if (!clients_present && vinstr_ctx->kctx) { - thread = vinstr_ctx->thread; - kbasep_vinstr_destroy_kctx(vinstr_ctx); - } - kfree(cli); - - mutex_unlock(&vinstr_ctx->lock); - - /* Thread must be stopped after lock is released. 
*/ - if (thread) - kthread_stop(thread); - - return NULL; + kbasep_vinstr_client_destroy(vcli); + return errcode; } -void kbase_vinstr_detach_client(struct kbase_vinstr_client *cli) +int kbase_vinstr_init( + struct kbase_hwcnt_virtualizer *hvirt, + struct kbase_vinstr_context **out_vctx) { - struct kbase_vinstr_context *vinstr_ctx; - struct kbase_vinstr_client *iter, *tmp; - struct task_struct *thread = NULL; - u32 zerobitmap[4] = { 0 }; - int cli_found = 0; - unsigned long flags; - bool clients_present; - - KBASE_DEBUG_ASSERT(cli); - vinstr_ctx = cli->vinstr_ctx; - KBASE_DEBUG_ASSERT(vinstr_ctx); - - mutex_lock(&vinstr_ctx->lock); - spin_lock_irqsave(&vinstr_ctx->state_lock, flags); - - list_for_each_entry_safe(iter, tmp, &vinstr_ctx->idle_clients, list) { - if (iter == cli) { - cli_found = 1; - break; - } - } - if (!cli_found) { - list_for_each_entry_safe( - iter, tmp, &vinstr_ctx->waiting_clients, list) { - if (iter == cli) { - cli_found = 1; - break; - } - } - } - if (!cli_found) { - list_for_each_entry_safe( - iter, tmp, &vinstr_ctx->suspended_clients, list) { - if (iter == cli) { - cli_found = 1; - break; - } - } - } - KBASE_DEBUG_ASSERT(cli_found); + struct kbase_vinstr_context *vctx; + const struct kbase_hwcnt_metadata *metadata; - if (cli_found) { - vinstr_ctx->reprogram = true; - list_del(&iter->list); - } - - if (!cli->suspended) - vinstr_ctx->nclients--; - else - vinstr_ctx->nclients_suspended--; - - kbase_vinstr_update_suspend(vinstr_ctx); - - clients_present = (vinstr_ctx->nclients || vinstr_ctx->nclients_suspended); - - /* Rebuild context bitmap now that the client has detached */ - hwcnt_bitmap_set(vinstr_ctx->bitmap, zerobitmap); - list_for_each_entry(iter, &vinstr_ctx->idle_clients, list) - hwcnt_bitmap_union(vinstr_ctx->bitmap, iter->bitmap); - list_for_each_entry(iter, &vinstr_ctx->waiting_clients, list) - hwcnt_bitmap_union(vinstr_ctx->bitmap, iter->bitmap); - list_for_each_entry(iter, &vinstr_ctx->suspended_clients, list) - hwcnt_bitmap_union(vinstr_ctx->bitmap, iter->bitmap); + if (!hvirt || !out_vctx) + return -EINVAL; - spin_unlock_irqrestore(&vinstr_ctx->state_lock, flags); + metadata = kbase_hwcnt_virtualizer_metadata(hvirt); + if (!metadata) + return -EINVAL; - kfree(cli->dump_buffers_meta); - free_pages( - (unsigned long)cli->dump_buffers, - get_order(cli->dump_size * cli->buffer_count)); - kfree(cli->accum_buffer); - kfree(cli); + vctx = kzalloc(sizeof(*vctx), GFP_KERNEL); + if (!vctx) + return -ENOMEM; - if (!clients_present) { - thread = vinstr_ctx->thread; - kbasep_vinstr_destroy_kctx(vinstr_ctx); - } + vctx->hvirt = hvirt; + vctx->metadata = metadata; - mutex_unlock(&vinstr_ctx->lock); + mutex_init(&vctx->lock); + INIT_LIST_HEAD(&vctx->clients); + hrtimer_init(&vctx->dump_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + vctx->dump_timer.function = kbasep_vinstr_dump_timer; + INIT_WORK(&vctx->dump_work, kbasep_vinstr_dump_worker); - /* Thread must be stopped after lock is released. 
*/ - if (thread) - kthread_stop(thread); + *out_vctx = vctx; + return 0; } -KBASE_EXPORT_TEST_API(kbase_vinstr_detach_client); -/* Accumulate counters in the dump buffer */ -static void accum_dump_buffer(void *dst, void *src, size_t dump_size) +void kbase_vinstr_term(struct kbase_vinstr_context *vctx) { - size_t block_size = NR_CNT_PER_BLOCK * NR_BYTES_PER_CNT; - u32 *d = dst; - u32 *s = src; - size_t i, j; - - for (i = 0; i < dump_size; i += block_size) { - /* skip over the header block */ - d += NR_BYTES_PER_HDR / sizeof(u32); - s += NR_BYTES_PER_HDR / sizeof(u32); - for (j = 0; j < (block_size - NR_BYTES_PER_HDR) / sizeof(u32); j++) { - /* saturate result if addition would result in wraparound */ - if (U32_MAX - *d < *s) - *d = U32_MAX; - else - *d += *s; - d++; - s++; - } - } -} + if (!vctx) + return; -/* This is the Midgard v4 patch function. It copies the headers for each - * of the defined blocks from the master kernel buffer and then patches up - * the performance counter enable mask for each of the blocks to exclude - * counters that were not requested by the client. */ -static void patch_dump_buffer_hdr_v4( - struct kbase_vinstr_context *vinstr_ctx, - struct kbase_vinstr_client *cli) -{ - u32 *mask; - u8 *dst = cli->accum_buffer; - u8 *src = vinstr_ctx->cpu_va; - u32 nr_cg = vinstr_ctx->kctx->kbdev->gpu_props.num_core_groups; - size_t i, group_size, group; - enum { - SC0_BASE = 0 * NR_CNT_PER_BLOCK * NR_BYTES_PER_CNT, - SC1_BASE = 1 * NR_CNT_PER_BLOCK * NR_BYTES_PER_CNT, - SC2_BASE = 2 * NR_CNT_PER_BLOCK * NR_BYTES_PER_CNT, - SC3_BASE = 3 * NR_CNT_PER_BLOCK * NR_BYTES_PER_CNT, - TILER_BASE = 4 * NR_CNT_PER_BLOCK * NR_BYTES_PER_CNT, - MMU_L2_BASE = 5 * NR_CNT_PER_BLOCK * NR_BYTES_PER_CNT, - JM_BASE = 7 * NR_CNT_PER_BLOCK * NR_BYTES_PER_CNT - }; - - group_size = NR_CNT_BLOCKS_PER_GROUP * - NR_CNT_PER_BLOCK * - NR_BYTES_PER_CNT; - for (i = 0; i < nr_cg; i++) { - group = i * group_size; - /* copy shader core headers */ - memcpy(&dst[group + SC0_BASE], &src[group + SC0_BASE], - NR_BYTES_PER_HDR); - memcpy(&dst[group + SC1_BASE], &src[group + SC1_BASE], - NR_BYTES_PER_HDR); - memcpy(&dst[group + SC2_BASE], &src[group + SC2_BASE], - NR_BYTES_PER_HDR); - memcpy(&dst[group + SC3_BASE], &src[group + SC3_BASE], - NR_BYTES_PER_HDR); - - /* copy tiler header */ - memcpy(&dst[group + TILER_BASE], &src[group + TILER_BASE], - NR_BYTES_PER_HDR); - - /* copy mmu header */ - memcpy(&dst[group + MMU_L2_BASE], &src[group + MMU_L2_BASE], - NR_BYTES_PER_HDR); - - /* copy job manager header */ - memcpy(&dst[group + JM_BASE], &src[group + JM_BASE], - NR_BYTES_PER_HDR); - - /* patch the shader core enable mask */ - mask = (u32 *)&dst[group + SC0_BASE + PRFCNT_EN_MASK_OFFSET]; - *mask &= cli->bitmap[SHADER_HWCNT_BM]; - mask = (u32 *)&dst[group + SC1_BASE + PRFCNT_EN_MASK_OFFSET]; - *mask &= cli->bitmap[SHADER_HWCNT_BM]; - mask = (u32 *)&dst[group + SC2_BASE + PRFCNT_EN_MASK_OFFSET]; - *mask &= cli->bitmap[SHADER_HWCNT_BM]; - mask = (u32 *)&dst[group + SC3_BASE + PRFCNT_EN_MASK_OFFSET]; - *mask &= cli->bitmap[SHADER_HWCNT_BM]; - - /* patch the tiler core enable mask */ - mask = (u32 *)&dst[group + TILER_BASE + PRFCNT_EN_MASK_OFFSET]; - *mask &= cli->bitmap[TILER_HWCNT_BM]; - - /* patch the mmu core enable mask */ - mask = (u32 *)&dst[group + MMU_L2_BASE + PRFCNT_EN_MASK_OFFSET]; - *mask &= cli->bitmap[MMU_L2_HWCNT_BM]; - - /* patch the job manager enable mask */ - mask = (u32 *)&dst[group + JM_BASE + PRFCNT_EN_MASK_OFFSET]; - *mask &= cli->bitmap[JM_HWCNT_BM]; - } -} + cancel_work_sync(&vctx->dump_work); 
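(kbase_vinstr_term() continues below.) As a usage sketch for the reworked entry points, the following shows how integration code might pair kbase_vinstr_init() with kbase_vinstr_term(). This is a hedged sketch only: the example_* functions and the kbdev->hwcnt_gpu_virt / kbdev->vinstr_ctx fields are assumptions for illustration, not part of this patch.

    /* Sketch only: bring vinstr up against an already-created hardware counter
     * virtualizer, and tear it down again. Field names here are hypothetical. */
    static int example_vinstr_bringup(struct kbase_device *kbdev)
    {
        /* hwcnt_gpu_virt is assumed to hold the struct kbase_hwcnt_virtualizer
         * created earlier by the hwcnt core. */
        return kbase_vinstr_init(kbdev->hwcnt_gpu_virt, &kbdev->vinstr_ctx);
    }

    static void example_vinstr_teardown(struct kbase_device *kbdev)
    {
        /* Cancels outstanding dump work and destroys any leaked clients. */
        kbase_vinstr_term(kbdev->vinstr_ctx);
        kbdev->vinstr_ctx = NULL;
    }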
-/* This is the Midgard v5 patch function. It copies the headers for each - * of the defined blocks from the master kernel buffer and then patches up - * the performance counter enable mask for each of the blocks to exclude - * counters that were not requested by the client. */ -static void patch_dump_buffer_hdr_v5( - struct kbase_vinstr_context *vinstr_ctx, - struct kbase_vinstr_client *cli) -{ - struct kbase_device *kbdev = vinstr_ctx->kctx->kbdev; - u32 i, nr_l2; - u64 core_mask; - u32 *mask; - u8 *dst = cli->accum_buffer; - u8 *src = vinstr_ctx->cpu_va; - size_t block_size = NR_CNT_PER_BLOCK * NR_BYTES_PER_CNT; - - /* copy and patch job manager header */ - memcpy(dst, src, NR_BYTES_PER_HDR); - mask = (u32 *)&dst[PRFCNT_EN_MASK_OFFSET]; - *mask &= cli->bitmap[JM_HWCNT_BM]; - dst += block_size; - src += block_size; - - /* copy and patch tiler header */ - memcpy(dst, src, NR_BYTES_PER_HDR); - mask = (u32 *)&dst[PRFCNT_EN_MASK_OFFSET]; - *mask &= cli->bitmap[TILER_HWCNT_BM]; - dst += block_size; - src += block_size; - - /* copy and patch MMU/L2C headers */ - nr_l2 = kbdev->gpu_props.props.l2_props.num_l2_slices; - for (i = 0; i < nr_l2; i++) { - memcpy(dst, src, NR_BYTES_PER_HDR); - mask = (u32 *)&dst[PRFCNT_EN_MASK_OFFSET]; - *mask &= cli->bitmap[MMU_L2_HWCNT_BM]; - dst += block_size; - src += block_size; - } + /* Non-zero client count implies client leak */ + if (WARN_ON(vctx->client_count != 0)) { + struct kbase_vinstr_client *pos, *n; - /* copy and patch shader core headers */ - core_mask = kbdev->gpu_props.props.coherency_info.group[0].core_mask; - while (0ull != core_mask) { - memcpy(dst, src, NR_BYTES_PER_HDR); - if (0ull != (core_mask & 1ull)) { - /* if block is not reserved update header */ - mask = (u32 *)&dst[PRFCNT_EN_MASK_OFFSET]; - *mask &= cli->bitmap[SHADER_HWCNT_BM]; + list_for_each_entry_safe(pos, n, &vctx->clients, node) { + list_del(&pos->node); + vctx->client_count--; + kbasep_vinstr_client_destroy(pos); } - dst += block_size; - src += block_size; - - core_mask >>= 1; } + + WARN_ON(vctx->client_count != 0); + kfree(vctx); } -/** - * accum_clients - accumulate dumped hw counters for all known clients - * @vinstr_ctx: vinstr context - */ -static void accum_clients(struct kbase_vinstr_context *vinstr_ctx) +void kbase_vinstr_suspend(struct kbase_vinstr_context *vctx) { - struct kbase_vinstr_client *iter; - int v4 = 0; + if (WARN_ON(!vctx)) + return; -#ifndef CONFIG_MALI_NO_MALI - v4 = kbase_hw_has_feature(vinstr_ctx->kbdev, BASE_HW_FEATURE_V4); -#endif + mutex_lock(&vctx->lock); - list_for_each_entry(iter, &vinstr_ctx->idle_clients, list) { - /* Don't bother accumulating clients whose hwcnt requests - * have not yet been honoured. */ - if (iter->pending) - continue; - if (v4) - patch_dump_buffer_hdr_v4(vinstr_ctx, iter); - else - patch_dump_buffer_hdr_v5(vinstr_ctx, iter); - accum_dump_buffer( - iter->accum_buffer, - vinstr_ctx->cpu_va, - iter->dump_size); - } - list_for_each_entry(iter, &vinstr_ctx->waiting_clients, list) { - /* Don't bother accumulating clients whose hwcnt requests - * have not yet been honoured. 
*/ - if (iter->pending) - continue; - if (v4) - patch_dump_buffer_hdr_v4(vinstr_ctx, iter); - else - patch_dump_buffer_hdr_v5(vinstr_ctx, iter); - accum_dump_buffer( - iter->accum_buffer, - vinstr_ctx->cpu_va, - iter->dump_size); - } -} + if (!WARN_ON(vctx->suspend_count == SIZE_MAX)) + vctx->suspend_count++; -/*****************************************************************************/ + mutex_unlock(&vctx->lock); -/** - * kbasep_vinstr_get_timestamp - return timestamp - * - * Function returns timestamp value based on raw monotonic timer. Value will - * wrap around zero in case of overflow. - * - * Return: timestamp value - */ -static u64 kbasep_vinstr_get_timestamp(void) -{ - struct timespec ts; - - getrawmonotonic(&ts); - return (u64)ts.tv_sec * NSECS_IN_SEC + ts.tv_nsec; + /* Always sync cancel the timer and then the worker, regardless of the + * new suspend count. + * + * This ensures concurrent calls to kbase_vinstr_suspend() always block + * until vinstr is fully suspended. + * + * The timer is cancelled before the worker, as the timer + * unconditionally re-enqueues the worker, but the worker checks the + * suspend_count that we just incremented before rescheduling the timer. + * + * Therefore if we cancel the worker first, the timer might re-enqueue + * the worker before we cancel the timer, but the opposite is not + * possible. + */ + hrtimer_cancel(&vctx->dump_timer); + cancel_work_sync(&vctx->dump_work); } -/** - * kbasep_vinstr_add_dump_request - register client's dumping request - * @cli: requesting client - * @waiting_clients: list of pending dumping requests - */ -static void kbasep_vinstr_add_dump_request( - struct kbase_vinstr_client *cli, - struct list_head *waiting_clients) +void kbase_vinstr_resume(struct kbase_vinstr_context *vctx) { - struct kbase_vinstr_client *tmp; - - if (list_empty(waiting_clients)) { - list_add(&cli->list, waiting_clients); + if (WARN_ON(!vctx)) return; - } - list_for_each_entry(tmp, waiting_clients, list) { - if (tmp->dump_time > cli->dump_time) { - list_add_tail(&cli->list, &tmp->list); - return; - } - } - list_add_tail(&cli->list, waiting_clients); -} -/** - * kbasep_vinstr_collect_and_accumulate - collect hw counters via low level - * dump and accumulate them for known - * clients - * @vinstr_ctx: vinstr context - * @timestamp: pointer where collection timestamp will be recorded - * - * Return: zero on success - */ -static int kbasep_vinstr_collect_and_accumulate( - struct kbase_vinstr_context *vinstr_ctx, u64 *timestamp) -{ - unsigned long flags; - int rcode; + mutex_lock(&vctx->lock); -#ifdef CONFIG_MALI_NO_MALI - /* The dummy model needs the CPU mapping. */ - gpu_model_set_dummy_prfcnt_base_cpu(vinstr_ctx->cpu_va); -#endif + if (!WARN_ON(vctx->suspend_count == 0)) { + vctx->suspend_count--; - spin_lock_irqsave(&vinstr_ctx->state_lock, flags); - if (VINSTR_IDLE != vinstr_ctx->state) { - spin_unlock_irqrestore(&vinstr_ctx->state_lock, flags); - return -EAGAIN; - } else { - vinstr_ctx->state = VINSTR_DUMPING; - } - spin_unlock_irqrestore(&vinstr_ctx->state_lock, flags); - - /* Request HW counters dump. - * Disable preemption to make dump timestamp more accurate. */ - preempt_disable(); - *timestamp = kbasep_vinstr_get_timestamp(); - rcode = kbase_instr_hwcnt_request_dump(vinstr_ctx->kctx); - preempt_enable(); - - if (!rcode) - rcode = kbase_instr_hwcnt_wait_for_dump(vinstr_ctx->kctx); - WARN_ON(rcode); - - if (!rcode) { - /* Invalidate the kernel buffer before reading from it. 
- * As the vinstr_ctx->lock is already held by the caller, the - * unmap of kernel buffer cannot take place simultaneously. + /* Last resume, so re-enqueue the worker if we have any periodic + * clients. */ - lockdep_assert_held(&vinstr_ctx->lock); - kbase_sync_mem_regions(vinstr_ctx->kctx, vinstr_ctx->vmap, - KBASE_SYNC_TO_CPU); - } - - spin_lock_irqsave(&vinstr_ctx->state_lock, flags); - switch (vinstr_ctx->state) { - case VINSTR_SUSPENDING: - schedule_work(&vinstr_ctx->suspend_work); - break; - case VINSTR_DUMPING: - vinstr_ctx->state = VINSTR_IDLE; - wake_up_all(&vinstr_ctx->suspend_waitq); - break; - default: - break; - } + if (vctx->suspend_count == 0) { + struct kbase_vinstr_client *pos; + bool has_periodic_clients = false; - /* Accumulate values of collected counters. */ - if (!rcode) - accum_clients(vinstr_ctx); + list_for_each_entry(pos, &vctx->clients, node) { + if (pos->dump_interval_ns != 0) { + has_periodic_clients = true; + break; + } + } - spin_unlock_irqrestore(&vinstr_ctx->state_lock, flags); + if (has_periodic_clients) +#if KERNEL_VERSION(3, 16, 0) > LINUX_VERSION_CODE + queue_work(system_wq, &vctx->dump_work); +#else + queue_work(system_highpri_wq, &vctx->dump_work); +#endif + } + } - return rcode; + mutex_unlock(&vctx->lock); } -/** - * kbasep_vinstr_fill_dump_buffer - copy accumulated counters to empty kernel - * buffer - * @cli: requesting client - * @timestamp: timestamp when counters were collected - * @event_id: id of event that caused triggered counters collection - * - * Return: zero on success - */ -static int kbasep_vinstr_fill_dump_buffer( - struct kbase_vinstr_client *cli, u64 timestamp, - enum base_hwcnt_reader_event event_id) +int kbase_vinstr_hwcnt_reader_setup( + struct kbase_vinstr_context *vctx, + struct kbase_ioctl_hwcnt_reader_setup *setup) { - unsigned int write_idx = atomic_read(&cli->write_idx); - unsigned int read_idx = atomic_read(&cli->read_idx); + int errcode; + int fd; + struct kbase_vinstr_client *vcli = NULL; - struct kbase_hwcnt_reader_metadata *meta; - void *buffer; + if (!vctx || !setup || + (setup->buffer_count == 0) || + (setup->buffer_count > MAX_BUFFER_COUNT)) + return -EINVAL; - /* Check if there is a place to copy HWC block into. */ - if (write_idx - read_idx == cli->buffer_count) - return -1; - write_idx %= cli->buffer_count; - - /* Fill in dump buffer and its metadata. */ - buffer = &cli->dump_buffers[write_idx * cli->dump_size]; - meta = &cli->dump_buffers_meta[write_idx]; - meta->timestamp = timestamp; - meta->event_id = event_id; - meta->buffer_idx = write_idx; - memcpy(buffer, cli->accum_buffer, cli->dump_size); - return 0; -} + errcode = kbasep_vinstr_client_create(vctx, setup, &vcli); + if (errcode) + goto error; -/** - * kbasep_vinstr_fill_dump_buffer_legacy - copy accumulated counters to buffer - * allocated in userspace - * @cli: requesting client - * - * Return: zero on success - * - * This is part of legacy ioctl interface. - */ -static int kbasep_vinstr_fill_dump_buffer_legacy( - struct kbase_vinstr_client *cli) -{ - void __user *buffer = cli->legacy_buffer; - int rcode; + errcode = anon_inode_getfd( + "[mali_vinstr_desc]", + &vinstr_client_fops, + vcli, + O_RDONLY | O_CLOEXEC); + if (errcode < 0) + goto error; - /* Copy data to user buffer. 
*/ - rcode = copy_to_user(buffer, cli->accum_buffer, cli->dump_size); - if (rcode) { - pr_warn("error while copying buffer to user\n"); - return -EFAULT; - } - return 0; -} + fd = errcode; -/** - * kbasep_vinstr_fill_dump_buffer_kernel - copy accumulated counters to buffer - * allocated in kernel space - * @cli: requesting client - * - * Return: zero on success - * - * This is part of the kernel client interface. - */ -static int kbasep_vinstr_fill_dump_buffer_kernel( - struct kbase_vinstr_client *cli) -{ - memcpy(cli->kernel_buffer, cli->accum_buffer, cli->dump_size); + /* Add the new client. No need to reschedule worker, as not periodic */ + mutex_lock(&vctx->lock); - return 0; -} + vctx->client_count++; + list_add(&vcli->node, &vctx->clients); -/** - * kbasep_vinstr_reprogram - reprogram hwcnt set collected by inst - * @vinstr_ctx: vinstr context - */ -static void kbasep_vinstr_reprogram( - struct kbase_vinstr_context *vinstr_ctx) -{ - unsigned long flags; - bool suspended = false; - - /* Don't enable hardware counters if vinstr is suspended. */ - spin_lock_irqsave(&vinstr_ctx->state_lock, flags); - if (VINSTR_IDLE != vinstr_ctx->state) - suspended = true; - spin_unlock_irqrestore(&vinstr_ctx->state_lock, flags); - if (suspended) - return; + mutex_unlock(&vctx->lock); - /* Change to suspended state is done while holding vinstr context - * lock. Below code will then no re-enable the instrumentation. */ - - if (vinstr_ctx->reprogram) { - struct kbase_vinstr_client *iter; - - if (!reprogram_hwcnt(vinstr_ctx)) { - vinstr_ctx->reprogram = false; - spin_lock_irqsave(&vinstr_ctx->state_lock, flags); - list_for_each_entry( - iter, - &vinstr_ctx->idle_clients, - list) - iter->pending = false; - list_for_each_entry( - iter, - &vinstr_ctx->waiting_clients, - list) - iter->pending = false; - spin_unlock_irqrestore(&vinstr_ctx->state_lock, flags); - } - } + return fd; +error: + kbasep_vinstr_client_destroy(vcli); + return errcode; } /** - * kbasep_vinstr_update_client - copy accumulated counters to user readable - * buffer and notify the user - * @cli: requesting client - * @timestamp: timestamp when counters were collected - * @event_id: id of event that caused triggered counters collection + * kbasep_vinstr_hwcnt_reader_buffer_ready() - Check if client has ready + * buffers. + * @cli: Non-NULL pointer to vinstr client. * - * Return: zero on success + * Return: Non-zero if client has at least one dumping buffer filled that was + * not notified to user yet. */ -static int kbasep_vinstr_update_client( - struct kbase_vinstr_client *cli, u64 timestamp, - enum base_hwcnt_reader_event event_id) +static int kbasep_vinstr_hwcnt_reader_buffer_ready( + struct kbase_vinstr_client *cli) { - int rcode = 0; - unsigned long flags; - - /* Copy collected counters to user readable buffer. */ - if (cli->buffer_count) - rcode = kbasep_vinstr_fill_dump_buffer( - cli, timestamp, event_id); - else if (cli->kernel_buffer) - rcode = kbasep_vinstr_fill_dump_buffer_kernel(cli); - else - rcode = kbasep_vinstr_fill_dump_buffer_legacy(cli); - - /* Prepare for next request. */ - memset(cli->accum_buffer, 0, cli->dump_size); - - spin_lock_irqsave(&cli->vinstr_ctx->state_lock, flags); - /* Check if client was put to suspend state while it was being updated */ - if (cli->suspended) - rcode = -EINVAL; - spin_unlock_irqrestore(&cli->vinstr_ctx->state_lock, flags); - - if (rcode) - goto exit; - - /* Notify client. Make sure all changes to memory are visible. 
*/ - wmb(); - atomic_inc(&cli->write_idx); - wake_up_interruptible(&cli->waitq); - -exit: - return rcode; + WARN_ON(!cli); + return atomic_read(&cli->write_idx) != atomic_read(&cli->meta_idx); } /** - * kbasep_vinstr_wake_up_callback - vinstr wake up timer wake up function + * kbasep_vinstr_hwcnt_reader_ioctl_dump() - Dump ioctl command. + * @cli: Non-NULL pointer to vinstr client. * - * @hrtimer: high resolution timer - * - * Return: High resolution timer restart enum. + * Return: 0 on success, else error code. */ -static enum hrtimer_restart kbasep_vinstr_wake_up_callback( - struct hrtimer *hrtimer) +static long kbasep_vinstr_hwcnt_reader_ioctl_dump( + struct kbase_vinstr_client *cli) { - struct kbasep_vinstr_wake_up_timer *timer = - container_of( - hrtimer, - struct kbasep_vinstr_wake_up_timer, - hrtimer); + int errcode; - KBASE_DEBUG_ASSERT(timer); + mutex_lock(&cli->vctx->lock); - atomic_set(&timer->vinstr_ctx->request_pending, 1); - wake_up_all(&timer->vinstr_ctx->waitq); + errcode = kbasep_vinstr_client_dump( + cli, BASE_HWCNT_READER_EVENT_MANUAL); - return HRTIMER_NORESTART; + mutex_unlock(&cli->vctx->lock); + return errcode; } /** - * kbasep_vinstr_service_task - HWC dumping service thread + * kbasep_vinstr_hwcnt_reader_ioctl_clear() - Clear ioctl command. + * @cli: Non-NULL pointer to vinstr client. * - * @data: Pointer to vinstr context structure. - * - * Return: 0 on success; -ENOMEM if timer allocation fails + * Return: 0 on success, else error code. */ -static int kbasep_vinstr_service_task(void *data) +static long kbasep_vinstr_hwcnt_reader_ioctl_clear( + struct kbase_vinstr_client *cli) { - struct kbase_vinstr_context *vinstr_ctx = data; - struct kbasep_vinstr_wake_up_timer *timer; - - KBASE_DEBUG_ASSERT(vinstr_ctx); + int errcode; - timer = kmalloc(sizeof(*timer), GFP_KERNEL); + mutex_lock(&cli->vctx->lock); - if (!timer) { - dev_warn(vinstr_ctx->kbdev->dev, "Timer allocation failed!\n"); - return -ENOMEM; - } - - hrtimer_init(&timer->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - - timer->hrtimer.function = kbasep_vinstr_wake_up_callback; - timer->vinstr_ctx = vinstr_ctx; - - while (!kthread_should_stop()) { - struct kbase_vinstr_client *cli = NULL; - struct kbase_vinstr_client *tmp; - int rcode; - unsigned long flags; + errcode = kbasep_vinstr_client_clear(cli); - u64 timestamp = kbasep_vinstr_get_timestamp(); - u64 dump_time = 0; - struct list_head expired_requests; - - /* Hold lock while performing operations on lists of clients. */ - mutex_lock(&vinstr_ctx->lock); - - /* Closing thread must not interact with client requests. */ - if (current == vinstr_ctx->thread) { - atomic_set(&vinstr_ctx->request_pending, 0); - - spin_lock_irqsave(&vinstr_ctx->state_lock, flags); - if (!list_empty(&vinstr_ctx->waiting_clients)) { - cli = list_first_entry( - &vinstr_ctx->waiting_clients, - struct kbase_vinstr_client, - list); - dump_time = cli->dump_time; - } - spin_unlock_irqrestore(&vinstr_ctx->state_lock, flags); - } - - if (!cli || ((s64)timestamp - (s64)dump_time < 0ll)) { - mutex_unlock(&vinstr_ctx->lock); - - /* Sleep until next dumping event or service request. 
*/ - if (cli) { - u64 diff = dump_time - timestamp; - - hrtimer_start( - &timer->hrtimer, - ns_to_ktime(diff), - HRTIMER_MODE_REL); - } - wait_event( - vinstr_ctx->waitq, - atomic_read( - &vinstr_ctx->request_pending) || - kthread_should_stop()); - hrtimer_cancel(&timer->hrtimer); - continue; - } - - rcode = kbasep_vinstr_collect_and_accumulate(vinstr_ctx, - ×tamp); - - INIT_LIST_HEAD(&expired_requests); - - spin_lock_irqsave(&vinstr_ctx->state_lock, flags); - /* Find all expired requests. */ - list_for_each_entry_safe( - cli, - tmp, - &vinstr_ctx->waiting_clients, - list) { - s64 tdiff = - (s64)(timestamp + DUMPING_RESOLUTION) - - (s64)cli->dump_time; - if (tdiff >= 0ll) { - list_del(&cli->list); - list_add(&cli->list, &expired_requests); - } else { - break; - } - } - - /* Fill data for each request found. */ - while (!list_empty(&expired_requests)) { - cli = list_first_entry(&expired_requests, - struct kbase_vinstr_client, list); - - /* Ensure that legacy buffer will not be used from - * this kthread context. */ - BUG_ON(0 == cli->buffer_count); - /* Expect only periodically sampled clients. */ - BUG_ON(0 == cli->dump_interval); - - /* Release the spinlock, as filling the data in client's - * userspace buffer could result in page faults. */ - spin_unlock_irqrestore(&vinstr_ctx->state_lock, flags); - if (!rcode) - kbasep_vinstr_update_client( - cli, - timestamp, - BASE_HWCNT_READER_EVENT_PERIODIC); - spin_lock_irqsave(&cli->vinstr_ctx->state_lock, flags); - - /* This client got suspended, move to the next one. */ - if (cli->suspended) - continue; - - /* Set new dumping time. Drop missed probing times. */ - do { - cli->dump_time += cli->dump_interval; - } while (cli->dump_time < timestamp); - - list_del(&cli->list); - kbasep_vinstr_add_dump_request( - cli, - &vinstr_ctx->waiting_clients); - } - spin_unlock_irqrestore(&vinstr_ctx->state_lock, flags); - - /* Reprogram counters set if required. */ - kbasep_vinstr_reprogram(vinstr_ctx); - - mutex_unlock(&vinstr_ctx->lock); - } - - kfree(timer); - - return 0; + mutex_unlock(&cli->vctx->lock); + return errcode; } -/*****************************************************************************/ - /** - * kbasep_vinstr_hwcnt_reader_buffer_ready - check if client has ready buffers - * @cli: pointer to vinstr client structure + * kbasep_vinstr_hwcnt_reader_ioctl_get_buffer() - Get buffer ioctl command. + * @cli: Non-NULL pointer to vinstr client. + * @buffer: Non-NULL pointer to userspace buffer. + * @size: Size of buffer. * - * Return: non-zero if client has at least one dumping buffer filled that was - * not notified to user yet - */ -static int kbasep_vinstr_hwcnt_reader_buffer_ready( - struct kbase_vinstr_client *cli) -{ - KBASE_DEBUG_ASSERT(cli); - return atomic_read(&cli->write_idx) != atomic_read(&cli->meta_idx); -} - -/** - * kbasep_vinstr_hwcnt_reader_ioctl_get_buffer - hwcnt reader's ioctl command - * @cli: pointer to vinstr client structure - * @buffer: pointer to userspace buffer - * @size: size of buffer - * - * Return: zero on success + * Return: 0 on success, else error code. 
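For context on the reader file descriptor that kbase_vinstr_hwcnt_reader_setup() hands back, here is a hedged userspace sketch of how a profiler might create and map it. The reader ioctls and struct kbase_ioctl_hwcnt_reader_setup come from the uapi headers named elsewhere in this patch (mali_kbase_hwcnt_reader.h, mali_kbase_ioctl.h); the device ioctl name KBASE_IOCTL_HWCNT_READER_SETUP and the example_* helper are assumptions for illustration.

    /* Userspace sketch, not part of the patch: request a reader fd with all
     * counter blocks enabled, then mmap its ring of dump buffers. */
    #include <stdint.h>
    #include <sys/ioctl.h>
    #include <sys/mman.h>
    #include <unistd.h>
    #include "mali_kbase_ioctl.h"         /* kbase_ioctl_hwcnt_reader_setup */
    #include "mali_kbase_hwcnt_reader.h"  /* KBASE_HWCNT_READER_* ioctls */

    static void *example_open_reader(int mali_fd, uint32_t buf_cnt,
                                     int *out_fd, uint32_t *out_buf_size)
    {
        struct kbase_ioctl_hwcnt_reader_setup setup = {
            .buffer_count = buf_cnt, /* must not exceed MAX_BUFFER_COUNT */
            .jm_bm = ~0u,            /* enable every job manager counter */
            .shader_bm = ~0u,
            .tiler_bm = ~0u,
            .mmu_l2_bm = ~0u,
        };
        void *samples;
        int fd;

        /* Ioctl name assumed; it routes to kbase_vinstr_hwcnt_reader_setup()
         * and returns the anonymous reader fd on success. */
        fd = ioctl(mali_fd, KBASE_IOCTL_HWCNT_READER_SETUP, &setup);
        if (fd < 0)
            return NULL;

        if (ioctl(fd, KBASE_HWCNT_READER_GET_BUFFER_SIZE, out_buf_size) < 0)
            goto err;

        /* The reader's mmap handler exposes buffer_count dump buffers laid
         * out back to back. */
        samples = mmap(NULL, (size_t)buf_cnt * *out_buf_size, PROT_READ,
                       MAP_PRIVATE, fd, 0);
        if (samples == MAP_FAILED)
            goto err;

        *out_fd = fd;
        return samples;

    err:
        close(fd);
        return NULL;
    }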
*/ static long kbasep_vinstr_hwcnt_reader_ioctl_get_buffer( - struct kbase_vinstr_client *cli, void __user *buffer, - size_t size) + struct kbase_vinstr_client *cli, + void __user *buffer, + size_t size) { unsigned int meta_idx = atomic_read(&cli->meta_idx); - unsigned int idx = meta_idx % cli->buffer_count; + unsigned int idx = meta_idx % cli->dump_bufs.buf_cnt; - struct kbase_hwcnt_reader_metadata *meta = &cli->dump_buffers_meta[idx]; + struct kbase_hwcnt_reader_metadata *meta = &cli->dump_bufs_meta[idx]; /* Metadata sanity check. */ - KBASE_DEBUG_ASSERT(idx == meta->buffer_idx); + WARN_ON(idx != meta->buffer_idx); if (sizeof(struct kbase_hwcnt_reader_metadata) != size) return -EINVAL; @@ -1470,19 +693,20 @@ static long kbasep_vinstr_hwcnt_reader_ioctl_get_buffer( } /** - * kbasep_vinstr_hwcnt_reader_ioctl_put_buffer - hwcnt reader's ioctl command - * @cli: pointer to vinstr client structure - * @buffer: pointer to userspace buffer - * @size: size of buffer + * kbasep_vinstr_hwcnt_reader_ioctl_put_buffer() - Put buffer ioctl command. + * @cli: Non-NULL pointer to vinstr client. + * @buffer: Non-NULL pointer to userspace buffer. + * @size: Size of buffer. * - * Return: zero on success + * Return: 0 on success, else error code. */ static long kbasep_vinstr_hwcnt_reader_ioctl_put_buffer( - struct kbase_vinstr_client *cli, void __user *buffer, - size_t size) + struct kbase_vinstr_client *cli, + void __user *buffer, + size_t size) { unsigned int read_idx = atomic_read(&cli->read_idx); - unsigned int idx = read_idx % cli->buffer_count; + unsigned int idx = read_idx % cli->dump_bufs.buf_cnt; struct kbase_hwcnt_reader_metadata meta; @@ -1505,182 +729,126 @@ static long kbasep_vinstr_hwcnt_reader_ioctl_put_buffer( } /** - * kbasep_vinstr_hwcnt_reader_ioctl_set_interval - hwcnt reader's ioctl command - * @cli: pointer to vinstr client structure - * @interval: periodic dumping interval (disable periodic dumping if zero) + * kbasep_vinstr_hwcnt_reader_ioctl_set_interval() - Set interval ioctl command. + * @cli: Non-NULL pointer to vinstr client. + * @interval: Periodic dumping interval (disable periodic dumping if 0). * - * Return: zero on success + * Return: 0 always. */ static long kbasep_vinstr_hwcnt_reader_ioctl_set_interval( - struct kbase_vinstr_client *cli, u32 interval) -{ - struct kbase_vinstr_context *vinstr_ctx = cli->vinstr_ctx; - unsigned long flags; - - KBASE_DEBUG_ASSERT(vinstr_ctx); - - mutex_lock(&vinstr_ctx->lock); - spin_lock_irqsave(&vinstr_ctx->state_lock, flags); - - if (cli->suspended) { - spin_unlock_irqrestore(&vinstr_ctx->state_lock, flags); - mutex_unlock(&vinstr_ctx->lock); - return -ENOMEM; - } - - list_del(&cli->list); - - cli->dump_interval = interval; - - /* If interval is non-zero, enable periodic dumping for this client. 
*/ - if (cli->dump_interval) { - if (DUMPING_RESOLUTION > cli->dump_interval) - cli->dump_interval = DUMPING_RESOLUTION; - cli->dump_time = - kbasep_vinstr_get_timestamp() + cli->dump_interval; - - kbasep_vinstr_add_dump_request( - cli, &vinstr_ctx->waiting_clients); - - atomic_set(&vinstr_ctx->request_pending, 1); - wake_up_all(&vinstr_ctx->waitq); - } else { - list_add(&cli->list, &vinstr_ctx->idle_clients); - } + struct kbase_vinstr_client *cli, + u32 interval) +{ + mutex_lock(&cli->vctx->lock); + + if ((interval != 0) && (interval < DUMP_INTERVAL_MIN_NS)) + interval = DUMP_INTERVAL_MIN_NS; + /* Update the interval, and put in a dummy next dump time */ + cli->dump_interval_ns = interval; + cli->next_dump_time_ns = 0; + + /* + * If it's a periodic client, kick off the worker early to do a proper + * timer reschedule. Return value is ignored, as we don't care if the + * worker is already queued. + */ + if ((interval != 0) && (cli->vctx->suspend_count == 0)) +#if KERNEL_VERSION(3, 16, 0) > LINUX_VERSION_CODE + queue_work(system_wq, &cli->vctx->dump_work); +#else + queue_work(system_highpri_wq, &cli->vctx->dump_work); +#endif - spin_unlock_irqrestore(&vinstr_ctx->state_lock, flags); - mutex_unlock(&vinstr_ctx->lock); + mutex_unlock(&cli->vctx->lock); return 0; } /** - * kbasep_vinstr_hwcnt_reader_event_mask - return event mask for event id - * @event_id: id of event - * Return: event_mask or zero if event is not supported or maskable - */ -static u32 kbasep_vinstr_hwcnt_reader_event_mask( - enum base_hwcnt_reader_event event_id) -{ - u32 event_mask = 0; - - switch (event_id) { - case BASE_HWCNT_READER_EVENT_PREJOB: - case BASE_HWCNT_READER_EVENT_POSTJOB: - /* These event are maskable. */ - event_mask = (1 << event_id); - break; - - case BASE_HWCNT_READER_EVENT_MANUAL: - case BASE_HWCNT_READER_EVENT_PERIODIC: - /* These event are non-maskable. */ - default: - /* These event are not supported. */ - break; - } - - return event_mask; -} - -/** - * kbasep_vinstr_hwcnt_reader_ioctl_enable_event - hwcnt reader's ioctl command - * @cli: pointer to vinstr client structure - * @event_id: id of event to enable + * kbasep_vinstr_hwcnt_reader_ioctl_enable_event() - Enable event ioctl command. + * @cli: Non-NULL pointer to vinstr client. + * @event_id: ID of event to enable. * - * Return: zero on success + * Return: 0 always. */ static long kbasep_vinstr_hwcnt_reader_ioctl_enable_event( struct kbase_vinstr_client *cli, enum base_hwcnt_reader_event event_id) { - struct kbase_vinstr_context *vinstr_ctx = cli->vinstr_ctx; - u32 event_mask; - - KBASE_DEBUG_ASSERT(vinstr_ctx); - - event_mask = kbasep_vinstr_hwcnt_reader_event_mask(event_id); - if (!event_mask) - return -EINVAL; - - mutex_lock(&vinstr_ctx->lock); - cli->event_mask |= event_mask; - mutex_unlock(&vinstr_ctx->lock); - + /* No-op, as events aren't supported */ return 0; } /** - * kbasep_vinstr_hwcnt_reader_ioctl_disable_event - hwcnt reader's ioctl command - * @cli: pointer to vinstr client structure - * @event_id: id of event to disable + * kbasep_vinstr_hwcnt_reader_ioctl_disable_event() - Disable event ioctl + * command. + * @cli: Non-NULL pointer to vinstr client. + * @event_id: ID of event to disable. * - * Return: zero on success + * Return: 0 always. 
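As a small companion to the interval handling above: the interval is expressed in nanoseconds, zero disables periodic dumping, and non-zero values below the driver's minimum are silently raised to it. A hedged userspace sketch, reusing the reader fd from the earlier example:

    /* Userspace sketch: switch periodic dumping on (here ~100 Hz) or off. */
    #include <stdint.h>
    #include <sys/ioctl.h>
    #include "mali_kbase_hwcnt_reader.h"  /* KBASE_HWCNT_READER_SET_INTERVAL */

    static int example_set_periodic(int reader_fd, int enable)
    {
        /* Interval is in nanoseconds; values below the driver minimum
         * (DUMP_INTERVAL_MIN_NS) are clamped, and 0 stops periodic dumps. */
        uint32_t interval_ns = enable ? 10u * 1000u * 1000u : 0u;

        /* SET_INTERVAL passes the value directly as the ioctl argument. */
        return ioctl(reader_fd, KBASE_HWCNT_READER_SET_INTERVAL, interval_ns);
    }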
*/ static long kbasep_vinstr_hwcnt_reader_ioctl_disable_event( - struct kbase_vinstr_client *cli, - enum base_hwcnt_reader_event event_id) + struct kbase_vinstr_client *cli, + enum base_hwcnt_reader_event event_id) { - struct kbase_vinstr_context *vinstr_ctx = cli->vinstr_ctx; - u32 event_mask; - - KBASE_DEBUG_ASSERT(vinstr_ctx); - - event_mask = kbasep_vinstr_hwcnt_reader_event_mask(event_id); - if (!event_mask) - return -EINVAL; - - mutex_lock(&vinstr_ctx->lock); - cli->event_mask &= ~event_mask; - mutex_unlock(&vinstr_ctx->lock); - + /* No-op, as events aren't supported */ return 0; } /** - * kbasep_vinstr_hwcnt_reader_ioctl_get_hwver - hwcnt reader's ioctl command - * @cli: pointer to vinstr client structure - * @hwver: pointer to user buffer where hw version will be stored + * kbasep_vinstr_hwcnt_reader_ioctl_get_hwver() - Get HW version ioctl command. + * @cli: Non-NULL pointer to vinstr client. + * @hwver: Non-NULL pointer to user buffer where HW version will be stored. * - * Return: zero on success + * Return: 0 on success, else error code. */ static long kbasep_vinstr_hwcnt_reader_ioctl_get_hwver( - struct kbase_vinstr_client *cli, u32 __user *hwver) + struct kbase_vinstr_client *cli, + u32 __user *hwver) { -#ifndef CONFIG_MALI_NO_MALI - struct kbase_vinstr_context *vinstr_ctx = cli->vinstr_ctx; -#endif - - u32 ver = 5; + u32 ver = 0; + const enum kbase_hwcnt_gpu_group_type type = + kbase_hwcnt_metadata_group_type(cli->vctx->metadata, 0); -#ifndef CONFIG_MALI_NO_MALI - KBASE_DEBUG_ASSERT(vinstr_ctx); - if (kbase_hw_has_feature(vinstr_ctx->kbdev, BASE_HW_FEATURE_V4)) + switch (type) { + case KBASE_HWCNT_GPU_GROUP_TYPE_V4: ver = 4; -#endif + break; + case KBASE_HWCNT_GPU_GROUP_TYPE_V5: + ver = 5; + break; + default: + WARN_ON(true); + } - return put_user(ver, hwver); + if (ver != 0) { + return put_user(ver, hwver); + } else { + return -EINVAL; + } } /** - * kbasep_vinstr_hwcnt_reader_ioctl - hwcnt reader's ioctl - * @filp: pointer to file structure - * @cmd: user command - * @arg: command's argument + * kbasep_vinstr_hwcnt_reader_ioctl() - hwcnt reader's ioctl. + * @filp: Non-NULL pointer to file structure. + * @cmd: User command. + * @arg: Command's argument. * - * Return: zero on success + * Return: 0 on success, else error code. 
*/ -static long kbasep_vinstr_hwcnt_reader_ioctl(struct file *filp, - unsigned int cmd, unsigned long arg) +static long kbasep_vinstr_hwcnt_reader_ioctl( + struct file *filp, + unsigned int cmd, + unsigned long arg) { - long rcode = 0; + long rcode; struct kbase_vinstr_client *cli; - KBASE_DEBUG_ASSERT(filp); + if (!filp || (_IOC_TYPE(cmd) != KBASE_HWCNT_READER)) + return -EINVAL; cli = filp->private_data; - KBASE_DEBUG_ASSERT(cli); - - if (unlikely(KBASE_HWCNT_READER != _IOC_TYPE(cmd))) + if (!cli) return -EINVAL; switch (cmd) { @@ -1689,42 +857,41 @@ static long kbasep_vinstr_hwcnt_reader_ioctl(struct file *filp, break; case KBASE_HWCNT_READER_GET_HWVER: rcode = kbasep_vinstr_hwcnt_reader_ioctl_get_hwver( - cli, (u32 __user *)arg); + cli, (u32 __user *)arg); break; case KBASE_HWCNT_READER_GET_BUFFER_SIZE: - KBASE_DEBUG_ASSERT(cli->vinstr_ctx); rcode = put_user( - (u32)cli->vinstr_ctx->dump_size, - (u32 __user *)arg); + (u32)cli->vctx->metadata->dump_buf_bytes, + (u32 __user *)arg); break; case KBASE_HWCNT_READER_DUMP: - rcode = kbase_vinstr_hwc_dump( - cli, BASE_HWCNT_READER_EVENT_MANUAL); + rcode = kbasep_vinstr_hwcnt_reader_ioctl_dump(cli); break; case KBASE_HWCNT_READER_CLEAR: - rcode = kbase_vinstr_hwc_clear(cli); + rcode = kbasep_vinstr_hwcnt_reader_ioctl_clear(cli); break; case KBASE_HWCNT_READER_GET_BUFFER: rcode = kbasep_vinstr_hwcnt_reader_ioctl_get_buffer( - cli, (void __user *)arg, _IOC_SIZE(cmd)); + cli, (void __user *)arg, _IOC_SIZE(cmd)); break; case KBASE_HWCNT_READER_PUT_BUFFER: rcode = kbasep_vinstr_hwcnt_reader_ioctl_put_buffer( - cli, (void __user *)arg, _IOC_SIZE(cmd)); + cli, (void __user *)arg, _IOC_SIZE(cmd)); break; case KBASE_HWCNT_READER_SET_INTERVAL: rcode = kbasep_vinstr_hwcnt_reader_ioctl_set_interval( - cli, (u32)arg); + cli, (u32)arg); break; case KBASE_HWCNT_READER_ENABLE_EVENT: rcode = kbasep_vinstr_hwcnt_reader_ioctl_enable_event( - cli, (enum base_hwcnt_reader_event)arg); + cli, (enum base_hwcnt_reader_event)arg); break; case KBASE_HWCNT_READER_DISABLE_EVENT: rcode = kbasep_vinstr_hwcnt_reader_ioctl_disable_event( - cli, (enum base_hwcnt_reader_event)arg); + cli, (enum base_hwcnt_reader_event)arg); break; default: + WARN_ON(true); rcode = -EINVAL; break; } @@ -1733,21 +900,25 @@ static long kbasep_vinstr_hwcnt_reader_ioctl(struct file *filp, } /** - * kbasep_vinstr_hwcnt_reader_poll - hwcnt reader's poll - * @filp: pointer to file structure - * @wait: pointer to poll table - * Return: POLLIN if data can be read without blocking, otherwise zero + * kbasep_vinstr_hwcnt_reader_poll() - hwcnt reader's poll. + * @filp: Non-NULL pointer to file structure. + * @wait: Non-NULL pointer to poll table. + * + * Return: POLLIN if data can be read without blocking, 0 if data can not be + * read without blocking, else error code. 
*/ -static unsigned int kbasep_vinstr_hwcnt_reader_poll(struct file *filp, - poll_table *wait) +static unsigned int kbasep_vinstr_hwcnt_reader_poll( + struct file *filp, + poll_table *wait) { struct kbase_vinstr_client *cli; - KBASE_DEBUG_ASSERT(filp); - KBASE_DEBUG_ASSERT(wait); + if (!filp || !wait) + return -EINVAL; cli = filp->private_data; - KBASE_DEBUG_ASSERT(cli); + if (!cli) + return -EINVAL; poll_wait(filp, &cli->waitq, wait); if (kbasep_vinstr_hwcnt_reader_buffer_ready(cli)) @@ -1756,25 +927,28 @@ static unsigned int kbasep_vinstr_hwcnt_reader_poll(struct file *filp, } /** - * kbasep_vinstr_hwcnt_reader_mmap - hwcnt reader's mmap - * @filp: pointer to file structure - * @vma: pointer to vma structure - * Return: zero on success + * kbasep_vinstr_hwcnt_reader_mmap() - hwcnt reader's mmap. + * @filp: Non-NULL pointer to file structure. + * @vma: Non-NULL pointer to vma structure. + * + * Return: 0 on success, else error code. */ -static int kbasep_vinstr_hwcnt_reader_mmap(struct file *filp, - struct vm_area_struct *vma) +static int kbasep_vinstr_hwcnt_reader_mmap( + struct file *filp, + struct vm_area_struct *vma) { struct kbase_vinstr_client *cli; - unsigned long size, addr, pfn, offset; - unsigned long vm_size = vma->vm_end - vma->vm_start; + unsigned long vm_size, size, addr, pfn, offset; - KBASE_DEBUG_ASSERT(filp); - KBASE_DEBUG_ASSERT(vma); + if (!filp || !vma) + return -EINVAL; cli = filp->private_data; - KBASE_DEBUG_ASSERT(cli); + if (!cli) + return -EINVAL; - size = cli->buffer_count * cli->dump_size; + vm_size = vma->vm_end - vma->vm_start; + size = cli->dump_bufs.buf_cnt * cli->vctx->metadata->dump_buf_bytes; if (vma->vm_pgoff > (size >> PAGE_SHIFT)) return -EINVAL; @@ -1783,579 +957,33 @@ static int kbasep_vinstr_hwcnt_reader_mmap(struct file *filp, if (vm_size > size - offset) return -EINVAL; - addr = __pa((unsigned long)cli->dump_buffers + offset); + addr = __pa(cli->dump_bufs.page_addr + offset); pfn = addr >> PAGE_SHIFT; return remap_pfn_range( - vma, - vma->vm_start, - pfn, - vm_size, - vma->vm_page_prot); + vma, vma->vm_start, pfn, vm_size, vma->vm_page_prot); } /** - * kbasep_vinstr_hwcnt_reader_release - hwcnt reader's release - * @inode: pointer to inode structure - * @filp: pointer to file structure - * Return always return zero + * kbasep_vinstr_hwcnt_reader_release() - hwcnt reader's release. + * @inode: Non-NULL pointer to inode structure. + * @filp: Non-NULL pointer to file structure. + * + * Return: 0 always. 
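To complete the userspace view of the reader interface reworked above, a hedged sketch of the consume loop: poll() reports POLLIN when a dump buffer is ready, GET_BUFFER returns a struct kbase_hwcnt_reader_metadata identifying the buffer, and every GET_BUFFER must be matched with a PUT_BUFFER once the sample has been copied out. The example_* helper and the reuse of the mapping from the earlier sketch are assumptions for illustration.

    /* Userspace sketch: wait for one sample, copy it out, hand the buffer back. */
    #include <poll.h>
    #include <stdint.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include "mali_kbase_hwcnt_reader.h"  /* metadata struct + reader ioctls */

    static int example_read_one_sample(int reader_fd, const uint8_t *samples,
                                       uint32_t buf_size, uint8_t *dst)
    {
        struct kbase_hwcnt_reader_metadata meta;
        struct pollfd pfd = { .fd = reader_fd, .events = POLLIN };

        /* Blocks until the kernel has filled at least one dump buffer. */
        if (poll(&pfd, 1, -1) < 0)
            return -1;

        if (ioctl(reader_fd, KBASE_HWCNT_READER_GET_BUFFER, &meta) < 0)
            return -1;

        /* meta.buffer_idx selects one of the mmap'd, back-to-back buffers;
         * meta.timestamp and meta.event_id describe when and why it was dumped. */
        memcpy(dst, samples + (size_t)meta.buffer_idx * buf_size, buf_size);

        /* Release the buffer so the kernel can reuse it. */
        return ioctl(reader_fd, KBASE_HWCNT_READER_PUT_BUFFER, &meta);
    }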
*/ static int kbasep_vinstr_hwcnt_reader_release(struct inode *inode, - struct file *filp) -{ - struct kbase_vinstr_client *cli; - - KBASE_DEBUG_ASSERT(inode); - KBASE_DEBUG_ASSERT(filp); - - cli = filp->private_data; - KBASE_DEBUG_ASSERT(cli); - - kbase_vinstr_detach_client(cli); - return 0; -} - -/*****************************************************************************/ - -/** - * kbasep_vinstr_kick_scheduler - trigger scheduler cycle - * @kbdev: pointer to kbase device structure - */ -static void kbasep_vinstr_kick_scheduler(struct kbase_device *kbdev) -{ - struct kbasep_js_device_data *js_devdata = &kbdev->js_data; - unsigned long flags; - - down(&js_devdata->schedule_sem); - spin_lock_irqsave(&kbdev->hwaccess_lock, flags); - kbase_backend_slot_update(kbdev); - spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); - up(&js_devdata->schedule_sem); -} - -/** - * kbasep_vinstr_suspend_worker - worker suspending vinstr module - * @data: pointer to work structure - */ -static void kbasep_vinstr_suspend_worker(struct work_struct *data) -{ - struct kbase_vinstr_context *vinstr_ctx; - unsigned long flags; - - vinstr_ctx = container_of(data, struct kbase_vinstr_context, - suspend_work); - - mutex_lock(&vinstr_ctx->lock); - - if (vinstr_ctx->kctx) - disable_hwcnt(vinstr_ctx); - - spin_lock_irqsave(&vinstr_ctx->state_lock, flags); - vinstr_ctx->state = VINSTR_SUSPENDED; - wake_up_all(&vinstr_ctx->suspend_waitq); - - if (vinstr_ctx->need_resume) { - vinstr_ctx->need_resume = false; - vinstr_ctx->state = VINSTR_RESUMING; - schedule_work(&vinstr_ctx->resume_work); - - spin_unlock_irqrestore(&vinstr_ctx->state_lock, flags); - - mutex_unlock(&vinstr_ctx->lock); - } else { - spin_unlock_irqrestore(&vinstr_ctx->state_lock, flags); - - mutex_unlock(&vinstr_ctx->lock); - - /* Kick GPU scheduler to allow entering protected mode. - * This must happen after vinstr was suspended. - */ - kbasep_vinstr_kick_scheduler(vinstr_ctx->kbdev); - } -} - -/** - * kbasep_vinstr_resume_worker - worker resuming vinstr module - * @data: pointer to work structure - */ -static void kbasep_vinstr_resume_worker(struct work_struct *data) + struct file *filp) { - struct kbase_vinstr_context *vinstr_ctx; - unsigned long flags; - - vinstr_ctx = container_of(data, struct kbase_vinstr_context, - resume_work); - - mutex_lock(&vinstr_ctx->lock); - - if (vinstr_ctx->kctx) - enable_hwcnt(vinstr_ctx); + struct kbase_vinstr_client *vcli = filp->private_data; - spin_lock_irqsave(&vinstr_ctx->state_lock, flags); - vinstr_ctx->state = VINSTR_IDLE; - wake_up_all(&vinstr_ctx->suspend_waitq); + mutex_lock(&vcli->vctx->lock); - if (vinstr_ctx->need_suspend) { - vinstr_ctx->need_suspend = false; - vinstr_ctx->state = VINSTR_SUSPENDING; - schedule_work(&vinstr_ctx->suspend_work); + vcli->vctx->client_count--; + list_del(&vcli->node); - spin_unlock_irqrestore(&vinstr_ctx->state_lock, flags); + mutex_unlock(&vcli->vctx->lock); - mutex_unlock(&vinstr_ctx->lock); - } else { - spin_unlock_irqrestore(&vinstr_ctx->state_lock, flags); - - mutex_unlock(&vinstr_ctx->lock); - - /* Kick GPU scheduler to allow entering protected mode. - * Note that scheduler state machine might requested re-entry to - * protected mode before vinstr was resumed. - * This must happen after vinstr was release. 
- */ - kbasep_vinstr_kick_scheduler(vinstr_ctx->kbdev); - } -} - -/*****************************************************************************/ - -struct kbase_vinstr_context *kbase_vinstr_init(struct kbase_device *kbdev) -{ - struct kbase_vinstr_context *vinstr_ctx; - - vinstr_ctx = kzalloc(sizeof(*vinstr_ctx), GFP_KERNEL); - if (!vinstr_ctx) - return NULL; - - INIT_LIST_HEAD(&vinstr_ctx->idle_clients); - INIT_LIST_HEAD(&vinstr_ctx->waiting_clients); - INIT_LIST_HEAD(&vinstr_ctx->suspended_clients); - mutex_init(&vinstr_ctx->lock); - spin_lock_init(&vinstr_ctx->state_lock); - vinstr_ctx->kbdev = kbdev; - vinstr_ctx->thread = NULL; - vinstr_ctx->state = VINSTR_IDLE; - vinstr_ctx->suspend_cnt = 0; - INIT_WORK(&vinstr_ctx->suspend_work, kbasep_vinstr_suspend_worker); - INIT_WORK(&vinstr_ctx->resume_work, kbasep_vinstr_resume_worker); - init_waitqueue_head(&vinstr_ctx->suspend_waitq); - - atomic_set(&vinstr_ctx->request_pending, 0); - init_waitqueue_head(&vinstr_ctx->waitq); - - return vinstr_ctx; -} - -void kbase_vinstr_term(struct kbase_vinstr_context *vinstr_ctx) -{ - struct kbase_vinstr_client *cli; - - /* Stop service thread first. */ - if (vinstr_ctx->thread) - kthread_stop(vinstr_ctx->thread); - - /* Wait for workers. */ - flush_work(&vinstr_ctx->suspend_work); - flush_work(&vinstr_ctx->resume_work); - - while (1) { - struct list_head *list = &vinstr_ctx->idle_clients; - - if (list_empty(list)) { - list = &vinstr_ctx->waiting_clients; - if (list_empty(list)) { - list = &vinstr_ctx->suspended_clients; - if (list_empty(list)) - break; - } - } - - cli = list_first_entry(list, struct kbase_vinstr_client, list); - list_del(&cli->list); - if (!cli->suspended) - vinstr_ctx->nclients--; - else - vinstr_ctx->nclients_suspended--; - kfree(cli->accum_buffer); - kfree(cli); - } - KBASE_DEBUG_ASSERT(!vinstr_ctx->nclients); - KBASE_DEBUG_ASSERT(!vinstr_ctx->nclients_suspended); - if (vinstr_ctx->kctx) - kbasep_vinstr_destroy_kctx(vinstr_ctx); - kfree(vinstr_ctx); -} - -int kbase_vinstr_hwcnt_reader_setup(struct kbase_vinstr_context *vinstr_ctx, - struct kbase_ioctl_hwcnt_reader_setup *setup) -{ - struct kbase_vinstr_client *cli; - u32 bitmap[4]; - int fd; - - KBASE_DEBUG_ASSERT(vinstr_ctx); - KBASE_DEBUG_ASSERT(setup); - KBASE_DEBUG_ASSERT(setup->buffer_count); - - bitmap[SHADER_HWCNT_BM] = setup->shader_bm; - bitmap[TILER_HWCNT_BM] = setup->tiler_bm; - bitmap[MMU_L2_HWCNT_BM] = setup->mmu_l2_bm; - bitmap[JM_HWCNT_BM] = setup->jm_bm; - - cli = kbasep_vinstr_attach_client( - vinstr_ctx, - setup->buffer_count, - bitmap, - &fd, - NULL); - - if (!cli) - return -ENOMEM; - - kbase_vinstr_wait_for_ready(vinstr_ctx); - return fd; -} - -int kbase_vinstr_legacy_hwc_setup( - struct kbase_vinstr_context *vinstr_ctx, - struct kbase_vinstr_client **cli, - struct kbase_ioctl_hwcnt_enable *enable) -{ - KBASE_DEBUG_ASSERT(vinstr_ctx); - KBASE_DEBUG_ASSERT(enable); - KBASE_DEBUG_ASSERT(cli); - - if (enable->dump_buffer) { - u32 bitmap[4]; - - bitmap[SHADER_HWCNT_BM] = enable->shader_bm; - bitmap[TILER_HWCNT_BM] = enable->tiler_bm; - bitmap[MMU_L2_HWCNT_BM] = enable->mmu_l2_bm; - bitmap[JM_HWCNT_BM] = enable->jm_bm; - - if (*cli) - return -EBUSY; - - *cli = kbasep_vinstr_attach_client( - vinstr_ctx, - 0, - bitmap, - (void *)(uintptr_t)enable->dump_buffer, - NULL); - - if (!(*cli)) - return -ENOMEM; - - kbase_vinstr_wait_for_ready(vinstr_ctx); - } else { - if (!*cli) - return -EINVAL; - - kbase_vinstr_detach_client(*cli); - *cli = NULL; - } + kbasep_vinstr_client_destroy(vcli); return 0; } - -struct 
kbase_vinstr_client *kbase_vinstr_hwcnt_kernel_setup( - struct kbase_vinstr_context *vinstr_ctx, - struct kbase_ioctl_hwcnt_reader_setup *setup, - void *kernel_buffer) -{ - struct kbase_vinstr_client *kernel_client; - u32 bitmap[4]; - - if (!vinstr_ctx || !setup || !kernel_buffer) - return NULL; - - bitmap[SHADER_HWCNT_BM] = setup->shader_bm; - bitmap[TILER_HWCNT_BM] = setup->tiler_bm; - bitmap[MMU_L2_HWCNT_BM] = setup->mmu_l2_bm; - bitmap[JM_HWCNT_BM] = setup->jm_bm; - - kernel_client = kbasep_vinstr_attach_client( - vinstr_ctx, - 0, - bitmap, - NULL, - kernel_buffer); - - if (kernel_client) - kbase_vinstr_wait_for_ready(vinstr_ctx); - - return kernel_client; -} -KBASE_EXPORT_TEST_API(kbase_vinstr_hwcnt_kernel_setup); - -int kbase_vinstr_hwc_dump(struct kbase_vinstr_client *cli, - enum base_hwcnt_reader_event event_id) -{ - int rcode = 0; - struct kbase_vinstr_context *vinstr_ctx; - u64 timestamp; - u32 event_mask; - - if (!cli) - return -EINVAL; - - vinstr_ctx = cli->vinstr_ctx; - KBASE_DEBUG_ASSERT(vinstr_ctx); - - KBASE_DEBUG_ASSERT(event_id < BASE_HWCNT_READER_EVENT_COUNT); - event_mask = 1 << event_id; - - mutex_lock(&vinstr_ctx->lock); - - if (event_mask & cli->event_mask) { - rcode = kbasep_vinstr_collect_and_accumulate( - vinstr_ctx, - ×tamp); - if (rcode) - goto exit; - - rcode = kbasep_vinstr_update_client(cli, timestamp, event_id); - if (rcode) - goto exit; - - kbasep_vinstr_reprogram(vinstr_ctx); - } - -exit: - mutex_unlock(&vinstr_ctx->lock); - - return rcode; -} -KBASE_EXPORT_TEST_API(kbase_vinstr_hwc_dump); - -int kbase_vinstr_hwc_clear(struct kbase_vinstr_client *cli) -{ - struct kbase_vinstr_context *vinstr_ctx; - int rcode; - u64 unused; - - if (!cli) - return -EINVAL; - - vinstr_ctx = cli->vinstr_ctx; - KBASE_DEBUG_ASSERT(vinstr_ctx); - - mutex_lock(&vinstr_ctx->lock); - - rcode = kbasep_vinstr_collect_and_accumulate(vinstr_ctx, &unused); - if (rcode) - goto exit; - rcode = kbase_instr_hwcnt_clear(vinstr_ctx->kctx); - if (rcode) - goto exit; - memset(cli->accum_buffer, 0, cli->dump_size); - - kbasep_vinstr_reprogram(vinstr_ctx); - -exit: - mutex_unlock(&vinstr_ctx->lock); - - return rcode; -} - -int kbase_vinstr_try_suspend(struct kbase_vinstr_context *vinstr_ctx) -{ - unsigned long flags; - int ret = -EAGAIN; - - KBASE_DEBUG_ASSERT(vinstr_ctx); - - spin_lock_irqsave(&vinstr_ctx->state_lock, flags); - vinstr_ctx->forced_suspend = true; - switch (vinstr_ctx->state) { - case VINSTR_SUSPENDED: - vinstr_ctx->suspend_cnt++; - /* overflow shall not happen */ - BUG_ON(0 == vinstr_ctx->suspend_cnt); - ret = 0; - break; - - case VINSTR_IDLE: - if (vinstr_ctx->clients_present) { - vinstr_ctx->state = VINSTR_SUSPENDING; - schedule_work(&vinstr_ctx->suspend_work); - } else { - vinstr_ctx->state = VINSTR_SUSPENDED; - - vinstr_ctx->suspend_cnt++; - /* overflow shall not happen */ - WARN_ON(0 == vinstr_ctx->suspend_cnt); - ret = 0; - } - break; - - case VINSTR_DUMPING: - vinstr_ctx->state = VINSTR_SUSPENDING; - break; - - case VINSTR_RESUMING: - vinstr_ctx->need_suspend = true; - break; - - case VINSTR_SUSPENDING: - break; - - default: - KBASE_DEBUG_ASSERT(0); - break; - } - spin_unlock_irqrestore(&vinstr_ctx->state_lock, flags); - - return ret; -} - -static int kbase_vinstr_is_ready(struct kbase_vinstr_context *vinstr_ctx) -{ - unsigned long flags; - int ret = -EAGAIN; - - KBASE_DEBUG_ASSERT(vinstr_ctx); - - spin_lock_irqsave(&vinstr_ctx->state_lock, flags); - switch (vinstr_ctx->state) { - case VINSTR_SUSPENDED: - case VINSTR_RESUMING: - case VINSTR_SUSPENDING: - break; - - case 
VINSTR_IDLE: - case VINSTR_DUMPING: - ret = 0; - break; - default: - KBASE_DEBUG_ASSERT(0); - break; - } - spin_unlock_irqrestore(&vinstr_ctx->state_lock, flags); - - return ret; -} - -void kbase_vinstr_suspend(struct kbase_vinstr_context *vinstr_ctx) -{ - wait_event(vinstr_ctx->suspend_waitq, - (0 == kbase_vinstr_try_suspend(vinstr_ctx))); -} - -void kbase_vinstr_wait_for_ready(struct kbase_vinstr_context *vinstr_ctx) -{ - wait_event(vinstr_ctx->suspend_waitq, - (0 == kbase_vinstr_is_ready(vinstr_ctx))); -} -KBASE_EXPORT_TEST_API(kbase_vinstr_wait_for_ready); - -/** - * kbase_vinstr_update_suspend - Update vinstr suspend/resume status depending - * on nclients - * @vinstr_ctx: vinstr context pointer - * - * This function should be called whenever vinstr_ctx->nclients changes. This - * may cause vinstr to be suspended or resumed, depending on the number of - * clients and whether IPA is suspended or not. - */ -static void kbase_vinstr_update_suspend(struct kbase_vinstr_context *vinstr_ctx) -{ - lockdep_assert_held(&vinstr_ctx->state_lock); - - switch (vinstr_ctx->state) { - case VINSTR_SUSPENDED: - if ((vinstr_ctx->nclients) && (0 == vinstr_ctx->suspend_cnt)) { - vinstr_ctx->state = VINSTR_RESUMING; - schedule_work(&vinstr_ctx->resume_work); - } - break; - - case VINSTR_SUSPENDING: - if ((vinstr_ctx->nclients) && (!vinstr_ctx->forced_suspend)) - vinstr_ctx->need_resume = true; - break; - - case VINSTR_IDLE: - if (!vinstr_ctx->nclients) { - vinstr_ctx->state = VINSTR_SUSPENDING; - schedule_work(&vinstr_ctx->suspend_work); - } - break; - - case VINSTR_DUMPING: - if (!vinstr_ctx->nclients) - vinstr_ctx->state = VINSTR_SUSPENDING; - break; - - case VINSTR_RESUMING: - if (!vinstr_ctx->nclients) - vinstr_ctx->need_suspend = true; - break; - } -} - -void kbase_vinstr_resume(struct kbase_vinstr_context *vinstr_ctx) -{ - unsigned long flags; - - KBASE_DEBUG_ASSERT(vinstr_ctx); - - spin_lock_irqsave(&vinstr_ctx->state_lock, flags); - BUG_ON(VINSTR_SUSPENDING == vinstr_ctx->state); - if (VINSTR_SUSPENDED == vinstr_ctx->state) { - BUG_ON(0 == vinstr_ctx->suspend_cnt); - vinstr_ctx->suspend_cnt--; - if (0 == vinstr_ctx->suspend_cnt) { - vinstr_ctx->forced_suspend = false; - if (vinstr_ctx->clients_present) { - vinstr_ctx->state = VINSTR_RESUMING; - schedule_work(&vinstr_ctx->resume_work); - } else { - vinstr_ctx->state = VINSTR_IDLE; - } - } - } - spin_unlock_irqrestore(&vinstr_ctx->state_lock, flags); -} - -void kbase_vinstr_suspend_client(struct kbase_vinstr_client *client) -{ - struct kbase_vinstr_context *vinstr_ctx = client->vinstr_ctx; - unsigned long flags; - - spin_lock_irqsave(&vinstr_ctx->state_lock, flags); - - if (!client->suspended) { - list_del(&client->list); - list_add(&client->list, &vinstr_ctx->suspended_clients); - - vinstr_ctx->nclients--; - vinstr_ctx->nclients_suspended++; - kbase_vinstr_update_suspend(vinstr_ctx); - - client->suspended = true; - } - - spin_unlock_irqrestore(&vinstr_ctx->state_lock, flags); -} - -void kbase_vinstr_resume_client(struct kbase_vinstr_client *client) -{ - struct kbase_vinstr_context *vinstr_ctx = client->vinstr_ctx; - unsigned long flags; - - spin_lock_irqsave(&vinstr_ctx->state_lock, flags); - - if (client->suspended) { - list_del(&client->list); - list_add(&client->list, &vinstr_ctx->idle_clients); - - vinstr_ctx->nclients++; - vinstr_ctx->nclients_suspended--; - kbase_vinstr_update_suspend(vinstr_ctx); - - client->suspended = false; - } - - spin_unlock_irqrestore(&vinstr_ctx->state_lock, flags); -} diff --git a/mali_kbase/mali_kbase_vinstr.h 
b/mali_kbase/mali_kbase_vinstr.h index d32799f..81d315f 100644 --- a/mali_kbase/mali_kbase_vinstr.h +++ b/mali_kbase/mali_kbase_vinstr.h @@ -20,163 +20,72 @@ * */ +/* + * Vinstr, used to provide an ioctl for userspace access to periodic hardware + * counters. + */ + #ifndef _KBASE_VINSTR_H_ #define _KBASE_VINSTR_H_ -#include <mali_kbase_hwcnt_reader.h> -#include <mali_kbase_ioctl.h> - -/*****************************************************************************/ - struct kbase_vinstr_context; -struct kbase_vinstr_client; - -/*****************************************************************************/ - -/** - * kbase_vinstr_init() - initialize the vinstr core - * @kbdev: kbase device - * - * Return: pointer to the vinstr context on success or NULL on failure - */ -struct kbase_vinstr_context *kbase_vinstr_init(struct kbase_device *kbdev); - -/** - * kbase_vinstr_term() - terminate the vinstr core - * @vinstr_ctx: vinstr context - */ -void kbase_vinstr_term(struct kbase_vinstr_context *vinstr_ctx); - -/** - * kbase_vinstr_hwcnt_reader_setup - configure hw counters reader - * @vinstr_ctx: vinstr context - * @setup: reader's configuration - * - * Return: file descriptor on success and a (negative) error code otherwise - */ -int kbase_vinstr_hwcnt_reader_setup( - struct kbase_vinstr_context *vinstr_ctx, - struct kbase_ioctl_hwcnt_reader_setup *setup); +struct kbase_hwcnt_virtualizer; +struct kbase_ioctl_hwcnt_reader_setup; /** - * kbase_vinstr_legacy_hwc_setup - configure hw counters for dumping - * @vinstr_ctx: vinstr context - * @cli: pointer where to store pointer to new vinstr client structure - * @enable: hwc configuration + * kbase_vinstr_init() - Initialise a vinstr context. + * @hvirt: Non-NULL pointer to the hardware counter virtualizer. + * @out_vctx: Non-NULL pointer to where the pointer to the created vinstr + * context will be stored on success. * - * Return: zero on success - */ -int kbase_vinstr_legacy_hwc_setup( - struct kbase_vinstr_context *vinstr_ctx, - struct kbase_vinstr_client **cli, - struct kbase_ioctl_hwcnt_enable *enable); - -/** - * kbase_vinstr_hwcnt_kernel_setup - configure hw counters for kernel side - * client - * @vinstr_ctx: vinstr context - * @setup: reader's configuration - * @kernel_buffer: pointer to dump buffer + * On creation, the suspend count of the context will be 0. * - * setup->buffer_count is not used for kernel side clients. - * - * Return: pointer to client structure, or NULL on failure + * Return: 0 on success, else error code. */ -struct kbase_vinstr_client *kbase_vinstr_hwcnt_kernel_setup( - struct kbase_vinstr_context *vinstr_ctx, - struct kbase_ioctl_hwcnt_reader_setup *setup, - void *kernel_buffer); +int kbase_vinstr_init( + struct kbase_hwcnt_virtualizer *hvirt, + struct kbase_vinstr_context **out_vctx); /** - * kbase_vinstr_hwc_dump - issue counter dump for vinstr client - * @cli: pointer to vinstr client - * @event_id: id of event that triggered hwcnt dump - * - * Return: zero on success + * kbase_vinstr_term() - Terminate a vinstr context. + * @vctx: Pointer to the vinstr context to be terminated. */ -int kbase_vinstr_hwc_dump( - struct kbase_vinstr_client *cli, - enum base_hwcnt_reader_event event_id); +void kbase_vinstr_term(struct kbase_vinstr_context *vctx); /** - * kbase_vinstr_hwc_clear - performs a reset of the hardware counters for - * a given kbase context - * @cli: pointer to vinstr client + * kbase_vinstr_suspend() - Increment the suspend count of the context. 
+ * @vctx: Non-NULL pointer to the vinstr context to be suspended. * - * Return: zero on success + * After this function call returns, it is guaranteed that all timers and + * workers in vinstr will be cancelled, and will not be re-triggered until + * after the context has been resumed. In effect, this means no new counter + * dumps will occur for any existing or subsequently added periodic clients. */ -int kbase_vinstr_hwc_clear(struct kbase_vinstr_client *cli); +void kbase_vinstr_suspend(struct kbase_vinstr_context *vctx); /** - * kbase_vinstr_try_suspend - try suspending operation of a given vinstr context - * @vinstr_ctx: vinstr context - * - * Return: 0 on success, or negative if state change is in progress + * kbase_vinstr_resume() - Decrement the suspend count of the context. + * @vctx: Non-NULL pointer to the vinstr context to be resumed. * - * Warning: This API call is non-generic. It is meant to be used only by - * job scheduler state machine. + * If a call to this function decrements the suspend count from 1 to 0, then + * normal operation of vinstr will be resumed (i.e. counter dumps will once + * again be automatically triggered for all periodic clients). * - * Function initiates vinstr switch to suspended state. Once it was called - * vinstr enters suspending state. If function return non-zero value, it - * indicates that state switch is not complete and function must be called - * again. On state switch vinstr will trigger job scheduler state machine - * cycle. - */ -int kbase_vinstr_try_suspend(struct kbase_vinstr_context *vinstr_ctx); - -/** - * kbase_vinstr_suspend - suspends operation of a given vinstr context - * @vinstr_ctx: vinstr context - * - * Function initiates vinstr switch to suspended state. Then it blocks until - * operation is completed. - */ -void kbase_vinstr_suspend(struct kbase_vinstr_context *vinstr_ctx); - -/** - * kbase_vinstr_wait_for_ready - waits for the vinstr context to get ready - * @vinstr_ctx: vinstr context - * - * Function waits for the vinstr to become ready for dumping. It can be in the - * resuming state after the client was attached but the client currently expects - * that vinstr is ready for dumping immediately post attach. - */ -void kbase_vinstr_wait_for_ready(struct kbase_vinstr_context *vinstr_ctx); - -/** - * kbase_vinstr_resume - resumes operation of a given vinstr context - * @vinstr_ctx: vinstr context - * - * Function can be called only if it was preceded by a successful call + * It is only valid to call this function one time for each prior returned call * to kbase_vinstr_suspend. */ -void kbase_vinstr_resume(struct kbase_vinstr_context *vinstr_ctx); +void kbase_vinstr_resume(struct kbase_vinstr_context *vctx); /** - * kbase_vinstr_dump_size - Return required size of dump buffer - * @kbdev: device pointer + * kbase_vinstr_hwcnt_reader_setup() - Set up a new hardware counter reader + * client. + * @vinstr_ctx: Non-NULL pointer to the vinstr context. + * @setup: Non-NULL pointer to the hwcnt reader configuration. * - * Return : buffer size in bytes + * Return: file descriptor on success, else a (negative) error code. 
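The suspend/resume documentation above describes a counted suspend, so a short kernel-side usage sketch may help: each call to kbase_vinstr_suspend() must eventually be matched by exactly one kbase_vinstr_resume(), and no counter dumps occur while the count is non-zero. The surrounding reset path and the example_* name are assumptions for illustration, not part of this patch.

    /* Sketch only: keep vinstr quiescent across a hypothetical GPU reset. */
    static void example_reset_with_vinstr_quiesced(struct kbase_vinstr_context *vctx)
    {
        /* After this returns, vinstr's timer and worker are cancelled and stay
         * cancelled until the matching resume. */
        kbase_vinstr_suspend(vctx);

        /* ... perform the reset or power transition here ... */

        /* Counted resume: periodic dumping restarts only once the suspend
         * count drops back to zero. */
        kbase_vinstr_resume(vctx);
    }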
diff --git a/mali_kbase/mali_linux_kbase_trace.h b/mali_kbase/mali_linux_kbase_trace.h
index 920562e..6c6a8c6 100644
--- a/mali_kbase/mali_linux_kbase_trace.h
+++ b/mali_kbase/mali_linux_kbase_trace.h
@@ -154,7 +154,6 @@ DEFINE_MALI_ADD_EVENT(JM_ZAP_SCHEDULED);
DEFINE_MALI_ADD_EVENT(JM_ZAP_DONE);
DEFINE_MALI_ADD_EVENT(JM_SUBMIT_AFTER_RESET);
DEFINE_MALI_ADD_EVENT(JM_JOB_COMPLETE);
-DEFINE_MALI_ADD_EVENT(JS_FAST_START_EVICTS_CTX);
DEFINE_MALI_ADD_EVENT(JS_CTX_ATTR_NOW_ON_RUNPOOL);
DEFINE_MALI_ADD_EVENT(JS_CTX_ATTR_NOW_OFF_RUNPOOL);
DEFINE_MALI_ADD_EVENT(JS_CTX_ATTR_NOW_ON_CTX);
diff --git a/mali_kbase/mali_malisw.h b/mali_kbase/mali_malisw.h
index f17bd5e..3a4db10 100644
--- a/mali_kbase/mali_malisw.h
+++ b/mali_kbase/mali_malisw.h
@@ -1,6 +1,6 @@
/*
 *
- * (C) COPYRIGHT 2014-2015 ARM Limited. All rights reserved.
+ * (C) COPYRIGHT 2014-2015, 2018 ARM Limited. All rights reserved.
 *
 * This program is free software and is provided to you under the terms of the
 * GNU General Public License version 2 as published by the Free Software
@@ -83,15 +83,6 @@
#define CSTD_NOP(...) ((void)#__VA_ARGS__)

/**
- * Function-like macro for converting a pointer in to a u64 for storing into
- * an external data structure. This is commonly used when pairing a 32-bit
- * CPU with a 64-bit peripheral, such as a Midgard GPU. C's type promotion
- * is complex and a straight cast does not work reliably as pointers are
- * often considered as signed.
- */
-#define PTR_TO_U64(x) ((uint64_t)((uintptr_t)(x)))
-
-/**
 * @hideinitializer
 * Function-like macro for stringizing a single level macro.
 * @code
@@ -115,22 +106,4 @@
 */
#define CSTD_STR2(x) CSTD_STR1(x)

-/**
- * Specify an assertion value which is evaluated at compile time. Recommended
- * usage is specification of a @c static @c INLINE function containing all of
- * the assertions thus:
- *
- * @code
- * static INLINE [module]_compile_time_assertions( void )
- * {
- *     COMPILE_TIME_ASSERT( sizeof(uintptr_t) == sizeof(intptr_t) );
- * }
- * @endcode
- *
- * @note Use @c static not @c STATIC. We never want to turn off this @c static
- * specification for testing purposes.
- */
-#define CSTD_COMPILE_TIME_ASSERT(expr) \
-	do { switch (0) { case 0: case (expr):; } } while (false)
-
#endif /* _MALISW_H_ */
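The CSTD_COMPILE_TIME_ASSERT macro removed above relied on a duplicate-case trick: when the asserted expression evaluates to 0 the two case labels collide and the translation unit fails to compile, while a non-zero expression leaves only dead code the compiler discards. The sketch below restates the deleted idiom under hypothetical example_ names; in-kernel code would more commonly reach for BUILD_BUG_ON, although this diff does not show what, if anything, replaced the callers.

#include <stdint.h>

/* Re-statement of the deleted idiom under hypothetical names. */
#define EXAMPLE_COMPILE_TIME_ASSERT(expr) \
	do { switch (0) { case 0: case (expr):; } } while (false)

static inline void example_compile_time_assertions(void)
{
	/* Compiles: the labels are "case 0" and "case 1". */
	EXAMPLE_COMPILE_TIME_ASSERT(sizeof(uintptr_t) == sizeof(intptr_t));

	/* Would not compile: both labels collapse to "case 0".
	 * EXAMPLE_COMPILE_TIME_ASSERT(sizeof(int) == 1);
	 */
}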
diff --git a/mali_kbase/mali_midg_regmap.h b/mali_kbase/mali_midg_regmap.h
index 8d9f7b6..0f03e8d 100644
--- a/mali_kbase/mali_midg_regmap.h
+++ b/mali_kbase/mali_midg_regmap.h
@@ -217,7 +217,7 @@
#define JOB_IRQ_THROTTLE 0x014 /* cycles to delay delivering an interrupt externally. The JOB_IRQ_STATUS is NOT affected by this, just the delivery of the interrupt. */

/* JOB IRQ flags */
-#define JOB_IRQ_GLOBAL_IF (1 << 18) /* Global interface interrupt received */
+#define JOB_IRQ_GLOBAL_IF (1 << 31) /* Global interface interrupt received */

#define JOB_SLOT0 0x800 /* Configuration registers for job slot 0 */
#define JOB_SLOT1 0x880 /* Configuration registers for job slot 1 */
@@ -381,14 +381,14 @@
/*
 * Begin TRANSCFG register values
 */
-#define AS_TRANSCFG_PTW_MEMATTR_MASK (3 << 24)
-#define AS_TRANSCFG_PTW_MEMATTR_NON_CACHEABLE (1 << 24)
-#define AS_TRANSCFG_PTW_MEMATTR_WRITE_BACK (2 << 24)
-
-#define AS_TRANSCFG_PTW_SH_MASK ((3 << 28))
-#define AS_TRANSCFG_PTW_SH_OS (2 << 28)
-#define AS_TRANSCFG_PTW_SH_IS (3 << 28)
-
+#define AS_TRANSCFG_PTW_MEMATTR_MASK (3ull << 24)
+#define AS_TRANSCFG_PTW_MEMATTR_NON_CACHEABLE (1ull << 24)
+#define AS_TRANSCFG_PTW_MEMATTR_WRITE_BACK (2ull << 24)
+
+#define AS_TRANSCFG_PTW_SH_MASK ((3ull << 28))
+#define AS_TRANSCFG_PTW_SH_OS (2ull << 28)
+#define AS_TRANSCFG_PTW_SH_IS (3ull << 28)
+#define AS_TRANSCFG_R_ALLOCATE (1ull << 30)
/*
 * Begin Command Values
 */
diff --git a/mali_kbase/mali_uk.h b/mali_kbase/mali_uk.h
index c81f404..701f390 100644
--- a/mali_kbase/mali_uk.h
+++ b/mali_kbase/mali_uk.h
@@ -74,68 +74,6 @@ enum uk_client_id {
	UK_CLIENT_COUNT
};

-/**
- * Each function callable through the UK interface has a unique number.
- * Functions provided by UK clients start from number UK_FUNC_ID.
- * Numbers below UK_FUNC_ID are used for internal UK functions.
- */
-enum uk_func {
-	UKP_FUNC_ID_CHECK_VERSION, /**< UKK Core internal function */
-	/**
-	 * Each UK client numbers the functions they provide starting from
-	 * number UK_FUNC_ID. This number is then eventually assigned to the
-	 * id field of the union uk_header structure when preparing to make a
-	 * UK call. See your UK client for a list of their function numbers.
-	 */
-	UK_FUNC_ID = 512
-};
-
-/**
- * Arguments for a UK call are stored in a structure. This structure consists
- * of a fixed size header and a payload. The header carries a 32-bit number
- * identifying the UK function to be called (see uk_func). When the UKK client
- * receives this header and executed the requested UK function, it will use
- * the same header to store the result of the function in the form of a
- * int return code. The size of this structure is such that the
- * first member of the payload following the header can be accessed efficiently
- * on a 32 and 64-bit kernel and the structure has the same size regardless
- * of a 32 or 64-bit kernel. The uk_kernel_size_type type should be defined
- * accordingly in the OS specific mali_uk_os.h header file.
- */
-union uk_header {
-	/**
-	 * 32-bit number identifying the UK function to be called.
-	 * Also see uk_func.
-	 */
-	u32 id;
-	/**
-	 * The int return code returned by the called UK function.
-	 * See the specification of the particular UK function you are
-	 * calling for the meaning of the error codes returned. All
-	 * UK functions return 0 on success.
-	 */
-	u32 ret;
-	/*
-	 * Used to ensure 64-bit alignment of this union. Do not remove.
-	 * This field is used for padding and does not need to be initialized.
-	 */
-	u64 sizer;
-};
-
-/**
- * This structure carries a 16-bit major and minor number and is sent along with an internal UK call
- * used during uku_open to identify the versions of the UK module in use by the user-side and kernel-side.
- */
-struct uku_version_check_args {
-	union uk_header header;
-	/**< UK call header */
-	u16 major;
-	/**< This field carries the user-side major version on input and the kernel-side major version on output */
-	u16 minor;
-	/**< This field carries the user-side minor version on input and the kernel-side minor version on output. */
-	u8 padding[4];
-};
-
/** @} end group uk_api */

/** @} *//* end group base_api */
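The deleted union uk_header shows why the legacy interface could share one layout between 32-bit and 64-bit kernels: the unused u64 member pads the header to eight bytes, so the first payload member lands at the same offset either way. The snippet below restates that layout with standard types purely for illustration; the example_ names are not part of the driver.

#include <stddef.h>
#include <stdint.h>

/* Illustrative re-statement of the deleted header layout. */
union example_uk_header {
	uint32_t id;    /* function number on input */
	uint32_t ret;   /* return code on output */
	uint64_t sizer; /* padding only, never read */
};

struct example_uk_call {
	union example_uk_header header;
	uint64_t first_payload_member;
};

_Static_assert(sizeof(union example_uk_header) == 8,
	       "header occupies 8 bytes on 32-bit and 64-bit builds");
_Static_assert(offsetof(struct example_uk_call, first_payload_member) == 8,
	       "payload starts at the same offset on 32-bit and 64-bit builds");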
diff --git a/mali_kbase/sconscript b/mali_kbase/sconscript
index 01c7589..f9d9c1b 100644
--- a/mali_kbase/sconscript
+++ b/mali_kbase/sconscript
@@ -50,7 +50,6 @@ make_args = env.kernel_get_config_defines(ret_list = True) + [
	'MALI_KERNEL_TEST_API=%s' % env['debug'],
	'MALI_UNIT_TEST=%s' % env['unit'],
	'MALI_RELEASE_NAME=%s' % env['mali_release_name'],
-	'MALI_MOCK_TEST=%s' % mock_test,
	'MALI_CUSTOMER_RELEASE=%s' % env['release'],
	'MALI_USE_CSF=%s' % env['csf'],
	'MALI_COVERAGE=%s' % env['coverage'],
diff --git a/mali_kbase/tests/Mconfig b/mali_kbase/tests/Mconfig
index ddd7630..af4e383 100644
--- a/mali_kbase/tests/Mconfig
+++ b/mali_kbase/tests/Mconfig
@@ -21,6 +21,11 @@ config BUILD_IPA_TESTS
	default y if UNIT_TEST_KERNEL_MODULES && MALI_DEVFREQ
	default n

+config BUILD_IPA_UNIT_TESTS
+	bool
+	default y if NO_MALI && BUILD_IPA_TESTS
+	default n
+
config BUILD_CSF_TESTS
	bool
	default y if UNIT_TEST_KERNEL_MODULES && GPU_HAS_CSF
diff --git a/mali_kbase/tests/mali_kutf_irq_test/build.bp b/mali_kbase/tests/mali_kutf_irq_test/build.bp
index a6669af..66f4eb3 100644
--- a/mali_kbase/tests/mali_kutf_irq_test/build.bp
+++ b/mali_kbase/tests/mali_kutf_irq_test/build.bp
@@ -21,7 +21,6 @@ bob_kernel_module {
        "mali_kbase",
        "kutf",
    ],
-    install_group: "IG_tests",
    enabled: false,
    base_build_kutf: {
        enabled: true,
diff --git a/mali_kbase/tests/sconscript b/mali_kbase/tests/sconscript
index 0bd24a5..ca64e83 100644
--- a/mali_kbase/tests/sconscript
+++ b/mali_kbase/tests/sconscript
@@ -1,5 +1,5 @@
#
-# (C) COPYRIGHT 2010-2011, 2013, 2017 ARM Limited. All rights reserved.
+# (C) COPYRIGHT 2010-2011, 2013, 2017-2018 ARM Limited. All rights reserved.
#
# This program is free software and is provided to you under the terms of the
# GNU General Public License version 2 as published by the Free Software
@@ -39,6 +39,5 @@ if kutf_env['debug'] == '1':
	SConscript('kutf_test_runner/sconscript')

if env['unit'] == '1':
-	SConscript('mali_kutf_ipa_test/sconscript')
	SConscript('mali_kutf_ipa_unit_test/sconscript')
	SConscript('mali_kutf_vinstr_test/sconscript')
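Looking back at the mali_midg_regmap.h hunk in this change, the TRANSCFG field macros pick up a ull suffix (and a new R_ALLOCATE bit) because the driver assembles the address-space TRANSCFG value as a single 64-bit quantity before splitting it across the 32-bit _LO/_HI register halves. The sketch below shows the kind of composition the unsigned 64-bit constants make straightforward; only the macro values are taken from the diff, while the helper, its parameter, and the particular field choices are hypothetical.

#include <stdint.h>

/* Values copied from the hunk above; the helper itself is illustrative. */
#define AS_TRANSCFG_PTW_MEMATTR_MASK       (3ull << 24)
#define AS_TRANSCFG_PTW_MEMATTR_WRITE_BACK (2ull << 24)
#define AS_TRANSCFG_PTW_SH_MASK            ((3ull << 28))
#define AS_TRANSCFG_PTW_SH_IS              (3ull << 28)
#define AS_TRANSCFG_R_ALLOCATE             (1ull << 30)

static uint64_t example_build_transcfg(uint64_t transcfg)
{
	/* Clear the page-table-walk attribute and shareability fields, then
	 * select write-back, inner-shareable walks and set the new
	 * R_ALLOCATE bit. The ull suffix keeps every intermediate value in
	 * unsigned 64-bit arithmetic, so masking the full 64-bit register
	 * value never depends on int sign extension.
	 */
	transcfg &= ~(AS_TRANSCFG_PTW_MEMATTR_MASK | AS_TRANSCFG_PTW_SH_MASK);
	transcfg |= AS_TRANSCFG_PTW_MEMATTR_WRITE_BACK |
		    AS_TRANSCFG_PTW_SH_IS |
		    AS_TRANSCFG_R_ALLOCATE;
	return transcfg;
}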