Merge pull request #843 from jeenu-arm/cas-lock

Introduce locking primitives using CAS instruction
diff --git a/Makefile b/Makefile
index 7b4e4f6..9f900db 100644
--- a/Makefile
+++ b/Makefile
@@ -111,7 +111,7 @@
 
 # Default build string (git branch and commit)
 ifeq (${BUILD_STRING},)
-        BUILD_STRING	:=	$(shell git log -n 1 --pretty=format:"%h")
+        BUILD_STRING	:=	$(shell git describe --always --dirty --tags 2> /dev/null)
 endif
 VERSION_STRING		:=	v${VERSION_MAJOR}.${VERSION_MINOR}(${BUILD_TYPE}):${BUILD_STRING}
 
@@ -346,11 +346,6 @@
         endif
 endif
 
-# Make sure PMF is enabled if PSCI STAT is enabled.
-ifeq (${ENABLE_PSCI_STAT},1)
-ENABLE_PMF			:= 1
-endif
-
 ifneq (${FIP_ALIGN},0)
 FIP_ARGS += --align ${FIP_ALIGN}
 endif
diff --git a/bl1/bl1.ld.S b/bl1/bl1.ld.S
index b9554d1..b69065e 100644
--- a/bl1/bl1.ld.S
+++ b/bl1/bl1.ld.S
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013-2016, ARM Limited and Contributors. All rights reserved.
+ * Copyright (c) 2013-2017, ARM Limited and Contributors. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -133,7 +133,8 @@
 
     /*
      * The .bss section gets initialised to 0 at runtime.
-     * Its base address must be 16-byte aligned.
+     * Its base address should be 16-byte aligned for better performance of the
+     * zero-initialization code.
      */
     .bss : ALIGN(16) {
         __BSS_START__ = .;
diff --git a/bl1/bl1_fwu.c b/bl1/bl1_fwu.c
index 1cc7daf..f7fae68 100644
--- a/bl1/bl1_fwu.c
+++ b/bl1/bl1_fwu.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2016, ARM Limited and Contributors. All rights reserved.
+ * Copyright (c) 2015-2017, ARM Limited and Contributors. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -335,7 +335,7 @@
 		 */
 		if (image_desc->state == IMAGE_STATE_COPIED) {
 			/* Clear the memory.*/
-			memset((void *)base_addr, 0, total_size);
+			zero_normalmem((void *)base_addr, total_size);
 			flush_dcache_range(base_addr, total_size);
 
 			/* Indicate that image can be copied again*/
diff --git a/bl2/aarch64/bl2_entrypoint.S b/bl2/aarch64/bl2_entrypoint.S
index 25363ac..31f7787 100644
--- a/bl2/aarch64/bl2_entrypoint.S
+++ b/bl2/aarch64/bl2_entrypoint.S
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013-2016, ARM Limited and Contributors. All rights reserved.
+ * Copyright (c) 2013-2017, ARM Limited and Contributors. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -94,12 +94,12 @@
 	 */
 	ldr	x0, =__BSS_START__
 	ldr	x1, =__BSS_SIZE__
-	bl	zeromem16
+	bl	zeromem
 
 #if USE_COHERENT_MEM
 	ldr	x0, =__COHERENT_RAM_START__
 	ldr	x1, =__COHERENT_RAM_UNALIGNED_SIZE__
-	bl	zeromem16
+	bl	zeromem
 #endif
 
 	/* --------------------------------------------
diff --git a/bl2/bl2.ld.S b/bl2/bl2.ld.S
index fa694de..b9275f3 100644
--- a/bl2/bl2.ld.S
+++ b/bl2/bl2.ld.S
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013-2014, ARM Limited and Contributors. All rights reserved.
+ * Copyright (c) 2013-2017, ARM Limited and Contributors. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -113,7 +113,8 @@
 
     /*
      * The .bss section gets initialised to 0 at runtime.
-     * Its base address must be 16-byte aligned.
+     * Its base address should be 16-byte aligned for better performance of the
+     * zero-initialization code.
      */
     .bss : ALIGN(16) {
         __BSS_START__ = .;
diff --git a/bl2u/aarch64/bl2u_entrypoint.S b/bl2u/aarch64/bl2u_entrypoint.S
index 1175c6f..9fa84bf 100644
--- a/bl2u/aarch64/bl2u_entrypoint.S
+++ b/bl2u/aarch64/bl2u_entrypoint.S
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2016, ARM Limited and Contributors. All rights reserved.
+ * Copyright (c) 2015-2017, ARM Limited and Contributors. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -94,7 +94,7 @@
 	 */
 	ldr	x0, =__BSS_START__
 	ldr	x1, =__BSS_SIZE__
-	bl	zeromem16
+	bl	zeromem
 
 	/* --------------------------------------------
 	 * Allocate a stack whose memory will be marked
diff --git a/bl2u/bl2u.ld.S b/bl2u/bl2u.ld.S
index d72589f..91e8556 100644
--- a/bl2u/bl2u.ld.S
+++ b/bl2u/bl2u.ld.S
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015, ARM Limited and Contributors. All rights reserved.
+ * Copyright (c) 2015-2017, ARM Limited and Contributors. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -100,7 +100,8 @@
 
     /*
      * The .bss section gets initialised to 0 at runtime.
-     * Its base address must be 16-byte aligned.
+     * Its base address should be 16-byte aligned for better performance of the
+     * zero-initialization code.
      */
     .bss : ALIGN(16) {
         __BSS_START__ = .;
diff --git a/bl31/bl31.ld.S b/bl31/bl31.ld.S
index 9a05e6c..e5d6232 100644
--- a/bl31/bl31.ld.S
+++ b/bl31/bl31.ld.S
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013-2016, ARM Limited and Contributors. All rights reserved.
+ * Copyright (c) 2013-2017, ARM Limited and Contributors. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -158,7 +158,8 @@
 
     /*
      * The .bss section gets initialised to 0 at runtime.
-     * Its base address must be 16-byte aligned.
+     * Its base address should be 16-byte aligned for better performance of the
+     * zero-initialization code.
      */
     .bss (NOLOAD) : ALIGN(16) {
         __BSS_START__ = .;
diff --git a/bl32/sp_min/sp_min.ld.S b/bl32/sp_min/sp_min.ld.S
index e0e23e8..f1d4d0b 100644
--- a/bl32/sp_min/sp_min.ld.S
+++ b/bl32/sp_min/sp_min.ld.S
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, ARM Limited and Contributors. All rights reserved.
+ * Copyright (c) 2016-2017, ARM Limited and Contributors. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -134,9 +134,10 @@
 
     /*
      * The .bss section gets initialised to 0 at runtime.
-     * Its base address must be 16-byte aligned.
+     * Its base address should be 8-byte aligned for better performance of the
+     * zero-initialization code.
      */
-    .bss (NOLOAD) : ALIGN(16) {
+    .bss (NOLOAD) : ALIGN(8) {
         __BSS_START__ = .;
         *(.bss*)
         *(COMMON)
diff --git a/bl32/sp_min/sp_min_main.c b/bl32/sp_min/sp_min_main.c
index 02663a2..f34716e 100644
--- a/bl32/sp_min/sp_min_main.c
+++ b/bl32/sp_min/sp_min_main.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, ARM Limited and Contributors. All rights reserved.
+ * Copyright (c) 2016-2017, ARM Limited and Contributors. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -45,6 +45,7 @@
 #include <stdint.h>
 #include <string.h>
 #include <types.h>
+#include <utils.h>
 #include "sp_min_private.h"
 
 /* Pointers to per-core cpu contexts */
@@ -203,7 +204,7 @@
 	smc_set_next_ctx(NON_SECURE);
 
 	next_smc_ctx = smc_get_next_ctx();
-	memset(next_smc_ctx, 0, sizeof(smc_ctx_t));
+	zeromem(next_smc_ctx, sizeof(smc_ctx_t));
 
 	copy_cpu_ctx_to_smc_stx(get_regs_ctx(cm_get_context(NON_SECURE)),
 			next_smc_ctx);
diff --git a/bl32/tsp/aarch64/tsp_entrypoint.S b/bl32/tsp/aarch64/tsp_entrypoint.S
index 4c296d4..bdb882a 100644
--- a/bl32/tsp/aarch64/tsp_entrypoint.S
+++ b/bl32/tsp/aarch64/tsp_entrypoint.S
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013-2016, ARM Limited and Contributors. All rights reserved.
+ * Copyright (c) 2013-2017, ARM Limited and Contributors. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -120,12 +120,12 @@
 	 */
 	ldr	x0, =__BSS_START__
 	ldr	x1, =__BSS_SIZE__
-	bl	zeromem16
+	bl	zeromem
 
 #if USE_COHERENT_MEM
 	ldr	x0, =__COHERENT_RAM_START__
 	ldr	x1, =__COHERENT_RAM_UNALIGNED_SIZE__
-	bl	zeromem16
+	bl	zeromem
 #endif
 
 	/* --------------------------------------------
diff --git a/bl32/tsp/tsp.ld.S b/bl32/tsp/tsp.ld.S
index 7e24f66..d93e3bb 100644
--- a/bl32/tsp/tsp.ld.S
+++ b/bl32/tsp/tsp.ld.S
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013-2014, ARM Limited and Contributors. All rights reserved.
+ * Copyright (c) 2013-2017, ARM Limited and Contributors. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -104,7 +104,8 @@
 
     /*
      * The .bss section gets initialised to 0 at runtime.
-     * Its base address must be 16-byte aligned.
+     * Its base address should be 16-byte aligned for better performance of the
+     * zero-initialization code.
      */
     .bss : ALIGN(16) {
         __BSS_START__ = .;
diff --git a/common/bl_common.c b/common/bl_common.c
index 47bdad5..1d66530 100644
--- a/common/bl_common.c
+++ b/common/bl_common.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013-2016, ARM Limited and Contributors. All rights reserved.
+ * Copyright (c) 2013-2017, ARM Limited and Contributors. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -347,7 +347,7 @@
 				 image_data->image_size);
 	if (rc != 0) {
 		/* Authentication error, zero memory and flush it right away. */
-		memset((void *)image_data->image_base, 0x00,
+		zero_normalmem((void *)image_data->image_base,
 		       image_data->image_size);
 		flush_dcache_range(image_data->image_base,
 				   image_data->image_size);
@@ -543,7 +543,7 @@
 				 image_data->image_size);
 	if (rc != 0) {
 		/* Authentication error, zero memory and flush it right away. */
-		memset((void *)image_data->image_base, 0x00,
+		zero_normalmem((void *)image_data->image_base,
 		       image_data->image_size);
 		flush_dcache_range(image_data->image_base,
 				   image_data->image_size);
diff --git a/docs/firmware-design.md b/docs/firmware-design.md
index 0fdc941..523fa55 100644
--- a/docs/firmware-design.md
+++ b/docs/firmware-design.md
@@ -1343,7 +1343,7 @@
 
 The following linker symbols are defined for this purpose:
 
-*   `__BSS_START__`          Must be aligned on a 16-byte boundary.
+*   `__BSS_START__`
 *   `__BSS_SIZE__`
 *   `__COHERENT_RAM_START__` Must be aligned on a page-size boundary.
 *   `__COHERENT_RAM_END__`   Must be aligned on a page-size boundary.
diff --git a/docs/porting-guide.md b/docs/porting-guide.md
index e8486f1..a5e5966 100644
--- a/docs/porting-guide.md
+++ b/docs/porting-guide.md
@@ -1707,9 +1707,55 @@
 convert the power-state parameter (possibly encoding a composite power state)
 passed in a PSCI `CPU_SUSPEND` call to this representation.
 
-The following functions must be implemented to initialize PSCI functionality in
-the ARM Trusted Firmware.
+The following functions form part of the platform port of PSCI functionality.
+
+
+### Function : plat_psci_stat_accounting_start() [optional]
+
+    Argument : const psci_power_state_t *
+    Return   : void
+
+This is an optional hook that platforms can implement for residency statistics
+accounting before entering a low power state.  The `pwr_domain_state` field of
+`state_info` (first argument) can be inspected if stat accounting is done
+differently at CPU level versus higher levels.  As an example, if the element at
+index 0 (CPU power level) in the `pwr_domain_state` array indicates a power down
+state, special hardware logic may be programmed in order to keep track of the
+residency statistics.  For higher levels (array indices > 0), the residency
+statistics could be tracked in software using PMF.  If `ENABLE_PMF` is set, the
+default implementation will use PMF to capture timestamps.
+
+### Function : plat_psci_stat_accounting_stop() [optional]
+
+    Argument : const psci_power_state_t *
+    Return   : void
+
+This is an optional hook that platforms can implement for residency statistics
+accounting after exiting from a low power state.  The `pwr_domain_state` field
+of `state_info` (first argument) can be inspected if stat accounting is done
+differently at CPU level versus higher levels.  As an example, if the element at
+index 0 (CPU power level) in the `pwr_domain_state` array indicates a power down
+state, special hardware logic may be programmed in order to keep track of the
+residency statistics.  For higher levels (array indices > 0), the residency
+statistics could be tracked in software using PMF.  If `ENABLE_PMF` is set, the
+default implementation will use PMF to capture timestamps.
+
+### Function : plat_psci_stat_get_residency() [optional]
+
+    Argument : unsigned int, const psci_power_state_t *, int
+    Return   : u_register_t
 
+This is an optional interface that is invoked after resuming from a low power
+state and provides the time spent resident in that low power state by the power
+domain at a particular power domain level.  When a CPU wakes up from suspend,
+all its parent power domain levels are also woken up.  The generic PSCI code
+invokes this function for each parent power domain that is resumed and it is
+identified by the `lvl` (first argument) parameter.  The `state_info` (second
+argument) describes the low power state that the power domain has resumed from.
+The current CPU is the first CPU in the power domain to resume from the low
+power state and the `last_cpu_idx` (third argument) is the index of the last
+CPU in the power domain to suspend and may be needed to calculate the residency
+for that power domain.
 
 ### Function : plat_get_target_pwr_state() [optional]
 
diff --git a/docs/user-guide.md b/docs/user-guide.md
index 7ae5b9c..091aeba 100644
--- a/docs/user-guide.md
+++ b/docs/user-guide.md
@@ -37,6 +37,9 @@
 *   Linux kernel image
 *   Root filesystem
 
+Note: the ARM TF v1.3 release was tested with Linaro Release 16.06, and the
+latest version of ARM TF is tested with Linaro Release 16.12.
+
 This document also assumes that the user is familiar with the FVP models and
 the different command line options available to launch the model.
 
@@ -282,8 +285,9 @@
 
 *   `ENABLE_PSCI_STAT`: Boolean option to enable support for optional PSCI
      functions `PSCI_STAT_RESIDENCY` and `PSCI_STAT_COUNT`. Default is 0.
-     Enabling this option enables the `ENABLE_PMF` build option as well.
-     The PMF is used for collecting the statistics.
+     In the absence of an alternate stat collection backend, `ENABLE_PMF` must
+     be enabled. If `ENABLE_PMF` is set, the residency statistics are tracked in
+     software.
 
 *   `ENABLE_RUNTIME_INSTRUMENTATION`: Boolean option to enable runtime
     instrumentation which injects timestamp collection points into
@@ -886,8 +890,8 @@
 a single FIP binary. It assumes that a [Linaro Release][Linaro Release Notes]
 has been installed.
 
-Note currently [Linaro Release][Linaro Release Notes] only includes pre-built
-binaries for AArch64. For AArch32, pre-built binaries are not available.
+Note: Linaro Release 16.06 only includes pre-built binaries for AArch64. For
+AArch32, pre-built binaries are only available from Linaro Release 16.12.
 
 Note: follow the full instructions for one platform before switching to a
 different one. Mixing instructions for different platforms may result in
@@ -1139,6 +1143,9 @@
 parameter options. A brief description of the important ones that affect the ARM
 Trusted Firmware and normal world software behavior is provided below.
 
+Note: the instructions in the following sections assume that Linaro Release 16.06
+is being used.
+
 ### Obtaining the Flattened Device Trees
 
 Depending on the FVP configuration and Linux configuration used, different
diff --git a/drivers/auth/mbedtls/mbedtls_x509_parser.c b/drivers/auth/mbedtls/mbedtls_x509_parser.c
index f9485de..36c279f 100644
--- a/drivers/auth/mbedtls/mbedtls_x509_parser.c
+++ b/drivers/auth/mbedtls/mbedtls_x509_parser.c
@@ -43,6 +43,7 @@
 #include <stddef.h>
 #include <stdint.h>
 #include <string.h>
+#include <utils.h>
 
 /* mbed TLS headers */
 #include <mbedtls/asn1.h>
@@ -71,7 +72,7 @@
 {
 #define ZERO_AND_CLEAN(x)					\
 	do {							\
-		memset(&x, 0, sizeof(x));			\
+		zeromem(&x, sizeof(x));				\
 		clean_dcache_range((uintptr_t)&x, sizeof(x));	\
 	} while (0);
 
@@ -111,7 +112,7 @@
 			     MBEDTLS_ASN1_SEQUENCE);
 
 	while (p < end) {
-		memset(&extn_oid, 0x0, sizeof(extn_oid));
+		zeromem(&extn_oid, sizeof(extn_oid));
 		is_critical = 0; /* DEFAULT FALSE */
 
 		mbedtls_asn1_get_tag(&p, end, &len, MBEDTLS_ASN1_CONSTRUCTED |
diff --git a/drivers/emmc/emmc.c b/drivers/emmc/emmc.c
index 3fae2a1..1c1ea82 100644
--- a/drivers/emmc/emmc.c
+++ b/drivers/emmc/emmc.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, ARM Limited and Contributors. All rights reserved.
+ * Copyright (c) 2016-2017, ARM Limited and Contributors. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -36,6 +36,7 @@
 #include <emmc.h>
 #include <errno.h>
 #include <string.h>
+#include <utils.h>
 
 static const emmc_ops_t *ops;
 static unsigned int emmc_ocr_value;
@@ -53,7 +54,7 @@
 	int ret;
 
 	do {
-		memset(&cmd, 0, sizeof(emmc_cmd_t));
+		zeromem(&cmd, sizeof(emmc_cmd_t));
 		cmd.cmd_idx = EMMC_CMD13;
 		cmd.cmd_arg = EMMC_FIX_RCA << RCA_SHIFT_OFFSET;
 		cmd.resp_type = EMMC_RESPONSE_R1;
@@ -71,7 +72,7 @@
 	emmc_cmd_t cmd;
 	int ret, state;
 
-	memset(&cmd, 0, sizeof(emmc_cmd_t));
+	zeromem(&cmd, sizeof(emmc_cmd_t));
 	cmd.cmd_idx = EMMC_CMD6;
 	cmd.cmd_arg = EXTCSD_WRITE_BYTES | EXTCSD_CMD(ext_cmd) |
 		      EXTCSD_VALUE(value) | 1;
@@ -107,14 +108,14 @@
 	ops->init();
 
 	/* CMD0: reset to IDLE */
-	memset(&cmd, 0, sizeof(emmc_cmd_t));
+	zeromem(&cmd, sizeof(emmc_cmd_t));
 	cmd.cmd_idx = EMMC_CMD0;
 	ret = ops->send_cmd(&cmd);
 	assert(ret == 0);
 
 	while (1) {
 		/* CMD1: get OCR register */
-		memset(&cmd, 0, sizeof(emmc_cmd_t));
+		zeromem(&cmd, sizeof(emmc_cmd_t));
 		cmd.cmd_idx = EMMC_CMD1;
 		cmd.cmd_arg = OCR_SECTOR_MODE | OCR_VDD_MIN_2V7 |
 			      OCR_VDD_MIN_1V7;
@@ -127,14 +128,14 @@
 	}
 
 	/* CMD2: Card Identification */
-	memset(&cmd, 0, sizeof(emmc_cmd_t));
+	zeromem(&cmd, sizeof(emmc_cmd_t));
 	cmd.cmd_idx = EMMC_CMD2;
 	cmd.resp_type = EMMC_RESPONSE_R2;
 	ret = ops->send_cmd(&cmd);
 	assert(ret == 0);
 
 	/* CMD3: Set Relative Address */
-	memset(&cmd, 0, sizeof(emmc_cmd_t));
+	zeromem(&cmd, sizeof(emmc_cmd_t));
 	cmd.cmd_idx = EMMC_CMD3;
 	cmd.cmd_arg = EMMC_FIX_RCA << RCA_SHIFT_OFFSET;
 	cmd.resp_type = EMMC_RESPONSE_R1;
@@ -142,7 +143,7 @@
 	assert(ret == 0);
 
 	/* CMD9: CSD Register */
-	memset(&cmd, 0, sizeof(emmc_cmd_t));
+	zeromem(&cmd, sizeof(emmc_cmd_t));
 	cmd.cmd_idx = EMMC_CMD9;
 	cmd.cmd_arg = EMMC_FIX_RCA << RCA_SHIFT_OFFSET;
 	cmd.resp_type = EMMC_RESPONSE_R2;
@@ -151,7 +152,7 @@
 	memcpy(&emmc_csd, &cmd.resp_data, sizeof(cmd.resp_data));
 
 	/* CMD7: Select Card */
-	memset(&cmd, 0, sizeof(emmc_cmd_t));
+	zeromem(&cmd, sizeof(emmc_cmd_t));
 	cmd.cmd_idx = EMMC_CMD7;
 	cmd.cmd_arg = EMMC_FIX_RCA << RCA_SHIFT_OFFSET;
 	cmd.resp_type = EMMC_RESPONSE_R1;
@@ -181,7 +182,7 @@
 	assert(ret == 0);
 
 	if (is_cmd23_enabled()) {
-		memset(&cmd, 0, sizeof(emmc_cmd_t));
+		zeromem(&cmd, sizeof(emmc_cmd_t));
 		/* set block count */
 		cmd.cmd_idx = EMMC_CMD23;
 		cmd.cmd_arg = size / EMMC_BLOCK_SIZE;
@@ -189,7 +190,7 @@
 		ret = ops->send_cmd(&cmd);
 		assert(ret == 0);
 
-		memset(&cmd, 0, sizeof(emmc_cmd_t));
+		zeromem(&cmd, sizeof(emmc_cmd_t));
 		cmd.cmd_idx = EMMC_CMD18;
 	} else {
 		if (size > EMMC_BLOCK_SIZE)
@@ -213,7 +214,7 @@
 
 	if (is_cmd23_enabled() == 0) {
 		if (size > EMMC_BLOCK_SIZE) {
-			memset(&cmd, 0, sizeof(emmc_cmd_t));
+			zeromem(&cmd, sizeof(emmc_cmd_t));
 			cmd.cmd_idx = EMMC_CMD12;
 			ret = ops->send_cmd(&cmd);
 			assert(ret == 0);
@@ -240,17 +241,17 @@
 
 	if (is_cmd23_enabled()) {
 		/* set block count */
-		memset(&cmd, 0, sizeof(emmc_cmd_t));
+		zeromem(&cmd, sizeof(emmc_cmd_t));
 		cmd.cmd_idx = EMMC_CMD23;
 		cmd.cmd_arg = size / EMMC_BLOCK_SIZE;
 		cmd.resp_type = EMMC_RESPONSE_R1;
 		ret = ops->send_cmd(&cmd);
 		assert(ret == 0);
 
-		memset(&cmd, 0, sizeof(emmc_cmd_t));
+		zeromem(&cmd, sizeof(emmc_cmd_t));
 		cmd.cmd_idx = EMMC_CMD25;
 	} else {
-		memset(&cmd, 0, sizeof(emmc_cmd_t));
+		zeromem(&cmd, sizeof(emmc_cmd_t));
 		if (size > EMMC_BLOCK_SIZE)
 			cmd.cmd_idx = EMMC_CMD25;
 		else
@@ -272,7 +273,7 @@
 
 	if (is_cmd23_enabled() == 0) {
 		if (size > EMMC_BLOCK_SIZE) {
-			memset(&cmd, 0, sizeof(emmc_cmd_t));
+			zeromem(&cmd, sizeof(emmc_cmd_t));
 			cmd.cmd_idx = EMMC_CMD12;
 			ret = ops->send_cmd(&cmd);
 			assert(ret == 0);
@@ -291,21 +292,21 @@
 	assert(ops != 0);
 	assert((size != 0) && ((size % EMMC_BLOCK_SIZE) == 0));
 
-	memset(&cmd, 0, sizeof(emmc_cmd_t));
+	zeromem(&cmd, sizeof(emmc_cmd_t));
 	cmd.cmd_idx = EMMC_CMD35;
 	cmd.cmd_arg = lba;
 	cmd.resp_type = EMMC_RESPONSE_R1;
 	ret = ops->send_cmd(&cmd);
 	assert(ret == 0);
 
-	memset(&cmd, 0, sizeof(emmc_cmd_t));
+	zeromem(&cmd, sizeof(emmc_cmd_t));
 	cmd.cmd_idx = EMMC_CMD36;
 	cmd.cmd_arg = lba + (size / EMMC_BLOCK_SIZE) - 1;
 	cmd.resp_type = EMMC_RESPONSE_R1;
 	ret = ops->send_cmd(&cmd);
 	assert(ret == 0);
 
-	memset(&cmd, 0, sizeof(emmc_cmd_t));
+	zeromem(&cmd, sizeof(emmc_cmd_t));
 	cmd.cmd_idx = EMMC_CMD38;
 	cmd.resp_type = EMMC_RESPONSE_R1B;
 	ret = ops->send_cmd(&cmd);
diff --git a/drivers/io/io_block.c b/drivers/io/io_block.c
index 4ec59bc..a855581 100644
--- a/drivers/io/io_block.c
+++ b/drivers/io/io_block.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, ARM Limited and Contributors. All rights reserved.
+ * Copyright (c) 2016-2017, ARM Limited and Contributors. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -36,6 +36,7 @@
 #include <io_storage.h>
 #include <platform_def.h>
 #include <string.h>
+#include <utils.h>
 
 typedef struct {
 	io_block_dev_spec_t	*dev_spec;
@@ -135,8 +136,8 @@
 	result = find_first_block_state(state->dev_spec, &index);
 	if (result ==  0) {
 		/* free if device info is valid */
-		memset(state, 0, sizeof(block_dev_state_t));
-		memset(dev_info, 0, sizeof(io_dev_info_t));
+		zeromem(state, sizeof(block_dev_state_t));
+		zeromem(dev_info, sizeof(io_dev_info_t));
 		--block_dev_count;
 	}
 
diff --git a/drivers/io/io_fip.c b/drivers/io/io_fip.c
index 99cf15b..6724fc3 100644
--- a/drivers/io/io_fip.c
+++ b/drivers/io/io_fip.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2014, ARM Limited and Contributors. All rights reserved.
+ * Copyright (c) 2014-2017, ARM Limited and Contributors. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -40,6 +40,7 @@
 #include <platform_def.h>
 #include <stdint.h>
 #include <string.h>
+#include <utils.h>
 #include <uuid.h>
 
 /* Useful for printing UUIDs when debugging.*/
@@ -351,7 +352,7 @@
 	 * If we had malloc() we would free() here.
 	 */
 	if (current_file.entry.offset_address != 0) {
-		memset(&current_file, 0, sizeof(current_file));
+		zeromem(&current_file, sizeof(current_file));
 	}
 
 	/* Clear the Entity info. */
diff --git a/drivers/io/io_memmap.c b/drivers/io/io_memmap.c
index fe39652..5104fb1 100644
--- a/drivers/io/io_memmap.c
+++ b/drivers/io/io_memmap.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2014, ARM Limited and Contributors. All rights reserved.
+ * Copyright (c) 2014-2017, ARM Limited and Contributors. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -33,6 +33,7 @@
 #include <io_driver.h>
 #include <io_storage.h>
 #include <string.h>
+#include <utils.h>
 
 /* As we need to be able to keep state for seek, only one file can be open
  * at a time. Make this a structure and point to the entity->info. When we
@@ -118,13 +119,14 @@
 
 
 /* Open a file on the memmap device */
-/* TODO: Can we do any sensible limit checks on requested memory */
 static int memmap_block_open(io_dev_info_t *dev_info, const uintptr_t spec,
 			     io_entity_t *entity)
 {
 	int result = -ENOMEM;
 	const io_block_spec_t *block_spec = (io_block_spec_t *)spec;
 
+	assert(block_spec->length >= 0);
+
 	/* Since we need to track open state for seek() we only allow one open
 	 * spec at a time. When we have dynamic memory we can malloc and set
 	 * entity->info.
@@ -152,13 +154,19 @@
 static int memmap_block_seek(io_entity_t *entity, int mode, ssize_t offset)
 {
 	int result = -ENOENT;
+	file_state_t *fp;
 
 	/* We only support IO_SEEK_SET for the moment. */
 	if (mode == IO_SEEK_SET) {
 		assert(entity != NULL);
 
-		/* TODO: can we do some basic limit checks on seek? */
-		((file_state_t *)entity->info)->file_pos = offset;
+		fp = (file_state_t *) entity->info;
+
+		/* Assert that new file position is valid */
+		assert((offset >= 0) && (offset < fp->size));
+
+		/* Reset file position */
+		fp->file_pos = offset;
 		result = 0;
 	}
 
@@ -183,18 +191,24 @@
 			     size_t length, size_t *length_read)
 {
 	file_state_t *fp;
+	size_t pos_after;
 
 	assert(entity != NULL);
 	assert(buffer != (uintptr_t)NULL);
 	assert(length_read != NULL);
 
-	fp = (file_state_t *)entity->info;
+	fp = (file_state_t *) entity->info;
+
+	/* Assert that file position is valid for this read operation */
+	pos_after = fp->file_pos + length;
+	assert((pos_after >= fp->file_pos) && (pos_after <= fp->size));
 
 	memcpy((void *)buffer, (void *)(fp->base + fp->file_pos), length);
 
 	*length_read = length;
-	/* advance the file 'cursor' for incremental reads */
-	fp->file_pos += length;
+
+	/* Set file position after read */
+	fp->file_pos = pos_after;
 
 	return 0;
 }
@@ -205,19 +219,24 @@
 			      size_t length, size_t *length_written)
 {
 	file_state_t *fp;
+	size_t pos_after;
 
 	assert(entity != NULL);
 	assert(buffer != (uintptr_t)NULL);
 	assert(length_written != NULL);
 
-	fp = (file_state_t *)entity->info;
+	fp = (file_state_t *) entity->info;
+
+	/* Assert that file position is valid for this write operation */
+	pos_after = fp->file_pos + length;
+	assert((pos_after >= fp->file_pos) && (pos_after <= fp->size));
 
 	memcpy((void *)(fp->base + fp->file_pos), (void *)buffer, length);
 
 	*length_written = length;
 
-	/* advance the file 'cursor' for incremental writes */
-	fp->file_pos += length;
+	/* Set file position after write */
+	fp->file_pos = pos_after;
 
 	return 0;
 }
@@ -231,7 +250,7 @@
 	entity->info = 0;
 
 	/* This would be a mem free() if we had malloc.*/
-	memset((void *)&current_file, 0, sizeof(current_file));
+	zeromem((void *)&current_file, sizeof(current_file));
 
 	return 0;
 }
diff --git a/drivers/io/io_semihosting.c b/drivers/io/io_semihosting.c
index 30ca99c..e33a044 100644
--- a/drivers/io/io_semihosting.c
+++ b/drivers/io/io_semihosting.c
@@ -95,7 +95,7 @@
 		const uintptr_t spec, io_entity_t *entity)
 {
 	int result = -ENOENT;
-	long sh_result = -1;
+	long sh_result;
 	const io_file_spec_t *file_spec = (const io_file_spec_t *)spec;
 
 	assert(file_spec != NULL);
@@ -151,7 +151,7 @@
 		size_t *length_read)
 {
 	int result = -ENOENT;
-	long sh_result = -1;
+	long sh_result;
 	size_t bytes = length;
 	long file_handle;
 
@@ -176,7 +176,7 @@
 static int sh_file_write(io_entity_t *entity, const uintptr_t buffer,
 		size_t length, size_t *length_written)
 {
-	long sh_result = -1;
+	long sh_result;
 	long file_handle;
 	size_t bytes = length;
 
@@ -197,7 +197,7 @@
 /* Close a file on the semi-hosting device */
 static int sh_file_close(io_entity_t *entity)
 {
-	long sh_result = -1;
+	long sh_result;
 	long file_handle;
 
 	assert(entity != NULL);
diff --git a/drivers/partition/gpt.c b/drivers/partition/gpt.c
index 9240d5a..05f13f3 100644
--- a/drivers/partition/gpt.c
+++ b/drivers/partition/gpt.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, ARM Limited and Contributors. All rights reserved.
+ * Copyright (c) 2016-2017, ARM Limited and Contributors. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -33,6 +33,7 @@
 #include <errno.h>
 #include <gpt.h>
 #include <string.h>
+#include <utils.h>
 
 static int unicode_to_ascii(unsigned short *str_in, unsigned char *str_out)
 {
@@ -65,7 +66,7 @@
 		return -EINVAL;
 	}
 
-	memset(entry, 0, sizeof(partition_entry_t));
+	zeromem(entry, sizeof(partition_entry_t));
 	result = unicode_to_ascii(gpt_entry->name, (uint8_t *)entry->name);
 	if (result != 0) {
 		return result;
diff --git a/include/common/aarch32/el3_common_macros.S b/include/common/aarch32/el3_common_macros.S
index 463a080..f6b7527 100644
--- a/include/common/aarch32/el3_common_macros.S
+++ b/include/common/aarch32/el3_common_macros.S
@@ -98,6 +98,11 @@
 	orr	r0, r0, #FPEXC_EN_BIT
 	vmsr	FPEXC, r0
 	isb
+
+	/* Disable secure self-hosted invasive debug. */
+	ldr	r0, =SDCR_DEF_VAL
+	stcopr	r0, SDCR
+
 	.endm
 
 /* -----------------------------------------------------------------------------
diff --git a/include/common/aarch64/el3_common_macros.S b/include/common/aarch64/el3_common_macros.S
index cbfa6ee..e085f9f 100644
--- a/include/common/aarch64/el3_common_macros.S
+++ b/include/common/aarch64/el3_common_macros.S
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2016, ARM Limited and Contributors. All rights reserved.
+ * Copyright (c) 2015-2017, ARM Limited and Contributors. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -79,10 +79,11 @@
 	msr	scr_el3, x0
 
 	/* ---------------------------------------------------------------------
-	 * Reset registers that may have architecturally unknown reset values
+	 * Disable secure self-hosted invasive debug.
 	 * ---------------------------------------------------------------------
 	 */
-	msr	mdcr_el3, xzr
+	mov_imm	x0, MDCR_DEF_VAL
+	msr	mdcr_el3, x0
 
 	/* ---------------------------------------------------------------------
 	 * Enable External Aborts and SError Interrupts now that the exception
@@ -252,12 +253,12 @@
 
 		ldr	x0, =__BSS_START__
 		ldr	x1, =__BSS_SIZE__
-		bl	zeromem16
+		bl	zeromem
 
 #if USE_COHERENT_MEM
 		ldr	x0, =__COHERENT_RAM_START__
 		ldr	x1, =__COHERENT_RAM_UNALIGNED_SIZE__
-		bl	zeromem16
+		bl	zeromem
 #endif
 
 #ifdef IMAGE_BL1
diff --git a/include/lib/aarch32/arch.h b/include/lib/aarch32/arch.h
index 170fa84..8525c7b 100644
--- a/include/lib/aarch32/arch.h
+++ b/include/lib/aarch32/arch.h
@@ -125,6 +125,14 @@
 #define SCTLR_AFE_BIT		(1 << 29)
 #define SCTLR_TE_BIT		(1 << 30)
 
+/* SDCR definitions */
+#define SDCR_SPD(x)		((x) << 14)
+#define SDCR_SPD_LEGACY		0x0
+#define SDCR_SPD_DISABLE	0x2
+#define SDCR_SPD_ENABLE		0x3
+
+#define SDCR_DEF_VAL		SDCR_SPD(SDCR_SPD_DISABLE)
+
 /* HSCTLR definitions */
 #define HSCTLR_RES1 	((1 << 29) | (1 << 28) | (1 << 23) | (1 << 22)	\
 			| (1 << 18) | (1 << 16) | (1 << 11) | (1 << 4)	\
@@ -345,6 +353,7 @@
 /* System register defines The format is: coproc, opt1, CRn, CRm, opt2 */
 #define SCR		p15, 0, c1, c1, 0
 #define SCTLR		p15, 0, c1, c0, 0
+#define SDCR		p15, 0, c1, c3, 1
 #define MPIDR		p15, 0, c0, c0, 5
 #define MIDR		p15, 0, c0, c0, 0
 #define VBAR		p15, 0, c12, c0, 0
diff --git a/include/lib/aarch64/arch.h b/include/lib/aarch64/arch.h
index 3f71824..5876ce8 100644
--- a/include/lib/aarch64/arch.h
+++ b/include/lib/aarch64/arch.h
@@ -195,6 +195,15 @@
 #define SCR_NS_BIT		(1 << 0)
 #define SCR_VALID_BIT_MASK	0x2f8f
 
+/* MDCR definitions */
+#define MDCR_SPD32(x)		((x) << 14)
+#define MDCR_SPD32_LEGACY	0x0
+#define MDCR_SPD32_DISABLE	0x2
+#define MDCR_SPD32_ENABLE	0x3
+#define MDCR_SDD_BIT		(1 << 16)
+
+#define MDCR_DEF_VAL		(MDCR_SDD_BIT | MDCR_SPD32(MDCR_SPD32_DISABLE))
+
 /* HCR definitions */
 #define HCR_RW_BIT		(1ull << 31)
 #define HCR_AMO_BIT		(1 << 5)
diff --git a/include/lib/utils.h b/include/lib/utils.h
index b6bc9af..69bbb43 100644
--- a/include/lib/utils.h
+++ b/include/lib/utils.h
@@ -80,4 +80,35 @@
 # define ULL(_x)	(_x##ull)
 #endif
 
+/*
+ * C code should be put in this part of the header to avoid breaking ASM files
+ * or linker scripts including it.
+ */
+#if !(defined(__LINKER__) || defined(__ASSEMBLY__))
+
+#include <types.h>
+
+/*
+ * Fill a region of normal memory of size "length" in bytes with zero bytes.
+ *
+ * WARNING: This function can only operate on normal memory. This means that
+ *          the MMU must be enabled when using this function. Otherwise, use
+ *          zeromem.
+ */
+void zero_normalmem(void *mem, u_register_t length);
+
+/*
+ * Fill a region of memory of size "length" in bytes with null bytes.
+ *
+ * Unlike zero_normalmem, this function has no restriction on the type of
+ * memory targeted and can be used for any device memory as well as normal
+ * memory. This function must be used instead of zero_normalmem when MMU is
+ * disabled.
+ *
+ * NOTE: When data cache and MMU are enabled, prefer zero_normalmem for faster
+ *       zeroing.
+ */
+void zeromem(void *mem, u_register_t length);
+#endif /* !(defined(__LINKER__) || defined(__ASSEMBLY__)) */
+
 #endif /* __UTILS_H__ */
diff --git a/include/plat/arm/css/common/css_def.h b/include/plat/arm/css/common/css_def.h
index a2fe0d5..7cfaf59 100644
--- a/include/plat/arm/css/common/css_def.h
+++ b/include/plat/arm/css/common/css_def.h
@@ -101,6 +101,13 @@
 #define SSC_VERSION_DESIGNER_ID_MASK		0xff
 #define SSC_VERSION_PART_NUM_MASK		0xfff
 
+/* SSC debug configuration registers */
+#define SSC_DBGCFG_SET		0x14
+#define SSC_DBGCFG_CLR		0x18
+
+#define SPIDEN_INT_CLR_SHIFT	6
+#define SPIDEN_SEL_SET_SHIFT	7
+
 #ifndef __ASSEMBLY__
 
 /* SSC_VERSION related accessors */
diff --git a/include/plat/common/platform.h b/include/plat/common/platform.h
index f904292..73bb643 100644
--- a/include/plat/common/platform.h
+++ b/include/plat/common/platform.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013-2016, ARM Limited and Contributors. All rights reserved.
+ * Copyright (c) 2013-2017, ARM Limited and Contributors. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -254,6 +254,11 @@
 /*******************************************************************************
  * Optional PSCI functions (BL31).
  ******************************************************************************/
+void plat_psci_stat_accounting_start(const psci_power_state_t *state_info);
+void plat_psci_stat_accounting_stop(const psci_power_state_t *state_info);
+u_register_t plat_psci_stat_get_residency(unsigned int lvl,
+			const psci_power_state_t *state_info,
+			int last_cpu_index);
 plat_local_state_t plat_get_target_pwr_state(unsigned int lvl,
 			const plat_local_state_t *states,
 			unsigned int ncpu);
diff --git a/lib/aarch32/misc_helpers.S b/lib/aarch32/misc_helpers.S
index bf4084a..dc84799 100644
--- a/lib/aarch32/misc_helpers.S
+++ b/lib/aarch32/misc_helpers.S
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, ARM Limited and Contributors. All rights reserved.
+ * Copyright (c) 2016-2017, ARM Limited and Contributors. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -34,6 +34,7 @@
 
 	.globl	smc
 	.globl	zeromem
+	.globl	zero_normalmem
 	.globl	memcpy4
 	.globl	disable_mmu_icache_secure
 	.globl	disable_mmu_secure
@@ -50,30 +51,108 @@
 endfunc smc
 
 /* -----------------------------------------------------------------------
- * void zeromem(void *mem, unsigned int length);
+ * void zeromem(void *mem, unsigned int length)
  *
- * Initialise a memory region to 0.
- * The memory address and length must be 4-byte aligned.
+ * Initialise a region in normal memory to 0. This function complies with the
+ * AAPCS and can be called from C code.
+ *
  * -----------------------------------------------------------------------
  */
 func zeromem
-#if ASM_ASSERTION
-	tst	r0, #0x3
-	ASM_ASSERT(eq)
-	tst	r1, #0x3
-	ASM_ASSERT(eq)
-#endif
-	add	r2, r0, r1
-	mov	r1, #0
-z_loop:
-	cmp	r2, r0
-	beq	z_end
-	str	r1, [r0], #4
-	b	z_loop
-z_end:
+	/*
+	 * Readable names for registers
+	 *
+	 * r0 and r1 hold the "mem" and "length" arguments on entry, so cursor
+	 * and length must stay mapped to them. stop_address reuses r1 once
+	 * length has been consumed; r2, r3 and r12 are used as scratch.
+	 */
+	cursor       .req r0 /* Start address and then current address */
+	length       .req r1 /* Length in bytes of the region to zero out */
+	/*
+	 * Reusing the r1 register as length is only used at the beginning of
+	 * the function.
+	 */
+	stop_address .req r1  /* Address past the last zeroed byte */
+	zeroreg1     .req r2  /* Source register filled with 0 */
+	zeroreg2     .req r3  /* Source register filled with 0 */
+	tmp	     .req r12 /* Temporary scratch register */
+
+	mov	zeroreg1, #0
+
+	/* stop_address is the address just past the last byte to zero */
+	add	stop_address, cursor, length
+
+	/*
+	 * Length cannot be used anymore as it shares the same register with
+	 * stop_address.
+	 */
+	.unreq	length
+
+	/*
+	 * If the start address is already aligned to 8 bytes, skip this loop.
+	 */
+	tst	cursor, #(8-1)
+	beq	.Lzeromem_8bytes_aligned
+
+	/* Calculate the next address aligned to 8 bytes */
+	orr	tmp, cursor, #(8-1)
+	adds	tmp, tmp, #1
+	/* If it overflows, fallback to byte per byte zeroing */
+	beq	.Lzeromem_1byte_aligned
+	/* If the next aligned address is after the stop address, fall back */
+	cmp	tmp, stop_address
+	bhs	.Lzeromem_1byte_aligned
+
+	/* zero byte per byte */
+1:
+	strb	zeroreg1, [cursor], #1
+	cmp	cursor, tmp
+	bne	1b
+
+	/* zero 8 bytes at a time */
+.Lzeromem_8bytes_aligned:
+
+	/* Calculate the last 8 bytes aligned address. */
+	bic	tmp, stop_address, #(8-1)
+
+	cmp	cursor, tmp
+	bhs	2f
+
+	mov	zeroreg2, #0
+1:
+	stmia	cursor!, {zeroreg1, zeroreg2}
+	cmp	cursor, tmp
+	blo	1b
+2:
+
+	/* zero byte per byte */
+.Lzeromem_1byte_aligned:
+	cmp	cursor, stop_address
+	beq	2f
+1:
+	strb	zeroreg1, [cursor], #1
+	cmp	cursor, stop_address
+	bne	1b
+2:
 	bx	lr
+
+	.unreq	cursor
+	/*
+	 * length is already unreq'ed to reuse the register for another
+	 * variable.
+	 */
+	.unreq	stop_address
+	.unreq	zeroreg1
+	.unreq	zeroreg2
+	.unreq	tmp
 endfunc zeromem
 
+/*
+ * AArch32 does not have special ways of zeroing normal memory as AArch64 does
+ * using the DC ZVA instruction, so we just alias zero_normalmem to zeromem.
+ */
+.equ	zero_normalmem, zeromem
+
 /* --------------------------------------------------------------------------
  * void memcpy4(void *dest, const void *src, unsigned int length)
  *
diff --git a/lib/aarch64/misc_helpers.S b/lib/aarch64/misc_helpers.S
index 574146f..84265e0 100644
--- a/lib/aarch64/misc_helpers.S
+++ b/lib/aarch64/misc_helpers.S
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013-2014, ARM Limited and Contributors. All rights reserved.
+ * Copyright (c) 2013-2017, ARM Limited and Contributors. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -37,6 +37,8 @@
 	.globl	eret
 	.globl	smc
 
+	.globl	zero_normalmem
+	.globl	zeromem
 	.globl	zeromem16
 	.globl	memcpy16
 
@@ -80,31 +82,358 @@
  *
  * Initialise a memory region to 0.
  * The memory address must be 16-byte aligned.
+ * NOTE: This function is deprecated and zeromem should be used instead.
  * -----------------------------------------------------------------------
  */
-func zeromem16
+.equ	zeromem16, zeromem
+
+/* -----------------------------------------------------------------------
+ * void zero_normalmem(void *mem, u_register_t length);
+ *
+ * Initialise a region in normal memory to 0. This function complies with the
+ * AAPCS and can be called from C code.
+ *
+ * NOTE: MMU must be enabled when using this function as it can only operate on
+ *       normal memory. It is intended to be mainly used from C code when MMU
+ *       is usually enabled.
+ * -----------------------------------------------------------------------
+ */
+.equ	zero_normalmem, zeromem_dczva
+
+/* -----------------------------------------------------------------------
+ * void zeromem(void *mem, u_register_t length);
+ *
+ * Initialise a region of any memory type, device memory included, to 0. This
+ * function complies with the AAPCS and can be called from C code.
+ *
+ * NOTE: When data caches and MMU are enabled, zero_normalmem can usually be
+ *       used instead for faster zeroing.
+ *
+ * -----------------------------------------------------------------------
+ */
+func zeromem
+	/* x2 is the address past the last zeroed address */
+	add	x2, x0, x1
+	/*
+	 * Uses the fallback path that does not use DC ZVA instruction and
+	 * therefore does not need enabled MMU
+	 */
+	b	.Lzeromem_dczva_fallback_entry
+endfunc zeromem
+
+/* -----------------------------------------------------------------------
+ * void zeromem_dczva(void *mem, u_register_t length);
+ *
+ * Fill a region of normal memory of size "length" in bytes with null bytes.
+ * MMU must be enabled and the memory be of
+ * normal type. This is because this function internally uses the DC ZVA
+ * instruction, which generates an Alignment fault if used on any type of
+ * Device memory (see section D3.4.9 of the ARMv8 ARM, issue k). When the MMU
+ * is disabled, all memory behaves like Device-nGnRnE memory (see section
+ * D4.2.8), hence the requirement on the MMU being enabled.
+ * NOTE: The code assumes that the block size as defined in DCZID_EL0
+ *       register is at least 16 bytes.
+ *
+ * -----------------------------------------------------------------------
+ */
+func zeromem_dczva
+
+	/*
+	 * The function consists of a series of loops that zero memory one byte
+	 * at a time, 16 bytes at a time or using the DC ZVA instruction to
+	 * zero aligned blocks of bytes, each assumed to be at least 16 bytes.
+	 * In the case where the DC ZVA instruction cannot be used or if the
+	 * first 16 bytes loop would overflow, there is a fallback path that
+	 * does not use DC ZVA.
+	 * Note: The fallback path is also used by the zeromem function that
+	 *       branches to it directly.
+	 *
+	 *              +---------+   zeromem_dczva
+	 *              |  entry  |
+	 *              +----+----+
+	 *                   |
+	 *                   v
+	 *              +---------+
+	 *              | checks  |>o-------+ (If any check fails, fallback)
+	 *              +----+----+         |
+	 *                   |              |---------------+
+	 *                   v              | Fallback path |
+	 *            +------+------+       |---------------+
+	 *            | 1 byte loop |       |
+	 *            +------+------+ .Lzeromem_dczva_initial_1byte_aligned_end
+	 *                   |              |
+	 *                   v              |
+	 *           +-------+-------+      |
+	 *           | 16 bytes loop |      |
+	 *           +-------+-------+      |
+	 *                   |              |
+	 *                   v              |
+	 *            +------+------+ .Lzeromem_dczva_blocksize_aligned
+	 *            | DC ZVA loop |       |
+	 *            +------+------+       |
+	 *       +--------+  |              |
+	 *       |        |  |              |
+	 *       |        v  v              |
+	 *       |   +-------+-------+ .Lzeromem_dczva_final_16bytes_aligned
+	 *       |   | 16 bytes loop |      |
+	 *       |   +-------+-------+      |
+	 *       |           |              |
+	 *       |           v              |
+	 *       |    +------+------+ .Lzeromem_dczva_final_1byte_aligned
+	 *       |    | 1 byte loop |       |
+	 *       |    +-------------+       |
+	 *       |           |              |
+	 *       |           v              |
+	 *       |       +---+--+           |
+	 *       |       | exit |           |
+	 *       |       +------+           |
+	 *       |			    |
+	 *       |           +--------------+    +------------------+ zeromem
+	 *       |           |  +----------------| zeromem function |
+	 *       |           |  |                +------------------+
+	 *       |           v  v
+	 *       |    +-------------+ .Lzeromem_dczva_fallback_entry
+	 *       |    | 1 byte loop |
+	 *       |    +------+------+
+	 *       |           |
+	 *       +-----------+
+	 */
+
+	/*
+	 * Readable names for registers
+	 *
+	 * Registers x0, x1 and x2 are also set by zeromem which
+	 * branches into the fallback path directly, so cursor, length and
+	 * stop_address should not be retargeted to other registers.
+	 */
+	cursor       .req x0 /* Start address and then current address */
+	length       .req x1 /* Length in bytes of the region to zero out */
+	/* Reusing x1 as length is never used after block_mask is set */
+	block_mask   .req x1 /* Bitmask of the block size read in DCZID_EL0 */
+	stop_address .req x2 /* Address past the last zeroed byte */
+	block_size   .req x3 /* Size of a block in bytes as read in DCZID_EL0 */
+	tmp1         .req x4
+	tmp2         .req x5
+
 #if ASM_ASSERTION
-	tst	x0, #0xf
-	ASM_ASSERT(eq)
+	/*
+	 * Check for M bit (MMU enabled) of the current SCTLR_EL(1|3)
+	 * register value and panic if the MMU is disabled.
+	 */
+#if defined(IMAGE_BL1) || defined(IMAGE_BL31)
+	mrs	tmp1, sctlr_el3
+#else
+	mrs	tmp1, sctlr_el1
 #endif
-	add	x2, x0, x1
-/* zero 16 bytes at a time */
-z_loop16:
-	sub	x3, x2, x0
-	cmp	x3, #16
-	b.lt	z_loop1
-	stp	xzr, xzr, [x0], #16
-	b	z_loop16
-/* zero byte per byte */
-z_loop1:
-	cmp	x0, x2
-	b.eq	z_end
-	strb	wzr, [x0], #1
-	b	z_loop1
-z_end:
+
+	tst	tmp1, #SCTLR_M_BIT
+	ASM_ASSERT(ne)
+#endif /* ASM_ASSERTION */
+
+	/* stop_address is the address just past the last byte to zero */
+	add	stop_address, cursor, length
+
+	/*
+	 * Read DCZID_EL0; its 4 lowest bits hold log2(<block size in words>)
+	 * (see encoding of dczid_el0 reg)
+	 */
+	mrs	block_size, dczid_el0
+
+	/*
+	 * Select the 4 lowest bits and convert the extracted log2(<block size
+	 * in words>) to <block size in bytes>
+	 */
+	ubfx	block_size, block_size, #0, #4
+	mov	tmp2, #(1 << 2)
+	lsl	block_size, tmp2, block_size
+
+#if ASM_ASSERTION
+	/*
+	 * Assumes block size is at least 16 bytes to avoid manual realignment
+	 * of the cursor at the end of the DCZVA loop.
+	 */
+	cmp	block_size, #16
+	ASM_ASSERT(hs)
+#endif
+	/*
+	 * Not worth doing all the setup for a region less than a block and
+	 * protects against zeroing a whole block when the area to zero is
+	 * smaller than that. Also, as it is assumed that the block size is at
+	 * least 16 bytes, this also protects the initial aligning loops from
+	 * trying to zero 16 bytes when length is less than 16.
+	 */
+	cmp	length, block_size
+	b.lo	.Lzeromem_dczva_fallback_entry
+
+	/*
+	 * Calculate the bitmask of the block alignment. It will never
+	 * underflow as the block size is between 4 bytes and 2kB.
+	 * block_mask = block_size - 1
+	 */
+	sub	block_mask, block_size, #1
+
+	/*
+	 * length alias should not be used after this point unless it is
+	 * defined as a register other than block_mask's.
+	 */
+	 .unreq length
+
+	/*
+	 * If the start address is already aligned to zero block size, go
+	 * straight to the cache zeroing loop. This is safe because at this
+	 * point, the length cannot be smaller than a block size.
+	 */
+	tst	cursor, block_mask
+	b.eq	.Lzeromem_dczva_blocksize_aligned
+
+	/*
+	 * Calculate the first block-size-aligned address. It is assumed that
+	 * the zero block size is at least 16 bytes. This address is the last
+	 * address of this initial loop.
+	 */
+	orr	tmp1, cursor, block_mask
+	add	tmp1, tmp1, #1
+
+	/*
+	 * If the addition overflows, skip the cache zeroing loops. This is
+	 * quite unlikely however.
+	 */
+	cbz	tmp1, .Lzeromem_dczva_fallback_entry
+
+	/*
+	 * If the first block-size-aligned address is past the last address,
+	 * fallback to the simpler code.
+	 */
+	cmp	tmp1, stop_address
+	b.hi	.Lzeromem_dczva_fallback_entry
+
+	/*
+	 * If the start address is already aligned to 16 bytes, skip this loop.
+	 * It is safe to do this because tmp1 (the stop address of the initial
+	 * 16 bytes loop) will never be greater than the final stop address.
+	 */
+	tst	cursor, #0xf
+	b.eq	.Lzeromem_dczva_initial_1byte_aligned_end
+
+	/* Calculate the next address aligned to 16 bytes */
+	orr	tmp2, cursor, #0xf
+	add	tmp2, tmp2, #1
+	/* If it overflows, fallback to the simple path (unlikely) */
+	cbz	tmp2, .Lzeromem_dczva_fallback_entry
+	/*
+	 * Next aligned address cannot be after the stop address because the
+	 * length cannot be smaller than 16 at this point.
+	 */
+
+	/* First loop: zero byte per byte */
+1:
+	strb	wzr, [cursor], #1
+	cmp	cursor, tmp2
+	b.ne	1b
+.Lzeromem_dczva_initial_1byte_aligned_end:
+
+	/*
+	 * Second loop: we need to zero 16 bytes at a time from cursor to tmp1
+	 * before being able to use the code that deals with block-size-aligned
+	 * addresses.
+	 */
+	cmp	cursor, tmp1
+	b.hs	2f
+1:
+	stp	xzr, xzr, [cursor], #16
+	cmp	cursor, tmp1
+	b.lo	1b
+2:
+
+	/*
+	 * Third loop: zero a block at a time using DC ZVA cache block zeroing
+	 * instruction.
+	 */
+.Lzeromem_dczva_blocksize_aligned:
+	/*
+	 * Calculate the last block-size-aligned address. If the result equals
+	 * the start address, the loop will exit immediately.
+	 */
+	bic	tmp1, stop_address, block_mask
+
+	cmp	cursor, tmp1
+	b.hs	2f
+1:
+	/* Zero the block containing the cursor */
+	dc	zva, cursor
+	/* Increment the cursor by the size of a block */
+	add	cursor, cursor, block_size
+	cmp	cursor, tmp1
+	b.lo	1b
+2:
+
+	/*
+	 * Fourth loop: zero 16 bytes at a time and then byte per byte the
+	 * remaining area
+	 */
+.Lzeromem_dczva_final_16bytes_aligned:
+	/*
+	 * Calculate the last 16 bytes aligned address. It is assumed that the
+	 * block size will never be smaller than 16 bytes so that the current
+	 * cursor is aligned to at least 16 bytes boundary.
+	 */
+	bic	tmp1, stop_address, #15
+
+	cmp	cursor, tmp1
+	b.hs	2f
+1:
+	stp	xzr, xzr, [cursor], #16
+	cmp	cursor, tmp1
+	b.lo	1b
+2:
+
+	/* Fifth and final loop: zero byte per byte */
+.Lzeromem_dczva_final_1byte_aligned:
+	cmp	cursor, stop_address
+	b.eq	2f
+1:
+	strb	wzr, [cursor], #1
+	cmp	cursor, stop_address
+	b.ne	1b
+2:
 	ret
-endfunc zeromem16
+
+	/* Fallback for unaligned start addresses */
+.Lzeromem_dczva_fallback_entry:
+	/*
+	 * If the start address is already aligned to 16 bytes, skip this loop.
+	 */
+	tst	cursor, #0xf
+	b.eq	.Lzeromem_dczva_final_16bytes_aligned
+
+	/* Calculate the next address aligned to 16 bytes */
+	orr	tmp1, cursor, #15
+	add	tmp1, tmp1, #1
+	/* If it overflows, fallback to byte per byte zeroing */
+	cbz	tmp1, .Lzeromem_dczva_final_1byte_aligned
+	/* If the next aligned address is after the stop address, fall back */
+	cmp	tmp1, stop_address
+	b.hs	.Lzeromem_dczva_final_1byte_aligned
+
+	/* Fallback entry loop: zero byte per byte */
+1:
+	strb	wzr, [cursor], #1
+	cmp	cursor, tmp1
+	b.ne	1b
+
+	b	.Lzeromem_dczva_final_16bytes_aligned
 
+	.unreq	cursor
+	/*
+	 * length is already unreq'ed to reuse the register for another
+	 * variable.
+	 */
+	.unreq	stop_address
+	.unreq	block_size
+	.unreq	block_mask
+	.unreq	tmp1
+	.unreq	tmp2
+endfunc zeromem_dczva
 
 /* --------------------------------------------------------------------------
  * void memcpy16(void *dest, const void *src, unsigned int length)
diff --git a/lib/el3_runtime/aarch32/context_mgmt.c b/lib/el3_runtime/aarch32/context_mgmt.c
index 51b7759..df22eaf 100644
--- a/lib/el3_runtime/aarch32/context_mgmt.c
+++ b/lib/el3_runtime/aarch32/context_mgmt.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, ARM Limited and Contributors. All rights reserved.
+ * Copyright (c) 2016-2017, ARM Limited and Contributors. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -38,6 +38,7 @@
 #include <platform_def.h>
 #include <smcc_helpers.h>
 #include <string.h>
+#include <utils.h>
 
 /*******************************************************************************
  * Context management library initialisation routine. This library is used by
@@ -84,7 +85,7 @@
 	security_state = GET_SECURITY_STATE(ep->h.attr);
 
 	/* Clear any residual register values from the context */
-	memset(ctx, 0, sizeof(*ctx));
+	zeromem(ctx, sizeof(*ctx));
 
 	reg_ctx = get_regs_ctx(ctx);
 
diff --git a/lib/el3_runtime/aarch64/context_mgmt.c b/lib/el3_runtime/aarch64/context_mgmt.c
index e26950d..5cce879 100644
--- a/lib/el3_runtime/aarch64/context_mgmt.c
+++ b/lib/el3_runtime/aarch64/context_mgmt.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013-2016, ARM Limited and Contributors. All rights reserved.
+ * Copyright (c) 2013-2017, ARM Limited and Contributors. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -39,6 +39,7 @@
 #include <platform_def.h>
 #include <smcc_helpers.h>
 #include <string.h>
+#include <utils.h>
 
 
 /*******************************************************************************
@@ -91,7 +92,7 @@
 	security_state = GET_SECURITY_STATE(ep->h.attr);
 
 	/* Clear any residual register values from the context */
-	memset(ctx, 0, sizeof(*ctx));
+	zeromem(ctx, sizeof(*ctx));
 
 	/*
 	 * Base the context SCR on the current value, adjust for entry point
diff --git a/lib/psci/psci_common.c b/lib/psci/psci_common.c
index 68cdd6e..9fdce49 100644
--- a/lib/psci/psci_common.c
+++ b/lib/psci/psci_common.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013-2016, ARM Limited and Contributors. All rights reserved.
+ * Copyright (c) 2013-2017, ARM Limited and Contributors. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -37,6 +37,7 @@
 #include <debug.h>
 #include <platform.h>
 #include <string.h>
+#include <utils.h>
 #include "psci_private.h"
 
 /*
@@ -622,7 +623,7 @@
 	SET_PARAM_HEAD(ep, PARAM_EP, VERSION_1, ep_attr);
 
 	ep->pc = entrypoint;
-	memset(&ep->args, 0, sizeof(ep->args));
+	zeromem(&ep->args, sizeof(ep->args));
 	ep->args.arg0 = context_id;
 
 	mode = scr & SCR_HCE_BIT ? MODE32_hyp : MODE32_svc;
@@ -659,7 +660,7 @@
 	SET_PARAM_HEAD(ep, PARAM_EP, VERSION_1, ep_attr);
 
 	ep->pc = entrypoint;
-	memset(&ep->args, 0, sizeof(ep->args));
+	zeromem(&ep->args, sizeof(ep->args));
 	ep->args.arg0 = context_id;
 
 	/*
@@ -760,13 +761,7 @@
 				      cpu_idx);
 
 #if ENABLE_PSCI_STAT
-	/*
-	 * Capture power up time-stamp.
-	 * No cache maintenance is required as caches are off
-	 * and writes are direct to the main memory.
-	 */
-	PMF_CAPTURE_TIMESTAMP(psci_svc, PSCI_STAT_ID_EXIT_LOW_PWR,
-		PMF_NO_CACHE_MAINT);
+	plat_psci_stat_accounting_stop(&state_info);
 #endif
 
 	psci_get_target_local_pwr_states(end_pwrlvl, &state_info);
@@ -801,7 +796,7 @@
 	 * Since caches are now enabled, it's necessary to do cache
 	 * maintenance before reading that same data.
 	 */
-	psci_stats_update_pwr_up(end_pwrlvl, &state_info, PMF_CACHE_MAINT);
+	psci_stats_update_pwr_up(end_pwrlvl, &state_info);
 #endif
 
 	/*
@@ -957,7 +952,7 @@
 {
 	psci_power_state_t state_info;
 
-	memset(&state_info, 0, sizeof(state_info));
+	zeromem(&state_info, sizeof(state_info));
 	psci_get_target_local_pwr_states(PLAT_MAX_PWR_LVL, &state_info);
 
 	return psci_find_target_suspend_lvl(&state_info);
diff --git a/lib/psci/psci_main.c b/lib/psci/psci_main.c
index 0a3a60a..5e166b5 100644
--- a/lib/psci/psci_main.c
+++ b/lib/psci/psci_main.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013-2016, ARM Limited and Contributors. All rights reserved.
+ * Copyright (c) 2013-2017, ARM Limited and Contributors. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -117,13 +117,7 @@
 		psci_set_cpu_local_state(cpu_pd_state);
 
 #if ENABLE_PSCI_STAT
-		/*
-		 * Capture time-stamp before CPU standby
-		 * No cache maintenance is needed as caches
-		 * are ON through out the CPU standby operation.
-		 */
-		PMF_CAPTURE_TIMESTAMP(psci_svc, PSCI_STAT_ID_ENTER_LOW_PWR,
-			PMF_NO_CACHE_MAINT);
+		plat_psci_stat_accounting_start(&state_info);
 #endif
 
 #if ENABLE_RUNTIME_INSTRUMENTATION
@@ -144,13 +138,10 @@
 #endif
 
 #if ENABLE_PSCI_STAT
-		/* Capture time-stamp after CPU standby */
-		PMF_CAPTURE_TIMESTAMP(psci_svc, PSCI_STAT_ID_EXIT_LOW_PWR,
-			PMF_NO_CACHE_MAINT);
+		plat_psci_stat_accounting_stop(&state_info);
 
 		/* Update PSCI stats */
-		psci_stats_update_pwr_up(PSCI_CPU_PWR_LVL, &state_info,
-			PMF_NO_CACHE_MAINT);
+		psci_stats_update_pwr_up(PSCI_CPU_PWR_LVL, &state_info);
 #endif
 
 		return PSCI_E_SUCCESS;
diff --git a/lib/psci/psci_off.c b/lib/psci/psci_off.c
index 897bf31..394aaa3 100644
--- a/lib/psci/psci_off.c
+++ b/lib/psci/psci_off.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013-2016, ARM Limited and Contributors. All rights reserved.
+ * Copyright (c) 2013-2017, ARM Limited and Contributors. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -137,13 +137,7 @@
 	psci_plat_pm_ops->pwr_domain_off(&state_info);
 
 #if ENABLE_PSCI_STAT
-	/*
-	 * Capture time-stamp while entering low power state.
-	 * No cache maintenance needed because caches are off
-	 * and writes are direct to main memory.
-	 */
-	PMF_CAPTURE_TIMESTAMP(psci_svc, PSCI_STAT_ID_ENTER_LOW_PWR,
-		PMF_NO_CACHE_MAINT);
+	plat_psci_stat_accounting_start(&state_info);
 #endif
 
 exit:
diff --git a/lib/psci/psci_private.h b/lib/psci/psci_private.h
index 781b3b5..ca8291e 100644
--- a/lib/psci/psci_private.h
+++ b/lib/psci/psci_private.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013-2016, ARM Limited and Contributors. All rights reserved.
+ * Copyright (c) 2013-2017, ARM Limited and Contributors. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -35,7 +35,6 @@
 #include <bakery_lock.h>
 #include <bl_common.h>
 #include <cpu_data.h>
-#include <pmf.h>
 #include <psci.h>
 #include <spinlock.h>
 
@@ -106,15 +105,6 @@
 #define is_cpu_standby_req(is_power_down_state, retn_lvl) \
 		(((!(is_power_down_state)) && ((retn_lvl) == 0)) ? 1 : 0)
 
-/* Following are used as ID's to capture time-stamp */
-#define PSCI_STAT_ID_ENTER_LOW_PWR		0
-#define PSCI_STAT_ID_EXIT_LOW_PWR		1
-#define PSCI_STAT_TOTAL_IDS			2
-
-/* Declare PMF service functions for PSCI */
-PMF_DECLARE_CAPTURE_TIMESTAMP(psci_svc)
-PMF_DECLARE_GET_TIMESTAMP(psci_svc)
-
 /*******************************************************************************
  * The following two data structures implement the power domain tree. The tree
  * is used to track the state of all the nodes i.e. power domain instances
@@ -246,8 +236,7 @@
 void psci_stats_update_pwr_down(unsigned int end_pwrlvl,
 			const psci_power_state_t *state_info);
 void psci_stats_update_pwr_up(unsigned int end_pwrlvl,
-			const psci_power_state_t *state_info,
-			unsigned int flags);
+			const psci_power_state_t *state_info);
 u_register_t psci_stat_residency(u_register_t target_cpu,
 			unsigned int power_state);
 u_register_t psci_stat_count(u_register_t target_cpu,
diff --git a/lib/psci/psci_stat.c b/lib/psci/psci_stat.c
index ecbe592..d8034a5 100644
--- a/lib/psci/psci_stat.c
+++ b/lib/psci/psci_stat.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, ARM Limited and Contributors. All rights reserved.
+ * Copyright (c) 2016-2017, ARM Limited and Contributors. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -38,9 +38,6 @@
 #define PLAT_MAX_PWR_LVL_STATES 2
 #endif
 
-/* Ticks elapsed in one second by a signal of 1 MHz */
-#define MHZ_TICKS_PER_SEC 1000000
-
 /* Following structure is used for PSCI STAT */
 typedef struct psci_stat {
 	u_register_t residency;
@@ -62,28 +59,7 @@
 static psci_stat_t psci_non_cpu_stat[PSCI_NUM_NON_CPU_PWR_DOMAINS]
 				[PLAT_MAX_PWR_LVL_STATES];
 
-/* Register PMF PSCI service */
-PMF_REGISTER_SERVICE(psci_svc, PMF_PSCI_STAT_SVC_ID,
-	 PSCI_STAT_TOTAL_IDS, PMF_STORE_ENABLE)
-
-/* The divisor to use to convert raw timestamp into microseconds */
-u_register_t residency_div;
-
 /*
- * This macro calculates the stats residency in microseconds,
- * taking in account the wrap around condition.
- */
-#define calc_stat_residency(_pwrupts, _pwrdnts, _res)		\
-	do {							\
-		if (_pwrupts < _pwrdnts)			\
-			_res = UINT64_MAX - _pwrdnts + _pwrupts;\
-		else						\
-			_res = _pwrupts - _pwrdnts;		\
-		/* Convert timestamp into microseconds */	\
-		_res = _res/residency_div;			\
-	} while (0)
-
-/*
  * This functions returns the index into the `psci_stat_t` array given the
  * local power state and power domain level. If the platform implements the
  * `get_pwr_lvl_state_idx` pm hook, then that will be used to return the index.
@@ -150,44 +126,23 @@
  * It is called with caches enabled and locks acquired(for NON-CPU domain)
  ******************************************************************************/
 void psci_stats_update_pwr_up(unsigned int end_pwrlvl,
-			const psci_power_state_t *state_info,
-			unsigned int flags)
+			const psci_power_state_t *state_info)
 {
 	int parent_idx, cpu_idx = plat_my_core_pos();
 	int lvl, stat_idx;
 	plat_local_state_t local_state;
-	unsigned long long pwrup_ts = 0, pwrdn_ts = 0;
 	u_register_t residency;
 
 	assert(end_pwrlvl <= PLAT_MAX_PWR_LVL);
 	assert(state_info);
 
-	/* Initialize the residency divisor if not already initialized */
-	if (!residency_div) {
-		/* Pre-calculate divisor so that it can be directly used to
-		   convert time-stamp into microseconds */
-		residency_div = read_cntfrq_el0() / MHZ_TICKS_PER_SEC;
-		assert(residency_div);
-	}
-
-	/* Get power down time-stamp for current CPU */
-	PMF_GET_TIMESTAMP_BY_INDEX(psci_svc, PSCI_STAT_ID_ENTER_LOW_PWR,
-			cpu_idx, flags, pwrdn_ts);
-
-	/* In the case of 1st power on just return */
-	if (!pwrdn_ts)
-		return;
-
-	/* Get power up time-stamp for current CPU */
-	PMF_GET_TIMESTAMP_BY_INDEX(psci_svc, PSCI_STAT_ID_EXIT_LOW_PWR,
-			cpu_idx, flags, pwrup_ts);
-
 	/* Get the index into the stats array */
 	local_state = state_info->pwr_domain_state[PSCI_CPU_PWR_LVL];
 	stat_idx = get_stat_idx(local_state, PSCI_CPU_PWR_LVL);
 
-	/* Calculate stats residency */
-	calc_stat_residency(pwrup_ts, pwrdn_ts, residency);
+	/* Call into platform interface to calculate residency. */
+	residency = plat_psci_stat_get_residency(PSCI_CPU_PWR_LVL,
+	    state_info, cpu_idx);
 
 	/* Update CPU stats. */
 	psci_cpu_stat[cpu_idx][stat_idx].residency += residency;
@@ -207,10 +162,9 @@
 
 		assert(last_cpu_in_non_cpu_pd[parent_idx] != -1);
 
-		/* Get power down time-stamp for last CPU */
-		PMF_GET_TIMESTAMP_BY_INDEX(psci_svc, PSCI_STAT_ID_ENTER_LOW_PWR,
-				last_cpu_in_non_cpu_pd[parent_idx],
-				flags, pwrdn_ts);
+		/* Call into platform interface to calculate residency. */
+		residency = plat_psci_stat_get_residency(lvl, state_info,
+		    last_cpu_in_non_cpu_pd[parent_idx]);
 
 		/* Initialize back to reset value */
 		last_cpu_in_non_cpu_pd[parent_idx] = -1;
@@ -218,9 +172,6 @@
 		/* Get the index into the stats array */
 		stat_idx = get_stat_idx(local_state, lvl);
 
-		/* Calculate stats residency */
-		calc_stat_residency(pwrup_ts, pwrdn_ts, residency);
-
 		/* Update non cpu stats */
 		psci_non_cpu_stat[parent_idx][stat_idx].residency += residency;
 		psci_non_cpu_stat[parent_idx][stat_idx].count++;
diff --git a/lib/psci/psci_suspend.c b/lib/psci/psci_suspend.c
index dc2ab77..302116b 100644
--- a/lib/psci/psci_suspend.c
+++ b/lib/psci/psci_suspend.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013-2016, ARM Limited and Contributors. All rights reserved.
+ * Copyright (c) 2013-2017, ARM Limited and Contributors. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -211,13 +211,7 @@
 	psci_plat_pm_ops->pwr_domain_suspend(state_info);
 
 #if ENABLE_PSCI_STAT
-	/*
-	 * Capture time-stamp while entering low power state.
-	 * No cache maintenance needed because caches are off
-	 * and writes are direct to main memory.
-	 */
-	PMF_CAPTURE_TIMESTAMP(psci_svc, PSCI_STAT_ID_ENTER_LOW_PWR,
-		PMF_NO_CACHE_MAINT);
+	plat_psci_stat_accounting_start(state_info);
 #endif
 
 exit:
@@ -257,6 +251,10 @@
 	    PMF_NO_CACHE_MAINT);
 #endif
 
+#if ENABLE_PSCI_STAT
+	plat_psci_stat_accounting_start(state_info);
+#endif
+
 	/*
 	 * We will reach here if only retention/standby states have been
 	 * requested at multiple power levels. This means that the cpu
@@ -264,6 +262,11 @@
 	 */
 	wfi();
 
+#if ENABLE_PSCI_STAT
+	plat_psci_stat_accounting_stop(state_info);
+	psci_stats_update_pwr_up(end_pwrlvl, state_info);
+#endif
+
 #if ENABLE_RUNTIME_INSTRUMENTATION
 	PMF_CAPTURE_TIMESTAMP(rt_instr_svc,
 	    RT_INSTR_EXIT_HW_LOW_PWR,
diff --git a/plat/arm/board/juno/juno_security.c b/plat/arm/board/juno/juno_security.c
index 202342a..70637d6 100644
--- a/plat/arm/board/juno/juno_security.c
+++ b/plat/arm/board/juno/juno_security.c
@@ -60,16 +60,34 @@
 }
 
 /*******************************************************************************
+ * Initialize debug configuration.
+ ******************************************************************************/
+static void init_debug_cfg(void)
+{
+#if !DEBUG
+	const unsigned int spiden_sel = 1U << SPIDEN_SEL_SET_SHIFT;
+	const unsigned int spiden_low = 1U << SPIDEN_INT_CLR_SHIFT;
+
+	/* Non-debug builds only: select the internal drive for SPIDEN ... */
+	mmio_write_32(SSC_REG_BASE + SSC_DBGCFG_SET, spiden_sel);
+	/* ... then drive it LOW to disable invasive debug of secure state. */
+	mmio_write_32(SSC_REG_BASE + SSC_DBGCFG_CLR, spiden_low);
+#endif
+}
+
+/*******************************************************************************
  * Initialize the secure environment.
  ******************************************************************************/
 void plat_arm_security_setup(void)
 {
+	/* Set up SPIDEN; init_debug_cfg() is a no-op when DEBUG is non-zero */
+	init_debug_cfg();
 	/* Initialize the TrustZone Controller */
 	arm_tzc400_setup();
 	/* Do ARM CSS internal NIC setup */
 	css_init_nic400();
 	/* Do ARM CSS SoC security setup */
 	soc_css_security_setup();
-	/* Initialize the SMMU SSD tables*/
+	/* Initialize the SMMU SSD tables */
 	init_mmu401();
 }
diff --git a/plat/arm/common/arm_bl2_setup.c b/plat/arm/common/arm_bl2_setup.c
index 5f30708..007108d 100644
--- a/plat/arm/common/arm_bl2_setup.c
+++ b/plat/arm/common/arm_bl2_setup.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2016, ARM Limited and Contributors. All rights reserved.
+ * Copyright (c) 2015-2017, ARM Limited and Contributors. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -38,6 +38,7 @@
 #include <plat_arm.h>
 #include <platform_def.h>
 #include <string.h>
+#include <utils.h>
 
 /* Data structure which holds the extents of the trusted SRAM for BL2 */
 static meminfo_t bl2_tzram_layout __aligned(CACHE_WRITEBACK_GRANULE);
@@ -123,7 +124,7 @@
 	 * Initialise the memory for all the arguments that needs to
 	 * be passed to BL31
 	 */
-	memset(&bl31_params_mem, 0, sizeof(bl2_to_bl31_params_mem_t));
+	zeromem(&bl31_params_mem, sizeof(bl2_to_bl31_params_mem_t));
 
 	/* Assign memory for TF related information */
 	bl2_to_bl31_params = &bl31_params_mem.bl31_params;
diff --git a/plat/arm/common/arm_common.mk b/plat/arm/common/arm_common.mk
index c2f28f9..4628a43 100644
--- a/plat/arm/common/arm_common.mk
+++ b/plat/arm/common/arm_common.mk
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2015-2016, ARM Limited and Contributors. All rights reserved.
+# Copyright (c) 2015-2017, ARM Limited and Contributors. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions are met:
@@ -92,6 +92,7 @@
 
 # Enable PSCI_STAT_COUNT/RESIDENCY APIs on ARM platforms
 ENABLE_PSCI_STAT		:=	1
+ENABLE_PMF			:=	1
 
 # On ARM platforms, separate the code and read-only data sections to allow
 # mapping the former as executable and the latter as execute-never.
diff --git a/plat/arm/css/common/css_bl2_setup.c b/plat/arm/css/common/css_bl2_setup.c
index 11ca342..5361d89 100644
--- a/plat/arm/css/common/css_bl2_setup.c
+++ b/plat/arm/css/common/css_bl2_setup.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015, ARM Limited and Contributors. All rights reserved.
+ * Copyright (c) 2015-2017, ARM Limited and Contributors. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -98,7 +98,7 @@
 	 *  - restoring the SCP boot configuration.
 	 */
 	VERBOSE("BL2: Restoring SCP reset data in Trusted SRAM\n");
-	memset((void *) ARM_TRUSTED_SRAM_BASE, 0, 128);
+	zero_normalmem((void *)ARM_TRUSTED_SRAM_BASE, 128);
 	mmio_write_32(SCP_BOOT_CFG_ADDR, scp_boot_config);
 }
 #endif /* EL3_PAYLOAD_BASE */
diff --git a/plat/arm/css/drivers/scpi/css_scpi.c b/plat/arm/css/drivers/scpi/css_scpi.c
index f419abd..65ae978 100644
--- a/plat/arm/css/drivers/scpi/css_scpi.c
+++ b/plat/arm/css/drivers/scpi/css_scpi.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2014-2016, ARM Limited and Contributors. All rights reserved.
+ * Copyright (c) 2014-2017, ARM Limited and Contributors. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -34,6 +34,7 @@
 #include <debug.h>
 #include <platform.h>
 #include <string.h>
+#include <utils.h>
 #include "css_mhu.h"
 #include "css_scpi.h"
 
@@ -204,7 +205,8 @@
 	scpi_secure_message_start();
 
 	/* Populate request headers */
-	cmd = memset(SCPI_CMD_HEADER_AP_TO_SCP, 0, sizeof(*cmd));
+	zeromem(SCPI_CMD_HEADER_AP_TO_SCP, sizeof(*cmd));
+	cmd = SCPI_CMD_HEADER_AP_TO_SCP;
 	cmd->id = SCPI_CMD_GET_CSS_POWER_STATE;
 
 	/*
diff --git a/plat/common/plat_psci_common.c b/plat/common/plat_psci_common.c
index 3eb6886..0e00faa 100644
--- a/plat/common/plat_psci_common.c
+++ b/plat/common/plat_psci_common.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, ARM Limited and Contributors. All rights reserved.
+ * Copyright (c) 2016-2017, ARM Limited and Contributors. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -31,8 +31,125 @@
 #include <arch.h>
 #include <assert.h>
 #include <platform.h>
+#include <pmf.h>
 #include <psci.h>
 
+#if ENABLE_PSCI_STAT && ENABLE_PMF
+#pragma weak plat_psci_stat_accounting_start
+#pragma weak plat_psci_stat_accounting_stop
+#pragma weak plat_psci_stat_get_residency
+
+/* Ticks elapsed in one second by a signal of 1 MHz */
+#define MHZ_TICKS_PER_SEC 1000000
+
+/* Following are used as ID's to capture time-stamp */
+#define PSCI_STAT_ID_ENTER_LOW_PWR		0
+#define PSCI_STAT_ID_EXIT_LOW_PWR		1
+#define PSCI_STAT_TOTAL_IDS			2
+
+PMF_REGISTER_SERVICE(psci_svc, PMF_PSCI_STAT_SVC_ID, PSCI_STAT_TOTAL_IDS,
+	PMF_STORE_ENABLE)
+
+/*
+ * This function calculates the stats residency in microseconds from the
+ * raw power up (`pwrupts`) and power down (`pwrdnts`) timestamps, taking
+ * into account the wrap around condition.
+ */
+static u_register_t calc_stat_residency(unsigned long long pwrupts,
+	unsigned long long pwrdnts)
+{
+	/* The divisor to use to convert raw timestamp into microseconds. */
+	u_register_t residency_div;
+
+	/*
+	 * Calculate divisor so that it can be directly used to
+	 * convert time-stamp into microseconds.
+	 */
+	residency_div = read_cntfrq_el0() / MHZ_TICKS_PER_SEC;
+	assert(residency_div);
+
+	/*
+	 * Unsigned 64-bit subtraction is modulo 2^64, so the difference is
+	 * the elapsed tick count even when the counter wrapped once between
+	 * the two samples; no separate wrap around branch is needed.
+	 */
+	return (u_register_t)(pwrupts - pwrdnts) / residency_div;
+}
+
+/*
+ * Capture timestamp before entering a low power state.
+ * No cache maintenance is done here; it may be needed when reading the
+ * timestamp. `state_info` is only checked by the assert, hence __unused.
+ */
+void plat_psci_stat_accounting_start(
+	__unused const psci_power_state_t *state_info)
+{
+	assert(state_info);
+	PMF_CAPTURE_TIMESTAMP(psci_svc, PSCI_STAT_ID_ENTER_LOW_PWR,
+		PMF_NO_CACHE_MAINT);
+}
+
+/*
+ * Capture timestamp after exiting a low power state.
+ * No cache maintenance is done here; it may be needed when reading the
+ * timestamp. `state_info` is only checked by the assert, hence __unused.
+ */
+void plat_psci_stat_accounting_stop(
+	__unused const psci_power_state_t *state_info)
+{
+	assert(state_info);
+	PMF_CAPTURE_TIMESTAMP(psci_svc, PSCI_STAT_ID_EXIT_LOW_PWR,
+		PMF_NO_CACHE_MAINT);
+}
+
+/*
+ * Default residency calculation for power level `lvl`: the time, in
+ * microseconds, between the low power entry timestamp captured by CPU
+ * `last_cpu_idx` and the low power exit timestamp captured by the calling
+ * CPU. At the CPU power level `last_cpu_idx` must be the calling CPU.
+ * `state_info` decides whether the reads need cache maintenance.
+ */
+u_register_t plat_psci_stat_get_residency(unsigned int lvl,
+	const psci_power_state_t *state_info,
+	int last_cpu_idx)
+{
+	plat_local_state_t state;
+	unsigned long long pwrup_ts = 0, pwrdn_ts = 0;
+	unsigned int pmf_flags;
+
+	assert(lvl >= PSCI_CPU_PWR_LVL && lvl <= PLAT_MAX_PWR_LVL);
+	assert(state_info);
+	/* Valid core indices are 0 .. PLATFORM_CORE_COUNT - 1. */
+	assert(last_cpu_idx >= 0 && last_cpu_idx < PLATFORM_CORE_COUNT);
+
+	if (lvl == PSCI_CPU_PWR_LVL)
+		assert(last_cpu_idx == plat_my_core_pos());
+
+	/*
+	 * If power down is requested, then timestamp capture will
+	 * be with caches OFF.  Hence we have to do cache maintenance
+	 * when reading the timestamp.
+	 */
+	state = state_info->pwr_domain_state[PSCI_CPU_PWR_LVL];
+	if (is_local_state_off(state)) {
+		pmf_flags = PMF_CACHE_MAINT;
+	} else {
+		assert(is_local_state_retn(state));
+		pmf_flags = PMF_NO_CACHE_MAINT;
+	}
+
+	/* Low power entry timestamp, as recorded by CPU `last_cpu_idx`. */
+	PMF_GET_TIMESTAMP_BY_INDEX(psci_svc, PSCI_STAT_ID_ENTER_LOW_PWR,
+		last_cpu_idx, pmf_flags, pwrdn_ts);
+
+	/* Low power exit timestamp, as recorded by the calling CPU. */
+	PMF_GET_TIMESTAMP_BY_INDEX(psci_svc, PSCI_STAT_ID_EXIT_LOW_PWR,
+		plat_my_core_pos(), pmf_flags, pwrup_ts);
+
+	return calc_stat_residency(pwrup_ts, pwrdn_ts);
+}
+#endif /* ENABLE_PSCI_STAT && ENABLE_PMF */
+
 /*
  * The PSCI generic code uses this API to let the platform participate in state
  * coordination during a power management operation. It compares the platform
diff --git a/plat/mediatek/mt6795/bl31.ld.S b/plat/mediatek/mt6795/bl31.ld.S
index 44510a7..472cd2e 100644
--- a/plat/mediatek/mt6795/bl31.ld.S
+++ b/plat/mediatek/mt6795/bl31.ld.S
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, ARM Limited and Contributors. All rights reserved.
+ * Copyright (c) 2016-2017, ARM Limited and Contributors. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -113,7 +113,8 @@
 
     /*
      * The .bss section gets initialised to 0 at runtime.
-     * Its base address must be 16-byte aligned.
+     * Its base address should be 16-byte aligned for better performance of the
+     * zero-initialization code.
      */
     .bss (NOLOAD) : ALIGN(16) {
         __BSS_START__ = .;
diff --git a/plat/nvidia/tegra/common/drivers/memctrl/memctrl.c b/plat/nvidia/tegra/common/drivers/memctrl/memctrl.c
index 40d1bab..4f7c71e 100644
--- a/plat/nvidia/tegra/common/drivers/memctrl/memctrl.c
+++ b/plat/nvidia/tegra/common/drivers/memctrl/memctrl.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015, ARM Limited and Contributors. All rights reserved.
+ * Copyright (c) 2015-2017, ARM Limited and Contributors. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -35,10 +35,9 @@
 #include <memctrl.h>
 #include <string.h>
 #include <tegra_def.h>
+#include <utils.h>
 #include <xlat_tables.h>
 
-extern void zeromem16(void *mem, unsigned int length);
-
 #define TEGRA_GPU_RESET_REG_OFFSET	0x28c
 #define  GPU_RESET_BIT			(1 << 24)
 
@@ -114,13 +113,13 @@
 	 * Perform cache maintenance to ensure that the non-overlapping area is
 	 * zeroed out. The first invalidation of this range ensures that
 	 * possible evictions of dirty cache lines do not interfere with the
-	 * 'zeromem16' operation. Other CPUs could speculatively prefetch the
+	 * 'zeromem' operation. Other CPUs could speculatively prefetch the
 	 * main memory contents of this area between the first invalidation and
-	 * the 'zeromem16' operation. The second invalidation ensures that any
+	 * the 'zeromem' operation. The second invalidation ensures that any
 	 * such cache lines are removed as well.
 	 */
 	inv_dcache_range(non_overlap_area_start, non_overlap_area_size);
-	zeromem16((void *)non_overlap_area_start, non_overlap_area_size);
+	zeromem((void *)non_overlap_area_start, non_overlap_area_size);
 	inv_dcache_range(non_overlap_area_start, non_overlap_area_size);
 }
 
diff --git a/plat/qemu/qemu_bl2_setup.c b/plat/qemu/qemu_bl2_setup.c
index dba3bee..738d671 100644
--- a/plat/qemu/qemu_bl2_setup.c
+++ b/plat/qemu/qemu_bl2_setup.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2016, ARM Limited and Contributors. All rights reserved.
+ * Copyright (c) 2015-2017, ARM Limited and Contributors. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -35,7 +35,7 @@
 #include <platform_def.h>
 #include "qemu_private.h"
 #include <string.h>
-
+#include <utils.h>
 
 /*
  * The next 2 constants identify the extents of the code & RO data region.
@@ -91,7 +91,7 @@
 	 * Initialise the memory for all the arguments that needs to
 	 * be passed to BL3-1
 	 */
-	memset(&bl31_params_mem, 0, sizeof(bl2_to_bl31_params_mem_t));
+	zeromem(&bl31_params_mem, sizeof(bl2_to_bl31_params_mem_t));
 
 	/* Assign memory for TF related information */
 	bl2_to_bl31_params = &bl31_params_mem.bl31_params;
diff --git a/plat/rockchip/rk3399/drivers/dram/dram_spec_timing.c b/plat/rockchip/rk3399/drivers/dram/dram_spec_timing.c
index fbf1d39..3f6ab2f 100644
--- a/plat/rockchip/rk3399/drivers/dram/dram_spec_timing.c
+++ b/plat/rockchip/rk3399/drivers/dram/dram_spec_timing.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, ARM Limited and Contributors. All rights reserved.
+ * Copyright (c) 2016-2017, ARM Limited and Contributors. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -31,6 +31,7 @@
 #include <string.h>
 #include <stdint.h>
 #include <dram.h>
+#include <utils.h>
 #include "dram_spec_timing.h"
 
 static const uint8_t ddr3_cl_cwl[][7] = {
@@ -228,7 +229,7 @@
 	uint32_t ddr_capability_per_die = get_max_die_capability(timing_config);
 	uint32_t tmp;
 
-	memset((void *)pdram_timing, 0, sizeof(struct dram_timing_t));
+	zeromem((void *)pdram_timing, sizeof(struct dram_timing_t));
 	pdram_timing->mhz = nmhz;
 	pdram_timing->al = 0;
 	pdram_timing->bl = timing_config->bl;
@@ -441,7 +442,7 @@
 	uint32_t ddr_capability_per_die = get_max_die_capability(timing_config);
 	uint32_t tmp, trp_tmp, trppb_tmp, tras_tmp, twr_tmp, bl_tmp;
 
-	memset((void *)pdram_timing, 0, sizeof(struct dram_timing_t));
+	zeromem((void *)pdram_timing, sizeof(struct dram_timing_t));
 	pdram_timing->mhz = nmhz;
 	pdram_timing->al = 0;
 	pdram_timing->bl = timing_config->bl;
@@ -678,7 +679,7 @@
 	uint32_t ddr_capability_per_die = get_max_die_capability(timing_config);
 	uint32_t tmp, trp_tmp, trppb_tmp, tras_tmp, twr_tmp, bl_tmp;
 
-	memset((void *)pdram_timing, 0, sizeof(struct dram_timing_t));
+	zeromem((void *)pdram_timing, sizeof(struct dram_timing_t));
 	pdram_timing->mhz = nmhz;
 	pdram_timing->al = 0;
 	pdram_timing->bl = timing_config->bl;
@@ -968,7 +969,7 @@
 	uint32_t ddr_capability_per_die = get_max_die_capability(timing_config);
 	uint32_t tmp, trp_tmp, trppb_tmp, tras_tmp;
 
-	memset((void *)pdram_timing, 0, sizeof(struct dram_timing_t));
+	zeromem((void *)pdram_timing, sizeof(struct dram_timing_t));
 	pdram_timing->mhz = nmhz;
 	pdram_timing->al = 0;
 	pdram_timing->bl = timing_config->bl;
diff --git a/plat/xilinx/zynqmp/pm_service/pm_client.c b/plat/xilinx/zynqmp/pm_service/pm_client.c
index e102b4f..0fe17b5 100644
--- a/plat/xilinx/zynqmp/pm_service/pm_client.c
+++ b/plat/xilinx/zynqmp/pm_service/pm_client.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013-2016, ARM Limited and Contributors. All rights reserved.
+ * Copyright (c) 2013-2017, ARM Limited and Contributors. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -40,6 +40,7 @@
 #include <bl_common.h>
 #include <mmio.h>
 #include <string.h>
+#include <utils.h>
 #include "pm_api_sys.h"
 #include "pm_client.h"
 #include "pm_ipi.h"
@@ -188,7 +189,7 @@
 	uint8_t pm_wakeup_nodes_set[NODE_MAX];
 	uintptr_t isenabler1 = BASE_GICD_BASE + GICD_ISENABLER + 4;
 
-	memset(&pm_wakeup_nodes_set, 0, sizeof(pm_wakeup_nodes_set));
+	zeromem(&pm_wakeup_nodes_set, sizeof(pm_wakeup_nodes_set));
 
 	for (reg_num = 0; reg_num < NUM_GICD_ISENABLER; reg_num++) {
 		uint32_t base_irq = reg_num << ISENABLER_SHIFT;
diff --git a/services/spd/opteed/opteed_common.c b/services/spd/opteed/opteed_common.c
index 2f20b7c..910f900 100644
--- a/services/spd/opteed/opteed_common.c
+++ b/services/spd/opteed/opteed_common.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013-2014, ARM Limited and Contributors. All rights reserved.
+ * Copyright (c) 2013-2017, ARM Limited and Contributors. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -33,6 +33,7 @@
 #include <bl_common.h>
 #include <context_mgmt.h>
 #include <string.h>
+#include <utils.h>
 #include "opteed_private.h"
 
 /*******************************************************************************
@@ -73,7 +74,7 @@
 						      DAIF_FIQ_BIT |
 							DAIF_IRQ_BIT |
 							DAIF_ABT_BIT);
-	memset(&optee_entry_point->args, 0, sizeof(optee_entry_point->args));
+	zeromem(&optee_entry_point->args, sizeof(optee_entry_point->args));
 }
 
 /*******************************************************************************
diff --git a/services/spd/tspd/tspd_common.c b/services/spd/tspd/tspd_common.c
index 3dcefea..70959d7 100644
--- a/services/spd/tspd/tspd_common.c
+++ b/services/spd/tspd/tspd_common.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013-2016, ARM Limited and Contributors. All rights reserved.
+ * Copyright (c) 2013-2017, ARM Limited and Contributors. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -35,6 +35,7 @@
 #include <debug.h>
 #include <string.h>
 #include <tsp.h>
+#include <utils.h>
 #include "tspd_private.h"
 
 /*******************************************************************************
@@ -78,7 +79,7 @@
 	tsp_entry_point->spsr = SPSR_64(MODE_EL1,
 					MODE_SP_ELX,
 					DISABLE_ALL_EXCEPTIONS);
-	memset(&tsp_entry_point->args, 0, sizeof(tsp_entry_point->args));
+	zeromem(&tsp_entry_point->args, sizeof(tsp_entry_point->args));
 }
 
 /*******************************************************************************
diff --git a/tools/cert_create/include/key.h b/tools/cert_create/include/key.h
index f60997f..433f72c 100644
--- a/tools/cert_create/include/key.h
+++ b/tools/cert_create/include/key.h
@@ -73,6 +73,7 @@
 /* Exported API */
 int key_init(void);
 key_t *key_get_by_opt(const char *opt);
+int key_new(key_t *key);
 int key_create(key_t *key, int type);
 int key_load(key_t *key, unsigned int *err_code);
 int key_store(key_t *key);
diff --git a/tools/cert_create/src/cert.c b/tools/cert_create/src/cert.c
index a559832..375c66b 100644
--- a/tools/cert_create/src/cert.c
+++ b/tools/cert_create/src/cert.c
@@ -103,10 +103,10 @@
 	cert_t *issuer_cert = &certs[cert->issuer];
 	EVP_PKEY *ikey = keys[issuer_cert->key].key;
 	X509 *issuer = issuer_cert->x;
-	X509 *x = NULL;
-	X509_EXTENSION *ex = NULL;
-	X509_NAME *name = NULL;
-	ASN1_INTEGER *sno = NULL;
+	X509 *x;
+	X509_EXTENSION *ex;
+	X509_NAME *name;
+	ASN1_INTEGER *sno;
 	int i, num;
 
 	/* Create the certificate structure */
@@ -202,7 +202,7 @@
 
 cert_t *cert_get_by_opt(const char *opt)
 {
-	cert_t *cert = NULL;
+	cert_t *cert;
 	unsigned int i;
 
 	for (i = 0; i < num_certs; i++) {
diff --git a/tools/cert_create/src/ext.c b/tools/cert_create/src/ext.c
index 3f56edb..a50919e 100644
--- a/tools/cert_create/src/ext.c
+++ b/tools/cert_create/src/ext.c
@@ -181,13 +181,13 @@
 X509_EXTENSION *ext_new_hash(int nid, int crit, const EVP_MD *md,
 		unsigned char *buf, size_t len)
 {
-	X509_EXTENSION *ex = NULL;
-	ASN1_OCTET_STRING *octet = NULL;
-	HASH *hash = NULL;
-	ASN1_OBJECT *algorithm = NULL;
-	X509_ALGOR *x509_algor = NULL;
+	X509_EXTENSION *ex;
+	ASN1_OCTET_STRING *octet;
+	HASH *hash;
+	ASN1_OBJECT *algorithm;
+	X509_ALGOR *x509_algor;
 	unsigned char *p = NULL;
-	int sz = -1;
+	int sz;
 
 	/* OBJECT_IDENTIFIER with hash algorithm */
 	algorithm = OBJ_nid2obj(md->type);
@@ -254,16 +254,15 @@
  */
 X509_EXTENSION *ext_new_nvcounter(int nid, int crit, int value)
 {
-	X509_EXTENSION *ex = NULL;
-	ASN1_INTEGER *counter = NULL;
+	X509_EXTENSION *ex;
+	ASN1_INTEGER *counter;
 	unsigned char *p = NULL;
-	int sz = -1;
+	int sz;
 
 	/* Encode counter */
 	counter = ASN1_INTEGER_new();
 	ASN1_INTEGER_set(counter, value);
-	sz = i2d_ASN1_INTEGER(counter, NULL);
-	i2d_ASN1_INTEGER(counter, &p);
+	sz = i2d_ASN1_INTEGER(counter, &p);
 
 	/* Create the extension */
 	ex = ext_new(nid, crit, p, sz);
@@ -292,9 +291,9 @@
  */
 X509_EXTENSION *ext_new_key(int nid, int crit, EVP_PKEY *k)
 {
-	X509_EXTENSION *ex = NULL;
-	unsigned char *p = NULL;
-	int sz = -1;
+	X509_EXTENSION *ex;
+	unsigned char *p;
+	int sz;
 
 	/* Encode key */
 	BIO *mem = BIO_new(BIO_s_mem());
@@ -316,7 +315,7 @@
 
 ext_t *ext_get_by_opt(const char *opt)
 {
-	ext_t *ext = NULL;
+	ext_t *ext;
 	unsigned int i;
 
 	/* Sequential search. This is not a performance concern since the number
diff --git a/tools/cert_create/src/key.c b/tools/cert_create/src/key.c
index a7ee759..ce0e4da 100644
--- a/tools/cert_create/src/key.c
+++ b/tools/cert_create/src/key.c
@@ -49,7 +49,7 @@
 /*
  * Create a new key container
  */
-static int key_new(key_t *key)
+int key_new(key_t *key)
 {
 	/* Create key pair container */
 	key->key = EVP_PKEY_new();
@@ -62,7 +62,7 @@
 
 static int key_create_rsa(key_t *key)
 {
-	RSA *rsa = NULL;
+	RSA *rsa;
 
 	rsa = RSA_generate_key(RSA_KEY_BITS, RSA_F4, NULL, NULL);
 	if (rsa == NULL) {
@@ -83,7 +83,7 @@
 #ifndef OPENSSL_NO_EC
 static int key_create_ecdsa(key_t *key)
 {
-	EC_KEY *ec = NULL;
+	EC_KEY *ec;
 
 	ec = EC_KEY_new_by_curve_name(NID_X9_62_prime256v1);
 	if (ec == NULL) {
@@ -123,11 +123,6 @@
 		return 0;
 	}
 
-	/* Create OpenSSL key container */
-	if (!key_new(key)) {
-		return 0;
-	}
-
 	if (key_create_fn[type]) {
 		return key_create_fn[type](key);
 	}
@@ -137,14 +132,8 @@
 
 int key_load(key_t *key, unsigned int *err_code)
 {
-	FILE *fp = NULL;
-	EVP_PKEY *k = NULL;
-
-	/* Create OpenSSL key container */
-	if (!key_new(key)) {
-		*err_code = KEY_ERR_MALLOC;
-		return 0;
-	}
+	FILE *fp;
+	EVP_PKEY *k;
 
 	if (key->fn) {
 		/* Load key from file */
@@ -173,7 +162,7 @@
 
 int key_store(key_t *key)
 {
-	FILE *fp = NULL;
+	FILE *fp;
 
 	if (key->fn) {
 		fp = fopen(key->fn, "w");
@@ -196,7 +185,6 @@
 {
 	cmd_opt_t cmd_opt;
 	key_t *key;
-	int rc = 0;
 	unsigned int i;
 
 	for (i = 0; i < num_keys; i++) {
@@ -211,12 +199,12 @@
 		}
 	}
 
-	return rc;
+	return 0;
 }
 
 key_t *key_get_by_opt(const char *opt)
 {
-	key_t *key = NULL;
+	key_t *key;
 	unsigned int i;
 
 	/* Sequential search. This is not a performance concern since the number
diff --git a/tools/cert_create/src/main.c b/tools/cert_create/src/main.c
index c58f41d..c9c9622 100644
--- a/tools/cert_create/src/main.c
+++ b/tools/cert_create/src/main.c
@@ -134,7 +134,6 @@
 	printf("\t%s [OPTIONS]\n\n", cmd);
 
 	printf("Available options:\n");
-	i = 0;
 	opt = long_opt;
 	while (opt->name) {
 		p = line;
@@ -261,12 +260,12 @@
 
 int main(int argc, char *argv[])
 {
-	STACK_OF(X509_EXTENSION) * sk = NULL;
-	X509_EXTENSION *cert_ext = NULL;
-	ext_t *ext = NULL;
-	key_t *key = NULL;
-	cert_t *cert = NULL;
-	FILE *file = NULL;
+	STACK_OF(X509_EXTENSION) * sk;
+	X509_EXTENSION *cert_ext;
+	ext_t *ext;
+	key_t *key;
+	cert_t *cert;
+	FILE *file;
 	int i, j, ext_nid, nvctr;
 	int c, opt_idx = 0;
 	const struct option *cmd_opt;
@@ -367,6 +366,11 @@
 
 	/* Load private keys from files (or generate new ones) */
 	for (i = 0 ; i < num_keys ; i++) {
+		if (!key_new(&keys[i])) {
+			ERROR("Failed to allocate key container\n");
+			exit(1);
+		}
+
 		/* First try to load the key from disk */
 		if (key_load(&keys[i], &err_code)) {
 			/* Key loaded successfully */
@@ -374,11 +378,7 @@
 		}
 
 		/* Key not loaded. Check the error code */
-		if (err_code == KEY_ERR_MALLOC) {
-			/* Cannot allocate memory. Abort. */
-			ERROR("Malloc error while loading '%s'\n", keys[i].fn);
-			exit(1);
-		} else if (err_code == KEY_ERR_LOAD) {
+		if (err_code == KEY_ERR_LOAD) {
 			/* File exists, but it does not contain a valid private
 			 * key. Abort. */
 			ERROR("Error loading '%s'\n", keys[i].fn);
diff --git a/tools/fiptool/fiptool.c b/tools/fiptool/fiptool.c
index 865aeae..f3f831b 100644
--- a/tools/fiptool/fiptool.c
+++ b/tools/fiptool/fiptool.c
@@ -52,8 +52,6 @@
 #define OPT_PLAT_TOC_FLAGS 1
 #define OPT_ALIGN 2
 
-static image_desc_t *lookup_image_desc_from_uuid(const uuid_t *uuid);
-static image_t *lookup_image_from_uuid(const uuid_t *uuid);
 static int info_cmd(int argc, char *argv[]);
 static void info_usage(void);
 static int create_cmd(int argc, char *argv[]);
@@ -822,11 +820,9 @@
 	printf("\n");
 	printf("Options:\n");
 	printf("  --align <value>\t\tEach image is aligned to <value> (default: 1).\n");
-	printf("  --blob uuid=...,file=...\tAdd an image with the given UUID "
-	    "pointed to by file.\n");
-	printf("  --plat-toc-flags <value>\t16-bit platform specific flag field "
-	    "occupying bits 32-47 in 64-bit ToC header.\n");
-	fputc('\n', stderr);
+	printf("  --blob uuid=...,file=...\tAdd an image with the given UUID pointed to by file.\n");
+	printf("  --plat-toc-flags <value>\t16-bit platform specific flag field occupying bits 32-47 in 64-bit ToC header.\n");
+	printf("\n");
 	printf("Specific images are packed with the following options:\n");
 	for (; toc_entry->cmdline_name != NULL; toc_entry++)
 		printf("  --%-16s FILENAME\t%s\n", toc_entry->cmdline_name,
@@ -938,12 +934,10 @@
 	printf("\n");
 	printf("Options:\n");
 	printf("  --align <value>\t\tEach image is aligned to <value> (default: 1).\n");
-	printf("  --blob uuid=...,file=...\tAdd or update an image "
-	    "with the given UUID pointed to by file.\n");
+	printf("  --blob uuid=...,file=...\tAdd or update an image with the given UUID pointed to by file.\n");
 	printf("  --out FIP_FILENAME\t\tSet an alternative output FIP file.\n");
-	printf("  --plat-toc-flags <value>\t16-bit platform specific flag field "
-	    "occupying bits 32-47 in 64-bit ToC header.\n");
-	fputc('\n', stderr);
+	printf("  --plat-toc-flags <value>\t16-bit platform specific flag field occupying bits 32-47 in 64-bit ToC header.\n");
+	printf("\n");
 	printf("Specific images are packed with the following options:\n");
 	for (; toc_entry->cmdline_name != NULL; toc_entry++)
 		printf("  --%-16s FILENAME\t%s\n", toc_entry->cmdline_name,
@@ -1076,17 +1070,15 @@
 	printf("fiptool unpack [opts] FIP_FILENAME\n");
 	printf("\n");
 	printf("Options:\n");
-	printf("  --blob uuid=...,file=...\tUnpack an image with the given UUID "
-	    "to file.\n");
-	printf("  --force\t\t\tIf the output file already exists, use --force to "
-	    "overwrite it.\n");
+	printf("  --blob uuid=...,file=...\tUnpack an image with the given UUID to file.\n");
+	printf("  --force\t\t\tIf the output file already exists, use --force to overwrite it.\n");
 	printf("  --out path\t\t\tSet the output directory path.\n");
-	fputc('\n', stderr);
+	printf("\n");
 	printf("Specific images are unpacked with the following options:\n");
 	for (; toc_entry->cmdline_name != NULL; toc_entry++)
 		printf("  --%-16s FILENAME\t%s\n", toc_entry->cmdline_name,
 		    toc_entry->name);
-	fputc('\n', stderr);
+	printf("\n");
 	printf("If no options are provided, all images will be unpacked.\n");
 	exit(1);
 }
@@ -1207,10 +1199,9 @@
 	printf("Options:\n");
 	printf("  --align <value>\tEach image is aligned to <value> (default: 1).\n");
 	printf("  --blob uuid=...\tRemove an image with the given UUID.\n");
-	printf("  --force\t\tIf the output FIP file already exists, use --force to "
-	    "overwrite it.\n");
+	printf("  --force\t\tIf the output FIP file already exists, use --force to overwrite it.\n");
 	printf("  --out FIP_FILENAME\tSet an alternative output FIP file.\n");
-	fputc('\n', stderr);
+	printf("\n");
 	printf("Specific images are removed with the following options:\n");
 	for (; toc_entry->cmdline_name != NULL; toc_entry++)
 		printf("  --%-16s\t%s\n", toc_entry->cmdline_name,
@@ -1258,7 +1249,7 @@
 	printf("usage: fiptool [--verbose] <command> [<args>]\n");
 	printf("Global options supported:\n");
 	printf("  --verbose\tEnable verbose output for all commands.\n");
-	fputc('\n', stderr);
+	printf("\n");
 	printf("Commands supported:\n");
 	printf("  info\t\tList images contained in FIP.\n");
 	printf("  create\tCreate a new FIP with the given images.\n");