From afb06baf48cade0bf4b40058a3f9d557f0fab2c4 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Thu, 25 Apr 2024 06:16:35 -0700
Subject: [PATCH] toplev: Use TMA 4.8 model for Meteorlake

---
 mtl_rwc_ratios.py | 573 ++++++++++++++++++++++++++++++----------------
 1 file changed, 372 insertions(+), 201 deletions(-)

diff --git a/mtl_rwc_ratios.py b/mtl_rwc_ratios.py
index 59ee604a..94fa8e07 100644
--- a/mtl_rwc_ratios.py
+++ b/mtl_rwc_ratios.py
@@ -1,6 +1,6 @@
 # -*- coding: latin-1 -*-
 #
-# auto generated TopDown/TMA 4.7-full-perf description for Intel 14th gen Core (code name Meteor Lake) with Redwood Cove
+# auto generated TopDown/TMA 4.8-full-perf description for Intel 14th gen Core (code name Meteor Lake) with Redwood Cove
 # Please see http://ark.intel.com for more details on these CPUs.
 #
 # References:
@@ -16,10 +16,13 @@
 print_error = lambda msg: False
 smt_enabled = False
 ebs_mode = False
-version = "4.7-full-perf"
+version = "4.8-full-perf"
 base_frequency = -1.0
 Memory = 0
 Average_Frequency = 0.0
+num_cores = 1
+num_threads = 1
+num_sockets = 1
 topdown_use_fixed = False
 
 def handle_error(obj, msg):
@@ -46,6 +49,8 @@ def handle_error_metric(obj, msg):
 OneMillion = 1000000
 OneBillion = 1000000000
 Energy_Unit = 61
+PERF_METRICS_MSR = 1
+DS = 0
 
 # Aux. formulas
 
@@ -54,7 +59,7 @@ def Br_DoI_Jumps(self, EV, level):
     return EV("BR_INST_RETIRED.NEAR_TAKEN", level) - EV("BR_INST_RETIRED.COND_TAKEN", level) - 2 * EV("BR_INST_RETIRED.NEAR_CALL", level)
 
 def Branching_Retired(self, EV, level):
-    return (EV("BR_INST_RETIRED.ALL_BRANCHES", level) + EV("BR_INST_RETIRED.NEAR_CALL", level)) / SLOTS(self, EV, level)
+    return (EV("BR_INST_RETIRED.ALL_BRANCHES", level) + 2 * EV("BR_INST_RETIRED.NEAR_CALL", level) + EV("INST_RETIRED.NOP", level)) / SLOTS(self, EV, level)
 
 def Serialize_Core(self, EV, level):
     return self.Core_Bound.compute(EV) * (self.Serializing_Operation.compute(EV) + EV("RS.EMPTY:u1", level) / CLKS(self, EV, level) * self.Ports_Utilized_0.compute(EV)) / (self.Serializing_Operation.compute(EV) + self.Ports_Utilization.compute(EV) + self.Divider.compute(EV))
@@ -99,11 +104,11 @@ def FLOP_Count(self, EV, level):
 
 # Floating Point computational (arithmetic) Operations Count
 def FP_Arith_Scalar(self, EV, level):
-    return EV("FP_ARITH_INST_RETIRED.SCALAR_SINGLE:u0x03", level)
+    return EV("FP_ARITH_INST_RETIRED.SCALAR", level)
 
 # Floating Point computational (arithmetic) Operations Count
 def FP_Arith_Vector(self, EV, level):
-    return EV("FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE:u0x3c", level)
+    return EV("FP_ARITH_INST_RETIRED.VECTOR", level)
 
 def HighIPC(self, EV, level):
     val = IPC(self, EV, level) / Pipeline_Width
@@ -160,7 +165,11 @@ def Retired_Slots(self, EV, level):
 
 # Number of logical processors (enabled or online) on the target system
 def Num_CPUs(self, EV, level):
-    return 24 if smt_enabled else 16
+    return num_cores * num_threads if num_cores else(8 + 16 /(2 - smt_enabled))
+
+# A system parameter for dependent-loads (pointer chasing like access pattern) of the workload. An integer fraction in range from 0 (no dependent loads) to 100 (all loads are dependent loads)
+def Dependent_Loads_Weight(self, EV, level):
+    return 20
 
 # Total pipeline cost of Branch Misprediction related bottlenecks
 def Mispredictions(self, EV, level):
@@ -174,7 +183,7 @@ def Big_Code(self, EV, level):
     self.thresh = (val > 20)
     return val
 
-# Total pipeline cost of instruction fetch bandwidth related bottlenecks
+# Total pipeline cost of instruction fetch bandwidth related bottlenecks (when the front-end could not sustain operations delivery to the back-end)
 def Instruction_Fetch_BW(self, EV, level):
     val = 100 *(self.Frontend_Bound.compute(EV) - (1 - Umisp(self, EV, level)) * self.Fetch_Latency.compute(EV) * self.Mispredicts_Resteers.compute(EV) / (self.LCP.compute(EV) + self.ICache_Misses.compute(EV) + self.DSB_Switches.compute(EV) + self.Branch_Resteers.compute(EV) + self.MS_Switches.compute(EV) + self.ITLB_Misses.compute(EV)) - Assist_Frontend(self, EV, level)) - Big_Code(self, EV, level)
     self.thresh = (val > 20)
@@ -182,19 +191,19 @@ def Instruction_Fetch_BW(self, EV, level):
 
 # Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks
 def Cache_Memory_Bandwidth(self, EV, level):
-    val = 100 *((self.Memory_Bound.compute(EV) * (self.DRAM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.MEM_Bandwidth.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.SQ_Full.compute(EV) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.FB_Full.compute(EV) / (self.Store_Fwd_Blk.compute(EV) + self.Split_Loads.compute(EV) + self.Lock_Latency.compute(EV) + self.FB_Full.compute(EV) + self.DTLB_Load.compute(EV)))))
+    val = 100 *((self.Memory_Bound.compute(EV) * (self.DRAM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.MEM_Bandwidth.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.SQ_Full.compute(EV) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.FB_Full.compute(EV) / (self.Store_Fwd_Blk.compute(EV) + self.L1_Hit_Latency.compute(EV) + self.DTLB_Load.compute(EV) + self.Lock_Latency.compute(EV) + self.Split_Loads.compute(EV) + self.FB_Full.compute(EV)))))
     self.thresh = (val > 20)
     return val
 
 # Total pipeline cost of external Memory- or Cache-Latency related bottlenecks
 def Cache_Memory_Latency(self, EV, level):
-    val = 100 *((self.Memory_Bound.compute(EV) * (self.DRAM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.MEM_Latency.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.L3_Hit_Latency.compute(EV) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * self.L2_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) + (self.Memory_Bound.compute(EV) * (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.Store_Latency.compute(EV) / (self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Streaming_Stores.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV)))))
+    val = 100 *((self.Memory_Bound.compute(EV) * (self.DRAM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.MEM_Latency.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.L3_Hit_Latency.compute(EV) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * self.L2_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) + (self.Memory_Bound.compute(EV) * (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.Store_Latency.compute(EV) / (self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Streaming_Stores.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.L1_Hit_Latency.compute(EV) / (self.Store_Fwd_Blk.compute(EV) + self.L1_Hit_Latency.compute(EV) + self.DTLB_Load.compute(EV) + self.Lock_Latency.compute(EV) + self.Split_Loads.compute(EV) + self.FB_Full.compute(EV)))))
     self.thresh = (val > 20)
     return val
 
 # Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs)
 def Memory_Data_TLBs(self, EV, level):
-    val = 100 *(self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.DTLB_Load.compute(EV) / (self.Store_Fwd_Blk.compute(EV) + self.Split_Loads.compute(EV) + self.Lock_Latency.compute(EV) + self.FB_Full.compute(EV) + self.DTLB_Load.compute(EV))) + (self.Memory_Bound.compute(EV) * (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.DTLB_Store.compute(EV) / (self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Streaming_Stores.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV)))))
+    val = 100 *(self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.DTLB_Load.compute(EV) / (self.Store_Fwd_Blk.compute(EV) + self.L1_Hit_Latency.compute(EV) + self.DTLB_Load.compute(EV) + self.Lock_Latency.compute(EV) + self.Split_Loads.compute(EV) + self.FB_Full.compute(EV))) + (self.Memory_Bound.compute(EV) * (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.DTLB_Store.compute(EV) / (self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Streaming_Stores.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV)))))
     self.thresh = (val > 20)
     return val
 
@@ -216,24 +225,30 @@ def Irregular_Overhead(self, EV, level):
     self.thresh = (val > 10)
     return val
 
-# Total pipeline cost of remaining bottlenecks (apart from those listed in the Info.Bottlenecks metrics class). Examples include data-dependencies (Core Bound when Low ILP) and other unlisted memory-related stalls.
+# Total pipeline cost of remaining bottlenecks in the back-end. Examples include data-dependencies (Core Bound when Low ILP) and other unlisted memory-related stalls.
 def Other_Bottlenecks(self, EV, level):
-    val = 100 -(Big_Code(self, EV, level) + Instruction_Fetch_BW(self, EV, level) + Mispredictions(self, EV, level) + Cache_Memory_Bandwidth(self, EV, level) + Cache_Memory_Latency(self, EV, level) + Memory_Data_TLBs(self, EV, level) + Memory_Synchronization(self, EV, level) + Compute_Bound_Est(self, EV, level) + Irregular_Overhead(self, EV, level) + Branching_Overhead(self, EV, level) + Base_Non_Br(self, EV, level))
+    val = 100 -(Big_Code(self, EV, level) + Instruction_Fetch_BW(self, EV, level) + Mispredictions(self, EV, level) + Cache_Memory_Bandwidth(self, EV, level) + Cache_Memory_Latency(self, EV, level) + Memory_Data_TLBs(self, EV, level) + Memory_Synchronization(self, EV, level) + Compute_Bound_Est(self, EV, level) + Irregular_Overhead(self, EV, level) + Branching_Overhead(self, EV, level) + Useful_Work(self, EV, level))
     self.thresh = (val > 20)
     return val
 
-# Total pipeline cost of branch related instructions (used for program control-flow including function calls)
+# Total pipeline cost of instructions used for program control-flow - a subset of the Retiring category in TMA. Examples include function calls; loops and alignments. (A lower bound). Consider Loop Unrolling or function inlining optimizations
 def Branching_Overhead(self, EV, level):
     val = 100 * Branching_Retired(self, EV, level)
     self.thresh = (val > 5)
     return val
 
-# Total pipeline cost of "useful operations" - the baseline operations not covered by Branching_Overhead nor Irregular_Overhead.
-def Base_Non_Br(self, EV, level):
+# Total pipeline cost of "useful operations" - the portion of Retiring category not covered by Branching_Overhead nor Irregular_Overhead.
+def Useful_Work(self, EV, level):
     val = 100 *(self.Retiring.compute(EV) - Branching_Retired(self, EV, level) - Assist_Retired(self, EV, level))
     self.thresh = (val > 20)
     return val
 
+# Probability of Core Bound bottleneck hidden by SMT-profiling artifacts. Tip: consider analysis with SMT disabled
+def Core_Bound_Likely(self, EV, level):
+    val = 100 *(1 - self.Core_Bound.compute(EV) / self.Ports_Utilization.compute(EV) if self.Core_Bound.compute(EV)< self.Ports_Utilization.compute(EV) else 1) if SMT_2T_Utilization(self, EV, level)> 0.5 else 0
+    self.thresh = (val > 0.5)
+    return val
+
 # Instructions Per Cycle (per Logical Processor)
 def IPC(self, EV, level):
     return EV("INST_RETIRED.ANY", level) / CLKS(self, EV, level)
@@ -244,7 +259,7 @@ def UopPI(self, EV, level):
     self.thresh = (val > 1.05)
     return val
 
-# Instruction per taken branch
+# Uops per taken branch
 def UpTB(self, EV, level):
     val = Retired_Slots(self, EV, level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level)
     self.thresh = val < Pipeline_Width * 1.5
@@ -290,12 +305,6 @@ def ILP(self, EV, level):
 def EPC(self, EV, level):
     return EV("UOPS_EXECUTED.THREAD", level) / CLKS(self, EV, level)
 
-# Probability of Core Bound bottleneck hidden by SMT-profiling artifacts. Tip: consider analysis with SMT disabled
-def Core_Bound_Likely(self, EV, level):
-    val = 100 *(1 - self.Core_Bound.compute(EV) / self.Ports_Utilization.compute(EV) if self.Core_Bound.compute(EV)< self.Ports_Utilization.compute(EV) else 1) if SMT_2T_Utilization(self, EV, level)> 0.5 else 0
-    self.thresh = (val > 0.5)
-    return val
-
 # Core actual clocks when any Logical Processor is active on the Physical Core
 def CORE_CLKS(self, EV, level):
     return EV("CPU_CLK_UNHALTED.DISTRIBUTED", level) if smt_enabled else CLKS(self, EV, level)
@@ -324,7 +333,7 @@ def IpCall(self, EV, level):
     self.thresh = (val < 200)
     return val
 
-# Instruction per taken branch
+# Instructions per taken branch
 def IpTB(self, EV, level):
     val = EV("INST_RETIRED.ANY", level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level)
     self.thresh = val < Pipeline_Width * 2 + 1
@@ -400,10 +409,21 @@ def IpAssist(self, EV, level):
     self.thresh = (val < 100000)
     return val
 
-# Instruction-Level-Parallelism (average number of uops executed when there is execution) per physical core
 def Execute(self, EV, level):
     return EV("UOPS_EXECUTED.THREAD", level) / Execute_Cycles(self, EV, level)
 
+# Average number of uops fetched from LSD per cycle
+def Fetch_LSD(self, EV, level):
+    return EV("LSD.UOPS", level) / EV("LSD.CYCLES_ACTIVE", level)
+
+# Average number of uops fetched from DSB per cycle
+def Fetch_DSB(self, EV, level):
+    return EV("IDQ.DSB_UOPS", level) / EV("IDQ.DSB_CYCLES_ANY", level)
+
+# Average number of uops fetched from MITE per cycle
+def Fetch_MITE(self, EV, level):
+    return EV("IDQ.MITE_UOPS", level) / EV("IDQ.MITE_CYCLES_ANY", level)
+
 # Average number of Uops issued by front-end when it issued something
 def Fetch_UpC(self, EV, level):
     return EV("UOPS_ISSUED.ANY", level) / EV("UOPS_ISSUED.ANY:c1", level)
@@ -432,6 +452,12 @@ def DSB_Misses(self, EV, level):
     self.thresh = (val > 10)
     return val
 
+# Total pipeline cost of DSB (uop cache) hits - subset of the Instruction_Fetch_BW Bottleneck.
+def DSB_Bandwidth(self, EV, level):
+    val = 100 *(self.Frontend_Bound.compute(EV) * (self.Fetch_Bandwidth.compute(EV) / (self.Fetch_Bandwidth.compute(EV) + self.Fetch_Latency.compute(EV))) * (self.DSB.compute(EV) / (self.LSD.compute(EV) + self.MITE.compute(EV) + self.DSB.compute(EV))))
+    self.thresh = (val > 10)
+    return val
+
 # Average Latency for L1 instruction cache misses
 def ICache_Miss_Latency(self, EV, level):
     return EV("ICACHE_DATA.STALLS", level) / EV("ICACHE_DATA.STALLS:c1:e1", level)
@@ -466,25 +492,25 @@ def IpMispredict(self, EV, level):
     self.thresh = (val < 200)
     return val
 
-# Instructions per retired mispredicts for conditional non-taken branches (lower number means higher occurrence rate).
+# Instructions per retired Mispredicts for conditional non-taken branches (lower number means higher occurrence rate).
 def IpMisp_Cond_Ntaken(self, EV, level):
     val = EV("INST_RETIRED.ANY", level) / EV("BR_MISP_RETIRED.COND_NTAKEN", level)
     self.thresh = (val < 200)
     return val
 
-# Instructions per retired mispredicts for conditional taken branches (lower number means higher occurrence rate).
+# Instructions per retired Mispredicts for conditional taken branches (lower number means higher occurrence rate).
 def IpMisp_Cond_Taken(self, EV, level):
     val = EV("INST_RETIRED.ANY", level) / EV("BR_MISP_RETIRED.COND_TAKEN", level)
     self.thresh = (val < 200)
     return val
 
-# Instructions per retired mispredicts for return branches (lower number means higher occurrence rate).
+# Instructions per retired Mispredicts for return branches (lower number means higher occurrence rate).
 def IpMisp_Ret(self, EV, level):
     val = EV("INST_RETIRED.ANY", level) / EV("BR_MISP_RETIRED.RET", level)
     self.thresh = (val < 500)
     return val
 
-# Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).
+# Instructions per retired Mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).
 def IpMisp_Indirect(self, EV, level):
     val = EV("INST_RETIRED.ANY", level) / EV("BR_MISP_RETIRED.INDIRECT", level)
     self.thresh = (val < 1000)
@@ -494,7 +520,7 @@ def IpMisp_Indirect(self, EV, level):
 def Branch_Misprediction_Cost(self, EV, level):
     return Mispredictions(self, EV, level) * SLOTS(self, EV, level) / EV("BR_MISP_RETIRED.ALL_BRANCHES", level) / 100
 
-# Speculative to Retired ratio of all clears (covering mispredicts and nukes)
+# Speculative to Retired ratio of all clears (covering Mispredicts and nukes)
 def Spec_Clears_Ratio(self, EV, level):
     return EV("INT_MISC.CLEARS_COUNT", level) / (EV("BR_MISP_RETIRED.ALL_BRANCHES", level) + EV("MACHINE_CLEARS.COUNT", level))
 
@@ -546,6 +572,10 @@ def L2MPKI_All(self, EV, level):
 def L2MPKI_Load(self, EV, level):
     return 1000 * EV("L2_RQSTS.DEMAND_DATA_RD_MISS", level) / EV("INST_RETIRED.ANY", level)
 
+# Offcore requests (L2 cache miss) per kilo instruction for demand RFOs
+def L2MPKI_RFO(self, EV, level):
+    return 1000 * EV("L2_RQSTS.RFO_MISS", level) / EV("INST_RETIRED.ANY", level)
+
 # L2 cache hits per kilo instruction for all request types (including speculative)
 def L2HPKI_All(self, EV, level):
     return 1000 *(EV("L2_RQSTS.REFERENCES", level) - EV("L2_RQSTS.MISS", level)) / EV("INST_RETIRED.ANY", level)
@@ -562,19 +592,15 @@ def L3MPKI(self, EV, level):
 def FB_HPKI(self, EV, level):
     return 1000 * EV("MEM_LOAD_RETIRED.FB_HIT", level) / EV("INST_RETIRED.ANY", level)
 
-# Average per-thread data fill bandwidth to the L1 data cache [GB / sec]
 def L1D_Cache_Fill_BW(self, EV, level):
     return 64 * EV("L1D.REPLACEMENT", level) / OneBillion / Time(self, EV, level)
 
-# Average per-thread data fill bandwidth to the L2 cache [GB / sec]
 def L2_Cache_Fill_BW(self, EV, level):
     return 64 * EV("L2_LINES_IN.ALL", level) / OneBillion / Time(self, EV, level)
 
-# Average per-thread data fill bandwidth to the L3 cache [GB / sec]
 def L3_Cache_Fill_BW(self, EV, level):
     return 64 * EV("LONGEST_LAT_CACHE.MISS", level) / OneBillion / Time(self, EV, level)
 
-# Average per-thread data access bandwidth to the L3 cache [GB / sec]
 def L3_Cache_Access_BW(self, EV, level):
     return 64 * EV("OFFCORE_REQUESTS.ALL_REQUESTS", level) / OneBillion / Time(self, EV, level)
 
@@ -638,17 +664,17 @@ def Bus_Lock_PKI(self, EV, level):
 
 # Average CPU Utilization (percentage)
 def CPU_Utilization(self, EV, level):
-    return EV("CPU_CLK_UNHALTED.REF_TSC", level) / EV("msr/tsc/", 0)
+    return CPUs_Utilized(self, EV, level) / Num_CPUs(self, EV, level)
 
 # Average number of utilized CPUs
 def CPUs_Utilized(self, EV, level):
-    return Num_CPUs(self, EV, level) * CPU_Utilization(self, EV, level)
+    return EV("CPU_CLK_UNHALTED.REF_TSC", level) / EV("msr/tsc/", 0)
 
 # Measured Average Core Frequency for unhalted processors [GHz]
 def Core_Frequency(self, EV, level):
     return Turbo_Utilization(self, EV, level) * EV("msr/tsc/", 0) / OneBillion / Time(self, EV, level)
 
-# Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width  .
+# Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width
 def GFLOPs(self, EV, level):
     return (FLOP_Count(self, EV, level) / OneBillion) / Time(self, EV, level)
 
@@ -716,7 +742,7 @@ class Frontend_Bound:
     sample = ['FRONTEND_RETIRED.LATENCY_GE_4:pp']
     errcount = 0
     sibling = None
-    metricgroup = frozenset(['TmaL1', 'PGO'])
+    metricgroup = frozenset(['BvFB', 'BvIO', 'TmaL1', 'PGO'])
     maxval = None
     def compute(self, EV):
         try:
@@ -778,7 +804,7 @@ class ICache_Misses:
     sample = ['FRONTEND_RETIRED.L2_MISS:pp', 'FRONTEND_RETIRED.L1I_MISS:pp']
     errcount = 0
     sibling = None
-    metricgroup = frozenset(['BigFootprint', 'FetchLat', 'IcMiss'])
+    metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat', 'IcMiss'])
     maxval = None
     def compute(self, EV):
         try:
@@ -803,7 +829,7 @@ class ITLB_Misses:
     sample = ['FRONTEND_RETIRED.STLB_MISS:pp', 'FRONTEND_RETIRED.ITLB_MISS:pp']
     errcount = 0
     sibling = None
-    metricgroup = frozenset(['BigFootprint', 'FetchLat', 'MemoryTLB'])
+    metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat', 'MemoryTLB'])
     maxval = None
     def compute(self, EV):
         try:
@@ -860,7 +886,7 @@ class Mispredicts_Resteers:
     sample = ['INT_MISC.CLEAR_RESTEER_CYCLES']
     errcount = 0
     sibling = None
-    metricgroup = frozenset(['BadSpec', 'BrMispredicts'])
+    metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'BvMP'])
     maxval = None
     def compute(self, EV):
         try:
@@ -908,7 +934,7 @@ class Unknown_Branches:
     sample = ['FRONTEND_RETIRED.UNKNOWN_BRANCH']
     errcount = 0
     sibling = None
-    metricgroup = frozenset(['BigFootprint', 'FetchLat'])
+    metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat'])
     maxval = None
     def compute(self, EV):
         try:
@@ -935,7 +961,7 @@ class MS_Switches:
     errcount = 0
     sibling = None
     metricgroup = frozenset(['FetchLat', 'MicroSeq'])
-    maxval = 1
+    maxval = 1.0
     def compute(self, EV):
         try:
             self.val = MS_Switches_Cost * EV("UOPS_RETIRED.MS:c1:e1", 3) / Retire_Fraction(self, EV, 3) / CLKS(self, EV, 3)
@@ -1189,7 +1215,7 @@ class Branch_Mispredicts:
     sample = ['TOPDOWN.BR_MISPREDICT_SLOTS']
     errcount = 0
     sibling = None
-    metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'TmaL2'])
+    metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'BvMP', 'TmaL2'])
     maxval = None
     def compute(self, EV):
         try:
@@ -1220,7 +1246,7 @@ class Other_Mispredicts:
     sample = []
     errcount = 0
     sibling = None
-    metricgroup = frozenset(['BrMispredicts'])
+    metricgroup = frozenset(['BvIO', 'BrMispredicts'])
     maxval = None
     def compute(self, EV):
         try:
@@ -1244,7 +1270,7 @@ class Machine_Clears:
     sample = ['MACHINE_CLEARS.COUNT']
     errcount = 0
     sibling = None
-    metricgroup = frozenset(['BadSpec', 'MachineClears', 'TmaL2'])
+    metricgroup = frozenset(['BadSpec', 'BvMS', 'MachineClears', 'TmaL2'])
     maxval = None
     def compute(self, EV):
         try:
@@ -1274,7 +1300,7 @@ class Other_Nukes:
     sample = []
     errcount = 0
     sibling = None
-    metricgroup = frozenset(['Machine_Clears'])
+    metricgroup = frozenset(['BvIO', 'Machine_Clears'])
     maxval = None
     def compute(self, EV):
         try:
@@ -1298,7 +1324,7 @@ class Backend_Bound:
     sample = ['TOPDOWN.BACKEND_BOUND_SLOTS']
     errcount = 0
     sibling = None
-    metricgroup = frozenset(['TmaL1'])
+    metricgroup = frozenset(['BvOB', 'TmaL1'])
     maxval = None
     def compute(self, EV):
         try:
@@ -1390,8 +1416,8 @@ class DTLB_Load:
     sample = ['MEM_INST_RETIRED.STLB_MISS_LOADS:pp']
     errcount = 0
     sibling = None
-    metricgroup = frozenset(['MemoryTLB'])
-    maxval = None
+    metricgroup = frozenset(['BvMT', 'MemoryTLB'])
+    maxval = 1.0
     def compute(self, EV):
         try:
             self.val = EV("MEM_INST_RETIRED.STLB_HIT_LOADS", 4) * min(EV("MEM_INST_RETIRED.STLB_HIT_LOADS", 999) , Mem_STLB_Hit_Cost) / CLKS(self, EV, 4) + self.Load_STLB_Miss.compute(EV)
@@ -1422,7 +1448,7 @@ class Load_STLB_Hit:
     errcount = 0
     sibling = None
     metricgroup = frozenset(['MemoryTLB'])
-    maxval = None
+    maxval = 1.0
     def compute(self, EV):
         try:
             self.val = max(0 , self.DTLB_Load.compute(EV) - self.Load_STLB_Miss.compute(EV))
@@ -1446,7 +1472,7 @@ class Load_STLB_Miss:
     errcount = 0
     sibling = None
     metricgroup = frozenset(['MemoryTLB'])
-    maxval = None
+    maxval = 1.0
     def compute(self, EV):
         try:
             self.val = EV("DTLB_LOAD_MISSES.WALK_ACTIVE", 5) / CLKS(self, EV, 5)
@@ -1470,7 +1496,7 @@ class Store_Fwd_Blk:
     errcount = 0
     sibling = None
     metricgroup = frozenset([])
-    maxval = 1
+    maxval = 1.0
     def compute(self, EV):
         try:
             self.val = 13 * EV("LD_BLOCKS.STORE_FORWARD", 4) / CLKS(self, EV, 4)
@@ -1491,17 +1517,42 @@ def compute(self, EV):
 region than the load is reading."""
 
 
+class L1_Hit_Latency:
+    name = "L1_Hit_Latency"
+    domain = "Clocks_Estimated"
+    area = "BE/Mem"
+    level = 4
+    htoff = False
+    sample = ['MEM_LOAD_RETIRED.L1_HIT']
+    errcount = 0
+    sibling = None
+    metricgroup = frozenset(['BvML', 'MemoryLat'])
+    maxval = 1.0
+    def compute(self, EV):
+        try:
+            self.val = min(2 *(EV("MEM_INST_RETIRED.ALL_LOADS", 4) - EV("MEM_LOAD_RETIRED.FB_HIT", 4) - EV("MEM_LOAD_RETIRED.L1_MISS", 4)) * Dependent_Loads_Weight(self, EV, 4) / 100 , max(EV("CYCLE_ACTIVITY.CYCLES_MEM_ANY", 4) - EV("MEMORY_ACTIVITY.CYCLES_L1D_MISS", 4) , 0)) / CLKS(self, EV, 4)
+            self.thresh = (self.val > 0.1) and self.parent.thresh
+        except ZeroDivisionError:
+            handle_error(self, "L1_Hit_Latency zero division")
+        return self.val
+    desc = """
+This metric roughly estimates fraction of cycles with demand
+load accesses that hit the L1 cache. The short latency of
+the L1 data cache may be exposed in pointer-chasing memory
+access patterns as an example."""
+
+
 class Lock_Latency:
     name = "Lock_Latency"
     domain = "Clocks"
     area = "BE/Mem"
     level = 4
     htoff = False
-    sample = ['MEM_INST_RETIRED.LOCK_LOADS:pp']
+    sample = ['MEM_INST_RETIRED.LOCK_LOADS']
     errcount = 0
     sibling = None
     metricgroup = frozenset(['Offcore'])
-    maxval = 1
+    maxval = 1.0
     def compute(self, EV):
         try:
             self.val = EV("MEM_INST_RETIRED.LOCK_LOADS", 4) * EV("MEM_INST_RETIRED.LOCK_LOADS", 999) / CLKS(self, EV, 4)
@@ -1526,7 +1577,7 @@ class Split_Loads:
     errcount = 0
     sibling = None
     metricgroup = frozenset([])
-    maxval = 1
+    maxval = 1.0
     def compute(self, EV):
         try:
             self.val = EV("MEM_INST_RETIRED.SPLIT_LOADS", 4) * min(EV("MEM_INST_RETIRED.SPLIT_LOADS", 999) , Load_Miss_Real_Latency(self, EV, 4)) / CLKS(self, EV, 4)
@@ -1550,7 +1601,7 @@ class FB_Full:
     sample = []
     errcount = 0
     sibling = None
-    metricgroup = frozenset(['MemoryBW'])
+    metricgroup = frozenset(['BvMB', 'MemoryBW'])
     maxval = None
     def compute(self, EV):
         try:
@@ -1579,7 +1630,7 @@ class L2_Bound:
     sample = ['MEM_LOAD_RETIRED.L2_HIT:pp']
     errcount = 0
     sibling = None
-    metricgroup = frozenset(['CacheHits', 'MemoryBound', 'TmaL3mem'])
+    metricgroup = frozenset(['BvML', 'CacheHits', 'MemoryBound', 'TmaL3mem'])
     maxval = None
     def compute(self, EV):
         try:
@@ -1629,8 +1680,8 @@ class Contested_Accesses:
     sample = ['MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD', 'MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS']
     errcount = 0
     sibling = None
-    metricgroup = frozenset(['DataSharing', 'Offcore', 'Snoop'])
-    maxval = 1
+    metricgroup = frozenset(['BvMS', 'DataSharing', 'Offcore', 'Snoop'])
+    maxval = 1.0
     def compute(self, EV):
         try:
             self.val = (EV("MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS", 4) * min(EV("MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS", 999) , Mem_XSNP_Hit_Cost(self, EV, 4) - Mem_L2_Hit_Cost(self, EV, 4)) + EV("MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD", 4) * min(EV("MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD", 999) , Mem_XSNP_HitM_Cost(self, EV, 4) - Mem_L2_Hit_Cost(self, EV, 4)) * True_XSNP_HitM_Fraction(self, EV, 4)) * FB_Factor(self, EV, 4) / CLKS(self, EV, 4)
@@ -1657,8 +1708,8 @@ class Data_Sharing:
     sample = ['MEM_LOAD_L3_HIT_RETIRED.XSNP_NO_FWD']
     errcount = 0
     sibling = None
-    metricgroup = frozenset(['Offcore', 'Snoop'])
-    maxval = 1
+    metricgroup = frozenset(['BvMS', 'Offcore', 'Snoop'])
+    maxval = 1.0
     def compute(self, EV):
         try:
             self.val = (EV("MEM_LOAD_L3_HIT_RETIRED.XSNP_NO_FWD", 4) * min(EV("MEM_LOAD_L3_HIT_RETIRED.XSNP_NO_FWD", 999) , Mem_XSNP_Hit_Cost(self, EV, 4) - Mem_L2_Hit_Cost(self, EV, 4)) + EV("MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD", 4) * min(EV("MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD", 999) , Mem_XSNP_Hit_Cost(self, EV, 4) - Mem_L2_Hit_Cost(self, EV, 4)) * (1 - True_XSNP_HitM_Fraction(self, EV, 4))) * FB_Factor(self, EV, 4) / CLKS(self, EV, 4)
@@ -1684,8 +1735,8 @@ class L3_Hit_Latency:
     sample = ['MEM_LOAD_RETIRED.L3_HIT:pp']
     errcount = 0
     sibling = None
-    metricgroup = frozenset(['MemoryLat'])
-    maxval = 1
+    metricgroup = frozenset(['BvML', 'MemoryLat'])
+    maxval = 1.0
     def compute(self, EV):
         try:
             self.val = EV("MEM_LOAD_RETIRED.L3_HIT", 4) * min(EV("MEM_LOAD_RETIRED.L3_HIT", 999) , Mem_XSNP_None_Cost(self, EV, 4) - Mem_L2_Hit_Cost(self, EV, 4)) * FB_Factor(self, EV, 4) / CLKS(self, EV, 4)
@@ -1712,7 +1763,7 @@ class SQ_Full:
     sample = []
     errcount = 0
     sibling = None
-    metricgroup = frozenset(['MemoryBW', 'Offcore'])
+    metricgroup = frozenset(['BvMB', 'MemoryBW', 'Offcore'])
     maxval = None
     def compute(self, EV):
         try:
@@ -1737,7 +1788,7 @@ class DRAM_Bound:
     errcount = 0
     sibling = None
     metricgroup = frozenset(['MemoryBound', 'TmaL3mem'])
-    maxval = 1
+    maxval = 1.0
     def compute(self, EV):
         try:
             self.val = MEM_Bound_Ratio(self, EV, 3)
@@ -1760,7 +1811,7 @@ class MEM_Bandwidth:
     sample = []
     errcount = 0
     sibling = None
-    metricgroup = frozenset(['MemoryBW', 'Offcore'])
+    metricgroup = frozenset(['BvMB', 'MemoryBW', 'Offcore'])
     maxval = None
     def compute(self, EV):
         try:
@@ -1798,7 +1849,7 @@ class MEM_Latency:
     sample = []
     errcount = 0
     sibling = None
-    metricgroup = frozenset(['MemoryLat', 'Offcore'])
+    metricgroup = frozenset(['BvML', 'MemoryLat', 'Offcore'])
     maxval = None
     def compute(self, EV):
         try:
@@ -1854,8 +1905,8 @@ class Store_Latency:
     sample = []
     errcount = 0
     sibling = None
-    metricgroup = frozenset(['MemoryLat', 'Offcore'])
-    maxval = 1
+    metricgroup = frozenset(['BvML', 'MemoryLat', 'Offcore'])
+    maxval = 1.0
     def compute(self, EV):
         try:
             self.val = (Store_L2_Hit_Cycles(self, EV, 4) + (1 - Mem_Lock_St_Fraction(self, EV, 4)) * ORO_Demand_RFO_C1(self, EV, 4)) / CLKS(self, EV, 4)
@@ -1882,8 +1933,8 @@ class False_Sharing:
     sample = ['OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM']
     errcount = 0
     sibling = None
-    metricgroup = frozenset(['DataSharing', 'Offcore', 'Snoop'])
-    maxval = 1
+    metricgroup = frozenset(['BvMS', 'DataSharing', 'Offcore', 'Snoop'])
+    maxval = 1.0
     def compute(self, EV):
         try:
             self.val = Mem_XSNP_HitM_Cost(self, EV, 4) * EV("OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM", 4) / CLKS(self, EV, 4)
@@ -1934,10 +1985,11 @@ class Streaming_Stores:
     errcount = 0
     sibling = None
     metricgroup = frozenset(['MemoryBW', 'Offcore'])
-    maxval = 1
+    maxval = 1.0
     def compute(self, EV):
         try:
-            self.val = 9 * EV("OCR.STREAMING_WR.ANY_RESPONSE", 4) / CLKS(self, EV, 4)
+            self.val = 9 * EV("OCR.STREAMING_WR.ANY_RESPONSE", 4) / CLKS(self, EV, 4) if DS else 0
+            EV("OCR.STREAMING_WR.ANY_RESPONSE", 4)
             self.thresh = (self.val > 0.2) and self.parent.thresh
         except ZeroDivisionError:
             handle_error(self, "Streaming_Stores zero division")
@@ -1961,8 +2013,8 @@ class DTLB_Store:
     sample = ['MEM_INST_RETIRED.STLB_MISS_STORES:pp']
     errcount = 0
     sibling = None
-    metricgroup = frozenset(['MemoryTLB'])
-    maxval = 1
+    metricgroup = frozenset(['BvMT', 'MemoryTLB'])
+    maxval = 1.0
     def compute(self, EV):
         try:
             self.val = EV("MEM_INST_RETIRED.STLB_HIT_STORES", 4) * min(EV("MEM_INST_RETIRED.STLB_HIT_STORES", 999) , Mem_STLB_Hit_Cost) / CLKS(self, EV, 4) + self.Store_STLB_Miss.compute(EV)
@@ -1991,7 +2043,7 @@ class Store_STLB_Hit:
     errcount = 0
     sibling = None
     metricgroup = frozenset(['MemoryTLB'])
-    maxval = 1
+    maxval = 1.0
     def compute(self, EV):
         try:
             self.val = max(0 , self.DTLB_Store.compute(EV) - self.Store_STLB_Miss.compute(EV))
@@ -2015,7 +2067,7 @@ class Store_STLB_Miss:
     errcount = 0
     sibling = None
     metricgroup = frozenset(['MemoryTLB'])
-    maxval = 1
+    maxval = 1.0
     def compute(self, EV):
         try:
             self.val = EV("DTLB_STORE_MISSES.WALK_ACTIVE", 5) / CORE_CLKS(self, EV, 5)
@@ -2069,8 +2121,8 @@ class Divider:
     sample = ['ARITH.DIV_ACTIVE']
     errcount = 0
     sibling = None
-    metricgroup = frozenset([])
-    maxval = None
+    metricgroup = frozenset(['BvCB'])
+    maxval = 1.0
     def compute(self, EV):
         try:
             self.val = EV("ARITH.DIV_ACTIVE", 3) / CLKS(self, EV, 3)
@@ -2095,7 +2147,7 @@ class Serializing_Operation:
     sample = ['RESOURCE_STALLS.SCOREBOARD']
     errcount = 0
     sibling = None
-    metricgroup = frozenset(['PortsUtil'])
+    metricgroup = frozenset(['BvIO', 'PortsUtil'])
     maxval = None
     def compute(self, EV):
         try:
@@ -2192,7 +2244,7 @@ class Memory_Fence:
     errcount = 0
     sibling = None
     metricgroup = frozenset([])
-    maxval = 1
+    maxval = 1.0
     def compute(self, EV):
         try:
             self.val = 13 * EV("MISC2_RETIRED.LFENCE", 4) / CLKS(self, EV, 4)
@@ -2254,7 +2306,7 @@ class Ports_Utilized_0:
     maxval = None
     def compute(self, EV):
         try:
-            self.val = max((EV("EXE_ACTIVITY.3_PORTS_UTIL:u0x80", 4) + EV("RS.EMPTY_RESOURCE", 4)) / CLKS(self, EV, 4) , 1) * (EV("CYCLE_ACTIVITY.STALLS_TOTAL", 4) - EV("EXE_ACTIVITY.BOUND_ON_LOADS", 4)) / CLKS(self, EV, 4)
+            self.val = max((EV("EXE_ACTIVITY.EXE_BOUND_0_PORTS", 4) + max(EV("RS.EMPTY_RESOURCE", 4) - EV("RESOURCE_STALLS.SCOREBOARD", 4) , 0)) / CLKS(self, EV, 4) , 1) * (EV("CYCLE_ACTIVITY.STALLS_TOTAL", 4) - EV("EXE_ACTIVITY.BOUND_ON_LOADS", 4)) / CLKS(self, EV, 4)
             self.thresh = (self.val > 0.2) and self.parent.thresh
         except ZeroDivisionError:
             handle_error(self, "Ports_Utilized_0 zero division")
@@ -2280,7 +2332,7 @@ class Mixing_Vectors:
     errcount = 0
     sibling = None
     metricgroup = frozenset([])
-    maxval = 1
+    maxval = 1.0
     def compute(self, EV):
         try:
             self.val = 160 * EV("ASSISTS.SSE_AVX_MIX", 5) / CLKS(self, EV, 5)
@@ -2364,7 +2416,7 @@ class Ports_Utilized_3m:
     sample = ['UOPS_EXECUTED.CYCLES_GE_3']
     errcount = 0
     sibling = None
-    metricgroup = frozenset(['PortsUtil'])
+    metricgroup = frozenset(['BvCB', 'PortsUtil'])
     maxval = None
     def compute(self, EV):
         try:
@@ -2527,7 +2579,7 @@ class Retiring:
     sample = ['UOPS_RETIRED.SLOTS']
     errcount = 0
     sibling = None
-    metricgroup = frozenset(['TmaL1'])
+    metricgroup = frozenset(['BvUW', 'TmaL1'])
     maxval = None
     def compute(self, EV):
         try:
@@ -2676,7 +2728,7 @@ class FP_Vector:
     errcount = 0
     sibling = None
     metricgroup = frozenset(['Compute', 'Flops'])
-    maxval = 1
+    maxval = 1.0
     def compute(self, EV):
         try:
             self.val = FP_Arith_Vector(self, EV, 4) / Retired_Slots(self, EV, 4)
@@ -2701,7 +2753,7 @@ class FP_Vector_128b:
     errcount = 0
     sibling = None
     metricgroup = frozenset(['Compute', 'Flops'])
-    maxval = 1
+    maxval = 1.0
     def compute(self, EV):
         try:
             self.val = (EV("FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE", 5) + EV("FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE", 5)) / Retired_Slots(self, EV, 5)
@@ -2726,7 +2778,7 @@ class FP_Vector_256b:
     errcount = 0
     sibling = None
     metricgroup = frozenset(['Compute', 'Flops'])
-    maxval = 1
+    maxval = 1.0
     def compute(self, EV):
         try:
             self.val = (EV("FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE", 5) + EV("FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE", 5)) / Retired_Slots(self, EV, 5)
@@ -2810,9 +2862,9 @@ def compute(self, EV):
             handle_error(self, "Int_Vector_256b zero division")
         return self.val
     desc = """
-This metric represents 256-bit vector Integer ADD/SUB/SAD or
-VNNI (Vector Neural Network Instructions) uops fraction the
-CPU has retired."""
+This metric represents 256-bit vector Integer
+ADD/SUB/SAD/MUL or VNNI (Vector Neural Network Instructions)
+uops fraction the CPU has retired."""
 
 
 class Memory_Operations:
@@ -2848,7 +2900,7 @@ class Fused_Instructions:
     sample = []
     errcount = 0
     sibling = None
-    metricgroup = frozenset(['Branches', 'Pipeline'])
+    metricgroup = frozenset(['Branches', 'BvBO', 'Pipeline'])
     maxval = None
     def compute(self, EV):
         try:
@@ -2876,7 +2928,7 @@ class Non_Fused_Branches:
     sample = []
     errcount = 0
     sibling = None
-    metricgroup = frozenset(['Branches', 'Pipeline'])
+    metricgroup = frozenset(['Branches', 'BvBO', 'Pipeline'])
     maxval = None
     def compute(self, EV):
         try:
@@ -2926,7 +2978,7 @@ class Nop_Instructions:
     sample = ['INST_RETIRED.NOP']
     errcount = 0
     sibling = None
-    metricgroup = frozenset(['Pipeline'])
+    metricgroup = frozenset(['BvBO', 'Pipeline'])
     maxval = None
     def compute(self, EV):
         try:
@@ -3056,8 +3108,8 @@ class Assists:
     sample = ['ASSISTS.ANY']
     errcount = 0
     sibling = None
-    metricgroup = frozenset([])
-    maxval = 1
+    metricgroup = frozenset(['BvIO'])
+    maxval = 1.0
     def compute(self, EV):
         try:
             self.val = Avg_Assist_Cost * EV("ASSISTS.ANY", 4) / SLOTS(self, EV, 4)
@@ -3197,7 +3249,7 @@ class Metric_Mispredictions:
     maxval = 0
     errcount = 0
     area = "Info.Bottleneck"
-    metricgroup = frozenset(['Bad', 'BadSpec', 'BrMispredicts'])
+    metricgroup = frozenset(['Bad', 'BadSpec', 'BrMispredicts', 'BvMP'])
     sibling = None
 
     def compute(self, EV):
@@ -3217,7 +3269,7 @@ class Metric_Big_Code:
     maxval = 0
     errcount = 0
     area = "Info.Bottleneck"
-    metricgroup = frozenset(['BigFootprint', 'Fed', 'Frontend', 'IcMiss', 'MemoryTLB'])
+    metricgroup = frozenset(['BvBC', 'BigFootprint', 'Fed', 'Frontend', 'IcMiss', 'MemoryTLB'])
     sibling = None
 
     def compute(self, EV):
@@ -3238,7 +3290,7 @@ class Metric_Instruction_Fetch_BW:
     maxval = 0
     errcount = 0
     area = "Info.Bottleneck"
-    metricgroup = frozenset(['Fed', 'FetchBW', 'Frontend'])
+    metricgroup = frozenset(['BvFB', 'Fed', 'FetchBW', 'Frontend'])
     sibling = None
 
     def compute(self, EV):
@@ -3249,7 +3301,8 @@ def compute(self, EV):
             handle_error_metric(self, "Instruction_Fetch_BW zero division")
     desc = """
 Total pipeline cost of instruction fetch bandwidth related
-bottlenecks"""
+bottlenecks (when the front-end could not sustain operations
+delivery to the back-end)"""
 
 
 class Metric_Cache_Memory_Bandwidth:
@@ -3258,7 +3311,7 @@ class Metric_Cache_Memory_Bandwidth:
     maxval = 0
     errcount = 0
     area = "Info.Bottleneck"
-    metricgroup = frozenset(['Mem', 'MemoryBW', 'Offcore'])
+    metricgroup = frozenset(['BvMB', 'Mem', 'MemoryBW', 'Offcore'])
     sibling = None
 
     def compute(self, EV):
@@ -3278,7 +3331,7 @@ class Metric_Cache_Memory_Latency:
     maxval = 0
     errcount = 0
     area = "Info.Bottleneck"
-    metricgroup = frozenset(['Mem', 'MemoryLat', 'Offcore'])
+    metricgroup = frozenset(['BvML', 'Mem', 'MemoryLat', 'Offcore'])
     sibling = None
 
     def compute(self, EV):
@@ -3298,7 +3351,7 @@ class Metric_Memory_Data_TLBs:
     maxval = 0
     errcount = 0
     area = "Info.Bottleneck"
-    metricgroup = frozenset(['Mem', 'MemoryTLB', 'Offcore'])
+    metricgroup = frozenset(['BvMT', 'Mem', 'MemoryTLB', 'Offcore'])
     sibling = None
 
     def compute(self, EV):
@@ -3318,7 +3371,7 @@ class Metric_Memory_Synchronization:
     maxval = 0
     errcount = 0
     area = "Info.Bottleneck"
-    metricgroup = frozenset(['Mem', 'Offcore'])
+    metricgroup = frozenset(['BvMS', 'Mem', 'Offcore'])
     sibling = None
 
     def compute(self, EV):
@@ -3339,7 +3392,7 @@ class Metric_Compute_Bound_Est:
     maxval = 0
     errcount = 0
     area = "Info.Bottleneck"
-    metricgroup = frozenset(['Cor'])
+    metricgroup = frozenset(['BvCB', 'Cor'])
     sibling = None
 
     def compute(self, EV):
@@ -3360,7 +3413,7 @@ class Metric_Irregular_Overhead:
     maxval = 0
     errcount = 0
     area = "Info.Bottleneck"
-    metricgroup = frozenset(['Bad', 'Cor', 'Ret'])
+    metricgroup = frozenset(['Bad', 'BvIO', 'Cor', 'Ret'])
     sibling = None
 
     def compute(self, EV):
@@ -3382,7 +3435,7 @@ class Metric_Other_Bottlenecks:
     maxval = 0
     errcount = 0
     area = "Info.Bottleneck"
-    metricgroup = frozenset(['Cor', 'Offcore'])
+    metricgroup = frozenset(['BvOB', 'Cor', 'Offcore'])
     sibling = None
 
     def compute(self, EV):
@@ -3392,10 +3445,9 @@ def compute(self, EV):
         except ZeroDivisionError:
             handle_error_metric(self, "Other_Bottlenecks zero division")
     desc = """
-Total pipeline cost of remaining bottlenecks (apart from
-those listed in the Info.Bottlenecks metrics class).
-Examples include data-dependencies (Core Bound when Low ILP)
-and other unlisted memory-related stalls."""
+Total pipeline cost of remaining bottlenecks in the back-
+end. Examples include data-dependencies (Core Bound when Low
+ILP) and other unlisted memory-related stalls."""
 
 
 class Metric_Branching_Overhead:
@@ -3404,7 +3456,7 @@ class Metric_Branching_Overhead:
     maxval = 0
     errcount = 0
     area = "Info.Bottleneck"
-    metricgroup = frozenset(['Ret'])
+    metricgroup = frozenset(['BvBO', 'Ret'])
     sibling = None
 
     def compute(self, EV):
@@ -3414,31 +3466,54 @@ def compute(self, EV):
         except ZeroDivisionError:
             handle_error_metric(self, "Branching_Overhead zero division")
     desc = """
-Total pipeline cost of branch related instructions (used for
-program control-flow including function calls)"""
+Total pipeline cost of instructions used for program
+control-flow - a subset of the Retiring category in TMA.
+Examples include function calls; loops and alignments. (A
+lower bound). Consider Loop Unrolling or function inlining
+optimizations"""
 
 
-class Metric_Base_Non_Br:
-    name = "Base_Non_Br"
+class Metric_Useful_Work:
+    name = "Useful_Work"
     domain = "Scaled_Slots"
     maxval = 0
     errcount = 0
     area = "Info.Bottleneck"
-    metricgroup = frozenset(['Ret'])
+    metricgroup = frozenset(['BvUW', 'Ret'])
     sibling = None
 
     def compute(self, EV):
         try:
-            self.val = Base_Non_Br(self, EV, 0)
+            self.val = Useful_Work(self, EV, 0)
             self.thresh = (self.val > 20)
         except ZeroDivisionError:
-            handle_error_metric(self, "Base_Non_Br zero division")
+            handle_error_metric(self, "Useful_Work zero division")
     desc = """
-Total pipeline cost of \"useful operations\" - the baseline
-operations not covered by Branching_Overhead nor
+Total pipeline cost of \"useful operations\" - the portion
+of Retiring category not covered by Branching_Overhead nor
 Irregular_Overhead."""
 
 
+class Metric_Core_Bound_Likely:
+    name = "Core_Bound_Likely"
+    domain = "Metric"
+    maxval = 1.0
+    errcount = 0
+    area = "Info.Botlnk.L0"
+    metricgroup = frozenset(['Cor', 'SMT'])
+    sibling = None
+
+    def compute(self, EV):
+        try:
+            self.val = Core_Bound_Likely(self, EV, 0)
+            self.thresh = (self.val > 0.5)
+        except ZeroDivisionError:
+            handle_error_metric(self, "Core_Bound_Likely zero division")
+    desc = """
+Probability of Core Bound bottleneck hidden by SMT-profiling
+artifacts. Tip: consider analysis with SMT disabled"""
+
+
 class Metric_IPC:
     name = "IPC"
     domain = "Metric"
@@ -3461,7 +3536,7 @@ def compute(self, EV):
 class Metric_UopPI:
     name = "UopPI"
     domain = "Metric"
-    maxval = 2
+    maxval = 2.0
     errcount = 0
     area = "Info.Thread"
     metricgroup = frozenset(['Pipeline', 'Ret', 'Retire'])
@@ -3493,7 +3568,7 @@ def compute(self, EV):
         except ZeroDivisionError:
             handle_error_metric(self, "UpTB zero division")
     desc = """
-Instruction per taken branch"""
+Uops per taken branch"""
 
 
 class Metric_CPI:
@@ -3558,7 +3633,7 @@ def compute(self, EV):
 class Metric_Slots_Utilization:
     name = "Slots_Utilization"
     domain = "Metric"
-    maxval = 1
+    maxval = 1.0
     errcount = 0
     area = "Info.Thread"
     metricgroup = frozenset(['SMT', 'TmaL1'])
@@ -3619,7 +3694,7 @@ def compute(self, EV):
 class Metric_FLOPc:
     name = "FLOPc"
     domain = "Core_Metric"
-    maxval = 10
+    maxval = 10.0
     errcount = 0
     area = "Info.Core"
     metricgroup = frozenset(['Ret', 'Flops'])
@@ -3638,7 +3713,7 @@ def compute(self, EV):
 class Metric_FP_Arith_Utilization:
     name = "FP_Arith_Utilization"
     domain = "Core_Metric"
-    maxval = 2
+    maxval = 2.0
     errcount = 0
     area = "Info.Core"
     metricgroup = frozenset(['Cor', 'Flops', 'HPC'])
@@ -3682,7 +3757,7 @@ def compute(self, EV):
 class Metric_EPC:
     name = "EPC"
     domain = "Metric"
-    maxval = 20
+    maxval = 20.0
     errcount = 0
     area = "Info.Core"
     metricgroup = frozenset(['Power'])
@@ -3698,26 +3773,6 @@ def compute(self, EV):
 uops Executed per Cycle"""
 
 
-class Metric_Core_Bound_Likely:
-    name = "Core_Bound_Likely"
-    domain = "Metric"
-    maxval = 1
-    errcount = 0
-    area = "Info.Botlnk.L0"
-    metricgroup = frozenset(['Cor', 'SMT'])
-    sibling = None
-
-    def compute(self, EV):
-        try:
-            self.val = Core_Bound_Likely(self, EV, 0)
-            self.thresh = (self.val > 0.5)
-        except ZeroDivisionError:
-            handle_error_metric(self, "Core_Bound_Likely zero division")
-    desc = """
-Probability of Core Bound bottleneck hidden by SMT-profiling
-artifacts. Tip: consider analysis with SMT disabled"""
-
-
 class Metric_CORE_CLKS:
     name = "CORE_CLKS"
     domain = "Count"
@@ -3834,7 +3889,7 @@ def compute(self, EV):
         except ZeroDivisionError:
             handle_error_metric(self, "IpTB zero division")
     desc = """
-Instruction per taken branch"""
+Instructions per taken branch"""
 
 
 class Metric_BpTkBranch:
@@ -4124,14 +4179,70 @@ def compute(self, EV):
         except ZeroDivisionError:
             handle_error_metric(self, "Execute zero division")
     desc = """
-Instruction-Level-Parallelism (average number of uops
-executed when there is execution) per physical core"""
+"""
+
+
+class Metric_Fetch_LSD:
+    name = "Fetch_LSD"
+    domain = "Metric"
+    maxval = 0
+    errcount = 0
+    area = "Info.Pipeline"
+    metricgroup = frozenset(['Fed', 'FetchBW'])
+    sibling = None
+
+    def compute(self, EV):
+        try:
+            self.val = Fetch_LSD(self, EV, 0)
+            self.thresh = True
+        except ZeroDivisionError:
+            handle_error_metric(self, "Fetch_LSD zero division")
+    desc = """
+Average number of uops fetched from LSD per cycle"""
+
+
+class Metric_Fetch_DSB:
+    name = "Fetch_DSB"
+    domain = "Metric"
+    maxval = 0
+    errcount = 0
+    area = "Info.Pipeline"
+    metricgroup = frozenset(['Fed', 'FetchBW'])
+    sibling = None
+
+    def compute(self, EV):
+        try:
+            self.val = Fetch_DSB(self, EV, 0)
+            self.thresh = True
+        except ZeroDivisionError:
+            handle_error_metric(self, "Fetch_DSB zero division")
+    desc = """
+Average number of uops fetched from DSB per cycle"""
+
+
+class Metric_Fetch_MITE:
+    name = "Fetch_MITE"
+    domain = "Metric"
+    maxval = 0
+    errcount = 0
+    area = "Info.Pipeline"
+    metricgroup = frozenset(['Fed', 'FetchBW'])
+    sibling = None
+
+    def compute(self, EV):
+        try:
+            self.val = Fetch_MITE(self, EV, 0)
+            self.thresh = True
+        except ZeroDivisionError:
+            handle_error_metric(self, "Fetch_MITE zero division")
+    desc = """
+Average number of uops fetched from MITE per cycle"""
 
 
 class Metric_Fetch_UpC:
     name = "Fetch_UpC"
     domain = "Metric"
-    maxval = 6
+    maxval = 6.0
     errcount = 0
     area = "Info.Frontend"
     metricgroup = frozenset(['Fed', 'FetchBW'])
@@ -4151,7 +4262,7 @@ def compute(self, EV):
 class Metric_LSD_Coverage:
     name = "LSD_Coverage"
     domain = "Metric"
-    maxval = 1
+    maxval = 1.0
     errcount = 0
     area = "Info.Frontend"
     metricgroup = frozenset(['Fed', 'LSD'])
@@ -4171,7 +4282,7 @@ def compute(self, EV):
 class Metric_DSB_Coverage:
     name = "DSB_Coverage"
     domain = "Metric"
-    maxval = 1
+    maxval = 1.0
     errcount = 0
     area = "Info.Frontend"
     metricgroup = frozenset(['DSB', 'Fed', 'FetchBW'])
@@ -4251,6 +4362,26 @@ def compute(self, EV):
 the Instruction_Fetch_BW Bottleneck."""
 
 
+class Metric_DSB_Bandwidth:
+    name = "DSB_Bandwidth"
+    domain = "Scaled_Slots"
+    maxval = 0
+    errcount = 0
+    area = "Info.Botlnk.L2"
+    metricgroup = frozenset(['DSB', 'FetchBW'])
+    sibling = None
+
+    def compute(self, EV):
+        try:
+            self.val = DSB_Bandwidth(self, EV, 0)
+            self.thresh = (self.val > 10)
+        except ZeroDivisionError:
+            handle_error_metric(self, "DSB_Bandwidth zero division")
+    desc = """
+Total pipeline cost of DSB (uop cache) hits - subset of the
+Instruction_Fetch_BW Bottleneck."""
+
+
 class Metric_ICache_Miss_Latency:
     name = "ICache_Miss_Latency"
     domain = "Metric"
@@ -4406,7 +4537,7 @@ def compute(self, EV):
         except ZeroDivisionError:
             handle_error_metric(self, "IpMisp_Cond_Ntaken zero division")
     desc = """
-Instructions per retired mispredicts for conditional non-
+Instructions per retired Mispredicts for conditional non-
 taken branches (lower number means higher occurrence rate)."""
 
 
@@ -4426,7 +4557,7 @@ def compute(self, EV):
         except ZeroDivisionError:
             handle_error_metric(self, "IpMisp_Cond_Taken zero division")
     desc = """
-Instructions per retired mispredicts for conditional taken
+Instructions per retired Mispredicts for conditional taken
 branches (lower number means higher occurrence rate)."""
 
 
@@ -4446,7 +4577,7 @@ def compute(self, EV):
         except ZeroDivisionError:
             handle_error_metric(self, "IpMisp_Ret zero division")
     desc = """
-Instructions per retired mispredicts for return branches
+Instructions per retired Mispredicts for return branches
 (lower number means higher occurrence rate)."""
 
 
@@ -4466,7 +4597,7 @@ def compute(self, EV):
         except ZeroDivisionError:
             handle_error_metric(self, "IpMisp_Indirect zero division")
     desc = """
-Instructions per retired mispredicts for indirect CALL or
+Instructions per retired Mispredicts for indirect CALL or
 JMP branches (lower number means higher occurrence rate)."""
 
 
@@ -4507,13 +4638,13 @@ def compute(self, EV):
             handle_error_metric(self, "Spec_Clears_Ratio zero division")
     desc = """
 Speculative to Retired ratio of all clears (covering
-mispredicts and nukes)"""
+Mispredicts and nukes)"""
 
 
 class Metric_Cond_NT:
     name = "Cond_NT"
     domain = "Fraction"
-    maxval = 1
+    maxval = 1.0
     errcount = 0
     area = "Info.Branches"
     metricgroup = frozenset(['Bad', 'Branches', 'CodeGen', 'PGO'])
@@ -4532,7 +4663,7 @@ def compute(self, EV):
 class Metric_Cond_TK:
     name = "Cond_TK"
     domain = "Fraction"
-    maxval = 1
+    maxval = 1.0
     errcount = 0
     area = "Info.Branches"
     metricgroup = frozenset(['Bad', 'Branches', 'CodeGen', 'PGO'])
@@ -4551,7 +4682,7 @@ def compute(self, EV):
 class Metric_CallRet:
     name = "CallRet"
     domain = "Fraction"
-    maxval = 1
+    maxval = 1.0
     errcount = 0
     area = "Info.Branches"
     metricgroup = frozenset(['Bad', 'Branches'])
@@ -4570,7 +4701,7 @@ def compute(self, EV):
 class Metric_Jump:
     name = "Jump"
     domain = "Fraction"
-    maxval = 1
+    maxval = 1.0
     errcount = 0
     area = "Info.Branches"
     metricgroup = frozenset(['Bad', 'Branches'])
@@ -4590,7 +4721,7 @@ def compute(self, EV):
 class Metric_Other_Branches:
     name = "Other_Branches"
     domain = "Fraction"
-    maxval = 1
+    maxval = 1.0
     errcount = 0
     area = "Info.Branches"
     metricgroup = frozenset(['Bad', 'Branches'])
@@ -4630,7 +4761,7 @@ def compute(self, EV):
 class Metric_MLP:
     name = "MLP"
     domain = "Metric"
-    maxval = 10
+    maxval = 10.0
     errcount = 0
     area = "Info.Memory"
     metricgroup = frozenset(['Mem', 'MemoryBound', 'MemoryBW'])
@@ -4748,6 +4879,26 @@ def compute(self, EV):
 loads  (including speculative)"""
 
 
+class Metric_L2MPKI_RFO:
+    name = "L2MPKI_RFO"
+    domain = "Metric"
+    maxval = 0
+    errcount = 0
+    area = "Info.Memory"
+    metricgroup = frozenset(['CacheMisses', 'Offcore'])
+    sibling = None
+
+    def compute(self, EV):
+        try:
+            self.val = L2MPKI_RFO(self, EV, 0)
+            self.thresh = True
+        except ZeroDivisionError:
+            handle_error_metric(self, "L2MPKI_RFO zero division")
+    desc = """
+Offcore requests (L2 cache miss) per kilo instruction for
+demand RFOs"""
+
+
 class Metric_L2HPKI_All:
     name = "L2HPKI_All"
     domain = "Metric"
@@ -4845,8 +4996,7 @@ def compute(self, EV):
         except ZeroDivisionError:
             handle_error_metric(self, "L1D_Cache_Fill_BW zero division")
     desc = """
-Average per-thread data fill bandwidth to the L1 data cache
-[GB / sec]"""
+"""
 
 
 class Metric_L2_Cache_Fill_BW:
@@ -4865,8 +5015,7 @@ def compute(self, EV):
         except ZeroDivisionError:
             handle_error_metric(self, "L2_Cache_Fill_BW zero division")
     desc = """
-Average per-thread data fill bandwidth to the L2 cache [GB /
-sec]"""
+"""
 
 
 class Metric_L3_Cache_Fill_BW:
@@ -4885,8 +5034,7 @@ def compute(self, EV):
         except ZeroDivisionError:
             handle_error_metric(self, "L3_Cache_Fill_BW zero division")
     desc = """
-Average per-thread data fill bandwidth to the L3 cache [GB /
-sec]"""
+"""
 
 
 class Metric_L3_Cache_Access_BW:
@@ -4905,14 +5053,13 @@ def compute(self, EV):
         except ZeroDivisionError:
             handle_error_metric(self, "L3_Cache_Access_BW zero division")
     desc = """
-Average per-thread data access bandwidth to the L3 cache [GB
-/ sec]"""
+"""
 
 
 class Metric_Page_Walks_Utilization:
     name = "Page_Walks_Utilization"
     domain = "Core_Metric"
-    maxval = 1
+    maxval = 1.0
     errcount = 0
     area = "Info.Memory.TLB"
     metricgroup = frozenset(['Mem', 'MemoryTLB'])
@@ -5189,7 +5336,7 @@ def compute(self, EV):
 class Metric_CPU_Utilization:
     name = "CPU_Utilization"
     domain = "Metric"
-    maxval = 200
+    maxval = 1
     errcount = 0
     area = "Info.System"
     metricgroup = frozenset(['HPC', 'Summary'])
@@ -5208,7 +5355,7 @@ def compute(self, EV):
 class Metric_CPUs_Utilized:
     name = "CPUs_Utilized"
     domain = "Metric"
-    maxval = 0
+    maxval = 300
     errcount = 0
     area = "Info.System"
     metricgroup = frozenset(['Summary'])
@@ -5262,13 +5409,13 @@ def compute(self, EV):
     desc = """
 Giga Floating Point Operations Per Second. Aggregate across
 all supported options of: FP precisions, scalar and vector
-instructions, vector-width  ."""
+instructions, vector-width"""
 
 
 class Metric_Turbo_Utilization:
     name = "Turbo_Utilization"
     domain = "Core_Metric"
-    maxval = 10
+    maxval = 10.0
     errcount = 0
     area = "Info.System"
     metricgroup = frozenset(['Power'])
@@ -5287,7 +5434,7 @@ def compute(self, EV):
 class Metric_SMT_2T_Utilization:
     name = "SMT_2T_Utilization"
     domain = "Core_Metric"
-    maxval = 1
+    maxval = 1.0
     errcount = 0
     area = "Info.System"
     metricgroup = frozenset(['SMT'])
@@ -5307,7 +5454,7 @@ def compute(self, EV):
 class Metric_Kernel_Utilization:
     name = "Kernel_Utilization"
     domain = "Metric"
-    maxval = 1
+    maxval = 1.0
     errcount = 0
     area = "Info.System"
     metricgroup = frozenset(['OS'])
@@ -5347,7 +5494,7 @@ def compute(self, EV):
 class Metric_C0_Wait:
     name = "C0_Wait"
     domain = "Metric"
-    maxval = 1
+    maxval = 1.0
     errcount = 0
     area = "Info.System"
     metricgroup = frozenset(['C0Wait'])
@@ -5521,6 +5668,7 @@ def __init__(self, r):
         n = Load_STLB_Hit() ; r.run(n) ; o["Load_STLB_Hit"] = n
         n = Load_STLB_Miss() ; r.run(n) ; o["Load_STLB_Miss"] = n
         n = Store_Fwd_Blk() ; r.run(n) ; o["Store_Fwd_Blk"] = n
+        n = L1_Hit_Latency() ; r.run(n) ; o["L1_Hit_Latency"] = n
         n = Lock_Latency() ; r.run(n) ; o["Lock_Latency"] = n
         n = Split_Loads() ; r.run(n) ; o["Split_Loads"] = n
         n = FB_Full() ; r.run(n) ; o["FB_Full"] = n
@@ -5613,6 +5761,7 @@ def __init__(self, r):
         o["Load_STLB_Hit"].parent = o["DTLB_Load"]
         o["Load_STLB_Miss"].parent = o["DTLB_Load"]
         o["Store_Fwd_Blk"].parent = o["L1_Bound"]
+        o["L1_Hit_Latency"].parent = o["L1_Bound"]
         o["Lock_Latency"].parent = o["L1_Bound"]
         o["Split_Loads"].parent = o["L1_Bound"]
         o["FB_Full"].parent = o["L1_Bound"]
@@ -5690,7 +5839,8 @@ def __init__(self, r):
         n = Metric_Irregular_Overhead() ; r.metric(n) ; o["Irregular_Overhead"] = n
         n = Metric_Other_Bottlenecks() ; r.metric(n) ; o["Other_Bottlenecks"] = n
         n = Metric_Branching_Overhead() ; r.metric(n) ; o["Branching_Overhead"] = n
-        n = Metric_Base_Non_Br() ; r.metric(n) ; o["Base_Non_Br"] = n
+        n = Metric_Useful_Work() ; r.metric(n) ; o["Useful_Work"] = n
+        n = Metric_Core_Bound_Likely() ; r.metric(n) ; o["Core_Bound_Likely"] = n
         n = Metric_IPC() ; r.metric(n) ; o["IPC"] = n
         n = Metric_UopPI() ; r.metric(n) ; o["UopPI"] = n
         n = Metric_UpTB() ; r.metric(n) ; o["UpTB"] = n
@@ -5704,7 +5854,6 @@ def __init__(self, r):
         n = Metric_FP_Arith_Utilization() ; r.metric(n) ; o["FP_Arith_Utilization"] = n
         n = Metric_ILP() ; r.metric(n) ; o["ILP"] = n
         n = Metric_EPC() ; r.metric(n) ; o["EPC"] = n
-        n = Metric_Core_Bound_Likely() ; r.metric(n) ; o["Core_Bound_Likely"] = n
         n = Metric_CORE_CLKS() ; r.metric(n) ; o["CORE_CLKS"] = n
         n = Metric_IpLoad() ; r.metric(n) ; o["IpLoad"] = n
         n = Metric_IpStore() ; r.metric(n) ; o["IpStore"] = n
@@ -5725,12 +5874,16 @@ def __init__(self, r):
         n = Metric_Strings_Cycles() ; r.metric(n) ; o["Strings_Cycles"] = n
         n = Metric_IpAssist() ; r.metric(n) ; o["IpAssist"] = n
         n = Metric_Execute() ; r.metric(n) ; o["Execute"] = n
+        n = Metric_Fetch_LSD() ; r.metric(n) ; o["Fetch_LSD"] = n
+        n = Metric_Fetch_DSB() ; r.metric(n) ; o["Fetch_DSB"] = n
+        n = Metric_Fetch_MITE() ; r.metric(n) ; o["Fetch_MITE"] = n
         n = Metric_Fetch_UpC() ; r.metric(n) ; o["Fetch_UpC"] = n
         n = Metric_LSD_Coverage() ; r.metric(n) ; o["LSD_Coverage"] = n
         n = Metric_DSB_Coverage() ; r.metric(n) ; o["DSB_Coverage"] = n
         n = Metric_Unknown_Branch_Cost() ; r.metric(n) ; o["Unknown_Branch_Cost"] = n
         n = Metric_DSB_Switch_Cost() ; r.metric(n) ; o["DSB_Switch_Cost"] = n
         n = Metric_DSB_Misses() ; r.metric(n) ; o["DSB_Misses"] = n
+        n = Metric_DSB_Bandwidth() ; r.metric(n) ; o["DSB_Bandwidth"] = n
         n = Metric_ICache_Miss_Latency() ; r.metric(n) ; o["ICache_Miss_Latency"] = n
         n = Metric_IC_Misses() ; r.metric(n) ; o["IC_Misses"] = n
         n = Metric_IpDSB_Miss_Ret() ; r.metric(n) ; o["IpDSB_Miss_Ret"] = n
@@ -5756,6 +5909,7 @@ def __init__(self, r):
         n = Metric_L2MPKI() ; r.metric(n) ; o["L2MPKI"] = n
         n = Metric_L2MPKI_All() ; r.metric(n) ; o["L2MPKI_All"] = n
         n = Metric_L2MPKI_Load() ; r.metric(n) ; o["L2MPKI_Load"] = n
+        n = Metric_L2MPKI_RFO() ; r.metric(n) ; o["L2MPKI_RFO"] = n
         n = Metric_L2HPKI_All() ; r.metric(n) ; o["L2HPKI_All"] = n
         n = Metric_L2HPKI_Load() ; r.metric(n) ; o["L2HPKI_Load"] = n
         n = Metric_L3MPKI() ; r.metric(n) ; o["L3MPKI"] = n
@@ -5930,46 +6084,54 @@ def __init__(self, r):
         o["Cache_Memory_Bandwidth"].Store_Fwd_Blk = o["Store_Fwd_Blk"]
         o["Cache_Memory_Bandwidth"].SQ_Full = o["SQ_Full"]
         o["Cache_Memory_Bandwidth"].MEM_Bandwidth = o["MEM_Bandwidth"]
+        o["Cache_Memory_Bandwidth"].L1_Hit_Latency = o["L1_Hit_Latency"]
         o["Cache_Memory_Bandwidth"].Load_STLB_Miss = o["Load_STLB_Miss"]
-        o["Cache_Memory_Bandwidth"].DTLB_Load = o["DTLB_Load"]
         o["Cache_Memory_Bandwidth"].Data_Sharing = o["Data_Sharing"]
         o["Cache_Memory_Bandwidth"].L2_Bound = o["L2_Bound"]
         o["Cache_Memory_Bandwidth"].Memory_Bound = o["Memory_Bound"]
         o["Cache_Memory_Bandwidth"].Lock_Latency = o["Lock_Latency"]
         o["Cache_Memory_Bandwidth"].MEM_Latency = o["MEM_Latency"]
         o["Cache_Memory_Bandwidth"].Store_Bound = o["Store_Bound"]
-        o["Cache_Memory_Bandwidth"].L3_Hit_Latency = o["L3_Hit_Latency"]
         o["Cache_Memory_Bandwidth"].Split_Loads = o["Split_Loads"]
+        o["Cache_Memory_Bandwidth"].L3_Hit_Latency = o["L3_Hit_Latency"]
+        o["Cache_Memory_Bandwidth"].DTLB_Load = o["DTLB_Load"]
         o["Cache_Memory_Bandwidth"].L3_Bound = o["L3_Bound"]
         o["Cache_Memory_Bandwidth"].FB_Full = o["FB_Full"]
         o["Cache_Memory_Bandwidth"].Contested_Accesses = o["Contested_Accesses"]
         o["Cache_Memory_Bandwidth"].DRAM_Bound = o["DRAM_Bound"]
         o["Cache_Memory_Latency"].L1_Bound = o["L1_Bound"]
-        o["Cache_Memory_Latency"].SQ_Full = o["SQ_Full"]
-        o["Cache_Memory_Latency"].MEM_Bandwidth = o["MEM_Bandwidth"]
-        o["Cache_Memory_Latency"].Store_Latency = o["Store_Latency"]
-        o["Cache_Memory_Latency"].Split_Stores = o["Split_Stores"]
+        o["Cache_Memory_Latency"].DTLB_Load = o["DTLB_Load"]
         o["Cache_Memory_Latency"].False_Sharing = o["False_Sharing"]
         o["Cache_Memory_Latency"].Data_Sharing = o["Data_Sharing"]
         o["Cache_Memory_Latency"].L2_Bound = o["L2_Bound"]
         o["Cache_Memory_Latency"].Memory_Bound = o["Memory_Bound"]
         o["Cache_Memory_Latency"].DTLB_Store = o["DTLB_Store"]
-        o["Cache_Memory_Latency"].MEM_Latency = o["MEM_Latency"]
+        o["Cache_Memory_Latency"].SQ_Full = o["SQ_Full"]
         o["Cache_Memory_Latency"].Store_Bound = o["Store_Bound"]
-        o["Cache_Memory_Latency"].Store_STLB_Miss = o["Store_STLB_Miss"]
-        o["Cache_Memory_Latency"].L3_Hit_Latency = o["L3_Hit_Latency"]
+        o["Cache_Memory_Latency"].Split_Loads = o["Split_Loads"]
         o["Cache_Memory_Latency"].L3_Bound = o["L3_Bound"]
+        o["Cache_Memory_Latency"].FB_Full = o["FB_Full"]
         o["Cache_Memory_Latency"].Streaming_Stores = o["Streaming_Stores"]
         o["Cache_Memory_Latency"].Contested_Accesses = o["Contested_Accesses"]
+        o["Cache_Memory_Latency"].Store_Fwd_Blk = o["Store_Fwd_Blk"]
+        o["Cache_Memory_Latency"].L1_Hit_Latency = o["L1_Hit_Latency"]
+        o["Cache_Memory_Latency"].MEM_Bandwidth = o["MEM_Bandwidth"]
+        o["Cache_Memory_Latency"].Store_Latency = o["Store_Latency"]
+        o["Cache_Memory_Latency"].Split_Stores = o["Split_Stores"]
+        o["Cache_Memory_Latency"].Lock_Latency = o["Lock_Latency"]
+        o["Cache_Memory_Latency"].MEM_Latency = o["MEM_Latency"]
+        o["Cache_Memory_Latency"].Store_STLB_Miss = o["Store_STLB_Miss"]
+        o["Cache_Memory_Latency"].Load_STLB_Miss = o["Load_STLB_Miss"]
+        o["Cache_Memory_Latency"].L3_Hit_Latency = o["L3_Hit_Latency"]
         o["Cache_Memory_Latency"].DRAM_Bound = o["DRAM_Bound"]
         o["Memory_Data_TLBs"].L1_Bound = o["L1_Bound"]
         o["Memory_Data_TLBs"].Store_Fwd_Blk = o["Store_Fwd_Blk"]
-        o["Memory_Data_TLBs"].DTLB_Store = o["DTLB_Store"]
+        o["Memory_Data_TLBs"].L1_Hit_Latency = o["L1_Hit_Latency"]
         o["Memory_Data_TLBs"].DTLB_Load = o["DTLB_Load"]
         o["Memory_Data_TLBs"].Store_Latency = o["Store_Latency"]
         o["Memory_Data_TLBs"].Load_STLB_Miss = o["Load_STLB_Miss"]
         o["Memory_Data_TLBs"].False_Sharing = o["False_Sharing"]
-        o["Memory_Data_TLBs"].Streaming_Stores = o["Streaming_Stores"]
+        o["Memory_Data_TLBs"].DTLB_Store = o["DTLB_Store"]
         o["Memory_Data_TLBs"].L2_Bound = o["L2_Bound"]
         o["Memory_Data_TLBs"].Memory_Bound = o["Memory_Bound"]
         o["Memory_Data_TLBs"].Lock_Latency = o["Lock_Latency"]
@@ -5979,6 +6141,7 @@ def __init__(self, r):
         o["Memory_Data_TLBs"].Split_Loads = o["Split_Loads"]
         o["Memory_Data_TLBs"].L3_Bound = o["L3_Bound"]
         o["Memory_Data_TLBs"].FB_Full = o["FB_Full"]
+        o["Memory_Data_TLBs"].Streaming_Stores = o["Streaming_Stores"]
         o["Memory_Data_TLBs"].DRAM_Bound = o["DRAM_Bound"]
         o["Memory_Synchronization"].L1_Bound = o["L1_Bound"]
         o["Memory_Synchronization"].Frontend_Bound = o["Frontend_Bound"]
@@ -6055,7 +6218,7 @@ def __init__(self, r):
         o["Other_Bottlenecks"].Divider = o["Divider"]
         o["Other_Bottlenecks"].L3_Bound = o["L3_Bound"]
         o["Other_Bottlenecks"].Ports_Utilized_3m = o["Ports_Utilized_3m"]
-        o["Other_Bottlenecks"].Machine_Clears = o["Machine_Clears"]
+        o["Other_Bottlenecks"].L1_Hit_Latency = o["L1_Hit_Latency"]
         o["Other_Bottlenecks"].FB_Full = o["FB_Full"]
         o["Other_Bottlenecks"].MEM_Bandwidth = o["MEM_Bandwidth"]
         o["Other_Bottlenecks"].Store_Latency = o["Store_Latency"]
@@ -6077,6 +6240,7 @@ def __init__(self, r):
         o["Other_Bottlenecks"].ICache_Misses = o["ICache_Misses"]
         o["Other_Bottlenecks"].Microcode_Sequencer = o["Microcode_Sequencer"]
         o["Other_Bottlenecks"].Frontend_Bound = o["Frontend_Bound"]
+        o["Other_Bottlenecks"].Machine_Clears = o["Machine_Clears"]
         o["Other_Bottlenecks"].Streaming_Stores = o["Streaming_Stores"]
         o["Other_Bottlenecks"].Memory_Bound = o["Memory_Bound"]
         o["Other_Bottlenecks"].SQ_Full = o["SQ_Full"]
@@ -6099,19 +6263,19 @@ def __init__(self, r):
         o["Other_Bottlenecks"].Other_Nukes = o["Other_Nukes"]
         o["Other_Bottlenecks"].Unknown_Branches = o["Unknown_Branches"]
         o["Other_Bottlenecks"].DRAM_Bound = o["DRAM_Bound"]
-        o["Base_Non_Br"].Retiring = o["Retiring"]
-        o["Base_Non_Br"].Heavy_Operations = o["Heavy_Operations"]
-        o["Base_Non_Br"].Microcode_Sequencer = o["Microcode_Sequencer"]
-        o["Base_Non_Br"].Few_Uops_Instructions = o["Few_Uops_Instructions"]
-        o["Base_Non_Br"].Assists = o["Assists"]
-        o["UopPI"].Retiring = o["Retiring"]
-        o["UpTB"].Retiring = o["Retiring"]
+        o["Useful_Work"].Retiring = o["Retiring"]
+        o["Useful_Work"].Heavy_Operations = o["Heavy_Operations"]
+        o["Useful_Work"].Microcode_Sequencer = o["Microcode_Sequencer"]
+        o["Useful_Work"].Few_Uops_Instructions = o["Few_Uops_Instructions"]
+        o["Useful_Work"].Assists = o["Assists"]
         o["Core_Bound_Likely"].Memory_Bound = o["Memory_Bound"]
         o["Core_Bound_Likely"].Ports_Utilized_0 = o["Ports_Utilized_0"]
         o["Core_Bound_Likely"].Core_Bound = o["Core_Bound"]
         o["Core_Bound_Likely"].Ports_Utilization = o["Ports_Utilization"]
         o["Core_Bound_Likely"].Retiring = o["Retiring"]
         o["Core_Bound_Likely"].Backend_Bound = o["Backend_Bound"]
+        o["UopPI"].Retiring = o["Retiring"]
+        o["UpTB"].Retiring = o["Retiring"]
         o["Retire"].Retiring = o["Retiring"]
         o["DSB_Misses"].LSD = o["LSD"]
         o["DSB_Misses"].MITE = o["MITE"]
@@ -6126,6 +6290,12 @@ def __init__(self, r):
         o["DSB_Misses"].DSB = o["DSB"]
         o["DSB_Misses"].Unknown_Branches = o["Unknown_Branches"]
         o["DSB_Misses"].Fetch_Latency = o["Fetch_Latency"]
+        o["DSB_Bandwidth"].LSD = o["LSD"]
+        o["DSB_Bandwidth"].Fetch_Bandwidth = o["Fetch_Bandwidth"]
+        o["DSB_Bandwidth"].Frontend_Bound = o["Frontend_Bound"]
+        o["DSB_Bandwidth"].MITE = o["MITE"]
+        o["DSB_Bandwidth"].DSB = o["DSB"]
+        o["DSB_Bandwidth"].Fetch_Latency = o["Fetch_Latency"]
         o["IC_Misses"].Fetch_Latency = o["Fetch_Latency"]
         o["IC_Misses"].LCP = o["LCP"]
         o["IC_Misses"].MS_Switches = o["MS_Switches"]
@@ -6202,5 +6372,6 @@ def __init__(self, r):
         o["IpTB"].sibling = (o["LCP"], o["DSB_Switches"], o["Fetch_Bandwidth"],)
         o["DSB_Coverage"].sibling = (o["LCP"], o["DSB_Switches"], o["Fetch_Bandwidth"],)
         o["DSB_Misses"].sibling = (o["LCP"], o["DSB_Switches"], o["Fetch_Bandwidth"],)
+        o["DSB_Bandwidth"].sibling = (o["LCP"], o["DSB_Switches"], o["Fetch_Bandwidth"],)
         o["Branch_Misprediction_Cost"].sibling = (o["Mispredicts_Resteers"], o["Branch_Mispredicts"],)
         o["DRAM_BW_Use"].sibling = (o["FB_Full"], o["SQ_Full"], o["MEM_Bandwidth"],)