From afb06baf48cade0bf4b40058a3f9d557f0fab2c4 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 25 Apr 2024 06:16:35 -0700 Subject: [PATCH] toplev: Use TMA 4.8 model for Meteorlake --- mtl_rwc_ratios.py | 573 ++++++++++++++++++++++++++++++---------------- 1 file changed, 372 insertions(+), 201 deletions(-) diff --git a/mtl_rwc_ratios.py b/mtl_rwc_ratios.py index 59ee604a..94fa8e07 100644 --- a/mtl_rwc_ratios.py +++ b/mtl_rwc_ratios.py @@ -1,6 +1,6 @@ # -*- coding: latin-1 -*- # -# auto generated TopDown/TMA 4.7-full-perf description for Intel 14th gen Core (code name Meteor Lake) with Redwood Cove +# auto generated TopDown/TMA 4.8-full-perf description for Intel 14th gen Core (code name Meteor Lake) with Redwood Cove # Please see http://ark.intel.com for more details on these CPUs. # # References: @@ -16,10 +16,13 @@ print_error = lambda msg: False smt_enabled = False ebs_mode = False -version = "4.7-full-perf" +version = "4.8-full-perf" base_frequency = -1.0 Memory = 0 Average_Frequency = 0.0 +num_cores = 1 +num_threads = 1 +num_sockets = 1 topdown_use_fixed = False def handle_error(obj, msg): @@ -46,6 +49,8 @@ def handle_error_metric(obj, msg): OneMillion = 1000000 OneBillion = 1000000000 Energy_Unit = 61 +PERF_METRICS_MSR = 1 +DS = 0 # Aux. formulas @@ -54,7 +59,7 @@ def Br_DoI_Jumps(self, EV, level): return EV("BR_INST_RETIRED.NEAR_TAKEN", level) - EV("BR_INST_RETIRED.COND_TAKEN", level) - 2 * EV("BR_INST_RETIRED.NEAR_CALL", level) def Branching_Retired(self, EV, level): - return (EV("BR_INST_RETIRED.ALL_BRANCHES", level) + EV("BR_INST_RETIRED.NEAR_CALL", level)) / SLOTS(self, EV, level) + return (EV("BR_INST_RETIRED.ALL_BRANCHES", level) + 2 * EV("BR_INST_RETIRED.NEAR_CALL", level) + EV("INST_RETIRED.NOP", level)) / SLOTS(self, EV, level) def Serialize_Core(self, EV, level): return self.Core_Bound.compute(EV) * (self.Serializing_Operation.compute(EV) + EV("RS.EMPTY:u1", level) / CLKS(self, EV, level) * self.Ports_Utilized_0.compute(EV)) / (self.Serializing_Operation.compute(EV) + self.Ports_Utilization.compute(EV) + self.Divider.compute(EV)) @@ -99,11 +104,11 @@ def FLOP_Count(self, EV, level): # Floating Point computational (arithmetic) Operations Count def FP_Arith_Scalar(self, EV, level): - return EV("FP_ARITH_INST_RETIRED.SCALAR_SINGLE:u0x03", level) + return EV("FP_ARITH_INST_RETIRED.SCALAR", level) # Floating Point computational (arithmetic) Operations Count def FP_Arith_Vector(self, EV, level): - return EV("FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE:u0x3c", level) + return EV("FP_ARITH_INST_RETIRED.VECTOR", level) def HighIPC(self, EV, level): val = IPC(self, EV, level) / Pipeline_Width @@ -160,7 +165,11 @@ def Retired_Slots(self, EV, level): # Number of logical processors (enabled or online) on the target system def Num_CPUs(self, EV, level): - return 24 if smt_enabled else 16 + return num_cores * num_threads if num_cores else(8 + 16 /(2 - smt_enabled)) + +# A system parameter for dependent-loads (pointer chasing like access pattern) of the workload. An integer fraction in range from 0 (no dependent loads) to 100 (all loads are dependent loads) +def Dependent_Loads_Weight(self, EV, level): + return 20 # Total pipeline cost of Branch Misprediction related bottlenecks def Mispredictions(self, EV, level): @@ -174,7 +183,7 @@ def Big_Code(self, EV, level): self.thresh = (val > 20) return val -# Total pipeline cost of instruction fetch bandwidth related bottlenecks +# Total pipeline cost of instruction fetch bandwidth related bottlenecks (when the front-end could not sustain operations delivery to the back-end) def Instruction_Fetch_BW(self, EV, level): val = 100 *(self.Frontend_Bound.compute(EV) - (1 - Umisp(self, EV, level)) * self.Fetch_Latency.compute(EV) * self.Mispredicts_Resteers.compute(EV) / (self.LCP.compute(EV) + self.ICache_Misses.compute(EV) + self.DSB_Switches.compute(EV) + self.Branch_Resteers.compute(EV) + self.MS_Switches.compute(EV) + self.ITLB_Misses.compute(EV)) - Assist_Frontend(self, EV, level)) - Big_Code(self, EV, level) self.thresh = (val > 20) @@ -182,19 +191,19 @@ def Instruction_Fetch_BW(self, EV, level): # Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks def Cache_Memory_Bandwidth(self, EV, level): - val = 100 *((self.Memory_Bound.compute(EV) * (self.DRAM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.MEM_Bandwidth.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.SQ_Full.compute(EV) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.FB_Full.compute(EV) / (self.Store_Fwd_Blk.compute(EV) + self.Split_Loads.compute(EV) + self.Lock_Latency.compute(EV) + self.FB_Full.compute(EV) + self.DTLB_Load.compute(EV))))) + val = 100 *((self.Memory_Bound.compute(EV) * (self.DRAM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.MEM_Bandwidth.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.SQ_Full.compute(EV) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.FB_Full.compute(EV) / (self.Store_Fwd_Blk.compute(EV) + self.L1_Hit_Latency.compute(EV) + self.DTLB_Load.compute(EV) + self.Lock_Latency.compute(EV) + self.Split_Loads.compute(EV) + self.FB_Full.compute(EV))))) self.thresh = (val > 20) return val # Total pipeline cost of external Memory- or Cache-Latency related bottlenecks def Cache_Memory_Latency(self, EV, level): - val = 100 *((self.Memory_Bound.compute(EV) * (self.DRAM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.MEM_Latency.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.L3_Hit_Latency.compute(EV) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * self.L2_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) + (self.Memory_Bound.compute(EV) * (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.Store_Latency.compute(EV) / (self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Streaming_Stores.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV))))) + val = 100 *((self.Memory_Bound.compute(EV) * (self.DRAM_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.MEM_Latency.compute(EV) / (self.MEM_Latency.compute(EV) + self.MEM_Bandwidth.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L3_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.L3_Hit_Latency.compute(EV) / (self.L3_Hit_Latency.compute(EV) + self.Contested_Accesses.compute(EV) + self.SQ_Full.compute(EV) + self.Data_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * self.L2_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) + (self.Memory_Bound.compute(EV) * (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.Store_Latency.compute(EV) / (self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Streaming_Stores.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV)))) + (self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.L1_Hit_Latency.compute(EV) / (self.Store_Fwd_Blk.compute(EV) + self.L1_Hit_Latency.compute(EV) + self.DTLB_Load.compute(EV) + self.Lock_Latency.compute(EV) + self.Split_Loads.compute(EV) + self.FB_Full.compute(EV))))) self.thresh = (val > 20) return val # Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs) def Memory_Data_TLBs(self, EV, level): - val = 100 *(self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.DTLB_Load.compute(EV) / (self.Store_Fwd_Blk.compute(EV) + self.Split_Loads.compute(EV) + self.Lock_Latency.compute(EV) + self.FB_Full.compute(EV) + self.DTLB_Load.compute(EV))) + (self.Memory_Bound.compute(EV) * (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.DTLB_Store.compute(EV) / (self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Streaming_Stores.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV))))) + val = 100 *(self.Memory_Bound.compute(EV) * (self.L1_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.DTLB_Load.compute(EV) / (self.Store_Fwd_Blk.compute(EV) + self.L1_Hit_Latency.compute(EV) + self.DTLB_Load.compute(EV) + self.Lock_Latency.compute(EV) + self.Split_Loads.compute(EV) + self.FB_Full.compute(EV))) + (self.Memory_Bound.compute(EV) * (self.Store_Bound.compute(EV) / (self.L1_Bound.compute(EV) + self.L3_Bound.compute(EV) + self.DRAM_Bound.compute(EV) + self.Store_Bound.compute(EV) + self.L2_Bound.compute(EV))) * (self.DTLB_Store.compute(EV) / (self.Split_Stores.compute(EV) + self.DTLB_Store.compute(EV) + self.Streaming_Stores.compute(EV) + self.Store_Latency.compute(EV) + self.False_Sharing.compute(EV))))) self.thresh = (val > 20) return val @@ -216,24 +225,30 @@ def Irregular_Overhead(self, EV, level): self.thresh = (val > 10) return val -# Total pipeline cost of remaining bottlenecks (apart from those listed in the Info.Bottlenecks metrics class). Examples include data-dependencies (Core Bound when Low ILP) and other unlisted memory-related stalls. +# Total pipeline cost of remaining bottlenecks in the back-end. Examples include data-dependencies (Core Bound when Low ILP) and other unlisted memory-related stalls. def Other_Bottlenecks(self, EV, level): - val = 100 -(Big_Code(self, EV, level) + Instruction_Fetch_BW(self, EV, level) + Mispredictions(self, EV, level) + Cache_Memory_Bandwidth(self, EV, level) + Cache_Memory_Latency(self, EV, level) + Memory_Data_TLBs(self, EV, level) + Memory_Synchronization(self, EV, level) + Compute_Bound_Est(self, EV, level) + Irregular_Overhead(self, EV, level) + Branching_Overhead(self, EV, level) + Base_Non_Br(self, EV, level)) + val = 100 -(Big_Code(self, EV, level) + Instruction_Fetch_BW(self, EV, level) + Mispredictions(self, EV, level) + Cache_Memory_Bandwidth(self, EV, level) + Cache_Memory_Latency(self, EV, level) + Memory_Data_TLBs(self, EV, level) + Memory_Synchronization(self, EV, level) + Compute_Bound_Est(self, EV, level) + Irregular_Overhead(self, EV, level) + Branching_Overhead(self, EV, level) + Useful_Work(self, EV, level)) self.thresh = (val > 20) return val -# Total pipeline cost of branch related instructions (used for program control-flow including function calls) +# Total pipeline cost of instructions used for program control-flow - a subset of the Retiring category in TMA. Examples include function calls; loops and alignments. (A lower bound). Consider Loop Unrolling or function inlining optimizations def Branching_Overhead(self, EV, level): val = 100 * Branching_Retired(self, EV, level) self.thresh = (val > 5) return val -# Total pipeline cost of "useful operations" - the baseline operations not covered by Branching_Overhead nor Irregular_Overhead. -def Base_Non_Br(self, EV, level): +# Total pipeline cost of "useful operations" - the portion of Retiring category not covered by Branching_Overhead nor Irregular_Overhead. +def Useful_Work(self, EV, level): val = 100 *(self.Retiring.compute(EV) - Branching_Retired(self, EV, level) - Assist_Retired(self, EV, level)) self.thresh = (val > 20) return val +# Probability of Core Bound bottleneck hidden by SMT-profiling artifacts. Tip: consider analysis with SMT disabled +def Core_Bound_Likely(self, EV, level): + val = 100 *(1 - self.Core_Bound.compute(EV) / self.Ports_Utilization.compute(EV) if self.Core_Bound.compute(EV)< self.Ports_Utilization.compute(EV) else 1) if SMT_2T_Utilization(self, EV, level)> 0.5 else 0 + self.thresh = (val > 0.5) + return val + # Instructions Per Cycle (per Logical Processor) def IPC(self, EV, level): return EV("INST_RETIRED.ANY", level) / CLKS(self, EV, level) @@ -244,7 +259,7 @@ def UopPI(self, EV, level): self.thresh = (val > 1.05) return val -# Instruction per taken branch +# Uops per taken branch def UpTB(self, EV, level): val = Retired_Slots(self, EV, level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) self.thresh = val < Pipeline_Width * 1.5 @@ -290,12 +305,6 @@ def ILP(self, EV, level): def EPC(self, EV, level): return EV("UOPS_EXECUTED.THREAD", level) / CLKS(self, EV, level) -# Probability of Core Bound bottleneck hidden by SMT-profiling artifacts. Tip: consider analysis with SMT disabled -def Core_Bound_Likely(self, EV, level): - val = 100 *(1 - self.Core_Bound.compute(EV) / self.Ports_Utilization.compute(EV) if self.Core_Bound.compute(EV)< self.Ports_Utilization.compute(EV) else 1) if SMT_2T_Utilization(self, EV, level)> 0.5 else 0 - self.thresh = (val > 0.5) - return val - # Core actual clocks when any Logical Processor is active on the Physical Core def CORE_CLKS(self, EV, level): return EV("CPU_CLK_UNHALTED.DISTRIBUTED", level) if smt_enabled else CLKS(self, EV, level) @@ -324,7 +333,7 @@ def IpCall(self, EV, level): self.thresh = (val < 200) return val -# Instruction per taken branch +# Instructions per taken branch def IpTB(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) self.thresh = val < Pipeline_Width * 2 + 1 @@ -400,10 +409,21 @@ def IpAssist(self, EV, level): self.thresh = (val < 100000) return val -# Instruction-Level-Parallelism (average number of uops executed when there is execution) per physical core def Execute(self, EV, level): return EV("UOPS_EXECUTED.THREAD", level) / Execute_Cycles(self, EV, level) +# Average number of uops fetched from LSD per cycle +def Fetch_LSD(self, EV, level): + return EV("LSD.UOPS", level) / EV("LSD.CYCLES_ACTIVE", level) + +# Average number of uops fetched from DSB per cycle +def Fetch_DSB(self, EV, level): + return EV("IDQ.DSB_UOPS", level) / EV("IDQ.DSB_CYCLES_ANY", level) + +# Average number of uops fetched from MITE per cycle +def Fetch_MITE(self, EV, level): + return EV("IDQ.MITE_UOPS", level) / EV("IDQ.MITE_CYCLES_ANY", level) + # Average number of Uops issued by front-end when it issued something def Fetch_UpC(self, EV, level): return EV("UOPS_ISSUED.ANY", level) / EV("UOPS_ISSUED.ANY:c1", level) @@ -432,6 +452,12 @@ def DSB_Misses(self, EV, level): self.thresh = (val > 10) return val +# Total pipeline cost of DSB (uop cache) hits - subset of the Instruction_Fetch_BW Bottleneck. +def DSB_Bandwidth(self, EV, level): + val = 100 *(self.Frontend_Bound.compute(EV) * (self.Fetch_Bandwidth.compute(EV) / (self.Fetch_Bandwidth.compute(EV) + self.Fetch_Latency.compute(EV))) * (self.DSB.compute(EV) / (self.LSD.compute(EV) + self.MITE.compute(EV) + self.DSB.compute(EV)))) + self.thresh = (val > 10) + return val + # Average Latency for L1 instruction cache misses def ICache_Miss_Latency(self, EV, level): return EV("ICACHE_DATA.STALLS", level) / EV("ICACHE_DATA.STALLS:c1:e1", level) @@ -466,25 +492,25 @@ def IpMispredict(self, EV, level): self.thresh = (val < 200) return val -# Instructions per retired mispredicts for conditional non-taken branches (lower number means higher occurrence rate). +# Instructions per retired Mispredicts for conditional non-taken branches (lower number means higher occurrence rate). def IpMisp_Cond_Ntaken(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("BR_MISP_RETIRED.COND_NTAKEN", level) self.thresh = (val < 200) return val -# Instructions per retired mispredicts for conditional taken branches (lower number means higher occurrence rate). +# Instructions per retired Mispredicts for conditional taken branches (lower number means higher occurrence rate). def IpMisp_Cond_Taken(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("BR_MISP_RETIRED.COND_TAKEN", level) self.thresh = (val < 200) return val -# Instructions per retired mispredicts for return branches (lower number means higher occurrence rate). +# Instructions per retired Mispredicts for return branches (lower number means higher occurrence rate). def IpMisp_Ret(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("BR_MISP_RETIRED.RET", level) self.thresh = (val < 500) return val -# Instructions per retired mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate). +# Instructions per retired Mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate). def IpMisp_Indirect(self, EV, level): val = EV("INST_RETIRED.ANY", level) / EV("BR_MISP_RETIRED.INDIRECT", level) self.thresh = (val < 1000) @@ -494,7 +520,7 @@ def IpMisp_Indirect(self, EV, level): def Branch_Misprediction_Cost(self, EV, level): return Mispredictions(self, EV, level) * SLOTS(self, EV, level) / EV("BR_MISP_RETIRED.ALL_BRANCHES", level) / 100 -# Speculative to Retired ratio of all clears (covering mispredicts and nukes) +# Speculative to Retired ratio of all clears (covering Mispredicts and nukes) def Spec_Clears_Ratio(self, EV, level): return EV("INT_MISC.CLEARS_COUNT", level) / (EV("BR_MISP_RETIRED.ALL_BRANCHES", level) + EV("MACHINE_CLEARS.COUNT", level)) @@ -546,6 +572,10 @@ def L2MPKI_All(self, EV, level): def L2MPKI_Load(self, EV, level): return 1000 * EV("L2_RQSTS.DEMAND_DATA_RD_MISS", level) / EV("INST_RETIRED.ANY", level) +# Offcore requests (L2 cache miss) per kilo instruction for demand RFOs +def L2MPKI_RFO(self, EV, level): + return 1000 * EV("L2_RQSTS.RFO_MISS", level) / EV("INST_RETIRED.ANY", level) + # L2 cache hits per kilo instruction for all request types (including speculative) def L2HPKI_All(self, EV, level): return 1000 *(EV("L2_RQSTS.REFERENCES", level) - EV("L2_RQSTS.MISS", level)) / EV("INST_RETIRED.ANY", level) @@ -562,19 +592,15 @@ def L3MPKI(self, EV, level): def FB_HPKI(self, EV, level): return 1000 * EV("MEM_LOAD_RETIRED.FB_HIT", level) / EV("INST_RETIRED.ANY", level) -# Average per-thread data fill bandwidth to the L1 data cache [GB / sec] def L1D_Cache_Fill_BW(self, EV, level): return 64 * EV("L1D.REPLACEMENT", level) / OneBillion / Time(self, EV, level) -# Average per-thread data fill bandwidth to the L2 cache [GB / sec] def L2_Cache_Fill_BW(self, EV, level): return 64 * EV("L2_LINES_IN.ALL", level) / OneBillion / Time(self, EV, level) -# Average per-thread data fill bandwidth to the L3 cache [GB / sec] def L3_Cache_Fill_BW(self, EV, level): return 64 * EV("LONGEST_LAT_CACHE.MISS", level) / OneBillion / Time(self, EV, level) -# Average per-thread data access bandwidth to the L3 cache [GB / sec] def L3_Cache_Access_BW(self, EV, level): return 64 * EV("OFFCORE_REQUESTS.ALL_REQUESTS", level) / OneBillion / Time(self, EV, level) @@ -638,17 +664,17 @@ def Bus_Lock_PKI(self, EV, level): # Average CPU Utilization (percentage) def CPU_Utilization(self, EV, level): - return EV("CPU_CLK_UNHALTED.REF_TSC", level) / EV("msr/tsc/", 0) + return CPUs_Utilized(self, EV, level) / Num_CPUs(self, EV, level) # Average number of utilized CPUs def CPUs_Utilized(self, EV, level): - return Num_CPUs(self, EV, level) * CPU_Utilization(self, EV, level) + return EV("CPU_CLK_UNHALTED.REF_TSC", level) / EV("msr/tsc/", 0) # Measured Average Core Frequency for unhalted processors [GHz] def Core_Frequency(self, EV, level): return Turbo_Utilization(self, EV, level) * EV("msr/tsc/", 0) / OneBillion / Time(self, EV, level) -# Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width . +# Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width def GFLOPs(self, EV, level): return (FLOP_Count(self, EV, level) / OneBillion) / Time(self, EV, level) @@ -716,7 +742,7 @@ class Frontend_Bound: sample = ['FRONTEND_RETIRED.LATENCY_GE_4:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1', 'PGO']) + metricgroup = frozenset(['BvFB', 'BvIO', 'TmaL1', 'PGO']) maxval = None def compute(self, EV): try: @@ -778,7 +804,7 @@ class ICache_Misses: sample = ['FRONTEND_RETIRED.L2_MISS:pp', 'FRONTEND_RETIRED.L1I_MISS:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['BigFootprint', 'FetchLat', 'IcMiss']) + metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat', 'IcMiss']) maxval = None def compute(self, EV): try: @@ -803,7 +829,7 @@ class ITLB_Misses: sample = ['FRONTEND_RETIRED.STLB_MISS:pp', 'FRONTEND_RETIRED.ITLB_MISS:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['BigFootprint', 'FetchLat', 'MemoryTLB']) + metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat', 'MemoryTLB']) maxval = None def compute(self, EV): try: @@ -860,7 +886,7 @@ class Mispredicts_Resteers: sample = ['INT_MISC.CLEAR_RESTEER_CYCLES'] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'BrMispredicts']) + metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'BvMP']) maxval = None def compute(self, EV): try: @@ -908,7 +934,7 @@ class Unknown_Branches: sample = ['FRONTEND_RETIRED.UNKNOWN_BRANCH'] errcount = 0 sibling = None - metricgroup = frozenset(['BigFootprint', 'FetchLat']) + metricgroup = frozenset(['BigFootprint', 'BvBC', 'FetchLat']) maxval = None def compute(self, EV): try: @@ -935,7 +961,7 @@ class MS_Switches: errcount = 0 sibling = None metricgroup = frozenset(['FetchLat', 'MicroSeq']) - maxval = 1 + maxval = 1.0 def compute(self, EV): try: self.val = MS_Switches_Cost * EV("UOPS_RETIRED.MS:c1:e1", 3) / Retire_Fraction(self, EV, 3) / CLKS(self, EV, 3) @@ -1189,7 +1215,7 @@ class Branch_Mispredicts: sample = ['TOPDOWN.BR_MISPREDICT_SLOTS'] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'TmaL2']) + metricgroup = frozenset(['BadSpec', 'BrMispredicts', 'BvMP', 'TmaL2']) maxval = None def compute(self, EV): try: @@ -1220,7 +1246,7 @@ class Other_Mispredicts: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['BrMispredicts']) + metricgroup = frozenset(['BvIO', 'BrMispredicts']) maxval = None def compute(self, EV): try: @@ -1244,7 +1270,7 @@ class Machine_Clears: sample = ['MACHINE_CLEARS.COUNT'] errcount = 0 sibling = None - metricgroup = frozenset(['BadSpec', 'MachineClears', 'TmaL2']) + metricgroup = frozenset(['BadSpec', 'BvMS', 'MachineClears', 'TmaL2']) maxval = None def compute(self, EV): try: @@ -1274,7 +1300,7 @@ class Other_Nukes: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['Machine_Clears']) + metricgroup = frozenset(['BvIO', 'Machine_Clears']) maxval = None def compute(self, EV): try: @@ -1298,7 +1324,7 @@ class Backend_Bound: sample = ['TOPDOWN.BACKEND_BOUND_SLOTS'] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1']) + metricgroup = frozenset(['BvOB', 'TmaL1']) maxval = None def compute(self, EV): try: @@ -1390,8 +1416,8 @@ class DTLB_Load: sample = ['MEM_INST_RETIRED.STLB_MISS_LOADS:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryTLB']) - maxval = None + metricgroup = frozenset(['BvMT', 'MemoryTLB']) + maxval = 1.0 def compute(self, EV): try: self.val = EV("MEM_INST_RETIRED.STLB_HIT_LOADS", 4) * min(EV("MEM_INST_RETIRED.STLB_HIT_LOADS", 999) , Mem_STLB_Hit_Cost) / CLKS(self, EV, 4) + self.Load_STLB_Miss.compute(EV) @@ -1422,7 +1448,7 @@ class Load_STLB_Hit: errcount = 0 sibling = None metricgroup = frozenset(['MemoryTLB']) - maxval = None + maxval = 1.0 def compute(self, EV): try: self.val = max(0 , self.DTLB_Load.compute(EV) - self.Load_STLB_Miss.compute(EV)) @@ -1446,7 +1472,7 @@ class Load_STLB_Miss: errcount = 0 sibling = None metricgroup = frozenset(['MemoryTLB']) - maxval = None + maxval = 1.0 def compute(self, EV): try: self.val = EV("DTLB_LOAD_MISSES.WALK_ACTIVE", 5) / CLKS(self, EV, 5) @@ -1470,7 +1496,7 @@ class Store_Fwd_Blk: errcount = 0 sibling = None metricgroup = frozenset([]) - maxval = 1 + maxval = 1.0 def compute(self, EV): try: self.val = 13 * EV("LD_BLOCKS.STORE_FORWARD", 4) / CLKS(self, EV, 4) @@ -1491,17 +1517,42 @@ def compute(self, EV): region than the load is reading.""" +class L1_Hit_Latency: + name = "L1_Hit_Latency" + domain = "Clocks_Estimated" + area = "BE/Mem" + level = 4 + htoff = False + sample = ['MEM_LOAD_RETIRED.L1_HIT'] + errcount = 0 + sibling = None + metricgroup = frozenset(['BvML', 'MemoryLat']) + maxval = 1.0 + def compute(self, EV): + try: + self.val = min(2 *(EV("MEM_INST_RETIRED.ALL_LOADS", 4) - EV("MEM_LOAD_RETIRED.FB_HIT", 4) - EV("MEM_LOAD_RETIRED.L1_MISS", 4)) * Dependent_Loads_Weight(self, EV, 4) / 100 , max(EV("CYCLE_ACTIVITY.CYCLES_MEM_ANY", 4) - EV("MEMORY_ACTIVITY.CYCLES_L1D_MISS", 4) , 0)) / CLKS(self, EV, 4) + self.thresh = (self.val > 0.1) and self.parent.thresh + except ZeroDivisionError: + handle_error(self, "L1_Hit_Latency zero division") + return self.val + desc = """ +This metric roughly estimates fraction of cycles with demand +load accesses that hit the L1 cache. The short latency of +the L1 data cache may be exposed in pointer-chasing memory +access patterns as an example.""" + + class Lock_Latency: name = "Lock_Latency" domain = "Clocks" area = "BE/Mem" level = 4 htoff = False - sample = ['MEM_INST_RETIRED.LOCK_LOADS:pp'] + sample = ['MEM_INST_RETIRED.LOCK_LOADS'] errcount = 0 sibling = None metricgroup = frozenset(['Offcore']) - maxval = 1 + maxval = 1.0 def compute(self, EV): try: self.val = EV("MEM_INST_RETIRED.LOCK_LOADS", 4) * EV("MEM_INST_RETIRED.LOCK_LOADS", 999) / CLKS(self, EV, 4) @@ -1526,7 +1577,7 @@ class Split_Loads: errcount = 0 sibling = None metricgroup = frozenset([]) - maxval = 1 + maxval = 1.0 def compute(self, EV): try: self.val = EV("MEM_INST_RETIRED.SPLIT_LOADS", 4) * min(EV("MEM_INST_RETIRED.SPLIT_LOADS", 999) , Load_Miss_Real_Latency(self, EV, 4)) / CLKS(self, EV, 4) @@ -1550,7 +1601,7 @@ class FB_Full: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW']) + metricgroup = frozenset(['BvMB', 'MemoryBW']) maxval = None def compute(self, EV): try: @@ -1579,7 +1630,7 @@ class L2_Bound: sample = ['MEM_LOAD_RETIRED.L2_HIT:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['CacheHits', 'MemoryBound', 'TmaL3mem']) + metricgroup = frozenset(['BvML', 'CacheHits', 'MemoryBound', 'TmaL3mem']) maxval = None def compute(self, EV): try: @@ -1629,8 +1680,8 @@ class Contested_Accesses: sample = ['MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD', 'MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS'] errcount = 0 sibling = None - metricgroup = frozenset(['DataSharing', 'Offcore', 'Snoop']) - maxval = 1 + metricgroup = frozenset(['BvMS', 'DataSharing', 'Offcore', 'Snoop']) + maxval = 1.0 def compute(self, EV): try: self.val = (EV("MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS", 4) * min(EV("MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS", 999) , Mem_XSNP_Hit_Cost(self, EV, 4) - Mem_L2_Hit_Cost(self, EV, 4)) + EV("MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD", 4) * min(EV("MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD", 999) , Mem_XSNP_HitM_Cost(self, EV, 4) - Mem_L2_Hit_Cost(self, EV, 4)) * True_XSNP_HitM_Fraction(self, EV, 4)) * FB_Factor(self, EV, 4) / CLKS(self, EV, 4) @@ -1657,8 +1708,8 @@ class Data_Sharing: sample = ['MEM_LOAD_L3_HIT_RETIRED.XSNP_NO_FWD'] errcount = 0 sibling = None - metricgroup = frozenset(['Offcore', 'Snoop']) - maxval = 1 + metricgroup = frozenset(['BvMS', 'Offcore', 'Snoop']) + maxval = 1.0 def compute(self, EV): try: self.val = (EV("MEM_LOAD_L3_HIT_RETIRED.XSNP_NO_FWD", 4) * min(EV("MEM_LOAD_L3_HIT_RETIRED.XSNP_NO_FWD", 999) , Mem_XSNP_Hit_Cost(self, EV, 4) - Mem_L2_Hit_Cost(self, EV, 4)) + EV("MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD", 4) * min(EV("MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD", 999) , Mem_XSNP_Hit_Cost(self, EV, 4) - Mem_L2_Hit_Cost(self, EV, 4)) * (1 - True_XSNP_HitM_Fraction(self, EV, 4))) * FB_Factor(self, EV, 4) / CLKS(self, EV, 4) @@ -1684,8 +1735,8 @@ class L3_Hit_Latency: sample = ['MEM_LOAD_RETIRED.L3_HIT:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat']) - maxval = 1 + metricgroup = frozenset(['BvML', 'MemoryLat']) + maxval = 1.0 def compute(self, EV): try: self.val = EV("MEM_LOAD_RETIRED.L3_HIT", 4) * min(EV("MEM_LOAD_RETIRED.L3_HIT", 999) , Mem_XSNP_None_Cost(self, EV, 4) - Mem_L2_Hit_Cost(self, EV, 4)) * FB_Factor(self, EV, 4) / CLKS(self, EV, 4) @@ -1712,7 +1763,7 @@ class SQ_Full: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW', 'Offcore']) + metricgroup = frozenset(['BvMB', 'MemoryBW', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1737,7 +1788,7 @@ class DRAM_Bound: errcount = 0 sibling = None metricgroup = frozenset(['MemoryBound', 'TmaL3mem']) - maxval = 1 + maxval = 1.0 def compute(self, EV): try: self.val = MEM_Bound_Ratio(self, EV, 3) @@ -1760,7 +1811,7 @@ class MEM_Bandwidth: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryBW', 'Offcore']) + metricgroup = frozenset(['BvMB', 'MemoryBW', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1798,7 +1849,7 @@ class MEM_Latency: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat', 'Offcore']) + metricgroup = frozenset(['BvML', 'MemoryLat', 'Offcore']) maxval = None def compute(self, EV): try: @@ -1854,8 +1905,8 @@ class Store_Latency: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryLat', 'Offcore']) - maxval = 1 + metricgroup = frozenset(['BvML', 'MemoryLat', 'Offcore']) + maxval = 1.0 def compute(self, EV): try: self.val = (Store_L2_Hit_Cycles(self, EV, 4) + (1 - Mem_Lock_St_Fraction(self, EV, 4)) * ORO_Demand_RFO_C1(self, EV, 4)) / CLKS(self, EV, 4) @@ -1882,8 +1933,8 @@ class False_Sharing: sample = ['OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM'] errcount = 0 sibling = None - metricgroup = frozenset(['DataSharing', 'Offcore', 'Snoop']) - maxval = 1 + metricgroup = frozenset(['BvMS', 'DataSharing', 'Offcore', 'Snoop']) + maxval = 1.0 def compute(self, EV): try: self.val = Mem_XSNP_HitM_Cost(self, EV, 4) * EV("OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM", 4) / CLKS(self, EV, 4) @@ -1934,10 +1985,11 @@ class Streaming_Stores: errcount = 0 sibling = None metricgroup = frozenset(['MemoryBW', 'Offcore']) - maxval = 1 + maxval = 1.0 def compute(self, EV): try: - self.val = 9 * EV("OCR.STREAMING_WR.ANY_RESPONSE", 4) / CLKS(self, EV, 4) + self.val = 9 * EV("OCR.STREAMING_WR.ANY_RESPONSE", 4) / CLKS(self, EV, 4) if DS else 0 + EV("OCR.STREAMING_WR.ANY_RESPONSE", 4) self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: handle_error(self, "Streaming_Stores zero division") @@ -1961,8 +2013,8 @@ class DTLB_Store: sample = ['MEM_INST_RETIRED.STLB_MISS_STORES:pp'] errcount = 0 sibling = None - metricgroup = frozenset(['MemoryTLB']) - maxval = 1 + metricgroup = frozenset(['BvMT', 'MemoryTLB']) + maxval = 1.0 def compute(self, EV): try: self.val = EV("MEM_INST_RETIRED.STLB_HIT_STORES", 4) * min(EV("MEM_INST_RETIRED.STLB_HIT_STORES", 999) , Mem_STLB_Hit_Cost) / CLKS(self, EV, 4) + self.Store_STLB_Miss.compute(EV) @@ -1991,7 +2043,7 @@ class Store_STLB_Hit: errcount = 0 sibling = None metricgroup = frozenset(['MemoryTLB']) - maxval = 1 + maxval = 1.0 def compute(self, EV): try: self.val = max(0 , self.DTLB_Store.compute(EV) - self.Store_STLB_Miss.compute(EV)) @@ -2015,7 +2067,7 @@ class Store_STLB_Miss: errcount = 0 sibling = None metricgroup = frozenset(['MemoryTLB']) - maxval = 1 + maxval = 1.0 def compute(self, EV): try: self.val = EV("DTLB_STORE_MISSES.WALK_ACTIVE", 5) / CORE_CLKS(self, EV, 5) @@ -2069,8 +2121,8 @@ class Divider: sample = ['ARITH.DIV_ACTIVE'] errcount = 0 sibling = None - metricgroup = frozenset([]) - maxval = None + metricgroup = frozenset(['BvCB']) + maxval = 1.0 def compute(self, EV): try: self.val = EV("ARITH.DIV_ACTIVE", 3) / CLKS(self, EV, 3) @@ -2095,7 +2147,7 @@ class Serializing_Operation: sample = ['RESOURCE_STALLS.SCOREBOARD'] errcount = 0 sibling = None - metricgroup = frozenset(['PortsUtil']) + metricgroup = frozenset(['BvIO', 'PortsUtil']) maxval = None def compute(self, EV): try: @@ -2192,7 +2244,7 @@ class Memory_Fence: errcount = 0 sibling = None metricgroup = frozenset([]) - maxval = 1 + maxval = 1.0 def compute(self, EV): try: self.val = 13 * EV("MISC2_RETIRED.LFENCE", 4) / CLKS(self, EV, 4) @@ -2254,7 +2306,7 @@ class Ports_Utilized_0: maxval = None def compute(self, EV): try: - self.val = max((EV("EXE_ACTIVITY.3_PORTS_UTIL:u0x80", 4) + EV("RS.EMPTY_RESOURCE", 4)) / CLKS(self, EV, 4) , 1) * (EV("CYCLE_ACTIVITY.STALLS_TOTAL", 4) - EV("EXE_ACTIVITY.BOUND_ON_LOADS", 4)) / CLKS(self, EV, 4) + self.val = max((EV("EXE_ACTIVITY.EXE_BOUND_0_PORTS", 4) + max(EV("RS.EMPTY_RESOURCE", 4) - EV("RESOURCE_STALLS.SCOREBOARD", 4) , 0)) / CLKS(self, EV, 4) , 1) * (EV("CYCLE_ACTIVITY.STALLS_TOTAL", 4) - EV("EXE_ACTIVITY.BOUND_ON_LOADS", 4)) / CLKS(self, EV, 4) self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: handle_error(self, "Ports_Utilized_0 zero division") @@ -2280,7 +2332,7 @@ class Mixing_Vectors: errcount = 0 sibling = None metricgroup = frozenset([]) - maxval = 1 + maxval = 1.0 def compute(self, EV): try: self.val = 160 * EV("ASSISTS.SSE_AVX_MIX", 5) / CLKS(self, EV, 5) @@ -2364,7 +2416,7 @@ class Ports_Utilized_3m: sample = ['UOPS_EXECUTED.CYCLES_GE_3'] errcount = 0 sibling = None - metricgroup = frozenset(['PortsUtil']) + metricgroup = frozenset(['BvCB', 'PortsUtil']) maxval = None def compute(self, EV): try: @@ -2527,7 +2579,7 @@ class Retiring: sample = ['UOPS_RETIRED.SLOTS'] errcount = 0 sibling = None - metricgroup = frozenset(['TmaL1']) + metricgroup = frozenset(['BvUW', 'TmaL1']) maxval = None def compute(self, EV): try: @@ -2676,7 +2728,7 @@ class FP_Vector: errcount = 0 sibling = None metricgroup = frozenset(['Compute', 'Flops']) - maxval = 1 + maxval = 1.0 def compute(self, EV): try: self.val = FP_Arith_Vector(self, EV, 4) / Retired_Slots(self, EV, 4) @@ -2701,7 +2753,7 @@ class FP_Vector_128b: errcount = 0 sibling = None metricgroup = frozenset(['Compute', 'Flops']) - maxval = 1 + maxval = 1.0 def compute(self, EV): try: self.val = (EV("FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE", 5) + EV("FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE", 5)) / Retired_Slots(self, EV, 5) @@ -2726,7 +2778,7 @@ class FP_Vector_256b: errcount = 0 sibling = None metricgroup = frozenset(['Compute', 'Flops']) - maxval = 1 + maxval = 1.0 def compute(self, EV): try: self.val = (EV("FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE", 5) + EV("FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE", 5)) / Retired_Slots(self, EV, 5) @@ -2810,9 +2862,9 @@ def compute(self, EV): handle_error(self, "Int_Vector_256b zero division") return self.val desc = """ -This metric represents 256-bit vector Integer ADD/SUB/SAD or -VNNI (Vector Neural Network Instructions) uops fraction the -CPU has retired.""" +This metric represents 256-bit vector Integer +ADD/SUB/SAD/MUL or VNNI (Vector Neural Network Instructions) +uops fraction the CPU has retired.""" class Memory_Operations: @@ -2848,7 +2900,7 @@ class Fused_Instructions: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['Branches', 'Pipeline']) + metricgroup = frozenset(['Branches', 'BvBO', 'Pipeline']) maxval = None def compute(self, EV): try: @@ -2876,7 +2928,7 @@ class Non_Fused_Branches: sample = [] errcount = 0 sibling = None - metricgroup = frozenset(['Branches', 'Pipeline']) + metricgroup = frozenset(['Branches', 'BvBO', 'Pipeline']) maxval = None def compute(self, EV): try: @@ -2926,7 +2978,7 @@ class Nop_Instructions: sample = ['INST_RETIRED.NOP'] errcount = 0 sibling = None - metricgroup = frozenset(['Pipeline']) + metricgroup = frozenset(['BvBO', 'Pipeline']) maxval = None def compute(self, EV): try: @@ -3056,8 +3108,8 @@ class Assists: sample = ['ASSISTS.ANY'] errcount = 0 sibling = None - metricgroup = frozenset([]) - maxval = 1 + metricgroup = frozenset(['BvIO']) + maxval = 1.0 def compute(self, EV): try: self.val = Avg_Assist_Cost * EV("ASSISTS.ANY", 4) / SLOTS(self, EV, 4) @@ -3197,7 +3249,7 @@ class Metric_Mispredictions: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Bad', 'BadSpec', 'BrMispredicts']) + metricgroup = frozenset(['Bad', 'BadSpec', 'BrMispredicts', 'BvMP']) sibling = None def compute(self, EV): @@ -3217,7 +3269,7 @@ class Metric_Big_Code: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['BigFootprint', 'Fed', 'Frontend', 'IcMiss', 'MemoryTLB']) + metricgroup = frozenset(['BvBC', 'BigFootprint', 'Fed', 'Frontend', 'IcMiss', 'MemoryTLB']) sibling = None def compute(self, EV): @@ -3238,7 +3290,7 @@ class Metric_Instruction_Fetch_BW: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Fed', 'FetchBW', 'Frontend']) + metricgroup = frozenset(['BvFB', 'Fed', 'FetchBW', 'Frontend']) sibling = None def compute(self, EV): @@ -3249,7 +3301,8 @@ def compute(self, EV): handle_error_metric(self, "Instruction_Fetch_BW zero division") desc = """ Total pipeline cost of instruction fetch bandwidth related -bottlenecks""" +bottlenecks (when the front-end could not sustain operations +delivery to the back-end)""" class Metric_Cache_Memory_Bandwidth: @@ -3258,7 +3311,7 @@ class Metric_Cache_Memory_Bandwidth: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Mem', 'MemoryBW', 'Offcore']) + metricgroup = frozenset(['BvMB', 'Mem', 'MemoryBW', 'Offcore']) sibling = None def compute(self, EV): @@ -3278,7 +3331,7 @@ class Metric_Cache_Memory_Latency: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Mem', 'MemoryLat', 'Offcore']) + metricgroup = frozenset(['BvML', 'Mem', 'MemoryLat', 'Offcore']) sibling = None def compute(self, EV): @@ -3298,7 +3351,7 @@ class Metric_Memory_Data_TLBs: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Mem', 'MemoryTLB', 'Offcore']) + metricgroup = frozenset(['BvMT', 'Mem', 'MemoryTLB', 'Offcore']) sibling = None def compute(self, EV): @@ -3318,7 +3371,7 @@ class Metric_Memory_Synchronization: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Mem', 'Offcore']) + metricgroup = frozenset(['BvMS', 'Mem', 'Offcore']) sibling = None def compute(self, EV): @@ -3339,7 +3392,7 @@ class Metric_Compute_Bound_Est: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Cor']) + metricgroup = frozenset(['BvCB', 'Cor']) sibling = None def compute(self, EV): @@ -3360,7 +3413,7 @@ class Metric_Irregular_Overhead: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Bad', 'Cor', 'Ret']) + metricgroup = frozenset(['Bad', 'BvIO', 'Cor', 'Ret']) sibling = None def compute(self, EV): @@ -3382,7 +3435,7 @@ class Metric_Other_Bottlenecks: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Cor', 'Offcore']) + metricgroup = frozenset(['BvOB', 'Cor', 'Offcore']) sibling = None def compute(self, EV): @@ -3392,10 +3445,9 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "Other_Bottlenecks zero division") desc = """ -Total pipeline cost of remaining bottlenecks (apart from -those listed in the Info.Bottlenecks metrics class). -Examples include data-dependencies (Core Bound when Low ILP) -and other unlisted memory-related stalls.""" +Total pipeline cost of remaining bottlenecks in the back- +end. Examples include data-dependencies (Core Bound when Low +ILP) and other unlisted memory-related stalls.""" class Metric_Branching_Overhead: @@ -3404,7 +3456,7 @@ class Metric_Branching_Overhead: maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Ret']) + metricgroup = frozenset(['BvBO', 'Ret']) sibling = None def compute(self, EV): @@ -3414,31 +3466,54 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "Branching_Overhead zero division") desc = """ -Total pipeline cost of branch related instructions (used for -program control-flow including function calls)""" +Total pipeline cost of instructions used for program +control-flow - a subset of the Retiring category in TMA. +Examples include function calls; loops and alignments. (A +lower bound). Consider Loop Unrolling or function inlining +optimizations""" -class Metric_Base_Non_Br: - name = "Base_Non_Br" +class Metric_Useful_Work: + name = "Useful_Work" domain = "Scaled_Slots" maxval = 0 errcount = 0 area = "Info.Bottleneck" - metricgroup = frozenset(['Ret']) + metricgroup = frozenset(['BvUW', 'Ret']) sibling = None def compute(self, EV): try: - self.val = Base_Non_Br(self, EV, 0) + self.val = Useful_Work(self, EV, 0) self.thresh = (self.val > 20) except ZeroDivisionError: - handle_error_metric(self, "Base_Non_Br zero division") + handle_error_metric(self, "Useful_Work zero division") desc = """ -Total pipeline cost of \"useful operations\" - the baseline -operations not covered by Branching_Overhead nor +Total pipeline cost of \"useful operations\" - the portion +of Retiring category not covered by Branching_Overhead nor Irregular_Overhead.""" +class Metric_Core_Bound_Likely: + name = "Core_Bound_Likely" + domain = "Metric" + maxval = 1.0 + errcount = 0 + area = "Info.Botlnk.L0" + metricgroup = frozenset(['Cor', 'SMT']) + sibling = None + + def compute(self, EV): + try: + self.val = Core_Bound_Likely(self, EV, 0) + self.thresh = (self.val > 0.5) + except ZeroDivisionError: + handle_error_metric(self, "Core_Bound_Likely zero division") + desc = """ +Probability of Core Bound bottleneck hidden by SMT-profiling +artifacts. Tip: consider analysis with SMT disabled""" + + class Metric_IPC: name = "IPC" domain = "Metric" @@ -3461,7 +3536,7 @@ def compute(self, EV): class Metric_UopPI: name = "UopPI" domain = "Metric" - maxval = 2 + maxval = 2.0 errcount = 0 area = "Info.Thread" metricgroup = frozenset(['Pipeline', 'Ret', 'Retire']) @@ -3493,7 +3568,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "UpTB zero division") desc = """ -Instruction per taken branch""" +Uops per taken branch""" class Metric_CPI: @@ -3558,7 +3633,7 @@ def compute(self, EV): class Metric_Slots_Utilization: name = "Slots_Utilization" domain = "Metric" - maxval = 1 + maxval = 1.0 errcount = 0 area = "Info.Thread" metricgroup = frozenset(['SMT', 'TmaL1']) @@ -3619,7 +3694,7 @@ def compute(self, EV): class Metric_FLOPc: name = "FLOPc" domain = "Core_Metric" - maxval = 10 + maxval = 10.0 errcount = 0 area = "Info.Core" metricgroup = frozenset(['Ret', 'Flops']) @@ -3638,7 +3713,7 @@ def compute(self, EV): class Metric_FP_Arith_Utilization: name = "FP_Arith_Utilization" domain = "Core_Metric" - maxval = 2 + maxval = 2.0 errcount = 0 area = "Info.Core" metricgroup = frozenset(['Cor', 'Flops', 'HPC']) @@ -3682,7 +3757,7 @@ def compute(self, EV): class Metric_EPC: name = "EPC" domain = "Metric" - maxval = 20 + maxval = 20.0 errcount = 0 area = "Info.Core" metricgroup = frozenset(['Power']) @@ -3698,26 +3773,6 @@ def compute(self, EV): uops Executed per Cycle""" -class Metric_Core_Bound_Likely: - name = "Core_Bound_Likely" - domain = "Metric" - maxval = 1 - errcount = 0 - area = "Info.Botlnk.L0" - metricgroup = frozenset(['Cor', 'SMT']) - sibling = None - - def compute(self, EV): - try: - self.val = Core_Bound_Likely(self, EV, 0) - self.thresh = (self.val > 0.5) - except ZeroDivisionError: - handle_error_metric(self, "Core_Bound_Likely zero division") - desc = """ -Probability of Core Bound bottleneck hidden by SMT-profiling -artifacts. Tip: consider analysis with SMT disabled""" - - class Metric_CORE_CLKS: name = "CORE_CLKS" domain = "Count" @@ -3834,7 +3889,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "IpTB zero division") desc = """ -Instruction per taken branch""" +Instructions per taken branch""" class Metric_BpTkBranch: @@ -4124,14 +4179,70 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "Execute zero division") desc = """ -Instruction-Level-Parallelism (average number of uops -executed when there is execution) per physical core""" +""" + + +class Metric_Fetch_LSD: + name = "Fetch_LSD" + domain = "Metric" + maxval = 0 + errcount = 0 + area = "Info.Pipeline" + metricgroup = frozenset(['Fed', 'FetchBW']) + sibling = None + + def compute(self, EV): + try: + self.val = Fetch_LSD(self, EV, 0) + self.thresh = True + except ZeroDivisionError: + handle_error_metric(self, "Fetch_LSD zero division") + desc = """ +Average number of uops fetched from LSD per cycle""" + + +class Metric_Fetch_DSB: + name = "Fetch_DSB" + domain = "Metric" + maxval = 0 + errcount = 0 + area = "Info.Pipeline" + metricgroup = frozenset(['Fed', 'FetchBW']) + sibling = None + + def compute(self, EV): + try: + self.val = Fetch_DSB(self, EV, 0) + self.thresh = True + except ZeroDivisionError: + handle_error_metric(self, "Fetch_DSB zero division") + desc = """ +Average number of uops fetched from DSB per cycle""" + + +class Metric_Fetch_MITE: + name = "Fetch_MITE" + domain = "Metric" + maxval = 0 + errcount = 0 + area = "Info.Pipeline" + metricgroup = frozenset(['Fed', 'FetchBW']) + sibling = None + + def compute(self, EV): + try: + self.val = Fetch_MITE(self, EV, 0) + self.thresh = True + except ZeroDivisionError: + handle_error_metric(self, "Fetch_MITE zero division") + desc = """ +Average number of uops fetched from MITE per cycle""" class Metric_Fetch_UpC: name = "Fetch_UpC" domain = "Metric" - maxval = 6 + maxval = 6.0 errcount = 0 area = "Info.Frontend" metricgroup = frozenset(['Fed', 'FetchBW']) @@ -4151,7 +4262,7 @@ def compute(self, EV): class Metric_LSD_Coverage: name = "LSD_Coverage" domain = "Metric" - maxval = 1 + maxval = 1.0 errcount = 0 area = "Info.Frontend" metricgroup = frozenset(['Fed', 'LSD']) @@ -4171,7 +4282,7 @@ def compute(self, EV): class Metric_DSB_Coverage: name = "DSB_Coverage" domain = "Metric" - maxval = 1 + maxval = 1.0 errcount = 0 area = "Info.Frontend" metricgroup = frozenset(['DSB', 'Fed', 'FetchBW']) @@ -4251,6 +4362,26 @@ def compute(self, EV): the Instruction_Fetch_BW Bottleneck.""" +class Metric_DSB_Bandwidth: + name = "DSB_Bandwidth" + domain = "Scaled_Slots" + maxval = 0 + errcount = 0 + area = "Info.Botlnk.L2" + metricgroup = frozenset(['DSB', 'FetchBW']) + sibling = None + + def compute(self, EV): + try: + self.val = DSB_Bandwidth(self, EV, 0) + self.thresh = (self.val > 10) + except ZeroDivisionError: + handle_error_metric(self, "DSB_Bandwidth zero division") + desc = """ +Total pipeline cost of DSB (uop cache) hits - subset of the +Instruction_Fetch_BW Bottleneck.""" + + class Metric_ICache_Miss_Latency: name = "ICache_Miss_Latency" domain = "Metric" @@ -4406,7 +4537,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "IpMisp_Cond_Ntaken zero division") desc = """ -Instructions per retired mispredicts for conditional non- +Instructions per retired Mispredicts for conditional non- taken branches (lower number means higher occurrence rate).""" @@ -4426,7 +4557,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "IpMisp_Cond_Taken zero division") desc = """ -Instructions per retired mispredicts for conditional taken +Instructions per retired Mispredicts for conditional taken branches (lower number means higher occurrence rate).""" @@ -4446,7 +4577,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "IpMisp_Ret zero division") desc = """ -Instructions per retired mispredicts for return branches +Instructions per retired Mispredicts for return branches (lower number means higher occurrence rate).""" @@ -4466,7 +4597,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "IpMisp_Indirect zero division") desc = """ -Instructions per retired mispredicts for indirect CALL or +Instructions per retired Mispredicts for indirect CALL or JMP branches (lower number means higher occurrence rate).""" @@ -4507,13 +4638,13 @@ def compute(self, EV): handle_error_metric(self, "Spec_Clears_Ratio zero division") desc = """ Speculative to Retired ratio of all clears (covering -mispredicts and nukes)""" +Mispredicts and nukes)""" class Metric_Cond_NT: name = "Cond_NT" domain = "Fraction" - maxval = 1 + maxval = 1.0 errcount = 0 area = "Info.Branches" metricgroup = frozenset(['Bad', 'Branches', 'CodeGen', 'PGO']) @@ -4532,7 +4663,7 @@ def compute(self, EV): class Metric_Cond_TK: name = "Cond_TK" domain = "Fraction" - maxval = 1 + maxval = 1.0 errcount = 0 area = "Info.Branches" metricgroup = frozenset(['Bad', 'Branches', 'CodeGen', 'PGO']) @@ -4551,7 +4682,7 @@ def compute(self, EV): class Metric_CallRet: name = "CallRet" domain = "Fraction" - maxval = 1 + maxval = 1.0 errcount = 0 area = "Info.Branches" metricgroup = frozenset(['Bad', 'Branches']) @@ -4570,7 +4701,7 @@ def compute(self, EV): class Metric_Jump: name = "Jump" domain = "Fraction" - maxval = 1 + maxval = 1.0 errcount = 0 area = "Info.Branches" metricgroup = frozenset(['Bad', 'Branches']) @@ -4590,7 +4721,7 @@ def compute(self, EV): class Metric_Other_Branches: name = "Other_Branches" domain = "Fraction" - maxval = 1 + maxval = 1.0 errcount = 0 area = "Info.Branches" metricgroup = frozenset(['Bad', 'Branches']) @@ -4630,7 +4761,7 @@ def compute(self, EV): class Metric_MLP: name = "MLP" domain = "Metric" - maxval = 10 + maxval = 10.0 errcount = 0 area = "Info.Memory" metricgroup = frozenset(['Mem', 'MemoryBound', 'MemoryBW']) @@ -4748,6 +4879,26 @@ def compute(self, EV): loads (including speculative)""" +class Metric_L2MPKI_RFO: + name = "L2MPKI_RFO" + domain = "Metric" + maxval = 0 + errcount = 0 + area = "Info.Memory" + metricgroup = frozenset(['CacheMisses', 'Offcore']) + sibling = None + + def compute(self, EV): + try: + self.val = L2MPKI_RFO(self, EV, 0) + self.thresh = True + except ZeroDivisionError: + handle_error_metric(self, "L2MPKI_RFO zero division") + desc = """ +Offcore requests (L2 cache miss) per kilo instruction for +demand RFOs""" + + class Metric_L2HPKI_All: name = "L2HPKI_All" domain = "Metric" @@ -4845,8 +4996,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L1D_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L1 data cache -[GB / sec]""" +""" class Metric_L2_Cache_Fill_BW: @@ -4865,8 +5015,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L2_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L2 cache [GB / -sec]""" +""" class Metric_L3_Cache_Fill_BW: @@ -4885,8 +5034,7 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L3_Cache_Fill_BW zero division") desc = """ -Average per-thread data fill bandwidth to the L3 cache [GB / -sec]""" +""" class Metric_L3_Cache_Access_BW: @@ -4905,14 +5053,13 @@ def compute(self, EV): except ZeroDivisionError: handle_error_metric(self, "L3_Cache_Access_BW zero division") desc = """ -Average per-thread data access bandwidth to the L3 cache [GB -/ sec]""" +""" class Metric_Page_Walks_Utilization: name = "Page_Walks_Utilization" domain = "Core_Metric" - maxval = 1 + maxval = 1.0 errcount = 0 area = "Info.Memory.TLB" metricgroup = frozenset(['Mem', 'MemoryTLB']) @@ -5189,7 +5336,7 @@ def compute(self, EV): class Metric_CPU_Utilization: name = "CPU_Utilization" domain = "Metric" - maxval = 200 + maxval = 1 errcount = 0 area = "Info.System" metricgroup = frozenset(['HPC', 'Summary']) @@ -5208,7 +5355,7 @@ def compute(self, EV): class Metric_CPUs_Utilized: name = "CPUs_Utilized" domain = "Metric" - maxval = 0 + maxval = 300 errcount = 0 area = "Info.System" metricgroup = frozenset(['Summary']) @@ -5262,13 +5409,13 @@ def compute(self, EV): desc = """ Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector -instructions, vector-width .""" +instructions, vector-width""" class Metric_Turbo_Utilization: name = "Turbo_Utilization" domain = "Core_Metric" - maxval = 10 + maxval = 10.0 errcount = 0 area = "Info.System" metricgroup = frozenset(['Power']) @@ -5287,7 +5434,7 @@ def compute(self, EV): class Metric_SMT_2T_Utilization: name = "SMT_2T_Utilization" domain = "Core_Metric" - maxval = 1 + maxval = 1.0 errcount = 0 area = "Info.System" metricgroup = frozenset(['SMT']) @@ -5307,7 +5454,7 @@ def compute(self, EV): class Metric_Kernel_Utilization: name = "Kernel_Utilization" domain = "Metric" - maxval = 1 + maxval = 1.0 errcount = 0 area = "Info.System" metricgroup = frozenset(['OS']) @@ -5347,7 +5494,7 @@ def compute(self, EV): class Metric_C0_Wait: name = "C0_Wait" domain = "Metric" - maxval = 1 + maxval = 1.0 errcount = 0 area = "Info.System" metricgroup = frozenset(['C0Wait']) @@ -5521,6 +5668,7 @@ def __init__(self, r): n = Load_STLB_Hit() ; r.run(n) ; o["Load_STLB_Hit"] = n n = Load_STLB_Miss() ; r.run(n) ; o["Load_STLB_Miss"] = n n = Store_Fwd_Blk() ; r.run(n) ; o["Store_Fwd_Blk"] = n + n = L1_Hit_Latency() ; r.run(n) ; o["L1_Hit_Latency"] = n n = Lock_Latency() ; r.run(n) ; o["Lock_Latency"] = n n = Split_Loads() ; r.run(n) ; o["Split_Loads"] = n n = FB_Full() ; r.run(n) ; o["FB_Full"] = n @@ -5613,6 +5761,7 @@ def __init__(self, r): o["Load_STLB_Hit"].parent = o["DTLB_Load"] o["Load_STLB_Miss"].parent = o["DTLB_Load"] o["Store_Fwd_Blk"].parent = o["L1_Bound"] + o["L1_Hit_Latency"].parent = o["L1_Bound"] o["Lock_Latency"].parent = o["L1_Bound"] o["Split_Loads"].parent = o["L1_Bound"] o["FB_Full"].parent = o["L1_Bound"] @@ -5690,7 +5839,8 @@ def __init__(self, r): n = Metric_Irregular_Overhead() ; r.metric(n) ; o["Irregular_Overhead"] = n n = Metric_Other_Bottlenecks() ; r.metric(n) ; o["Other_Bottlenecks"] = n n = Metric_Branching_Overhead() ; r.metric(n) ; o["Branching_Overhead"] = n - n = Metric_Base_Non_Br() ; r.metric(n) ; o["Base_Non_Br"] = n + n = Metric_Useful_Work() ; r.metric(n) ; o["Useful_Work"] = n + n = Metric_Core_Bound_Likely() ; r.metric(n) ; o["Core_Bound_Likely"] = n n = Metric_IPC() ; r.metric(n) ; o["IPC"] = n n = Metric_UopPI() ; r.metric(n) ; o["UopPI"] = n n = Metric_UpTB() ; r.metric(n) ; o["UpTB"] = n @@ -5704,7 +5854,6 @@ def __init__(self, r): n = Metric_FP_Arith_Utilization() ; r.metric(n) ; o["FP_Arith_Utilization"] = n n = Metric_ILP() ; r.metric(n) ; o["ILP"] = n n = Metric_EPC() ; r.metric(n) ; o["EPC"] = n - n = Metric_Core_Bound_Likely() ; r.metric(n) ; o["Core_Bound_Likely"] = n n = Metric_CORE_CLKS() ; r.metric(n) ; o["CORE_CLKS"] = n n = Metric_IpLoad() ; r.metric(n) ; o["IpLoad"] = n n = Metric_IpStore() ; r.metric(n) ; o["IpStore"] = n @@ -5725,12 +5874,16 @@ def __init__(self, r): n = Metric_Strings_Cycles() ; r.metric(n) ; o["Strings_Cycles"] = n n = Metric_IpAssist() ; r.metric(n) ; o["IpAssist"] = n n = Metric_Execute() ; r.metric(n) ; o["Execute"] = n + n = Metric_Fetch_LSD() ; r.metric(n) ; o["Fetch_LSD"] = n + n = Metric_Fetch_DSB() ; r.metric(n) ; o["Fetch_DSB"] = n + n = Metric_Fetch_MITE() ; r.metric(n) ; o["Fetch_MITE"] = n n = Metric_Fetch_UpC() ; r.metric(n) ; o["Fetch_UpC"] = n n = Metric_LSD_Coverage() ; r.metric(n) ; o["LSD_Coverage"] = n n = Metric_DSB_Coverage() ; r.metric(n) ; o["DSB_Coverage"] = n n = Metric_Unknown_Branch_Cost() ; r.metric(n) ; o["Unknown_Branch_Cost"] = n n = Metric_DSB_Switch_Cost() ; r.metric(n) ; o["DSB_Switch_Cost"] = n n = Metric_DSB_Misses() ; r.metric(n) ; o["DSB_Misses"] = n + n = Metric_DSB_Bandwidth() ; r.metric(n) ; o["DSB_Bandwidth"] = n n = Metric_ICache_Miss_Latency() ; r.metric(n) ; o["ICache_Miss_Latency"] = n n = Metric_IC_Misses() ; r.metric(n) ; o["IC_Misses"] = n n = Metric_IpDSB_Miss_Ret() ; r.metric(n) ; o["IpDSB_Miss_Ret"] = n @@ -5756,6 +5909,7 @@ def __init__(self, r): n = Metric_L2MPKI() ; r.metric(n) ; o["L2MPKI"] = n n = Metric_L2MPKI_All() ; r.metric(n) ; o["L2MPKI_All"] = n n = Metric_L2MPKI_Load() ; r.metric(n) ; o["L2MPKI_Load"] = n + n = Metric_L2MPKI_RFO() ; r.metric(n) ; o["L2MPKI_RFO"] = n n = Metric_L2HPKI_All() ; r.metric(n) ; o["L2HPKI_All"] = n n = Metric_L2HPKI_Load() ; r.metric(n) ; o["L2HPKI_Load"] = n n = Metric_L3MPKI() ; r.metric(n) ; o["L3MPKI"] = n @@ -5930,46 +6084,54 @@ def __init__(self, r): o["Cache_Memory_Bandwidth"].Store_Fwd_Blk = o["Store_Fwd_Blk"] o["Cache_Memory_Bandwidth"].SQ_Full = o["SQ_Full"] o["Cache_Memory_Bandwidth"].MEM_Bandwidth = o["MEM_Bandwidth"] + o["Cache_Memory_Bandwidth"].L1_Hit_Latency = o["L1_Hit_Latency"] o["Cache_Memory_Bandwidth"].Load_STLB_Miss = o["Load_STLB_Miss"] - o["Cache_Memory_Bandwidth"].DTLB_Load = o["DTLB_Load"] o["Cache_Memory_Bandwidth"].Data_Sharing = o["Data_Sharing"] o["Cache_Memory_Bandwidth"].L2_Bound = o["L2_Bound"] o["Cache_Memory_Bandwidth"].Memory_Bound = o["Memory_Bound"] o["Cache_Memory_Bandwidth"].Lock_Latency = o["Lock_Latency"] o["Cache_Memory_Bandwidth"].MEM_Latency = o["MEM_Latency"] o["Cache_Memory_Bandwidth"].Store_Bound = o["Store_Bound"] - o["Cache_Memory_Bandwidth"].L3_Hit_Latency = o["L3_Hit_Latency"] o["Cache_Memory_Bandwidth"].Split_Loads = o["Split_Loads"] + o["Cache_Memory_Bandwidth"].L3_Hit_Latency = o["L3_Hit_Latency"] + o["Cache_Memory_Bandwidth"].DTLB_Load = o["DTLB_Load"] o["Cache_Memory_Bandwidth"].L3_Bound = o["L3_Bound"] o["Cache_Memory_Bandwidth"].FB_Full = o["FB_Full"] o["Cache_Memory_Bandwidth"].Contested_Accesses = o["Contested_Accesses"] o["Cache_Memory_Bandwidth"].DRAM_Bound = o["DRAM_Bound"] o["Cache_Memory_Latency"].L1_Bound = o["L1_Bound"] - o["Cache_Memory_Latency"].SQ_Full = o["SQ_Full"] - o["Cache_Memory_Latency"].MEM_Bandwidth = o["MEM_Bandwidth"] - o["Cache_Memory_Latency"].Store_Latency = o["Store_Latency"] - o["Cache_Memory_Latency"].Split_Stores = o["Split_Stores"] + o["Cache_Memory_Latency"].DTLB_Load = o["DTLB_Load"] o["Cache_Memory_Latency"].False_Sharing = o["False_Sharing"] o["Cache_Memory_Latency"].Data_Sharing = o["Data_Sharing"] o["Cache_Memory_Latency"].L2_Bound = o["L2_Bound"] o["Cache_Memory_Latency"].Memory_Bound = o["Memory_Bound"] o["Cache_Memory_Latency"].DTLB_Store = o["DTLB_Store"] - o["Cache_Memory_Latency"].MEM_Latency = o["MEM_Latency"] + o["Cache_Memory_Latency"].SQ_Full = o["SQ_Full"] o["Cache_Memory_Latency"].Store_Bound = o["Store_Bound"] - o["Cache_Memory_Latency"].Store_STLB_Miss = o["Store_STLB_Miss"] - o["Cache_Memory_Latency"].L3_Hit_Latency = o["L3_Hit_Latency"] + o["Cache_Memory_Latency"].Split_Loads = o["Split_Loads"] o["Cache_Memory_Latency"].L3_Bound = o["L3_Bound"] + o["Cache_Memory_Latency"].FB_Full = o["FB_Full"] o["Cache_Memory_Latency"].Streaming_Stores = o["Streaming_Stores"] o["Cache_Memory_Latency"].Contested_Accesses = o["Contested_Accesses"] + o["Cache_Memory_Latency"].Store_Fwd_Blk = o["Store_Fwd_Blk"] + o["Cache_Memory_Latency"].L1_Hit_Latency = o["L1_Hit_Latency"] + o["Cache_Memory_Latency"].MEM_Bandwidth = o["MEM_Bandwidth"] + o["Cache_Memory_Latency"].Store_Latency = o["Store_Latency"] + o["Cache_Memory_Latency"].Split_Stores = o["Split_Stores"] + o["Cache_Memory_Latency"].Lock_Latency = o["Lock_Latency"] + o["Cache_Memory_Latency"].MEM_Latency = o["MEM_Latency"] + o["Cache_Memory_Latency"].Store_STLB_Miss = o["Store_STLB_Miss"] + o["Cache_Memory_Latency"].Load_STLB_Miss = o["Load_STLB_Miss"] + o["Cache_Memory_Latency"].L3_Hit_Latency = o["L3_Hit_Latency"] o["Cache_Memory_Latency"].DRAM_Bound = o["DRAM_Bound"] o["Memory_Data_TLBs"].L1_Bound = o["L1_Bound"] o["Memory_Data_TLBs"].Store_Fwd_Blk = o["Store_Fwd_Blk"] - o["Memory_Data_TLBs"].DTLB_Store = o["DTLB_Store"] + o["Memory_Data_TLBs"].L1_Hit_Latency = o["L1_Hit_Latency"] o["Memory_Data_TLBs"].DTLB_Load = o["DTLB_Load"] o["Memory_Data_TLBs"].Store_Latency = o["Store_Latency"] o["Memory_Data_TLBs"].Load_STLB_Miss = o["Load_STLB_Miss"] o["Memory_Data_TLBs"].False_Sharing = o["False_Sharing"] - o["Memory_Data_TLBs"].Streaming_Stores = o["Streaming_Stores"] + o["Memory_Data_TLBs"].DTLB_Store = o["DTLB_Store"] o["Memory_Data_TLBs"].L2_Bound = o["L2_Bound"] o["Memory_Data_TLBs"].Memory_Bound = o["Memory_Bound"] o["Memory_Data_TLBs"].Lock_Latency = o["Lock_Latency"] @@ -5979,6 +6141,7 @@ def __init__(self, r): o["Memory_Data_TLBs"].Split_Loads = o["Split_Loads"] o["Memory_Data_TLBs"].L3_Bound = o["L3_Bound"] o["Memory_Data_TLBs"].FB_Full = o["FB_Full"] + o["Memory_Data_TLBs"].Streaming_Stores = o["Streaming_Stores"] o["Memory_Data_TLBs"].DRAM_Bound = o["DRAM_Bound"] o["Memory_Synchronization"].L1_Bound = o["L1_Bound"] o["Memory_Synchronization"].Frontend_Bound = o["Frontend_Bound"] @@ -6055,7 +6218,7 @@ def __init__(self, r): o["Other_Bottlenecks"].Divider = o["Divider"] o["Other_Bottlenecks"].L3_Bound = o["L3_Bound"] o["Other_Bottlenecks"].Ports_Utilized_3m = o["Ports_Utilized_3m"] - o["Other_Bottlenecks"].Machine_Clears = o["Machine_Clears"] + o["Other_Bottlenecks"].L1_Hit_Latency = o["L1_Hit_Latency"] o["Other_Bottlenecks"].FB_Full = o["FB_Full"] o["Other_Bottlenecks"].MEM_Bandwidth = o["MEM_Bandwidth"] o["Other_Bottlenecks"].Store_Latency = o["Store_Latency"] @@ -6077,6 +6240,7 @@ def __init__(self, r): o["Other_Bottlenecks"].ICache_Misses = o["ICache_Misses"] o["Other_Bottlenecks"].Microcode_Sequencer = o["Microcode_Sequencer"] o["Other_Bottlenecks"].Frontend_Bound = o["Frontend_Bound"] + o["Other_Bottlenecks"].Machine_Clears = o["Machine_Clears"] o["Other_Bottlenecks"].Streaming_Stores = o["Streaming_Stores"] o["Other_Bottlenecks"].Memory_Bound = o["Memory_Bound"] o["Other_Bottlenecks"].SQ_Full = o["SQ_Full"] @@ -6099,19 +6263,19 @@ def __init__(self, r): o["Other_Bottlenecks"].Other_Nukes = o["Other_Nukes"] o["Other_Bottlenecks"].Unknown_Branches = o["Unknown_Branches"] o["Other_Bottlenecks"].DRAM_Bound = o["DRAM_Bound"] - o["Base_Non_Br"].Retiring = o["Retiring"] - o["Base_Non_Br"].Heavy_Operations = o["Heavy_Operations"] - o["Base_Non_Br"].Microcode_Sequencer = o["Microcode_Sequencer"] - o["Base_Non_Br"].Few_Uops_Instructions = o["Few_Uops_Instructions"] - o["Base_Non_Br"].Assists = o["Assists"] - o["UopPI"].Retiring = o["Retiring"] - o["UpTB"].Retiring = o["Retiring"] + o["Useful_Work"].Retiring = o["Retiring"] + o["Useful_Work"].Heavy_Operations = o["Heavy_Operations"] + o["Useful_Work"].Microcode_Sequencer = o["Microcode_Sequencer"] + o["Useful_Work"].Few_Uops_Instructions = o["Few_Uops_Instructions"] + o["Useful_Work"].Assists = o["Assists"] o["Core_Bound_Likely"].Memory_Bound = o["Memory_Bound"] o["Core_Bound_Likely"].Ports_Utilized_0 = o["Ports_Utilized_0"] o["Core_Bound_Likely"].Core_Bound = o["Core_Bound"] o["Core_Bound_Likely"].Ports_Utilization = o["Ports_Utilization"] o["Core_Bound_Likely"].Retiring = o["Retiring"] o["Core_Bound_Likely"].Backend_Bound = o["Backend_Bound"] + o["UopPI"].Retiring = o["Retiring"] + o["UpTB"].Retiring = o["Retiring"] o["Retire"].Retiring = o["Retiring"] o["DSB_Misses"].LSD = o["LSD"] o["DSB_Misses"].MITE = o["MITE"] @@ -6126,6 +6290,12 @@ def __init__(self, r): o["DSB_Misses"].DSB = o["DSB"] o["DSB_Misses"].Unknown_Branches = o["Unknown_Branches"] o["DSB_Misses"].Fetch_Latency = o["Fetch_Latency"] + o["DSB_Bandwidth"].LSD = o["LSD"] + o["DSB_Bandwidth"].Fetch_Bandwidth = o["Fetch_Bandwidth"] + o["DSB_Bandwidth"].Frontend_Bound = o["Frontend_Bound"] + o["DSB_Bandwidth"].MITE = o["MITE"] + o["DSB_Bandwidth"].DSB = o["DSB"] + o["DSB_Bandwidth"].Fetch_Latency = o["Fetch_Latency"] o["IC_Misses"].Fetch_Latency = o["Fetch_Latency"] o["IC_Misses"].LCP = o["LCP"] o["IC_Misses"].MS_Switches = o["MS_Switches"] @@ -6202,5 +6372,6 @@ def __init__(self, r): o["IpTB"].sibling = (o["LCP"], o["DSB_Switches"], o["Fetch_Bandwidth"],) o["DSB_Coverage"].sibling = (o["LCP"], o["DSB_Switches"], o["Fetch_Bandwidth"],) o["DSB_Misses"].sibling = (o["LCP"], o["DSB_Switches"], o["Fetch_Bandwidth"],) + o["DSB_Bandwidth"].sibling = (o["LCP"], o["DSB_Switches"], o["Fetch_Bandwidth"],) o["Branch_Misprediction_Cost"].sibling = (o["Mispredicts_Resteers"], o["Branch_Mispredicts"],) o["DRAM_BW_Use"].sibling = (o["FB_Full"], o["SQ_Full"], o["MEM_Bandwidth"],)