Mailing List Archive

[PATCH v2] perf/x86/ibs: Set mem_lvl_num, mem_remote and mem_hops for data_src
From: Namhyung Kim <namhyung@kernel.org>

Kernel IBS driver wasn't using new PERF_MEM_* APIs due to few of its
limitations. Mainly:

- mem_lvl_num doesn't allow setting multiple sources whereas old API
allows it. Setting multiple data sources is useful because IBS on
pre-zen4 uarch doesn't provide fine granular DataSrc details (there
is only one such DataSrc(2h) though).
- perf mem sorting logic (sort__lvl_cmp()) ignores mem_lvl_num. perf
c2c (c2c_decode_stats()) does not use mem_lvl_num at all. perf mem
prints mem_lvl and mem_lvl_num both if both are set, which is ugly.

1st one can be handled by using ANY_CACHE with HOPS_0. 2nd one is
purely perf tool specific issue and should be fixed separately.

Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Ravi Bangoria <ravi.bangoria@amd.com>
---
v1: https://lore.kernel.org/r/20230311000642.1270971-1-namhyung@kernel.org
v1->v2:
- In addition to setting new API fields, convert all individual field
assignments to compile time wrapper macros built using PERF_MEM_S().
Also convert DataSrc conditional code to array lookups.

arch/x86/events/amd/ibs.c | 155 +++++++++++++++++---------------------
1 file changed, 68 insertions(+), 87 deletions(-)

diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c
index 64582954b5f6..b46e0b725fe5 100644
--- a/arch/x86/events/amd/ibs.c
+++ b/arch/x86/events/amd/ibs.c
@@ -703,9 +703,41 @@ static u8 perf_ibs_data_src(union ibs_op_data2 *op_data2)
return op_data2->data_src_lo;
}

-static void perf_ibs_get_mem_lvl(union ibs_op_data2 *op_data2,
- union ibs_op_data3 *op_data3,
- struct perf_sample_data *data)
+#define L(x) (PERF_MEM_S(LVL, x) | PERF_MEM_S(LVL, HIT))
+#define LN(x) PERF_MEM_S(LVLNUM, x)
+#define REM PERF_MEM_S(REMOTE, REMOTE)
+#define HOPS(x) PERF_MEM_S(HOPS, x)
+
+static u64 g_data_src[8] = {
+ [IBS_DATA_SRC_LOC_CACHE] = L(L3) | L(REM_CCE1) | LN(ANY_CACHE) | HOPS(0),
+ [IBS_DATA_SRC_DRAM] = L(LOC_RAM) | LN(RAM),
+ [IBS_DATA_SRC_REM_CACHE] = L(REM_CCE2) | LN(ANY_CACHE) | REM | HOPS(1),
+ [IBS_DATA_SRC_IO] = L(IO) | LN(IO),
+};
+
+#define RMT_NODE_BITS ((1 << IBS_DATA_SRC_DRAM) | \
+ (1 << IBS_DATA_SRC_IO))
+#define RMT_NODE_APPLICABLE(x) (RMT_NODE_BITS & (1 << x))
+
+static u64 g_zen4_data_src[32] = {
+ [IBS_DATA_SRC_EXT_LOC_CACHE] = L(L3) | LN(L3),
+ [IBS_DATA_SRC_EXT_NEAR_CCX_CACHE] = L(REM_CCE1) | LN(ANY_CACHE) | REM | HOPS(0),
+ [IBS_DATA_SRC_EXT_DRAM] = L(LOC_RAM) | LN(RAM),
+ [IBS_DATA_SRC_EXT_FAR_CCX_CACHE] = L(REM_CCE2) | LN(ANY_CACHE) | REM | HOPS(1),
+ [IBS_DATA_SRC_EXT_PMEM] = LN(PMEM),
+ [IBS_DATA_SRC_EXT_IO] = L(IO) | LN(IO),
+ [IBS_DATA_SRC_EXT_EXT_MEM] = LN(CXL),
+};
+
+#define ZEN4_RMT_NODE_BITS ((1 << IBS_DATA_SRC_EXT_DRAM) | \
+ (1 << IBS_DATA_SRC_EXT_PMEM) | \
+ (1 << IBS_DATA_SRC_EXT_IO) |\
+ (1 << IBS_DATA_SRC_EXT_EXT_MEM))
+#define ZEN4_RMT_NODE_APPLICABLE(x) (RMT_NODE_BITS & (1 << x))
+
+static __u64 perf_ibs_get_mem_lvl(union ibs_op_data2 *op_data2,
+ union ibs_op_data3 *op_data3,
+ struct perf_sample_data *data)
{
union perf_mem_data_src *data_src = &data->data_src;
u8 ibs_data_src = perf_ibs_data_src(op_data2);
@@ -716,25 +748,19 @@ static void perf_ibs_get_mem_lvl(union ibs_op_data2 *op_data2,
* DcMiss, L2Miss, DataSrc, DcMissLat etc. are all invalid for Uncached
* memory accesses. So, check DcUcMemAcc bit early.
*/
- if (op_data3->dc_uc_mem_acc && ibs_data_src != IBS_DATA_SRC_EXT_IO) {
- data_src->mem_lvl = PERF_MEM_LVL_UNC | PERF_MEM_LVL_HIT;
- return;
- }
+ if (op_data3->dc_uc_mem_acc && ibs_data_src != IBS_DATA_SRC_EXT_IO)
+ return L(UNC);

/* L1 Hit */
- if (op_data3->dc_miss == 0) {
- data_src->mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT;
- return;
- }
+ if (op_data3->dc_miss == 0)
+ return L(L1) | LN(L1);

/* L2 Hit */
if (op_data3->l2_miss == 0) {
/* Erratum #1293 */
if (boot_cpu_data.x86 != 0x19 || boot_cpu_data.x86_model > 0xF ||
- !(op_data3->sw_pf || op_data3->dc_miss_no_mab_alloc)) {
- data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT;
- return;
- }
+ !(op_data3->sw_pf || op_data3->dc_miss_no_mab_alloc))
+ return L(L2) | LN(L2);
}

/*
@@ -744,82 +770,36 @@ static void perf_ibs_get_mem_lvl(union ibs_op_data2 *op_data2,
if (data_src->mem_op != PERF_MEM_OP_LOAD)
goto check_mab;

- /* L3 Hit */
if (ibs_caps & IBS_CAPS_ZEN4) {
- if (ibs_data_src == IBS_DATA_SRC_EXT_LOC_CACHE) {
- data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT;
- return;
- }
- } else {
- if (ibs_data_src == IBS_DATA_SRC_LOC_CACHE) {
- data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_REM_CCE1 |
- PERF_MEM_LVL_HIT;
- return;
- }
- }
+ u64 val = g_zen4_data_src[ibs_data_src];

- /* A peer cache in a near CCX */
- if (ibs_caps & IBS_CAPS_ZEN4 &&
- ibs_data_src == IBS_DATA_SRC_EXT_NEAR_CCX_CACHE) {
- data_src->mem_lvl = PERF_MEM_LVL_REM_CCE1 | PERF_MEM_LVL_HIT;
- return;
- }
+ if (!val)
+ goto check_mab;

- /* A peer cache in a far CCX */
- if (ibs_caps & IBS_CAPS_ZEN4) {
- if (ibs_data_src == IBS_DATA_SRC_EXT_FAR_CCX_CACHE) {
- data_src->mem_lvl = PERF_MEM_LVL_REM_CCE2 | PERF_MEM_LVL_HIT;
- return;
+ /* HOPS_1 because IBS doesn't provide remote socket detail */
+ if (op_data2->rmt_node && ZEN4_RMT_NODE_APPLICABLE(ibs_data_src)) {
+ if (ibs_data_src == IBS_DATA_SRC_EXT_DRAM)
+ val = L(REM_RAM1) | LN(RAM) | REM | HOPS(1);
+ else
+ val |= REM | HOPS(1);
}
- } else {
- if (ibs_data_src == IBS_DATA_SRC_REM_CACHE) {
- data_src->mem_lvl = PERF_MEM_LVL_REM_CCE2 | PERF_MEM_LVL_HIT;
- return;
- }
- }

- /* DRAM */
- if (ibs_data_src == IBS_DATA_SRC_EXT_DRAM) {
- if (op_data2->rmt_node == 0)
- data_src->mem_lvl = PERF_MEM_LVL_LOC_RAM | PERF_MEM_LVL_HIT;
- else
- data_src->mem_lvl = PERF_MEM_LVL_REM_RAM1 | PERF_MEM_LVL_HIT;
- return;
- }
+ return val;
+ } else {
+ u64 val = g_data_src[ibs_data_src];

- /* PMEM */
- if (ibs_caps & IBS_CAPS_ZEN4 && ibs_data_src == IBS_DATA_SRC_EXT_PMEM) {
- data_src->mem_lvl_num = PERF_MEM_LVLNUM_PMEM;
- if (op_data2->rmt_node) {
- data_src->mem_remote = PERF_MEM_REMOTE_REMOTE;
- /* IBS doesn't provide Remote socket detail */
- data_src->mem_hops = PERF_MEM_HOPS_1;
- }
- return;
- }
+ if (!val)
+ goto check_mab;

- /* Extension Memory */
- if (ibs_caps & IBS_CAPS_ZEN4 &&
- ibs_data_src == IBS_DATA_SRC_EXT_EXT_MEM) {
- data_src->mem_lvl_num = PERF_MEM_LVLNUM_CXL;
- if (op_data2->rmt_node) {
- data_src->mem_remote = PERF_MEM_REMOTE_REMOTE;
- /* IBS doesn't provide Remote socket detail */
- data_src->mem_hops = PERF_MEM_HOPS_1;
+ /* HOPS_1 because IBS doesn't provide remote socket detail */
+ if (op_data2->rmt_node && RMT_NODE_APPLICABLE(ibs_data_src)) {
+ if (ibs_data_src == IBS_DATA_SRC_DRAM)
+ val = L(REM_RAM1) | LN(RAM) | REM | HOPS(1);
+ else
+ val |= REM | HOPS(1);
}
- return;
- }

- /* IO */
- if (ibs_data_src == IBS_DATA_SRC_EXT_IO) {
- data_src->mem_lvl = PERF_MEM_LVL_IO;
- data_src->mem_lvl_num = PERF_MEM_LVLNUM_IO;
- if (op_data2->rmt_node) {
- data_src->mem_remote = PERF_MEM_REMOTE_REMOTE;
- /* IBS doesn't provide Remote socket detail */
- data_src->mem_hops = PERF_MEM_HOPS_1;
- }
- return;
+ return val;
}

check_mab:
@@ -830,12 +810,11 @@ static void perf_ibs_get_mem_lvl(union ibs_op_data2 *op_data2,
* DataSrc simultaneously. Prioritize DataSrc over MAB, i.e. set
* MAB only when IBS fails to provide DataSrc.
*/
- if (op_data3->dc_miss_no_mab_alloc) {
- data_src->mem_lvl = PERF_MEM_LVL_LFB | PERF_MEM_LVL_HIT;
- return;
- }
+ if (op_data3->dc_miss_no_mab_alloc)
+ return L(LFB) | LN(LFB);

data_src->mem_lvl = PERF_MEM_LVL_NA;
+ return 0;
}

static bool perf_ibs_cache_hit_st_valid(void)
@@ -925,7 +904,9 @@ static void perf_ibs_get_data_src(struct perf_ibs_data *ibs_data,
union ibs_op_data2 *op_data2,
union ibs_op_data3 *op_data3)
{
- perf_ibs_get_mem_lvl(op_data2, op_data3, data);
+ union perf_mem_data_src *data_src = &data->data_src;
+
+ data_src->val |= perf_ibs_get_mem_lvl(op_data2, op_data3, data);
perf_ibs_get_mem_snoop(op_data2, data);
perf_ibs_get_tlb_lvl(op_data3, data);
perf_ibs_get_mem_lock(op_data3, data);
--
2.39.2
Re: [PATCH v2] perf/x86/ibs: Set mem_lvl_num, mem_remote and mem_hops for data_src [ In reply to ]
Hi Ravi,

On Mon, Mar 27, 2023 at 6:09?AM Ravi Bangoria <ravi.bangoria@amd.com> wrote:
>
> From: Namhyung Kim <namhyung@kernel.org>
>
> Kernel IBS driver wasn't using new PERF_MEM_* APIs due to few of its
> limitations. Mainly:
>
> - mem_lvl_num doesn't allow setting multiple sources whereas old API
> allows it. Setting multiple data sources is useful because IBS on
> pre-zen4 uarch doesn't provide fine granular DataSrc details (there
> is only one such DataSrc(2h) though).
> - perf mem sorting logic (sort__lvl_cmp()) ignores mem_lvl_num. perf
> c2c (c2c_decode_stats()) does not use mem_lvl_num at all. perf mem
> prints mem_lvl and mem_lvl_num both if both are set, which is ugly.
>
> 1st one can be handled by using ANY_CACHE with HOPS_0. 2nd one is
> purely perf tool specific issue and should be fixed separately.
>
> Signed-off-by: Namhyung Kim <namhyung@kernel.org>
> Signed-off-by: Ravi Bangoria <ravi.bangoria@amd.com>
> ---
> v1: https://lore.kernel.org/r/20230311000642.1270971-1-namhyung@kernel.org
> v1->v2:
> - In addition to setting new API fields, convert all individual field
> assignments to compile time wrapper macros built using PERF_MEM_S().
> Also convert DataSrc conditional code to array lookups.
>
> arch/x86/events/amd/ibs.c | 155 +++++++++++++++++---------------------
> 1 file changed, 68 insertions(+), 87 deletions(-)
>
> diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c
> index 64582954b5f6..b46e0b725fe5 100644
> --- a/arch/x86/events/amd/ibs.c
> +++ b/arch/x86/events/amd/ibs.c
> @@ -703,9 +703,41 @@ static u8 perf_ibs_data_src(union ibs_op_data2 *op_data2)
> return op_data2->data_src_lo;
> }
>
> -static void perf_ibs_get_mem_lvl(union ibs_op_data2 *op_data2,
> - union ibs_op_data3 *op_data3,
> - struct perf_sample_data *data)
> +#define L(x) (PERF_MEM_S(LVL, x) | PERF_MEM_S(LVL, HIT))
> +#define LN(x) PERF_MEM_S(LVLNUM, x)
> +#define REM PERF_MEM_S(REMOTE, REMOTE)
> +#define HOPS(x) PERF_MEM_S(HOPS, x)
> +
> +static u64 g_data_src[8] = {
> + [IBS_DATA_SRC_LOC_CACHE] = L(L3) | L(REM_CCE1) | LN(ANY_CACHE) | HOPS(0),
> + [IBS_DATA_SRC_DRAM] = L(LOC_RAM) | LN(RAM),
> + [IBS_DATA_SRC_REM_CACHE] = L(REM_CCE2) | LN(ANY_CACHE) | REM | HOPS(1),
> + [IBS_DATA_SRC_IO] = L(IO) | LN(IO),
> +};
> +
> +#define RMT_NODE_BITS ((1 << IBS_DATA_SRC_DRAM) | \
> + (1 << IBS_DATA_SRC_IO))
> +#define RMT_NODE_APPLICABLE(x) (RMT_NODE_BITS & (1 << x))
> +
> +static u64 g_zen4_data_src[32] = {
> + [IBS_DATA_SRC_EXT_LOC_CACHE] = L(L3) | LN(L3),
> + [IBS_DATA_SRC_EXT_NEAR_CCX_CACHE] = L(REM_CCE1) | LN(ANY_CACHE) | REM | HOPS(0),
> + [IBS_DATA_SRC_EXT_DRAM] = L(LOC_RAM) | LN(RAM),
> + [IBS_DATA_SRC_EXT_FAR_CCX_CACHE] = L(REM_CCE2) | LN(ANY_CACHE) | REM | HOPS(1),
> + [IBS_DATA_SRC_EXT_PMEM] = LN(PMEM),
> + [IBS_DATA_SRC_EXT_IO] = L(IO) | LN(IO),
> + [IBS_DATA_SRC_EXT_EXT_MEM] = LN(CXL),
> +};
> +
> +#define ZEN4_RMT_NODE_BITS ((1 << IBS_DATA_SRC_EXT_DRAM) | \
> + (1 << IBS_DATA_SRC_EXT_PMEM) | \
> + (1 << IBS_DATA_SRC_EXT_IO) |\
> + (1 << IBS_DATA_SRC_EXT_EXT_MEM))
> +#define ZEN4_RMT_NODE_APPLICABLE(x) (RMT_NODE_BITS & (1 << x))
> +
> +static __u64 perf_ibs_get_mem_lvl(union ibs_op_data2 *op_data2,
> + union ibs_op_data3 *op_data3,
> + struct perf_sample_data *data)
> {
> union perf_mem_data_src *data_src = &data->data_src;
> u8 ibs_data_src = perf_ibs_data_src(op_data2);
> @@ -716,25 +748,19 @@ static void perf_ibs_get_mem_lvl(union ibs_op_data2 *op_data2,
> * DcMiss, L2Miss, DataSrc, DcMissLat etc. are all invalid for Uncached
> * memory accesses. So, check DcUcMemAcc bit early.
> */
> - if (op_data3->dc_uc_mem_acc && ibs_data_src != IBS_DATA_SRC_EXT_IO) {
> - data_src->mem_lvl = PERF_MEM_LVL_UNC | PERF_MEM_LVL_HIT;
> - return;
> - }
> + if (op_data3->dc_uc_mem_acc && ibs_data_src != IBS_DATA_SRC_EXT_IO)
> + return L(UNC);

Hmm.. it seems we don't have PERF_MEM_LVLNUM_UNC.

>
> /* L1 Hit */
> - if (op_data3->dc_miss == 0) {
> - data_src->mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT;
> - return;
> - }
> + if (op_data3->dc_miss == 0)
> + return L(L1) | LN(L1);
>
> /* L2 Hit */
> if (op_data3->l2_miss == 0) {
> /* Erratum #1293 */
> if (boot_cpu_data.x86 != 0x19 || boot_cpu_data.x86_model > 0xF ||
> - !(op_data3->sw_pf || op_data3->dc_miss_no_mab_alloc)) {
> - data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT;
> - return;
> - }
> + !(op_data3->sw_pf || op_data3->dc_miss_no_mab_alloc))
> + return L(L2) | LN(L2);
> }
>
> /*
> @@ -744,82 +770,36 @@ static void perf_ibs_get_mem_lvl(union ibs_op_data2 *op_data2,
> if (data_src->mem_op != PERF_MEM_OP_LOAD)
> goto check_mab;
>
> - /* L3 Hit */
> if (ibs_caps & IBS_CAPS_ZEN4) {
> - if (ibs_data_src == IBS_DATA_SRC_EXT_LOC_CACHE) {
> - data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT;
> - return;
> - }
> - } else {
> - if (ibs_data_src == IBS_DATA_SRC_LOC_CACHE) {
> - data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_REM_CCE1 |
> - PERF_MEM_LVL_HIT;
> - return;
> - }
> - }
> + u64 val = g_zen4_data_src[ibs_data_src];
>
> - /* A peer cache in a near CCX */
> - if (ibs_caps & IBS_CAPS_ZEN4 &&
> - ibs_data_src == IBS_DATA_SRC_EXT_NEAR_CCX_CACHE) {
> - data_src->mem_lvl = PERF_MEM_LVL_REM_CCE1 | PERF_MEM_LVL_HIT;
> - return;
> - }
> + if (!val)
> + goto check_mab;
>
> - /* A peer cache in a far CCX */
> - if (ibs_caps & IBS_CAPS_ZEN4) {
> - if (ibs_data_src == IBS_DATA_SRC_EXT_FAR_CCX_CACHE) {
> - data_src->mem_lvl = PERF_MEM_LVL_REM_CCE2 | PERF_MEM_LVL_HIT;
> - return;
> + /* HOPS_1 because IBS doesn't provide remote socket detail */
> + if (op_data2->rmt_node && ZEN4_RMT_NODE_APPLICABLE(ibs_data_src)) {
> + if (ibs_data_src == IBS_DATA_SRC_EXT_DRAM)
> + val = L(REM_RAM1) | LN(RAM) | REM | HOPS(1);
> + else
> + val |= REM | HOPS(1);
> }
> - } else {
> - if (ibs_data_src == IBS_DATA_SRC_REM_CACHE) {
> - data_src->mem_lvl = PERF_MEM_LVL_REM_CCE2 | PERF_MEM_LVL_HIT;
> - return;
> - }
> - }
>
> - /* DRAM */
> - if (ibs_data_src == IBS_DATA_SRC_EXT_DRAM) {
> - if (op_data2->rmt_node == 0)
> - data_src->mem_lvl = PERF_MEM_LVL_LOC_RAM | PERF_MEM_LVL_HIT;
> - else
> - data_src->mem_lvl = PERF_MEM_LVL_REM_RAM1 | PERF_MEM_LVL_HIT;
> - return;
> - }
> + return val;
> + } else {
> + u64 val = g_data_src[ibs_data_src];
>
> - /* PMEM */
> - if (ibs_caps & IBS_CAPS_ZEN4 && ibs_data_src == IBS_DATA_SRC_EXT_PMEM) {
> - data_src->mem_lvl_num = PERF_MEM_LVLNUM_PMEM;
> - if (op_data2->rmt_node) {
> - data_src->mem_remote = PERF_MEM_REMOTE_REMOTE;
> - /* IBS doesn't provide Remote socket detail */
> - data_src->mem_hops = PERF_MEM_HOPS_1;
> - }
> - return;
> - }
> + if (!val)
> + goto check_mab;
>
> - /* Extension Memory */
> - if (ibs_caps & IBS_CAPS_ZEN4 &&
> - ibs_data_src == IBS_DATA_SRC_EXT_EXT_MEM) {
> - data_src->mem_lvl_num = PERF_MEM_LVLNUM_CXL;
> - if (op_data2->rmt_node) {
> - data_src->mem_remote = PERF_MEM_REMOTE_REMOTE;
> - /* IBS doesn't provide Remote socket detail */
> - data_src->mem_hops = PERF_MEM_HOPS_1;
> + /* HOPS_1 because IBS doesn't provide remote socket detail */
> + if (op_data2->rmt_node && RMT_NODE_APPLICABLE(ibs_data_src)) {
> + if (ibs_data_src == IBS_DATA_SRC_DRAM)
> + val = L(REM_RAM1) | LN(RAM) | REM | HOPS(1);
> + else
> + val |= REM | HOPS(1);
> }
> - return;
> - }
>
> - /* IO */
> - if (ibs_data_src == IBS_DATA_SRC_EXT_IO) {
> - data_src->mem_lvl = PERF_MEM_LVL_IO;
> - data_src->mem_lvl_num = PERF_MEM_LVLNUM_IO;
> - if (op_data2->rmt_node) {
> - data_src->mem_remote = PERF_MEM_REMOTE_REMOTE;
> - /* IBS doesn't provide Remote socket detail */
> - data_src->mem_hops = PERF_MEM_HOPS_1;
> - }
> - return;
> + return val;
> }
>
> check_mab:
> @@ -830,12 +810,11 @@ static void perf_ibs_get_mem_lvl(union ibs_op_data2 *op_data2,
> * DataSrc simultaneously. Prioritize DataSrc over MAB, i.e. set
> * MAB only when IBS fails to provide DataSrc.
> */
> - if (op_data3->dc_miss_no_mab_alloc) {
> - data_src->mem_lvl = PERF_MEM_LVL_LFB | PERF_MEM_LVL_HIT;
> - return;
> - }
> + if (op_data3->dc_miss_no_mab_alloc)
> + return L(LFB) | LN(LFB);
>
> data_src->mem_lvl = PERF_MEM_LVL_NA;
> + return 0;

Wouldn't it be 'return L(NA) | LN(NA);' ?

Thanks,
Namhyung


> }
>
> static bool perf_ibs_cache_hit_st_valid(void)
> @@ -925,7 +904,9 @@ static void perf_ibs_get_data_src(struct perf_ibs_data *ibs_data,
> union ibs_op_data2 *op_data2,
> union ibs_op_data3 *op_data3)
> {
> - perf_ibs_get_mem_lvl(op_data2, op_data3, data);
> + union perf_mem_data_src *data_src = &data->data_src;
> +
> + data_src->val |= perf_ibs_get_mem_lvl(op_data2, op_data3, data);
> perf_ibs_get_mem_snoop(op_data2, data);
> perf_ibs_get_tlb_lvl(op_data3, data);
> perf_ibs_get_mem_lock(op_data3, data);
> --
> 2.39.2
>
Re: [PATCH v2] perf/x86/ibs: Set mem_lvl_num, mem_remote and mem_hops for data_src [ In reply to ]
Hi Namhyung,

>> @@ -716,25 +748,19 @@ static void perf_ibs_get_mem_lvl(union ibs_op_data2 *op_data2,
>> * DcMiss, L2Miss, DataSrc, DcMissLat etc. are all invalid for Uncached
>> * memory accesses. So, check DcUcMemAcc bit early.
>> */
>> - if (op_data3->dc_uc_mem_acc && ibs_data_src != IBS_DATA_SRC_EXT_IO) {
>> - data_src->mem_lvl = PERF_MEM_LVL_UNC | PERF_MEM_LVL_HIT;
>> - return;
>> - }
>> + if (op_data3->dc_uc_mem_acc && ibs_data_src != IBS_DATA_SRC_EXT_IO)
>> + return L(UNC);
>
> Hmm.. it seems we don't have PERF_MEM_LVLNUM_UNC.

Right. Is it worth to introduce one?

On a side note, I came to know that IBS OpData2[RmtNode] is not applicable
when DataSrc=7 (I/O). So, I need to respin this patch with that change.

[...]

>> check_mab:
>> @@ -830,12 +810,11 @@ static void perf_ibs_get_mem_lvl(union ibs_op_data2 *op_data2,
>> * DataSrc simultaneously. Prioritize DataSrc over MAB, i.e. set
>> * MAB only when IBS fails to provide DataSrc.
>> */
>> - if (op_data3->dc_miss_no_mab_alloc) {
>> - data_src->mem_lvl = PERF_MEM_LVL_LFB | PERF_MEM_LVL_HIT;
>> - return;
>> - }
>> + if (op_data3->dc_miss_no_mab_alloc)
>> + return L(LFB) | LN(LFB);
>>
>> data_src->mem_lvl = PERF_MEM_LVL_NA;
>> + return 0;
>
> Wouldn't it be 'return L(NA) | LN(NA);' ?

IBS has no instruction type filtering, i.e. it tags whatever instruction it
sees at overflow. When IBS tags non-load/store instruction, data_src->val is
set to PERF_MEM_NA, which does not initialize mem_lvl_num (Shall we change
that?). If I set both LVL_NA and LVL_NUM_NA for load/store with no DataSrc
info, perf mem output becomes funny:

$ sudo ./perf mem report -F sample,mem --stdio
# Samples Memory access
# ............ .......................................
#
1914 N/A <====== Non-LS
905 L1 or L1 hit
19 L3 or L3 hit
16 L2 or L2 hit
6 N/A or N/A hit <====== LS with no DataSrc info
6 Local RAM or RAM hit
4 Remote node, same socket RAM hit
3 Remote core, same node Any cache hit
2 Remote node, same socket Any cache hit

Also, L(NA) is PERF_MEM_LVL_NA | PERF_MEM_LVL_HIT. If I just return L(NA),
perf tools shows it as "N/A hit".

So, until tool code gets refactored, setting mem_lvl = NA here is hiding
tool's dumbness :(. Maybe I should refactor perf_mem__snp_scnprintf() as
part of this patchset.

Thanks,
Ravi
Re: [PATCH v2] perf/x86/ibs: Set mem_lvl_num, mem_remote and mem_hops for data_src [ In reply to ]
On Wed, Mar 29, 2023 at 2:45?AM Ravi Bangoria <ravi.bangoria@amd.com> wrote:
>
> Hi Namhyung,
>
> >> @@ -716,25 +748,19 @@ static void perf_ibs_get_mem_lvl(union ibs_op_data2 *op_data2,
> >> * DcMiss, L2Miss, DataSrc, DcMissLat etc. are all invalid for Uncached
> >> * memory accesses. So, check DcUcMemAcc bit early.
> >> */
> >> - if (op_data3->dc_uc_mem_acc && ibs_data_src != IBS_DATA_SRC_EXT_IO) {
> >> - data_src->mem_lvl = PERF_MEM_LVL_UNC | PERF_MEM_LVL_HIT;
> >> - return;
> >> - }
> >> + if (op_data3->dc_uc_mem_acc && ibs_data_src != IBS_DATA_SRC_EXT_IO)
> >> + return L(UNC);
> >
> > Hmm.. it seems we don't have PERF_MEM_LVLNUM_UNC.
>
> Right. Is it worth to introduce one?

I think MEM_LVLNUM should express every memory level in MEM_LVL.

>
> On a side note, I came to know that IBS OpData2[RmtNode] is not applicable
> when DataSrc=7 (I/O). So, I need to respin this patch with that change.
>
> [...]
>
> >> check_mab:
> >> @@ -830,12 +810,11 @@ static void perf_ibs_get_mem_lvl(union ibs_op_data2 *op_data2,
> >> * DataSrc simultaneously. Prioritize DataSrc over MAB, i.e. set
> >> * MAB only when IBS fails to provide DataSrc.
> >> */
> >> - if (op_data3->dc_miss_no_mab_alloc) {
> >> - data_src->mem_lvl = PERF_MEM_LVL_LFB | PERF_MEM_LVL_HIT;
> >> - return;
> >> - }
> >> + if (op_data3->dc_miss_no_mab_alloc)
> >> + return L(LFB) | LN(LFB);
> >>
> >> data_src->mem_lvl = PERF_MEM_LVL_NA;
> >> + return 0;
> >
> > Wouldn't it be 'return L(NA) | LN(NA);' ?
>
> IBS has no instruction type filtering, i.e. it tags whatever instruction it
> sees at overflow. When IBS tags non-load/store instruction, data_src->val is
> set to PERF_MEM_NA, which does not initialize mem_lvl_num (Shall we change
> that?). If I set both LVL_NA and LVL_NUM_NA for load/store with no DataSrc
> info, perf mem output becomes funny:

Probably worth changing PERF_MEM_NA.

>
> $ sudo ./perf mem report -F sample,mem --stdio
> # Samples Memory access
> # ............ .......................................
> #
> 1914 N/A <====== Non-LS
> 905 L1 or L1 hit
> 19 L3 or L3 hit
> 16 L2 or L2 hit
> 6 N/A or N/A hit <====== LS with no DataSrc info
> 6 Local RAM or RAM hit
> 4 Remote node, same socket RAM hit
> 3 Remote core, same node Any cache hit
> 2 Remote node, same socket Any cache hit

Maybe that's better to differentiate them :)

>
> Also, L(NA) is PERF_MEM_LVL_NA | PERF_MEM_LVL_HIT. If I just return L(NA),
> perf tools shows it as "N/A hit".
>
> So, until tool code gets refactored, setting mem_lvl = NA here is hiding
> tool's dumbness :(. Maybe I should refactor perf_mem__snp_scnprintf() as
> part of this patchset.

I think we can change the tool independently - preferring MEM_LVLNUM
if present but you might want to add an option to override.
Given IBS didn't set it so far, the output would remain mostly the same.

Thanks,
Namhyung
Re: [PATCH v2] perf/x86/ibs: Set mem_lvl_num, mem_remote and mem_hops for data_src [ In reply to ]
On 30-Mar-23 5:17 AM, Namhyung Kim wrote:
> On Wed, Mar 29, 2023 at 2:45?AM Ravi Bangoria <ravi.bangoria@amd.com> wrote:
>>
>> Hi Namhyung,
>>
>>>> @@ -716,25 +748,19 @@ static void perf_ibs_get_mem_lvl(union ibs_op_data2 *op_data2,
>>>> * DcMiss, L2Miss, DataSrc, DcMissLat etc. are all invalid for Uncached
>>>> * memory accesses. So, check DcUcMemAcc bit early.
>>>> */
>>>> - if (op_data3->dc_uc_mem_acc && ibs_data_src != IBS_DATA_SRC_EXT_IO) {
>>>> - data_src->mem_lvl = PERF_MEM_LVL_UNC | PERF_MEM_LVL_HIT;
>>>> - return;
>>>> - }
>>>> + if (op_data3->dc_uc_mem_acc && ibs_data_src != IBS_DATA_SRC_EXT_IO)
>>>> + return L(UNC);
>>>
>>> Hmm.. it seems we don't have PERF_MEM_LVLNUM_UNC.
>>
>> Right. Is it worth to introduce one?
>
> I think MEM_LVLNUM should express every memory level in MEM_LVL.

Ok.

>
>>
>> On a side note, I came to know that IBS OpData2[RmtNode] is not applicable
>> when DataSrc=7 (I/O). So, I need to respin this patch with that change.
>>
>> [...]
>>
>>>> check_mab:
>>>> @@ -830,12 +810,11 @@ static void perf_ibs_get_mem_lvl(union ibs_op_data2 *op_data2,
>>>> * DataSrc simultaneously. Prioritize DataSrc over MAB, i.e. set
>>>> * MAB only when IBS fails to provide DataSrc.
>>>> */
>>>> - if (op_data3->dc_miss_no_mab_alloc) {
>>>> - data_src->mem_lvl = PERF_MEM_LVL_LFB | PERF_MEM_LVL_HIT;
>>>> - return;
>>>> - }
>>>> + if (op_data3->dc_miss_no_mab_alloc)
>>>> + return L(LFB) | LN(LFB);
>>>>
>>>> data_src->mem_lvl = PERF_MEM_LVL_NA;
>>>> + return 0;
>>>
>>> Wouldn't it be 'return L(NA) | LN(NA);' ?
>>
>> IBS has no instruction type filtering, i.e. it tags whatever instruction it
>> sees at overflow. When IBS tags non-load/store instruction, data_src->val is
>> set to PERF_MEM_NA, which does not initialize mem_lvl_num (Shall we change
>> that?). If I set both LVL_NA and LVL_NUM_NA for load/store with no DataSrc
>> info, perf mem output becomes funny:
>
> Probably worth changing PERF_MEM_NA.

Yeah seems so.

>
>>
>> $ sudo ./perf mem report -F sample,mem --stdio
>> # Samples Memory access
>> # ............ .......................................
>> #
>> 1914 N/A <====== Non-LS
>> 905 L1 or L1 hit
>> 19 L3 or L3 hit
>> 16 L2 or L2 hit
>> 6 N/A or N/A hit <====== LS with no DataSrc info
>> 6 Local RAM or RAM hit
>> 4 Remote node, same socket RAM hit
>> 3 Remote core, same node Any cache hit
>> 2 Remote node, same socket Any cache hit
>
> Maybe that's better to differentiate them :)

:). I would rather add instruction type column and print non-LS, L, S or LS.

>
>>
>> Also, L(NA) is PERF_MEM_LVL_NA | PERF_MEM_LVL_HIT. If I just return L(NA),
>> perf tools shows it as "N/A hit".
>>
>> So, until tool code gets refactored, setting mem_lvl = NA here is hiding
>> tool's dumbness :(. Maybe I should refactor perf_mem__snp_scnprintf() as
>> part of this patchset.
>
> I think we can change the tool independently - preferring MEM_LVLNUM
> if present but you might want to add an option to override.
> Given IBS didn't set it so far, the output would remain mostly the same.

Sure.

Thanks,
Ravi