首页 文章

Haswell微体系结构在perf中没有Stalled-cycles-backend

提问于
浏览
5

我在Haswell CPU(Intel Core i7-4790)上安装了perf,但"perf list"的输出中既没有"stalled-cycles-frontend",也没有"stalled-cycles-backend"。我查阅了Intel软件开发者手册(http://www.intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html ),但在表19-7(第4代英特尔酷睿处理器的处理器内核中的非架构性能事件)中没有找到与后端停顿周期(stalled-cycles-backend)相关的性能事件。

所以我的问题是:如何使用perf或其他工具,在Haswell CPU上测量后端停顿周期(stalled-cycles-backend)?内核版本是3.19,perf版本也是3.19。

谢谢

1 回答

  • 2

    是的,对于Ivy Bridge或Haswell这样较新的处理器,内核 perf_events 子系统中没有"stalled-cycles-frontend"和"stalled-cycles-backend"这两个合成事件的映射;旧的Core 2同样没有映射。可能是因为这个名称/概念已不适用于现代乱序(out-of-order)CPU经过改动且十分复杂的微体系结构——其中并不存在简单的全局"Stall"标量度量。

    相关代码位于 arch/x86/events/intel/core.c,合成事件的名称是 PERF_COUNT_HW_STALLED_CYCLES_FRONTEND 和 PERF_COUNT_HW_STALLED_CYCLES_BACKEND:

    __init int intel_pmu_init(void)
    {...
    

    从Nehalem开始,包括Westmere和Sandy Bridge,这两个事件都有定义:

    case INTEL_FAM6_NEHALEM:
        case INTEL_FAM6_NEHALEM_EP:
        case INTEL_FAM6_NEHALEM_EX:
    
            /* UOPS_ISSUED.STALLED_CYCLES */
            intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
                X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
            /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */
            intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
                X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1);
    
        case INTEL_FAM6_WESTMERE:
        case INTEL_FAM6_WESTMERE_EP:
        case INTEL_FAM6_WESTMERE_EX:
    
            /* UOPS_ISSUED.STALLED_CYCLES */
            intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
                X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
            /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */
            intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
                X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1);
    
    
        case INTEL_FAM6_SANDYBRIDGE:
        case INTEL_FAM6_SANDYBRIDGE_X:
    
    
            /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */
            intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
                X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
            /* UOPS_DISPATCHED.THREAD,c=1,i=1 to count stall cycles*/
            intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =
                X86_CONFIG(.event=0xb1, .umask=0x01, .inv=1, .cmask=1);
    

    Ivy Bridge只定义了前端停顿(frontend stall)事件:

    case INTEL_FAM6_IVYBRIDGE:
        case INTEL_FAM6_IVYBRIDGE_X:
    
            /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */
            intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
                X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
    

    对于较新的桌面CPU(Haswell、Broadwell、Skylake、Kaby Lake)以及Xeon Phi(KNL、KNM),前端和后端停顿事件都没有映射:

    case INTEL_FAM6_HASWELL_CORE:
        case INTEL_FAM6_HASWELL_X:
        case INTEL_FAM6_HASWELL_ULT:
        case INTEL_FAM6_HASWELL_GT3E:
    
        case INTEL_FAM6_BROADWELL_CORE:
        case INTEL_FAM6_BROADWELL_XEON_D:
        case INTEL_FAM6_BROADWELL_GT3E:
        case INTEL_FAM6_BROADWELL_X:
    
    
        case INTEL_FAM6_XEON_PHI_KNL:
        case INTEL_FAM6_XEON_PHI_KNM:
    
    
        case INTEL_FAM6_SKYLAKE_MOBILE:
        case INTEL_FAM6_SKYLAKE_DESKTOP:
        case INTEL_FAM6_SKYLAKE_X:
        case INTEL_FAM6_KABYLAKE_MOBILE:
        case INTEL_FAM6_KABYLAKE_DESKTOP:
    

    旧的Core 2同样没有定义这两个事件(Atom系列未检查):

    http://elixir.free-electrons.com/linux/v4.11/source/arch/x86/events/intel/core.c#L27

    /*
     * Table indexed by generic PERF_COUNT_HW_* event id, holding the raw
     * event encoding used for each generic event.
     *
     * There are no initializers here for
     * PERF_COUNT_HW_STALLED_CYCLES_FRONTEND or
     * PERF_COUNT_HW_STALLED_CYCLES_BACKEND, so those slots keep their
     * implicit zero value unless other code (e.g. the per-model cases in
     * intel_pmu_init()) assigns them afterwards.
     *
     * NOTE(review): the 16-bit values look like (umask << 8) | event-select;
     * confirm against the Intel SDM before relying on that layout.
     */
    static u64 intel_perfmon_event_map[PERF_COUNT_HW_MAX] __read_mostly =
    {
        [PERF_COUNT_HW_CPU_CYCLES]      = 0x003c,
        [PERF_COUNT_HW_INSTRUCTIONS]        = 0x00c0,
        [PERF_COUNT_HW_CACHE_REFERENCES]    = 0x4f2e,
        [PERF_COUNT_HW_CACHE_MISSES]        = 0x412e,
        [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
        [PERF_COUNT_HW_BRANCH_MISSES]       = 0x00c5,
        [PERF_COUNT_HW_BUS_CYCLES]      = 0x013c,
        [PERF_COUNT_HW_REF_CPU_CYCLES]      = 0x0300, /* pseudo-encoding */
    };
    

相关问题