首页 文章

为什么JVM不在Windows x86上发出预取指令

提问于
浏览
8

正如标题所述,为什么 OpenJDK JVM 不会在 Windows x86 上发出预取指令?参见 OpenJDK Mercurial: http://hg.openjdk.java.net/jdk8u/jdk8u/hotspot/file/c49dcaf78a65/src/os_cpu/windows_x86/vm/prefetch_windows_x86.inline.hpp

// Windows x86 variant: both bodies are empty, so calling either function
// emits no prefetch instruction at all.
inline void Prefetch::read (void *loc, intx interval) {}
inline void Prefetch::write(void *loc, intx interval) {}

代码中没有注释,除了源代码之外我也没有找到其他资料。我之所以这样问,是因为在 Linux x86 上 JVM 是会发出预取指令的,请参阅 http://hg.openjdk.java.net/jdk8u/jdk8u/hotspot/file/c49dcaf78a65/src/os_cpu/linux_x86/vm/prefetch_linux_x86.inline.hpp

// Linux x86 variant of Prefetch::read.
// On AMD64 builds this issues one `prefetcht0` for the address loc + interval
// (AT&T operand (%0,%1,1): base = loc, index = interval, scale = 1).
// When AMD64 is not defined the body is empty and nothing is emitted.
inline void Prefetch::read (void *loc, intx interval) {
#ifdef AMD64
  __asm__ ("prefetcht0 (%0,%1,1)" : : "r" (loc), "r" (interval));
#endif // AMD64
}

// Linux x86 variant of Prefetch::write.
// On AMD64 builds this issues the same `prefetcht0` on loc + interval as the
// read variant; the write-intent `prefetchw` form is kept only as a
// commented-out line. When AMD64 is not defined the body is empty.
inline void Prefetch::write(void *loc, intx interval) {
#ifdef AMD64

  // Do not use the 3dnow prefetchw instruction.  It isn't supported on em64t.
  //  __asm__ ("prefetchw (%0,%1,1)" : : "r" (loc), "r" (interval));
  __asm__ ("prefetcht0 (%0,%1,1)" : : "r" (loc), "r" (interval));

#endif // AMD64
}

2 回答

  • 6

    你引用的文件中都含有 asm 代码片段(内联汇编),这些片段由 C/C++ 代码在自身内部使用(正如 JVM 专家 apangin 指出的,主要是 GC 代码)。实际上确实存在差异:x86_64 HotSpot 的 Linux、Solaris 和 BSD 变体都实现了预取,而 Windows 变体中预取被禁用/未实现。这一点有些奇怪,原因也难以解释;它还可能使 JVM 在 Windows 上略慢一些(几个百分点;在没有硬件预取的平台上会更多),不过这也并不能帮助 Sun/Oracle 卖出更多 Solaris 付费支持合同。Ross 还猜测 MS C++ 编译器可能不支持这种内联 asm 语法,但 _mm_prefetch 应该是支持的(谁来提交一个 JDK bug,把它加到该文件中?)。

    HotSpot JVM 带有 JIT,JIT 编译出的代码是由 JIT 以字节形式发出(生成)的(虽然 JIT 也可以把自身函数中的代码复制到生成的代码里,或者发出对支持函数的调用,但在 HotSpot 中预取指令是以字节形式发出的)。我们怎样才能知道它是如何发出的?一个简单的在线办法是找一份可在线搜索的 jdk8u 副本(更好的是带交叉引用的,例如 metager),比如 GitHub 上的 https://github.com/JetBrains/jdk8u_hotspot,然后搜索 prefetch、prefetch emit、prefetchr、lir_prefetchr。有一些相关的结果:

    JVM 的 C1 编译器 / LIR 实际发出的字节,位于 jdk8u_hotspot/src/cpu/x86/vm/assembler_x86.cpp:

    // Shared lead-in used by the prefetch emitters below: emits whatever
    // prefix(src) produces for the address operand, then the byte 0x0F.
    void Assembler::prefetch_prefix(Address src) {
      prefix(src);
      emit_int8(0x0F);
    }
    
    // Emits PREFETCHNTA for `src`: prefetch_prefix(), byte 0x18, then the
    // operand with rax standing in for the value 0 (see trailing comment).
    // NOTE(review): that value is presumably the ModRM /digit selecting the
    // prefetch variant; the SSE assert sits inside NOT_LP64, so it is
    // presumably checked on 32-bit builds only — confirm against the macro.
    void Assembler::prefetchnta(Address src) {
      NOT_LP64(assert(VM_Version::supports_sse(), "must support"));
      InstructionMark im(this);
      prefetch_prefix(src);
      emit_int8(0x18);
      emit_operand(rax, src); // 0, src
    }
    
    // Emits the read-prefetch form that requires 3DNow! prefetch support
    // (asserted unconditionally): prefetch_prefix(), byte 0x0D, then the
    // operand with rax standing in for the value 0 (see trailing comment).
    void Assembler::prefetchr(Address src) {
      assert(VM_Version::supports_3dnow_prefetch(), "must support");
      InstructionMark im(this);
      prefetch_prefix(src);
      emit_int8(0x0D);
      emit_operand(rax, src); // 0, src
    }
    
    // Emits PREFETCHT0 for `src`: same byte sequence as prefetchnta
    // (prefetch_prefix(), then 0x18) but with rcx, i.e. the value 1, passed to
    // emit_operand. The SSE assert is wrapped in NOT_LP64.
    void Assembler::prefetcht0(Address src) {
      NOT_LP64(assert(VM_Version::supports_sse(), "must support"));
      InstructionMark im(this);
      prefetch_prefix(src);
      emit_int8(0x18);
      emit_operand(rcx, src); // 1, src
    }
    
    // Emits PREFETCHT1 for `src`: prefetch_prefix(), byte 0x18, then the
    // operand with rdx, i.e. the value 2, passed to emit_operand.
    // The SSE assert is wrapped in NOT_LP64.
    void Assembler::prefetcht1(Address src) {
      NOT_LP64(assert(VM_Version::supports_sse(), "must support"));
      InstructionMark im(this);
      prefetch_prefix(src);
      emit_int8(0x18);
      emit_operand(rdx, src); // 2, src
    }
    
    // Emits PREFETCHT2 for `src`: prefetch_prefix(), byte 0x18, then the
    // operand with rbx, i.e. the value 3, passed to emit_operand.
    // The SSE assert is wrapped in NOT_LP64.
    void Assembler::prefetcht2(Address src) {
      NOT_LP64(assert(VM_Version::supports_sse(), "must support"));
      InstructionMark im(this);
      prefetch_prefix(src);
      emit_int8(0x18);
      emit_operand(rbx, src); // 3, src
    }
    
    // Emits PREFETCHW for `src`; requires 3DNow! prefetch support (asserted
    // unconditionally). Same lead-in as prefetchr (prefetch_prefix(), then
    // 0x0D) but with rcx, i.e. the value 1, passed to emit_operand.
    void Assembler::prefetchw(Address src) {
      assert(VM_Version::supports_3dnow_prefetch(), "must support");
      InstructionMark im(this);
      prefetch_prefix(src);
      emit_int8(0x0D);
      emit_operand(rcx, src); // 1, src
    }
    

    在 C1 LIR 中的使用:src/share/vm/c1/c1_LIRAssembler.cpp

    void LIR_Assembler::emit_op1(LIR_Op1* op) {
      switch (op->code()) { 
    ...
        case lir_prefetchr:
          prefetchr(op->in_opr());
          break;
    
        case lir_prefetchw:
          prefetchw(op->in_opr());
          break;
    

    现在我们知道了操作码 lir_prefetchr,可以(例如在 OpenGrok 交叉引用中)搜索它以及 lir_prefetchw,并在 src/share/vm/c1/c1_LIR.cpp 中找到唯一的使用示例:

    // Appends a single-operand LIR prefetch op for `addr` to this list.
    // The opcode is lir_prefetchw when is_store is true, lir_prefetchr
    // otherwise; the operand is the address wrapped by LIR_OprFact::address.
    void LIR_List::prefetch(LIR_Address* addr, bool is_store) {
      append(new LIR_Op1(
                is_store ? lir_prefetchw : lir_prefetchr,
                LIR_OprFact::address(addr)));
    }
    

    还有其他地方定义了预取指令(用于 C2,正如 apangin 指出的),即 src/cpu/x86/vm/x86_64.ad:

    // Prefetch instructions. ...
    // Rule for PrefetchRead on a memory operand, selected only when
    // ReadPrefetchInstr == 3; encodes the node by calling prefetchr on the
    // operand's address.
    instruct prefetchr( memory mem ) %{
      predicate(ReadPrefetchInstr==3);
      match(PrefetchRead mem);
      ins_cost(125);
    
      format %{ "PREFETCHR $mem\t# Prefetch into level 1 cache" %}
      ins_encode %{
        __ prefetchr($mem$$Address);
      %}
      ins_pipe(ialu_mem);
    %}
    
    // Rule for PrefetchRead on a memory operand, selected only when
    // ReadPrefetchInstr == 0; encodes the node by calling prefetchnta.
    instruct prefetchrNTA( memory mem ) %{
      predicate(ReadPrefetchInstr==0);
      match(PrefetchRead mem);
      ins_cost(125);
    
      format %{ "PREFETCHNTA $mem\t# Prefetch into non-temporal cache for read" %}
      ins_encode %{
        __ prefetchnta($mem$$Address);
      %}
      ins_pipe(ialu_mem);
    %}
    
    // Rule for PrefetchRead on a memory operand, selected only when
    // ReadPrefetchInstr == 1; encodes the node by calling prefetcht0.
    instruct prefetchrT0( memory mem ) %{
      predicate(ReadPrefetchInstr==1);
      match(PrefetchRead mem);
      ins_cost(125);
    
      format %{ "PREFETCHT0 $mem\t# prefetch into L1 and L2 caches for read" %}
      ins_encode %{
        __ prefetcht0($mem$$Address);
      %}
      ins_pipe(ialu_mem);
    %}
    
    // Rule for PrefetchRead on a memory operand, selected only when
    // ReadPrefetchInstr == 2; encodes the node by calling prefetcht2.
    instruct prefetchrT2( memory mem ) %{
      predicate(ReadPrefetchInstr==2);
      match(PrefetchRead mem);
      ins_cost(125);
    
      format %{ "PREFETCHT2 $mem\t# prefetch into L2 caches for read" %}
      ins_encode %{
        __ prefetcht2($mem$$Address);
      %}
      ins_pipe(ialu_mem);
    %}
    
    // Rule for PrefetchWrite on a memory operand. It carries no predicate and
    // is the only PrefetchWrite rule in this excerpt; it encodes the node by
    // calling prefetchnta.
    instruct prefetchwNTA( memory mem ) %{
      match(PrefetchWrite mem);
      ins_cost(125);
    
      format %{ "PREFETCHNTA $mem\t# Prefetch to non-temporal cache for write" %}
      ins_encode %{
        __ prefetchnta($mem$$Address);
      %}
      ins_pipe(ialu_mem);
    %}
    
    // Prefetch instructions for allocation.
    
    // Rule for PrefetchAllocation on a memory operand, selected only when
    // AllocatePrefetchInstr == 3; encodes the node by calling prefetchw.
    instruct prefetchAlloc( memory mem ) %{
      predicate(AllocatePrefetchInstr==3);
      match(PrefetchAllocation mem);
      ins_cost(125);
    
      format %{ "PREFETCHW $mem\t# Prefetch allocation into level 1 cache and mark modified" %}
      ins_encode %{
        __ prefetchw($mem$$Address);
      %}
      ins_pipe(ialu_mem);
    %}
    
    // Rule for PrefetchAllocation on a memory operand, selected only when
    // AllocatePrefetchInstr == 0; encodes the node by calling prefetchnta.
    instruct prefetchAllocNTA( memory mem ) %{
      predicate(AllocatePrefetchInstr==0);
      match(PrefetchAllocation mem);
      ins_cost(125);
    
      format %{ "PREFETCHNTA $mem\t# Prefetch allocation to non-temporal cache for write" %}
      ins_encode %{
        __ prefetchnta($mem$$Address);
      %}
      ins_pipe(ialu_mem);
    %}
    
    // Rule for PrefetchAllocation on a memory operand, selected only when
    // AllocatePrefetchInstr == 1; encodes the node by calling prefetcht0.
    instruct prefetchAllocT0( memory mem ) %{
      predicate(AllocatePrefetchInstr==1);
      match(PrefetchAllocation mem);
      ins_cost(125);
    
      format %{ "PREFETCHT0 $mem\t# Prefetch allocation to level 1 and 2 caches for write" %}
      ins_encode %{
        __ prefetcht0($mem$$Address);
      %}
      ins_pipe(ialu_mem);
    %}
    
    // Rule for PrefetchAllocation on a memory operand, selected only when
    // AllocatePrefetchInstr == 2; encodes the node by calling prefetcht2.
    instruct prefetchAllocT2( memory mem ) %{
      predicate(AllocatePrefetchInstr==2);
      match(PrefetchAllocation mem);
      ins_cost(125);
    
      format %{ "PREFETCHT2 $mem\t# Prefetch allocation to level 2 cache for write" %}
      ins_encode %{
        __ prefetcht2($mem$$Address);
      %}
      ins_pipe(ialu_mem);
    %}
    
  • 8

    正如 JDK-4453409 所示,预取是在 JDK 1.4 的 HotSpot JVM 中实现的,目的是加速 GC。那已经是 15 年多以前的事了,没有人会记得当初为什么没有在 Windows 上实现。我的猜测是,Visual Studio(它一直被用来在 Windows 上构建 HotSpot)在那个年代基本上不认识预取指令。看起来这是一个值得改进的地方。

    无论如何,您询问的代码是由 JVM 垃圾收集器在内部使用的,它不是 JIT 生成的。C2 JIT 代码生成器的规则位于体系结构定义文件 x86_64.ad 中,其中有把 PrefetchRead、PrefetchWrite、PrefetchAllocation 节点转换为相应 x64 指令的规则。

    一个有趣的事实是,PrefetchRead 和 PrefetchWrite 节点并没有在代码中的任何地方被创建。它们过去的存在只是为了支持 Unsafe.prefetchX 内在函数,而这些内在函数已在 JDK 9 中被移除。

    JIT 生成预取指令的唯一情况是 PrefetchAllocation 节点。您可以使用 -XX:+UnlockDiagnosticVMOptions -XX:+PrintAssembly 验证:在 Linux 和 Windows 上,PREFETCHNTA 确实都会在对象分配之后生成。

    // Minimal reproducer for the PrintAssembly run shown after it: an endless
    // loop in which every iteration replaces `b` with a copy one byte longer,
    // produced by Arrays.copyOf.
    // NOTE(review): Arrays needs `import java.util.Arrays;`, which this
    // excerpt does not show.
    class Test {
        public static void main(String[] args) {
            byte[] b = new byte[0];
            // No exit condition; the program is meant to be stopped by hand.
            for (;;) {
                b = Arrays.copyOf(b, b.length + 1);
            }
        }
    }
    

    java.exe -XX:+UnlockDiagnosticVMOptions -XX:+PrintAssembly Test

    # {method} {0x00000000176124e0} 'main' '([Ljava/lang/String;)V' in 'Test'
      ...
      0x000000000340e512: cmp    $0x100000,%r11d
      0x000000000340e519: ja     0x000000000340e60f
      0x000000000340e51f: movslq 0x24(%rsp),%r10
      0x000000000340e524: add    $0x1,%r10
      0x000000000340e528: add    $0x17,%r10
      0x000000000340e52c: mov    %r10,%r8
      0x000000000340e52f: and    $0xfffffffffffffff8,%r8
      0x000000000340e533: cmp    $0x100000,%r11d
      0x000000000340e53a: ja     0x000000000340e496
      0x000000000340e540: mov    0x60(%r15),%rbp
      0x000000000340e544: mov    %rbp,%r9
      0x000000000340e547: add    %r8,%r9
      0x000000000340e54a: cmp    0x70(%r15),%r9
      0x000000000340e54e: jae    0x000000000340e496
      0x000000000340e554: mov    %r9,0x60(%r15)
      0x000000000340e558: prefetchnta 0xc0(%r9)
      0x000000000340e560: movq   $0x1,0x0(%rbp)
      0x000000000340e568: prefetchnta 0x100(%r9)
      0x000000000340e570: movl   $0x200000f5,0x8(%rbp)  ;   {metadata({type array byte})}
      0x000000000340e577: mov    %r11d,0xc(%rbp)
      0x000000000340e57b: prefetchnta 0x140(%r9)
      0x000000000340e583: prefetchnta 0x180(%r9)    ;*newarray
                                                    ; - java.util.Arrays::copyOf@1 (line 3236)
                                                    ; - Test::main@9 (line 9)
    

相关问题