英特尔编译器与GCC代码生成的差异-Java 学习之路

我正在学习x64编程以及Intel C编译器和GCC之间的差异以及它们如何优化指令

问题：

让英特尔编译器输出汇编代码（类似于 gcc -S）的最佳方法是什么？目前我是在 Visual Studio 中通过调试和反汇编来查看指令的。
反汇编得到的英特尔编译器版本的 psum1 并没有像 GCC 汇编输出那样，按照 rdi、rsi、rdx、rcx、r8、r9 的寄存器传参约定来传递参数。我在这里漏掉了什么？
出于某种原因，intel编译器没有优化内存访问，我需要更改哪些设置？

//intel compiler /Ox output
            p[i] = p[i-1] + a[i];
            000000013F79118B  movss       xmm1,dword ptr [rcx+rax*4+8]
            000000013F791191  addss       xmm0,dword ptr [rcx+rax*4+4]
            000000013F791197  movss       dword ptr [rdx+rax*4+4],xmm0
            000000013F79119D  addss       xmm0,xmm1
            000000013F7911A1  movss       dword ptr [rdx+rax*4+8],xmm0

//GCC -O3 output
LBB1_3:
decq    %rdx
LBB1_2:
addq    $4, %rsi
        addq    $4, %rdi
        addss   (%rdi), %xmm0
        movss   %xmm0, (%rsi)
testq   %rdx, %rdx
        jne LBB1_3
        LBB1_4:

原始C代码

/*
 * Compute the prefix sums of a[] into p[]:
 *   p[0] = a[0], and p[i] = p[i-1] + a[i] for 1 <= i < n.
 *
 * a: input array, read at indices 0 .. n-1
 * p: output array, written at indices 0 .. n-1
 * n: number of elements
 *
 * Note: a[0] is read and p[0] is written unconditionally, before n is
 * examined, so both arrays must hold at least one element.
 */
void psum1( float a[], float p[], long int n ) {
    long int i;
    p[0] = a[0];
    for (i=1; i<n; i++) {
        /* Each element depends on the one stored just before it, so the
           additions form a serial dependency chain. */
        p[i] = p[i-1] + a[i];
    }
}

在Visual Studio 2010上从英特尔C编译器2013中反汇编：

完全优化 /Ox
启用内部函数 /Oi
代码速度优先 /Ot

void psum1( float a[], float p[], long int n ) {
    long int i;

p[0] = a[0];
000000013F791156  movss       xmm0,dword ptr [rcx]
000000013F79115A  mov         dword ptr [rdx],eax

for( i=1; i<n; i++ ) {
    000000013F79115C  jle         psum1+7Ah (13F7911CAh)
    000000013F79115E  mov         eax,1
    000000013F791163  lea         r10d,[r8-1]
    000000013F791167  mov         r11d,r10d
    000000013F79116A  xor         r9d,r9d
    000000013F79116D  shr         r11d,1Fh
    000000013F791171  lea         r8d,[r11+r8-1]
    000000013F791176  sar         r8d,1
    000000013F791179  test        r8d,r8d
    000000013F79117C  jbe         psum1+5Eh (13F7911AEh)

    p[i] = p[i-1] + a[i];
    000000013F79117E  lea         eax,[r9+r9]

    for( i=1; i<n; i++ ) {
        000000013F791182  inc         r9d

        p[i] = p[i-1] + a[i];
        000000013F791185  movsxd      rax,eax

        for( i=1; i<n; i++ ) {
            000000013F791188  cmp         r9d,r8d

            p[i] = p[i-1] + a[i];
            000000013F79118B  movss       xmm1,dword ptr [rcx+rax*4+8]
            000000013F791191  addss       xmm0,dword ptr [rcx+rax*4+4]
            000000013F791197  movss       dword ptr [rdx+rax*4+4],xmm0
            000000013F79119D  addss       xmm0,xmm1
            000000013F7911A1  movss       dword ptr [rdx+rax*4+8],xmm0

            for( i=1; i<n; i++ ) {
                000000013F7911A7  jb          psum1+2Eh (13F79117Eh)
                000000013F7911A9  lea         eax,[r9+r9+1]
                000000013F7911AE  lea         r8d,[rax-1]
                000000013F7911B2  cmp         r10d,r8d
                000000013F7911B5  jbe         psum1+7Ah (13F7911CAh)

                p[i] = p[i-1] + a[i];
                000000013F7911B7  movsxd      rax,eax
                000000013F7911BA  movss       xmm0,dword ptr [rdx+rax*4-4]
                000000013F7911C0  addss       xmm0,dword ptr [rcx+rax*4]
                000000013F7911C5  movss       dword ptr [rdx+rax*4],xmm0
            }
        }
        000000013F7911CA  ret
        000000013F7911CB  nop         dword ptr [rax+rax]

GCC 汇编输出（完全优化 -O3）

#-----------------------------------------------------------------------
# void psum1(float a[], float p[], long int n)
# Syntax: AT&T (GAS), compiler-generated listing
# In:     rdi = a, rsi = p, rdx = n   (System V AMD64 argument order)
# Out:    p[0] = a[0]; p[i] = p[i-1] + a[i] for 1 <= i < n
# Clobb:  rdi, rsi, rdx, xmm0, flags  (rbp is pushed and restored)
#-----------------------------------------------------------------------
.section    __TEXT,__text,regular,pure_instructions
.globl  _psum1
.align  4, 0x90
_psum1:
Leh_func_begin1:
pushq   %rbp                            # frame-pointer prologue
        Ltmp0:
movq    %rsp, %rbp
        Ltmp1:
movss   (%rdi), %xmm0                   # xmm0 = a[0]; xmm0 holds the running sum from here on
        movss   %xmm0, (%rsi)           # p[0] = a[0]
cmpq    $2, %rdx
        jl  LBB1_4                      # n < 2: nothing beyond p[0] to compute
        addq    $-2, %rdx               # rdx = n - 2 = iterations left after the first one
        jmp LBB1_2                      # enter the loop without the decrement
.align  4, 0x90
LBB1_3:
decq    %rdx                            # one fewer iteration remaining
LBB1_2:
addq    $4, %rsi                        # advance p by one float
        addq    $4, %rdi                # advance a by one float
        addss   (%rdi), %xmm0           # sum += a[i]; p[i-1] is not reloaded from memory
        movss   %xmm0, (%rsi)           # p[i] = sum
testq   %rdx, %rdx
        jne LBB1_3                      # loop while iterations remain
LBB1_4:
popq    %rbp                            # epilogue: restore caller's rbp
        ret
Leh_func_end1:

英特尔编译器与GCC代码生成的差异

原始C代码

在Visual Studio 2010上从英特尔C编译器2013中反汇编：

GCC 汇编输出（完全优化 -O3）

相关问题