ASM阻塞功能和ABI x86-64-Java 学习之路

我为大整数编码了一个非常好的整数lib，但限制为512位（由于各种原因比GMP快） . 我试图推广大尺寸的lib . 所以我必须循环遍历adcq指令 .

// long addition little indian order due the technique incq-jnz
// I can not use compare because it destroy the Carry Bit
template<int n>
void test_add(boost::uint64_t*, boost::uint64_t* ){    
    asm volatile (
        "clc                                     \n"
        "movq %0, %%rcx                          \n"
    "loop:                                       \n"
        "movq 8(%%rsi,%%rcx,8), %%rax            \n"  /* original -8(%%rsi,%%rbx,8) */
        "adcq %%rax           , 8(%%rdi,%%rcx,8) \n"  /* original -8(%%rsi,%%rbx,8) */
        "incq %%rcx                              \n"  /* original decq */
    "jnz loop                                    \n"
        :   
        :"g"(n)
        :"rax","rcx","cc","memory"
    );  
}


// Test driver: fills two 4-limb operands and calls test_add<-4> once.
int main(int argc, char* argv[]) {
// Two operands of four 64-bit limbs each.
boost::uint64_t c[4],d[4];

// Limbs 0..2 of c are set to -1 (all bits set once stored in an unsigned limb),
// limb 3 is zero.
c[0] = -1; 
c[1] = -1; 
c[2] = -1; 
c[3] =  0;  

// Only limb 0 of d is non-zero.
d[0] = 1;
d[1] = 0;
d[2] = 0;
d[3] = 0;

// The address of the last limb is passed together with the negative count -4;
// NOTE(review): the first argument is presumably the destination, since the asm
// body writes through %rdi -- confirm.
test_add<-4>(&d[3],&c[3]); // pointers to the last limbs; the counter runs from -4 up to 0

这段代码在调试模式（-O0）下运行良好，但一旦开启优化就会出现段错误（segfault）。

我真的不明白，因为我遵守了 ABI：参数通过 rdi 和 rsi 传递，声明了被破坏（clobber）的寄存器，也使用了正确的寄存器 . 于是我分别用 GCC -O0 -S 和 -O2 -S 编译并查看了汇编输出 .

对于-O0 -S我得到了

3 .globl main
 4         .type   main, @function
 5 main:
 6 .LFB1:
 7         .cfi_startproc
 8         .cfi_personality 0x3,__gxx_personality_v0
 9         pushq   %rbp
 10         .cfi_def_cfa_offset 16
 11         .cfi_offset 6, -16
 12         movq    %rsp, %rbp
 13         .cfi_def_cfa_register 6
 14         subq    $80, %rsp
 15         movl    %edi, -68(%rbp)
 16         movq    %rsi, -80(%rbp)
 17         movq    $-1, -32(%rbp)
 18         movq    $-1, -24(%rbp)
 19         movq    $-1, -16(%rbp)
 20         movq    $0, -8(%rbp)
 21         movq    $1, -64(%rbp)
 22         movq    $0, -56(%rbp)
 23         movq    $0, -48(%rbp)
 24         movq    $0, -40(%rbp)
 25         leaq    -32(%rbp), %rax
 26         leaq    24(%rax), %rdx
 27         leaq    -64(%rbp), %rax
 28         addq    $24, %rax
 29         movq    %rdx, %rsi
 30         movq    %rax, %rdi
 31         call    _Z8test_addILin4EEvPyS0_
 32         movl    $0, %eax
 33         leave
 34         .cfi_def_cfa 7, 8
 35         ret
 36         .cfi_endproc
 37 .LFE1:
 38         .size   main, .-main
 39         .section        .text._Z8test_addILin4EEvPyS0_,"axG",@progbits,_Z8test_addILin4EEvPyS0_,comdat
 40         .weak   _Z8test_addILin4EEvPyS0_
 41         .type   _Z8test_addILin4EEvPyS0_, @function
 42 _Z8test_addILin4EEvPyS0_:
 43 .LFB2:
 44         .cfi_startproc
 45         .cfi_personality 0x3,__gxx_personality_v0
 46         pushq   %rbp
 47         .cfi_def_cfa_offset 16
 48         .cfi_offset 6, -16
 49         movq    %rsp, %rbp
 50         .cfi_def_cfa_register 6
 51         movq    %rdi, -8(%rbp)
 52         movq    %rsi, -16(%rbp)
 53 #APP
 54 # 14 "test.cpp" 1
 55         clc
 56 movq $-4, %rcx
 57 loop:
 58 movq 8(%rsi,%rcx,8), %rax
 59 adcq %rax           , 8(%rdi,%rcx,8)
 60 incq %rcx
 61 jnz loop
 62 
 63 # 0 "" 2
 64 #NO_APP
 65         leave
 66         .cfi_def_cfa 7, 8
 67         ret
 68         .cfi_endproc
 69 .LFE2:
 70         .size   _Z8test_addILin4EEvPyS0_, .-_Z8test_addILin4EEvPyS0_
 71         .ident  "GCC: (GNU) 4.4.6 20120305 (Red Hat 4.4.6-4)"
 72         .section        .note.GNU-stack,"",@progbits

第20-30行我们看到编译器重新组织堆栈以将arg传递给rsi和rdi（第29-30行）和调用 . 完美如ABI

如果现在我看看我得到的优化版本

1         .file   "test.cpp"
  2         .text
  3         .p2align 4,,15
  4 .globl main
  5         .type   main, @function
  6 main:
  7 .LFB1:
  8         .cfi_startproc
  9         .cfi_personality 0x3,__gxx_personality_v0
  10 #APP
  11 # 14 "test.cpp" 1
  12         clc
  13 movq $-4, %rcx
  14 loop:
  15 movq 8(%rsi,%rcx,8), %rax
  16 adcq %rax           , 8(%rdi,%rcx,8)
  17 incq %rcx
  18 jnz loop
  19 
  20 # 0 "" 2
  21 #NO_APP
  22         xorl    %eax, %eax
  23         ret
  24         .cfi_endproc
  25 .LFE1:
  26         .size   main, .-main
  27         .ident  "GCC: (GNU) 4.4.6 20120305 (Red Hat 4.4.6-4)"
  28         .section        .note.GNU-stack,"",@progbits

再见ABI，我不明白 . 堆栈由什么????管理

ASM大师有个主意吗？我拒绝把这个函数放到一个独立的文件中，很好的元编程精神 .

干杯 .

-------编辑：

我发现你的解决方案有一个错误，如果我把它放到一个循环中：

#include <boost/cstdint.hpp> //boost type

// Multi-precision addition with carry: x[i] += y[i] for i = n .. -1, where the
// limbs are stored least-significant first and `x` / `y` point one element
// PAST the last limb of each operand.
//
// Template parameter:
//   n  negative limb count (e.g. -4 for four limbs); must be < 0 because the
//      loop terminates when the counter reaches zero.
// Parameters:
//   x  one-past-the-end pointer of the destination operand (updated in place)
//   y  one-past-the-end pointer of the source operand (read only)
//
// The carry out of the most significant limb is discarded.
template<long n>
void test_add(boost::uint64_t* x, boost::uint64_t const* y) {
    boost::uint64_t dummy;   // scratch register for the current source limb
    long counter = n;        // loop index, runs from n up to 0
    __asm__ __volatile__ (
        "clc\n\t"
        "1:\n\t"
        "movq (%[y],%[counter],8), %[dummy]\n\t"
        "adcq %[dummy], (%[x], %[counter], 8)\n\t"
        "incq %[counter]\n\t"   // inc does not modify CF, so the carry chain survives
        "jnz 1b\n\t"
        // The counter is modified by the asm, so it must be a read-write ("+")
        // operand. Declared as a plain input, the compiler assumes the register
        // still holds n afterwards and does not reload it on the next call.
        // "&" keeps both outputs out of the registers used for x and y.
        : [dummy] "=&r" (dummy), [counter] "+&r" (counter)
        : [x] "r" (x), [y] "r" (y)
        : "memory", "cc");
}


// Test driver: repeatedly adds d into c to check that test_add can be called
// (and inlined) inside a loop.
int main(int argc, char* argv[]) {
    (void)argc;   // command-line arguments are not used
    (void)argv;

    // Four limbs per operand: indices 0..3 are written below and test_add<-4>
    // reads base[-4] .. base[-1] relative to the one-past-the-end pointer, so
    // the arrays must hold 4 elements (3 elements would be an out-of-bounds access).
    boost::uint64_t c[4], d[4];

    c[0] = -1;   // -1 stored in an unsigned limb sets all bits
    c[1] = -1;
    c[2] = -1;
    c[3] =  0;

    d[0] = 1;
    d[1] = 0;
    d[2] = 0;
    d[3] = 0;

    for (int i = 0; i < 0xfff; ++i)
        test_add<-4>(&c[4], &d[4]);   // one-past-the-end pointers, 4 limbs

    return 0;
}

将给出以下ASM：

movq    $-4, %rdx <---------------------template parameter
      leaq    -32(%rsp), %rcx
      movq    $-1, -32(%rsp)
      movq    $-1, -24(%rsp)
      movq    $-1, -16(%rsp)
      movq    $0, -8(%rsp)
      movq    $1, -64(%rsp)
      movq    $0, -56(%rsp)
      movq    $0, -48(%rsp)
      movq    $0, -40(%rsp)
      .p2align 4,,10
      .p2align 3
  .L2: <-------- OUPUT loop
#APP
# 16 "main.cpp" 1
       clc
       1: <-------- INPUT loop
       movq (%rcx,%rdx,8), %rsi
       adcq %rsi, (%rsp, %rdx, 8)
       incq %rdx <------------ rdx++ -> (-4)++ (for the 2nd iteration of L2 it is not reset to -4)
       jnz 1b

 # 0 "" 2
 #NO_APP
       addl    $1, %eax
       cmpl    $4095, %eax <----- test second loop
       jne     .L2

对于外层循环（即上面标注的 .L2）的第二次迭代，rdx 没有被重新设置为 -4，因此 movq 指令会读取错误的地址，导致段错误 . 我用了一个很糟糕的办法来修补（手动重置为 -4）：只是在 jnz 之后添加 "movq $-4, %[counter]\n\t"，但我需要更通用的办法 . 是否存在某种约束，可以把计数器重置为模板参数的值？

目前的修正是：

// Multi-precision addition with carry: x[i] += y[i] for i = n .. -1, where the
// limbs are stored least-significant first and `x` / `y` point one element
// PAST the last limb of each operand.
//
// Template parameter:
//   n  negative limb count (e.g. -4 for four limbs); must be < 0 because the
//      loop terminates when the counter reaches zero.
// Parameters:
//   x  one-past-the-end pointer of the destination operand (updated in place)
//   y  one-past-the-end pointer of the source operand (read only)
//
// The carry out of the most significant limb is discarded.
template<long n>
void test_add(boost::uint64_t* x, boost::uint64_t const* y) {
    boost::uint64_t dummy;   // scratch register for the current source limb
    long counter = n;        // re-initialised by the compiler on every call
    __asm__ __volatile__ (
        "clc\n\t"
        "1:\n\t"
        "movq (%[y],%[counter],8), %[dummy]\n\t"
        "adcq %[dummy], (%[x], %[counter], 8)\n\t"
        "incq %[counter]\n\t"   // inc does not modify CF, so the carry chain survives
        "jnz 1b\n\t"
        // Declaring the counter as a read-write ("+") operand tells the compiler
        // that the asm changes it, so it loads n into the register before every
        // execution. No manual "movq $n, counter" inside the asm is needed, and
        // no input-only operand is written to any more.
        // "&" keeps both outputs out of the registers used for x and y.
        : [dummy] "=&r" (dummy), [counter] "+&r" (counter)
        : [x] "r" (x), [y] "r" (y)
        : "memory", "cc");
}

1 回答

3
您应该使用约束来访问参数 . 对于被内联的函数， gcc 不需要遵循ABI；即使遵循，也不保证在执行asm块时寄存器仍保持函数入口时的初始状态 . 当然，内联asm的意义正在于让编译器把它内联进去，这样甚至不会发生函数调用 . （许多人错误地认为“内联”只是指“嵌入在C源文件中”，把它当作一种便利功能来用，即使并不需要真正的代码内联 . ）

gcc 也完全有能力把数据放进你想要的寄存器中（尽管在这里你并不特别在意计数器是否是 rcx ） . 通常也最好把尽可能多的工作留给编译器，这样它可以进行寄存器分配、循环展开和其他优化 . 遗憾的是，我没能让 gcc 自己生成 ADC 指令，所以这次 asm 块仍然保留 . 由于存在部分标志位更新（partial flags update）的问题，一般不建议使用 inc ，但我目前没有看到明显的替代办法 .

最后，如果您传递 d[3] 的地址，您访问的将是 d[-1] 到 d[2] 这几个元素，这不是您想要的 . 您应该传递 d[4] 的地址 .

固定版本可能看起来像这样（带有命名参数）：
```
// Adds the limbs y[n] .. y[-1] into x[n] .. x[-1] with carry propagation.
// `n` is the (negative) starting index; `x` and `y` are indexed relative to
// the pointers passed in, and the loop ends when the index reaches zero.
template<long n>
void test_add(boost::uint64_t* x, boost::uint64_t* y) {
    boost::uint64_t limb;        // scratch for the limb loaded from y
    boost::uint64_t index = n;   // loop index, modified by the asm
    __asm__ __volatile__ (
        "clc\n\t"
        "1:\n\t"
        "movq (%[y], %[counter], 8), %[dummy]\n\t"
        "adcq %[dummy], (%[x], %[counter], 8)\n\t"
        "incq %[counter]\n\t"
        "jnz 1b\n\t"
        // "+r" is the read-write spelling of an output tied to an input.
        : [dummy] "=&r" (limb), [counter] "+r" (index)
        : [x] "r" (x), [y] "r" (y)
        : "memory", "cc");
}
```
请注意， dummy 变量将被优化掉，同时允许 gcc 选择合适的寄存器而不是强制它使用特定的寄存器 .

Update ：这是一个纯C版本，编译器可以将其完全展开和优化（包括在编译时计算结果！） . 虽然在一般情况下编译器生成的代码不如手写的高效，但上述优化可能使它在某些情况下表现更好 . 注意：由于您使用的是 gcc inline asm，这意味着您的代码已经是 gcc 和 x86-64 特定的，因此使用 __uint128_t 并不构成进一步的限制（事实上，它适用于 gcc 支持128位整数的任何架构） .
```
// Portable (no inline asm) multi-limb addition: for every index i in [n, 0)
// computes x[i] = x[i] + y[i] + carry, keeping the running carry in the upper
// half of a 128-bit accumulator. `n` is the negative starting index.
template<long n>
void test_add(boost::uint64_t* x, boost::uint64_t* y) {
    __uint128_t acc = 0;   // low 64 bits: limb result, high bits: carry
    long idx = n;
    while (idx < 0) {
        acc += x[idx];
        acc += y[idx];
        x[idx] = static_cast<boost::uint64_t>(acc);   // keep the low 64 bits
        acc >>= 64;                                   // carry into the next limb
        ++idx;
    }
}
```
回复于 2024-05-04T13:51:39+08:00

ASM阻塞功能和ABI x86-64

1 回答

相关问题