(使用 Visual C++ 19.00.23918,在 Debug 配置下编译,并关闭了优化)

我为 __m128 内在类型(intrinsic type)创建了一个包装类,如下所示:

/// Thin, 16-byte-aligned wrapper around a single SSE `__m128` value.
///
/// No copy constructor, copy assignment operator or destructor is declared
/// on purpose (Rule of Zero): the compiler-generated ones are trivial, so the
/// wrapper stays trivially copyable, just like the raw `__m128` it holds.
/// A hand-written copy constructor that merely copies the member would make
/// the type non-trivially copyable without changing what a copy produces.
class alignas(16) V4Xm
{
public:

    /// Creates a vector with the wrapped value zero-initialized.
    constexpr V4Xm()
        : _mFoo{} {
    }

    /// Creates a vector from four floats, forwarded to `_mm_setr_ps` in the
    /// order X, Y, Z, W.
    V4Xm(float X, float Y, float Z, float W)
        : _mFoo(_mm_setr_ps(X, Y, Z, W)) {
    }

    /// Wraps an existing intrinsic value. Left non-explicit so that code
    /// relying on the implicit `__m128` -> `V4Xm` conversion keeps compiling.
    V4Xm(const __m128& Intrinsic)
        : _mFoo(Intrinsic) {
    }

    /// Returns the product of this vector and Other, computed with
    /// `_mm_mul_ps`.
    V4Xm operator*(const V4Xm& Other) const {
        return V4Xm(_mm_mul_ps(_mFoo, Other._mFoo));
    }

private:
    __m128 _mFoo;  // the wrapped SSE value
};

然后我在 Debug 下比较了一个简单的例子,看看编译器为我的内联包装类和原始 __m128 生成的汇编代码有什么区别:

/// Wrapper-class variant of the comparison: multiplies A by itself through
/// V4Xm::operator* and returns the result by value.
V4Xm Bar(const V4Xm& A)
{
    return A * A;
}

/// Raw-intrinsic variant of the comparison: multiplies A by itself with
/// `_mm_mul_ps` and returns the result by value.
__m128 Bar(const __m128& A)
{
    const __m128 Squared = _mm_mul_ps(A, A);
    return Squared;
}


// Test driver: runs the same "square a vector of ones" sequence once with the
// wrapper class and once with the raw intrinsic type, so the code generated
// for the two can be compared side by side.
int main()
{
    // Wrapper-class path.
    V4Xm A(1, 1, 1, 1);
    A = Bar(A);

    // Raw __m128 path.
    __m128 B(_mm_setr_ps(1, 1, 1, 1));
    B = Bar(B);

    return 0;
}

对于我的类,编译器生成了如下代码:

V4Xm A(1, 1, 1, 1);
00A11A6A  push        ecx  
00A11A6B  movss       xmm0,dword ptr [__real@3f800000 (0A16B30h)]  
00A11A73  movss       dword ptr [esp],xmm0  
00A11A78  push        ecx  
00A11A79  movss       xmm0,dword ptr [__real@3f800000 (0A16B30h)]  
00A11A81  movss       dword ptr [esp],xmm0  
00A11A86  push        ecx  
00A11A87  movss       xmm0,dword ptr [__real@3f800000 (0A16B30h)]  
00A11A8F  movss       dword ptr [esp],xmm0  
00A11A94  push        ecx  
00A11A95  movss       xmm0,dword ptr [__real@3f800000 (0A16B30h)]  
00A11A9D  movss       dword ptr [esp],xmm0  
00A11AA2  lea         ecx,[A]  
00A11AA5  call        V4Xm::V4Xm (0A11325h)  

    A = Bar(A);
00A11AAA  lea         eax,[A]  
00A11AAD  push        eax  
00A11AAE  lea         ecx,[ebp-120h]  
00A11AB4  push        ecx  
00A11AB5  call        Bar (0A112E4h)  
00A11ABA  add         esp,8  
00A11ABD  mov         edx,dword ptr [eax]  
00A11ABF  mov         dword ptr [A],edx  
00A11AC2  mov         ecx,dword ptr [eax+4]  
00A11AC5  mov         dword ptr [ebp-1Ch],ecx  
00A11AC8  mov         edx,dword ptr [eax+8]  
00A11ACB  mov         dword ptr [ebp-18h],edx  
00A11ACE  mov         eax,dword ptr [eax+0Ch]  
00A11AD1  mov         dword ptr [ebp-14h],eax

而对于内在类型,生成的指令更少:

__m128 B(_mm_setr_ps(1, 1, 1, 1));
00A11AD4  movaps      xmm0,xmmword ptr[__xmm@3f8000003f8000003f8000003f800000 (0A16B40h)]  
00A11ADB  movaps      xmmword ptr [ebp-140h],xmm0  
00A11AE2  movaps      xmm0,xmmword ptr [ebp-140h]  
00A11AE9  movaps      xmmword ptr [B],xmm0  
    B = Bar(B);
00A11AED  lea         eax,[B]  
00A11AF0  push        eax  
00A11AF1  call        Bar (0A1102Dh)  
00A11AF6  add         esp,4  
00A11AF9  movaps      xmmword ptr [ebp-160h],xmm0  
00A11B00  movaps      xmm0,xmmword ptr [ebp-160h]  
00A11B07  movaps      xmmword ptr [B],xmm0

我很好奇,为什么在 Debug 下,编译器对我的类和对内在类型的处理会有这么大的差异。在我看来,我的类应该和内在类型表现得一样。为什么会这样?

另请注意,这种情况只在 Debug 下关闭优化时才会出现。当我在 Release 下尝试其他测试时,编译器总是为两种情况生成相同的指令。