(使用 Visual C++ 19.00.23918,在 Debug 配置下编译,并关闭优化)
我为 __m128 内建(intrinsic)类型创建了一个包装类,如下所示:
/// Thin 16-byte-aligned value wrapper around a single SSE __m128 value.
///
/// Copy construction and copy assignment are deliberately left to the
/// compiler (Rule of Zero): the only data member is an __m128, so the
/// implicit member-wise copy does exactly what a hand-written copy
/// constructor would, and leaving it implicit keeps the type trivially
/// copyable.
class alignas(16) V4Xm
{
public:
    /// Builds a vector whose four lanes are all zero.
    constexpr V4Xm()
        : _mFoo{} {
    }

    /// Builds a vector from four floats, passed to _mm_setr_ps in the
    /// order X, Y, Z, W.
    V4Xm(float X, float Y, float Z, float W)
        : _mFoo(_mm_setr_ps(X, Y, Z, W)) {
    }

    /// Wraps an existing __m128 value. Intentionally left non-explicit so
    /// that raw intrinsic results keep converting implicitly.
    V4Xm(const __m128& Intrinsic)
        : _mFoo(Intrinsic) {
    }

    /// Returns the lane-wise product of this vector and Other.
    V4Xm operator*(const V4Xm& Other) const {
        return V4Xm(_mm_mul_ps(_mFoo, Other._mFoo));
    }

private:
    __m128 _mFoo;  ///< The wrapped SSE value.
};
然后我在 Debug 下比较了一个简单的例子,想看看编译器为我的内联包装类和原始 __m128 生成的汇编代码有什么区别:
/// Squares A lane by lane using the wrapper's operator*.
V4Xm Bar(const V4Xm& A)
{
    return A * A;
}
/// Squares A lane by lane using the raw SSE multiply intrinsic.
__m128 Bar(const __m128& A)
{
    return _mm_mul_ps(A, A);
}
/// Runs the same "square a vector of ones" sequence twice, once through
/// the V4Xm wrapper and once through a raw __m128, so the code generated
/// for the two can be compared side by side.
int main()
{
    // Wrapper-class path.
    V4Xm A(1.0f, 1.0f, 1.0f, 1.0f);
    A = Bar(A);

    // Raw intrinsic-type path.
    __m128 B = _mm_setr_ps(1.0f, 1.0f, 1.0f, 1.0f);
    B = Bar(B);

    return 0;
}
对于我的类,编译器生成了如下代码:
V4Xm A(1, 1, 1, 1);
00A11A6A push ecx
00A11A6B movss xmm0,dword ptr [__real@3f800000 (0A16B30h)]
00A11A73 movss dword ptr [esp],xmm0
00A11A78 push ecx
00A11A79 movss xmm0,dword ptr [__real@3f800000 (0A16B30h)]
00A11A81 movss dword ptr [esp],xmm0
00A11A86 push ecx
00A11A87 movss xmm0,dword ptr [__real@3f800000 (0A16B30h)]
00A11A8F movss dword ptr [esp],xmm0
00A11A94 push ecx
00A11A95 movss xmm0,dword ptr [__real@3f800000 (0A16B30h)]
00A11A9D movss dword ptr [esp],xmm0
00A11AA2 lea ecx,[A]
00A11AA5 call V4Xm::V4Xm (0A11325h)
A = Bar(A);
00A11AAA lea eax,[A]
00A11AAD push eax
00A11AAE lea ecx,[ebp-120h]
00A11AB4 push ecx
00A11AB5 call Bar (0A112E4h)
00A11ABA add esp,8
00A11ABD mov edx,dword ptr [eax]
00A11ABF mov dword ptr [A],edx
00A11AC2 mov ecx,dword ptr [eax+4]
00A11AC5 mov dword ptr [ebp-1Ch],ecx
00A11AC8 mov edx,dword ptr [eax+8]
00A11ACB mov dword ptr [ebp-18h],edx
00A11ACE mov eax,dword ptr [eax+0Ch]
00A11AD1 mov dword ptr [ebp-14h],eax
而原始 __m128 版本生成的指令更少:
__m128 B(_mm_setr_ps(1, 1, 1, 1));
00A11AD4 movaps xmm0,xmmword ptr[__xmm@3f8000003f8000003f8000003f800000 (0A16B40h)]
00A11ADB movaps xmmword ptr [ebp-140h],xmm0
00A11AE2 movaps xmm0,xmmword ptr [ebp-140h]
00A11AE9 movaps xmmword ptr [B],xmm0
B = Bar(B);
00A11AED lea eax,[B]
00A11AF0 push eax
00A11AF1 call Bar (0A1102Dh)
00A11AF6 add esp,4
00A11AF9 movaps xmmword ptr [ebp-160h],xmm0
00A11B00 movaps xmm0,xmmword ptr [ebp-160h]
00A11B07 movaps xmmword ptr [B],xmm0
我很好奇,为什么在 Debug 下,编译器为我的类和原始 __m128 类型这两种情况生成的代码会有这么大的差异。在我看来,我的类应该和内建类型的表现完全一样。为什么会这样?
另请注意,这种差异只在 Debug 且关闭优化时才会出现。当我在 Release 下做其他测试时,编译器总是为两种情况生成相同的指令。