Background
如果你一直关注我的帖子,我试图复制Kazushige Goto关于方阵乘法 C = AB
的开创性论文中的结果 . 我关于这个主题的最后一篇文章可以在here找到 . 在我的代码版本中,我遵循Goto的内存分层和打包策略,使用内核计算 2x8
块使用128位SSE3内在函数 . 我的CPU是i5-540M,超线程关闭 . 有关我的硬件的其他信息可以在另一个post中找到,并在下面重复 .
My Hardware
我的CPU是Intel i5 - 540M . 您可以在cpu-world.com上找到相关的CPUID信息 . 微体系结构是Nehalem(westmere),因此理论上每循环每个核心可以计算4个双精度触发器 . 我将只使用一个核心(没有OpenMP),因此对于超线程关闭和4步Intel Turbo Boost,我应该看到 ( 2.533 Ghz + 4*0.133 Ghz ) * ( 4 DP flops/core/cycle ) * ( 1 core ) = 12.27 DP Gflops
的峰值 . 作为参考,两个内核都运行在峰值时,Intel Turbo Boost提供了两步加速,我应该获得 22.4 DP Gflops
的理论峰值 .
My Software
Windows7 64位,但MinGW / GCC 32位由于我的电脑限制 .
What's new this time?
-
我计算
C
的2x4
块 . 这提供了更好的性能,并且与Goto所说的一致(一半的寄存器用于计算C
) . 我尝试过很多尺码:1x8
,2x8
,2x4
,4x2
,2x2
,4x4
. -
我的内核是手工编码的x86程序集,我尽我所能进行了优化(与Goto编写的一些内核相匹配),这比SIMD提供了相当大的性能提升 . 此代码在内核内部展开8次(为方便起见定义为宏),从我尝试的其他展开策略中获得最佳性能 .
-
我使用Windows性能计数器来计算代码的时间,而不是
clock()
. -
我独立于总代码计算内核,看看我的手工编码程序集的效果如何 .
-
我报告了一些试验的最佳结果,而不是试验的平均结果 .
-
不再是OpenMP(仅限单核性能) .
-
注意我今晚将重新编译OpenBLAS以仅使用1个核心,因此我可以进行比较 .
Some Preliminary Results
N
是方形矩阵的维数, Total Gflops/s
是整个代码的Gflops / s, Kernel Gflops/s
是内核的Gflops / s . 您可以看到,在一个内核上峰值为12.26 Gflops / s,内核的效率提高了约75%,而整体代码的效率约为60% .
I would like to get closer to 95% efficiency for the kernel and 80% for the overall code. What else can I do to improve the performance, of at least the inner kernel?
N Total Gflops/s Kernel Gflops/s
256 7.778089 9.756284
512 7.308523 9.462700
768 7.223283 9.253639
1024 7.197375 9.132235
1280 7.142538 8.974122
1536 7.114665 8.967249
1792 7.060789 8.861958
Source Code
如果您感觉特别宽大,请在您的机器上测试我的代码 . 用 gcc -std=c99 -O2 -m32 -msse3 -mincoming-stack-boundary=2 -masm=intel somatmul2.c -o somatmul2.exe
编译 . 随意尝试其他标志,但我发现这些最适合我的机器 .
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <string.h>
#include <xmmintrin.h>
#include <math.h>
#include <omp.h>
#include <stdint.h>
#include <windows.h>
#ifndef min
#define min( a, b ) ( ((a) < (b)) ? (a) : (b) )
#endif
inline void
__attribute__ ((gnu_inline))
__attribute__ ((aligned(64))) rpack( double* restrict dst,
const double* restrict src,
const int kc, const int mc, const int mr, const int n)
{
double tmp[mc*kc] __attribute__ ((aligned(64)));
double* restrict ptr = &tmp[0];
for (int i = 0; i < mc; ++i)
for (int j = 0; j < kc; j+=4) {
*ptr++ = *(src + i*n + j );
*ptr++ = *(src + i*n + j+1);
*ptr++ = *(src + i*n + j+2);
*ptr++ = *(src + i*n + j+3);
}
ptr = &tmp[0];
//const int inc_dst = mr*kc;
for (int k = 0; k < mc; k+=mr)
for (int j = 0; j < kc; ++j)
for (int i = 0; i < mr*kc; i+=kc)
*dst++ = *(ptr + k*kc + j + i);
}
inline void
__attribute__ ((gnu_inline))
__attribute__ ((aligned(64))) cpack(double* restrict dst,
const double* restrict src,
const int nc,
const int kc,
const int nr,
const int n)
{ //nc cols, kc rows, nr size ofregister strips
double tmp[kc*nc] __attribute__ ((aligned(64)));
double* restrict ptr = &tmp[0];
for (int i = 0; i < kc; ++i)
for (int j = 0; j < nc; j+=4) {
*ptr++ = *(src + i*n + j );
*ptr++ = *(src + i*n + j+1);
*ptr++ = *(src + i*n + j+2);
*ptr++ = *(src + i*n + j+3);
}
ptr = &tmp[0];
// const int inc_k = nc/nr;
for (int k = 0; k < nc; k+=nr)
for (int j = 0; j < kc*nc; j+=nc)
for (int i = 0; i < nr; ++i)
*dst++ = *(ptr + k + i + j);
}
#define KERNEL0(add0,add1,add2,add3) \
"mulpd xmm4, xmm6 \n\t" \
"addpd xmm0, xmm4 \n\t" \
"movapd xmm4, XMMWORD PTR [edx+"#add2"] \n\t" \
"mulpd xmm7, xmm4 \n\t" \
"addpd xmm1, xmm7 \n\t" \
"movddup xmm5, QWORD PTR [eax+"#add0"] \n\t" \
"mulpd xmm6, xmm5 \n\t" \
"addpd xmm2, xmm6 \n\t" \
"movddup xmm7, QWORD PTR [eax+"#add1"] \n\t" \
"mulpd xmm4, xmm5 \n\t" \
"movapd xmm6, XMMWORD PTR [edx+"#add3"] \n\t" \
"addpd xmm3, xmm4 \n\t" \
"movapd xmm4, xmm7 \n\t" \
" \n\t"
inline void
__attribute__ ((gnu_inline))
__attribute__ ((aligned(64))) dgemm_2x4_asm_j
(
const int mc, const int nc, const int kc,
const double* restrict locA, const int cs_a, // mr
const double* restrict locB, const int rs_b, // nr
double* restrict C, const int rs_c
)
{
const double* restrict a1 = locA;
for (int i = 0; i < mc ; i+=cs_a) {
const double* restrict b1 = locB;
double* restrict c11 = C + i*rs_c;
for (int j = 0; j < nc ; j+=rs_b) {
__asm__ __volatile__
(
"mov eax, %[a1] \n\t"
"mov edx, %[b1] \n\t"
"mov edi, %[c11] \n\t"
"mov ecx, %[kc] \n\t"
"pxor xmm0, xmm0 \n\t"
"movddup xmm7, QWORD PTR [eax] \n\t" // a1
"pxor xmm1, xmm1 \n\t"
"movapd xmm6, XMMWORD PTR [edx] \n\t" // b1
"pxor xmm2, xmm2 \n\t"
"movapd xmm4, xmm7 \n\t" // a1
"pxor xmm3, xmm3 \n\t"
"sar ecx, 3 \n\t" // divide by 2^num
"L%=: \n\t" // start loop
KERNEL0( 8, 16, 16, 32)
KERNEL0( 24, 32, 48, 64)
KERNEL0( 40, 48, 80, 96)
KERNEL0( 56, 64, 112, 128)
KERNEL0( 72, 80, 144, 160)
KERNEL0( 88, 96, 176, 192)
KERNEL0( 104, 112, 208, 224)
KERNEL0( 120, 128, 240, 256)
"add eax, 128 \n\t"
"add edx, 256 \n\t"
" \n\t"
"dec ecx \n\t"
"jne L%= \n\t" // end loop
" \n\t"
"mov esi, %[rs_c] \n\t" // don't need cs_a anymore
"sal esi, 3 \n\t" // times 8
"lea ebx, [edi+esi] \n\t" // don't need b1 anymore
"addpd xmm0, XMMWORD PTR [edi] \n\t" // c11
"addpd xmm1, XMMWORD PTR [edi+16] \n\t" // c11 + 2
"addpd xmm2, XMMWORD PTR [ebx] \n\t" // c11
"addpd xmm3, XMMWORD PTR [ebx+16] \n\t" // c11 + 2
"movapd XMMWORD PTR [edi], xmm0 \n\t"
"movapd XMMWORD PTR [edi+16], xmm1 \n\t"
"movapd XMMWORD PTR [ebx], xmm2 \n\t"
"movapd XMMWORD PTR [ebx+16], xmm3 \n\t"
: // no outputs
: // inputs
[kc] "m" (kc),
[a1] "m" (a1),
[cs_a] "m" (cs_a),
[b1] "m" (b1),
[rs_b] "m" (rs_b),
[c11] "m" (c11),
[rs_c] "m" (rs_c)
: //register clobber
"memory",
"eax","ebx","ecx","edx","esi","edi",
"xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7"
);
b1 += rs_b*kc;
c11 += rs_b;
}
a1 += cs_a*kc;
}
}
double blis_dgemm_ref(
const int n,
const double* restrict A,
const double* restrict B,
double* restrict C,
const int mc,
const int nc,
const int kc
)
{
int mr = 2;
int nr = 4;
double locA[mc*kc] __attribute__ ((aligned(64)));
double locB[kc*nc] __attribute__ ((aligned(64)));
LARGE_INTEGER frequency, time1, time2;
double time3 = 0.0;
QueryPerformanceFrequency(&frequency);
// zero C
memset(C, 0, n*n*sizeof(double));
int ii,jj,kk;
//#pragma omp parallel num_threads(2) shared(A,B,C) private(ii,jj,kk,locA,locB)
{//use all threads in parallel
//#pragma omp for
for ( jj = 0; jj < n; jj+=nc) {
for ( kk = 0; kk < n; kk+=kc) {
cpack(locB, B + kk*n + jj, nc, kc, nr, n);
for ( ii = 0; ii < n; ii+=mc) {
rpack(locA, A + ii*n + kk, kc, mc, mr, n);
QueryPerformanceCounter(&time1);
dgemm_2x4_asm_j( mc, nc, kc,
locA , mr,
locB , nr,
C + ii*n + jj, n );
QueryPerformanceCounter(&time2);
time3 += (double) (time2.QuadPart - time1.QuadPart);
}
}
}
}
return time3 / frequency.QuadPart;
}
double compute_gflops(const double time, const int n)
{
// computes the gigaflops for a square matrix-matrix multiplication
double gflops;
gflops = (double) (2.0*n*n*n)/time/1.0e9;
return(gflops);
}
void main() {
LARGE_INTEGER frequency, time1, time2;
double time3, best_time;
double kernel_time, best_kernel_time;
QueryPerformanceFrequency(&frequency);
int best_flag;
double gflops, kernel_gflops;
const int trials = 100;
int nmax = 4096;
printf("%16s %16s %16s\n","N","Total Gflops/s","Kernel Gflops/s");
int mc = 256;
int kc = 256;
int nc = 128;
for (int n = kc; n <= nmax; n+=kc) {
double *A = NULL;
double *B = NULL;
double *C = NULL;
A = _mm_malloc (n*n * sizeof(*A),64); if (!A) {printf("A failed\n"); return;}
B = _mm_malloc (n*n * sizeof(*B),64); if (!B) {printf("B failed\n"); return;}
C = _mm_malloc (n*n * sizeof(*C),64); if (!C) {printf("C failed\n"); return;}
srand(time(NULL));
// Create the matrices
for (int i = 0; i < n; i++) {
for (int j = 0; j < n; j++) {
*(A + i*n + j) = (double) rand()/RAND_MAX;
*(B + i*n + j) = (double) rand()/RAND_MAX;
}
}
// warmup
blis_dgemm_ref(n,A,B,C,mc,nc,kc);
for (int count = 0; count < trials; count++){
QueryPerformanceCounter(&time1);
kernel_time = blis_dgemm_ref(n,A,B,C,mc,nc,kc);
QueryPerformanceCounter(&time2);
time3 = (double)(time2.QuadPart - time1.QuadPart) / frequency.QuadPart;
if (count == 0) {
best_time = time3;
best_kernel_time = kernel_time;
}
else {
best_flag = ( time3 < best_time ? 1 : 0 );
if (best_flag) {
best_time = time3;
best_kernel_time = kernel_time;
}
}
}
gflops = compute_gflops(best_time, n);
kernel_gflops = compute_gflops(best_kernel_time, n);
printf("%16d %16f %16f\n",n,gflops,kernel_gflops);
_mm_free(A);
_mm_free(B);
_mm_free(C);
}
printf("tests are done\n");
}
EDIT
使用以下新版本和更快版本替换包装功能:
inline void
__attribute__ ((gnu_inline))
__attribute__ ((aligned(64))) rpack( double* restrict dst,
const double* restrict src,
const int kc, const int mc, const int mr, const int n)
{
for (int i = 0; i < mc/mr; ++i)
for (int j = 0; j < kc; ++j)
for (int k = 0; k < mr; ++k)
*dst++ = *(src + i*n*mr + k*n + j);
}
inline void
__attribute__ ((gnu_inline))
__attribute__ ((aligned(64))) cpack(double* restrict dst,
const double* restrict src,
const int nc,
const int kc,
const int nr,
const int n)
{
for (int i = 0; i < nc/nr; ++i)
for (int j = 0; j < kc; ++j)
for (int k = 0; k < nr; ++k)
*dst++ = *(src + i*nr + j*n + k);
}
Results With New Packing Functions
良好的整体表现提升:
N Total Gflops/s Kernel Gflops/s
256 7.915617 8.794849
512 8.466467 9.350920
768 8.354890 9.135575
1024 8.168944 8.884611
1280 8.174249 8.825920
1536 8.285458 8.938712
1792 7.988038 8.581001
LINPACK 32-bit with 1 thread
CPU frequency: 2.792 GHz
Number of CPUs: 1
Number of cores: 2
Number of threads: 1
Performance Summary (GFlops)
Size LDA Align. Average Maximal
128 128 4 4.7488 5.0094
256 256 4 6.0747 6.9652
384 384 4 6.5208 7.2767
512 512 4 6.8329 7.5706
640 640 4 7.4278 7.8835
768 768 4 7.7622 8.0677
896 896 4 7.8860 8.4737
1024 1024 4 7.7292 8.1076
1152 1152 4 8.0411 8.4738
1280 1280 4 8.1429 8.4863
1408 1408 4 8.2284 8.7073
1536 1536 4 8.3753 8.6437
1664 1664 4 8.6993 8.9108
1792 1792 4 8.7576 8.9176
1920 1920 4 8.7945 9.0678
2048 2048 4 8.5490 8.8827
2176 2176 4 9.0138 9.1161
2304 2304 4 8.1402 9.1446
2432 2432 4 9.0003 9.2082
2560 2560 4 8.8560 9.1197
2688 2688 4 9.1008 9.3144
2816 2816 4 9.0876 9.3089
2944 2944 4 9.0771 9.4191
3072 3072 4 8.9402 9.2920
3200 3200 4 9.2259 9.3699
3328 3328 4 9.1224 9.3821
3456 3456 4 9.1354 9.4082
3584 3584 4 9.0489 9.3351
3712 3712 4 9.3093 9.5108
3840 3840 4 9.3307 9.5324
3968 3968 4 9.3895 9.5352
4096 4096 4 9.3269 9.3872
EDIT 2
以下是单线程OpenBLAS的一些结果,昨晚花了4个小时编译 . 如您所见,它的CPU使用率接近95% . 两个内核的最大单线程性能是 11.2 Gflops
(2步Intel Turbo Boost) . 我需要关闭另一个核心来获得 12.26 Gflops
(4步Intel Turbo Boost) . 假设OpeBLAS中的打包功能不会产生额外的开销 . 然后OpenBLAS内核必须至少与总OpenBLAS代码一样快 . 所以我需要让我的内核以这种速度运行 . 我还没弄明白如何让我的装配更快 . 我会在接下来的几天里关注这一点 .
从Windows命令行执行以下测试: start /realtime /affinity 1
我的代码:
N Total Gflops/s Kernel Gflops/s
256 7.927740 8.832366
512 8.427591 9.347094
768 8.547722 9.352993
1024 8.597336 9.351794
1280 8.588663 9.296724
1536 8.589808 9.271710
1792 8.634201 9.306406
2048 8.527889 9.235653
OpenBLAS:
N Total Gflops/s
256 10.599065
512 10.622686
768 10.745133
1024 10.762757
1280 10.832540
1536 10.793132
1792 10.848356
2048 10.819986
1 回答
理论上可以查看代码并推断是否可以安排更好地利用微架构资源 - 但即使是英特尔的性能架构师也不建议这样做 . 使用像VTune或Intel Performance Counter Monitor这样的工具可以帮助您了解您的工作负载有多少是内存与前端和后端绑定 . Intel Architecture Code Analyzer也可能是帮助缩小下面列出的哪些潜在问题以便首先跟进的快速帮助来源 .
在评论中,Nominal Animal可能正在谈论访问内存和进行计算的交错指令 . 其他一些可能性:
对某些计算使用其他指令可能会减少其中一个执行端口的压力(参见this presentation的3.3.4节) . 特别是,
mulpd
总是要派遣到Westmere的1号港口 . 也许如果有任何循环没有使用端口0,你可以潜入标量FP乘以那里 .硬件预取程序中的一个或另一个可能是saturating the bus early or polluting the cache with lines you don't end up using .
另一方面,内存引用的顺序或内存布局的可能性很小
dgemm_2x4_asm_j
暗示的是伪造预先的人 .更改没有任何数据依赖性的指令对的相对顺序可能会更好地使用前端或后端资源 .