
Optimising vector-matrix multiplication in CUDA with a large number of zeros


I am using the following kernel to optimise vector-matrix multiplication for the case where both the vector and the matrix have a large number of zeros. Using this kernel may reduce the time taken for the multiplication to up to half of the time taken by cublasSgemv, for the case where more than 90% of the entries are zero. However, it is still much slower than an equivalent host BLAS call on Ubuntu 14.04.

vec = 1×m, mat = m×m, prod = 1×m; all stored in row-major order

m >= 5000

__global__ void calc_v_m(float *vec, float *mat, float *prod, int m)      
{
    int x = blockDim.x * blockIdx.x + threadIdx.x;
    if(x < m)
    {
        prod[x] = 0;
        for(int i = 0; i < m; i++)
        {
            int offset = i*m + x;
            if( mat[offset] != 0 && vec[i] != 0 )       
                prod[x] += vec[i] * mat[i*m+x];
        }
    }
}

Other than using a library such as cuSparse, what else can be done to further improve the performance of this kernel?

It would be nice if the optimisation were compatible with Compute Capability 1.2.

Thanks

EDIT

Correction: prod = 1 x m

GPU = Quadro FX 1800M, CUDA v5.0 on Ubuntu 14.04

EDIT

Complete code that performs the multiplication using (i) BLAS, (ii) cuBLAS, and (iii) the kernel above, for m = 6000. When asked to enter a value, enter 0.

#include <iostream>
#include <stdio.h>
#include <time.h>
#include <cblas.h>
#include <cublas_v2.h>
#include <math.h>
using namespace std;

const int m = 6000; 
const int BS = 512; // threads per block
const int NB = ceil((float) m / BS); // number of blocks

__global__ void calc_v_m(float *vec, float *mat, float *prod, int m)  
{
    int x = blockDim.x * blockIdx.x + threadIdx.x;
    if(x < m)
    {
        prod[x] = 0;
        for(int i = 0; i < m; i++)
        {
            int offset = i*m + x;
            if( mat[offset] != 0 && vec[i] != 0 )       
                prod[x] += vec[i] * mat[i*m+x];
        }
    }
}

int main()
{
timespec blas_start, blas_end, cublas_start, cublas_end, opt_start, opt_end;
long totalnsec; //total nano sec
double totalsec, totaltime;
int i, j;

float *A = new float[m]; // 1 x m
float *B = new float[m*m]; // m x m
float *C = new float[m]; // 1 x m

float input;
cout<<"Enter a value to populate the vector (0 to make it sparse) ";
cin>>input;

// input vector A: every 600th element is non-zero, i.e. more than 90% zeros
for(i = 0; i < m; i++)
{       
    A[i] = input;
    if( i % 600 == 0)    //adjust for sparsity
            A[i] = i;
}

// input matrix B: identity matrix
for(i = 0; i < m; i++)
        for(j = 0; j < m; j++)
                B[j*m + i] = (i==j);

//blas on host
clock_gettime(CLOCK_REALTIME, &blas_start);
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, 1, m, m, 1.0f, A, m, B, m, 0.0f, C, m);
//cblas_sgemv(CblasRowMajor, CblasTrans, m, m, 1.0f, B, m, A, 1, 0.0f, C, 1);
clock_gettime(CLOCK_REALTIME, &blas_end);

/* for(i = 0; i < m; i++) printf("%f ", C[i]); */

//cublas section
cudaError_t cudaStat;   
cublasHandle_t handle;
cublasCreate(&handle);
float *A_d, *B_d, *C_d;

cudaStat = cudaMalloc(&A_d, sizeof(float)*m);
if(cudaStat != cudaSuccess) printf("Error Allocating Memory for A_d\n");

cudaStat = cudaMalloc(&B_d, sizeof(float)*m*m);
if(cudaStat != cudaSuccess) printf("Error Allocating Memory for B_d\n");

cudaStat = cudaMalloc(&C_d, sizeof(float)*m);
if(cudaStat != cudaSuccess) printf("Error Allocating Memory for C_d\n");

cudaMemcpy(A_d, A, sizeof(float)*m, cudaMemcpyHostToDevice);
cudaMemcpy(B_d, B, sizeof(float)*m*m, cudaMemcpyHostToDevice);

float alpha = 1.0f, beta = 0.0f;

cudaDeviceSynchronize();
clock_gettime(CLOCK_REALTIME, &cublas_start);
cublasSgemv(handle, CUBLAS_OP_N, m, m, &alpha, B_d, m, A_d, 1, &beta, C_d, 1);
cudaDeviceSynchronize();
clock_gettime(CLOCK_REALTIME, &cublas_end);

cudaMemcpy(C, C_d, sizeof(float)*m, cudaMemcpyDeviceToHost);    
/* for(i = 0; i < m; i++) printf("%f ", C[i]); */ 

// Call kernel having Optimization for Zeros
cudaDeviceSynchronize();
clock_gettime(CLOCK_REALTIME, &opt_start);
/////////////////// call kernel //////////////////
calc_v_m<<<NB, BS>>>(A_d, B_d, C_d,  m);
//////////////////////////////////////////////////
cudaDeviceSynchronize();
clock_gettime(CLOCK_REALTIME, &opt_end);

cudaMemcpy(C, C_d, sizeof(float)*m, cudaMemcpyDeviceToHost);    
/*for(i = 0; i < m; i++) printf("%f ", C[i]); */

// Print times
// blas time
totalsec = (double)blas_end.tv_sec - (double)blas_start.tv_sec;
totalnsec = blas_end.tv_nsec - blas_start.tv_nsec;
if(totalnsec < 0)
{
    totalnsec += 1e9;
    totalsec -= 1;
}   
totaltime = totalsec + (double)totalnsec*1e-9;
cout<<"blas Time = "<< totaltime << "\n";

//cublas time
totalsec = (double)cublas_end.tv_sec - (double)cublas_start.tv_sec;
totalnsec = cublas_end.tv_nsec - cublas_start.tv_nsec;
if(totalnsec < 0)
{
    totalnsec += 1e9;
    totalsec -= 1;
}
totaltime = totalsec + (double)totalnsec*1e-9;
cout<<"cublas Time = "<< totaltime << "\n";

//Optimized Kernel Time
totalsec = (double)opt_end.tv_sec - (double)opt_start.tv_sec;
totalnsec = opt_end.tv_nsec - opt_start.tv_nsec;
if(totalnsec < 0)
{
     totalnsec += 1e9;
     totalsec -= 1;
}
totaltime = totalsec + (double)totalnsec*1e-9;
cout<<"Opt Kernel Time = "<< totaltime << "\n";

return 0;
}

Results

$ nvcc -arch=sm_12 blascomp.cu -o blascomp.o -lblas -lcublas
$ ./blascomp.o
Enter a value to populate the vector (0 to make it sparse) 0
blas Time = 0.000105207
cublas Time = 0.0070294
Opt Kernel Time = 0.00642797

At least on my system, BLAS on the host is still the fastest for this case.

Things get even more interesting if every 1200th element, rather than every 600th, is set to 0:

Enter a value to populate the vector (0 to make it sparse) 0
blas Time = 7.84e-05
cublas Time = 0.00698783
Opt Kernel Time = 0.00643042

1 Answer


    The important thing to recognise here is that the gemv operation you are concerned with is fundamentally memory-throughput bound on the GPU, not compute-throughput bound. That means an "optimisation" of the kind you have shown in your kernel:

    __global__ void calc_v_m(float *vec, float *mat, float *prod, int m)  
    {
        int x = blockDim.x * blockIdx.x + threadIdx.x;
        if(x < m)
        {
            prod[x] = 0;
            for(int i = 0; i < m; i++)
            {
                int offset = i*m + x;
                if( mat[offset] != 0 && vec[i] != 0 )       
                    prod[x] += vec[i] * mat[i*m+x];
            }
        }
    }
    

    is not really an optimisation at all, simply because the memory transactions are the performance bottleneck in the kernel, not the floating-point arithmetic, and your code must perform most of those memory transactions regardless of whether the zero check lets it skip the multiply-add or not.
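    A rough back-of-envelope makes the point concrete. The bandwidth and FLOP figures below are assumed round numbers for a mid-range desktop GPU, not measurements of either card discussed here, and the small host program is only a sketch of the arithmetic:

    #include <cstdio>

    int main()
    {
        const long long m = 6000;
        // Bytes that must cross the memory bus regardless of any zero check:
        // the whole m x m matrix, plus the vector and the result.
        long long bytes = m * m * sizeof(float)   // matrix reads
                        + m * sizeof(float)       // vector reads (ignoring caching)
                        + m * sizeof(float);      // result writes
        // FLOPs if every multiply-add were executed.
        long long flops = 2LL * m * m;

        // Illustrative, assumed device figures.
        double bw_gbs = 200.0;   // memory bandwidth, GB/s
        double gflops = 3000.0;  // single precision peak, GFLOP/s

        printf("memory traffic: %.0f MB    -> at least %.2f ms at %.0f GB/s\n",
               bytes / 1e6, bytes / (bw_gbs * 1e9) * 1e3, bw_gbs);
        printf("arithmetic    : %.0f MFLOP -> at least %.3f ms at %.0f GFLOP/s\n",
               flops / 1e6, flops / (gflops * 1e9) * 1e3, gflops);
        return 0;
    }

    Even with generous assumptions, the time needed just to stream the matrix dwarfs the time needed for the multiply-adds, so skipping some of the arithmetic cannot buy much.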

    Consider the following, instrumented version of roughly the same code:

    __constant__ float cvec1[2];
    __global__ void 
    __launch_bounds__(512,4)    
    calc_v_m1(const float* __restrict__ vec,
              const float* __restrict__ mat, 
              float* __restrict__ prod, 
              int m,
              int do_reads = 1,
              int do_write = 1)
    {
        int x = blockDim.x * blockIdx.x + threadIdx.x;
        if(x < m)
        {
            float res = 0;
            float mval = cvec1[0], vval = cvec1[1];
    #pragma unroll 8
            for(int i = 0; i < m; i++)
            {
                int offset = i*m + x;
                if (do_reads) {
                    mval = mat[offset];
                    vval = vec[i];
                } 
                res += mval * vval;
            }
            if (do_write) prod[x] = res;
        }
    }
    

    Here I have added two optional arguments which control whether the kernel loads from global memory and whether it stores to global memory. This makes it possible to quantify independently the performance impact of the memory loads, the arithmetic, and the memory stores. The results with your test code are instructive (a sketch of how these variants might be launched and timed follows the table):

    Function                            nvprof time
    -----------------------------------------------
    cublasSgemv                         942.75us
    calc_v_m                            2798.4us
    calc_v_m1(do_reads=1, do_write=1)   962.40us
    calc_v_m1(do_reads=1, do_write=0)   970.40us
    calc_v_m1(do_reads=0, do_write=1)   55.166us
    calc_v_m1(do_reads=0, do_write=0)   55.102us
    

    [All benchmarks were run on a GTX 970 using the CUDA 7.5 release toolchain and the CUBLAS 7.5 library]
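    For reference, this is a minimal sketch of how the four variants in the table might be launched and timed with CUDA events. The grid/block sizes, the time_variant helper and the event timing are my additions; calc_v_m1 and cvec1 are assumed to be defined exactly as above, with A_d, B_d and C_d already holding the device data:

    #include <cstdio>
    #include <cuda_runtime.h>

    void time_variant(const float *A_d, const float *B_d, float *C_d,
                      int m, int do_reads, int do_write)
    {
        const int BS = 512;
        const int NB = (m + BS - 1) / BS;

        // Values used by the kernel when global loads are disabled (do_reads == 0).
        float host_cvec[2] = { 1.0f, 1.0f };
        cudaMemcpyToSymbol(cvec1, host_cvec, sizeof(host_cvec));

        cudaEvent_t start, stop;
        cudaEventCreate(&start);
        cudaEventCreate(&stop);

        cudaEventRecord(start);
        calc_v_m1<<<NB, BS>>>(A_d, B_d, C_d, m, do_reads, do_write);
        cudaEventRecord(stop);
        cudaEventSynchronize(stop);

        float ms = 0.0f;
        cudaEventElapsedTime(&ms, start, stop);
        printf("do_reads=%d do_write=%d : %.3f ms\n", do_reads, do_write, ms);

        cudaEventDestroy(start);
        cudaEventDestroy(stop);
    }

    Calling time_variant with the four (do_reads, do_write) combinations gives the same kind of breakdown as the nvprof figures in the table.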

    In no particular order:

    • The runtime of the fully instrumented kernel is within a few percent of that of the equivalent CUBLAS call

    • The memory fetches from global memory are the bottleneck

    • The actual computation in the kernel accounts for only about 5% of the kernel running time

    • The "fire-and-forget" nature of write operations in CUDA means that the latency of the writes has no significant effect on throughput.

    • Your "optimised" kernel is considerably slower than either CUBLAS or the instrumented kernel, probably because all you are introducing is branch divergence, without addressing the source of the kernel's bottleneck (the latency of the memory loads).

    The only time it would make sense to conditionally execute the FMAD operations is on an architecture where memory has near-zero latency and floating-point throughput is severely constrained, and GPUs definitely do not fall into that category.

    The only other option for optimising this is to exploit a priori information about the sparsity pattern of the LHS matrix to remove the need to read zero entries, which is precisely what sparse matrix formats and sparse linear algebra codes are designed for.
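    To illustrate that last point, here is a minimal sketch of the same product done through cuSPARSE with the matrix in CSR form, using the legacy cusparseScsrmv interface that shipped with the CUDA 5-10 toolkits (it was removed in CUDA 11). The helper name and the assumption that the CSR arrays (nnz, csrVal, csrRowPtr, csrColInd) have already been built, for example with cusparseSdense2csr, are mine:

    #include <cuda_runtime.h>
    #include <cusparse_v2.h>

    // Computes prod = vec * mat (equivalently mat^T * vec), with mat stored on
    // the device in CSR form. The CSR arrays are assumed to exist already.
    void spmv_vec_mat(cusparseHandle_t handle, int m, int nnz,
                      const float *csrVal, const int *csrRowPtr,
                      const int *csrColInd,
                      const float *vec_d, float *prod_d)
    {
        cusparseMatDescr_t descr;
        cusparseCreateMatDescr(&descr);
        cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL);
        cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO);

        const float alpha = 1.0f, beta = 0.0f;

        // vec * mat == mat^T * vec, hence the transpose operation. Only the
        // stored non-zeros are ever read, which is where the win over the
        // dense kernel comes from at >90% sparsity.
        cusparseScsrmv(handle, CUSPARSE_OPERATION_TRANSPOSE,
                       m, m, nnz, &alpha, descr,
                       csrVal, csrRowPtr, csrColInd,
                       vec_d, &beta, prod_d);

        cusparseDestroyMatDescr(descr);
    }

    Because only the nnz stored entries are read, the memory traffic, and therefore the runtime, scales with the number of non-zeros rather than with m*m, which is exactly the property the dense kernel cannot have.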
