
How to configure the cublas{t}symm() function arguments

This function performs symmetric matrix-matrix multiplication using CUDA. Although I have successfully used the non-symmetric version, cublas{t}gemm(), I cannot use the cublas{t}symm() function correctly.

I know that the CUBLAS library uses column-major matrix storage. I am using row-major C/C++ matrices, and I know how to work around this for cublas{t}gemm() by swapping the input matrices and so on, but I could not work it out for the symmetric case. The problem is that I get unexpected results even when I use column-major matrix storage. The matrices contain complex floats (cuComplex). I assume I have row-major matrices. Here are the code and the output:
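
For context, here is a minimal sketch of that cublas{t}gemm() workaround (names are illustrative; handle, alpha, beta and the device pointers are assumed to be set up as in the code below):

// Row-major C(m x n) = A(m x k) * B(k x n).
// cuBLAS is column-major, so the same buffers read column-major hold
// A^T (k x m), B^T (n x k) and C^T (n x m). Since C^T = B^T * A^T,
// pass B as the first operand and A as the second:
cublasCgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
            n, m, k,       // result C^T is (n x m)
            &alpha,
            d_B, n,        // B^T, leading dimension n
            d_A, k,        // A^T, leading dimension k
            &beta,
            d_C, n);       // C^T, leading dimension n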

// Matrix multiplication: C = A * B.
// Host code.
//

// Utilities and system includes
#include <assert.h>
#include <helper_string.h>  // helper for shared functions common to CUDA SDK samples

// CUDA runtime
#include <cuda_runtime.h>
#include <cublas_v2.h>

#ifndef min
#define min(a,b) ((a < b) ? a : b)
#endif
#ifndef max
#define max(a,b) ((a > b) ? a : b)
#endif

////////////////////////////////////////////////////////////////////////////////
// These are CUDA Helper functions (in addition to helper_cuda.h)

void inline checkError(cublasStatus_t status, const char *msg)
{
    if (status != CUBLAS_STATUS_SUCCESS)
    {
        printf("%s", msg);
        exit(EXIT_FAILURE);
    }
}
// end of CUDA Helper Functions

// Allocates a matrix with random float entries.
void randomCmplxInit(cuComplex *data, int size)
{
    for (int i = 0; i < size; ++i)
        data[i] = make_cuComplex( rand() / (float)RAND_MAX, rand() / (float)RAND_MAX);
}


//void initializeCUDA(int argc, char **argv, int &devID, int &iSizeMultiple, sMatrixSize &matrix_size)
void initializeCUDA(int argc, char **argv, int &devID)
{
    // By default, we use device 0, otherwise we override the device ID based on what is provided at the command line
    cudaError_t error;
    devID = 0;
    int m,n,k;

    if (checkCmdLineFlag(argc, (const char **)argv, "device"))
    {
        devID = getCmdLineArgumentInt(argc, (const char **)argv, "device");
        error = cudaSetDevice(devID);

        if (error != cudaSuccess)
        {
            printf("cudaSetDevice returned error code %d, line(%d)\n", error, __LINE__);
            exit(EXIT_FAILURE);
        }
    }

    // get number of SMs on this GPU
    error = cudaGetDevice(&devID);
    cudaDeviceProp deviceProp;
    error = cudaGetDeviceProperties(&deviceProp, devID);

    printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor);

    // use a larger block size for Fermi and above
    int block_size = (deviceProp.major < 2) ? 16 : 32;
}

////////////////////////////////////////////////////////////////////////////////
//! Run a simple test matrix multiply using CUBLAS
////////////////////////////////////////////////////////////////////////////////
int matrixMultiply(int argc, char **argv, int devID)
{
    int i,j;
    unsigned int m,n,k; 
    cudaDeviceProp deviceProp;
    cudaError_t error;

    error = cudaGetDeviceProperties(&deviceProp, devID);

    if (error != cudaSuccess)
    {
        printf("cudaGetDeviceProperties returned error code %d, line(%d)\n", error, __LINE__);
        exit(EXIT_FAILURE);
    }

    // use a larger block size for Fermi and above
    int block_size = (deviceProp.major < 2) ? 16 : 32;

    m=3; //number of rows of matrix op(A) and C.  A--> (m x k)
    n=2; //number of columns of matrix op(B) and C. B--> (k x n)
    k=m; //number of columns of op(A) and rows of op(B). C--> (m x n)

    // I want to compute C = A*B in row-major format, 
    //so I must find C(T)=B(T)A(T) = C(T)A in column-major format 

    // allocate host memory for matrices A and B
    unsigned int size_A = m*(m+1)/2; //size of a symmetric matrix 
    unsigned int mem_size_A = sizeof(cuComplex) * size_A;
    cuComplex *h_A = (cuComplex *)malloc(mem_size_A);

    unsigned int size_B = m*n;
    unsigned int mem_size_B = sizeof(cuComplex) * size_B;
    cuComplex *h_B = (cuComplex *)malloc(mem_size_B);

    // initialize host memory
    for (i = 0; i < size_A; ++i)
    h_A[i] = make_cuComplex( (float)(i+1),(float)0);

    for (i = 0; i < size_B; ++i)
    h_B[i] = make_cuComplex((float)(i+2), (float)0);

    // allocate device memory
    cuComplex *d_A, *d_B, *d_C;
    unsigned int size_C = m*n;
    unsigned int mem_size_C = sizeof(cuComplex) * size_C;

    // allocate host memory for the result
    cuComplex *h_C      = (cuComplex *) malloc(mem_size_C);
    cuComplex *h_CUBLAS = (cuComplex *) malloc(mem_size_C);

    error = cudaMalloc((void **) &d_A, mem_size_A);
    error = cudaMalloc((void **) &d_B, mem_size_B);

    // copy host memory to device
    error = cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice);
    error = cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice);
    error = cudaMalloc((void **) &d_C, mem_size_C);

    // setup execution parameters
    dim3 threads(block_size, block_size);
    dim3 grid(n / threads.x, m / threads.y);

    // create and start timer
    printf("Computing result using CUBLAS...");

    // CUBLAS version 2.0
    {
        cublasHandle_t handle;
        cublasStatus_t ret;

        ret = cublasCreate(&handle);

        if (ret != CUBLAS_STATUS_SUCCESS)
        {
            printf("cublasCreate returned error code %d, line(%d)\n", ret, __LINE__);
            exit(EXIT_FAILURE);
        }

        const cuComplex alpha = make_cuComplex(1.0f,0.0f);
        const cuComplex beta  = make_cuComplex(0.0f,0.0f);
        //Perform operation with cublas
        ret = cublasCsymm(handle, CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_UPPER, n,m,&alpha,d_A,m,d_B,m,&beta,d_C,m);

        // copy result from device to host
        error = cudaMemcpy(h_CUBLAS, d_C, mem_size_C, cudaMemcpyDeviceToHost);

        checkError(cublasDestroy(handle), "cublasDestroy() error!\n");
    }


    printf ("\nComputations completed.\n\n");

    printf (" symm matrix A: \n");
    int s=0;
    for (i=0; i<min(m,4); i++) {
      for (j=0; j<=i; j++) {
        //printf ("%7.5G + j(%7.5G)", h_A[j+i*k].x,h_A[j+i*k].y);
        printf ("%7.5G", h_A[s].x); 
        s++;
      }
      printf ("\n");
    }

    printf ("\n matrix B: \n");
    for (i=0; i<min(k,4); i++) {
      for (j=0; j<min(n,4); j++) {
        //printf ("%7.5G + j(%7.5G)", h_B[j+i*n].x,h_B[j+i*n].y);
          printf ("%7.5G", h_B[j+i*n].x);
      }
      printf ("\n");
    }

    printf ("\n matrix C=A*B: \n");
    for (i=0; i<min(m,4); i++) {
      for (j=0; j<min(n,4); j++) {
        //printf ("%7.5G + j(%7.5G)", h_CUBLAS[j+i*n].x,h_CUBLAS[j+i*n].y);
          printf ("%7.5G", h_CUBLAS[j+i*n].x);
      }
      printf ("\n");
    }

    // clean up memory
    free(h_A);
    free(h_B);
    free(h_C);
    //free(reference);
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    cudaDeviceReset();
}

////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    printf("[Matrix Multiply CUBLAS] - Starting...\n");
    int devID = 0, sizeMult = 5;
    initializeCUDA(argc, argv, devID);
    int matrix_result = matrixMultiply(argc, argv, devID);
}

I think I have the following matrices for the multiplication:

A =

1     2     4
2     3     5
4     5     6

B =

2     3
4     5
6     7

and expect to obtain

A * B =

34    41
46    56
64    79

but the OUTPUT obtained is as follows:

symm matrix A:    
     1     
     2     3
     4     5     6

matrix B:
     2     3
     4     5
     6     7

matrix C=A*B:
     78     90
     74     97
    114    146

What am I missing in this code? Probably the arguments of the cublasCsymm function are wrong.

Thanks, Kagan

1 Answer


    EDIT: Based on the questions raised below, I chose to rework my answer and example code. You can handle row-major storage without transposition, at least for these operations. This observation is further helped by the fact that the symm function does not use packed storage.

    So to answer the additional questions:

    • The cublasCsymm function does not use a packed storage format (as some other functions such as cublasCspmv do), because the cublasCsymm function is intended to duplicate the functionality of the corresponding netlib function, which also does not use a packed storage format. Based on my review of the cublas API, I don't see a symmetric packed-storage matrix-matrix multiplication function available.

    • You can use row-major storage (e.g. C-style) with cublas, without transposing, at least for these operations (matrix-matrix multiplication without packed storage), following the advice given here; the key call is sketched right after this list.
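
    To make that concrete, here is the key call extracted from the reworked code below: for row-major C = A*B with A symmetric, the column-major view computes C(T) = B(T)*A, since A(T) = A. That moves A to the RIGHT side, and the row-major upper triangle reads as a column-major lower triangle:

    // Row-major C(m x n) = A(m x m, symmetric) * B(m x n).
    // Column-major view: C^T (n x m) = B^T (n x m) * A, hence side = RIGHT,
    // and the row-major UPPER fill reads as a column-major LOWER fill.
    ret = cublasCsymm(handle, CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_LOWER,
                      n, m,         // dimensions of C^T
                      &alpha,
                      d_A, m,       // symmetric A, lda = m
                      d_B, n,       // B^T, ldb = n
                      &beta,
                      d_C, n);      // C^T, ldc = n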

    Here is a reworked version of my previous example that incorporates the information in item 2 above.

    // Matrix multiplication: C = A * B.
    // Host code.
    //
    
    // Utilities and system includes
    #include <assert.h>
    #include <helper_string.h>  // helper for shared functions common to CUDA SDK samples
    
    // CUDA runtime
    #include <cuda_runtime.h>
    #include <cublas_v2.h>
    
    // error check macros
    #define cudaCheckErrors(msg) \
        do { \
            cudaError_t __err = cudaGetLastError(); \
            if (__err != cudaSuccess) { \
                fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                    msg, cudaGetErrorString(__err), \
                    __FILE__, __LINE__); \
                fprintf(stderr, "*** FAILED - ABORTING\n"); \
                exit(1); \
            } \
        } while (0)
    
    // for CUBLAS V2 API
    #define cublasCheckErrors(fn) \
        do { \
            cublasStatus_t __err = fn; \
            if (__err != CUBLAS_STATUS_SUCCESS) { \
                fprintf(stderr, "Fatal cublas error: %d (at %s:%d)\n", \
                    (int)(__err), \
                    __FILE__, __LINE__); \
                fprintf(stderr, "*** FAILED - ABORTING\n"); \
                exit(1); \
            } \
        } while (0)
    
    
    
    #ifndef min
    #define min(a,b) ((a < b) ? a : b)
    #endif
    #ifndef max
    #define max(a,b) ((a > b) ? a : b)
    #endif
    
    ////////////////////////////////////////////////////////////////////////////////
    // These are CUDA Helper functions (in addition to helper_cuda.h)
    
    void inline checkError(cublasStatus_t status, const char *msg)
    {
        if (status != CUBLAS_STATUS_SUCCESS)
        {
            printf("%s", msg);
            exit(EXIT_FAILURE);
        }
    }
    // end of CUDA Helper Functions
    
    // Allocates a matrix with random float entries.
    void randomCmplxInit(cuComplex *data, int size)
    {
        for (int i = 0; i < size; ++i)
            data[i] = make_cuComplex( rand() / (float)RAND_MAX, rand() / (float)RAND_MAX);
    }
    
    
    //void initializeCUDA(int argc, char **argv, int &devID, int &iSizeMultiple, sMatrixSize &matrix_size)
    void initializeCUDA(int argc, char **argv, int &devID)
    {
        // By default, we use device 0, otherwise we override the device ID based on what is provided at the command line
        cudaError_t error;
        devID = 0;
    
        if (checkCmdLineFlag(argc, (const char **)argv, "device"))
        {
            devID = getCmdLineArgumentInt(argc, (const char **)argv, "device");
            error = cudaSetDevice(devID);
    
            if (error != cudaSuccess)
            {
                printf("cudaSetDevice returned error code %d, line(%d)\n", error, __
    LINE__);
                exit(EXIT_FAILURE);
            }
        }
    
        // get number of SMs on this GPU
        error = cudaGetDevice(&devID);
        cudaDeviceProp deviceProp;
        error = cudaGetDeviceProperties(&deviceProp, devID);
    
        printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, dev
    iceProp.name, deviceProp.major, deviceProp.minor);
    
    }
    
    
    ////////////////////////////////////////////////////////////////////////////////
    //! Run a simple test matrix multiply using CUBLAS
    ////////////////////////////////////////////////////////////////////////////////
    int matrixMultiply(int argc, char **argv, int devID)
    {
        int i,j;
        unsigned int m,n,k;
        cudaDeviceProp deviceProp;
        cudaError_t error;
    
        error = cudaGetDeviceProperties(&deviceProp, devID);
    
        if (error != cudaSuccess)
        {
            printf("cudaGetDeviceProperties returned error code %d, line(%d)\n", error, __LINE__);
            exit(EXIT_FAILURE);
        }
    
        // use a larger block size for Fermi and above
    
        m=3; //number of rows of matrix op(A) and C.  A--> (m x k)
        n=2; //number of columns of matrix op(B) and C. B--> (k x n)
        k=m; //number of columns of op(A) and rows of op(B). C--> (m x n)
    
        // I want to compute C = A*B in row-major format,
        //so I must find C(T)=B(T)A(T) = C(T)A in column-major format
    
        // allocate host memory for matrices A and B
        unsigned int size_A = m*m; //size of a symmetric matrix
        printf("size_A = %d\n", size_A);
        unsigned int mem_size_A = sizeof(cuComplex) * size_A;
        cuComplex *h_A = (cuComplex *)malloc(mem_size_A);
    
        unsigned int size_B = m*n;
        unsigned int mem_size_B = sizeof(cuComplex) * size_B;
        cuComplex *h_B = (cuComplex *)malloc(mem_size_B);
    
        // initialize host memory
    //    for (i = 0; i < size_A; ++i)
    //    h_A[i] = make_cuComplex( (float)(i+1),(float)0);
        h_A[0] = make_cuComplex((float)1, (float)0);
        h_A[1] = make_cuComplex((float)2, (float)0);
        h_A[2] = make_cuComplex((float)4, (float)0);
        h_A[3] = make_cuComplex((float)0, (float)0);
        h_A[4] = make_cuComplex((float)3, (float)0);
        h_A[5] = make_cuComplex((float)5, (float)0);
        h_A[6] = make_cuComplex((float)0, (float)0);
        h_A[7] = make_cuComplex((float)0, (float)0);
        h_A[8] = make_cuComplex((float)6, (float)0);
    
    //    for (i = 0; i < size_B; ++i)
    //    h_B[i] = make_cuComplex((float)(i+2), (float)0);
        h_B[0] = make_cuComplex((float)2, (float)0);
        h_B[1] = make_cuComplex((float)3, (float)0);
        h_B[2] = make_cuComplex((float)4, (float)0);
        h_B[3] = make_cuComplex((float)5, (float)0);
        h_B[4] = make_cuComplex((float)6, (float)0);
        h_B[5] = make_cuComplex((float)7, (float)0);
    
        // allocate device memory
        cuComplex *d_A, *d_B, *d_C;
        unsigned int size_C = m*n;
        unsigned int mem_size_C = sizeof(cuComplex) * size_C;
    
        // allocate host memory for the result
        cuComplex *h_C      = (cuComplex *) malloc(mem_size_C);
        cuComplex *h_CUBLAS = (cuComplex *) malloc(mem_size_C);
    
        error = cudaMalloc((void **) &d_A, mem_size_A);
        error = cudaMalloc((void **) &d_B, mem_size_B);
    
        // copy host memory to device
        error = cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice);
        error = cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice);
        error = cudaMalloc((void **) &d_C, mem_size_C);
    
    
        // create and start timer
        printf("Computing result using CUBLAS...");
    
        // CUBLAS version 2.0
        {
            cublasHandle_t handle;
            cublasStatus_t ret;
    
            ret = cublasCreate(&handle);
    
            if (ret != CUBLAS_STATUS_SUCCESS)
            {
                printf("cublasCreate returned error code %d, line(%d)\n", ret, __LINE__);
                exit(EXIT_FAILURE);
            }
    
            const cuComplex alpha = make_cuComplex(1.0f,0.0f);
            const cuComplex beta  = make_cuComplex(0.0f,0.0f);
            //Perform operation with cublas
            ret = cublasCsymm(handle, CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_LOWER, n,m,&alpha,d_A,m,d_B,n,&beta,d_C,n);
    
            if (ret != CUBLAS_STATUS_SUCCESS)
            {
                printf("cublasCsymm returned error code %d, line(%d)\n", ret, __LINE__);
                exit(EXIT_FAILURE);
            }
    
            // copy result from device to host
            error = cudaMemcpy(h_CUBLAS, d_C, mem_size_C, cudaMemcpyDeviceToHost);
    
            checkError(cublasDestroy(handle), "cublasDestroy() error!\n");
        }
    
    
        printf ("\nComputations completed.\n\n");
    
        printf (" symm matrix A: \n");
    //    int s=0;
        for (i=0; i<min(m,4); i++) {
          for (j=0; j<min(m,4); j++) {
            //printf ("%7.5G + j(%7.5G)", h_A[j+i*k].x,h_A[j+i*k].y);
    //        printf ("%7.5G", h_A[s].x);
            printf ("%7.5G", h_A[j+(i*m)].x);
    //        s++;
          }
          printf ("\n");
        }
    
        printf ("\n matrix B: \n");
        for (i=0; i<min(k,4); i++) {
          for (j=0; j<min(n,4); j++) {
            //printf ("%7.5G + j(%7.5G)", h_B[j+i*n].x,h_B[j+i*n].y);
              printf ("%7.5G", h_B[j+(i*n)].x);
          }
          printf ("\n");
        }
    
        printf ("\n matrix C=A*B: \n");
        for (i=0; i<min(m,4); i++) {
          for (j=0; j<min(n,4); j++) {
            //printf ("%7.5G + j(%7.5G)", h_CUBLAS[j+i*n].x,h_CUBLAS[j+i*n].y);
              printf ("%7.5G", h_CUBLAS[j+(i*n)].x);
          }
          printf ("\n");
        }
    
        // clean up memory
        free(h_A);
        free(h_B);
        free(h_C);
        //free(reference);
        cudaFree(d_A);
        cudaFree(d_B);
        cudaFree(d_C);
    
        cudaDeviceReset();
        return 0;
    }
    
    ////////////////////////////////////////////////////////////////////////////////
    // Program main
    ////////////////////////////////////////////////////////////////////////////////
    int main(int argc, char **argv)
    {
        printf("[Matrix Multiply CUBLAS] - Starting...\n");
        int devID = 0;
        initializeCUDA(argc, argv, devID);
        int matrix_result = matrixMultiply(argc, argv, devID);
        cudaCheckErrors("some error");
        return 0;
    }
    $ ./t213
    [Matrix Multiply CUBLAS] - Starting...
    GPU Device 0: "Tesla M2070" with compute capability 2.0
    
    size_A = 9
    Computing result using CUBLAS...
    Computations completed.
    
     symm matrix A:
          1      2      4
          0      3      5
          0      0      6
    
     matrix B:
          2      3
          4      5
          6      7
    
     matrix C=A*B:
         34     41
         46     56
         64     79
    $
    

    Original response:

    Several problems:

    • When I run your code as you have it posted right now, I don't get the results you show. Here's what I get:
    [Matrix Multiply CUBLAS] - Starting...
    GPU Device 0: "Tesla M2070" with compute capability 2.0
    
    Computing result using CUBLAS...
    Computations completed.
    
    symm matrix A:
      1
      2      3
      4      5      6
    
    matrix B:
      2      3
      4      5
      6      7
    
    matrix C=A*B:
      -131   -128
       260   -122
      -115    266
    

    The code compiles with a number of warnings, and you're not doing proper error checking (for example, you're not checking the return value of cublasCsymm).

    • You want to multiply C = A*B, which means A is on the LEFT, but you are passing CUBLAS_SIDE_RIGHT to cublasCsymm. Several other cublasCsymm parameters were wrong as well. I think perhaps you thought you could compute A*B as (B(T)*A(T)), but that only works for square matrices; I'm not sure exactly what you were thinking.

    • You have row-major storage in your matrices and are passing them to cublas, which interprets them in column-major order. For the following matrix:

    1 2
    3 4
    

    row-major storage looks like this:

    1 2 3 4
    

    column-major storage looks like this:

    1 3 2 4
    

    If you wish, you can transpose these matrices using cublasCgeam (a sketch follows this list), or you can manually modify your storage.

    • You are making some sort of assumption about some kind of packed storage format for the symmetric matrix A, which is not correct. Read the definition of the storage type carefully. It does not say that a portion of the matrix is "supplied" or "present"; it says which portion of the matrix is filled. A brief illustration follows this list.
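
    To illustrate item 4: the symmetric A must be allocated as a full m*m array; with CUBLAS_FILL_MODE_LOWER (column-major) only the lower triangle is ever referenced, but the upper entries still occupy storage. A minimal sketch (plain floats for brevity):

    // Full 3x3 column-major storage of the symmetric A, column by column.
    // Entries above the diagonal are never referenced with
    // CUBLAS_FILL_MODE_LOWER, but must still be allocated:
    float A_colmajor[9] = { 1, 2, 4,     // column 0
                            0, 3, 5,     // column 1 (index 3 unused)
                            0, 0, 6 };   // column 2 (indices 6, 7 unused)

    And here is the cublasCgeam transpose sketch promised in item 3 (rows/cols describe the row-major source; d_src and d_dst are illustrative device pointers, and d_dst must be a distinct buffer):

    cuComplex one  = make_cuComplex(1.0f, 0.0f);
    cuComplex zero = make_cuComplex(0.0f, 0.0f);
    // The row-major (rows x cols) source reads column-major as (cols x rows);
    // transposing yields the column-major (rows x cols) matrix cuBLAS expects.
    cublasCgeam(handle, CUBLAS_OP_T, CUBLAS_OP_N,
                rows, cols,
                &one,  d_src, cols,
                &zero, d_dst, rows,
                d_dst, rows);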

    Here is a complete code that has the above problems fixed:

    // Matrix multiplication: C = A * B.
    // Host code.
    //
    
    // Utilities and system includes
    #include <assert.h>
    #include <helper_string.h>  // helper for shared functions common to CUDA SDK samples
    
    // CUDA runtime
    #include <cuda_runtime.h>
    #include <cublas_v2.h>
    
    // error check macros
    #define cudaCheckErrors(msg) \
        do { \
            cudaError_t __err = cudaGetLastError(); \
            if (__err != cudaSuccess) { \
                fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                    msg, cudaGetErrorString(__err), \
                    __FILE__, __LINE__); \
                fprintf(stderr, "*** FAILED - ABORTING\n"); \
                exit(1); \
            } \
        } while (0)
    
    // for CUBLAS V2 API
    #define cublasCheckErrors(fn) \
        do { \
            cublasStatus_t __err = fn; \
            if (__err != CUBLAS_STATUS_SUCCESS) { \
                fprintf(stderr, "Fatal cublas error: %d (at %s:%d)\n", \
                    (int)(__err), \
                    __FILE__, __LINE__); \
                fprintf(stderr, "*** FAILED - ABORTING\n"); \
                exit(1); \
            } \
        } while (0)
    
    
    
    #ifndef min
    #define min(a,b) ((a < b) ? a : b)
    #endif
    #ifndef max
    #define max(a,b) ((a > b) ? a : b)
    #endif
    
    ////////////////////////////////////////////////////////////////////////////////
    // These are CUDA Helper functions (in addition to helper_cuda.h)
    
    
    void inline checkError(cublasStatus_t status, const char *msg)
    {
        if (status != CUBLAS_STATUS_SUCCESS)
        {
            printf("%s", msg);
            exit(EXIT_FAILURE);
        }
    }
    // end of CUDA Helper Functions
    
    // Allocates a matrix with random float entries.
    void randomCmplxInit(cuComplex *data, int size)
    {
        for (int i = 0; i < size; ++i)
            data[i] = make_cuComplex( rand() / (float)RAND_MAX, rand() / (float)RAND_MAX);
    }
    
    
    //void initializeCUDA(int argc, char **argv, int &devID, int &iSizeMultiple, sMatrixSize &matrix_size)
    void initializeCUDA(int argc, char **argv, int &devID)
    {
        // By default, we use device 0, otherwise we override the device ID based on what is provided at the command line
        cudaError_t error;
        devID = 0;
    
        if (checkCmdLineFlag(argc, (const char **)argv, "device"))
        {
            devID = getCmdLineArgumentInt(argc, (const char **)argv, "device");
            error = cudaSetDevice(devID);
    
            if (error != cudaSuccess)
            {
                printf("cudaSetDevice returned error code %d, line(%d)\n", error, __LINE__);
                exit(EXIT_FAILURE);
            }
        }
    
        // get number of SMs on this GPU
        error = cudaGetDevice(&devID);
        cudaDeviceProp deviceProp;
        error = cudaGetDeviceProperties(&deviceProp, devID);
    
        printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor);
    
    }
    
    ////////////////////////////////////////////////////////////////////////////////
    //! Run a simple test matrix multiply using CUBLAS
    ////////////////////////////////////////////////////////////////////////////////
    int matrixMultiply(int argc, char **argv, int devID)
    {
        int i,j;
        unsigned int m,n,k;
        cudaDeviceProp deviceProp;
        cudaError_t error;
    
        error = cudaGetDeviceProperties(&deviceProp, devID);
    
        if (error != cudaSuccess)
        {
            printf("cudaGetDeviceProperties returned error code %d, line(%d)\n", error, __LINE__);
            exit(EXIT_FAILURE);
        }
    
        // use a larger block size for Fermi and above
    
        m=3; //number of rows of matrix op(A) and C.  A--> (m x k)
        n=2; //number of columns of matrix op(B) and C. B--> (k x n)
        k=m; //number of columns of op(A) and rows of op(B). C--> (m x n)
    
        // I want to compute C = A*B in row-major format,
        //so I must find C(T)=B(T)A(T) = C(T)A in column-major format
    
        // allocate host memory for matrices A and B
        unsigned int size_A = m*m; //size of a symmetric matrix
        printf("size_A = %d\n", size_A);
        unsigned int mem_size_A = sizeof(cuComplex) * size_A;
        cuComplex *h_A = (cuComplex *)malloc(mem_size_A);
    
        unsigned int size_B = m*n;
        unsigned int mem_size_B = sizeof(cuComplex) * size_B;
        cuComplex *h_B = (cuComplex *)malloc(mem_size_B);
    
        // initialize host memory
    //    for (i = 0; i < size_A; ++i)
    //    h_A[i] = make_cuComplex( (float)(i+1),(float)0);
        h_A[0] = make_cuComplex((float)1, (float)0);
        h_A[1] = make_cuComplex((float)2, (float)0);
        h_A[2] = make_cuComplex((float)4, (float)0);
        h_A[3] = make_cuComplex((float)0, (float)0);
        h_A[4] = make_cuComplex((float)3, (float)0);
        h_A[5] = make_cuComplex((float)5, (float)0);
        h_A[6] = make_cuComplex((float)0, (float)0);
        h_A[7] = make_cuComplex((float)0, (float)0);
        h_A[8] = make_cuComplex((float)6, (float)0);
    
    //    for (i = 0; i < size_B; ++i)
    //    h_B[i] = make_cuComplex((float)(i+2), (float)0);
        h_B[0] = make_cuComplex((float)2, (float)0);
        h_B[1] = make_cuComplex((float)4, (float)0);
        h_B[2] = make_cuComplex((float)6, (float)0);
        h_B[3] = make_cuComplex((float)3, (float)0);
        h_B[4] = make_cuComplex((float)5, (float)0);
        h_B[5] = make_cuComplex((float)7, (float)0);
    
        // allocate device memory
        cuComplex *d_A, *d_B, *d_C;
        unsigned int size_C = m*n;
        unsigned int mem_size_C = sizeof(cuComplex) * size_C;
    
        // allocate host memory for the result
        cuComplex *h_C      = (cuComplex *) malloc(mem_size_C);
        cuComplex *h_CUBLAS = (cuComplex *) malloc(mem_size_C);
    
        error = cudaMalloc((void **) &d_A, mem_size_A);
        error = cudaMalloc((void **) &d_B, mem_size_B);
    
        // copy host memory to device
        error = cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice);
        error = cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice);
        error = cudaMalloc((void **) &d_C, mem_size_C);
    
    
        // create and start timer
        printf("Computing result using CUBLAS...");
    
        // CUBLAS version 2.0
        {
            cublasHandle_t handle;
            cublasStatus_t ret;
    
            ret = cublasCreate(&handle);
    
            if (ret != CUBLAS_STATUS_SUCCESS)
            {
                printf("cublasCreate returned error code %d, line(%d)\n", ret, __LINE__);
                exit(EXIT_FAILURE);
            }
    
            const cuComplex alpha = make_cuComplex(1.0f,0.0f);
            const cuComplex beta  = make_cuComplex(0.0f,0.0f);
            //Perform operation with cublas
            ret = cublasCsymm(handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER, m,n,&alpha,d_A,m,d_B,m,&beta,d_C,m);
            if (ret != CUBLAS_STATUS_SUCCESS)
            {
                printf("cublasCsymm returned error code %d, line(%d)\n", ret, __LINE__);
                exit(EXIT_FAILURE);
            }
    

    Here is the output:

    [Matrix Multiply CUBLAS] - Starting...
    GPU Device 0: "Tesla M2070" with compute capability 2.0
    
    size_A = 9
    Computing result using CUBLAS...
    Computations completed.
    
     symm matrix A:
          1      0      0
          2      3      0
          4      5      6
    
     matrix B:
          2      3
          4      5
          6      7
    
     matrix C=A*B:
         34     41
         46     56
         64     79
    
