该函数使用CUDA执行对称矩阵 - 矩阵乘法 . 虽然,我成功地使用了非对称版本“cublas gemm()”但我无法正确使用“cublas symm()”函数 .
我知道CUBLAS库使用列主要矩阵存储 . 我正在使用行主C / C矩阵,我知道如何通过替换输入矩阵等来解决“cublas gemm()”这个问题 . 但是,我无法解决对称情况 . 问题是即使我使用列主要矩阵存储我发现了意想不到的结果 . 矩阵包含复杂的浮点数(cuComplex) . 我假设我有行主矩阵 . 这是代码和输出:
// Matrix multiplication: C = A * B.
// Host code.
//
// Utilities and system includes
#include <assert.h>
#include <helper_string.h> // helper for shared functions common to CUDA SDK samples
// CUDA runtime
#include <cuda_runtime.h>
#include <cublas_v2.h>
#ifndef min
#define min(a,b) ((a < b) ? a : b)
#endif
#ifndef max
#define max(a,b) ((a > b) ? a : b)
#endif
////////////////////////////////////////////////////////////////////////////////
// These are CUDA Helper functions (in addition to helper_cuda.h)
void inline checkError(cublasStatus_t status, const char *msg)
{
if (status != CUBLAS_STATUS_SUCCESS)
{
printf("%s", msg);
exit(EXIT_FAILURE);
}
}
// end of CUDA Helper Functions
// Allocates a matrix with random float entries.
void randomCmplxInit(cuComplex *data, int size)
{
for (int i = 0; i < size; ++i)
data[i] = make_cuComplex( rand() / (float)RAND_MAX, rand() / (float)RAND_MAX);
}
//void initializeCUDA(int argc, char **argv, int &devID, int &iSizeMultiple, sMatrixSize &matrix_size)
void initializeCUDA(int argc, char **argv, int &devID)
{
// By default, we use device 0, otherwise we override the device ID based on what is provided at the command line
cudaError_t error;
devID = 0;
int m,n,k;
if (checkCmdLineFlag(argc, (const char **)argv, "device"))
{
devID = getCmdLineArgumentInt(argc, (const char **)argv, "device");
error = cudaSetDevice(devID);
if (error != cudaSuccess)
{
printf("cudaSetDevice returned error code %d, line(%d)\n", error, __LINE__);
exit(EXIT_FAILURE);
}
}
// get number of SMs on this GPU
error = cudaGetDevice(&devID);
cudaDeviceProp deviceProp;
error = cudaGetDeviceProperties(&deviceProp, devID);
printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor);
// use a larger block size for Fermi and above
int block_size = (deviceProp.major < 2) ? 16 : 32;
}
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test matrix multiply using CUBLAS
////////////////////////////////////////////////////////////////////////////////
int matrixMultiply(int argc, char **argv, int devID)
{
int i,j;
unsigned int m,n,k;
cudaDeviceProp deviceProp;
cudaError_t error;
error = cudaGetDeviceProperties(&deviceProp, devID);
if (error != cudaSuccess)
{
printf("cudaGetDeviceProperties returned error code %d, line(%d)\n", error, __LINE__);
exit(EXIT_FAILURE);
}
// use a larger block size for Fermi and above
int block_size = (deviceProp.major < 2) ? 16 : 32;
m=3; //number of rows of matrix op(A) and C. A--> (m x k)
n=2; //number of columns of matrix op(B) and C. B--> (k x n)
k=m; //number of columns of op(A) and rows of op(B). C--> (m x n)
// I want to compute C = A*B in row-major format,
//so I must find C(T)=B(T)A(T) = C(T)A in column-major format
// allocate host memory for matrices A and B
unsigned int size_A = m*(m+1)/2; //size of a symmetric matrix
unsigned int mem_size_A = sizeof(cuComplex) * size_A;
cuComplex *h_A = (cuComplex *)malloc(mem_size_A);
unsigned int size_B = m*n;
unsigned int mem_size_B = sizeof(cuComplex) * size_B;
cuComplex *h_B = (cuComplex *)malloc(mem_size_B);
// initialize host memory
for (i = 0; i < size_A; ++i)
h_A[i] = make_cuComplex( (float)(i+1),(float)0);
for (i = 0; i < size_B; ++i)
h_B[i] = make_cuComplex((float)(i+2), (float)0);
// allocate device memory
cuComplex *d_A, *d_B, *d_C;
unsigned int size_C = m*n;
unsigned int mem_size_C = sizeof(cuComplex) * size_C;
// allocate host memory for the result
cuComplex *h_C = (cuComplex *) malloc(mem_size_C);
cuComplex *h_CUBLAS = (cuComplex *) malloc(mem_size_C);
error = cudaMalloc((void **) &d_A, mem_size_A);
error = cudaMalloc((void **) &d_B, mem_size_B);
// copy host memory to device
error = cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice);
error = cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice);
error = cudaMalloc((void **) &d_C, mem_size_C);
// setup execution parameters
dim3 threads(block_size, block_size);
dim3 grid(n / threads.x, m / threads.y);
// create and start timer
printf("Computing result using CUBLAS...");
// CUBLAS version 2.0
{
cublasHandle_t handle;
cublasStatus_t ret;
ret = cublasCreate(&handle);
if (ret != CUBLAS_STATUS_SUCCESS)
{
printf("cublasCreate returned error code %d, line(%d)\n", ret, __LINE__);
exit(EXIT_FAILURE);
}
const cuComplex alpha = make_cuComplex(1.0f,0.0f);
const cuComplex beta = make_cuComplex(0.0f,0.0f);
//Perform operation with cublas
ret = cublasCsymm(handle, CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_UPPER, n,m,&alpha,d_A,m,d_B,m,&beta,d_C,m);
// copy result from device to host
error = cudaMemcpy(h_CUBLAS, d_C, mem_size_C, cudaMemcpyDeviceToHost);
checkError(cublasDestroy(handle), "cublasDestroy() error!\n");
}
printf ("\nComputations completed.\n\n");
printf (" symm matrix A: \n");
int s=0;
for (i=0; i<min(m,4); i++) {
for (j=0; j<=i; j++) {
//printf ("%7.5G + j(%7.5G)", h_A[j+i*k].x,h_A[j+i*k].y);
printf ("%7.5G", h_A[s].x);
s++;
}
printf ("\n");
}
printf ("\n matrix B: \n");
for (i=0; i<min(k,4); i++) {
for (j=0; j<min(n,4); j++) {
//printf ("%7.5G + j(%7.5G)", h_B[j+i*n].x,h_B[j+i*n].y);
printf ("%7.5G", h_B[j+i*n].x);
}
printf ("\n");
}
printf ("\n matrix C=A*B: \n");
for (i=0; i<min(m,4); i++) {
for (j=0; j<min(n,4); j++) {
//printf ("%7.5G + j(%7.5G)", h_CUBLAS[j+i*n].x,h_CUBLAS[j+i*n].y);
printf ("%7.5G", h_CUBLAS[j+i*n].x);
}
printf ("\n");
}
// clean up memory
free(h_A);
free(h_B);
free(h_C);
//free(reference);
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
cudaDeviceReset();
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
printf("[Matrix Multiply CUBLAS] - Starting...\n");
int devID = 0, sizeMult = 5;
initializeCUDA(argc, argv, devID);
int matrix_result = matrixMultiply(argc, argv, devID);
}
我想我有乘法的下列矩阵:
A =
1 2 4
2 3 5
4 5 6
B =
2 3
4 5
6 7
并希望获得
A * B =
34 41
46 56
64 79
但获得的 OUTPUT 如下:
symm matrix A:
1
2 3
4 5 6
matrix B:
2 3
4 5
6 7
matrix C=A*B:
78 90
74 97
114 146
我在这段代码中缺少什么?可能“cublasCsymm”功能的论点是错误的 .
谢谢,卡根
1 回答
EDIT: 根据下面提出的问题,我选择重新处理我的答案和示例代码 . 您可以处理行主存储而无需至少为这些操作进行转置 .
symm
函数不使用打包存储这一事实进一步促进了这一观察 .所以回答其他问题:
cublasCsymm
函数不使用打包存储格式(例如某些其他函数,例如cublasCspmv),因为cublasCsymm
函数旨在复制相应netlib function的功能,该功能也不使用打包存储格式 . 基于我对cublas API的评论,我没有看到对称打包存储矩阵 - 矩阵乘法函数 .您可以使用带有cublas的行主存储(例如C风格),无需转置,至少对于这些操作(矩阵 - 矩阵乘法,没有打包存储),遵循给定的建议here .
以下是我上一个例子的重新编写的版本,它包含了上面第2项中的信息 .
原始回应:
几个问题:
代码编译了许多警告,而你're not doing proper error checking (for example you'也没有检查
cublasCsymm
的返回值你想要乘以C = A * B这意味着A在LEFT上,但是你将
CUBLAS_SIDE_RIGHT
传递给cublasCsymm
其他几个cublasCsymm
参数也是错误的 . 我想也许你认为你可以做A*B
(B(T)* A(T)),但这只适用于方形矩阵 . 确切地说,不确定你在想什么 .您在矩阵上有行主存储并将它们传递给cublas,它们按列主顺序解释它们 . 对于以下矩阵:
行主存储看起来像这样:
column-major存储看起来像这样:
如果您愿意,可以使用
cublasCgeam
转置这些矩阵,也可以手动修改存储 .A
的某种压缩存储格式做出某种假设,这是不正确的 . 仔细阅读storage type的定义 . 它没有说矩阵的部分是"supplied"或"present"它表示填充的矩阵部分 .这是一个完整的代码,修复了上述问题:
这是输出: