首页 文章

如何使CUDA中的矩阵列标准化并获得最大性能?

提问于
浏览
7

如何有效地规范化CUDA中的矩阵列?

我的矩阵存储在column-major中,典型大小为2000x200 .

该操作可以用以下matlab代码表示 .

A = rand(2000,200);

A = exp(A);
A = A./repmat(sum(A,1), [size(A,1) 1]);

这可以通过Thrust,cuBLAS和/或cuNPP有效地完成吗?

包括4个内核的快速实现如下所示 .

想知道这些是否可以在1或2个内核中完成以提高性能,尤其是对于由cublasDgemv()实现的列求和步骤 .

#include <cuda.h>
#include <curand.h>
#include <cublas_v2.h>
#include <thrust/device_vector.h>
#include <thrust/device_ptr.h>
#include <thrust/transform.h>
#include <thrust/iterator/constant_iterator.h>
#include <math.h>

// In-place exponential functor for thrust::for_each: value <- exp(value).
struct Exp
{
    __host__ __device__
    void operator()(double& value) const
    {
        value = exp(value);
    }
};

// In-place reciprocal functor for thrust::for_each: value <- 1 / value.
struct Inv
{
    __host__ __device__
    void operator()(double& value) const
    {
        value = 1.0 / value;
    }
};

// Benchmark driver: repeatedly normalizes the columns of a column-major
// m x n matrix A, i.e. A = exp(A); A(:,j) /= sum(A(:,j)), using
// curand (fill) + thrust (elementwise) + cuBLAS (column sums, scaling).
int main()
{
    cudaDeviceSetCacheConfig(cudaFuncCachePreferShared);
    cublasHandle_t hd;
    curandGenerator_t rng;
    cublasCreate(&hd);
    curandCreateGenerator(&rng, CURAND_RNG_PSEUDO_DEFAULT);
    // Fixed seed so repeated benchmark runs see identical input data.
    curandSetPseudoRandomGeneratorSeed(rng, 1234ULL);

    const size_t m = 2000, n = 200;   // column-major A is m x n
    const double c1 = 1.0;            // gemv alpha
    const double c0 = 0.0;            // gemv beta

    thrust::device_vector<double> A(m * n);
    thrust::device_vector<double> sum(1 * n);   // one sum per column
    // The all-ones vector is the x operand of y = A^T * x, so it only
    // needs m entries (the original allocated m*n, wasting memory).
    thrust::device_vector<double> one(m, 1.0);

    double* pA = thrust::raw_pointer_cast(&A[0]);
    double* pSum = thrust::raw_pointer_cast(&sum[0]);
    double* pOne = thrust::raw_pointer_cast(&one[0]);

    for (int i = 0; i < 100; i++)
    {
        // Fill A with uniform random numbers in (0, 1].
        curandGenerateUniformDouble(rng, pA, A.size());

        // A = exp(A), elementwise.
        thrust::for_each(A.begin(), A.end(), Exp());

        // sum = A^T * ones  ->  per-column sums (A is column-major).
        cublasDgemv(hd, CUBLAS_OP_T, m, n,
                &c1, pA, m, pOne, 1, &c0, pSum, 1);

        // sum = 1 ./ sum, so the scaling step is a multiplication.
        thrust::for_each(sum.begin(), sum.end(), Inv());

        // A = A * diag(sum): scale each column j by 1/sum(j), in place.
        cublasDdgmm(hd, CUBLAS_SIDE_RIGHT, m, n, pA, m, pSum, 1, pA, m);
    }

    // Drain all queued async work before tearing down the handles.
    cudaDeviceSynchronize();

    curandDestroyGenerator(rng);
    cublasDestroy(hd);

    return 0;
}

3 回答

  • 5

    我将M2090的3种方法的性能与CUDA 5.0进行了比较 .

    • [173.179 us] cublas实现如问题所示

    • [733.734 us]来自@talonmies的 thrust::reduce_by_key 的纯粹推力实施

    • [1.508 ms]纯推力实现 thrust::inclusive_scan_by_key

    Performance on A_{2,000 x 200}

    可以看出,

    • cublas在这种情况下表现最好;

    • thrust::reduce_by_key 和 thrust::inclusive_scan_by_key 都会启动多个内核，这会导致额外的开销；

    • 与 thrust::reduce_by_key 相比，thrust::inclusive_scan_by_key 向DRAM写入更多数据，这可能是内核时间更长的原因之一；

    • cublas和推力方法的主要性能差异是矩阵列求和 . 推力较慢可能是因为 thrust::reduce_by_key 旨在对可变长度的段进行归约，而 cublasDgemv() 只适用于固定长度的段（行/列） .

    当矩阵A足够大以忽略内核启动开销时，cublas approach仍然表现最佳 . A_{20,000 x 2,000}的分析结果如下所示 .

    Performance on A_{20,000 x 2,000}

    将第一个 for_each 操作与@talonmies指示的 cublasSgemv 调用融合可能会进一步提高性能,但我认为应该使用手写的内核而不是 thrust::reduce_by_key .

    3种方法的代码如下所示 .

    #include <cuda.h>
    #include <curand.h>
    #include <cublas_v2.h>
    #include <thrust/device_vector.h>
    #include <thrust/device_ptr.h>
    #include <thrust/transform.h>
    #include <thrust/reduce.h>
    #include <thrust/scan.h>
    #include <thrust/iterator/counting_iterator.h>
    #include <thrust/iterator/transform_iterator.h>
    #include <thrust/iterator/discard_iterator.h>
    #include <thrust/iterator/permutation_iterator.h>
    #include <math.h>
    
    // Returns exp(x); usable on host and device via transform iterators.
    struct Exp: public thrust::unary_function<double, double>
    {
        __host__ __device__ double operator()(double x)
        {
            double y = exp(x);
            return y;
        }
    };
    
    // Returns the reciprocal 1/x; usable on host and device.
    struct Inv: public thrust::unary_function<double, double>
    {
        __host__ __device__ double operator()(double x)
        {
            return 1.0 / x;
        }
    };
    
    // Multiplies its argument by the fixed constant captured at
    // construction time: operator()(x) == x * C.
    template<typename T>
    struct MulC: public thrust::unary_function<T, T>
    {
        T C;

        __host__ __device__ MulC(T c) : C(c) {}

        __host__ __device__ T operator()(T x)
        {
            return x * C;
        }
    };
    
    // Maps a flat (linear, column-major) element index to its column
    // index: operator()(i) == i / C, where C is the column length m.
    template<typename T>
    struct line2col: public thrust::unary_function<T, T>
    {
        T C;

        __host__ __device__ line2col(T C) : C(C) {}

        __host__ __device__ T operator()(T i)
        {
            return i / C;
        }
    };
    
    // Benchmark: three implementations of column normalization
    // (B = exp(A); B(:,j) /= sum(B(:,j))) on a column-major m x n matrix:
    //   1) cuBLAS gemv + dgmm,
    //   2) thrust::reduce_by_key,
    //   3) thrust::inclusive_scan_by_key.
    int main()
    {
        cudaDeviceSetCacheConfig(cudaFuncCachePreferShared);
        cublasHandle_t hd;
        curandGenerator_t rng;
        cublasCreate(&hd);
        curandCreateGenerator(&rng, CURAND_RNG_PSEUDO_DEFAULT);
        // Fixed seed so all three timed variants see identical data.
        curandSetPseudoRandomGeneratorSeed(rng, 1234ULL);

        const size_t m = 2000, n = 200;
        const double c1 = 1.0;
        const double c0 = 0.0;

        thrust::device_vector<double> A(m * n);
        thrust::device_vector<double> B(m * n);   // cuBLAS variant output
        thrust::device_vector<double> C(m * n);   // thrust variant output
        thrust::device_vector<double> sum1(1 * n);
        thrust::device_vector<double> sum2(1 * n);
        // gemv's x operand only needs m entries, not m*n.
        thrust::device_vector<double> one(m, 1);

        double* pA = thrust::raw_pointer_cast(&A[0]);
        double* pB = thrust::raw_pointer_cast(&B[0]);
        double* pSum1 = thrust::raw_pointer_cast(&sum1[0]);
        double* pSum2 = thrust::raw_pointer_cast(&sum2[0]);
        double* pOne = thrust::raw_pointer_cast(&one[0]);

        curandGenerateUniformDouble(rng, pA, A.size());

        const int count = 2;

        // Variant 1: cuBLAS. exp -> column sums (gemv) -> invert -> scale.
        for (int i = 0; i < count; i++)
        {
            thrust::transform(A.begin(), A.end(), B.begin(), Exp());
            cublasDgemv(hd, CUBLAS_OP_T, m, n, &c1, pB, m, pOne, 1, &c0, pSum1, 1);
            thrust::transform(sum1.begin(), sum1.end(), sum1.begin(), Inv());
            // BUG FIX: scale by the inverted sums in pSum1 (the original
            // passed pSum2 here, which this loop never writes).
            cublasDdgmm(hd, CUBLAS_SIDE_RIGHT, m, n, pB, m, pSum1, 1, pB, m);
        }

        // Variant 2: thrust::reduce_by_key. Keys map each flat index to
        // its column, so the reduction yields one exp-sum per column.
        for (int i = 0; i < count; i++)
        {
            thrust::reduce_by_key(
                    thrust::make_transform_iterator(thrust::make_counting_iterator(0), line2col<int>(m)),
                    thrust::make_transform_iterator(thrust::make_counting_iterator(0), line2col<int>(m)) + A.size(),
                    thrust::make_transform_iterator(A.begin(), Exp()),
                    thrust::make_discard_iterator(),
                    sum2.begin());
            thrust::transform(
                    A.begin(), A.end(),
                    thrust::make_permutation_iterator(
                            sum2.begin(),
                            thrust::make_transform_iterator(thrust::make_counting_iterator(0), line2col<int>(m))),
                    C.begin(),
                    thrust::divides<double>());
        }

        // Variant 3: thrust::inclusive_scan_by_key. The per-column scan's
        // last element (row m-1 of each column) is that column's sum.
        for (int i = 0; i < count; i++)
        {
            thrust::inclusive_scan_by_key(
                    thrust::make_transform_iterator(thrust::make_counting_iterator(0), line2col<int>(m)),
                    thrust::make_transform_iterator(thrust::make_counting_iterator(0), line2col<int>(m)) + A.size(),
                    thrust::make_transform_iterator(A.begin(), Exp()),
                    C.begin());
            // Gather element (m-1) of every column into sum2.
            thrust::copy(
                    thrust::make_permutation_iterator(
                            C.begin() + m - 1,
                            thrust::make_transform_iterator(thrust::make_counting_iterator(0), MulC<int>(m))),
                    thrust::make_permutation_iterator(
                            C.begin() + m - 1,
                            thrust::make_transform_iterator(thrust::make_counting_iterator(0), MulC<int>(m))) + n,
                    sum2.begin());
            thrust::transform(
                    A.begin(), A.end(),
                    thrust::make_permutation_iterator(
                            sum2.begin(),
                            thrust::make_transform_iterator(thrust::make_counting_iterator(0), line2col<int>(m))),
                    C.begin(),
                    thrust::divides<double>());
        }

        // Drain queued async work before destroying the handles.
        cudaDeviceSynchronize();

        curandDestroyGenerator(rng);
        cublasDestroy(hd);

        return 0;
    }
    
  • 2

    您应该能够将第一个 for_each 操作与 cublasSgemv 呼叫融合到一个 reduce_by_key 呼叫中 . 如果您将仿函数定义/重新定义为:

    // Maps a flat column-major element index to its column index
    // (idx / lda, where lda is the leading dimension / column length).
    struct Accessor : public thrust::unary_function<int,int>
    {
        int lda;

        __host__ __device__ Accessor(int _lda) : lda(_lda) {}

        __host__ __device__ int operator()(const int& idx)
        {
            return idx / lda;
        }
    };
    
    // Returns exp(x); host- and device-callable.
    struct Exp : public thrust::unary_function<double,double>
    {
        __host__ __device__ double operator()(const double& x)
        {
            double y = exp(x);
            return y;
        }
    };
    
    // Returns the reciprocal 1/x; host- and device-callable.
    struct Inv : public thrust::unary_function<double,double>
    {
        __host__ __device__ double operator()(const double& x)
        {
            return 1.0 / x;
        }
    };
    

    然后,您可以将标准化输出计算为

    // Key functor: flat index -> column index (A is column-major, m rows).
    Accessor columns(m);
    // Fused exp + per-column sum in one call: the transform iterator
    // applies Exp() on the fly, reduce_by_key sums each run of equal
    // column keys; keys themselves are discarded, sums land in `sum`.
    thrust::reduce_by_key(
            thrust::make_transform_iterator(thrust::make_counting_iterator(int(0)), columns),
            thrust::make_transform_iterator(thrust::make_counting_iterator(int(m*n)), columns),
            thrust::make_transform_iterator(A.begin(), Exp()),
            thrust::make_discard_iterator(),
            sum.begin());
    
    // sum = 1 ./ sum, so normalization becomes a column scaling.
    thrust::for_each(sum.begin(), sum.end(), Inv());
    
    // A = A * diag(sum): scale each column j by 1/sum(j), in place.
    // NOTE(review): this scales the original A, not exp(A) — it assumes
    // A already holds the exponentiated values; confirm against caller.
    cublasDdgmm(hd, CUBLAS_SIDE_RIGHT, m, n, pA, m, pSum, 1, pA, m);
    

    [免责声明:所有代码均以浏览器编写,未经测试,使用风险自负]

    除了减少内核调用的数量之外,使用花哨的迭代器消除了对大单位矩阵的需要,这应该减少内存占用和内存事务的总数以进行求和和取幂操作 .

  • 3

    您可以通过以下方式使用ArrayFire

    // NOTE(review): the question uses a 2000x200 matrix; this example
    // builds a 2000x2000 one — adjust dimensions as needed.
    array A = randu(2000, 2000);
    A = exp(A);
    // Divide each column by its sum: sum(A, 0) reduces along dim 0
    // (rows), tile() broadcasts the row of sums back to A's shape.
    A /= tile(sum(A, 0), A.dims(0), 1);
    

    你也可以用推力做到这一点 . 但是如果你打算使用矩阵(而不是简单的向量),你必须在for循环中进行,这样效率会不高 .

    DISCLAIMER 我是Accelereyes的开发人员,正在开展阵火工作 .

    EDIT 我正在按照要求制作新的基准 .

    EDIT 由于此基准测试,我们在代码中发现了 exp 的性能错误 . 我们正在审查和修复它 .

相关问题