
Reduce matrix columns with CUDA


I have a matrix and I would like to use CUDA to compute the column-wise mean (simplified to a simple sum) in the fastest possible way, i.e. return a row vector containing the mean of each column of the matrix. A sum reduction implementation for computing the sum of a single column vector looks like this:

template<typename T>
__global__ void kernelSum(const T* __restrict__ input, T* __restrict__ per_block_results, const size_t n) {
    extern __shared__ T sdata[];

    size_t tid = blockIdx.x * blockDim.x + threadIdx.x;

    // load input into __shared__ memory
    T x = 0.0;
    if (tid < n) {
        x = input[tid];
    }
    sdata[threadIdx.x] = x;
    __syncthreads();

    // contiguous range pattern
    for(int offset = blockDim.x / 2; offset > 0; offset >>= 1) {
        if(threadIdx.x < offset) {
            // add a partial sum upstream to our own
            sdata[threadIdx.x] += sdata[threadIdx.x + offset];
        }
        // wait until all threads in the block have
        // updated their partial sums
        __syncthreads();
    }

    // thread 0 writes the final result
    if(threadIdx.x == 0) {
        per_block_results[blockIdx.x] = sdata[0];
    }
}

It is invoked as:

int n = ... // vector size
const int BLOCK_SIZE = 1024;
int number_of_blocks = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;
double* per_block_results = NULL;
cudaMalloc((void**) &per_block_results, sizeof(double)*(number_of_blocks + 1));
// launch one kernel to compute, per-block, a partial sum
kernelSum<double> <<<number_of_blocks, BLOCK_SIZE, BLOCK_SIZE*sizeof(double)>>>(a, per_block_results, n);
// launch a single block to compute the sum of the partial sums
kernelSum<double> <<<1, number_of_blocks, number_of_blocks*sizeof(double)>>>(per_block_results, per_block_results + number_of_blocks, number_of_blocks);
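
The final sum can then be copied back from the extra slot at the end of per_block_results, e.g. (a minimal host-side sketch; note that the in-block reduction above assumes the thread count is a power of two, so number_of_blocks must also be a power of two no larger than 1024 for the second launch to be correct):

double sum;
cudaMemcpy(&sum, per_block_results + number_of_blocks, sizeof(double), cudaMemcpyDeviceToHost);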

I can generalize this kernel to matrices with any number of columns, but I'm limited by shared memory. My GPU has compute capability 3.5, so it has 48KB of shared memory and a maximum block size of 1024, i.e. threads per block. Since I'm interested in double precision, I can hold at most 48*1024/8 = 6144 doubles in shared memory. Since the reduction is done per block, I can sum at most 6144 (doubles in shared memory) / 1024 (block size) = 6 columns simultaneously. Reducing the block size would then allow more columns to be computed simultaneously, e.g. 6144 (doubles in shared memory) / 512 (block size) = 12.
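
This capacity arithmetic can also be done at run time instead of hard-coding the 48KB figure, e.g. (a minimal sketch using the CUDA runtime API; device 0 is just an assumption):

#include <cstdio>
#include <cuda_runtime.h>

int main(void)
{
    // Query the shared memory available per block and derive how many
    // double-precision columns one block could reduce simultaneously
    // under the scheme described above.
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    size_t doubles_in_smem = prop.sharedMemPerBlock / sizeof(double); // 49152 / 8 = 6144
    const int block_size = 512;
    printf("columns per block at block size %d: %zu\n",
           block_size, doubles_in_smem / block_size);                 // 6144 / 512 = 12
    return 0;
}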

Would this more elaborate strategy outperform a simple CPU loop over each column of the matrix that invokes the sum reduction? Or is there another, better way?
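
To be concrete, the simple baseline I have in mind is a host loop launching the single-vector reduction above once per column (a sketch; it assumes column-major storage so that each column of the m-row matrix is contiguous, and d_sums is a hypothetical device vector with one slot per column):

// Hypothetical baseline: one two-stage sum reduction per column.
// number_of_blocks is derived from the column length m as above.
for (int col = 0; col < n_cols; col++) {
    kernelSum<double><<<number_of_blocks, BLOCK_SIZE, BLOCK_SIZE*sizeof(double)>>>(
        a + col * m, per_block_results, m);
    kernelSum<double><<<1, number_of_blocks, number_of_blocks*sizeof(double)>>>(
        per_block_results, d_sums + col, number_of_blocks);
}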

2 Answers

  • 3

    What is stopping you from doing something like this:

    template<typename T>
    __global__ void kernelSum(const T* __restrict__ input, 
                              T* __restrict__ per_block_results, 
                              const size_t lda, const size_t n)
    {
        extern __shared__ T sdata[];
    
        // Accumulate per thread partial sum
        T x = 0.0;
        const T * p = &input[blockIdx.x * lda];
        for(int i=threadIdx.x; i < n; i += blockDim.x) {
            x += p[i];
        }
    
        // load partial sum into __shared__ memory
        sdata[threadIdx.x] = x;
        __syncthreads();
    
        // contiguous range pattern
        for(int offset = blockDim.x / 2; offset > 0; offset >>= 1) {
            if(threadIdx.x < offset) {
                // add a partial sum upstream to our own
                sdata[threadIdx.x] += sdata[threadIdx.x + offset];
            }
            // wait until all threads in the block have
            // updated their partial sums
            __syncthreads();
        }
    
        // thread 0 writes the final result
        if(threadIdx.x == 0) {
            per_block_results[blockIdx.x] = sdata[0];
        }
    }
    

    [Standard disclaimer: written in browser, never compiled or tested, use at own risk]

    i.e. you only need one entry in sdata per thread in the block for the shared memory reduction. Each thread sums as many values as required to cover the full column of input. Then there is no shared memory restriction, and you can sum columns of any size with the same block size.


    EDIT: Apparently the idea of using per-thread partial sums is new to you, so here is a complete example to study:

    #include <iostream>
    
    template<typename T>
    __global__ 
    void kernelSum(const T* __restrict__ input, 
            const size_t lda, // pitch of input in words of sizeof(T)
            T* __restrict__ per_block_results, 
                    const size_t n)
    {
        extern __shared__ T sdata[];
    
        T x = 0.0;
        const T * p = &input[blockIdx.x * lda];
        // Accumulate per thread partial sum
        for(int i=threadIdx.x; i < n; i += blockDim.x) {
            x += p[i];
        }
    
        // load thread partial sum into shared memory
        sdata[threadIdx.x] = x;
        __syncthreads();
    
        for(int offset = blockDim.x / 2; offset > 0; offset >>= 1) {
            if(threadIdx.x < offset) {
                sdata[threadIdx.x] += sdata[threadIdx.x + offset];
            }
            __syncthreads();
        }
    
        // thread 0 writes the final result
        if(threadIdx.x == 0) {
            per_block_results[blockIdx.x] = sdata[0];
        }
    }
    
    
    int main(void)
    {
        const int m = 10000, n = 16;
        float * a = new float[m*n];
    
        for(int i=0; i<(m*n); i++) { a[i] = (float)(i%10); }
    
        float *a_;
        size_t size_a = m * n * sizeof(float);
        cudaMalloc((void **)&a_, size_a);
        cudaMemcpy(a_, a, size_a, cudaMemcpyHostToDevice);
    
        float *b_;
        size_t size_b = n * sizeof(float);
        cudaMalloc((void **)&b_, size_b);
    
        // select the number of warps per block according to the size of the
        // column and launch one block per column. Probably makes sense
        // to have at least a 4:1 ratio of column size to block size
        dim3 blocksize(256); 
        dim3 gridsize(n);
        size_t shmsize = sizeof(float) * (size_t)blocksize.x;
        kernelSum<float><<<gridsize, blocksize, shmsize>>>(a_, m, b_, m);
    
        float * b = new float[n];
        cudaMemcpy(b, b_, size_b, cudaMemcpyDeviceToHost);
    
        for(int i=0; i<n; i++) {
           std::cout << i << " " << b[i] << std::endl;
        }
    
        cudaDeviceReset();
    
        return 0;
    }
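
    To try the example, it should build as-is with nvcc, e.g. (colsum.cu is just an assumed file name; -arch=sm_35 matches the compute capability mentioned in the question):

    nvcc -arch=sm_35 -o colsum colsum.cu
    ./colsum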
    

    You should experiment with the block size relative to your matrix size for optimal performance, but in general the more work per thread the kernel does, the better the overall performance will be (because the shared memory reduction is quite expensive). You can see one approach to block and grid size heuristics for a similarly memory-bandwidth-bound problem in this answer.
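
    If you prefer to pick the block size programmatically rather than by experiment, the occupancy API is one possible starting point (a minimal sketch, assuming CUDA 6.5 or later; the variable-shared-memory variant is used because the kernel's dynamic shared memory grows with the block size, and the suggested value would still need rounding down to a power of two for the in-block reduction):

    int min_grid_size = 0, block_size = 0;
    // Ask the occupancy calculator for a block size for kernelSum<float>,
    // telling it the kernel needs one float of shared memory per thread.
    cudaOccupancyMaxPotentialBlockSizeVariableSMem(
        &min_grid_size, &block_size, kernelSum<float>,
        [](int bs) { return bs * sizeof(float); });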

  • 2

    As an alternative to the answer already provided by talonmies, here I'm reporting 4 other approaches for column reduction, 3 of them based on CUDA Thrust and 1 based on cublas<t>gemv() with a column of 1s, as suggested in my comment above.

    The CUDA Thrust approaches are analogous to Reduce matrix rows with CUDA, with an implicit transposition obtained by

    thrust::make_permutation_iterator(d_matrix.begin(),                 
            thrust::make_transform_iterator(thrust::make_counting_iterator(0),
                    (_1 % Nrows) * Ncols + _1 / Nrows))
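
    For instance, with Nrows = 3 and Ncols = 4, the counting index i = 0, 1, 2, 3, 4, 5, ... is mapped to 0, 4, 8, 1, 5, 9, ..., i.e. consecutive iterator positions walk down the row-major matrix one column at a time, which is exactly the element order a column-wise reduction by key needs.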
    

    This is the full code:

    #include <cublas_v2.h>
    
    #include <thrust/host_vector.h>
    #include <thrust/device_vector.h>
    #include <thrust/generate.h>
    #include <thrust/reduce.h>
    #include <thrust/functional.h>
    #include <thrust/random.h>
    #include <thrust/sequence.h>
    
    #include <stdio.h>
    #include <iostream>
    
    #include "Utilities.cuh"
    #include "TimingGPU.cuh"
    
    using namespace thrust::placeholders;
    
    // --- Required for approach #2
    __device__ float *vals;
    
    /**************************************************************/
    /* CONVERT LINEAR INDEX TO ROW INDEX - NEEDED FOR APPROACH #1 */
    /**************************************************************/
    template <typename T>
    struct linear_index_to_row_index : public thrust::unary_function<T,T> {
    
        T Ncols; // --- Number of columns
    
        __host__ __device__ linear_index_to_row_index(T Ncols) : Ncols(Ncols) {}
    
        __host__ __device__ T operator()(T i) { return i / Ncols; }
    };
    
    /*********************************************/
    /* COLUMN REDUCTION - NEEDED FOR APPROACH #2 */
    /*********************************************/
    struct col_reduction {
    
        const int Nrows;    // --- Number of rows
        const int Ncols;    // --- Number of cols
    
        col_reduction(int _Nrows, int _Ncols) : Nrows(_Nrows), Ncols(_Ncols) {}
    
        __device__ float operator()(float& x, int& y ) {
            float temp = 0.f;
            for (int i = 0; i<Nrows; i++) {
                temp += vals[y + (i*Ncols)];
            }
            return temp;
        }
    };
    
    /**************************/
    /* NEEDED FOR APPROACH #3 */
    /**************************/
    template<typename T>
    struct MulC: public thrust::unary_function<T, T>
    {
        T C;
        __host__ __device__ MulC(T c) : C(c) { }
        __host__ __device__ T operator()(T x) { return x * C; }
    };
    
    /********/
    /* MAIN */
    /********/
    int main()
    {
        const int Nrows = 5;     // --- Number of rows
        const int Ncols = 8;     // --- Number of columns
    
        // --- Random uniform integer distribution between 10 and 99
        thrust::default_random_engine rng;
        thrust::uniform_int_distribution<int> dist(10, 99);
    
        // --- Matrix allocation and initialization
        thrust::device_vector<float> d_matrix(Nrows * Ncols);
        for (size_t i = 0; i < d_matrix.size(); i++) d_matrix[i] = (float)dist(rng);
    
        TimingGPU timerGPU;
    
        /***************/
        /* APPROACH #1 */
        /***************/
        timerGPU.StartCounter();
        // --- Allocate space for column sums and indices
        thrust::device_vector<float> d_col_sums(Ncols);
        thrust::device_vector<int> d_col_indices(Ncols);
    
        // --- Compute column sums by summing values with equal column indices
        thrust::reduce_by_key(thrust::make_transform_iterator(thrust::counting_iterator<int>(0), linear_index_to_row_index<int>(Nrows)),
                              thrust::make_transform_iterator(thrust::counting_iterator<int>(0), linear_index_to_row_index<int>(Nrows)) + (Nrows*Ncols),
                              thrust::make_permutation_iterator(
                                    d_matrix.begin(),
                                    thrust::make_transform_iterator(thrust::make_counting_iterator(0),(_1 % Nrows) * Ncols + _1 / Nrows)),
                              d_col_indices.begin(),
                              d_col_sums.begin(),
                              thrust::equal_to<int>(),
                              thrust::plus<float>());
    
        //thrust::reduce_by_key(
        //            thrust::make_transform_iterator(thrust::make_counting_iterator(0), linear_index_to_row_index<int>(Nrows)),
        //            thrust::make_transform_iterator(thrust::make_counting_iterator(0), linear_index_to_row_index<int>(Nrows)) + (Nrows*Ncols),
        //            thrust::make_permutation_iterator(
        //                    d_matrix.begin(),
        //                    thrust::make_transform_iterator(thrust::make_counting_iterator(0),(_1 % Nrows) * Ncols + _1 / Nrows)),
        //            thrust::make_discard_iterator(),
        //            d_col_sums.begin());
    
        printf("Timing for approach #1 = %f\n", timerGPU.GetCounter());
    
        // --- Print result
        for(int j = 0; j < Ncols; j++) {
            std::cout << "[ ";
            for(int i = 0; i < Nrows; i++)
                std::cout << d_matrix[i * Ncols + j] << " ";
            std::cout << "] = " << d_col_sums[j] << "\n";
        }
    
        /***************/
        /* APPROACH #2 */
        /***************/
        timerGPU.StartCounter();
        thrust::device_vector<float> d_col_sums_2(Ncols, 0);
        float *s_vals = thrust::raw_pointer_cast(&d_matrix[0]);
        gpuErrchk(cudaMemcpyToSymbol(vals, &s_vals, sizeof(float *)));
        thrust::transform(d_col_sums_2.begin(), d_col_sums_2.end(), thrust::counting_iterator<int>(0), d_col_sums_2.begin(), col_reduction(Nrows, Ncols));
    
        printf("Timing for approach #2 = %f\n", timerGPU.GetCounter());
    
        for(int j = 0; j < Ncols; j++) {
            std::cout << "[ ";
            for(int i = 0; i < Nrows; i++)
                std::cout << d_matrix[i * Ncols + j] << " ";
            std::cout << "] = " << d_col_sums_2[j] << "\n";
        }
    
        /***************/
        /* APPROACH #3 */
        /***************/
    
        timerGPU.StartCounter();
        thrust::device_vector<float> d_col_sums_3(Ncols, 0);
        thrust::device_vector<float> d_temp(Nrows * Ncols);
        thrust::inclusive_scan_by_key(
                    thrust::make_transform_iterator(thrust::make_counting_iterator(0), linear_index_to_row_index<int>(Nrows)),
                    thrust::make_transform_iterator(thrust::make_counting_iterator(0), linear_index_to_row_index<int>(Nrows)) + (Nrows*Ncols),
                    thrust::make_permutation_iterator(
                            d_matrix.begin(),
                            thrust::make_transform_iterator(thrust::make_counting_iterator(0),(_1 % Nrows) * Ncols + _1 / Nrows)),
                    d_temp.begin());
        thrust::copy(
                    thrust::make_permutation_iterator(
                            d_temp.begin() + Nrows - 1,
                            thrust::make_transform_iterator(thrust::make_counting_iterator(0), MulC<int>(Nrows))),
                    thrust::make_permutation_iterator(
                            d_temp.begin() + Nrows - 1,
                            thrust::make_transform_iterator(thrust::make_counting_iterator(0), MulC<int>(Nrows))) + Ncols,
                    d_col_sums_3.begin());
    
        printf("Timing for approach #3 = %f\n", timerGPU.GetCounter());
    
        for(int j = 0; j < Ncols; j++) {
            std::cout << "[ ";
            for(int i = 0; i < Nrows; i++)
                std::cout << d_matrix[i * Ncols + j] << " ";
            std::cout << "] = " << d_col_sums_3[j] << "\n";
        }
    
        /***************/
        /* APPROACH #4 */
        /***************/
        cublasHandle_t handle;
    
        timerGPU.StartCounter();
        cublasSafeCall(cublasCreate(&handle));
    
        thrust::device_vector<float> d_col_sums_4(Ncols);
        thrust::device_vector<float> d_ones(Nrows, 1.f);
    
        float alpha = 1.f;
        float beta  = 0.f;
        cublasSafeCall(cublasSgemv(handle, CUBLAS_OP_N, Ncols, Nrows, &alpha, thrust::raw_pointer_cast(d_matrix.data()), Ncols, 
                                   thrust::raw_pointer_cast(d_ones.data()), 1, &beta, thrust::raw_pointer_cast(d_col_sums_4.data()), 1));
    
        printf("Timing for approach #4 = %f\n", timerGPU.GetCounter());
    
        for(int j = 0; j < Ncols; j++) {
            std::cout << "[ ";
            for(int i = 0; i < Nrows; i++)
                std::cout << d_matrix[i * Ncols + j] << " ";
            std::cout << "] = " << d_col_sums_4[j] << "\n";
        }
    
        return 0;
    }
    
