
How to asynchronously copy memory from host to device using Thrust and CUDA streams


I want to copy memory from the host to the device using Thrust:

thrust::host_vector<float> h_vec(1 << 28);
thrust::device_vector<float> d_vec(1 << 28);
thrust::copy(h_vec.begin(), h_vec.end(), d_vec.begin());

using a CUDA stream, analogous to the way a stream can be used to copy memory from device to device:

cudaStream_t s;
cudaStreamCreate(&s);

thrust::device_vector<float> d_vec1(1 << 28), d_vec2(1 << 28);
thrust::copy(thrust::cuda::par.on(s), d_vec1.begin(), d_vec1.end(), d_vec2.begin());

cudaStreamSynchronize(s);
cudaStreamDestroy(s);

The problem is that I cannot set the execution policy to CUDA in order to specify a stream when copying from host to device, because in that case Thrust assumes that both vectors are stored on the device. Is there a way to work around this? I am using the latest Thrust version from GitHub (it reports 1.8 in the version.h file).
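
What I would like to write is roughly the following sketch, but with this execution policy Thrust treats both ranges as device memory, so it does not perform a host-to-device copy on the stream:

cudaStream_t s;
cudaStreamCreate(&s);

// Hypothetical call: this is the intent, but Thrust assumes h_vec is
// also a device range under this execution policy.
thrust::copy(thrust::cuda::par.on(s), h_vec.begin(), h_vec.end(), d_vec.begin());

cudaStreamSynchronize(s);
cudaStreamDestroy(s);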

2 Answers

  • 1

    As indicated in the comments, I don't think this is directly possible with thrust::copy. However, we can use cudaMemcpyAsync in a Thrust application to achieve the goal of asynchronous copies and of overlapping copy with compute.

    Here is a worked example:

    #include <thrust/host_vector.h>
    #include <thrust/device_vector.h>
    #include <thrust/system/cuda/experimental/pinned_allocator.h>
    #include <thrust/system/cuda/execution_policy.h>
    #include <thrust/fill.h>
    #include <thrust/sequence.h>
    #include <thrust/for_each.h>
    #include <iostream>
    
    // DSIZE determines duration of H2D and D2H transfers
    #define DSIZE (1048576*8)
    // SSIZE,LSIZE determine duration of kernel launched by thrust
    #define SSIZE (1024*512)
    #define LSIZE 1
    // KSIZE determines size of thrust kernels (number of threads per block)
    #define KSIZE 64
    #define TV1 1
    #define TV2 2
    
    typedef int mytype;
    typedef thrust::host_vector<mytype, thrust::cuda::experimental::pinned_allocator<mytype> > pinnedVector;
    
    struct sum_functor
    {
      mytype *dptr;
      sum_functor(mytype* _dptr) : dptr(_dptr) {};
      __host__ __device__ void operator()(mytype &data) const
        {
          mytype result = data;
          for (int j = 0; j < LSIZE; j++)
            for (int i = 0; i < SSIZE; i++)
              result += dptr[i];
          data = result;
        }
    };
    
    int main(){
    
      pinnedVector hi1(DSIZE);
      pinnedVector hi2(DSIZE);
      pinnedVector ho1(DSIZE);
      pinnedVector ho2(DSIZE);
      thrust::device_vector<mytype> di1(DSIZE);
      thrust::device_vector<mytype> di2(DSIZE);
      thrust::device_vector<mytype> do1(DSIZE);
      thrust::device_vector<mytype> do2(DSIZE);
      thrust::device_vector<mytype> dc1(KSIZE);
      thrust::device_vector<mytype> dc2(KSIZE);
    
      thrust::fill(hi1.begin(), hi1.end(),  TV1);
      thrust::fill(hi2.begin(), hi2.end(),  TV2);
      thrust::sequence(do1.begin(), do1.end());
      thrust::sequence(do2.begin(), do2.end());
    
      cudaStream_t s1, s2;
      cudaStreamCreate(&s1); cudaStreamCreate(&s2);
    
      cudaMemcpyAsync(thrust::raw_pointer_cast(di1.data()), thrust::raw_pointer_cast(hi1.data()), di1.size()*sizeof(mytype), cudaMemcpyHostToDevice, s1);
      cudaMemcpyAsync(thrust::raw_pointer_cast(di2.data()), thrust::raw_pointer_cast(hi2.data()), di2.size()*sizeof(mytype), cudaMemcpyHostToDevice, s2);
    
      thrust::for_each(thrust::cuda::par.on(s1), do1.begin(), do1.begin()+KSIZE, sum_functor(thrust::raw_pointer_cast(di1.data())));
      thrust::for_each(thrust::cuda::par.on(s2), do2.begin(), do2.begin()+KSIZE, sum_functor(thrust::raw_pointer_cast(di2.data())));
    
      cudaMemcpyAsync(thrust::raw_pointer_cast(ho1.data()), thrust::raw_pointer_cast(do1.data()), do1.size()*sizeof(mytype), cudaMemcpyDeviceToHost, s1);
      cudaMemcpyAsync(thrust::raw_pointer_cast(ho2.data()), thrust::raw_pointer_cast(do2.data()), do2.size()*sizeof(mytype), cudaMemcpyDeviceToHost, s2);
    
      cudaDeviceSynchronize();
      for (int i=0; i < KSIZE; i++){
        if (ho1[i] != ((LSIZE*SSIZE*TV1) + i)) { std::cout << "mismatch on stream 1 at " << i << " was: " << ho1[i] << " should be: " << ((LSIZE*SSIZE*TV1)+i) << std::endl; return 1;}
        if (ho2[i] != ((LSIZE*SSIZE*TV2) + i)) { std::cout << "mismatch on stream 2 at " << i << " was: " << ho2[i] << " should be: " << ((LSIZE*SSIZE*TV2)+i) << std::endl; return 1;}
        }
      std::cout << "Success!" << std::endl;
      return 0;
    }
    

    For my test case I used RHEL 5.5, a Quadro 5000, and CUDA 6.5RC. This example is designed so that Thrust creates very small kernels (only a single threadblock, as long as KSIZE is small, say 32 or 64), so that the kernels created by thrust::for_each are able to run concurrently.

    When I profile this code, I see:

    [Figure: nvvp timeline output for the thrust streams application]

    This indicates that we are achieving proper overlap both between the Thrust kernels, and between the copy operations and the Thrust kernels, as well as asynchronous data copying at the completion of the kernels. Note that the cudaDeviceSynchronize() operation "fills" the timeline, indicating that all of the asynchronous operations (data copies, Thrust functions) were issued asynchronously and that control returned to the host thread before any of the operations were under way. All of this is the expected, correct behavior for full concurrency between host, GPU, and data-copying activity.
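
    If a profiler is not at hand, the same host-side asynchrony can be checked with a simple wall-clock measurement: the time to issue the copy should be far smaller than the time spent in the synchronize call. The following is a minimal, self-contained sketch of that idea, separate from the example above (the buffer size and the timing code are illustrative assumptions, not part of the original example):

    // Minimal asynchrony check; compile with: nvcc --std=c++11
    #include <thrust/host_vector.h>
    #include <thrust/device_vector.h>
    #include <thrust/system/cuda/experimental/pinned_allocator.h>
    #include <chrono>
    #include <iostream>

    int main(){
      const size_t N = 1 << 26;
      // pinned host memory is what allows cudaMemcpyAsync to return immediately
      thrust::host_vector<float, thrust::cuda::experimental::pinned_allocator<float> > h(N, 1.0f);
      thrust::device_vector<float> d(N);
      cudaStream_t s;
      cudaStreamCreate(&s);

      auto t0 = std::chrono::high_resolution_clock::now();
      cudaMemcpyAsync(thrust::raw_pointer_cast(d.data()),
                      thrust::raw_pointer_cast(h.data()),
                      N*sizeof(float), cudaMemcpyHostToDevice, s);
      auto t1 = std::chrono::high_resolution_clock::now();  // copy has only been issued
      cudaStreamSynchronize(s);                             // block until the copy completes
      auto t2 = std::chrono::high_resolution_clock::now();

      std::cout << "issue:    " << std::chrono::duration<double, std::milli>(t1 - t0).count() << " ms" << std::endl;
      std::cout << "complete: " << std::chrono::duration<double, std::milli>(t2 - t1).count() << " ms" << std::endl;
      cudaStreamDestroy(s);
      return 0;
    }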

  • 10

    Here is a working example that uses thrust::cuda::experimental::pinned_allocator<T>:

    // Compile with:
    // nvcc --std=c++11 mem_async.cu -o mem_async
    
    #include <cuda.h>
    #include <cuda_runtime.h>
    #include <cufft.h>
    
    #include <thrust/device_vector.h>
    #include <thrust/host_vector.h>
    #include <thrust/fill.h>
    #include <thrust/system/cuda/experimental/pinned_allocator.h>
    #include <iostream>  // needed for std::cout below
    
    #define LEN 1024
    
    int main(int argc, char *argv[]) {
        thrust::host_vector<float, thrust::cuda::experimental::pinned_allocator<float>> h_vec(LEN);
        thrust::device_vector<float> d_vec(LEN);
    
        thrust::fill(d_vec.begin(), d_vec.end(), -1.0);
    
        cudaMemcpyAsync(thrust::raw_pointer_cast(h_vec.data()),
                        thrust::raw_pointer_cast(d_vec.data()),
                        d_vec.size()*sizeof(float),
                        cudaMemcpyDeviceToHost);
    
        // Comment out this line to see what happens.
        cudaDeviceSynchronize();
    
        std::cout << h_vec[0] << std::endl;
    }
    

    Comment out the synchronization step and you should get 0 printed to the console because of the asynchronous memory transfer.
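
    For completeness, the same pinned-allocator approach can be combined with an explicit stream, which is closer to what the question asks for. Below is a minimal sketch along those lines (the explicit stream, the fill value, and the final read-back are additions for illustration, not part of the example above):

    #include <thrust/host_vector.h>
    #include <thrust/device_vector.h>
    #include <thrust/fill.h>
    #include <thrust/system/cuda/experimental/pinned_allocator.h>
    #include <iostream>

    #define LEN 1024

    int main() {
        // Pinned host memory is required for cudaMemcpyAsync to be truly asynchronous.
        thrust::host_vector<float, thrust::cuda::experimental::pinned_allocator<float>> h_vec(LEN);
        thrust::device_vector<float> d_vec(LEN);
        thrust::fill(h_vec.begin(), h_vec.end(), 1.0f);

        cudaStream_t s;
        cudaStreamCreate(&s);

        // Host-to-device copy issued on stream s; control returns to the host immediately.
        cudaMemcpyAsync(thrust::raw_pointer_cast(d_vec.data()),
                        thrust::raw_pointer_cast(h_vec.data()),
                        h_vec.size()*sizeof(float),
                        cudaMemcpyHostToDevice, s);

        // Other host work or launches on other streams could overlap with the copy here.

        cudaStreamSynchronize(s);             // wait only for work queued on s
        std::cout << d_vec[0] << std::endl;   // prints 1 once the copy has completed

        cudaStreamDestroy(s);
        return 0;
    }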
