
Numba CUDA `vectorize` and `reduce` decorators slower than expected

I have been testing some basic CUDA functions using the Numba package. My main goal is to implement a Richardson-Lucy algorithm on the GPU. The algorithm can be sped up, and one of the main steps in doing so can be summarized in the following dummy function:

def dummy(arr1, arr2):
    return (arr1 * arr2).sum() / ((arr2**2).sum() + eps)

This function runs reasonably fast on the CPU, but I would like to keep everything on the GPU to avoid host <---> device copies.
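
For context, here is a rough sketch of how this step could stay on the device, using the mult_gpu and sum_gpu helpers defined below (assuming eps is a small constant; note that the cuda.reduce result is returned as a host scalar):

def dummy_gpu(d_arr1, d_arr2, eps=1e-12):
    # d_arr1 and d_arr2 are device arrays; the elementwise products stay on the GPU
    num = sum_gpu(mult_gpu(d_arr1, d_arr2))  # (arr1 * arr2).sum()
    den = sum_gpu(mult_gpu(d_arr2, d_arr2))  # (arr2**2).sum()
    return num / (den + eps)                 # scalar division on the host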

To compare the speed of the different computations, I wrote a small set of functions:

import numpy as np
from numba import njit, jit
import numba
import numba.cuda as cuda
import timeit
import time


# define our functions
@numba.vectorize(["float32(float32, float32)", "float64(float64, float64)"], target="cuda")
def add_gpu(a, b):
    return a + b

@numba.vectorize(["float32(float32, float32)", "float64(float64, float64)"], target="cuda")
def mult_gpu(a, b):
    return a * b

@cuda.reduce
def sum_gpu(a, b):
    return a + b

@cuda.jit
def add_gpu_1d(a, b, c):
    x = cuda.grid(1)
    if x < c.size:
        c[x] = a[x] + b[x]

@cuda.jit
def mult_gpu_1d(a, b, c):
    x = cuda.grid(1)
    if x < c.size:
        c[x] = a[x] * b[x]

@cuda.jit
def mult_gpu_2d(a, b, c):
    x, y = cuda.grid(2)
    if x < c.shape[0] and y < c.shape[1]:
        c[x, y] = a[x, y] * b[x, y]

@cuda.jit
def add_gpu_2d(a, b, c):
    x, y = cuda.grid(2)
    if x < c.shape[0] and y < c.shape[1]:
        c[x, y] = a[x, y] + b[x, y]
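
For reference, a minimal launch sketch (block size chosen arbitrarily) showing how the @cuda.jit kernels above are invoked with an explicit configuration; launches are asynchronous, so cuda.synchronize() is needed before reading results or stopping a timer:

n = 2**24
d_a = cuda.to_device(np.ones(n, dtype=np.float32))
d_b = cuda.to_device(np.ones(n, dtype=np.float32))
d_c = cuda.device_array_like(d_a)

threads_per_block = 256  # assumed block size
blocks_per_grid = (n + threads_per_block - 1) // threads_per_block
mult_gpu_1d[blocks_per_grid, threads_per_block](d_a, d_b, d_c)
cuda.synchronize()  # wait for the asynchronous kernel launch to finish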

I also wrote some timing helpers:

def avg_t(t, num):
    return np.mean(t) / num

def format_t(t):
    """Turn t into nice formating"""
    if t < 1e-3:
        return "{:.1f} us".format(t * 1e6)
    elif t < 1:
        return "{:.1f} ms".format(t * 1e3)
    else:
        return "{:.1f} s".format(t)

def test_1d_times(data_len, dtype=np.float32):
    num_times = 10

    title = "Testing 1D Data, Data length = {}, data type = {}".format(data_len, dtype)
    print(len(title) * "=")
    print(title)
    print(len(title) * "=")

    t = time.time()
    arr1, arr2 = np.empty((2, data_len), dtype=dtype)
    d_arr1 = cuda.to_device(arr1)
    d_arr2 = cuda.to_device(arr2)
    d_result = cuda.device_array_like(d_arr1)
    print("Data generated in " + format_t(time.time() - t))
    print("d_arr1 dtype =", d_arr1.dtype)
    print("d_arr1 size = ", d_arr1.size)

    print()
    print("Testing multiplication times")
    print("----------------------------")

    t = timeit.repeat((lambda: arr1 * arr2), number=num_times)
    print("cpu/numpy time = " + format_t(avg_t(t, num_times)))

    t = timeit.repeat((lambda: mult_gpu(d_arr1, d_arr2)), number=num_times)
    print("cuda vectorize time = " + format_t(avg_t(t, num_times)))

    t = timeit.repeat((lambda: mult_gpu_1d(d_arr1, d_arr2, d_result)), number=num_times)
    print("cuda_mult_1d time = " + format_t(avg_t(t, num_times)))

    print()
    print("Testing sum times")
    print("------------------")

    t = timeit.repeat((lambda: arr1 + arr2), number=num_times)
    print("cpu/numpy time = " + format_t(avg_t(t, num_times)))

    t = timeit.repeat((lambda: add_gpu(d_arr1, d_arr2)), number=num_times)
    print("cuda vectorize time = " + format_t(avg_t(t, num_times)))

    t = timeit.repeat((lambda: add_gpu_1d(d_arr1, d_arr2, d_result)), number=num_times)
    print("cuda_add_1d time = " + format_t(avg_t(t, num_times)))

    print()
    print("Testing reduction times")
    print("-----------------------")

    t = timeit.repeat((lambda: arr1.sum()), number=num_times)
    print("cpu/numpy time = " + format_t(avg_t(t, num_times)))

    t = timeit.repeat((lambda: add_gpu.reduce(d_arr1)), number=num_times)
    print("cuda vectorize time = " + format_t(avg_t(t, num_times)))

    t = timeit.repeat((lambda: sum_gpu(d_arr1)), number=num_times)
    print("sum_gpu time = " + format_t(avg_t(t, num_times)))
    print()

def test_2d_times(data_len, dtype=np.float32):
    num_times = 10

    title = "Testing 2D Data, Data length = {}, data type = {}".format(data_len, dtype)
    print(len(title) * "=")
    print(title)
    print(len(title) * "=")

    t = time.time()
    arr1, arr2 = np.empty((2, data_len, data_len), dtype=dtype)
    d_arr1 = cuda.to_device(arr1)
    d_arr2 = cuda.to_device(arr2)
    d_result = cuda.device_array_like(d_arr1)
    print("Data generated in {} seconds".format(time.time() - t))
    print("d_arr1 dtype =", d_arr1.dtype)
    print("d_arr1 size = ", d_arr1.size)

    print()
    print("Testing multiplication times")
    print("----------------------------")

    t = timeit.repeat((lambda: arr1 * arr2), number=num_times)
    print("cpu/numpy time = " + format_t(avg_t(t, num_times)))

    t = timeit.repeat((lambda: mult_gpu(d_arr1, d_arr2)), number=num_times)
    print("cuda vectorize time = " + format_t(avg_t(t, num_times)))

    t = timeit.repeat((lambda: mult_gpu_2d(d_arr1, d_arr2, d_result)), number=num_times)
    print("cuda_mult_2d time = " + format_t(avg_t(t, num_times)))

    print()
    print("Testing sum times")
    print("------------------")

    t = timeit.repeat((lambda: arr1 + arr2), number=num_times)
    print("cpu/numpy time = " + format_t(avg_t(t, num_times)))

    t = timeit.repeat((lambda: add_gpu(d_arr1, d_arr2)), number=num_times)
    print("cuda vectorize time = " + format_t(avg_t(t, num_times)))

    t = timeit.repeat((lambda: add_gpu_2d(d_arr1, d_arr2, d_result)), number=num_times)
    print("cuda_add_2d time = " + format_t(avg_t(t, num_times)))

    print()
    print("Testing reduction times")
    print("-----------------------")

    t = timeit.repeat((lambda: arr1.sum()), number=num_times)
    print("cpu/numpy time = " + format_t(avg_t(t, num_times)))

    t = timeit.repeat((lambda: add_gpu.reduce(d_arr1.ravel())), number=num_times)
    print("cuda vectorize time = " + format_t(avg_t(t, num_times)))

    t = timeit.repeat((lambda: sum_gpu(d_arr1.ravel())), number=num_times)
    print("sum_gpu time = " + format_t(avg_t(t, num_times)))
    print()

Running the test functions

numba.cuda.detect()
test_1d_times(2**24)
test_2d_times(2**12)
test_1d_times(2**24, dtype=np.float64)
test_2d_times(2**12, dtype=np.float64)

gives the following output:

Found 1 CUDA devices
id 0    b'GeForce GTX TITAN X'                              [SUPPORTED]
                      compute capability: 5.2
                           pci device id: 0
                              pci bus id: 3
Summary:
    1/1 devices are supported
============================================================================
Testing 1D Data, Data length = 16777216, data type = <class 'numpy.float32'>
============================================================================
Data generated in 88.2 ms
d_arr1 dtype = float32
d_arr1 size =  16777216

Testing multiplication times
----------------------------
cpu/numpy time = 35.8 ms
cuda vectorize time = 122.8 ms
cuda_mult_1d time = 206.8 us

Testing sum times
------------------
cpu/numpy time = 35.8 ms
cuda vectorize time = 106.1 ms
cuda_add_1d time = 212.6 us

Testing reduction times
-----------------------
cpu/numpy time = 16.7 ms
cuda vectorize time = 11.1 ms
sum_gpu time = 127.3 ms

========================================================================
Testing 2D Data, Data length = 4096, data type = <class 'numpy.float32'>
========================================================================
Data generated in 0.0800013542175293 seconds
d_arr1 dtype = float32
d_arr1 size =  16777216

Testing multiplication times
----------------------------
cpu/numpy time = 35.4 ms
cuda vectorize time = 97.9 ms
cuda_mult_2d time = 208.9 us

Testing sum times
------------------
cpu/numpy time = 36.3 ms
cuda vectorize time = 94.5 ms
cuda_add_2d time = 250.8 us

Testing reduction times
-----------------------
cpu/numpy time = 16.4 ms
cuda vectorize time = 15.8 ms
sum_gpu time = 125.4 ms

============================================================================
Testing 1D Data, Data length = 16777216, data type = <class 'numpy.float64'>
============================================================================
Data generated in 171.0 ms
d_arr1 dtype = float64
d_arr1 size =  16777216

Testing multiplication times
----------------------------
cpu/numpy time = 73.2 ms
cuda vectorize time = 114.9 ms
cuda_mult_1d time = 201.9 us

Testing sum times
------------------
cpu/numpy time = 71.4 ms
cuda vectorize time = 71.0 ms
cuda_add_1d time = 217.2 us

Testing reduction times
-----------------------
cpu/numpy time = 29.0 ms
cuda vectorize time = 12.8 ms
sum_gpu time = 123.5 ms

========================================================================
Testing 2D Data, Data length = 4096, data type = <class 'numpy.float64'>
========================================================================
Data generated in 0.301849365234375 seconds
d_arr1 dtype = float64
d_arr1 size =  16777216

Testing multiplication times
----------------------------
cpu/numpy time = 73.7 ms
cuda vectorize time = 84.2 ms
cuda_mult_2d time = 226.2 us

Testing sum times
------------------
cpu/numpy time = 74.9 ms
cuda vectorize time = 84.3 ms
cuda_add_2d time = 208.7 us

Testing reduction times
-----------------------
cpu/numpy time = 29.9 ms
cuda vectorize time = 14.3 ms
sum_gpu time = 121.2 ms

It seems that the @cuda.vectorize decorated functions run slower than the CPU and the custom-written @cuda.jit functions, while the @cuda.jit functions give the expected order-of-magnitude speedup and nearly constant-time performance (results not shown).

On the other hand, the @cuda.reduce function runs significantly slower than either the @cuda.vectorize function or the CPU functions.

Is there a reason why the @cuda.vectorize and @cuda.reduce functions perform so poorly? Is it possible to write a CUDA reduction kernel using Numba?
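
For illustration, a rough, hypothetical sketch of what a hand-written reduction kernel in Numba might look like: per-block partial sums in shared memory, finished on the host (TPB is an assumed, power-of-two block size):

TPB = 128  # assumed threads per block (must be a power of two here)

@cuda.jit
def partial_sum(a, partials):
    sm = cuda.shared.array(TPB, numba.float32)
    tid = cuda.threadIdx.x
    i = cuda.grid(1)
    sm[tid] = a[i] if i < a.size else 0.0
    cuda.syncthreads()
    step = TPB // 2
    while step > 0:  # tree reduction within the block
        if tid < step:
            sm[tid] += sm[tid + step]
        cuda.syncthreads()
        step //= 2
    if tid == 0:
        partials[cuda.blockIdx.x] = sm[0]

arr = np.random.rand(2**24).astype(np.float32)
d_arr = cuda.to_device(arr)
blocks = (arr.size + TPB - 1) // TPB
d_partials = cuda.device_array(blocks, dtype=np.float32)
partial_sum[blocks, TPB](d_arr, d_partials)
total = d_partials.copy_to_host().sum()  # finish the reduction on the host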

Edit:

It looks like this is a genuine bug in Numba: https://github.com/numba/numba/issues/2266 and https://github.com/numba/numba/issues/2268

1 Answer


    I can't explain the behaviour of @cuda.vectorize and @cuda.reduce; sometimes the results look a bit strange to me. For example, in Negative Speed Gain Using Numba Vectorize target='cuda', @cuda.vectorize slowed the computation down, while using @cuda.jit allowed it to be sped up. Here I would suggest trying PyCUDA (https://documen.tician.de/pycuda/). I tested the performance of a dot product (https://documen.tician.de/pycuda/array.html).

    import numpy as np
    from pycuda.curandom import rand as curand
    import pycuda.gpuarray as gpuarray
    import pycuda.driver as pycu
    import pycuda.autoinit
    from pycuda.reduction import ReductionKernel
    import numba.cuda as cuda 
    from time import time
    
    dot = ReductionKernel(dtype_out=np.float32, neutral="0",
                          reduce_expr="a+b", map_expr="x[i]*y[i]",
                          arguments="float *x, float *y")
    n = 2**24
    x = curand((n), dtype=np.float32)
    y = curand((n), dtype=np.float32)
    
    x_cpu = np.random.random((n))
    y_cpu = np.random.random((n))
    
    st = time()
    x_dot_y = dot(x, y).get()
    gpu_time = (time() - st)
    print("GPU: ", gpu_time)
    
    st = time()
    x_dot_y_cpu = np.dot(x_cpu, y_cpu)
    cpu_time = (time() - st)
    print("CPU: ", cpu_time)
    print("speedup: ", cpu_time / gpu_time)
    

    On my machine (CPU: Intel Core2 Quad 3 GHz, GPU: NVIDIA GeForce GTX 580) I got the following results:

    GPU:  0.00191593170166
    CPU:  0.0518710613251
    speedup:  27.0735440518
    

    Note that the code above does not account for the time needed to initialize and precompile the kernel, which can be significant. Taking that time into account, I got:

    GPU:  0.316560029984
    CPU:  0.0511090755463
    speedup:  0.161451449031
    

    So in this case the GPU code turns out slower than the CPU code. At the same time, for most applications you only need to initialize a kernel once and then use it many times; in that case, using PyCUDA reduction kernels looks reasonable.
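
    As a rough, untested sketch, the dummy() step from the question could be expressed with ReductionKernels in the same style, reusing the dot kernel and the x, y arrays from above (eps is assumed to be a small constant):

    den_kernel = ReductionKernel(dtype_out=np.float32, neutral="0",
                                 reduce_expr="a+b", map_expr="y[i]*y[i]",
                                 arguments="float *x, float *y")
    eps = 1e-12  # assumed small constant
    result = dot(x, y).get() / (den_kernel(x, y).get() + eps)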

    Earlier, I tested the performance of @cuda.jit, PyCUDA, and CUDA-C code by solving a 2D diffusion equation. I found that PyCUDA achieves almost the same performance as CUDA-C, while Numba shows worse performance. The figure below shows these results.
    [Figure: runtime comparison of the @cuda.jit, PyCUDA, and CUDA-C implementations]
