I have been testing some basic CUDA functions using the Numba package. My main goal is to implement a Richardson-Lucy algorithm on the GPU. The algorithm can be accelerated, and one of the main steps in doing so can be summarized by the following dummy function:
def dummy(arr1, arr2):
    # eps is a small constant that guards against division by zero
    return (arr1 * arr2).sum() / ((arr2**2).sum() + eps)
This function runs reasonably fast on the CPU, but I would like to keep everything on the GPU to avoid host <---> device copies (a device-side sketch combining the pieces below follows their definitions).
To compare the speed of the different computations, I wrote a small set of functions:
import numpy as np
from numba import njit, jit
import numba
import numba.cuda as cuda
import timeit
import time
# define our functions
@numba.vectorize(["float32(float32, float32)", "float64(float64, float64)"], target="cuda")
def add_gpu(a, b):
return a + b
@numba.vectorize(["float32(float32, float32)", "float64(float64, float64)"], target="cuda")
def mult_gpu(a, b):
return a * b
@cuda.reduce
def sum_gpu(a, b):
return a + b
@cuda.jit
def add_gpu_1d(a, b, c):
x = cuda.grid(1)
if x < c.size:
c[x] = a[x] + b[x]
@cuda.jit
def mult_gpu_1d(a, b, c):
x = cuda.grid(1)
if x < c.size:
c[x] = a[x] * b[x]
@cuda.jit
def mult_gpu_2d(a, b, c):
x, y = cuda.grid(2)
if x < c.shape[0] and y < c.shape[1]:
c[x, y] = a[x, y] * b[x, y]
@cuda.jit
def add_gpu_2d(a, b, c):
x, y = cuda.grid(2)
if x < c.shape[0] and y < c.shape[1]:
c[x, y] = a[x, y] + b[x, y]
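With these pieces, the dummy step above could stay entirely on the device, with only two scalars ever coming back to the host. A minimal sketch (not part of the benchmark; it relies on a target="cuda" ufunc returning a device array when called with device arrays, and the eps value is just a placeholder):

def dummy_gpu(d_arr1, d_arr2, eps=1e-7):  # eps value is a placeholder
    d_num = mult_gpu(d_arr1, d_arr2)   # device array in, device array out
    d_den = mult_gpu(d_arr2, d_arr2)   # arr2**2, also kept on the device
    # sum_gpu returns a host scalar, so only two floats cross the bus
    return sum_gpu(d_num) / (sum_gpu(d_den) + eps)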
along with some timing helpers:
def avg_t(t, num):
    return np.mean(t) / num

def format_t(t):
    """Turn t into nice formatting"""
    if t < 1e-3:
        return "{:.1f} us".format(t * 1e6)
    elif t < 1:
        return "{:.1f} ms".format(t * 1e3)
    else:
        return "{:.1f} s".format(t)
def test_1d_times(data_len, dtype=np.float32):
    num_times = 10
    title = "Testing 1D Data, Data length = {}, data type = {}".format(data_len, dtype)
    print(len(title) * "=")
    print(title)
    print(len(title) * "=")
    t = time.time()
    arr1, arr2 = np.empty((2, data_len), dtype=dtype)
    d_arr1 = cuda.to_device(arr1)
    d_arr2 = cuda.to_device(arr2)
    d_result = cuda.device_array_like(d_arr1)
    print("Data generated in " + format_t(time.time() - t))
    print("d_arr1 dtype =", d_arr1.dtype)
    print("d_arr1 size = ", d_arr1.size)
    print()
    print("Testing multiplication times")
    print("----------------------------")
    t = timeit.repeat((lambda: arr1 * arr2), number=num_times)
    print("cpu/numpy time = " + format_t(avg_t(t, num_times)))
    t = timeit.repeat((lambda: mult_gpu(d_arr1, d_arr2)), number=num_times)
    print("cuda vectorize time = " + format_t(avg_t(t, num_times)))
    t = timeit.repeat((lambda: mult_gpu_1d(d_arr1, d_arr2, d_result)), number=num_times)
    print("cuda_mult_1d time = " + format_t(avg_t(t, num_times)))
    print()
    print("Testing sum times")
    print("------------------")
    t = timeit.repeat((lambda: arr1 + arr2), number=num_times)
    print("cpu/numpy time = " + format_t(avg_t(t, num_times)))
    t = timeit.repeat((lambda: add_gpu(d_arr1, d_arr2)), number=num_times)
    print("cuda vectorize time = " + format_t(avg_t(t, num_times)))
    t = timeit.repeat((lambda: add_gpu_1d(d_arr1, d_arr2, d_result)), number=num_times)
    print("cuda_add_1d time = " + format_t(avg_t(t, num_times)))
    print()
    print("Testing reduction times")
    print("-----------------------")
    t = timeit.repeat((lambda: arr1.sum()), number=num_times)
    print("cpu/numpy time = " + format_t(avg_t(t, num_times)))
    t = timeit.repeat((lambda: add_gpu.reduce(d_arr1)), number=num_times)
    print("cuda vectorize time = " + format_t(avg_t(t, num_times)))
    t = timeit.repeat((lambda: sum_gpu(d_arr1)), number=num_times)
    print("sum_gpu time = " + format_t(avg_t(t, num_times)))
    print()
def test_2d_times(data_len, dtype=np.float32):
    num_times = 10
    title = "Testing 2D Data, Data length = {}, data type = {}".format(data_len, dtype)
    print(len(title) * "=")
    print(title)
    print(len(title) * "=")
    t = time.time()
    arr1, arr2 = np.empty((2, data_len, data_len), dtype=dtype)
    d_arr1 = cuda.to_device(arr1)
    d_arr2 = cuda.to_device(arr2)
    d_result = cuda.device_array_like(d_arr1)
    print("Data generated in {} seconds".format(time.time() - t))
    print("d_arr1 dtype =", d_arr1.dtype)
    print("d_arr1 size = ", d_arr1.size)
    print()
    print("Testing multiplication times")
    print("----------------------------")
    t = timeit.repeat((lambda: arr1 * arr2), number=num_times)
    print("cpu/numpy time = " + format_t(avg_t(t, num_times)))
    t = timeit.repeat((lambda: mult_gpu(d_arr1, d_arr2)), number=num_times)
    print("cuda vectorize time = " + format_t(avg_t(t, num_times)))
    t = timeit.repeat((lambda: mult_gpu_2d(d_arr1, d_arr2, d_result)), number=num_times)
    print("cuda_mult_2d time = " + format_t(avg_t(t, num_times)))
    print()
    print("Testing sum times")
    print("------------------")
    t = timeit.repeat((lambda: arr1 + arr2), number=num_times)
    print("cpu/numpy time = " + format_t(avg_t(t, num_times)))
    t = timeit.repeat((lambda: add_gpu(d_arr1, d_arr2)), number=num_times)
    print("cuda vectorize time = " + format_t(avg_t(t, num_times)))
    t = timeit.repeat((lambda: add_gpu_2d(d_arr1, d_arr2, d_result)), number=num_times)
    print("cuda_add_2d time = " + format_t(avg_t(t, num_times)))
    print()
    print("Testing reduction times")
    print("-----------------------")
    t = timeit.repeat((lambda: arr1.sum()), number=num_times)
    print("cpu/numpy time = " + format_t(avg_t(t, num_times)))
    t = timeit.repeat((lambda: add_gpu.reduce(d_arr1.ravel())), number=num_times)
    print("cuda vectorize time = " + format_t(avg_t(t, num_times)))
    t = timeit.repeat((lambda: sum_gpu(d_arr1.ravel())), number=num_times)
    print("sum_gpu time = " + format_t(avg_t(t, num_times)))
    print()
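As an aside, I call the @cuda.jit kernels above without an explicit launch configuration. Numba's usual launch syntax specifies the grid explicitly; for example, inside test_1d_times the multiply call would become the following (256 threads per block is an arbitrary choice):

threads_per_block = 256  # arbitrary block size for illustration
blocks = (d_result.size + threads_per_block - 1) // threads_per_block  # ceiling division
mult_gpu_1d[blocks, threads_per_block](d_arr1, d_arr2, d_result)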
Running the test functions
numba.cuda.detect()
test_1d_times(2**24)
test_2d_times(2**12)
test_1d_times(2**24, dtype=np.float64)
test_2d_times(2**12, dtype=np.float64)
gives the following output:
Found 1 CUDA devices
id 0 b'GeForce GTX TITAN X' [SUPPORTED]
compute capability: 5.2
pci device id: 0
pci bus id: 3
Summary:
1/1 devices are supported
============================================================================
Testing 1D Data, Data length = 16777216, data type = <class 'numpy.float32'>
============================================================================
Data generated in 88.2 ms
d_arr1 dtype = float32
d_arr1 size = 16777216
Testing multiplication times
----------------------------
cpu/numpy time = 35.8 ms
cuda vectorize time = 122.8 ms
cuda_mult_1d time = 206.8 us
Testing sum times
------------------
cpu/numpy time = 35.8 ms
cuda vectorize time = 106.1 ms
cuda_add_1d time = 212.6 us
Testing reduction times
-----------------------
cpu/numpy time = 16.7 ms
cuda vectorize time = 11.1 ms
sum_gpu time = 127.3 ms
========================================================================
Testing 2D Data, Data length = 4096, data type = <class 'numpy.float32'>
========================================================================
Data generated in 0.0800013542175293 seconds
d_arr1 dtype = float32
d_arr1 size = 16777216
Testing multiplication times
----------------------------
cpu/numpy time = 35.4 ms
cuda vectorize time = 97.9 ms
cuda_mult_2d time = 208.9 us
Testing sum times
------------------
cpu/numpy time = 36.3 ms
cuda vectorize time = 94.5 ms
cuda_add_2d time = 250.8 us
Testing reduction times
-----------------------
cpu/numpy time = 16.4 ms
cuda vectorize time = 15.8 ms
sum_gpu time = 125.4 ms
============================================================================
Testing 1D Data, Data length = 16777216, data type = <class 'numpy.float64'>
============================================================================
Data generated in 171.0 ms
d_arr1 dtype = float64
d_arr1 size = 16777216
Testing multiplication times
----------------------------
cpu/numpy time = 73.2 ms
cuda vectorize time = 114.9 ms
cuda_mult_1d time = 201.9 us
Testing sum times
------------------
cpu/numpy time = 71.4 ms
cuda vectorize time = 71.0 ms
cuda_add_1d time = 217.2 us
Testing reduction times
-----------------------
cpu/numpy time = 29.0 ms
cuda vectorize time = 12.8 ms
sum_gpu time = 123.5 ms
========================================================================
Testing 2D Data, Data length = 4096, data type = <class 'numpy.float64'>
========================================================================
Data generated in 0.301849365234375 seconds
d_arr1 dtype = float64
d_arr1 size = 16777216
Testing multiplication times
----------------------------
cpu/numpy time = 73.7 ms
cuda vectorize time = 84.2 ms
cuda_mult_2d time = 226.2 us
Testing sum times
------------------
cpu/numpy time = 74.9 ms
cuda vectorize time = 84.3 ms
cuda_add_2d time = 208.7 us
Testing reduction times
-----------------------
cpu/numpy time = 29.9 ms
cuda vectorize time = 14.3 ms
sum_gpu time = 121.2 ms
It seems that the @cuda.vectorize decorated functions execute more slowly than both the CPU and the custom-written @cuda.jit functions, whereas the @cuda.jit functions give the expected order-of-magnitude speed-up and nearly constant-time performance (results not shown). On the other hand, the @cuda.reduce function runs significantly slower than either the @cuda.vectorize function or the CPU function.
Is there a reason for the poor performance of the @cuda.vectorize and @cuda.reduce functions? And is it possible to write a CUDA reduction kernel using Numba?
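For concreteness, the kind of reduction kernel I have in mind is the standard shared-memory block reduction. A minimal sketch (not benchmarked above; the block size of 256 and the float32 dtype are arbitrary choices):

from numba import cuda, float32

@cuda.jit
def block_sum(arr, partials):
    # Each block reduces 256 elements in shared memory and writes one
    # partial sum to partials[blockIdx]; the partials still have to be
    # summed afterwards (on the host, or with a second kernel pass).
    tid = cuda.threadIdx.x
    i = cuda.grid(1)
    sm = cuda.shared.array(256, float32)
    sm[tid] = arr[i] if i < arr.size else 0.0
    cuda.syncthreads()
    s = 128
    while s > 0:
        if tid < s:
            sm[tid] += sm[tid + s]
        cuda.syncthreads()
        s //= 2
    if tid == 0:
        partials[cuda.blockIdx.x] = sm[0]

Launched as block_sum[num_blocks, 256](d_arr, d_partials) with num_blocks = (d_arr.size + 255) // 256, this leaves num_blocks partial sums to add up.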
Edit:
It looks like this is a legitimate bug in Numba: https://github.com/numba/numba/issues/2266, https://github.com/numba/numba/issues/2268
1 Answer
I cannot explain the behavior of @cuda.vectorize and @cuda.reduce; sometimes the results look a bit strange to me. For example, in Negative Speed Gain Using Numba Vectorize target='cuda', @cuda.vectorize slowed the computation down, while using @cuda.jit allowed it to be accelerated. Here, I would suggest trying PyCUDA (https://documen.tician.de/pycuda/). I tested the performance of its dot product (https://documen.tician.de/pycuda/array.html) on my machine:
CPU: Intel Core2 Quad 3GHz, GPU: NVIDIA GeForce GTX 580.
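A minimal sketch of such a dot-product test with pycuda.gpuarray (a reconstruction, not the exact listing used):

import time
import numpy as np
import pycuda.autoinit          # creates a CUDA context on import
import pycuda.gpuarray as gpuarray

n = 2 ** 24
a = np.random.rand(n).astype(np.float32)
b = np.random.rand(n).astype(np.float32)

# CPU reference
t = time.time()
cpu_dot = np.dot(a, b)
print("cpu dot product:", time.time() - t)

# GPU: copy the data over once, then reduce on the device
d_a = gpuarray.to_gpu(a)
d_b = gpuarray.to_gpu(b)
t = time.time()
gpu_dot = gpuarray.dot(d_a, d_b).get()  # .get() brings back a single scalar
print("gpu dot product:", time.time() - t)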
It is necessary to note that such timings do not include the time needed to initialize and precompile the kernels, and this time can be significant. Taking it into account, the GPU code turned out slower than the CPU code in this case. At the same time, for most applications the kernels only need to be initialized once and can then be reused many times, and in that case using PyCUDA looks reasonable.
Earlier, I tested the performance of @cuda.jit, PyCUDA, and CUDA-C code on a 2D diffusion equation. I found that PyCUDA achieved almost the same performance as CUDA-C, while Numba performed worse.
[Figure: runtime comparison of @cuda.jit, PyCUDA, and CUDA-C on the 2D diffusion problem]