Cuda - 内核执行后设备值为0-Java 学习之路

-3

出于某种原因，当我执行程序时，设备变量的值为零 . 就在我执行cuda内核之前，设备变量具有正确的值 . 输出图像仅为原始图像尺寸的黑色 . 所有内存分配以及与主机的复制似乎都是正确的 .

谢谢你的帮助！

// Includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>

#ifdef _WIN32
#  define WINDOWS_LEAN_AND_MEAN
#  define NOMINMAX
#  include <windows.h>
#endif

#define Image_Size 512
#define Kernel_Size 3

// Includes CUDA
#include <cuda_runtime.h>

// Utilities and timing functions
#include "./inc/helper_functions.h"    // includes cuda.h and cuda_runtime_api.h

// CUDA helper functions
#include "./inc/helper_cuda.h"         // helper functions for CUDA error check

const char *imageFilename = "lena_bw.pgm";

const char *sampleName = "simpleTexture";

#define C_PI 3.141592653589793238462643383279502884197169399375

void __global__ SwirlCu(int width, int height, int stride, float *pRawBitmapOrig, float *pBitmapCopy, double factor)
{
    // This function effectively swirls an image
    // This CUDA kernel is basically the exact same code as the CPU-only, except it has a slightly different setup
    // Each thread on the GPU will process exactly one pixel
    // Before doing anything, we need to determine the current pixel we are calculating in this thread
    // Original code used i as y, and j as x. We will do the same so we can just re-use CPU code in the CUDA kernel

    int i = blockIdx.y * blockDim.y + threadIdx.y;
    int j = blockIdx.x * blockDim.x + threadIdx.x;
    // Test to see if we're testing a valid pixel
    if (i >= height || j >= width) return;  // Don't bother doing the calculation. We're not in a valid pixel location

    double cX = (double)width/2.0f;
    double cY = (double)height/2.0f;
    double relY = cY-i;
    double relX = j-cX;
    // relX and relY are points in our UV space
    // Calculate the angle our points are relative to UV origin. Everything is in radians.
    double originalAngle;
    if (relX != 0)
    {
        originalAngle = atan(abs(relY)/abs(relX));
        if ( relX > 0 && relY < 0) originalAngle = 2.0f*C_PI - originalAngle;
        else if (relX <= 0 && relY >=0) originalAngle = C_PI-originalAngle;
        else if (relX <=0 && relY <0) originalAngle += C_PI;
    }
    else
    {
        // Take care of rare special case
        if (relY >= 0) originalAngle = 0.5f * C_PI;
        else originalAngle = 1.5f * C_PI;
    }
    // Calculate the distance from the center of the UV using pythagorean distance
    double radius = sqrt(relX*relX + relY*relY);
    // Use any equation we want to determine how much to rotate image by
    //double newAngle = originalAngle + factor*radius;  // a progressive twist
    double newAngle = originalAngle + 1/(factor*radius+(4.0f/C_PI));
    // Transform source UV coordinates back into bitmap coordinates
    int srcX = (int)(floor(radius * cos(newAngle)+0.5f));
    int srcY = (int)(floor(radius * sin(newAngle)+0.5f));
    srcX += cX;
    srcY += cY;
    srcY = height - srcY;
    // Clamp the source to legal image pixel
    if (srcX < 0) srcX = 0;
    else if (srcX >= width) srcX = width-1;
    if (srcY < 0) srcY = 0;
    else if (srcY >= height) srcY = height-1;
    // Set the pixel color
    // Since each thread writes to exactly 1 unique pixel, we don't have to do anything special here
    pRawBitmapOrig[i*stride/4 + j] = pBitmapCopy[srcY*stride/4 + srcX];
}




////////////////////////////////////////////////////////////////////////////////
// Declaration, forward
void runTest(int argc, char **argv);

////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    printf("%s starting...\n", sampleName);

    // Process command-line arguments
    if (argc > 1)
    {
        if (checkCmdLineFlag(argc, (const char **) argv, "input"))
        {
            getCmdLineArgumentString(argc,(const char **) argv,"input",(char **) &imageFilename);
        }
        else if (checkCmdLineFlag(argc, (const char **) argv, "reference"))
        {
            printf("-reference flag should be used with -input flag");
            exit(EXIT_FAILURE);
        }
    }

    runTest(argc, argv);

    cudaDeviceReset();
    printf("%s completed",
           sampleName);
    //exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
}

////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, char **argv)
{
    int devID = findCudaDevice(argc, (const char **) argv);
    unsigned int kernel_bytes = Kernel_Size * Kernel_Size * sizeof(float);
    // load image from disk
    float *hData = NULL;
    float *host_array_kernel = 0;

    float *device_array_Image = 0;
    float *device_array_kernel = 0;
    float *device_array_Result = 0;


    unsigned int width, height;
    char *imagePath = sdkFindFilePath(imageFilename, argv[0]);

    if (imagePath == NULL)
    {
        printf("Unable to source image file: %s\n", imageFilename);
        exit(EXIT_FAILURE);
    }

    sdkLoadPGM(imagePath, &hData, &width, &height);

    unsigned int size = width * height * sizeof(float);
    printf("Loaded '%s', %d x %d pixels\n", imageFilename, width, height);

    // Allocation of device arrays using CudaMalloc
    cudaMalloc((void**)&device_array_Image, size);
    cudaMalloc((void**)&device_array_kernel, kernel_bytes);
    cudaMalloc((void**)&device_array_Result, size);


    host_array_kernel = (float*)malloc(kernel_bytes); // kernel


   // Allocate mem for the result on host side
   float *hOutputDataSharp = (float *) malloc(size);

    GenerateKernel (host_array_kernel);


// copy arrays and kernel from host to device
    checkCudaErrors(cudaMemcpy(device_array_Image, hData, size, cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(device_array_kernel, host_array_kernel, kernel_bytes, cudaMemcpyHostToDevice));



    dim3 dimBlock(16, 16, 1);
    dim3 dimGrid(width / dimBlock.x, height / dimBlock.y, 1);

    //Do the Convolution
    printf("DImage : '%.8f'\n",device_array_Image);
    printf("DKernel : '%.8f'\n",device_array_kernel);
    //serialConvolution(hData, host_array_kernel ,hOutputDataSharp);


    SwirlCu<<<512, 512>>>(width, height, width*4, device_array_Image,device_array_Result, 0.005f);
    printf("DResult : '%.8f'\n",device_array_Result);
    checkCudaErrors(cudaDeviceSynchronize());
    cudaMemcpy(hOutputDataSharp,device_array_Result, size, cudaMemcpyDeviceToHost);
    printf("HResult : '%.8f'\n",hOutputDataSharp);
    // Write result to file
    char outputSharp[1024];

    strcpy(outputSharp, imagePath);
    strcpy(outputSharp, "data/serial_sharptest.pgm");
    sdkSavePGM(outputSharp, hOutputDataSharp, width, height);

    cudaFree(device_array_Result);
    cudaFree(device_array_Image);
    cudaFree(device_array_kernel);
    free(hData);
    free(imagePath);
    //free(host_array_Image);
    free(host_array_kernel);
    free(hOutputDataSharp);
    //free(hOutputImage);
    //free(hOutputKernel);
}

1 回答

1
您的代码正在写入源图像：
```
pRawBitmapOrig[i*stride/4 + j] = pBitmapCopy[srcY*stride/4 + srcX];
```
写入 device_array_Image 这是源，而不是您期望的目的地 .

此外，我对 printf("DResult : '%.8f'\n",device_array_Result); 的输出非常好奇，因为 device_array_Result 在GPU地址空间中并且分配了 cudaMalloc . 你在哪台设备上运行？
回复于 2024-04-29T04:36:33+08:00

Cuda - 内核执行后设备值为0

1 回答

相关问题