CUDA C程序没报错结果不对找不到问题的一种可能的问题hhhh....

2024-07-10 12:12| 来源: 网络整理| 查看: 265

你看标题hhhh, 我也想笑但其实它更好笑：在这里插入图片描述 hhhhhhhhhhhhhhhhhhhhh

就是你的程序没报错，但是结果不对，找不到问题，不妨看看这个hhhhh

两年前有幸上过程润伟老师讲的CUDA C高性能编程引论，当时的课只有一周，不过，老师讲得风趣幽默，以至于我现在一些重要的点都还记得hhhh

不扯了，切入正题：

dim3 threads_per_block(64, 64, 1); dim3 number_of_blocks(16, 16, 1);

这个看上去好像没啥问题，但实际上它已经超过了每个线程块的最大线程数：64 × 64 × 1 = 4096，而每个线程块最多只能有 1024 个线程，没错就这个简单的东西…

原问题，参考附录

但是它执行并不会报错的，所以需要手动去读取错误，如下面所示：

（PS：顺便学到了CUDA的错误处理，两年前我一直不知道程润伟老师这个操作到底是啥意思）

创建一个包装 CUDA 函数调用的宏对于检查错误十分有用。以下是一个宏示例，可以在余下练习中随时使用： (摘自NVIDIA官方CUDA C编程教学notebook)

#include <stdio.h> #include <assert.h> inline cudaError_t checkCuda(cudaError_t result) { if (result != cudaSuccess) { fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result)); assert(result == cudaSuccess); } return result; } int main() { /* * The macro can be wrapped around any function returning * a value of type `cudaError_t`. */ checkCuda( cudaDeviceSynchronize() ) }

但是我在实际使用的时候，发现这个玩意就是个玩具，一点儿都不好用，这是我看到的另一种写法：

dim3 tblocks(32, 16, 1); dim3 grid((nj/tblocks.x)+1, (ni/tblocks.y)+1, 1); cudaError_t ierrSync, ierrAsync; // Execute the modified version using same data for (istep=0; istep < nstep; istep++) { step_kernel_mod<<< grid, tblocks >>>(ni, nj, tfac, temp1, temp2); ierrSync = cudaGetLastError(); ierrAsync = cudaDeviceSynchronize(); if (ierrSync != cudaSuccess) { printf("Sync error: %s\n", cudaGetErrorString(ierrSync)); } if (ierrAsync != cudaSuccess) { printf("Async error: %s\n", cudaGetErrorString(ierrAsync)); } // swap the temperature pointers 和理解无关的玩意儿 temp_tmp = temp1; temp1 = temp2; temp2= temp_tmp; }

上边那个玩意儿还不如 cudaGetLastError 有用hhh，另外我也懒得解释上边这几句都啥意思了，大家一看就懂，这个写法其实还可以封装一下，今天就只在这里做个记录

附录

原问题是将 step_kernel_mod 改写为 CUDA 核函数，并调用它

#include #include // Simple define to index into a 1D array from 2D space #define I2D(num, c, r) ((r)*(num)+(c)) /* * `step_kernel_mod` is currently a direct copy of the CPU reference solution * `step_kernel_ref` below. Accelerate it to run as a CUDA kernel. */ void step_kernel_mod(int ni, int nj, float fact, float* temp_in, float* temp_out) { int i00, im10, ip10, i0m1, i0p1; float d2tdx2, d2tdy2; // loop over all points in domain (except boundary) for ( int j=1; j // find indices into linear memory // for central point and neighbours i00 = I2D(ni, i, j); im10 = I2D(ni, i-1, j); ip10 = I2D(ni, i+1, j); i0m1 = I2D(ni, i, j-1); i0p1 = I2D(ni, i, j+1); // evaluate derivatives d2tdx2 = temp_in[im10]-2*temp_in[i00]+temp_in[ip10]; d2tdy2 = temp_in[i0m1]-2*temp_in[i00]+temp_in[i0p1]; // update temperatures temp_out[i00] = temp_in[i00]+fact*(d2tdx2 + d2tdy2); } } } void step_kernel_ref(int ni, int nj, float fact, float* temp_in, float* temp_out) { int i00, im10, ip10, i0m1, i0p1; float d2tdx2, d2tdy2; // loop over all points in domain (except boundary) for ( int j=1; j // find indices into linear memory // for central point and neighbours i00 = I2D(ni, i, j); im10 = I2D(ni, i-1, j); ip10 = I2D(ni, i+1, j); i0m1 = I2D(ni, i, j-1); i0p1 = I2D(ni, i, j+1); // evaluate derivatives d2tdx2 = temp_in[im10]-2*temp_in[i00]+temp_in[ip10]; d2tdy2 = temp_in[i0m1]-2*temp_in[i00]+temp_in[i0p1]; // update temperatures temp_out[i00] = temp_in[i00]+fact*(d2tdx2 + d2tdy2); } } } int main() { int istep; int nstep = 200; // number of time steps // Specify our 2D dimensions const int ni = 200; const int nj = 100; float tfac = 8.418e-5; // thermal diffusivity of silver float *temp1_ref, *temp2_ref, *temp1, *temp2, *temp_tmp; const int size = ni * nj * sizeof(float); temp1_ref = (float*)malloc(size); temp2_ref = (float*)malloc(size); temp1 = (float*)malloc(size); temp2 = (float*)malloc(size); // Initialize with random data for( int i = 0; i step_kernel_ref(ni, nj, tfac, temp1_ref, temp2_ref); // 
swap the temperature pointers temp_tmp = temp1_ref; temp1_ref = temp2_ref; temp2_ref= temp_tmp; } // Execute the modified version using same data for (istep=0; istep if (abs(temp1[i]-temp1_ref[i]) > maxError) { maxError = abs(temp1[i]-temp1_ref[i]); } } // Check and see if our maxError is greater than an error bound if (maxError > 0.0005f) printf("Problem! The Max Error of %.5f is NOT within acceptable bounds.\n", maxError); else printf("The Max Error of %.5f is within acceptable bounds.\n", maxError); free( temp1_ref ); free( temp2_ref ); free( temp1 ); free( temp2 ); return 0; }

NVIDIA 教程给的标准答案：

#include #include // Simple define to index into a 1D array from 2D space #define I2D(num, c, r) ((r)*(num)+(c)) __global__ void step_kernel_mod(int ni, int nj, float fact, float* temp_in, float* temp_out) { int i00, im10, ip10, i0m1, i0p1; float d2tdx2, d2tdy2; int j = blockIdx.x * blockDim.x + threadIdx.x; int i = blockIdx.y * blockDim.y + threadIdx.y; // loop over all points in domain (except boundary) if (j > 0 && i > 0 && j int i00, im10, ip10, i0m1, i0p1; float d2tdx2, d2tdy2; // loop over all points in domain (except boundary) for ( int j=1; j // find indices into linear memory // for central point and neighbours i00 = I2D(ni, i, j); im10 = I2D(ni, i-1, j); ip10 = I2D(ni, i+1, j); i0m1 = I2D(ni, i, j-1); i0p1 = I2D(ni, i, j+1); // evaluate derivatives d2tdx2 = temp_in[im10]-2*temp_in[i00]+temp_in[ip10]; d2tdy2 = temp_in[i0m1]-2*temp_in[i00]+temp_in[i0p1]; // update temperatures temp_out[i00] = temp_in[i00]+fact*(d2tdx2 + d2tdy2); } } } int main() { int istep; int nstep = 200; // number of time steps // Specify our 2D dimensions const int ni = 200; const int nj = 100; float tfac = 8.418e-5; // thermal diffusivity of silver float *temp1_ref, *temp2_ref, *temp1, *temp2, *temp_tmp; const int size = ni * nj * sizeof(float); temp1_ref = (float*)malloc(size); temp2_ref = (float*)malloc(size); cudaMallocManaged(&temp1, size); cudaMallocManaged(&temp2, size); // Initialize with random data for( int i = 0; i step_kernel_ref(ni, nj, tfac, temp1_ref, temp2_ref); // swap the temperature pointers temp_tmp = temp1_ref; temp1_ref = temp2_ref; temp2_ref= temp_tmp; } dim3 tblocks(32, 16, 1); dim3 grid((nj/tblocks.x)+1, (ni/tblocks.y)+1, 1); cudaError_t ierrSync, ierrAsync; // Execute the modified version using same data for (istep=0; istep printf("Sync error: %s\n", cudaGetErrorString(ierrSync)); } if (ierrAsync != cudaSuccess) { printf("Async error: %s\n", cudaGetErrorString(ierrAsync)); } // swap the temperature pointers temp_tmp = temp1; temp1 = temp2; temp2= temp_tmp; } 
float maxError = 0; // Output should always be stored in the temp1 and temp1_ref at this point for( int i = 0; i maxError = abs(temp1[i]-temp1_ref[i]); } } // Check and see if our maxError is greater than an error bound if (maxError > 0.0005f) printf("Problem! The Max Error of %.5f is NOT within acceptable bounds.\n", maxError); else printf("The Max Error of %.5f is within acceptable bounds.\n", maxError); free( temp1_ref ); free( temp2_ref ); cudaFree( temp1 ); cudaFree( temp2 ); return 0; }

【本文地址】

公司简介

联系我们

CUDA C程序没报错 结果不对 找不到问题 的一种可能的问题hhhh....

CUDA C程序没报错结果不对找不到问题的一种可能的问题hhhh....