我用GPU计算X^2-X+1=0在[1,2]内的解,把[a,b]等分成64份,交给64个线程处理,找出根的范围
源代码如下://////////////////////////////////////////////////////////////////////////////
// Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
//
// Please refer to the NVIDIA end user license agreement (EULA) associated
// with this source code for terms and conditions that govern your use of
// this software. Any use, reproduction, disclosure, or distribution of
// this software and related documentation outside the terms of the EULA
// is strictly prohibited.
//
////////////////////////////////////////////////////////////////////////////
/* Template project which demonstrates the basics on how to setup a project
- example application.
- Host code.
*/
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include
#define F(x) (x*x-x+1)
// includes CUDA
#include <cuda_runtime.h>
// includes, project
#include <helper_cuda.h>
#include <helper_functions.h> // helper functions for SDK examples
////////////////////////////////////////////////////////////////////////////////
// declaration, forward
void runTest(int argc, char **argv);
using std::endl;
using std::cout;
global void
FindAnswer(float *g_A,float *g_B, float *g_C)
{
int tid=threadIdx.x;
int bid=blockIdx.x;
float a=*g_A;
float b=*g_B;
float New_a,New_b;
do
{
New_a=*g_A+(g_B-g_A)/64(bidblockDim.x+tid);
New_b=g_A+(g_B-g_A)/64(bidblockDim.x+tid+1);
if((F(New_a))(F(New_b))<0)
{
*g_A=New_a;
*g_B=New_b;
}
__syncthreads();
}
while(fabs(*g_B-*g_A)>0.001);
if(!(a==*g_A&&b==g_B)&&(bidblockDim.x+tid)==0)
*g_C=(*g_A+*g_B)/2;
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main(int argc, char **argv)
{
runTest(argc, argv);
}
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
void
runTest(int argc, char **argv)
{
bool bTestResult = true;
printf(“%s Starting…\n\n”, argv[0]);
cudaError_t err=cudaSuccess;
// use command-line specified CUDA device, otherwise use device with highest Gflops/s
int devID = findCudaDevice(argc, (const char **)argv);
StopWatchInterface *timer = 0;
sdkCreateTimer(&timer);
sdkStartTimer(&timer);
unsigned int num_threads = 32;
size_t mem_size = sizeof(float) *1;
// allocate host memory
float *h_A = (float *) malloc(mem_size);
float *h_B = (float *) malloc(mem_size);
// initalize the memory
h_A[0]=1;
h_B[0]=2;
// allocate device memory
float *d_A=NULL;
float *d_B=NULL;
err=cudaMalloc((void **) &d_A, mem_size);
err=cudaMalloc((void **) &d_B, mem_size);
// copy host memory to device
err=cudaMemcpy(d_A, h_A, mem_size,
cudaMemcpyHostToDevice);
err=cudaMemcpy(d_B, h_B, mem_size,
cudaMemcpyHostToDevice);
// allocate device memory for result
float *d_C;
checkCudaErrors(cudaMalloc((void **) &d_C, mem_size));
// setup execution parameters
//dim3 grid(2, 1, 1);
//dim3 threads(num_threads, 1, 1);
// execute the kernel
FindAnswer<<<2,32>>>(d_A, d_B,d_C);
// check if kernel execution generated and error
getLastCudaError(“Kernel execution failed”);
// allocate mem for the result on host side
float *h_C = (float *) malloc(mem_size);
// copy result from device to host
checkCudaErrors(cudaMemcpy(h_C, d_C, mem_size,
cudaMemcpyDeviceToHost));
sdkStopTimer(&timer);
printf(“Processing time: %f (ms)\n”, sdkGetTimerValue(&timer));
sdkDeleteTimer(&timer);
std::cout<<“解为”<<*h_C;
// compute reference solution
//float *reference = (float *) malloc(mem_size);
//computeGold(reference, h_idata, num_threads);
// check result
/* if (checkCmdLineFlag(argc, (const char **) argv, “regression”))
{
// write file for regression test
sdkWriteFile(“./data/regression.dat”, h_odata, num_threads, 0.0f, false);
}
else
{
// custom output handling when no regression test running
// in this case check if the result is equivalent to the expected soluion
bTestResult = compareData(reference, h_odata, num_threads, 0.0f, 0.0f);
}*/
// cleanup memory
free(h_A);
free(h_B);
free(h_C);
//free(reference);
checkCudaErrors(cudaFree(d_A));
checkCudaErrors(cudaFree(d_B));
checkCudaErrors(cudaFree(d_C));
cudaDeviceReset();
exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE);
}
这个程序执行时,在执行上面红色标注的行时,出错了,我单步调试到这里时,显示器就黑屏一会,下面在执行checkCudaErrors(cudaFree(d_C))时也出错了,究竟是怎么回事啊?
我不知道在VC里如何单步调试内核函数,并且看到GPU里的变量