当前位置: 动力学知识库 > 问答 > 编程问答 >

gpu - CUDA programming assignment

问题描述:

I am trying to set only the first element of an array to 5.0 (say). ie, just one of the thread shall set a values and rest of the other threads don't do anything.

Here is my complete code

#include <stdio.h>

#include <cuda.h>

#define GPUERRCHK(ans) { gpuAssert((ans), __FILE__, __LINE__); }

inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)

{

if (code != cudaSuccess)

{

fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);

if (abort) exit(code);

}

}

void writeBuf( char * fName, float * out_frame, int dim )

{

FILE * fp = fopen( fName, "w+" );

int baseIndx = 0;

for( int i=0 ; i<dim ; i++ )

{

for( int j=0 ; j<dim ; j++ )

{

fprintf( fp, "%f ", out_frame[ dim + j ] );

}

baseIndx += dim;

fprintf( fp, "\n" );

}

fclose( fp );

}

__global__ void kernel( float * s1, float * s2, int dim, int * hx, int *hy, float *hT, int nHeaters )

{

int x = threadIdx.x + blockIdx.x*blockDim.x;

int y = threadIdx.y + blockIdx.y*blockDim.y;

int offset = x + y*blockDim.x*gridDim.x;

if( offset < 1 )

{

s2[0] = 1.0;

}

__syncthreads();

}

int main()

{

srand48( time(NULL) );

int dim = 1024;

float *dev_s1, *dev_s2;

GPUERRCHK( cudaMalloc( (void**)&dev_s1, dim*dim * sizeof(float) ));

GPUERRCHK( cudaMalloc( (void**)&dev_s2, dim*dim * sizeof(float) ));

GPUERRCHK( cudaMemset( dev_s1, 0x00, dim*dim * sizeof(float) ));

GPUERRCHK( cudaMemset( dev_s2, 0x00, dim*dim * sizeof(float) ));

//heaters

int *dev_hx, *dev_hy;

float *dev_hT;

int nHeaters = 20;

GPUERRCHK( cudaMalloc( (void**)&dev_hx, nHeaters * sizeof(int) ));

GPUERRCHK( cudaMalloc( (void**)&dev_hy, nHeaters * sizeof(int) ));

GPUERRCHK( cudaMalloc( (void**)&dev_hT, nHeaters * sizeof(float) ));

//init heaters on cpu

int * hx, *hy;

float * hT;

hx = (int*) malloc( nHeaters * sizeof(int) );

hy = (int*) malloc( nHeaters * sizeof(int) );

hT = (float*) malloc( nHeaters * sizeof(float) );

for( int i=0 ; i<nHeaters ; i++ )

{

hx[i] = (int) ((float)drand48() * (float)dim) + 5;

hy[i] = (int) (drand48() * dim) + 5;

hT[i] = (float) (drand48() * 100) + 50;

}

//transfer hx, hy, hT to GPU

GPUERRCHK( cudaMemcpy( dev_hx, hx, nHeaters * sizeof(int), cudaMemcpyHostToDevice ));

GPUERRCHK( cudaMemcpy( dev_hy, hy, nHeaters * sizeof(int), cudaMemcpyHostToDevice ));

GPUERRCHK( cudaMemcpy( dev_hT, hT, nHeaters * sizeof(float), cudaMemcpyHostToDevice ));

float *out_frame = (float *) malloc( dim*dim*sizeof(float) );

// run kernel

int nThreadsPerBlock = 16;

int nBlockX = (dim+nThreadsPerBlock-1)/nThreadsPerBlock;

int nBlockY = (dim+nThreadsPerBlock-1)/nThreadsPerBlock;

kernel<<< dim3(nBlockX, nBlockY), dim3(nThreadsPerBlock, nThreadsPerBlock) >>>( dev_s1, dev_s2, dim, dev_hx, dev_hy, dev_hT, nHeaters );

GPUERRCHK( cudaPeekAtLastError() );

GPUERRCHK( cudaDeviceSynchronize() );

// collect result

GPUERRCHK( cudaMemcpy( out_frame, dev_s2, dim*dim * sizeof(float), cudaMemcpyDeviceToHost ) );

int f=1;

char fName[100];

snprintf( fName, 100, "out/file_%04d.data", f );

writeBuf( fName, out_frame, dim );

cudaFree( dev_s1 );

cudaFree( dev_s2 );

free( out_frame );

}

When I run this, the file contains all zeros. How do I achieve what I plan to achieve?

What could be the problem?

网友答案:

Your problem is in writeBuf(..) @line:

fprintf( fp, "%f ", out_frame[ dim + j ] );

dim is 1024 and you are accessing elements from 1024 to 1024+dim-1 and that is why you never see the first element.
The correct line should be:

fprintf( fp, "%f ", out_frame[ baseIndx + j ] );
分享给朋友:
您可能感兴趣的文章:
随机阅读: