当前位置: 动力学知识库 > 问答 > 编程问答 >

c - OpenCL clCreateBuffer() crashes the program

问题描述:

I have taken up OpenCL programming as part of a university project and I have a bit of a problem when I try to input data to a buffer object during the clCreateBuffer() routine.

The program is a simple two-dimensional matrix addition. The code is as follows:

#define _CRT_SECURE_NO_WARNINGS

#define PROGRAM_FILE "add_kernel.cl"

#define ADD_FUNC "add_matrix"

#define MATRIX_DIM 256

#include <math.h>

#include <stdio.h>

#include <stdlib.h>

#include <string.h>

#include <time.h>

#ifdef MAC

#include <OpenCL/cl.h>

#else

#include <CL/cl.h>

#endif

/* Find a GPU associated with the first available platform */

cl_device_id create_device() {

cl_platform_id platform;

cl_device_id dev;

int err;

/* Identify a platform */

err = clGetPlatformIDs(1, &platform, NULL);

if(err < 0) {

perror("Couldn't identify a platform");

exit(1);

}

/* Access a GPU */

err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &dev, NULL);

if(err < 0) {

perror("Couldn't access any GPU type");

exit(1);

}

return dev;

}

cl_program build_program(cl_context ctx, cl_device_id dev, const char* filename) {

cl_program program;

FILE *program_handle;

char *program_buffer, *program_log;

size_t program_size, log_size;

int err;

/* Read program file and place content into buffer */

program_handle = fopen(filename, "r");

if(program_handle == NULL) {

perror("Couldn't find the program file");

exit(1);

}

fseek(program_handle, 0, SEEK_END);

program_size = ftell(program_handle);

rewind(program_handle);

program_buffer = (char*)malloc(program_size + 1);

program_buffer[program_size] = '\0';

fread(program_buffer, sizeof(char), program_size, program_handle);

fclose(program_handle);

/* Create program from file */

program = clCreateProgramWithSource(ctx, 1,

(const char**)&program_buffer, &program_size, &err);

if(err < 0) {

perror("Couldn't create the program");

exit(1);

}

free(program_buffer);

/* Build program */

err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);

if(err < 0) {

/* Find size of log and print to std output */

clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG,

0, NULL, &log_size);

program_log = (char*) malloc(log_size + 1);

program_log[log_size] = '\0';

clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG,

log_size + 1, program_log, NULL);

printf("%s\n", program_log);

free(program_log);

exit(1);

}

return program;

}

int main(){

/* Host/device data structures */

cl_device_id device;

cl_context context;

cl_command_queue queue;

cl_program program;

cl_kernel add_kernel;

size_t global_size;

cl_ulong mem_size;

cl_int i, j, err, check;

/* Data and buffers */

cl_uint matrix_dim;

float a_mat[MATRIX_DIM][MATRIX_DIM], b_mat[MATRIX_DIM][MATRIX_DIM],

c_mat[MATRIX_DIM][MATRIX_DIM], check_mat[MATRIX_DIM][MATRIX_DIM];

cl_mem a_buffer, b_buffer, c_buffer;

/* Initialize A, B, and check matrices */

srand((unsigned int)time(0));

for(i=0; i<MATRIX_DIM; i++) {

for(j=0; j<MATRIX_DIM; j++) {

a_mat[i][j] = (float)rand()/RAND_MAX;

}

}

srand((unsigned int)time(0));

for(i=0; i<MATRIX_DIM; i++) {

for(j=0; j<MATRIX_DIM; j++) {

b_mat[i][j] = (float)rand()/RAND_MAX;

check_mat[i][j] = 0.0f;

}

}

for(i=0; i<MATRIX_DIM; i++) {

for(j=0; j<MATRIX_DIM; j++) {

check_mat[i][j] += a_mat[i][j] + b_mat[i][j];

}

}

/* Create a device and context */

device = create_device();

context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);

if(err < 0) {

perror("Couldn't create a context");

exit(1);

}

/* Build the program */

program = build_program(context, device, PROGRAM_FILE);

add_kernel = clCreateKernel(program, ADD_FUNC, &err);

if(err < 0) {

perror("Couldn't create a kernel");

exit(1);

};

/* Create buffers */

a_buffer = clCreateBuffer(context,

CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,

sizeof(a_mat), a_mat, &err);

if(err < 0) {

perror("Couldn't create buffer A");

exit(1);

};

b_buffer = clCreateBuffer(context,

CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,

sizeof(b_mat), b_mat, &err);

if(err < 0) {

perror("Couldn't create buffer B");

exit(1);

};

c_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY,

sizeof(c_mat), NULL, &err);

if(err < 0) {

perror("Couldn't create buffer C");

exit(1);

};

/* Create a command queue */

queue = clCreateCommandQueue(context, device, 0, &err);

if(err < 0) {

perror("Couldn't create a command queue");

exit(1);

};

/* Create arguments for multiplication kernel */

err = clSetKernelArg(add_kernel, 0, sizeof(a_buffer), &a_buffer);

err |= clSetKernelArg(add_kernel, 1, sizeof(b_buffer), &b_buffer);

err |= clSetKernelArg(add_kernel, 2, sizeof(c_buffer), &c_buffer);

global_size = MATRIX_DIM * MATRIX_DIM;

//printf("%lu\n", global_size);

err = clEnqueueNDRangeKernel(queue, add_kernel, 1, NULL, &global_size,

NULL, 0, NULL, NULL);

if(err < 0) {

perror("Couldn't enqueue the addition kernel");

exit(1);

}

/* Read output buffer */

err = clEnqueueReadBuffer(queue, c_buffer, CL_TRUE, 0,

sizeof(c_mat), c_mat, 0, NULL, NULL);

if(err < 0) {

perror("Couldn't read the buffer");

exit(1);

}

/* Check result */

check = 1;

for(i=0; i<MATRIX_DIM; i++) {

for(j=0; j<MATRIX_DIM; j++) {

if(c_mat[i][j] != check_mat[i][j]){

check = 0;

break;

}

}

}

if(check)

printf("Addition check succeeded.\n");

else

printf("Addition check failed.\n");

/* Deallocate resources */

clReleaseMemObject(a_buffer);

clReleaseMemObject(b_buffer);

clReleaseMemObject(c_buffer);

clReleaseKernel(add_kernel);

clReleaseCommandQueue(queue);

clReleaseProgram(program);

clReleaseContext(context);

return 0;

}

The kernel code is the following:

__kernel void add_matrix(__global float* matrix_a,

__global float* matrix_b,

__global float* result) {

int i = get_global_id(0);

result[i] = matrix_a[i] + matrix_b[i];

}

Now, it works great for dimensions up to 358x358, but as soon as I put 359 in the MATRIX_DIM it crashes. It shows the usual "foo.exe has stopped working". I know it has to do something with the clCreateBuffer() command because if I remove the code from the first clCreateBuffer() and below, it runs and terminates fine, but as soon as I add even one it crashes.

The CL_DEVICE_MAX_MEM_ALLOC_SIZE option shows a number of 512MB of available memory and the data I am trying to pass is much less than that.

Is there anything I can do to increase the ammount of data I can process?

My GPU is a Radeon Sapphire HD5770

EDIT: After a suggestion in the comments I ran the debugger which yielded the following message:

Program received signal SIGSEGV, Segmentation fault.

In amdocl!_aclHsaLoader () (C:\WINDOWS\SysWOW64\amdocl.dll)

#15 0x00401355 in create_device () at C:\test\testcl.c:26

C:\test\testcl.c:26:503:beg:0x401355

I am really not sure what this means though. Any ideas?

网友答案:

The main problem is, that you allocate to much memory on the stack at these code lines, so that, you got a stack overflow:

float a_mat[MATRIX_DIM][MATRIX_DIM], b_mat[MATRIX_DIM][MATRIX_DIM],
      c_mat[MATRIX_DIM][MATRIX_DIM], check_mat[MATRIX_DIM][MATRIX_DIM];

In my test here, the execution didn't even entered the main method. You have to allocate these matrices on the heap with:

float *a_mat = calloc(MATRIX_DIM*MATRIX_DIM, sizeof(*a_mat));
float *b_mat = calloc(MATRIX_DIM*MATRIX_DIM, sizeof(*b_mat));
float *c_mat = calloc(MATRIX_DIM*MATRIX_DIM, sizeof(*c_mat));
float *check_mat = calloc(MATRIX_DIM*MATRIX_DIM, sizeof(*check_mat));

But now, you have only a 1-dimensional (1D) data-buffer for each matrix, so that, you have to change every 2D index [i][j] into the corresponding 1D index [i*MATRIX_DIM][j], e.g.:

a_mat[i*MATRIX_DIM+j] = (float)rand()/RAND_MAX;

EDIT: You have to also update the calls to clCreateBuffer und clEnqueueReadBuffer. The matrix size cannot be determined with sizeof(matrix_name) anymore (where matrix_name is one of a_mat, b_mat, ...). You have to replace every such sizeof (there are 4 of some) with MATRIX_DIM*MATRIX_DIM*sizeof(*matrix_name). Don't forget the derefence before the matrix_name, e.g.:

a_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
      MATRIX_DIM*MATRIX_DIM*sizeof(*a_mat), a_mat, &err);

(End of Edit).

Don't forget to release the data-buffers at the end:

free(a_mat);
free(b_mat);
free(c_mat);
free(check_mat);

To get even the kernel to run, I had even to fix the reading of the kernel program. The return value of ftell was always a little bit too large. The actual number of bytes is instead returned by fread. Thus, change these lines

program_buffer[program_size] = '\0';
fread(program_buffer, sizeof(char), program_size, program_handle);

to

program_size  = fread(program_buffer, sizeof(char), program_size, program_handle); // changed
program_buffer[program_size] = '\0'; // moved here
分享给朋友:
您可能感兴趣的文章:
随机阅读: