system goes into some indefinite loop - adding two vectors

Hi, i am being able to compile this code but it doesn’t give me any output and goes in some indefinite loop (or something else)… i dont understand whats happening. I’m a newbie to OpenCL and this is my first program… All i’m trying to do is add two vectors. I get no errors during the compilation. If i am not wrong, i should see the ‘End’ that i’ve printed after the gpu portion completes right?

This is the machine detail that i’m running the code on:
Linux gpu02.cluster 2.6.18-92.1.22.el5 #1 SMP Tue Dec 16 11:57:43 EST 2008 x86_64 x86_64 x86_64 GNU/Linux


#include <stdio.h>
#include <CL/cl.h>

#define SIZE 10

int va[SIZE];
int vb[SIZE];
int vc[SIZE];

char* load_program_source(const char*);

int Init(){
        int i;
        srand(20);
        for(i=0;i<SIZE;i++){
                va[i]=rand()%10;
                vb[i]=rand()%10;
        }
}

char* load_program_source(const char *filename)
{
        int fileSize = 0;
        FILE *pFile = fopen(filename, "r");
        rewind(pFile);
        fseek(pFile, 0, SEEK_END);
        fileSize = ftell(pFile);
        rewind(pFile);

        char *data = (char*) calloc(sizeof(char), fileSize+1);
        fread(data, 1, fileSize, pFile);
        data[fileSize]='\0';
        fclose(pFile);
        return data;
}

int main(){
        if(!Init()){
                printf("Unable to initialize data");
                return 1;
        }

        cl_context GPUContext = clCreateContextFromType(0,CL_DEVICE_TYPE_GPU, NULL, NULL, NULL);
        if(!GPUContext){
                printf("Error: Failed to create context");
                return 1;
        }

        //Get the list of GPU devices associated with this context
        size_t ParmDataBytes;
        clGetContextInfo(GPUContext, CL_CONTEXT_DEVICES, 0, NULL, &ParmDataBytes);
        cl_device_id* GPUDevices = (cl_device_id*)malloc(ParmDataBytes);
        clGetContextInfo(GPUContext, CL_CONTEXT_DEVICES, ParmDataBytes,GPUDevices,NULL);

        //Create a command queue on first gpu device
        cl_command_queue GPUCommandQueue = clCreateCommandQueue(GPUContext, GPUDevices[0],0,NULL);
        if(!GPUCommandQueue){
                printf("Error: Failed to create a command queue");
                return 1;
        }

        //Allocate memory
        cl_mem GPUva = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int)*SIZE, va, NULL);
        cl_mem GPUvb = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int)*SIZE, vb, NULL);
        cl_mem GPUvc = clCreateBuffer(GPUContext, CL_MEM_WRITE_ONLY, sizeof(int)*SIZE, NULL, NULL);

        //Create OCL program reading the source code from the file
        char * OclSource = load_program_source("vectoradd.cl");
        cl_program  OpenCLProgram = clCreateProgramWithSource(GPUContext,1,(const char**)&OclSource,NULL,NULL);

        //Build the program
        clBuildProgram(OpenCLProgram,0,NULL,NULL,NULL,NULL);

        //obtain the handle for the kernel
        cl_kernel OpenCLVectorAdd = clCreateKernel(OpenCLProgram,"VectorAdd",NULL);

       //associate GPU memory with the kernel
        clSetKernelArg(OpenCLVectorAdd, 0, sizeof(cl_mem), (void*)&GPUvc);
        clSetKernelArg(OpenCLVectorAdd, 1, sizeof(cl_mem), (void*)&GPUvb);
        clSetKernelArg(OpenCLVectorAdd, 2, sizeof(cl_mem), (void*)&GPUva);

        //Launch the kernel in the GPU
        size_t WorkSize[1] = {SIZE};
        clEnqueueNDRangeKernel(GPUCommandQueue, OpenCLVectorAdd,1,NULL,WorkSize,NULL,0,NULL,NULL);

        //copy the result back to the main memory
        clEnqueueReadBuffer(GPUCommandQueue, GPUvc, CL_TRUE,0,sizeof(int) * SIZE, vc, 0, NULL, NULL);

        //cleanup
        free(GPUDevices);
        clReleaseKernel(OpenCLVectorAdd);
        clReleaseProgram(OpenCLProgram);
        clReleaseCommandQueue(GPUCommandQueue);
        clReleaseContext(GPUContext);
        clReleaseMemObject(GPUva);
        clReleaseMemObject(GPUvb);
        clReleaseMemObject(GPUvc);

        printf("End");
        return 0;
}


this is my kernel code:


__kernel void VectorAdd(__global int* vc, __global int* vb, __global int* va){
        int i;
        for(i=0;i<100;i++){
                vc[i]=vb[i]+va[i];
        }
}

could anyone please help me with it?

Thanks a lot

Size in the c code is 10. Loop in the kernel loops 100 times. Result is that you overflow your memory objects.

With that change, and the addition of a print loop:

0 + 6 = 6
5 + 7 = 12
0 + 2 = 2
0 + 9 = 9
9 + 4 = 13
1 + 0 = 1
8 + 0 = 8
3 + 0 = 3
9 + 0 = 9
1 + 3 = 4

Runs fine on CPU and GPU on my MacBook Pro (10.6.2)

well how do you run the code? Its just like running a simple .c code isn’t it? or is there something else to do, because it’s not changing the output when i change that to 10.
I compile the code like:

gcc -o vectoradd vectoradd.c -lOpenCL
vectoradd.c: In function ‘load_program_source’:
vectoradd.c:30: warning: incompatible implicit declaration of built-in function ‘calloc’
vectoradd.c: In function ‘main’:
vectoradd.c:52: warning: incompatible implicit declaration of built-in function ‘malloc’

and i run the code like:

cl]$ ./vectoradd

do i need to specify anything else?

and one more thing. I’m connecting to a remote server and working on it using ssh, should that cause any problems in producing the output on the screen?

Sorry for all the trouble folks - it was the stupid PC that causing problems, not the code in itself. Thanks a lot.