Hello, I’m trying to do some calculations using OpenCL, but I’m not getting the results I was expecting. The code was run in two tests: one on two Tesla T10 processors, and another on a single-core 2 GHz Intel CPU.
These were the results:
CPU: Overall: 13 files, 20563 KB in 0.4 s Throughput: 51407KB/s 49MB/s
2 GPUs: Overall: 13 files, 20563 KB in 25.57 s Throughput: 804KB/s 0MB/s
What am I doing wrong? How can I get the most out of the two GPUs? What am I missing?
This is the API:
#ifndef __OCLSTACK_H__
#define __OCLSTACK_H__
#include <CL/cl.h>
#include <stdlib.h>
#include <stdio.h>
/* Capacity of every per-device array in struct _pfs. */
#define MAX_ENTRY_DEVICES 8

/* Device classes; each enumerator carries the value of the matching
 * OpenCL CL_DEVICE_TYPE_* constant. */
enum PFS_DEVICE_T {
    CPU = CL_DEVICE_TYPE_CPU,
    GPU = CL_DEVICE_TYPE_GPU,
    ALL = CL_DEVICE_TYPE_ALL
};

typedef struct _pfs pfs_t;

/*
 * Handler holding the OpenCL objects used by the pfs_* functions.
 * All arrays have MAX_ENTRY_DEVICES slots; pfs_run() only touches slot 0
 * of contexts, commands and kernels.
 */
struct _pfs{
    cl_platform_id platform;
    cl_device_id device[MAX_ENTRY_DEVICES];
    cl_device_type deviceType[MAX_ENTRY_DEVICES];
    cl_uint num_devices;
    cl_uint num_entry_devices;   /* NOTE(review): relation to num_devices is not visible here - confirm */
    cl_ulong maxAllocSizes[MAX_ENTRY_DEVICES];
    cl_context contexts[MAX_ENTRY_DEVICES]; /* Combinations amongst number of devices */
    cl_context_properties properties[3];
    cl_command_queue commands[MAX_ENTRY_DEVICES];
    cl_program programs[MAX_ENTRY_DEVICES];
    cl_kernel kernels[MAX_ENTRY_DEVICES];
    void* kargs[MAX_ENTRY_DEVICES];
    const char* kernelName[MAX_ENTRY_DEVICES];
    const char* kernelSource;
};
/* NOTE(review): definition not visible here; presumably prepares `handler`
 * for devices of class `dt` using the kernel source in `kernelFile` - confirm. */
int
pfs_initiate( pfs_t* handler, const char* kernelFile, enum PFS_DEVICE_T dt );
/* NOTE(review): definition not visible here; presumably creates the kernel(s)
 * named `kName` in `h` - confirm. */
int
pfs_createKernels( pfs_t* h, const char* kName );
/* NOTE(review): definition not visible here; presumably releases what
 * pfs_initiate() acquired - confirm. */
int
pfs_terminate( pfs_t* handler );
/*
 * Runs h->kernels[0] once (a single work-item) on h->commands[0] over one
 * chunk of input and reads the four-word result back into `state`.
 *
 * state : four unsigned ints, used as both kernel input and output.
 * input : chunk data; the buffer handed to the kernel covers the most recent
 *         non-zero `size` bytes.
 * size  : byte count of this chunk. When it is 0 the previous non-zero size
 *         is reused for the data buffer and the queue is finished instead of
 *         flushed; `size` itself is also passed to the kernel as its 4th
 *         argument.
 *
 * Returns CL_SUCCESS on success, otherwise the value returned by the
 * matching pfs__trace_* helper.
 */
int
pfs_run( pfs_t* h, unsigned int *state, unsigned char* input, unsigned int size );
#endif
#include "pfs.h"
#include "pfserr.h"
#include <CL/cl.h>
/*
 * pfs_run - feed one chunk of input to kernel 0 on command queue 0.
 *
 * Wraps `input`, `state`, the chunk length and `size` in OpenCL buffers
 * (CL_MEM_USE_HOST_PTR), binds them as kernel arguments 0..3, launches the
 * kernel with a global work size of 1 and reads the four-word result back
 * into `state` with a blocking read.
 *
 * h     : handler; only contexts[0], commands[0] and kernels[0] are used.
 * state : four unsigned ints, read by the kernel and overwritten with its
 *         result.
 * input : chunk data; the buffer covers `last_size` bytes (see below).
 * size  : byte count of this chunk. A value of 0 keeps the previous non-zero
 *         length for the data buffer and makes the call finish the queue
 *         rather than just flush it. `size` itself is kernel argument 3.
 *
 * Returns CL_SUCCESS on success; on failure, prints a diagnostic and returns
 * whatever the corresponding pfs__trace_* helper returns. All buffers
 * created by this call are released on every path.
 */
int
pfs_run( pfs_t* h, unsigned int *state, unsigned char* input, unsigned int size ){
    /* Remembers the most recent non-zero chunk length across calls so that a
     * call with size == 0 still describes the previous block to the kernel. */
    static unsigned int last_size = 0;

    cl_mem _block = NULL, _result = NULL, _size = NULL, _last = NULL;
    const size_t global = 1;
    int kernel_pending = 0;   /* set while an enqueued kernel may still be running */
    cl_int err = CL_SUCCESS;
    int rc = CL_SUCCESS;

    if (size != 0)
        last_size = size;

    /* create buffers for the input and output */
    _block = clCreateBuffer(h->contexts[0], CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
                            sizeof(unsigned char) * last_size, input, &err);
    if (err != CL_SUCCESS) {
        printf("[-] Unable to create a buffer object. BLOCK\n");
        rc = pfs__trace_clCreateBuffer(err);
        goto cleanup;
    }
    _result = clCreateBuffer(h->contexts[0], CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
                             sizeof(unsigned int) * 4, state, &err);
    if (err != CL_SUCCESS) {
        printf("[-] Unable to create a buffer object. RESULT\n");
        rc = pfs__trace_clCreateBuffer(err);
        goto cleanup;
    }
    _size = clCreateBuffer(h->contexts[0], CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
                           sizeof(unsigned int), &last_size, &err);
    if (err != CL_SUCCESS) {
        printf("[-] Unable to create a buffer object. SIZE\n");
        rc = pfs__trace_clCreateBuffer(err);
        goto cleanup;
    }
    _last = clCreateBuffer(h->contexts[0], CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
                           sizeof(unsigned int), &size, &err);
    if (err != CL_SUCCESS) {
        printf("[-] Unable to create a buffer object. LAST\n");
        rc = pfs__trace_clCreateBuffer(err);
        goto cleanup;
    }

    /* load data into the input buffers (blocking writes) */
    err = clEnqueueWriteBuffer(h->commands[0], _size, CL_TRUE, 0,
                               sizeof(unsigned int), &last_size, 0, NULL, NULL);
    if (err != CL_SUCCESS) {
        printf("[-] Unable to enqueue h->commands[0] to write to a buffer object from host memory SIZE.\n");
        rc = pfs__trace_clEnqueueWriteBuffer(err);
        goto cleanup;
    }
    err = clEnqueueWriteBuffer(h->commands[0], _block, CL_TRUE, 0,
                               sizeof(unsigned char) * last_size, input, 0, NULL, NULL);
    if (err != CL_SUCCESS) {
        printf("[-] Unable to enqueue h->commands[0] to write to a buffer object from host memory BLOCK.\n");
        rc = pfs__trace_clEnqueueWriteBuffer(err);
        goto cleanup;
    }
    err = clEnqueueWriteBuffer(h->commands[0], _result, CL_TRUE, 0,
                               sizeof(unsigned int) * 4, state, 0, NULL, NULL);
    if (err != CL_SUCCESS) {
        printf("[-] Unable to enqueue h->commands[0] to write to a buffer object from host memory RESULT.\n");
        rc = pfs__trace_clEnqueueWriteBuffer(err);
        goto cleanup;
    }
    err = clEnqueueWriteBuffer(h->commands[0], _last, CL_TRUE, 0,
                               sizeof(unsigned int), &size, 0, NULL, NULL);
    if (err != CL_SUCCESS) {
        printf("[-] Unable to enqueue h->commands[0] to write to a buffer object from host memory LAST.\n");
        rc = pfs__trace_clEnqueueWriteBuffer(err);
        goto cleanup;
    }
    err = clFlush(h->commands[0]);
    if (err != CL_SUCCESS) {
        printf("[-] Unable to block until all previously queued OpenCL h->commands[0] in a command-queue are issued to the associated device and have completed.\n");
        rc = pfs__trace_clFinish(err);
        goto cleanup;
    }

    /* set the argument list for the kernel command */
    err = clSetKernelArg(h->kernels[0], 0, sizeof(cl_mem), &_block);
    if (err != CL_SUCCESS) {
        printf("[-] Unable to set the argument value for a specific argument of a kernel. BLOCK\n");
        rc = pfs__trace_clSetKernelArg(err);
        goto cleanup;
    }
    err = clSetKernelArg(h->kernels[0], 1, sizeof(cl_mem), &_result);
    if (err != CL_SUCCESS) {
        printf("[-] Unable to set the argument value for a specific argument of a kernel. RESULT\n");
        rc = pfs__trace_clSetKernelArg(err);
        goto cleanup;
    }
    err = clSetKernelArg(h->kernels[0], 2, sizeof(cl_mem), &_size);
    if (err != CL_SUCCESS) {
        printf("[-] Unable to set the argument value for a specific argument of a kernel. SIZE\n");
        rc = pfs__trace_clSetKernelArg(err);
        goto cleanup;
    }
    err = clSetKernelArg(h->kernels[0], 3, sizeof(cl_mem), &_last);
    if (err != CL_SUCCESS) {
        printf("[-] Unable to set the argument value for a specific argument of a kernel. LAST\n");
        rc = pfs__trace_clSetKernelArg(err);
        goto cleanup;
    }

    /* enqueue the kernel command for execution */
    err = clEnqueueNDRangeKernel(h->commands[0], h->kernels[0], 1, NULL, &global, NULL, 0, NULL, NULL);
    if (err != CL_SUCCESS) {
        printf("[-] Unable to enqueue a command to execute a kernel on a device.\n");
        rc = pfs__trace_clEnqueueNDRangeKernel(err);
        goto cleanup;
    }
    kernel_pending = 1;

    /* A zero-length chunk waits for completion; otherwise only submit. */
    if (size == 0)
        err = clFinish(h->commands[0]);
    else
        err = clFlush(h->commands[0]);
    if (err != CL_SUCCESS) {
        printf("[-] Unable to block until all previously queued OpenCL h->commands[0] in a command-queue are issued to the associated device and have completed.\n");
        rc = pfs__trace_clFinish(err);
        goto cleanup;
    }

    /* copy the results from out of the output buffer (blocking read) */
    err = clEnqueueReadBuffer(h->commands[0], _result, CL_TRUE, 0,
                              sizeof(unsigned int) * 4, state, 0, NULL, NULL);
    if (err != CL_SUCCESS) {
        printf("[-] Unable to enqueue h->commands[0] to read from a buffer object to host memory.\n");
        rc = pfs__trace_clEnqueueReadBuffer(err);
        goto cleanup;
    }
    kernel_pending = 0;   /* the blocking read returned, so the kernel is done */
    rc = err;

cleanup:
    /* The buffers alias host memory (including `size`, which lives on this
     * call's stack), so do not return while a kernel may still be using them. */
    if (kernel_pending)
        (void)clFinish(h->commands[0]);

    if (_block != NULL)
        clReleaseMemObject(_block);
    if (_size != NULL)
        clReleaseMemObject(_size);
    if (_last != NULL)
        clReleaseMemObject(_last);
    if (_result != NULL)
        clReleaseMemObject(_result);
    return rc;
}
This is the main application:
(...)
/* Process the files named on the command line from last to first; each file
 * is read in chunks of up to input_size bytes and pushed through pfs_run(). */
for(FileNum = argc; FileNum > 2; FileNum--){
    fp = fopen(argv[FileNum-1],"rb");
    if(fp == NULL)
        return 1;
    /* Initial state words (the MD5 initialisation constants from RFC 1321). */
    state[0] = 0x67452301u;
    state[1] = 0xEFCDAB89u;
    state[2] = 0x98BADCFEu;
    state[3] = 0x10325476u;
    start = clock();
    /* The loop body also runs for the final zero-byte read, so pfs_run() is
     * called once with a size of 0 after the last data chunk. */
    do{
        bytes = fread(input, 1, input_size, fp);
        if( pfs_run(&h, state, input, bytes) ){
            fclose(fp);   /* do not leak the stream on the error path */
            return -1;
        }
        bytesTotal += bytes;
    }while(bytes != 0);
    end = clock();
    /* Reverse the byte order of each state word before printing. */
    for(i=0; i<4; i++)
        tmp[i] = (state[i] >> 24)
               | ((state[i] >> 8 ) & 0x0000FF00)
               | ((state[i] << 8 ) & 0x00FF0000)
               | (state[i] << 24);
    printf("%08x%08x%08x%08x %20s %g s\n",tmp[0],tmp[1],tmp[2],tmp[3], argv[FileNum-1],
           ((double)( end - start )) / CLOCKS_PER_SEC);
    fclose(fp);
(...)
}