Hi, I really need some help with this thing. I would really like some pointers on what to do to make this thing faster. Since the CPU (intel i5 2500K) runs it faster than my radeon 6780 it feels like I haven’t taken advantage if Opencl’s parallelism or something.
I am thinking it might have something to do that my local work size is 1 or that I need to read back my statesbuffer to my original vector with states (position and velocity) and then resend it again into the kernel which might take a long time on the GPU (?)
I’ve taken the code from the book opencl programming guide and modified it heavily
Kernel:
//
// Convolution.cl
//
// This is a simple kernel performing convolution.
__kernel void convolve2(
__global float * states,
int antalAtomer,
float dt,
int l,
float lx,
float ly,
int TMAX)
{
const int x = get_global_id(0);
if(states[x]<0 || states[x]>lx)
{
states[x+antalAtomer*2]=-states[x+antalAtomer*2];
}
if(states[x+antalAtomer]<0 || states[x+antalAtomer]>ly)
{
states[x+antalAtomer*3]=-states[x+antalAtomer*3];
}
if(l==floor(0.1*TMAX) || l==floor(0.2*TMAX) || l==floor(0.25*TMAX) || l==floor(0.35*TMAX) || l==floor(0.50*TMAX)) // kyl ner skiten så den inte buggar loss
{
states[x+antalAtomer*2]=0;
states[x+antalAtomer*3]=0;
}
states[x] += states[x+antalAtomer*2]*dt; // flytta atomerna varannan gång
states[x+antalAtomer] +=states[x+antalAtomer*3]*dt;
// states[x] =0; // flytta atomerna varannan gång
//states[x+antalAtomer] =l;
}//end convolve2
__kernel void convolve(
__global float * states,
int antalAtomer,
float dt,
int l,
float lx,
float ly,
int TMAX)
{
float a=antalAtomer/2;
const int x2 = get_global_id(0);
const int x= x2%antalAtomer;
int k=(x+1+(int)x2/antalAtomer)%antalAtomer; // 0 om man är i "första"
float dx=states[x]-states[k];
float dy=states[antalAtomer+x]-states[antalAtomer+k];
float r=sqrt(pow(dx,2)+pow(dy,2));
float F=12*pow(r,-14)-6*pow(r,-8);
states[x+antalAtomer*2] += F*dx*dt;
states[x+antalAtomer*3] += F*dy*dt;
states[k+antalAtomer*2] -= F*dx*dt;
states[k+antalAtomer*3] -= F*dy*dt;
} // end convolve
AND the code:
#include <iostream>
#include <fstream>
#include <sstream>
#include <string>
#include <istream>
using namespace std;
#ifdef __APPLE__
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif
#if !defined(CL_CALLBACK)
#define CL_CALLBACK
#endif
///
// Function to check and handle OpenCL errors
inline void
checkErr(cl_int err, const char * name)
{
if (err != CL_SUCCESS) {
std::cerr << "ERROR: " << name << " (" << err << ")" << std::endl;
exit(EXIT_FAILURE);
}
}
void CL_CALLBACK contextCallback(
const char * errInfo,
const void * private_info,
size_t cb,
void * user_data)
{
std::cout << "Error occured during context use: " << errInfo << std::endl;
// should really perform any clearup and so on at this point
// but for simplicitly just exit.
exit(1);
}
///
// main() for Convoloution example
//
int main(int argc, char** argv)
{
const cl_uint antalAtomer=10000;
float lx=50;
float ly=50;
//string mystr ("1204");
//cout<<"ange antal atomer, (troligtvis helst jämnt antal) och kanske en multipel av något i GPU/CPU):"<<endl;
//cin>> antalAtomer;
//cout<<"ange lx: "<< endl;
//cin >>lx;
//cout<<"ange ly: "<< endl;
//cin >> ly;
//cout<<" ange filnamnet för datafilen: "<< endl;
//cin>> mystr;
ofstream statesText("states.txt");
float states[antalAtomer*4]; //hela stateslistan
cl_int errNum;
cl_uint numPlatforms;
cl_uint numDevices;
cl_platform_id * platformIDs;
cl_device_id * deviceIDs;
cl_context context = NULL;
cl_command_queue queue;
cl_command_queue queue2;
cl_program program;
cl_kernel kernel;
cl_kernel kernel2;
cl_mem statesBuffer; // buffer för tillstånden..?
cl_uint TMAX=50;
int avskal=1;
float dt=0.001;
float a=antalAtomer/2;
cout<< "dt= : "<<dt<<endl;
statesText<<antalAtomer<<" "<<lx<<" "<<ly<<" "<<TMAX<<" "<<dt<<" ";
for ( int l=0;l<2*antalAtomer-5;l++)
{
statesText<<1<<" ";
}
statesText<<"
";
for(cl_uint i=0; i<antalAtomer; i++) //gör en initialposition
{
int atomRot=(int)ceil(sqrt(antalAtomer));
//states[i]=0; //x-position
//states[i+antalAtomer]=0; //y-position
states[i]=(float)lx/(atomRot-1)*(i-atomRot*((int)i/(atomRot))); //x-position
states[i+antalAtomer]=(float)ly/(atomRot-1)*((int)(i/(atomRot))); //y-position
states[i+antalAtomer*2]=0; // 0 i Vx
states[i+antalAtomer*3]=0; // 0 i Vy
}
for(int i=0; i<antalAtomer; i++)
{
statesText<<states[i]<< " " << states[i+antalAtomer]<< " ";
}
statesText<<endl;
// First, select an OpenCL platform to run on.
errNum = clGetPlatformIDs(0, NULL, &numPlatforms);
checkErr(
(errNum != CL_SUCCESS) ? errNum : (numPlatforms <= 0 ? -1 : CL_SUCCESS),
"clGetPlatformIDs");
platformIDs = (cl_platform_id *)alloca(
sizeof(cl_platform_id) * numPlatforms);
errNum = clGetPlatformIDs(numPlatforms, platformIDs, NULL);
checkErr(
(errNum != CL_SUCCESS) ? errNum : (numPlatforms <= 0 ? -1 : CL_SUCCESS),
"clGetPlatformIDs");
// Iterate through the list of platforms until we find one that supports
// a CPU device, otherwise fail with an error.
deviceIDs = NULL;
cl_uint i;
for (i = 0; i < numPlatforms; i++)
{
errNum = clGetDeviceIDs(
platformIDs[i],
CL_DEVICE_TYPE_GPU,
0,
NULL,
&numDevices);
if (errNum != CL_SUCCESS && errNum != CL_DEVICE_NOT_FOUND)
{
checkErr(errNum, "clGetDeviceIDs");
}
else if (numDevices > 0)
{
deviceIDs = (cl_device_id *)alloca(sizeof(cl_device_id) * numDevices);
errNum = clGetDeviceIDs(
platformIDs[i],
CL_DEVICE_TYPE_GPU,
numDevices,
&deviceIDs[0],
NULL);
checkErr(errNum, "clGetDeviceIDs");
break;
}
}
// Check to see if we found at least one CPU device, otherwise return
if (deviceIDs == NULL) {
std::cout << "No CPU device found" << std::endl;
exit(-1);
}
// Next, create an OpenCL context on the selected platform.
cl_context_properties contextProperties[] =
{
CL_CONTEXT_PLATFORM,
(cl_context_properties)platformIDs[i],
0
};
context = clCreateContext(
contextProperties,
numDevices,
deviceIDs,
&contextCallback,
NULL,
&errNum);
checkErr(errNum, "clCreateContext");
std::ifstream srcFile("Convolution.cl");
checkErr(srcFile.is_open() ? CL_SUCCESS : -1, "reading Convolution.cl");
std::string srcProg(
std::istreambuf_iterator<char>(srcFile),
(std::istreambuf_iterator<char>()));
const char * src = srcProg.c_str();
size_t length = srcProg.length();
// Create program from source
program = clCreateProgramWithSource(
context,
1,
&src,
&length,
&errNum);
checkErr(errNum, "clCreateProgramWithSource");
// Build program
errNum = clBuildProgram(
program,
numDevices,
deviceIDs,
NULL,
NULL,
NULL);
if (errNum != CL_SUCCESS)
{
// Determine the reason for the error
char buildLog[16384];
clGetProgramBuildInfo(
program,
deviceIDs[0],
CL_PROGRAM_BUILD_LOG,
sizeof(buildLog),
buildLog,
NULL);
std::cerr << "Error in kernel: " << std::endl;
std::cerr << buildLog;
checkErr(errNum, "clBuildProgram");
}
// Create kernel object
kernel = clCreateKernel(
program,
"convolve",
&errNum);
checkErr(errNum, "clCreateKernel");
kernel2 = clCreateKernel(
program,
"convolve2",
&errNum);
checkErr(errNum, "clCreateKernel");
//BUFFER FÖR TILLSTÅNDEN **************** :D :D :D :PPPPPPPPPPPPPPppPpPPp
statesBuffer = clCreateBuffer(
context,
CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
sizeof(float) * antalAtomer * 4,
static_cast<void *>(states),
&errNum);
checkErr(errNum, "clCreateBuffer(statesBuffer)");
// Pick the first device and create command queue.
queue = clCreateCommandQueue(
context,
deviceIDs[0],
0,
&errNum);
checkErr(errNum, "clCreateCommandQueue");
for(int l=0; l<TMAX; l++) // MAiN LO0000000000000000000000000000000000000000OP
{
errNum = clSetKernelArg(kernel, 0, sizeof(cl_mem), &statesBuffer); //se
errNum |= clSetKernelArg(kernel, 1, sizeof(cl_uint), &antalAtomer); // skicka in antalatomer
errNum |= clSetKernelArg(kernel, 2, sizeof(float), &dt);
errNum |= clSetKernelArg(kernel, 3, sizeof(int), &l);
errNum |= clSetKernelArg(kernel, 4, sizeof(float), &lx);
errNum |= clSetKernelArg(kernel, 5, sizeof(float), &ly);
errNum |= clSetKernelArg(kernel, 6, sizeof(cl_uint), &TMAX);
checkErr(errNum, "clSetKernelArg");
const size_t globalWorkSize[1] = { antalAtomer*(antalAtomer/2)};
const size_t localWorkSize[1] = { 1 };
cl_event event;
// Queue the kernel up for execution across the array
errNum = clEnqueueNDRangeKernel(
queue,
kernel,
1,
NULL,
globalWorkSize,
localWorkSize,
0,
NULL,
&event);
//clWaitForEvents(1, &event); //väntar så man inte börjar med annan skit innan detta är klart
checkErr(errNum, "clEnqueueNDRangeKernel");
errNum = clEnqueueReadBuffer(
queue,
statesBuffer,
CL_TRUE,
0,
sizeof(float) * 4 * antalAtomer,
states,
0,
NULL,
NULL);
checkErr(errNum, "clEnqueueReadBuffer");
//FÖRSTAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
// FÖRSTA GREJEN TYP RÄKNA KRAFT ******************************************DDDDDDDDDDDDSSSS
errNum = clSetKernelArg(kernel2, 0, sizeof(cl_mem), &statesBuffer); //se
errNum |= clSetKernelArg(kernel2, 1, sizeof(cl_uint), &antalAtomer); // skicka in antalatomer
errNum |= clSetKernelArg(kernel2, 2, sizeof(float), &dt);
errNum |= clSetKernelArg(kernel2, 3, sizeof(int), &l);
errNum |= clSetKernelArg(kernel2, 4, sizeof(float), &lx);
errNum |= clSetKernelArg(kernel2, 5, sizeof(float), &ly);
errNum |= clSetKernelArg(kernel2, 6, sizeof(cl_uint), &TMAX);
checkErr(errNum, "clSetKernelArg");
const size_t globalWorkSize2[1] = { antalAtomer};
const size_t localWorkSize2[1] = { 1 };
// Queue the kernel up for execution across the array
errNum = clEnqueueNDRangeKernel(
queue,
kernel2,
1,
NULL,
globalWorkSize2,
localWorkSize2,
0,
NULL,
NULL);
checkErr(errNum, "clEnqueueNDRangeKernel");
errNum = clEnqueueReadBuffer(
queue,
statesBuffer,
CL_TRUE,
0,
sizeof(float) * 4 * antalAtomer,
states,
0,
NULL,
NULL);
checkErr(errNum, "clEnqueueReadBuffer");
if(l%avskal==0)
{
for(int k=0; k<antalAtomer; k++)
{
statesText<<states[k]<< " " << states[k+antalAtomer]<<" ";
}
statesText<<"
";
cout<<" klarhet "<< (double) 100*l/TMAX<<"
";
}
}
std::cout << std::endl << "Executed program succesfully." << std::endl;
return 0;
}