Slow MD program: my CPU runs it faster than my GPU

Hi, I really need some help with this. I would really like some pointers on how to make this program faster. Since the CPU (Intel i5 2500K) runs it faster than my Radeon 6780, it feels like I haven't taken advantage of OpenCL's parallelism or something.

I am thinking it might have something to do with my local work size being 1, or with the fact that I read the states buffer back into my original vector of states (position and velocity) and then resend it to the kernel every iteration, which might take a long time on the GPU (?)

I've taken the code from the book OpenCL Programming Guide and modified it heavily :)
Kernel:

//
// Convolution.cl
//
//    Originally a simple convolution kernel from the book,
//    now heavily modified to do the MD time steps.

__kernel void convolve2(
	__global float * states,
	int antalAtomer,
	float dt,
	int l,
	float lx,
	float ly,
	int TMAX)
{
	const int x = get_global_id(0);

	// Bounce the atom off the walls by reflecting its velocity.
	if(states[x]<0 || states[x]>lx)
	{
		states[x+antalAtomer*2]=-states[x+antalAtomer*2];
	}
	if(states[x+antalAtomer]<0 || states[x+antalAtomer]>ly)
	{
		states[x+antalAtomer*3]=-states[x+antalAtomer*3];
	}

	// Cool the system down at a few fixed time steps so it doesn't blow up.
	if(l==floor(0.1f*TMAX) || l==floor(0.2f*TMAX) || l==floor(0.25f*TMAX) || l==floor(0.35f*TMAX) || l==floor(0.50f*TMAX))
	{
		states[x+antalAtomer*2]=0;
		states[x+antalAtomer*3]=0;
	}

	// Move the atoms.
	states[x] += states[x+antalAtomer*2]*dt;
	states[x+antalAtomer] += states[x+antalAtomer*3]*dt;

} // end convolve2



__kernel void convolve(
	__global float * states,
	int antalAtomer,
	float dt,
	int l,
	float lx,
	float ly,
	int TMAX)
{
	const int x2 = get_global_id(0);
	const int x = x2 % antalAtomer;

	// Pick the pair partner: 0 if you are in the "first" block.
	int k = (x + 1 + (int)(x2/antalAtomer)) % antalAtomer;

	float dx = states[x] - states[k];
	float dy = states[antalAtomer+x] - states[antalAtomer+k];

	float r = sqrt(dx*dx + dy*dy);

	// Lennard-Jones-style pair force.
	float F = 12*pow(r, -14.0f) - 6*pow(r, -8.0f);

	states[x+antalAtomer*2] += F*dx*dt;
	states[x+antalAtomer*3] += F*dy*dt;

	states[k+antalAtomer*2] -= F*dx*dt;
	states[k+antalAtomer*3] -= F*dy*dt;

} // end convolve

And the host code:



#include <iostream>
#include <fstream>
#include <sstream>
#include <string>
#include <istream>
#include <cmath>   // ceil, sqrt
#include <cstdlib> // exit
using namespace std;

#ifdef __APPLE__
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif

#if !defined(CL_CALLBACK)
#define CL_CALLBACK
#endif


///
// Function to check and handle OpenCL errors
inline void 
checkErr(cl_int err, const char * name)
{
    if (err != CL_SUCCESS) {
        std::cerr << "ERROR: " <<  name << " (" << err << ")" << std::endl;
        exit(EXIT_FAILURE);
    }
}

void CL_CALLBACK contextCallback(
	const char * errInfo,
	const void * private_info,
	size_t cb,
	void * user_data)
{
	std::cout << "Error occured during context use: " << errInfo << std::endl;
	// should really perform any clearup and so on at this point
	// but for simplicitly just exit.
	exit(1);
}

///
//	main() for Convolution example
//
int main(int argc, char** argv)
{
	

const cl_uint antalAtomer=10000;
 float lx=50;
 float ly=50;
//string mystr ("1204");
//cout<<"enter the number of atoms (probably best as an even number, and maybe a multiple of something in the GPU/CPU):"<<endl;
//cin>> antalAtomer;
//cout<<"enter lx: "<< endl;
//cin >>lx;
//cout<<"enter ly: "<< endl;
//cin >> ly;
//cout<<"enter the file name for the data file: "<< endl;
//cin>> mystr;

	ofstream statesText("states.txt");
float states[antalAtomer*4];  // the whole states list: [0,N) = x, [N,2N) = y, [2N,3N) = vx, [3N,4N) = vy



    cl_int errNum;
    cl_uint numPlatforms;
	cl_uint numDevices;
    cl_platform_id * platformIDs;
	cl_device_id * deviceIDs;
    cl_context context = NULL;
	cl_command_queue queue;
	cl_command_queue queue2;
	cl_program program;
	cl_kernel kernel;
	cl_kernel kernel2;

	cl_mem statesBuffer;  // buffer for the states

	cl_uint TMAX=50;
	int avskal=1;  // write output every avskal-th time step
	float dt=0.001;

	float a=antalAtomer/2;
	cout<< "dt = "<<dt<<endl;


		statesText<<antalAtomer<<" "<<lx<<" "<<ly<<" "<<TMAX<<" "<<dt<<" ";
 for ( int l=0;l<2*antalAtomer-5;l++)
    {
      statesText<<1<<" ";
    }
 statesText<<"
";
	
  for(cl_uint i=0; i<antalAtomer; i++)  // set up an initial position
    {

      int atomRot=(int)ceil(sqrt((float)antalAtomer));

	  // Place the atoms on a roughly atomRot x atomRot grid spanning [0,lx] x [0,ly].
	  states[i]=(float)lx/(atomRot-1)*(i-atomRot*((int)i/(atomRot)));               // x position
      states[i+antalAtomer]=(float)ly/(atomRot-1)*((int)(i/(atomRot)));   // y position

	  states[i+antalAtomer*2]=0; // 0 in vx
	  states[i+antalAtomer*3]=0; // 0 in vy

    }
  

	   for(int i=0; i<antalAtomer; i++)
    {
      statesText<<states[i]<< " " << states[i+antalAtomer]<< " ";
    }

	statesText<<endl;



      // First, select an OpenCL platform to run on.  
	errNum = clGetPlatformIDs(0, NULL, &numPlatforms);
	checkErr( 
		(errNum != CL_SUCCESS) ? errNum : (numPlatforms <= 0 ? -1 : CL_SUCCESS), 
		"clGetPlatformIDs"); 
 
	platformIDs = (cl_platform_id *)alloca(
       		sizeof(cl_platform_id) * numPlatforms);

    errNum = clGetPlatformIDs(numPlatforms, platformIDs, NULL);
    checkErr( 
	   (errNum != CL_SUCCESS) ? errNum : (numPlatforms <= 0 ? -1 : CL_SUCCESS), 
	   "clGetPlatformIDs");

	// Iterate through the list of platforms until we find one that supports
	// a GPU device, otherwise fail with an error.
	deviceIDs = NULL;
	cl_uint i;
	for (i = 0; i < numPlatforms; i++)
	{
		errNum = clGetDeviceIDs(
            platformIDs[i], 
            CL_DEVICE_TYPE_GPU, 
            0,
            NULL,
            &numDevices);
		if (errNum != CL_SUCCESS && errNum != CL_DEVICE_NOT_FOUND)
	    {
			checkErr(errNum, "clGetDeviceIDs");
        }
	    else if (numDevices > 0) 
		{
		   	deviceIDs = (cl_device_id *)alloca(sizeof(cl_device_id) * numDevices);
			errNum = clGetDeviceIDs(
				platformIDs[i],
				CL_DEVICE_TYPE_GPU,
				numDevices, 
				&deviceIDs[0], 
				NULL);
			checkErr(errNum, "clGetDeviceIDs");
			break;
	   }
	}

	// Check to see if we found at least one GPU device, otherwise return
	if (deviceIDs == NULL) {
		std::cout << "No GPU device found" << std::endl;
		exit(-1);
	}

    // Next, create an OpenCL context on the selected platform.  
    cl_context_properties contextProperties[] =
    {
        CL_CONTEXT_PLATFORM,
        (cl_context_properties)platformIDs[i],
        0
    };
    context = clCreateContext(
		contextProperties, 
		numDevices,
        deviceIDs, 
		&contextCallback,
		NULL, 
		&errNum);
	checkErr(errNum, "clCreateContext");

	std::ifstream srcFile("Convolution.cl");
    checkErr(srcFile.is_open() ? CL_SUCCESS : -1, "reading Convolution.cl");

	std::string srcProg(
        std::istreambuf_iterator<char>(srcFile),
        (std::istreambuf_iterator<char>()));

	const char * src = srcProg.c_str();
	size_t length = srcProg.length();

	// Create program from source
	program = clCreateProgramWithSource(
		context, 
		1, 
		&src, 
		&length, 
		&errNum);
	checkErr(errNum, "clCreateProgramWithSource");

	// Build program
	errNum = clBuildProgram(
		program,
		numDevices,
		deviceIDs,
		NULL,
		NULL,
		NULL);
    if (errNum != CL_SUCCESS)
    {
        // Determine the reason for the error
        char buildLog[16384];
        clGetProgramBuildInfo(
			program, 
			deviceIDs[0], 
			CL_PROGRAM_BUILD_LOG,
            sizeof(buildLog), 
			buildLog, 
			NULL);

        std::cerr << "Error in kernel: " << std::endl;
        std::cerr << buildLog;
		checkErr(errNum, "clBuildProgram");
    }
	// Create kernel object
	kernel = clCreateKernel(
		program,
		"convolve",
		&errNum);
	checkErr(errNum, "clCreateKernel");


		kernel2 = clCreateKernel(
		program,
		"convolve2",
		&errNum);
	checkErr(errNum, "clCreateKernel");



	// Buffer for the states
		statesBuffer = clCreateBuffer(
		context,
		CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
		sizeof(float) * antalAtomer * 4,
		static_cast<void *>(states),
		&errNum);
	checkErr(errNum, "clCreateBuffer(statesBuffer)");



	// Pick the first device and create command queue.
	queue = clCreateCommandQueue(
		context,
		deviceIDs[0],
		0,
		&errNum);
	checkErr(errNum, "clCreateCommandQueue");



	for(int l=0; l<TMAX; l++) // main loop
	{

	// First step: compute the forces (convolve).
    errNum  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &statesBuffer);
	errNum |= clSetKernelArg(kernel, 1, sizeof(cl_uint), &antalAtomer); // pass in the number of atoms
	errNum |= clSetKernelArg(kernel, 2, sizeof(float), &dt);
	errNum |= clSetKernelArg(kernel, 3, sizeof(int), &l);
	errNum |= clSetKernelArg(kernel, 4, sizeof(float), &lx);
	errNum |= clSetKernelArg(kernel, 5, sizeof(float), &ly);
	errNum |= clSetKernelArg(kernel, 6, sizeof(cl_uint), &TMAX);

	checkErr(errNum, "clSetKernelArg");

	const size_t globalWorkSize[1] = { antalAtomer*(antalAtomer/2)};
    const size_t localWorkSize[1]  = { 1 };
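	// NOTE: with antalAtomer = 10000 this enqueues 50,000,000 work-items
	// (one per pair interaction), and a local work size of 1 puts every
	// work-item in its own work-group.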
		
	cl_event event;

    // Queue the kernel up for execution across the array
    errNum = clEnqueueNDRangeKernel(
		queue, 
		kernel, 
		1, 
		NULL,
        globalWorkSize, 
		localWorkSize,
        0, 
		NULL, 
		&event);
	
	//clWaitForEvents(1, &event);  // wait so nothing else starts before this is done

		
	checkErr(errNum, "clEnqueueNDRangeKernel");
    

	errNum = clEnqueueReadBuffer(
		queue, 
		statesBuffer, 
		CL_TRUE,
        0, 
		sizeof(float) * 4 * antalAtomer, 
		states,
        0, 
		NULL, 
		NULL);
	checkErr(errNum, "clEnqueueReadBuffer");




	// Second step: move the atoms (convolve2).




	    errNum  = clSetKernelArg(kernel2, 0, sizeof(cl_mem), &statesBuffer);
	errNum |= clSetKernelArg(kernel2, 1, sizeof(cl_uint), &antalAtomer); // pass in the number of atoms
	errNum |= clSetKernelArg(kernel2, 2, sizeof(float), &dt);
	errNum |= clSetKernelArg(kernel2, 3, sizeof(int), &l);
	errNum |= clSetKernelArg(kernel2, 4, sizeof(float), &lx);
	errNum |= clSetKernelArg(kernel2, 5, sizeof(float), &ly);
	errNum |= clSetKernelArg(kernel2, 6, sizeof(cl_uint), &TMAX);


	checkErr(errNum, "clSetKernelArg");

	const size_t globalWorkSize2[1] = { antalAtomer};
    const size_t localWorkSize2[1]  = { 1 };
		

    // Queue the kernel up for execution across the array
    errNum = clEnqueueNDRangeKernel(
		queue, 
		kernel2, 
		1, 
		NULL,
        globalWorkSize2, 
		localWorkSize2,
        0, 
		NULL, 
		NULL);
	checkErr(errNum, "clEnqueueNDRangeKernel");
    
	errNum = clEnqueueReadBuffer(
		queue, 
		statesBuffer, 
		CL_TRUE,
        0, 
		sizeof(float) * 4 * antalAtomer, 
		states,
        0, 
		NULL, 
		NULL);
	checkErr(errNum, "clEnqueueReadBuffer");





	 if(l%avskal==0)
	{
     
      for(int k=0; k<antalAtomer; k++)
	{
	  statesText<<states[k]<< " " << states[k+antalAtomer]<<" ";
	 
	}
      statesText<<"
";
      cout<<" klarhet "<< (double) 100*l/TMAX<<"
";
	}


	}
    std::cout << std::endl << "Executed program successfully." << std::endl;


	 

	return 0;
}
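
One thing stands out in the listing above: there is a blocking clEnqueueReadBuffer after every kernel launch, so every iteration pays for a full device-to-host copy even though the data is only written to file every avskal-th step. Since the buffer stays on the device between launches anyway, here is a minimal sketch of the loop with the read-back moved inside the logging branch (my own restructuring, not the book's code):

	for(int l=0; l<TMAX; l++)
	{
		// Only argument 3 (the time step index l) changes between
		// iterations; the other arguments could be set once before the loop.
		errNum  = clSetKernelArg(kernel,  3, sizeof(int), &l);
		errNum |= clSetKernelArg(kernel2, 3, sizeof(int), &l);
		checkErr(errNum, "clSetKernelArg");

		// Passing NULL as the local work size lets the runtime pick one.
		errNum = clEnqueueNDRangeKernel(queue, kernel, 1, NULL,
			globalWorkSize, NULL, 0, NULL, NULL);
		checkErr(errNum, "clEnqueueNDRangeKernel");
		errNum = clEnqueueNDRangeKernel(queue, kernel2, 1, NULL,
			globalWorkSize2, NULL, 0, NULL, NULL);
		checkErr(errNum, "clEnqueueNDRangeKernel");

		if(l%avskal==0)
		{
			// The default command queue is in-order, so this blocking read
			// cannot start until both kernels above have finished.
			errNum = clEnqueueReadBuffer(queue, statesBuffer, CL_TRUE, 0,
				sizeof(float) * 4 * antalAtomer, states, 0, NULL, NULL);
			checkErr(errNum, "clEnqueueReadBuffer");
			// ... write states to statesText as before ...
		}
	}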

 

When I change the local work size from 1 to a higher number, e.g. 64, my program finishes about 20 times faster, but it no longer calculates correctly (it diverges, and it seems that forces are not calculated over all the atoms).

This confuses me greatly, since the number of work-items is the same (right?) and get_global_id(0) should have the same range (right?), so the result should not change when going from a local work size of 1 to 64.

Also, when I run it on the CPU only, I get the right result regardless of the local work size…

(btw, how do you edit your posts?)

Since the program gives erroneous results on the GPU only, it should be something wrong with the concurrency. I thought that maybe OpenCL started moving atoms before I had finished calculating their change in velocity in the first kernel.
(I have two kernels, one that calculates the change in velocity and one that moves the atoms, and I loop through them many times.)

I figured I had to add events between clEnqueueNDRangeKernel(kernel, …) -> clEnqueueReadBuffer, clEnqueueNDRangeKernel(kernel2, …) -> clEnqueueReadBuffer (and then repeat).
However, this doesn't work either. What COULD be wrong with this?

I was wondering, since all of this is inside the loop: will the events from the first iteration still be listed as successful in later iterations, so that the waits effectively only do something in the first loop? This is a part of my code:

	for(int l=0; l<TMAX; l++) // Main loop
	{

    errNum  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &statesBuffer);
      .
      .
      .
	errNum |= clSetKernelArg(kernel, 7, sizeof(float), &a);

	checkErr(errNum, "clSetKernelArg");

	const size_t globalWorkSize[1] = { antalAtomer*(antalAtomer/2)};
    const size_t localWorkSize[1]  = { ls };
		
	cl_event event;
	
    // Queue the kernel up for execution across the array
    errNum = clEnqueueNDRangeKernel(
		queue, 
		kernel, 
		1, 
		NULL,
        globalWorkSize, 
		localWorkSize,
        0, 
		NULL, 
		&event);
	
	clWaitForEvents(1, &event);  //Waits so you finish up this part first

		
	checkErr(errNum, "clEnqueueNDRangeKernel");
    
	cl_event eventRead; // event marking that the buffer has been read back into states, so it can be sent on to kernel2

	errNum = clEnqueueReadBuffer(
		queue, 
		statesBuffer, 
		CL_TRUE,
        0, 
		sizeof(float) * 4 * antalAtomer, 
		states,
        1, 
		&event, 
		&eventRead);
	checkErr(errNum, "clEnqueueReadBuffer");




	//now begin with kernel2, moving the atoms




	    errNum  = clSetKernelArg(kernel2, 0, sizeof(cl_mem), &statesBuffer);
	.
        .
         .
	errNum |= clSetKernelArg(kernel2, 6, sizeof(cl_uint), &TMAX);


	checkErr(errNum, "clSetKernelArg");

	const size_t globalWorkSize2[1] = { antalAtomer};
    const size_t localWorkSize2[1]  = { ls };
		
	cl_event event2;

    // Queue the kernel up for execution across the array
    errNum = clEnqueueNDRangeKernel(
		queue, 
		kernel2, 
		1, 
		NULL,
        globalWorkSize2, 
		localWorkSize2,
        1, 
		&eventRead,   //eventread from buffer read (maybe a bit excessive)
		&event2);
	clWaitForEvents(1, &event2);  //Waits so you finish up
	checkErr(errNum, "clEnqueueNDRangeKernel");
   
	cl_event eventRead2; 
	errNum = clEnqueueReadBuffer(
		queue, 
		statesBuffer, 
		CL_TRUE,
        0, 
		sizeof(float) * 4 * antalAtomer, 
		states,
        1, 
		&event2,  // doesn't start reading the buffer before kernel2 is finished
		&eventRead2); 

	clWaitForEvents(1, &eventRead2);
	checkErr(errNum, "clEnqueueReadBuffer"); 





	 if(l%avskal==0)
	{
     
      for(int k=0; k<antalAtomer; k++)
	{
	  statesText<<states[k]<< " " << states[k+antalAtomer]<<" "; // prints the coordinates to a file
	 
	}
      statesText<<"
";
      cout<<" klarhet "<< (double) 100*l/TMAX<<"
";  // gives the perentage finished
	}


	}
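
Two things worth noting about the event version above (my reading, offered as a sketch): with a single in-order command queue, commands already execute in the order they are enqueued, so the wait lists and clWaitForEvents calls don't add any ordering you don't already have; and the events created in every iteration are never released, so they accumulate over all TMAX iterations. The cleanup at the end of each iteration would look something like this:

	// Release the per-iteration events so the runtime can reclaim them.
	clReleaseEvent(event);
	clReleaseEvent(eventRead);
	clReleaseEvent(event2);
	clReleaseEvent(eventRead2);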

I actually think I might have figured out the problem, but I am not sure how to solve it. The thing is that in one of my kernels you have the following when adding to the velocities:


	states[x+antalAtomer*2] += F*dx*dt;
	states[x+antalAtomer*3] += F*dy*dt;

	states[k+antalAtomer*2] -= F*dx*dt;
	states[k+antalAtomer*3] -= F*dy*dt;

Since the same positions in the global vector "states" can be read/written by another thread in another work-item, you clearly have a problem. I am not sure how to work this out, though. Atomic functions seemed to be the solution, but it appears they only work on integers.
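
One standard way out of this race is to restructure the force kernel so that each work-item owns exactly one atom and only ever writes its own velocity slots, accumulating all the partner contributions in a private loop. This computes every pair twice, but no two work-items ever write the same index. A sketch in the spirit of the convolve kernel above (my own untested rewrite; the name forcesNoRace is made up):

	__kernel void forcesNoRace(
		__global float * states,
		int antalAtomer,
		float dt)
	{
		int x = get_global_id(0);
		if (x >= antalAtomer) return;  // guard in case the global size is padded

		float fx = 0.0f;
		float fy = 0.0f;
		for (int k = 0; k < antalAtomer; k++)
		{
			if (k == x) continue;
			float dx = states[x] - states[k];
			float dy = states[antalAtomer + x] - states[antalAtomer + k];
			float r  = sqrt(dx*dx + dy*dy);
			float F  = 12*pow(r, -14.0f) - 6*pow(r, -8.0f);
			fx += F*dx;
			fy += F*dy;
		}
		// Only work-item x ever writes the slots for atom x: no race.
		states[x + antalAtomer*2] += fx*dt;
		states[x + antalAtomer*3] += fy*dt;
	}

This would also shrink the launch from antalAtomer*(antalAtomer/2) work-items to antalAtomer, each doing an inner loop instead.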

Using barriers like this in the original kernel did not work either (I suspect because barrier() only synchronizes work-items within the same work-group):

	
	barrier(CLK_GLOBAL_MEM_FENCE);
	states[x+antalAtomer*2] += F*dx*dt;
	barrier(CLK_GLOBAL_MEM_FENCE);
	states[x+antalAtomer*3] += F*dy*dt;
	barrier(CLK_GLOBAL_MEM_FENCE);
	states[k+antalAtomer*2] -= F*dx*dt;
	barrier(CLK_GLOBAL_MEM_FENCE);
	states[k+antalAtomer*3] -= F*dy*dt;
	barrier(CLK_GLOBAL_MEM_FENCE);

On AMD hardware, the local work size should be a multiple of 64; otherwise you're leaving at least (64 - N)/64 * 100 % of your total ALU capacity idle (with a local size of 1, roughly 98 %).
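
Concretely, the usual host-side pattern (a sketch using the variable names from the code above) is to round the global size up to a multiple of the local size, and guard against the padding inside the kernel with something like if (get_global_id(0) >= antalAtomer) return;:

	const size_t localSize  = 64;  // a multiple of the AMD wavefront size
	const size_t globalSize = ((antalAtomer + localSize - 1) / localSize) * localSize;  // round up

	errNum = clEnqueueNDRangeKernel(queue, kernel2, 1, NULL,
		&globalSize, &localSize, 0, NULL, NULL);
	checkErr(errNum, "clEnqueueNDRangeKernel");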

Atomics should only be used in limited cases, like enqueuing results; they're too slow for per-work-item, per-inner-loop calculations, and by definition they always will be (they remove the very parallelism you're trying to add).
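
For reference, since float atomics came up above: on OpenCL 1.x you can emulate a float atomic add with the 32-bit integer atomic_cmpxchg by reinterpreting the bits. A sketch (my own helper, and slow for exactly the reasons stated above):

	inline void atomic_add_float(volatile __global float *addr, float val)
	{
		union { unsigned int u; float f; } prev, next;
		do {
			prev.f = *addr;         // snapshot the current value
			next.f = prev.f + val;  // the value we want to store
			// retry if another work-item changed *addr in the meantime
		} while (atomic_cmpxchg((volatile __global unsigned int *)addr,
		                        prev.u, next.u) != prev.u);
	}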

TBH I can’t really follow your code, but if you have an algorithm whose items are interdependent (i.e. the output of a previous result feeds into a later one), then parallelising it can range from simple to very complex.

For any CPU implementation I've seen, the local work-group executes as a single thread, and multiple local work-items are implemented using loops (and/or SIMD channels), so any ordering problems between work-items will be hidden. A GPU, on the other hand, will execute many work-items concurrently within the work-group.