CL_OUT_OF_RESOURCES when enqueuing same kernel

Hi, I’m getting a CL_OUT_OF_RESOURCES error. I’ll post two implementations; they are practically identical.
The first one runs OK: in a loop, I call a function that runs my kernel with different arguments.


float Parallel::step(){
	size_t i=0,max=0;
	float diff=0;
	
        //PAY ATTENTION HERE AND TO IMPLEMANTION OF stepGRoup()
	for(i=0;i<groups.size();++i){
		stepGroup(i);
	}

	try{
	queue.enqueueBarrier();
	//Read x buffer back
	queue.enqueueReadBuffer(d_x, CL_TRUE, 0, sizeof(float)*rows, x);
	
	} catch ( cl::Error& err ) {
		std::cerr << "Caught exception: " << err.what() << '(' << err.err() << ')' << std::endl;
	}
}

void Parallel::stepGroup(size_t grp_idx){
	size_t group_rows=groups[grp_idx].size();
	size_t rows_offset=0;

	for(size_t i=0;i<grp_idx;++i)
		rows_offset+=groups[i].size();


	try{
 
	size_t local_size=512;
	size_t global_size;
	if(group_rows%local_size)
		global_size=(group_rows/local_size+1)*local_size;
	else
		global_size=group_rows/local_size;
	
	//Kernel setargs
	kernel.setArg(0,d_values);	
	kernel.setArg(1,d_col_idx);	
	kernel.setArg(2,d_row_index);	
	kernel.setArg(3,d_b);	
	kernel.setArg(4,d_x);	
	kernel.setArg(5,rows_offset);	
	kernel.setArg(6,group_rows);	
	kernel.setArg(7,max_nz);	
	kernel.setArg(8,rows);	
	
	
	//Enque the kernel now
	//std::cerr<<"global_size: "<<global_size<<" local_size: "<<local_size<<std::endl;

	queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(global_size),cl::NDRange(local_size));
	
	} catch ( cl::Error& err ) {
		std::cerr << "Caught exception: " << err.what() << '(' << err.err() << ')' << std::endl;
	}	

}

In the second one, instead of looping over each group and calling stepGroup(), I call a function named stepAllGroups() that contains the loop over the groups:


float Parallel::step(){
	size_t i=0,max=0;
	float diff=0;
	
        //PAY ATTENTION HERE AND TO IMPLEMANTION OF stepAllGroups()
	stepAllGroups();

	try{
	queue.enqueueBarrier();
	//Read x buffer back
	queue.enqueueReadBuffer(d_x, CL_TRUE, 0, sizeof(float)*rows, x);
	
	} catch ( cl::Error& err ) {
		std::cerr << "Caught exception: " << err.what() << '(' << err.err() << ')' << std::endl;
	}
}

void Parallel::stepAllGroups(){
	size_t group_rows;
	size_t rows_offset=0;

	size_t local_size=512;
	size_t global_size;
	
	queue.enqueueBarrier();
	for(size_t i=0;i<groups.size();++i){
		group_rows=groups[i].size();
		rows_offset+=group_rows;

		std::cerr<<"Calling kernel on group "<<i<<" with group_rows "<<group_rows<<std::endl;
	
		try{
	
			if(groups[i].size()%local_size)
				global_size=(group_rows/local_size+1)*local_size;
			else
				global_size=group_rows/local_size;

			kernel.setArg(0,d_values);	
			kernel.setArg(1,d_col_idx);	
			kernel.setArg(2,d_row_index);	
			kernel.setArg(3,d_b);	
			kernel.setArg(4,d_x);	
			kernel.setArg(5,rows_offset);	
			kernel.setArg(6,group_rows);	
			kernel.setArg(7,max_nz);	
			kernel.setArg(8,rows);	
			
			queue.enqueueBarrier();
			queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(global_size),cl::NDRange(local_size));
		} catch ( cl::Error& err ) {
			std::cerr << "Caught exception: " << err.what() << '(' << err.err() << ')' << std::endl;
		}	

	}
	
	queue.enqueueBarrier();

}

In the second implementation, I’m getting CL_OUT_OF_RESOURCES when calling enqueueReadBuffer().

By the way, I added the enqueueBarrier() calls in an effort to solve this, with no success.

I don’t understand where the difference is, since it seems to be only a matter of how I chose to structure my code and nothing to do with OpenCL, but something is definitely happening.

Any pointers will be appreciated.

Best regards
Mat