Sorry for not including my enqueue code. I think the problem is that I was using cl::NDRange(1) for the work-group size; changing that might fix the problem. I guess it was only running one work-item at a time?
/**
 * parseData - accumulate pairwise statistics for every pair of points in a timeseries.
 *
 * All ostats.regions * ostats.regions entries of ostats.stats are zeroed first.
 * A table mapping "index in the correlation table" to "label" is then built by
 * scanning the points in order and recording a new entry each time the value at
 * timeseries.data[ii * timeseries.timepoints] changes.
 *
 * Two execution paths follow:
 *  - srcfile non-empty: the file is read and built as an OpenCL program
 *    (with -D OPENCL_KERNEL), and the kernel "parseDataMIHelp" (mutual == true)
 *    or "parseDataCorrHelp" (mutual == false) is launched with a global work
 *    size of points*(points+1)/2, i.e. the number of (ii, jj) pairs with
 *    jj >= ii that the CPU path visits. Results are read back into ostats and
 *    ostats.regions is set to timeseries.regions.
 *  - srcfile empty: the same helper is called on the host for every pair
 *    (ii, jj) with jj >= ii.
 *
 * NOTE(review): the OpenCL buffers wrap &ostats / &timeseries with sizes larger
 * than sizeof(StatsT) / sizeof(TimeseriesT); this presumably relies on both
 * structs being allocated with their payload trailing the header - confirm
 * against the allocation code.
 *
 * @param ostats     output statistics; ostats.regions must already be set
 * @param timeseries input data (points, timepoints, regions, data are read)
 * @param mutual     selects the MI helper/kernel instead of the correlation one
 * @param srcfile    path of the OpenCL source; empty selects the host path
 * @return 0 on success, otherwise a negative code:
 *         -1 no platform / context / device, -2 program build, -3 kernel creation,
 *         -4 timeseries buffer, -5 label buffer, -6 output buffer,
 *         -7/-8/-9 setting kernel argument 0/1/2, -10 command queue creation,
 *         -9 enqueueing or waiting for the kernel, -8 reading back the results,
 *         -11 more label runs than ostats.regions, -12 kernel source unreadable.
 */
int parseData(StatsT& ostats, TimeseriesT& timeseries, bool mutual, string srcfile = "")
{
    // Zero out the output statistics.
    for(int ii = 0 ; ii < ostats.regions*ostats.regions ; ii++) {
        ostats.stats[ii].A = 0;
        ostats.stats[ii].B = 0;
        ostats.stats[ii].sem = 0;
        ostats.stats[ii].count = 0;
        ostats.stats[ii].peak_corr = 0;
        ostats.stats[ii].delay_of_peak_corr = 0;
        ostats.stats[ii].avg_peak_corr = 0;
        ostats.stats[ii].avg_delay_of_peak_corr = 0;
    }

    // Calculate the conversion from index in the correlation table to label:
    // a new entry is recorded every time the label value changes between
    // consecutive points.
    vector<int> index_to_label(ostats.regions);
    {
        size_t labelnum = 0;
        int curr_label = -1;
        for(int ii = 0 ; ii < timeseries.points ; ii++) {
            if(curr_label != timeseries.data[ii*timeseries.timepoints]) {
                // Refuse to write past the table instead of corrupting memory.
                if(labelnum >= index_to_label.size()) {
                    cerr << "Error: more label runs than regions ("
                         << index_to_label.size() << ")" << endl;
                    return -11;
                }
                curr_label = timeseries.data[ii*timeseries.timepoints];
                index_to_label[labelnum] = curr_label;
                labelnum++;
            }
        }
    }

    // There are several indexes here:
    //   ii, jj         = location in list of voxels
    //   indexA, indexB = location in list of correlations
    //   labelA, labelB = voxel labels
    if(srcfile.size()) {
        // ---- OpenCL path ----
        std::fstream fin(srcfile.c_str(), fstream::in);
        if(!fin) {
            cerr << "Error opening kernel source: " << srcfile << endl;
            return -12;
        }
        std::string src((std::istreambuf_iterator<char>(fin)), std::istreambuf_iterator<char>());
        fin.close();
        cout << "Source Size:" << src.size() << endl;

        // Byte sizes of the regions handed to OpenCL (header struct + payload).
        size_t ostatSize = sizeof(StatsT)+sizeof(CorrelationT)*
                    timeseries.regions*timeseries.regions;
        cout << setw(10) << sizeof(TimeseriesT) << setw(10) << sizeof(float) << setw(10) << timeseries.points
             << setw(10) << timeseries.timepoints+1 << endl;
        size_t tsSize = sizeof(TimeseriesT)+sizeof(float)*timeseries.points*
                    (timeseries.timepoints+1);

        // Number of (ii, jj) pairs with jj >= ii; computed in size_t so that
        // points*points cannot overflow a signed int.
        const size_t npoints = static_cast<size_t>(timeseries.points);
        const size_t pairCount = (npoints*npoints+npoints)/2;

        cl_int error;

        // Get the list of platforms and create a GPU context on the first one.
        std::vector<cl::Platform> platformList;
        cl::Platform::get(&platformList);
        if(platformList.empty()) {
            cerr << "Error getting context: " << endl;
            cerr << "No OpenCL platform available" << endl;
            return -1;
        }
        cl_context_properties cprops[3] = {CL_CONTEXT_PLATFORM,
                    (cl_context_properties)(platformList[0])(), 0};
        cl::Context context( CL_DEVICE_TYPE_GPU, cprops, NULL, NULL, &error);
        if(error) {
            cerr << "Error getting context: " << endl;
            printError(error);
            return -1;
        }

        // Hand the source text read from srcfile to OpenCL.
        cl::Program::Sources source(1, std::make_pair(src.c_str(), src.length()+1));

        // Get the devices used in this context and list them.
        std::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
        if(devices.empty()) {
            cerr << "Error getting context: " << endl;
            cerr << "No OpenCL device in context" << endl;
            return -1;
        }
        // Prefer the second device when the context exposes more than one
        // (the historical choice here); otherwise use the only device.
        const size_t DEVICE = devices.size() > 1 ? 1 : 0;

        cout << "Available Devices: " << endl;
        string name, vendor, profile, version, driver;
        for(unsigned int i = 0 ; i < devices.size() ; i++) {
            devices[i].getInfo(CL_DEVICE_NAME, &name);
            devices[i].getInfo(CL_DEVICE_VENDOR, &vendor);
            devices[i].getInfo(CL_DEVICE_PROFILE, &profile);
            devices[i].getInfo(CL_DEVICE_VERSION, &version);
            devices[i].getInfo(CL_DRIVER_VERSION, &driver);
            cout << name << ", " << vendor << ", " << profile << ", "
                 << version << ", " << driver << endl;
        }

        // Compile the program for the context's devices; the build log is
        // always printed, even on success.
        cl::Program program(context, source);
        error = program.build(devices,"-D OPENCL_KERNEL");
        string buildlog;
        program.getBuildInfo(devices[DEVICE], CL_PROGRAM_BUILD_LOG, &buildlog);
        cerr << buildlog<< endl;
        if(error) {
            cerr << "Error building program" << endl;
            printError(error);
            return -2;
        }

        // Create the kernel object for the requested statistic.
        string kernelname;
        if(mutual)
            kernelname = "parseDataMIHelp";
        else
            kernelname = "parseDataCorrHelp";
        cl::Kernel kernel(program, kernelname.c_str(), &error);
        if(error) {
            cerr << "Error creating kernel" << endl;
            printError(error);
            return -3;
        }

        cout << tsSize << endl;
        // Input buffer wrapping the timeseries in host memory.
        cl::Buffer timeseriesCL(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
                    tsSize, &timeseries, &error);
        if(error) {
            cerr << "Error creating ts buffer: " << endl;
            printError(error);
            return -4;
        }
        cout << endl;

        // Input buffer wrapping the index -> label table.
        cl::Buffer index_to_labelCL( context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
                    index_to_label.size()*sizeof(int), index_to_label.data(), &error);
        if(error) {
            cerr << "Error creating label buffer: " << endl;
            printError(error);
            return -5;
        }

        cout << ostatSize << endl;
        // Output buffer wrapping the statistics in host memory.
        cl::Buffer ostatsCL(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
                    ostatSize, &ostats, &error);
        if(error) {
            cerr << "Error creating output buffer: " << endl;
            printError(error);
            return -6;
        }

        // Kernel arguments, in the same order as the host helpers take them.
        if((error = kernel.setArg(0, ostatsCL))) {
            cerr << "Error Setting Arg 0: " << endl;
            printError(error);
            return -7;
        }
        if((error = kernel.setArg(1, index_to_labelCL))) {
            cerr << "Error Setting Arg 1: " << endl;
            printError(error);
            return -8;
        }
        if((error = kernel.setArg(2, timeseriesCL))) {
            cerr << "Error Setting Arg 2: " << endl;
            printError(error);
            return -9;
        }

        // Command queue on the selected device.
        cl::CommandQueue queue(context, devices[DEVICE], 0, &error);
        if(error) {
            cerr << "Error queuing kernel: " << endl;
            printError(error);
            return -10;   // was "-0", which reported success to the caller
        }

        cl::Event event;
        cout << pairCount << endl;
        cout << timeseries.regions*timeseries.regions << endl;

        devices[DEVICE].getInfo(CL_DEVICE_NAME, &name);
        devices[DEVICE].getInfo(CL_DEVICE_VENDOR, &vendor);
        devices[DEVICE].getInfo(CL_DEVICE_VERSION, &version);
        cout << "Enqueue Kernel: " << pairCount
             << " On Device" << name << ", " << vendor << ", " << version << endl;

        // Global size = number of pairs; the local (work-group) size is left
        // to the implementation via cl::NullRange.
        error = queue.enqueueNDRangeKernel(kernel, cl::NullRange,
                    cl::NDRange(pairCount), cl::NullRange, NULL, &event);
        if(error) {
            cerr << "Error Queuing NDRange kernel: " << endl;
            printError(error);
            return -9;
        }

        // Block until processing has completed.
        if((error = event.wait())) {
            cerr << "Error waiting for kernel: " << endl;
            printError(error);
            return -9;
        }

        // Blocking read of the results back into ostats.
        error = queue.enqueueReadBuffer(ostatsCL, CL_TRUE, 0, ostatSize, &ostats);
        if(error) {
            cerr << "Error Queuing Read Buffer: " << endl;
            printError(error);
            return -8;
        }
        ostats.regions = timeseries.regions;
    } else {
        // ---- Host path ----
        time_t start, end;
        time(&start);
        for(int ii = 0 ; ii < timeseries.points ; ii++) {
            // Progress estimate: elapsed * points / ii, i.e. the projected total
            // run time assuming every outer iteration costs the same. Skipped
            // for ii == 0, where the ratio is undefined.
            if(ii > 0) {
                time(&end);
                const double dif = difftime(end, start);
                std::cout << "Remaining: " << dif*timeseries.points/ii << std::endl;
            }
            for(int jj = ii ; jj < timeseries.points ; jj++) {
                if(mutual) {
                    parseDataMIHelp(&ostats, index_to_label.data(), &timeseries, ii, jj);
                } else {
                    parseDataCorrHelp(&ostats, index_to_label.data(), &timeseries, ii, jj);
                }
            }
        }
    }
    return 0;
}