CPU Usage 200%

I am using a 3-core AMD system with 2x GTX 260 cards (but I'm only running on the second one right now). I am on Arch Linux with the nvidia driver 280.13-1 (http://www.archlinux.org/packages/extra/x86_64/nvidia/).

The process I am working on is pretty complex, but is it reasonable for two full CPU cores to be pegged while event.wait() is blocking? I'm also a little concerned about performance: a single-threaded CPU run took about 44 hours, and I had hoped that running 128 work-items simultaneously (since the card has 128 stream processors) would give me roughly a 100x speedup, but that doesn't seem to be the case.
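
One thing I'm considering is replacing event.wait() with a polling loop that sleeps between status checks, on the theory that the blocking wait is spinning on the host. This is just a sketch using the queue/event from the code below, and I haven't verified it actually lowers CPU usage; the getInfo query is standard, but the 1 ms sleep interval is an arbitrary guess:

// Poll the event instead of blocking in event.wait(), sleeping between
// checks so the host thread doesn't spin. Needs <unistd.h> for usleep.
queue.flush(); // make sure the kernel is actually submitted before polling
cl_int status = CL_QUEUED;
while(status != CL_COMPLETE && status >= 0) { // a negative status means an error
    usleep(1000); // 1 ms between checks (arbitrary)
    status = event.getInfo<CL_EVENT_COMMAND_EXECUTION_STATUS>();
}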

I'll throw my kernel up in case anyone is interested. I'm using a #define OPENCL_KERNEL in a header file so the exact same code compiles for both the CPU and GPU paths (a simplified sketch of the pattern is below the link). I've tested the kernel on small cases and it gives the same output as the CPU version.
http://paste.pocoo.org/show/455434/
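
For reference, the shared-header trick looks roughly like this; it's a simplified sketch with made-up names (helperExample, kernelExample), not my actual header. The real helpers are parseDataCorrHelp/parseDataMIHelp in the paste above.

/* The same helper compiles as plain C++ on the host and as OpenCL C on the
 * device; the __kernel wrapper only exists when -D OPENCL_KERNEL is passed
 * to program.build(). */
#ifdef OPENCL_KERNEL
    #define GLOBAL __global
#else
    #define GLOBAL
#endif

void helperExample(GLOBAL float* data, int ii)
{
    data[ii] = data[ii]*data[ii]; //square each element in place
}

#ifdef OPENCL_KERNEL
__kernel void kernelExample(GLOBAL float* data)
{
    helperExample(data, get_global_id(0));
}
#endif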

Sorry for not including my enqueue code earlier. I think the problem is that I was passing cl::NDRange(1) for the work-group (local) size; fixing that might solve it. I guess each work-group was only running one work-item at a time?
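
For reference, here is the difference I mean in the enqueue call. The global size is the total number of work-items; the local size is the work-group size, and cl::NullRange just lets the runtime pick it:

size_t totalWork = (timeseries.points*timeseries.points + timeseries.points)/2;

// What I had: work-groups of a single work-item, which leaves almost all of
// each multiprocessor idle.
//queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(totalWork),
//            cl::NDRange(1), NULL, &event);

// Better: pass cl::NullRange (or a multiple of 32 that divides the global
// size, e.g. cl::NDRange(64)) as the local size so each work-group actually
// fills the hardware.
queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(totalWork),
            cl::NullRange, NULL, &event);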

/* parseData
 * ostats     - output statistics, one entry per pair of region labels
 * timeseries - input voxel timeseries data
 * mutual     - use mutual information instead of correlation
 * srcfile    - OpenCL source file; if empty, run the same helpers on the CPU */
int parseData(StatsT& ostats, TimeseriesT& timeseries, bool mutual, string srcfile = "")
{
    //zero out the output statistics
    for(int ii = 0 ; ii < ostats.regions*ostats.regions ; ii++) {
        ostats.stats[ii].A                       = 0;
        ostats.stats[ii].B                       = 0;
        ostats.stats[ii].sem                     = 0;
        ostats.stats[ii].count                   = 0;
        ostats.stats[ii].peak_corr               = 0;
        ostats.stats[ii].delay_of_peak_corr      = 0;
        ostats.stats[ii].avg_peak_corr           = 0;
        ostats.stats[ii].avg_delay_of_peak_corr  = 0;
    }

    //calculate conversion from index in correlation table to label
    vector<int> index_to_label(ostats.regions);
    {
        int labelnum = 0;
        int curr_label = -1;
        for(int ii = 0 ; ii < timeseries.points ; ii++) {
            if(curr_label != timeseries.data[ii*timeseries.timepoints]) {
                curr_label = timeseries.data[ii*timeseries.timepoints];
                index_to_label[labelnum] = curr_label;
                labelnum++;
            }
        }
    }
    
    int DEVICE = 1; //index of the device to use (the second GTX 260)
    //for tracking time
    time_t start, end;
    double dif;
    time(&start);

    //there are several indexes here:
    //ii,jj = location in list of voxels
    //indexA, indexB = location in list of correlations
    //labelA, labelB = voxel labels
    if(srcfile.size()) {
        std::fstream fin(srcfile.c_str(), fstream::in);
        std::string src((std::istreambuf_iterator<char>(fin)), std::istreambuf_iterator<char>());
        fin.close();
        cout << "Source Size:" << src.size() << endl;

        size_t ostatSize = sizeof(StatsT)+sizeof(CorrelationT)*
                    timeseries.regions*timeseries.regions;
        cout << "sizeof(TimeseriesT): " << setw(10) << sizeof(TimeseriesT)
                    << " sizeof(float): " << setw(10) << sizeof(float)
                    << " points: " << setw(10) << timeseries.points
                    << " timepoints+1: " << setw(10) << timeseries.timepoints+1 << endl;
        size_t tsSize = sizeof(TimeseriesT)+sizeof(float)*timeseries.points*
                    (timeseries.timepoints+1);

        cl_int error;
        // Get list of platforms (things that can execute OpenCL on this host), get a "context" on the first executor.
        std::vector<cl::Platform> platformList;
        cl::Platform::get(&platformList);
        cl_context_properties cprops[3] = {CL_CONTEXT_PLATFORM, (cl_context_properties)(platformList[0])(), 0};
        cl::Context context( CL_DEVICE_TYPE_GPU, cprops, NULL, NULL, &error);
        if(error) {
            cerr << "Error getting context: " << endl;
            printError(error);
            return -1;
        }

        // Give the OpenCL program embedded in the string above to OpenCL.
        cl::Program::Sources source(1, std::make_pair(src.c_str(), src.length()+1));

        // Get devices used in this "context"
        std::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();

        // Compile program against device
        cout << "Available Devices: " << endl;
        string name, vendor, profile, version, driver;
        for(unsigned int i = 0 ; i < devices.size() ; i++) {
            devices[i].getInfo(CL_DEVICE_NAME, &name);
            devices[i].getInfo(CL_DEVICE_VENDOR, &vendor);
            devices[i].getInfo(CL_DEVICE_PROFILE, &profile);
            devices[i].getInfo(CL_DEVICE_VERSION, &version);
            devices[i].getInfo(CL_DRIVER_VERSION, &driver);
            cout << name << ", " << vendor << ", " << profile << ", " 
                        << version << ", " << driver << endl;
        }
        cl::Program program(context, source);
        error = program.build(devices,"-D OPENCL_KERNEL");
        string buildlog;
        program.getBuildInfo(devices[DEVICE], CL_PROGRAM_BUILD_LOG, &buildlog);
        cerr << buildlog<< endl;
        if(error) {
            cerr << "Error building program" << endl;
            printError(error);
            return -2;
        }

        // Create a kernel object for the selected kernel
        // ("parseDataMIHelp" or "parseDataCorrHelp").
        string kernelname;
        if(mutual)
            kernelname = "parseDataMIHelp";
        else
            kernelname = "parseDataCorrHelp";
            
        cl::Kernel kernel(program, kernelname.c_str(), &error);
        if(error) {
            cerr << "Error creating kernel" << endl;
            printError(error);
            return -3;
        }

        cout << "Timeseries buffer size: " << tsSize << endl;
        // Allocate Input Buffer
        cl::Buffer timeseriesCL(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, 
                    tsSize, &timeseries, &error);
        if(error) {
            cerr << "Error creating ts buffer: " << endl;
            printError(error);
            return -4;
        }
        
        //Allocate the label lookup buffer (read-only input)
        cl::Buffer index_to_labelCL( context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, 
                    index_to_label.size()*sizeof(int), index_to_label.data(), &error);
        if(error) {
            cerr << "Error creating label buffer: " << endl;
            printError(error);
            return -5;
        }
        
        cout << "Output stats buffer size: " << ostatSize << endl;
        //Allocate the output statistics buffer
        cl::Buffer ostatsCL(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, 
                    ostatSize, &ostats, &error);
        if(error) {
            cerr << "Error creating output buffer: " << endl;
            printError(error);
            return -6;
        }
        
        if((error = kernel.setArg(0, ostatsCL))) {
            cerr << "Error Setting Arg 0: " << endl;
            printError(error);
            return -7;
        }
        
        if((error = kernel.setArg(1, index_to_labelCL))) {
            cerr << "Error Setting Arg 1: " << endl;
            printError(error);
            return -8;
        }
        
        if((error = kernel.setArg(2, timeseriesCL))) {
            cerr << "Error Setting Arg 2: " << endl;
            printError(error);
            return -9;
        }

        // Create a command queue on the chosen device
        cl::CommandQueue queue(context, devices[DEVICE], 0, &error);
        if(error) {
            cerr << "Error creating command queue: " << endl;
            printError(error);
            return -10;
        }
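        // (Possible addition, not in my current code: create the queue with
        // CL_QUEUE_PROFILING_ENABLE and, after the wait, read
        //   cl_ulong t0 = event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
        //   cl_ulong t1 = event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
        // to time the kernel by itself, separate from host-side overhead.)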
        
        cl::Event event;
        cout << "Work-items: " << (timeseries.points*timeseries.points+timeseries.points)/2 << endl;
        cout << "Correlation entries: " << timeseries.regions*timeseries.regions << endl;

        devices[DEVICE].getInfo(CL_DEVICE_NAME, &name);
        devices[DEVICE].getInfo(CL_DEVICE_VENDOR, &vendor);
        devices[DEVICE].getInfo(CL_DEVICE_PROFILE, &profile);
        devices[DEVICE].getInfo(CL_DEVICE_VERSION, &version);
        devices[DEVICE].getInfo(CL_DRIVER_VERSION, &driver);
        
        cout << "Enqueue Kernel: " << (timeseries.points*timeseries.points+timeseries.points)/2 
                    << " On Device" << name << ", " << vendor << ", " << version << endl;
        error = queue.enqueueNDRangeKernel(kernel, cl::NullRange, 
                    cl::NDRange((timeseries.points*timeseries.points+timeseries.points)/2),
                    cl::NullRange, NULL, &event);
        if(error) {
            cerr << "Error Queuing NDRange kernel: " << endl;
            printError(error);
            return -11;
        }

        // Use the event object above to block until processing has completed
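        // (This wait is where the two host cores get pegged; the blocking wait
        // seems to spin rather than sleep on this driver, hence the polling
        // idea sketched near the top of the post.)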
        event.wait();

        // Read the results out of the shared memory area.
        error = queue.enqueueReadBuffer(ostatsCL, CL_TRUE, 0, ostatSize, &ostats);
        if(error) {
            cerr << "Error Queuing Read Buffer: " << endl;
            printError(error);
            return -12;
        }
        ostats.regions = timeseries.regions;
    } else {
        for(int ii = 0 ; ii < timeseries.points ; ii++) {
            time(&end);
            dif = difftime(end, start);
            //estimate remaining time by proportion:
            //elapsed / rows_done ~ remaining / rows_left
            //(rough, since later rows do less work because jj starts at ii)
            if(ii > 0)
                std::cout << "Remaining: " << dif*(timeseries.points-ii)/ii << std::endl;

            for(int jj = ii ; jj < timeseries.points ; jj++) {
                if(mutual) {
                    parseDataMIHelp(&ostats, index_to_label.data(), &timeseries, ii, jj);
                } else {
                    parseDataCorrHelp(&ostats, index_to_label.data(), &timeseries, ii, jj);
                }
            }
        }
    }
    return 0;
}