How parallel is this code?

Hello I’m trying to do some calculation using OpenCL but I’m not getting the result I was expecting. This code was run in one test on two Tesla T10 Processors and another test in one single core Intel 2GHz.

These were the results:
CPU: Overall: 13 files, 20563 KB in 0.4 s Throughput: 51407KB/s 49MB/s
2 GPUs: Overall: 13 files, 20563 KB in 25.57 s Throughput: 804KB/s 0MB/s

What am I doing wrong? How is it possible to get the best of the two GPUs? What am I missing?

This is the API:

#ifndef __OCLSTACK_H__
#define __OCLSTACK_H__

#include <CL/cl.h>
#include <stdlib.h>
#include <stdio.h>

#define MAX_ENTRY_DEVICES 8

enum PFS_DEVICE_T {CPU = CL_DEVICE_TYPE_CPU, GPU = CL_DEVICE_TYPE_GPU, ALL = CL_DEVICE_TYPE_ALL};

typedef struct _pfs pfs_t;

struct _pfs{
    cl_platform_id  platform;
    
    cl_device_id    device[8];
    cl_device_type  deviceType[8];
    cl_uint         num_devices;
    cl_uint         num_entry_devices;
    cl_ulong        maxAllocSizes[8];

    cl_context      contexts[8]; //Combinations amongst number of devices
    cl_context_properties properties[3];
    
    cl_command_queue commands[8];
    
    cl_program      programs[8];
    
    cl_kernel       kernels[8];
    void*           kargs[8];

    const char* kernelName[8];
    const char* kernelSource;
};

int
pfs_initiate( pfs_t* handler, const char* kernelFile, enum PFS_DEVICE_T dt );

int
pfs_createKernels( pfs_t* h, const char* kName );

int
pfs_terminate( pfs_t* handler );

int
pfs_run( pfs_t* h, unsigned int *state, unsigned char* input, unsigned int size );

#endif
#include "pfs.h"
#include "pfserr.h"

#include <CL/cl.h>


int
pfs_run( pfs_t* h, unsigned int *state, unsigned char* input, unsigned int size ){

        int err = CL_SUCCESS;
        static unsigned int last_size=0;
        if(size != 0)
            last_size = size;
        // create buffers for the input and ouput
        cl_mem _result, _size,_block, _last;
//	printf("Size: %d, Input[0..4]:%c%c%c%c
", size, input[0], input[1], input[2], input[3]);
        _block  = clCreateBuffer(h->contexts[0], CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeof(unsigned char) * last_size, input, &err);
        if( err != CL_SUCCESS ){
            printf("[-] Unable to create a buffer object. BLOCK
");
            return pfs__trace_clCreateBuffer(err);
        }

        _result = clCreateBuffer(h->contexts[0], CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, sizeof(unsigned int) * 4 , state, &err);
        if( err != CL_SUCCESS ){
            printf("[-] Unable to create a buffer object. RESULT
");
            return pfs__trace_clCreateBuffer(err);
        }

    	_size   = clCreateBuffer(h->contexts[0], CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeof(unsigned int), &last_size, &err);
        if( err != CL_SUCCESS ){
            printf("[-] Unable to create a buffer object. SIZE
");
            return pfs__trace_clCreateBuffer(err);
        }

    	_last   = clCreateBuffer(h->contexts[0], CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeof(unsigned int), &size, &err);
        if( err != CL_SUCCESS ){
            printf("[-] Unable to create a buffer object. SIZE
");
            return pfs__trace_clCreateBuffer(err);
        }
        // load data into the input buffer
	    err = clEnqueueWriteBuffer(h->commands[0], _size, CL_TRUE, 0, sizeof(unsigned int), &last_size, 0, NULL, NULL);
        if( err != CL_SUCCESS ){
            printf("[-] Unable to enqueue h->commands[0] to write to a buffer object from host memory SIZE.
");
            return pfs__trace_clEnqueueWriteBuffer(err);
        }
	        err = clEnqueueWriteBuffer(h->commands[0], _block, CL_TRUE, 0, sizeof(unsigned char) * last_size, input, 0, NULL, NULL);
        if( err != CL_SUCCESS ){
            printf("[-] Unable to enqueue h->commands[0] to write to a buffer object from host memory BLOCK.
");
            return pfs__trace_clEnqueueWriteBuffer(err);
        }

	    err = clEnqueueWriteBuffer(h->commands[0], _result, CL_TRUE, 0, sizeof(unsigned int) * 4, state, 0, NULL, NULL);
        if( err != CL_SUCCESS ){
            printf("[-] Unable to enqueue h->commands[0] to write to a buffer object from host memory RESULT.
");
            return pfs__trace_clEnqueueWriteBuffer(err);
        }

	    err = clEnqueueWriteBuffer(h->commands[0], _last, CL_TRUE, 0, sizeof(unsigned int), &size, 0, NULL, NULL);
        if( err != CL_SUCCESS ){
            printf("[-] Unable to enqueue h->commands[0] to write to a buffer object from host memory LAST.
");
            return pfs__trace_clEnqueueWriteBuffer(err);
        }

        err = clFlush(h->commands[0]);
        if( err != CL_SUCCESS ){
            printf("[-] Unable to block until all previously queued OpenCL h->commands[0] in a command-queue are issued to the associated device and have completed.
");
            return pfs__trace_clFinish(err);
        }
//printf("WARGS data loaded into input buffers
");

        // set the argument list for the kernel command
    	err = clSetKernelArg(h->kernels[0], 0, sizeof(cl_mem), &_block);
        if( err != CL_SUCCESS ){
            printf("[-] Unable to set the argument value for a specific argument of a kernel. BLOCK
");
            return pfs__trace_clSetKernelArg(err);
        }
    	err = clSetKernelArg(h->kernels[0], 1, sizeof(cl_mem), &_result);
        if( err != CL_SUCCESS ){
            printf("[-] Unable to set the argument value for a specific argument of a kernel. RESULT
");
            return pfs__trace_clSetKernelArg(err);
        }
    	err = clSetKernelArg(h->kernels[0], 2, sizeof(cl_mem), &_size);
        if( err != CL_SUCCESS ){
            printf("[-] Unable to set the argument value for a specific argument of a kernel. SIZE
");
            return pfs__trace_clSetKernelArg(err);
        }
    	err = clSetKernelArg(h->kernels[0], 3, sizeof(cl_mem), &_last);
        if( err != CL_SUCCESS ){
            printf("[-] Unable to set the argument value for a specific argument of a kernel. LAST
");
            return pfs__trace_clSetKernelArg(err);
        }

//printf("WARGS set the argument list for the kernels command
");
        //cl_event event;
        size_t global=1;
        // enqueue the kernel command for execution
        //clEnqueueNDRangeKernel(h->commands[0], h->kernels[0], 1, NULL, &global, NULL,
        //                       event_list_size, event_list, &event);
        err = clEnqueueNDRangeKernel(h->commands[0], h->kernels[0], 1, NULL, &global, NULL, 0, NULL, NULL);
        if( err != CL_SUCCESS ){
            printf("[-] Unable to enqueue a command to execute a kernel on a device.
");
            return pfs__trace_clEnqueueNDRangeKernel(err);
        }
//printf("WARGS ND Enqueued
");
        
        //event_list[event_list_size] = event;
        //event_list_size++;

	if(size == 0){
//		printf("Finishing..
");
        	err = clFinish(h->commands[0]);
	}else{
//		printf("Flushing..
");
		err = clFlush(h->commands[0]);
	}
        if( err != CL_SUCCESS ){
            printf("[-] Unable to block until all previously queued OpenCL h->commands[0] in a command-queue are issued to the associated device and have completed.
");
            return pfs__trace_clFinish(err);
        }
    
//printf("WARGS Command finished
");
        // copy the results from out of the output buffer
        err = clEnqueueReadBuffer(h->commands[0], _result, CL_TRUE, 0, sizeof(unsigned int) * 4, state, 0, NULL, NULL);
        if( err != CL_SUCCESS ){
            printf("[-] Unable to enqueue h->commands[0] to read from a buffer object to host memory.
");
            return pfs__trace_clEnqueueReadBuffer(err);
        }
    
        clReleaseMemObject(_block);
        clReleaseMemObject(_size);
        clReleaseMemObject(_last);
        clReleaseMemObject(_result);
        return err;
}

This is the main application:

(...)
for(FileNum = argc; FileNum > 2; FileNum--){

            fp = fopen(argv[FileNum-1],"rb");
            if(fp == NULL)
               return 1;

            state[0] = 0x67452301u;
            state[1] = 0xEFCDAB89u;
            state[2] = 0x98BADCFEu;
            state[3] = 0x10325476u;

            start = clock();
            do{
                bytes = fread(input, 1, input_size, fp);

                if( pfs_run(&h, state, input, bytes) )
                    return -1;
		bytesTotal += bytes;
            }while(bytes != 0);
            
            end = clock();

            for(i=0; i<4; i++)
                tmp[i] =  (state[i] >> 24)
                       | ((state[i] >> 8 ) & 0x0000FF00)
                       | ((state[i] << 8 ) & 0x00FF0000)
                       |  (state[i] << 24);

            printf("%08x%08x%08x%08x  %20s	%g s
",tmp[0],tmp[1],tmp[2],tmp[3], argv[FileNum-1],
                                                    ((double)( end - start )) / CLOCKS_PER_SEC);
            fclose(fp);
(...)
    }

any answer is a good answer… you don’t need to read the code, we may talk in concepts… I’m doing some calculation on the kernel. It receives a payload and the size of it, as well as a state variable. It doesn’t need to be run in 2 nor 3 dimentions but, would make it faster? can someone help me on this? please.

It looks like you are calling clEnqueueNDRangeKernel with the total amount of work to do i.e. global_work_size set to 1. There is basically one work-item that is executing the kernel. There is nothing being run in parallel here.

To really achieve the speedup you are looking for, you need to execute multiple work-items in parallel. The global_work_size should ideally reflect the amount of work that can be executed in parallel. For example, if I’m adding two arrays, then I can execute a + b for each entry in parallel. If the arrays have a million entries each, the global_work_size = 1 M.

So let’s say that I have a file of 4096 bytes and I divide it in blocks of 1024 bytes each but for each block I’m doing calculation I need the result of the previous block calculated. This is something that won’t be done in parallel, but what about the block of 1024, that can be run in parallel?

This is like dividing my 1M array in slices of 1M/4 (because that’s the maximum memory hardware can handle) and in the meanwhile making the sum of all slices in sequence.
In this case what should I have to have attention while writing the kernel? Should global_work_size be the size of 1M/4?

In that case what changes shoul one do in order to properly calculate the sum of that array?

This is the Kernel I was using:


__kernel void md5( __global unsigned int* data, __global unsigned int *state, __global unsigned int *block_size)
{
#if 1

unsigned int a,b,c,d,x[16],i;
# define rotate(x,n)  ((x << n) | (x >> (32 - n)))
# define X(Y,a,b,c,d,x,s,ac)  a += Y(b,c,d) + x + ac; if(s != 0) a = rotate(a,s); a += b
  for(i = 0;i < *block_size;i += 64)
  {
  
      x[ 0] = data[ 0];
      x[ 1] = data[ 1];
      x[ 2] = data[ 2];
      x[ 3] = data[ 3];
      x[ 4] = data[ 4];
      x[ 5] = data[ 5];
      x[ 6] = data[ 6];
      x[ 7] = data[ 7];
      x[ 8] = data[ 8];
      x[ 9] = data[ 9];
      x[10] = data[10];
      x[11] = data[11];
      x[12] = data[12];
      x[13] = data[13];
      x[14] = data[14];
      x[15] = data[15];
    
    
    data = &data[16];
    a = state[0];
    b = state[1];
    c = state[2];
    d = state[3];
# define F(x,y,z)  ((x & y) | (~x & z))
# define S11  7
# define S12 12
# define S13 17
# define S14 22
    X(F,a,b,c,d,x[ 0],S11,0xD76AA478u);
    X(F,d,a,b,c,x[ 1],S12,0xE8C7B756u);
    X(F,c,d,a,b,x[ 2],S13,0x242070DBu);
    X(F,b,c,d,a,x[ 3],S14,0xC1BDCEEEu);
    X(F,a,b,c,d,x[ 4],S11,0xF57C0FAFu);
    X(F,d,a,b,c,x[ 5],S12,0x4787C62Au);
    X(F,c,d,a,b,x[ 6],S13,0xA8304613u);
    X(F,b,c,d,a,x[ 7],S14,0xFD469501u);
    X(F,a,b,c,d,x[ 8],S11,0x698098D8u);
    X(F,d,a,b,c,x[ 9],S12,0x8B44F7AFu);
    X(F,c,d,a,b,x[10],S13,0xFFFF5BB1u);
    X(F,b,c,d,a,x[11],S14,0x895CD7BEu);
    X(F,a,b,c,d,x[12],S11,0x6B901122u);
    X(F,d,a,b,c,x[13],S12,0xFD987193u);
    X(F,c,d,a,b,x[14],S13,0xA679438Eu);
    X(F,b,c,d,a,x[15],S14,0x49B40821u);
# undef F
# undef S11
# undef S12
# undef S13
# undef S14
# define G(x,y,z)  ((x & z) | (y & ~z))
# define S21  5
# define S22  9
# define S23 14
# define S24 20
    X(G,a,b,c,d,x[ 1],S21,0xF61E2562u);
    X(G,d,a,b,c,x[ 6],S22,0xC040B340u);
    X(G,c,d,a,b,x[11],S23,0x265E5A51u);
    X(G,b,c,d,a,x[ 0],S24,0xE9B6C7AAu);
    X(G,a,b,c,d,x[ 5],S21,0xD62F105Du);
    X(G,d,a,b,c,x[10],S22,0x02441453u);
    X(G,c,d,a,b,x[15],S23,0xD8A1E681u);
    X(G,b,c,d,a,x[ 4],S24,0xE7D3FBC8u);
    X(G,a,b,c,d,x[ 9],S21,0x21E1CDE6u);
    X(G,d,a,b,c,x[14],S22,0xC33707D6u);
    X(G,c,d,a,b,x[ 3],S23,0xF4D50D87u);
    X(G,b,c,d,a,x[ 8],S24,0x455A14EDu);
    X(G,a,b,c,d,x[13],S21,0xA9E3E905u);
    X(G,d,a,b,c,x[ 2],S22,0xFCEFA3F8u);
    X(G,c,d,a,b,x[ 7],S23,0x676F02D9u);
    X(G,b,c,d,a,x[12],S24,0x8D2A4C8Au);
# undef G
# undef S21
# undef S22
# undef S23
# undef S24
# define H(x,y,z)  (x ^ y ^ z)
# define S31  4
# define S32 11
# define S33 16
# define S34 23
    X(H,a,b,c,d,x[ 5],S31,0xFFFA3942u);
    X(H,d,a,b,c,x[ 8],S32,0x8771F681u);
    X(H,c,d,a,b,x[11],S33,0x6D9D6122u);
    X(H,b,c,d,a,x[14],S34,0xFDE5380Cu);
    X(H,a,b,c,d,x[ 1],S31,0xA4BEEA44u);
    X(H,d,a,b,c,x[ 4],S32,0x4BDECFA9u);
    X(H,c,d,a,b,x[ 7],S33,0xF6BB4B60u);
    X(H,b,c,d,a,x[10],S34,0xBEBFBC70u);
    X(H,a,b,c,d,x[13],S31,0x289B7EC6u);
    X(H,d,a,b,c,x[ 0],S32,0xEAA127FAu);
    X(H,c,d,a,b,x[ 3],S33,0xD4EF3085u);
    X(H,b,c,d,a,x[ 6],S34,0x04881D05u);
    X(H,a,b,c,d,x[ 9],S31,0xD9D4D039u);
    X(H,d,a,b,c,x[12],S32,0xE6DB99E5u);
    X(H,c,d,a,b,x[15],S33,0x1FA27CF8u);
    X(H,b,c,d,a,x[ 2],S34,0xC4AC5665u);
# undef H
# undef S31
# undef S32
# undef S33
# undef S34
# define I(x,y,z)  (y ^ (x | ~z))
# define S41  6
# define S42 10
# define S43 15
# define S44 21
    X(I,a,b,c,d,x[ 0],S41,0xF4292244u);
    X(I,d,a,b,c,x[ 7],S42,0x432AFF97u);
    X(I,c,d,a,b,x[14],S43,0xAB9423A7u);
    X(I,b,c,d,a,x[ 5],S44,0xFC93A039u);
    X(I,a,b,c,d,x[12],S41,0x655B59C3u);
    X(I,d,a,b,c,x[ 3],S42,0x8F0CCC92u);
    X(I,c,d,a,b,x[10],S43,0xFFEFF47Du);
    X(I,b,c,d,a,x[ 1],S44,0x85845DD1u);
    X(I,a,b,c,d,x[ 8],S41,0x6FA87E4Fu);
    X(I,d,a,b,c,x[15],S42,0xFE2CE6E0u);
    X(I,c,d,a,b,x[ 6],S43,0xA3014314u);
    X(I,b,c,d,a,x[13],S44,0x4E0811A1u);
    X(I,a,b,c,d,x[ 4],S41,0xF7537E82u);
    X(I,d,a,b,c,x[11],S42,0xBD3AF235u);
    X(I,c,d,a,b,x[ 2],S43,0x2AD7D2BBu);
    X(I,b,c,d,a,x[ 9],S44,0xEB86D391u);
# undef I
# undef S41
# undef S42
# undef S43
# undef S44


    state[0] += a;
    state[1] += b;
    state[2] += c;
    state[3] += d;

#undef rotate
#undef X
    }
#endif
}