Incorrect output for JPEG decoding with OpenCL on Eclipse (RHEL 6.6)

Hello.
I am new to OpenCL programming and am unable to figure out where I am going wrong with my code.

I am trying to translate a portion of the code in this link:

https://github.com/TimSC/mjpeg

to OpenCL but am not getting the expected output. The expected output is a .ppm image which I have attached to my query (test.ppm). I cannot attach a ppm file to this question at the moment.

I want to get identical output with my OpenCL code. I picked one function in the

utils.c

file and put it AS IT IS in a .cl file which I called

jpeg.cl

:

jpeg.cl
__kernel inline static void idct(__global int *in, __global int *out, __global int *quant, __global long off, int max)
{

unsigned int gid= get_global_id(0);
long t0, t1, t2, t3, t4, t5, t6, t7;	// t ;
long tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
long tmp[64], *tmpp;
int i, j, te;
unsigned char *zig2p;

t0 = off;
if (max == 1) {
    t0 += in[0] * quant[0];
    for (i = 0; i < 64; i++)
        out[i] = ITOINT(t0);
    return;
}
zig2p = zig2;
tmpp = tmp;
for (i = 0; i < 8; i++) {
    j = *zig2p++;
    t0 += in[j] * (long) quant[j];
    j = *zig2p++;
    t5 = in[j] * (long) quant[j];
    j = *zig2p++;
    t2 = in[j] * (long) quant[j];
    j = *zig2p++;
    t7 = in[j] * (long) quant[j];
    j = *zig2p++;
    t1 = in[j] * (long) quant[j];
    j = *zig2p++;
    t4 = in[j] * (long) quant[j];
    j = *zig2p++;
    t3 = in[j] * (long) quant[j];
    j = *zig2p++;
    t6 = in[j] * (long) quant[j];


    if ((t1 | t2 | t3 | t4 | t5 | t6 | t7) == 0) {

        tmpp[0 * 8] = t0;
        tmpp[1 * 8] = t0;
        tmpp[2 * 8] = t0;
        tmpp[3 * 8] = t0;
        tmpp[4 * 8] = t0;
        tmpp[5 * 8] = t0;
        tmpp[6 * 8] = t0;
        tmpp[7 * 8] = t0;

        tmpp++;
        t0 = 0;
        continue;
    }
    //IDCT;
    tmp0 = t0 + t1;
    t1 = t0 - t1;
    tmp2 = t2 - t3;
    t3 = t2 + t3;
    tmp2 = IMULT(tmp2, IC4) - t3;
    tmp3 = tmp0 + t3;
    t3 = tmp0 - t3;
    tmp1 = t1 + tmp2;
    tmp2 = t1 - tmp2;
    tmp4 = t4 - t7;
    t7 = t4 + t7;
    tmp5 = t5 + t6;
    t6 = t5 - t6;
    tmp6 = tmp5 - t7;
    t7 = tmp5 + t7;
    tmp5 = IMULT(tmp6, IC4);
    tmp6 = IMULT((tmp4 + t6), S22);
    tmp4 = IMULT(tmp4, (C22 - S22)) + tmp6;
    t6 = IMULT(t6, (C22 + S22)) - tmp6;
    t6 = t6 - t7;
    t5 = tmp5 - t6;
    t4 = tmp4 - t5;

    tmpp[0 * 8] = tmp3 + t7;	//t0;
    tmpp[1 * 8] = tmp1 + t6;	//t1;
    tmpp[2 * 8] = tmp2 + t5;	//t2;
    tmpp[3 * 8] = t3 + t4;	//t3;
    tmpp[4 * 8] = t3 - t4;	//t4;
    tmpp[5 * 8] = tmp2 - t5;	//t5;
    tmpp[6 * 8] = tmp1 - t6;	//t6;
    tmpp[7 * 8] = tmp3 - t7;	//t7;
    tmpp++;
    t0 = 0;
}
for (i = 0, j = 0; i < 8; i++) {
    t0 = tmp[j + 0];
    t1 = tmp[j + 1];
    t2 = tmp[j + 2];
    t3 = tmp[j + 3];
    t4 = tmp[j + 4];
    t5 = tmp[j + 5];
    t6 = tmp[j + 6];
    t7 = tmp[j + 7];
    if ((t1 | t2 | t3 | t4 | t5 | t6 | t7) == 0) {
        te = ITOINT(t0);
        out[j + 0] = te;
        out[j + 1] = te;
        out[j + 2] = te;
        out[j + 3] = te;
        out[j + 4] = te;
        out[j + 5] = te;
        out[j + 6] = te;
        out[j + 7] = te;
        j += 8;
        continue;
    }
    //IDCT;
    tmp0 = t0 + t1;
    t1 = t0 - t1;
    tmp2 = t2 - t3;
    t3 = t2 + t3;
    tmp2 = IMULT(tmp2, IC4) - t3;
    tmp3 = tmp0 + t3;
    t3 = tmp0 - t3;
    tmp1 = t1 + tmp2;
    tmp2 = t1 - tmp2;
    tmp4 = t4 - t7;
    t7 = t4 + t7;
    tmp5 = t5 + t6;
    t6 = t5 - t6;
    tmp6 = tmp5 - t7;
    t7 = tmp5 + t7;
    tmp5 = IMULT(tmp6, IC4);
    tmp6 = IMULT((tmp4 + t6), S22);
    tmp4 = IMULT(tmp4, (C22 - S22)) + tmp6;
    t6 = IMULT(t6, (C22 + S22)) - tmp6;
    t6 = t6 - t7;
    t5 = tmp5 - t6;
    t4 = tmp4 - t5;

    out[j + 0] = ITOINT(tmp3 + t7);
    out[j + 1] = ITOINT(tmp1 + t6);
    out[j + 2] = ITOINT(tmp2 + t5);
    out[j + 3] = ITOINT(t3 + t4);
    out[j + 4] = ITOINT(t3 - t4);
    out[j + 5] = ITOINT(tmp2 - t5);
    out[j + 6] = ITOINT(tmp1 - t6);
    out[j + 7] = ITOINT(tmp3 - t7);
    j += 8;
}


}

Please note that my goal FOR NOW is not to run the code efficiently on the GPU. I simply want to run it on my GPU (Tesla K20c) and get the same output which I am getting with the normal C code.

I am calling this kernel in a function called

jpeg_decode

which is located in the

utils.c file

. I have seen many OpenCL codes but I was uncertain about calling a kernel in a function. I got support from this link

https://www.fixstars.com/en/opencl/book/OpenCLProgrammingBook/fft-fast-fourier-transform/

and modified my

 jpeg_decode

function in the

utils.c

file in the following way:

jpeg_decode()

in

utils.c

:


int jpeg_decode(unsigned char **pic, unsigned char *buf, int *width, int *height)
{

 [b]  cl_int ret;
   cl_event kernelDone;[/b]


    struct jpeg_decdata *decdata;
    int i, j, m, tac, tdc;
    int intwidth, intheight;
    int mcusx, mcusy, mx, my;
    int ypitch ,xpitch,bpp,pitch,x,y;
    int mb;
    int max[6];
    ftopict convert;
    int err = 0;
    int isInitHuffman = 0;

    decdata = (struct jpeg_decdata *) malloc(sizeof(struct jpeg_decdata));

    //fixing= IFIX(a);
  [b] cl_kernel kernel;

   kernel = clCreateKernel(program, "idct", &ret);[/b]


        [b]size_t GWS[2];
            GWS[0]= 8;
            GWS[1] = 8;[/b]


    if (!decdata) {
	err = -1;
	goto error;
    }
    if (buf == NULL) {
	err = -1;
	goto error;
    }
    datap = buf;
    if (getbyte() != 0xff) {
	err = ERR_NO_SOI;
	goto error;
    }
    if (getbyte() != M_SOI) {
	err = ERR_NO_SOI;
	goto error;
    }
    if (readtables(M_SOF0, &isInitHuffman)) {
	err = ERR_BAD_TABLES;
	goto error;
    }
    getword();
    i = getbyte();
    if (i != 8) {
	err = ERR_NOT_8BIT;
	goto error;
    }
    intheight = getword();
    intwidth = getword();
    
    if ((intheight & 7) || (intwidth & 7)) {
	err = ERR_BAD_WIDTH_OR_HEIGHT;
	goto error;
    }
    info.nc = getbyte();
    if (info.nc > MAXCOMP) {
	err = ERR_TOO_MANY_COMPPS;
	goto error;
    }
    for (i = 0; i < info.nc; i++) {
	int h, v;
	comps[i].cid = getbyte();
	comps[i].hv = getbyte();
	v = comps[i].hv & 15;
	h = comps[i].hv >> 4;
	comps[i].tq = getbyte();
	if (h > 3 || v > 3) {
	   err = ERR_ILLEGAL_HV;
	   goto error;
	}
	if (comps[i].tq > 3) {
	   err = ERR_QUANT_TABLE_SELECTOR;
	   goto error;
	}
    }
    if (readtables(M_SOS,&isInitHuffman)) {
	err = ERR_BAD_TABLES;
	goto error;
    }
    getword();
    info.ns = getbyte();
    if (!info.ns){
    printf("info ns %d/n",info.ns);
	err = ERR_NOT_YCBCR_221111;
	goto error;
    }
    for (i = 0; i < info.ns; i++) {
	dscans[i].cid = getbyte();
	tdc = getbyte();
	tac = tdc & 15;
	tdc >>= 4;
	if (tdc > 1 || tac > 1) {
	   err = ERR_QUANT_TABLE_SELECTOR;
	   goto error;
	}
	for (j = 0; j < info.nc; j++)
	   if (comps[j].cid == dscans[i].cid)
		break;
	if (j == info.nc) {
	   err = ERR_UNKNOWN_CID_IN_SCAN;
	   goto error;
	}
	dscans[i].hv = comps[j].hv;
	dscans[i].tq = comps[j].tq;
	dscans[i].hudc.dhuff = dec_huffdc + tdc;
	dscans[i].huac.dhuff = dec_huffac + tac;
    }

    i = getbyte();
    j = getbyte();
    m = getbyte();

    if (i != 0 || j != 63 || m != 0) {
    	printf("hmm FW error,not seq DCT ??
");
    }
   // printf("ext huffman table %d 
",isInitHuffman);
    if(!isInitHuffman) {
    	if(huffman_init() < 0)
		return -ERR_BAD_TABLES;
	}
/*
    if (dscans[0].cid != 1 || dscans[1].cid != 2 || dscans[2].cid != 3) {
	err = ERR_NOT_YCBCR_221111;
	goto error;
    }

    if (dscans[1].hv != 0x11 || dscans[2].hv != 0x11) {
	err = ERR_NOT_YCBCR_221111;
	goto error;
    }
*/    
    /* if internal width and external are not the same or heigth too 
       and pic not allocated realloc the good size and mark the change 
       need 1 macroblock line more ?? */
    if (intwidth != *width || intheight != *height || *pic == NULL) {
	*width = intwidth;
	*height = intheight;
	// BytesperPixel 2 yuyv , 3 rgb24 
	*pic =
	   (unsigned char *) realloc((unsigned char *) *pic,
				     (size_t) intwidth * (intheight +
							  8) * 2);
    }


    switch (dscans[0].hv) {
    case 0x22: // 411
    	mb=6;
	mcusx = *width >> 4;
	mcusy = *height >> 4;
	bpp=2;
	xpitch = 16 * bpp;
	pitch = *width * bpp; // YUYV out
	ypitch = 16 * pitch;
	convert = yuv420pto422;	
	break;
    case 0x21: //422

    	mb=4;
	mcusx = *width >> 4;
	mcusy = *height >> 3;
	bpp=2;	
	xpitch = 16 * bpp;
	pitch = *width * bpp; // YUYV out
	ypitch = 8 * pitch;
    convert = yuv422pto422;
	break;
     printf("MY VALUES: %dx%d
",*width,*height);
    case 0x11: //444
	mcusx = *width >> 3;
	mcusy = *height >> 3;
	bpp=2;
	xpitch = 8 * bpp;
	pitch = *width * bpp; // YUYV out
	ypitch = 8 * pitch;
	if (info.ns==1) {
    		mb = 1;
		convert = yuv400pto422;
	} else {
		mb=3;
        convert = yuv422pto422;
	}
        break;
    default:
	err = ERR_NOT_YCBCR_221111;
	goto error;
	break;
    }

    idctqtab(quant[dscans[0].tq], decdata->dquant[0]);
    idctqtab(quant[dscans[1].tq], decdata->dquant[1]);
    idctqtab(quant[dscans[2].tq], decdata->dquant[2]);
    setinput(&in, datap);
    dec_initscans();

    dscans[0].next = 2;
    dscans[1].next = 1;
    dscans[2].next = 0;	/* 4xx encoding */
    for (my = 0,y=0; my < mcusy; my++,y+=ypitch) {
	for (mx = 0,x=0; mx < mcusx; mx++,x+=xpitch) {
	   if (info.dri && !--info.nm)
		if (dec_checkmarker()) {
		   err = ERR_WRONG_MARKER;
		   goto error;
		}
	switch (mb){
	   case 6: {
		decode_mcus(&in, decdata->dcts, mb, dscans, max);

        idct(decdata->dcts, decdata->out, decdata->dquant[0],
             IFIX(128.5), max[0]);
        idct(decdata->dcts + 64, decdata->out + 64,
             decdata->dquant[0], IFIX(128.5), max[1]);
        idct(decdata->dcts + 128, decdata->out + 128,
             decdata->dquant[0], IFIX(128.5), max[2]);
        idct(decdata->dcts + 192, decdata->out + 192,
             decdata->dquant[0], IFIX(128.5), max[3]);
        idct(decdata->dcts + 256, decdata->out + 256,
             decdata->dquant[1], IFIX(0.5), max[4]);
        idct(decdata->dcts + 320, decdata->out + 320,
             decdata->dquant[2], IFIX(0.5), max[5]);
    }
    break;

	   case 4:
	   {
        decode_mcus(&in, decdata->dcts, mb, dscans, max);

[b]
        ret = clSetKernelArg(kernel, 0,  sizeof(cl_int), (void*)&decdata->dcts);
      ret = clSetKernelArg(kernel, 1,  sizeof(cl_int), (void*)&decdata->out);
      ret = clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&decdata->dquant[0]);
      ret = clSetKernelArg(kernel, 3, sizeof(cl_long), IFIX(128.5));
       ret = clSetKernelArg(kernel, 4, sizeof(int), &max[0]);
          ret = clEnqueueNDRangeKernel(command_queue, kernel, 2, GWS, NULL,  NULL, 0, NULL, NULL);
//ret= clWaitForEvents(1, &kernelDone);
//
       ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*)&decdata->dcts + 64);
        ret = clSetKernelArg(kernel, 1,  sizeof(cl_int), (void*)&decdata->out + 64);
        ret = clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&decdata->dquant[0]);
        ret = clSetKernelArg(kernel, 3, sizeof(cl_long), IFIX(128.5));
  ret = clSetKernelArg(kernel, 4, sizeof(int), &max[1]);
          ret = clEnqueueNDRangeKernel(command_queue, kernel, 2, GWS, NULL, NULL, 0, NULL, NULL);
//ret= clWaitForEvents(2, &kernelDone);
//
        ret = clSetKernelArg(kernel, 0,  sizeof(cl_int), (void*)&decdata->dcts + 128);
        ret = clSetKernelArg(kernel, 1,  sizeof(cl_int), (void*)&decdata->out + 256);
        ret = clSetKernelArg(kernel, 2,  sizeof(cl_int), (void*)&decdata->dquant[1]);
        ret = clSetKernelArg(kernel, 3, sizeof(cl_long), IFIX(0.5));
        ret = clSetKernelArg(kernel, 4, sizeof(int), &max[4]);
        ret = clEnqueueNDRangeKernel(command_queue, kernel, 2, GWS, NULL,  NULL, 0, NULL, NULL);
//ret= clWaitForEvents(3, &kernelDone);
//
        ret = clSetKernelArg(kernel, 0,  sizeof(cl_int), (void*)&decdata->dcts + 192);
        ret = clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&decdata->out + 320);
        ret = clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&decdata->dquant[2]);
        ret = clSetKernelArg(kernel, 3, sizeof(cl_long), IFIX(0.5));
        ret = clSetKernelArg(kernel, 4, sizeof(int), &max[5]);
        ret = clEnqueueNDRangeKernel(command_queue, kernel, 2, GWS, NULL,  NULL, 0, NULL, NULL);
       //ret= clWaitForEvents(4, &kernelDone);
//
//        ret = clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL, GWS, NULL, 0, NULL, NULL);
//
//                ret = clEnqueueReadBuffer(command_queue, clOutput, CL_TRUE, 0, 8 * 8 * sizeof(int), decdata->out, 0, NULL, NULL);
//                clFinish(command_queue);
[/b]
//
//        idct(decdata->dcts, decdata->out, decdata->dquant[0],
//             IFIX(128.5), max[0]);
//        idct(decdata->dcts + 64, decdata->out + 64,
//             decdata->dquant[0], IFIX(128.5), max[1]);
//        idct(decdata->dcts + 128, decdata->out + 256,
//             decdata->dquant[1], IFIX(0.5), max[4]);
//        idct(decdata->dcts + 192, decdata->out + 320,
//             decdata->dquant[2], IFIX(0.5), max[5]);

	   }
	   break;
	   case 3:
	   	decode_mcus(&in, decdata->dcts, mb, dscans, max);

//             ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void *)&decdata->dcts);
//             ret = clSetKernelArg(kernel, 1, sizeof(cl_int), (void *)&decdata->out);
//             ret = clSetKernelArg(kernel, 2, sizeof(cl_int), (void *)&decdata->dquant[0]);
//             ret = clSetKernelArg(kernel, 3, sizeof(cl_long), (void*)IFIX(128.5));
//             ret = clSetKernelArg(kernel, 4, sizeof(cl_int), (void *)&max[0]);
//
//
//             ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void *)&decdata->dcts + 64);
//             ret = clSetKernelArg(kernel, 1, sizeof(cl_int), (void *)&decdata->out + 256);
//             ret = clSetKernelArg(kernel, 2, sizeof(cl_int), (void *)&decdata->dquant[1]);
//            ret = clSetKernelArg(kernel, 3, sizeof(cl_long), (void*)IFIX(0.5));
//             ret = clSetKernelArg(kernel, 4, sizeof(cl_int), (void *)&max[4]);
//
//             ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void *)&decdata->dcts + 128);
//             ret = clSetKernelArg(kernel, 1, sizeof(cl_int), (void *)&decdata->out + 320);
//             ret = clSetKernelArg(kernel, 2, sizeof(cl_int), (void *)&decdata->dquant[2]);
//             ret = clSetKernelArg(kernel, 3, sizeof(cl_long), (void*)IFIX(0.5));
//             ret = clSetKernelArg(kernel, 4, sizeof(cl_int), (void *)&max[5]);

//        idct(decdata->dcts, decdata->out, decdata->dquant[0],
//             IFIX(128.5), max[0]);
//        idct(decdata->dcts + 64, decdata->out + 256,
//             decdata->dquant[1], IFIX(0.5), max[4]);
//        idct(decdata->dcts + 128, decdata->out + 320,
//             decdata->dquant[2], IFIX(0.5), max[5]);


	   break;
	   case 1:
	   	decode_mcus(&in, decdata->dcts, mb, dscans, max);

//             ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void *)&decdata->dcts);
//             ret = clSetKernelArg(kernel, 1, sizeof(cl_int), (void *)&decdata->out);
//             ret = clSetKernelArg(kernel, 2, sizeof(cl_int), (void *)&decdata->dquant[0]);
//            ret = clSetKernelArg(kernel, 3, sizeof(cl_long), (void*)IFIX(128.5));
//             ret = clSetKernelArg(kernel, 4, sizeof(cl_int), (void *)&max[0]);


//        idct(decdata->dcts, decdata->out, decdata->dquant[0],
//             IFIX(128.5), max[0]);

	   break;

	} // switch enc411

[b]ret= clReleaseKernel(kernel);[/b]

	convert(decdata->out,*pic+y+x,pitch); 
	}
    }

    m = dec_readmarker(&in);
    if (m != M_EOI) {
	err = ERR_NO_EOI;
	goto error;
    }
    if (decdata)
	free(decdata);
    return 0;
  error:
    if (decdata)
	free(decdata);
    return err;



}

I wrote the host code in the main file called

test.c 

. It is as follows:


int main()
{
FILE *fr = fopen ("/root/Documents/lucmotion/mjpeg-master/test.mjpeg", "rb");
    unsigned char *pic=NULL;

    unsigned char *buf=malloc(63247);
    fread(buf, 1, 63247, fr);
    int width=0;
    int height=0;

    int out;
    cl_platform_id platform_id;
    cl_uint ret_num_platforms;
    cl_device_id device_id;
    cl_uint ret_num_devices;
    cl_context context;
    cl_command_queue command_queue;
    cl_program program;
    size_t kernel_code_size;
    char *kernel_src_str;
    int *result;
    cl_int ret;
    int storeResult;
   FILE *fp;

    /* Get Platform */
    ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);

    /* Get Device */
    ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id,
        &ret_num_devices);

    /* Create Context */
    context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);

    /* Create Command Queue */
    command_queue = clCreateCommandQueue(context, device_id, 0, &ret);

    /* Read Kernel Code */
    fp = fopen("/root/Documents/lucmotion/mjpeg-master/jpegDecode.cl", "r");
    kernel_code_size = fread(kernel_src_str, 1, MAX_SOURCE_SIZE, fp);
    fclose(fp);
//
    /* Create Program Object */
    program = clCreateProgramWithSource(context, 1, (const char **)&kernel_src_str,
        (const size_t *)&kernel_code_size, &ret);
    /* Compile kernel */
    ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);

//working with idct


    storeResult = jpeg_decode(&pic, buf, &width, &height);
      //printf("ret %d
", ret);
      printf("ptr %ld
", (long) pic);
      printf("height %d
", height);
      printf("width %d
", width);


      ret = clFlush(command_queue);
      ret = clFinish(command_queue);
    ret = clReleaseProgram(program);
    ret = clReleaseCommandQueue(command_queue);
    ret = clReleaseContext(context);
	
	//Write greyscale (Y) information as PPM image
	if (pic != NULL)
	{
        WritePPMImage("HalfImage.ppm", pic, height, width);
        printf("Written nWimage.ppm
");
	}

	return 0;
}

I would greatly appreciate it if someone can tell me what I am doing wrong with my code.

Waiting eagerly for your response.

The forum is not allowing me to edit my original post. Hence, I have posted additional details in this reply.

The images the codes have generated are in the following links:

(This link contains the ppm image I get by running the original github code)

https://www.flickr.com/photos/50175764@N06/34702391825/in/photostream/

(This link contains the image I get by running the OpenCL code)
https://www.flickr.com/photos/50175764@N06/34540668762/

Both of the images are originally

.ppm
images. Since I cannot directly attach ppm images to my post (and I somehow can’t attach jpg images to my posts either), I have only attached links to my output images.

I am also attaching a concise version of my jpeg_decode function:

int jpeg_decode(unsigned char **pic, unsigned char *buf, int *width, int *height)
{
 
 cl_int ret;
   cl_event kernelDone;

   cl_kernel kernel;
 
   kernel = clCreateKernel(program, "idct", &ret);
 
 
        size_t GWS[2];
            GWS[0]= 8;
            GWS[1] = 8;
 
 

	switch (mb){
	  
	   case 4:
	   {
        decode_mcus(&in, decdata->dcts, mb, dscans, max);
 
        ret = clSetKernelArg(kernel, 0,  sizeof(cl_int), (void*)&decdata->dcts);
      ret = clSetKernelArg(kernel, 1,  sizeof(cl_int), (void*)&decdata->out);
      ret = clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&decdata->dquant[0]);
      ret = clSetKernelArg(kernel, 3, sizeof(cl_long), IFIX(128.5));
       ret = clSetKernelArg(kernel, 4, sizeof(int), &max[0]);
          ret = clEnqueueNDRangeKernel(command_queue, kernel, 2, GWS, NULL,  NULL, 0, NULL, NULL);
//ret= clWaitForEvents(1, &kernelDone);
//
       ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*)&decdata->dcts + 64);
        ret = clSetKernelArg(kernel, 1,  sizeof(cl_int), (void*)&decdata->out + 64);
        ret = clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&decdata->dquant[0]);
        ret = clSetKernelArg(kernel, 3, sizeof(cl_long), IFIX(128.5));
  ret = clSetKernelArg(kernel, 4, sizeof(int), &max[1]);
          ret = clEnqueueNDRangeKernel(command_queue, kernel, 2, GWS, NULL, NULL, 0, NULL, NULL);
//ret= clWaitForEvents(2, &kernelDone);
//
        ret = clSetKernelArg(kernel, 0,  sizeof(cl_int), (void*)&decdata->dcts + 128);
        ret = clSetKernelArg(kernel, 1,  sizeof(cl_int), (void*)&decdata->out + 256);
        ret = clSetKernelArg(kernel, 2,  sizeof(cl_int), (void*)&decdata->dquant[1]);
        ret = clSetKernelArg(kernel, 3, sizeof(cl_long), IFIX(0.5));
        ret = clSetKernelArg(kernel, 4, sizeof(int), &max[4]);
        ret = clEnqueueNDRangeKernel(command_queue, kernel, 2, GWS, NULL,  NULL, 0, NULL, NULL);
//ret= clWaitForEvents(3, &kernelDone);
//
        ret = clSetKernelArg(kernel, 0,  sizeof(cl_int), (void*)&decdata->dcts + 192);
        ret = clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&decdata->out + 320);
        ret = clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&decdata->dquant[2]);
        ret = clSetKernelArg(kernel, 3, sizeof(cl_long), IFIX(0.5));
        ret = clSetKernelArg(kernel, 4, sizeof(int), &max[5]);
        ret = clEnqueueNDRangeKernel(command_queue, kernel, 2, GWS, NULL,  NULL, 0, NULL, NULL);
       //ret= clWaitForEvents(4, &kernelDone);
//
//        ret = clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL, GWS, NULL, 0, NULL, NULL);
//
//                ret = clEnqueueReadBuffer(command_queue, clOutput, CL_TRUE, 0, 8 * 8 * sizeof(int), decdata->out, 0, NULL, NULL);
//                clFinish(command_queue);


ret= clReleaseKernel(kernel);
 

}

I would greatly appreciate it if someone can AT LEAST tell me WHAT IS WRONG WITH MY CODE.

You don’t use workitem’s global ID, so there is no way for a thread to figure out which tile it is supposed to decode.

Thank you for your response. I am going to work on it.

What do you think is the cause of the dirty image?

This is some garbage uninitialized memory your GPU never touched and therefore it is undefined. Unless I misunderstood you program’s flow.

Yes. I realized that when I ran the code after commenting out the “clSetKernelArg” statements.

Could you tell me if a function that is called in normal C in the following way:

  idct(decdata->dcts, decdata->out, decdata->dquant[0],
             IFIX(128.5), max[0]);

Can be called in OpenCL in this way:

 ret = clSetKernelArg(kernel, 0,  sizeof(cl_int), (void*)&decdata->dcts);
      ret = clSetKernelArg(kernel, 1,  sizeof(cl_int), (void*)&decdata->out);
      ret = clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&decdata->dquant[0]);
      ret = clSetKernelArg(kernel, 3, sizeof(cl_long), IFIX(128.5));
       ret = clSetKernelArg(kernel, 4, sizeof(int), &max[0]);
          ret = clEnqueueNDRangeKernel(command_queue, kernel, 2, GWS, NULL,  NULL, 0, NULL, NULL);

In other words, is my OpenCL declaration equivalent to the C declaration? How can I replace the standard C function (like the one above) with an OpenCL equivalent of it? Is it even possible?

Waiting eagerly for your response.

What is IFIX? Also, you’re technically supposed to use cl_int when working with the 4th. And “__global long off” in the kernel itself doesn’t make sense, remove __global. Other than than, it seems about right.

IFIX is defined in the code in the following way:

#define IFIX(a) ((int)((a) * (1 << ISHIFT) + .5))

where ISHIFT is a constant.

Whenever I build the code using IFIX with SetKernelArg, I get the warning

"assignment from incompatible pointer type"

.

This is SO incorrect. clSetKernelArg only accepts valid pointers and you’re trying to give it a value. Do this instead.

cl_long t = IFIX(a);
      ret = clSetKernelArg(kernel, 3, sizeof(cl_long), &t);

I have literally no clue how come this code even compiles.

[QUOTE=Salabar;42378]This is SO incorrect. clSetKernelArg only accepts valid pointers and you’re trying to give it a value. Do this instead.

cl_long t = IFIX(a);
      ret = clSetKernelArg(kernel, 3, sizeof(cl_long), &t);

I have literally no clue how come this code even compiles.[/QUOTE]

Thank you for your suggestion. I followed your advice and made changes to my code. However, I am still getting the dirty image. It seems like the kernel I have written is not even being accessed by the program (I checked this by writing printf statements in the kernel. The code still built and went on to produce the same dirty image).

I changed my method of setting the arguments for my kernel. I did this in the following way in my “utils.c” file:

 cl_int ret;
   cl_int error;
   cl_event kernelDone;

   cl_long t= IFIX(128.5);

   cl_long u= IFIX(0.5);


cl_kernel kernel;

 size_t GWS[2];
            GWS[0]= 1;
            GWS[1] = 1;

 case 4:
	   {
        decode_mcus(&in, decdata->dcts, mb, dscans, max);

        ret = clSetKernelArg(kernel, 0,  400 * sizeof(cl_int), (void*)&clInput);
      ret = clSetKernelArg(kernel, 1,  400 * sizeof(cl_int), (void*)&clOutput);
      ret = clSetKernelArg(kernel, 2, 64 * sizeof(cl_int), (void*)&clquant);
      ret = clSetKernelArg(kernel, 3, sizeof(cl_long), &t);
       ret = clSetKernelArg(kernel, 4, sizeof(cl_int), &max[0]);
          ret = clEnqueueNDRangeKernel(command_queue, kernel, 2, GWS, NULL,  NULL, 0, NULL, NULL);
          ret = clEnqueueReadBuffer(command_queue, clOutput, CL_TRUE, 0, 400 * sizeof(cl_mem), decdata->out, 0, NULL, NULL);
                         clFinish(command_queue);
ret= clWaitForEvents(1, &kernelDone);

       ret = clSetKernelArg(kernel, 0, 400 * sizeof(cl_int), (void*)&decdata->dcts + 64);
        ret = clSetKernelArg(kernel, 1,  400 * sizeof(cl_int), (void*)&decdata->out + 64);
        ret = clSetKernelArg(kernel, 2, 64 * sizeof(cl_int), (void*)&decdata->dquant[0]);
        ret = clSetKernelArg(kernel, 3, sizeof(cl_long), &t);
  ret = clSetKernelArg(kernel, 4, sizeof(cl_int), &max[1]);
          ret = clEnqueueNDRangeKernel(command_queue, kernel, 2, GWS, NULL, NULL, 0, NULL, NULL);
          ret = clEnqueueReadBuffer(command_queue, clOutput, CL_TRUE, 0, 400 * sizeof(cl_int), decdata->out + 64, 0, NULL, NULL);
                         clFinish(command_queue);
ret= clWaitForEvents(2, &kernelDone);
//
        ret = clSetKernelArg(kernel, 0,  400 * sizeof(cl_int), (void*)&decdata->dcts + 128);
        ret = clSetKernelArg(kernel, 1,  400 * sizeof(cl_int), (void*)&decdata->out + 256);
        ret = clSetKernelArg(kernel, 2,  64 * sizeof(cl_int), (void*)&decdata->dquant[1]);
        ret = clSetKernelArg(kernel, 3, sizeof(cl_long), &u);
        ret = clSetKernelArg(kernel, 4, sizeof(cl_int), &max[4]);
        ret = clEnqueueNDRangeKernel(command_queue, kernel, 2, GWS, NULL,  NULL, 0, NULL, NULL);
        ret = clEnqueueReadBuffer(command_queue, clOutput, CL_TRUE, 0, 400 * sizeof(cl_int), decdata->out + 256, 0, NULL, NULL);
                       clFinish(command_queue);
ret= clWaitForEvents(3, &kernelDone);
////
        ret = clSetKernelArg(kernel, 0,  400 * sizeof(cl_int), (void*)&decdata->dcts + 192);
        ret = clSetKernelArg(kernel, 1, 400 * sizeof(cl_int), (void*)&decdata->out + 320);
        ret = clSetKernelArg(kernel, 2, 64 * sizeof(cl_int), (void*)&decdata->dquant[2]);
        ret = clSetKernelArg(kernel, 3, sizeof(cl_long), &u);
        ret = clSetKernelArg(kernel, 4, sizeof(cl_int), &max[5]);
        ret = clEnqueueNDRangeKernel(command_queue, kernel, 2, GWS, NULL,  NULL, 0, NULL, NULL);
        ret = clEnqueueReadBuffer(command_queue, clOutput, CL_TRUE, 0, 400 * sizeof(cl_int), decdata->out+320, 0, NULL, NULL);
                       clFinish(command_queue);
       ret= clWaitForEvents(4, &kernelDone);

       ret = clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL, GWS, NULL, 0, NULL, NULL);
               ret = clEnqueueReadBuffer(command_queue, clOutput, CL_TRUE, 0, 400 * sizeof(cl_int), decdata->out, 0, NULL, NULL);
              clFinish(command_queue);

	   }
	   break;

I essentially replaced every function declaration with “clSetKernelArg” statements. I then called “clEnqueueReadBuffer” to read the data back to the host. I called clEnqueueNDRangeKernel after every set of “clsetKernelArg” statements to execute each kernel on the device.
I set my global work size to be a 2-dimensional array with both arrays containing 1 value each. I am not comfortable with doing this but I wanted to see how this would work out.

My kernel file now looks like this:

__kernel inline static void idct(__global int *in, __global int *out, __global int *quant, long off, int max)
{

unsigned int gid= get_global_id(0);
unsigned int nid= get_global_id(1);
long t0, t1, t2, t3, t4, t5, t6, t7;	// t ;
long tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
long tmp[64], *tmpp;
unsigned char *zig2p;

t0 = off;
if (max == 1) {
    t0 += in[0] * quant[0];
    for (i = 0; i < 64; i++)
        out[i] = ITOINT(t0);
    return;
}
zig2p = zig2;
tmpp = tmp;
for (gid = 0; gid < 8; gid++) {
    nid = *zig2p++;
    t0 += in[nid] * (long) quant[nid];
    nid = *zig2p++;
    t5 = in[nid] * (long) quant[nid];
    nid = *zig2p++;
    t2 = in[nid] * (long) quant[nid];
    nid = *zig2p++;
    t7 = in[nid] * (long) quant[nid];
    nid = *zig2p++;
    t1 = in[nid] * (long) quant[nid];
    nid = *zig2p++;
    t4 = in[nid] * (long) quant[nid];
    nid = *zig2p++;
    t3 = in[nid] * (long) quant[nid];
    nid = *zig2p++;
    t6 = in[nid] * (long) quant[nid];


    if ((t1 | t2 | t3 | t4 | t5 | t6 | t7) == 0) {

        tmpp[0 * 8] = t0;
        tmpp[1 * 8] = t0;
        tmpp[2 * 8] = t0;
        tmpp[3 * 8] = t0;
        tmpp[4 * 8] = t0;
        tmpp[5 * 8] = t0;
        tmpp[6 * 8] = t0;
        tmpp[7 * 8] = t0;

        tmpp++;
        t0 = 0;
        continue;
    }
    //IDCT;
    tmp0 = t0 + t1;
    t1 = t0 - t1;
    tmp2 = t2 - t3;
    t3 = t2 + t3;
    tmp2 = IMULT(tmp2, IC4) - t3;
    tmp3 = tmp0 + t3;
    t3 = tmp0 - t3;
    tmp1 = t1 + tmp2;
    tmp2 = t1 - tmp2;
    tmp4 = t4 - t7;
    t7 = t4 + t7;
    tmp5 = t5 + t6;
    t6 = t5 - t6;
    tmp6 = tmp5 - t7;
    t7 = tmp5 + t7;
    tmp5 = IMULT(tmp6, IC4);
    tmp6 = IMULT((tmp4 + t6), S22);
    tmp4 = IMULT(tmp4, (C22 - S22)) + tmp6;
    t6 = IMULT(t6, (C22 + S22)) - tmp6;
    t6 = t6 - t7;
    t5 = tmp5 - t6;
    t4 = tmp4 - t5;

    tmpp[0 * 8] = tmp3 + t7;	//t0;
    tmpp[1 * 8] = tmp1 + t6;	//t1;
    tmpp[2 * 8] = tmp2 + t5;	//t2;
    tmpp[3 * 8] = t3 + t4;	//t3;
    tmpp[4 * 8] = t3 - t4;	//t4;
    tmpp[5 * 8] = tmp2 - t5;	//t5;
    tmpp[6 * 8] = tmp1 - t6;	//t6;
    tmpp[7 * 8] = tmp3 - t7;	//t7;
    tmpp++;
    t0 = 0;
}
for (gid = 0, nid = 0; gid < 8; gid++) {
    t0 = tmp[nid + 0];
    t1 = tmp[nid + 1];
    t2 = tmp[nid + 2];
    t3 = tmp[nid + 3];
    t4 = tmp[nid + 4];
    t5 = tmp[nid + 5];
    t6 = tmp[nid + 6];
    t7 = tmp[nid + 7];
    if ((t1 | t2 | t3 | t4 | t5 | t6 | t7) == 0) {
        te = ITOINT(t0);
        out[nid + 0] = te;
        out[nid + 1] = te;
        out[nid + 2] = te;
        out[nid + 3] = te;
        out[nid + 4] = te;
        out[nid + 5] = te;
        out[nid + 6] = te;
        out[nid + 7] = te;
        nid += 8;
        continue;
    }
    //IDCT;
    tmp0 = t0 + t1;
    t1 = t0 - t1;
    tmp2 = t2 - t3;
    t3 = t2 + t3;
    tmp2 = IMULT(tmp2, IC4) - t3;
    tmp3 = tmp0 + t3;
    t3 = tmp0 - t3;
    tmp1 = t1 + tmp2;
    tmp2 = t1 - tmp2;
    tmp4 = t4 - t7;
    t7 = t4 + t7;
    tmp5 = t5 + t6;
    t6 = t5 - t6;
    tmp6 = tmp5 - t7;
    t7 = tmp5 + t7;
    tmp5 = IMULT(tmp6, IC4);
    tmp6 = IMULT((tmp4 + t6), S22);
    tmp4 = IMULT(tmp4, (C22 - S22)) + tmp6;
    t6 = IMULT(t6, (C22 + S22)) - tmp6;
    t6 = t6 - t7;
    t5 = tmp5 - t6;
    t4 = tmp4 - t5;

    out[nid + 0] = ITOINT(tmp3 + t7);
    out[nid + 1] = ITOINT(tmp1 + t6);
    out[nid + 2] = ITOINT(tmp2 + t5);
    out[nid + 3] = ITOINT(t3 + t4);
    out[nid + 4] = ITOINT(t3 - t4);
    out[nid + 5] = ITOINT(tmp2 - t5);
    out[nid + 6] = ITOINT(tmp1 - t6);
    out[nid + 7] = ITOINT(tmp3 - t7);
    nid += 8;
}


}

You said in your previous answer that that global_id has been declared but was not being used in my previous kernel code. In order to get around that problem, I replaced i and j with gid and nid respectively. The global work size is only 1 in both dimensions so I figured I could let the loops stay.

Finally, I set up my context and called the function from the c file containing main. My main is set up in the following way:

int main()
{
    //Read input data
    FILE *fr = fopen ("/root/Documents/lucmotion/mjpeg-master/test.mjpeg", "rb");
    unsigned char *pic=NULL;

    unsigned char *buf=malloc(63247);
    fread(buf, 1, 63247, fr);
    int width=0;
    int height=0;

    int out;
    cl_platform_id platform_id;
    cl_uint ret_num_platforms;
    cl_device_id device_id;
    cl_uint ret_num_devices;
    cl_context context;
    cl_command_queue command_queue;
    cl_program program;
    size_t kernel_code_size;
    char *kernel_src_str;
    int *result;
    cl_int ret;
    int storeResult;
   FILE *fp;



    /* Get Platform */
    ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);

    /* Get Device */
    ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id,
        &ret_num_devices);

    /* Create Context */
    context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);

    /* Create Command Queue */
    command_queue = clCreateCommandQueue(context, device_id, 0, &ret);

    /* Read Kernel Code */
    fp = fopen("/root/Documents/lucmotion/mjpeg-master/jpegDecode.cl", "r");
    kernel_code_size = fread(kernel_src_str, 1, MAX_SOURCE_SIZE, fp);
    fclose(fp);

    /* Create Program Object */
    program = clCreateProgramWithSource(context, 1, (const char **)&kernel_src_str,
        (const size_t *)&kernel_code_size, &ret);
    /* Compile kernel */
    ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);

//working with idct

    storeResult = jpeg_decode(&pic, buf, &width, &height);
      //printf("ret %d
", ret);
      printf("ptr %ld
", (long) pic);
      printf("height %d
", height);
      printf("width %d
", width);


    /* OpenCL Object Finalization */
 
      ret = clFlush(command_queue);
      ret = clFinish(command_queue);
    ret = clReleaseProgram(program);
    ret = clReleaseCommandQueue(command_queue);
    ret = clReleaseContext(context);


	//Write greyscale (Y) information as PPM image

	if (pic != NULL)
	{
        WritePPMImage("HalfImage.ppm", pic, height, width);
        printf("Written nWimage.ppm
");
	}

	return 0;
}

I think now my main problem is that my kernel is not even being accessed by the program. Could this be why code was compiling despite obvious errors in declaring the arguments of the kernel?

How can I get around this problem? Any suggestions will be appreciated.

Simply try to fill to fill your image with some consistent pattern, like

 
for (int i = 0; i < 64; ++i)
out[(gid * get_global_size(1) + nid) * 64  + i] = gid %256;

and make sure everything else works.

[QUOTE=ASiddiq;42310]Hello.
I am new to OpenCL programming and am unable to figure out where I am going wrong with my code.

I am trying to translate a portion of the code in this link:

https://github.com/TimSC/mjpeg

to OpenCL but am not getting the expected output. The expected output is a .ppm image which I have attached to my query (test.ppm). I cannot attach a ppm file to this question at the moment.

I want to get identical output with my OpenCL code. I picked one function in the

utils.c

file and put it AS IT IS in a .cl file which I called

jpeg.cl

:

jpeg.cl
__kernel inline static void idct(__global int *in, __global int *out, __global int *quant, __global long off, int max)
{

unsigned int gid= get_global_id(0);
long t0, t1, t2, t3, t4, t5, t6, t7;	// t ;
long tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
long tmp[64], *tmpp;
int i, j, te;
unsigned char *zig2p;

t0 = off;
if (max == 1) {
    t0 += in[0] * quant[0];
    for (i = 0; i < 64; i++)
        out[i] = ITOINT(t0);
    return;
}
zig2p = zig2;
tmpp = tmp;
for (i = 0; i < 8; i++) {
    j = *zig2p++;
    t0 += in[j] * (long) quant[j];
    j = *zig2p++;
    t5 = in[j] * (long) quant[j];
    j = *zig2p++;
    t2 = in[j] * (long) quant[j];
    j = *zig2p++;
    t7 = in[j] * (long) quant[j];
    j = *zig2p++;
    t1 = in[j] * (long) quant[j];
    j = *zig2p++;
    t4 = in[j] * (long) quant[j];
    j = *zig2p++;
    t3 = in[j] * (long) quant[j];
    j = *zig2p++;
    t6 = in[j] * (long) quant[j];


    if ((t1 | t2 | t3 | t4 | t5 | t6 | t7) == 0) {

        tmpp[0 * 8] = t0;
        tmpp[1 * 8] = t0;
        tmpp[2 * 8] = t0;
        tmpp[3 * 8] = t0;
        tmpp[4 * 8] = t0;
        tmpp[5 * 8] = t0;
        tmpp[6 * 8] = t0;
        tmpp[7 * 8] = t0;

        tmpp++;
        t0 = 0;
        continue;
    }
    //IDCT;
    tmp0 = t0 + t1;
    t1 = t0 - t1;
    tmp2 = t2 - t3;
    t3 = t2 + t3;
    tmp2 = IMULT(tmp2, IC4) - t3;
    tmp3 = tmp0 + t3;
    t3 = tmp0 - t3;
    tmp1 = t1 + tmp2;
    tmp2 = t1 - tmp2;
    tmp4 = t4 - t7;
    t7 = t4 + t7;
    tmp5 = t5 + t6;
    t6 = t5 - t6;
    tmp6 = tmp5 - t7;
    t7 = tmp5 + t7;
    tmp5 = IMULT(tmp6, IC4);
    tmp6 = IMULT((tmp4 + t6), S22);
    tmp4 = IMULT(tmp4, (C22 - S22)) + tmp6;
    t6 = IMULT(t6, (C22 + S22)) - tmp6;
    t6 = t6 - t7;
    t5 = tmp5 - t6;
    t4 = tmp4 - t5;

    tmpp[0 * 8] = tmp3 + t7;	//t0;
    tmpp[1 * 8] = tmp1 + t6;	//t1;
    tmpp[2 * 8] = tmp2 + t5;	//t2;
    tmpp[3 * 8] = t3 + t4;	//t3;
    tmpp[4 * 8] = t3 - t4;	//t4;
    tmpp[5 * 8] = tmp2 - t5;	//t5;
    tmpp[6 * 8] = tmp1 - t6;	//t6;
    tmpp[7 * 8] = tmp3 - t7;	//t7;
    tmpp++;
    t0 = 0;
}
for (i = 0, j = 0; i < 8; i++) {
    t0 = tmp[j + 0];
    t1 = tmp[j + 1];
    t2 = tmp[j + 2];
    t3 = tmp[j + 3];
    t4 = tmp[j + 4];
    t5 = tmp[j + 5];
    t6 = tmp[j + 6];
    t7 = tmp[j + 7];
    if ((t1 | t2 | t3 | t4 | t5 | t6 | t7) == 0) {
        te = ITOINT(t0);
        out[j + 0] = te;
        out[j + 1] = te;
        out[j + 2] = te;
        out[j + 3] = te;
        out[j + 4] = te;
        out[j + 5] = te;
        out[j + 6] = te;
        out[j + 7] = te;
        j += 8;
        continue;
    }
    //IDCT;
    tmp0 = t0 + t1;
    t1 = t0 - t1;
    tmp2 = t2 - t3;
    t3 = t2 + t3;
    tmp2 = IMULT(tmp2, IC4) - t3;
    tmp3 = tmp0 + t3;
    t3 = tmp0 - t3;
    tmp1 = t1 + tmp2;
    tmp2 = t1 - tmp2;
    tmp4 = t4 - t7;
    t7 = t4 + t7;
    tmp5 = t5 + t6;
    t6 = t5 - t6;
    tmp6 = tmp5 - t7;
    t7 = tmp5 + t7;
    tmp5 = IMULT(tmp6, IC4);
    tmp6 = IMULT((tmp4 + t6), S22);
    tmp4 = IMULT(tmp4, (C22 - S22)) + tmp6;
    t6 = IMULT(t6, (C22 + S22)) - tmp6;
    t6 = t6 - t7;
    t5 = tmp5 - t6;
    t4 = tmp4 - t5;

    out[j + 0] = ITOINT(tmp3 + t7);
    out[j + 1] = ITOINT(tmp1 + t6);
    out[j + 2] = ITOINT(tmp2 + t5);
    out[j + 3] = ITOINT(t3 + t4);
    out[j + 4] = ITOINT(t3 - t4);
    out[j + 5] = ITOINT(tmp2 - t5);
    out[j + 6] = ITOINT(tmp1 - t6);
    out[j + 7] = ITOINT(tmp3 - t7);
    j += 8;
}


}

Please note that my goal FOR NOW is not to run the code efficiently on the GPU. I simply want to run it on my GPU (Tesla K20c) and get the same output which I am getting with the normal C code.

I am calling this kernel in a function called

jpeg_decode

which is located in the

utils.c file

. I have seen many OpenCL codes but I was uncertain about calling a kernel in a function. I got support from this link

https://www.fixstars.com/en/opencl/book/OpenCLProgrammingBook/fft-fast-fourier-transform/

and modified my

 jpeg_decode

function in the

utils.c

file in the following way:

jpeg_decode()

in

utils.c

:


int jpeg_decode(unsigned char **pic, unsigned char *buf, int *width, int *height)
{

 [b]  cl_int ret;
   cl_event kernelDone;[/b]


    struct jpeg_decdata *decdata;
    int i, j, m, tac, tdc;
    int intwidth, intheight;
    int mcusx, mcusy, mx, my;
    int ypitch ,xpitch,bpp,pitch,x,y;
    int mb;
    int max[6];
    ftopict convert;
    int err = 0;
    int isInitHuffman = 0;

    decdata = (struct jpeg_decdata *) malloc(sizeof(struct jpeg_decdata));

    //fixing= IFIX(a);
  [b] cl_kernel kernel;

   kernel = clCreateKernel(program, "idct", &ret);[/b]


        [b]size_t GWS[2];
            GWS[0]= 8;
            GWS[1] = 8;[/b]


    if (!decdata) {
	err = -1;
	goto error;
    }
    if (buf == NULL) {
	err = -1;
	goto error;
    }
    datap = buf;
    if (getbyte() != 0xff) {
	err = ERR_NO_SOI;
	goto error;
    }
    if (getbyte() != M_SOI) {
	err = ERR_NO_SOI;
	goto error;
    }
    if (readtables(M_SOF0, &isInitHuffman)) {
	err = ERR_BAD_TABLES;
	goto error;
    }
    getword();
    i = getbyte();
    if (i != 8) {
	err = ERR_NOT_8BIT;
	goto error;
    }
    intheight = getword();
    intwidth = getword();
    
    if ((intheight & 7) || (intwidth & 7)) {
	err = ERR_BAD_WIDTH_OR_HEIGHT;
	goto error;
    }
    info.nc = getbyte();
    if (info.nc > MAXCOMP) {
	err = ERR_TOO_MANY_COMPPS;
	goto error;
    }
    for (i = 0; i < info.nc; i++) {
	int h, v;
	comps[i].cid = getbyte();
	comps[i].hv = getbyte();
	v = comps[i].hv & 15;
	h = comps[i].hv >> 4;
	comps[i].tq = getbyte();
	if (h > 3 || v > 3) {
	   err = ERR_ILLEGAL_HV;
	   goto error;
	}
	if (comps[i].tq > 3) {
	   err = ERR_QUANT_TABLE_SELECTOR;
	   goto error;
	}
    }
    if (readtables(M_SOS,&isInitHuffman)) {
	err = ERR_BAD_TABLES;
	goto error;
    }
    getword();
    info.ns = getbyte();
    if (!info.ns){
    printf("info ns %d/n",info.ns);
	err = ERR_NOT_YCBCR_221111;
	goto error;
    }
    for (i = 0; i < info.ns; i++) {
	dscans[i].cid = getbyte();
	tdc = getbyte();
	tac = tdc & 15;
	tdc >>= 4;
	if (tdc > 1 || tac > 1) {
	   err = ERR_QUANT_TABLE_SELECTOR;
	   goto error;
	}
	for (j = 0; j < info.nc; j++)
	   if (comps[j].cid == dscans[i].cid)
		break;
	if (j == info.nc) {
	   err = ERR_UNKNOWN_CID_IN_SCAN;
	   goto error;
	}
	dscans[i].hv = comps[j].hv;
	dscans[i].tq = comps[j].tq;
	dscans[i].hudc.dhuff = dec_huffdc + tdc;
	dscans[i].huac.dhuff = dec_huffac + tac;
    }

    i = getbyte();
    j = getbyte();
    m = getbyte();

    if (i != 0 || j != 63 || m != 0) {
    	printf("hmm FW error,not seq DCT ??
");
    }
   // printf("ext huffman table %d 
",isInitHuffman);
    if(!isInitHuffman) {
    	if(huffman_init() < 0)
		return -ERR_BAD_TABLES;
	}
/*
    if (dscans[0].cid != 1 || dscans[1].cid != 2 || dscans[2].cid != 3) {
	err = ERR_NOT_YCBCR_221111;
	goto error;
    }

    if (dscans[1].hv != 0x11 || dscans[2].hv != 0x11) {
	err = ERR_NOT_YCBCR_221111;
	goto error;
    }
*/    
    /* if internal width and external are not the same or heigth too 
       and pic not allocated realloc the good size and mark the change 
       need 1 macroblock line more ?? */
    if (intwidth != *width || intheight != *height || *pic == NULL) {
	*width = intwidth;
	*height = intheight;
	// BytesperPixel 2 yuyv , 3 rgb24 
	*pic =
	   (unsigned char *) realloc((unsigned char *) *pic,
				     (size_t) intwidth * (intheight +
							  8) * 2);
    }


    switch (dscans[0].hv) {
    case 0x22: // 411
    	mb=6;
	mcusx = *width >> 4;
	mcusy = *height >> 4;
	bpp=2;
	xpitch = 16 * bpp;
	pitch = *width * bpp; // YUYV out
	ypitch = 16 * pitch;
	convert = yuv420pto422;	
	break;
    case 0x21: //422

    	mb=4;
	mcusx = *width >> 4;
	mcusy = *height >> 3;
	bpp=2;	
	xpitch = 16 * bpp;
	pitch = *width * bpp; // YUYV out
	ypitch = 8 * pitch;
    convert = yuv422pto422;
	break;
     printf("MY VALUES: %dx%d
",*width,*height);
    case 0x11: //444
	mcusx = *width >> 3;
	mcusy = *height >> 3;
	bpp=2;
	xpitch = 8 * bpp;
	pitch = *width * bpp; // YUYV out
	ypitch = 8 * pitch;
	if (info.ns==1) {
    		mb = 1;
		convert = yuv400pto422;
	} else {
		mb=3;
        convert = yuv422pto422;
	}
        break;
    default:
	err = ERR_NOT_YCBCR_221111;
	goto error;
	break;
    }

    idctqtab(quant[dscans[0].tq], decdata->dquant[0]);
    idctqtab(quant[dscans[1].tq], decdata->dquant[1]);
    idctqtab(quant[dscans[2].tq], decdata->dquant[2]);
    setinput(&in, datap);
    dec_initscans();

    dscans[0].next = 2;
    dscans[1].next = 1;
    dscans[2].next = 0;	/* 4xx encoding */
    for (my = 0,y=0; my < mcusy; my++,y+=ypitch) {
	for (mx = 0,x=0; mx < mcusx; mx++,x+=xpitch) {
	   if (info.dri && !--info.nm)
		if (dec_checkmarker()) {
		   err = ERR_WRONG_MARKER;
		   goto error;
		}
	switch (mb){
	   case 6: {
		decode_mcus(&in, decdata->dcts, mb, dscans, max);

        idct(decdata->dcts, decdata->out, decdata->dquant[0],
             IFIX(128.5), max[0]);
        idct(decdata->dcts + 64, decdata->out + 64,
             decdata->dquant[0], IFIX(128.5), max[1]);
        idct(decdata->dcts + 128, decdata->out + 128,
             decdata->dquant[0], IFIX(128.5), max[2]);
        idct(decdata->dcts + 192, decdata->out + 192,
             decdata->dquant[0], IFIX(128.5), max[3]);
        idct(decdata->dcts + 256, decdata->out + 256,
             decdata->dquant[1], IFIX(0.5), max[4]);
        idct(decdata->dcts + 320, decdata->out + 320,
             decdata->dquant[2], IFIX(0.5), max[5]);
    }
    break;

	   case 4:
	   {
        decode_mcus(&in, decdata->dcts, mb, dscans, max);

[b]
        ret = clSetKernelArg(kernel, 0,  sizeof(cl_int), (void*)&decdata->dcts);
      ret = clSetKernelArg(kernel, 1,  sizeof(cl_int), (void*)&decdata->out);
      ret = clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&decdata->dquant[0]);
      ret = clSetKernelArg(kernel, 3, sizeof(cl_long), IFIX(128.5));
       ret = clSetKernelArg(kernel, 4, sizeof(int), &max[0]);
          ret = clEnqueueNDRangeKernel(command_queue, kernel, 2, GWS, NULL,  NULL, 0, NULL, NULL);
//ret= clWaitForEvents(1, &kernelDone);
//
       ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*)&decdata->dcts + 64);
        ret = clSetKernelArg(kernel, 1,  sizeof(cl_int), (void*)&decdata->out + 64);
        ret = clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&decdata->dquant[0]);
        ret = clSetKernelArg(kernel, 3, sizeof(cl_long), IFIX(128.5));
  ret = clSetKernelArg(kernel, 4, sizeof(int), &max[1]);
          ret = clEnqueueNDRangeKernel(command_queue, kernel, 2, GWS, NULL, NULL, 0, NULL, NULL);
//ret= clWaitForEvents(2, &kernelDone);
//
        ret = clSetKernelArg(kernel, 0,  sizeof(cl_int), (void*)&decdata->dcts + 128);
        ret = clSetKernelArg(kernel, 1,  sizeof(cl_int), (void*)&decdata->out + 256);
        ret = clSetKernelArg(kernel, 2,  sizeof(cl_int), (void*)&decdata->dquant[1]);
        ret = clSetKernelArg(kernel, 3, sizeof(cl_long), IFIX(0.5));
        ret = clSetKernelArg(kernel, 4, sizeof(int), &max[4]);
        ret = clEnqueueNDRangeKernel(command_queue, kernel, 2, GWS, NULL,  NULL, 0, NULL, NULL);
//ret= clWaitForEvents(3, &kernelDone);
//
        ret = clSetKernelArg(kernel, 0,  sizeof(cl_int), (void*)&decdata->dcts + 192);
        ret = clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&decdata->out + 320);
        ret = clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&decdata->dquant[2]);
        ret = clSetKernelArg(kernel, 3, sizeof(cl_long), IFIX(0.5));
        ret = clSetKernelArg(kernel, 4, sizeof(int), &max[5]);
        ret = clEnqueueNDRangeKernel(command_queue, kernel, 2, GWS, NULL,  NULL, 0, NULL, NULL);
       //ret= clWaitForEvents(4, &kernelDone);
//
//        ret = clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL, GWS, NULL, 0, NULL, NULL);
//
//                ret = clEnqueueReadBuffer(command_queue, clOutput, CL_TRUE, 0, 8 * 8 * sizeof(int), decdata->out, 0, NULL, NULL);
//                clFinish(command_queue);
[/b]
//
//        idct(decdata->dcts, decdata->out, decdata->dquant[0],
//             IFIX(128.5), max[0]);
//        idct(decdata->dcts + 64, decdata->out + 64,
//             decdata->dquant[0], IFIX(128.5), max[1]);
//        idct(decdata->dcts + 128, decdata->out + 256,
//             decdata->dquant[1], IFIX(0.5), max[4]);
//        idct(decdata->dcts + 192, decdata->out + 320,
//             decdata->dquant[2], IFIX(0.5), max[5]);

	   }
	   break;
	   case 3:
	   	decode_mcus(&in, decdata->dcts, mb, dscans, max);

//             ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void *)&decdata->dcts);
//             ret = clSetKernelArg(kernel, 1, sizeof(cl_int), (void *)&decdata->out);
//             ret = clSetKernelArg(kernel, 2, sizeof(cl_int), (void *)&decdata->dquant[0]);
//             ret = clSetKernelArg(kernel, 3, sizeof(cl_long), (void*)IFIX(128.5));
//             ret = clSetKernelArg(kernel, 4, sizeof(cl_int), (void *)&max[0]);
//
//
//             ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void *)&decdata->dcts + 64);
//             ret = clSetKernelArg(kernel, 1, sizeof(cl_int), (void *)&decdata->out + 256);
//             ret = clSetKernelArg(kernel, 2, sizeof(cl_int), (void *)&decdata->dquant[1]);
//            ret = clSetKernelArg(kernel, 3, sizeof(cl_long), (void*)IFIX(0.5));
//             ret = clSetKernelArg(kernel, 4, sizeof(cl_int), (void *)&max[4]);
//
//             ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void *)&decdata->dcts + 128);
//             ret = clSetKernelArg(kernel, 1, sizeof(cl_int), (void *)&decdata->out + 320);
//             ret = clSetKernelArg(kernel, 2, sizeof(cl_int), (void *)&decdata->dquant[2]);
//             ret = clSetKernelArg(kernel, 3, sizeof(cl_long), (void*)IFIX(0.5));
//             ret = clSetKernelArg(kernel, 4, sizeof(cl_int), (void *)&max[5]);

//        idct(decdata->dcts, decdata->out, decdata->dquant[0],
//             IFIX(128.5), max[0]);
//        idct(decdata->dcts + 64, decdata->out + 256,
//             decdata->dquant[1], IFIX(0.5), max[4]);
//        idct(decdata->dcts + 128, decdata->out + 320,
//             decdata->dquant[2], IFIX(0.5), max[5]);


	   break;
	   case 1:
	   	decode_mcus(&in, decdata->dcts, mb, dscans, max);

//             ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void *)&decdata->dcts);
//             ret = clSetKernelArg(kernel, 1, sizeof(cl_int), (void *)&decdata->out);
//             ret = clSetKernelArg(kernel, 2, sizeof(cl_int), (void *)&decdata->dquant[0]);
//            ret = clSetKernelArg(kernel, 3, sizeof(cl_long), (void*)IFIX(128.5));
//             ret = clSetKernelArg(kernel, 4, sizeof(cl_int), (void *)&max[0]);


//        idct(decdata->dcts, decdata->out, decdata->dquant[0],
//             IFIX(128.5), max[0]);

	   break;

	} // switch enc411

[b]ret= clReleaseKernel(kernel);[/b]

	convert(decdata->out,*pic+y+x,pitch); 
	}
    }

    m = dec_readmarker(&in);
    if (m != M_EOI) {
	err = ERR_NO_EOI;
	goto error;
    }
    if (decdata)
	free(decdata);
    return 0;
  error:
    if (decdata)
	free(decdata);
    return err;



}

I wrote the host code in the main file called

test.c 

. It is as follows:


int main()
{
FILE *fr = fopen ("/root/Documents/lucmotion/mjpeg-master/test.mjpeg", "rb");
    unsigned char *pic=NULL;

    unsigned char *buf=malloc(63247);
    fread(buf, 1, 63247, fr);
    int width=0;
    int height=0;

    int out;
    cl_platform_id platform_id;
    cl_uint ret_num_platforms;
    cl_device_id device_id;
    cl_uint ret_num_devices;
    cl_context context;
    cl_command_queue command_queue;
    cl_program program;
    size_t kernel_code_size;
    char *kernel_src_str;
    int *result;
    cl_int ret;
    int storeResult;
   FILE *fp;

    /* Get Platform */
    ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);

    /* Get Device */
    ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id,
        &ret_num_devices);

    /* Create Context */
    context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);

    /* Create Command Queue */
    command_queue = clCreateCommandQueue(context, device_id, 0, &ret);

    /* Read Kernel Code */
    fp = fopen("/root/Documents/lucmotion/mjpeg-master/jpegDecode.cl", "r");
    kernel_code_size = fread(kernel_src_str, 1, MAX_SOURCE_SIZE, fp);
    fclose(fp);
//
    /* Create Program Object */
    program = clCreateProgramWithSource(context, 1, (const char **)&kernel_src_str,
        (const size_t *)&kernel_code_size, &ret);
    /* Compile kernel */
    ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);

//working with idct


    storeResult = jpeg_decode(&pic, buf, &width, &height);
      //printf("ret %d
", ret);
      printf("ptr %ld
", (long) pic);
      printf("height %d
", height);
      printf("width %d
", width);


      ret = clFlush(command_queue);
      ret = clFinish(command_queue);
    ret = clReleaseProgram(program);
    ret = clReleaseCommandQueue(command_queue);
    ret = clReleaseContext(context);
	
	//Write greyscale (Y) information as PPM image
	if (pic != NULL)
	{
        WritePPMImage("HalfImage.ppm", pic, height, width);
        printf("Written nWimage.ppm
");
	}

	return 0;
}

I would greatly [COLOR=“#000000”]blogchiem.netappreciate it if someone can tell me what I am doing wrong with my code. [/COLOR]

Waiting eagerly for your response.[/QUOTE]

Let’s just show me how to compress JPEG images in the best way possible?