Page 1 of 2 12 LastLast
Results 1 to 10 of 13

Thread: Incorrect output for JPEG decoding with OpenCL on Eclipse (RHEL 6.6)

  1. #1
    Junior Member
    Join Date
    May 2017
    Posts
    7

    Incorrect output for JPEG decoding with OpenCL on Eclipse (RHEL 6.6)

    Hello.
    I am new to OpenCL programming and am unable to figure out where I am going wrong with my code.

    I am trying to translate a portion of the code in this link:
    HTML Code:
    https://github.com/TimSC/mjpeg
    to OpenCL but am not getting the expected output. The expected output is a .ppm image which I have attached to my query (test.ppm). I cannot attach a ppm file to this question at the moment.

    I want to get identical output with my OpenCL code. I picked one function in the
    Code :
    utils.c
    file and put it AS IT IS in a .cl file which I called
    Code :
    jpeg.cl
    :

    Code :
    jpeg.cl

    Code :
    __kernel inline static void idct(__global int *in, __global int *out, __global int *quant, __global long off, int max)
    {
     
    unsigned int gid= get_global_id(0);
    long t0, t1, t2, t3, t4, t5, t6, t7;	// t ;
    long tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
    long tmp[64], *tmpp;
    int i, j, te;
    unsigned char *zig2p;
     
    t0 = off;
    if (max == 1) {
        t0 += in[0] * quant[0];
        for (i = 0; i < 64; i++)
            out[i] = ITOINT(t0);
        return;
    }
    zig2p = zig2;
    tmpp = tmp;
    for (i = 0; i < 8; i++) {
        j = *zig2p++;
        t0 += in[j] * (long) quant[j];
        j = *zig2p++;
        t5 = in[j] * (long) quant[j];
        j = *zig2p++;
        t2 = in[j] * (long) quant[j];
        j = *zig2p++;
        t7 = in[j] * (long) quant[j];
        j = *zig2p++;
        t1 = in[j] * (long) quant[j];
        j = *zig2p++;
        t4 = in[j] * (long) quant[j];
        j = *zig2p++;
        t3 = in[j] * (long) quant[j];
        j = *zig2p++;
        t6 = in[j] * (long) quant[j];
     
     
        if ((t1 | t2 | t3 | t4 | t5 | t6 | t7) == 0) {
     
            tmpp[0 * 8] = t0;
            tmpp[1 * 8] = t0;
            tmpp[2 * 8] = t0;
            tmpp[3 * 8] = t0;
            tmpp[4 * 8] = t0;
            tmpp[5 * 8] = t0;
            tmpp[6 * 8] = t0;
            tmpp[7 * 8] = t0;
     
            tmpp++;
            t0 = 0;
            continue;
        }
        //IDCT;
        tmp0 = t0 + t1;
        t1 = t0 - t1;
        tmp2 = t2 - t3;
        t3 = t2 + t3;
        tmp2 = IMULT(tmp2, IC4) - t3;
        tmp3 = tmp0 + t3;
        t3 = tmp0 - t3;
        tmp1 = t1 + tmp2;
        tmp2 = t1 - tmp2;
        tmp4 = t4 - t7;
        t7 = t4 + t7;
        tmp5 = t5 + t6;
        t6 = t5 - t6;
        tmp6 = tmp5 - t7;
        t7 = tmp5 + t7;
        tmp5 = IMULT(tmp6, IC4);
        tmp6 = IMULT((tmp4 + t6), S22);
        tmp4 = IMULT(tmp4, (C22 - S22)) + tmp6;
        t6 = IMULT(t6, (C22 + S22)) - tmp6;
        t6 = t6 - t7;
        t5 = tmp5 - t6;
        t4 = tmp4 - t5;
     
        tmpp[0 * 8] = tmp3 + t7;	//t0;
        tmpp[1 * 8] = tmp1 + t6;	//t1;
        tmpp[2 * 8] = tmp2 + t5;	//t2;
        tmpp[3 * 8] = t3 + t4;	//t3;
        tmpp[4 * 8] = t3 - t4;	//t4;
        tmpp[5 * 8] = tmp2 - t5;	//t5;
        tmpp[6 * 8] = tmp1 - t6;	//t6;
        tmpp[7 * 8] = tmp3 - t7;	//t7;
        tmpp++;
        t0 = 0;
    }
    for (i = 0, j = 0; i < 8; i++) {
        t0 = tmp[j + 0];
        t1 = tmp[j + 1];
        t2 = tmp[j + 2];
        t3 = tmp[j + 3];
        t4 = tmp[j + 4];
        t5 = tmp[j + 5];
        t6 = tmp[j + 6];
        t7 = tmp[j + 7];
        if ((t1 | t2 | t3 | t4 | t5 | t6 | t7) == 0) {
            te = ITOINT(t0);
            out[j + 0] = te;
            out[j + 1] = te;
            out[j + 2] = te;
            out[j + 3] = te;
            out[j + 4] = te;
            out[j + 5] = te;
            out[j + 6] = te;
            out[j + 7] = te;
            j += 8;
            continue;
        }
        //IDCT;
        tmp0 = t0 + t1;
        t1 = t0 - t1;
        tmp2 = t2 - t3;
        t3 = t2 + t3;
        tmp2 = IMULT(tmp2, IC4) - t3;
        tmp3 = tmp0 + t3;
        t3 = tmp0 - t3;
        tmp1 = t1 + tmp2;
        tmp2 = t1 - tmp2;
        tmp4 = t4 - t7;
        t7 = t4 + t7;
        tmp5 = t5 + t6;
        t6 = t5 - t6;
        tmp6 = tmp5 - t7;
        t7 = tmp5 + t7;
        tmp5 = IMULT(tmp6, IC4);
        tmp6 = IMULT((tmp4 + t6), S22);
        tmp4 = IMULT(tmp4, (C22 - S22)) + tmp6;
        t6 = IMULT(t6, (C22 + S22)) - tmp6;
        t6 = t6 - t7;
        t5 = tmp5 - t6;
        t4 = tmp4 - t5;
     
        out[j + 0] = ITOINT(tmp3 + t7);
        out[j + 1] = ITOINT(tmp1 + t6);
        out[j + 2] = ITOINT(tmp2 + t5);
        out[j + 3] = ITOINT(t3 + t4);
        out[j + 4] = ITOINT(t3 - t4);
        out[j + 5] = ITOINT(tmp2 - t5);
        out[j + 6] = ITOINT(tmp1 - t6);
        out[j + 7] = ITOINT(tmp3 - t7);
        j += 8;
    }
     
     
    }

    Please note that my goal FOR NOW is not to run the code efficiently on the GPU. I simply want to run it on my GPU (Tesla K20c) and get the same output which I am getting with the normal C code.

    I am calling this kernel in a function called
    Code :
    jpeg_decode
    which is located in the
    Code :
    utils.c file
    . I have seen many OpenCL codes but I was uncertain about calling a kernel in a function. I got support from this link
    Code :
    https://www.fixstars.com/en/opencl/book/OpenCLProgrammingBook/fft-fast-fourier-transform/
    and modified my
    Code :
     jpeg_decode
    function in the
    Code :
    utils.c
    file in the following way:

    Code :
    jpeg_decode()
    in
    Code :
    utils.c
    :

    Code :
    int jpeg_decode(unsigned char **pic, unsigned char *buf, int *width, int *height)
    {
     
     [B]  cl_int ret;
       cl_event kernelDone;[/B]
     
     
        struct jpeg_decdata *decdata;
        int i, j, m, tac, tdc;
        int intwidth, intheight;
        int mcusx, mcusy, mx, my;
        int ypitch ,xpitch,bpp,pitch,x,y;
        int mb;
        int max[6];
        ftopict convert;
        int err = 0;
        int isInitHuffman = 0;
     
        decdata = (struct jpeg_decdata *) malloc(sizeof(struct jpeg_decdata));
     
        //fixing= IFIX(a);
      [B] cl_kernel kernel;
     
       kernel = clCreateKernel(program, "idct", &ret);[/B]
     
     
            [B]size_t GWS[2];
                GWS[0]= 8;
                GWS[1] = 8;[/B]
     
     
        if (!decdata) {
    	err = -1;
    	goto error;
        }
        if (buf == NULL) {
    	err = -1;
    	goto error;
        }
        datap = buf;
        if (getbyte() != 0xff) {
    	err = ERR_NO_SOI;
    	goto error;
        }
        if (getbyte() != M_SOI) {
    	err = ERR_NO_SOI;
    	goto error;
        }
        if (readtables(M_SOF0, &isInitHuffman)) {
    	err = ERR_BAD_TABLES;
    	goto error;
        }
        getword();
        i = getbyte();
        if (i != 8) {
    	err = ERR_NOT_8BIT;
    	goto error;
        }
        intheight = getword();
        intwidth = getword();
     
        if ((intheight & 7) || (intwidth & 7)) {
    	err = ERR_BAD_WIDTH_OR_HEIGHT;
    	goto error;
        }
        info.nc = getbyte();
        if (info.nc > MAXCOMP) {
    	err = ERR_TOO_MANY_COMPPS;
    	goto error;
        }
        for (i = 0; i < info.nc; i++) {
    	int h, v;
    	comps[i].cid = getbyte();
    	comps[i].hv = getbyte();
    	v = comps[i].hv & 15;
    	h = comps[i].hv >> 4;
    	comps[i].tq = getbyte();
    	if (h > 3 || v > 3) {
    	   err = ERR_ILLEGAL_HV;
    	   goto error;
    	}
    	if (comps[i].tq > 3) {
    	   err = ERR_QUANT_TABLE_SELECTOR;
    	   goto error;
    	}
        }
        if (readtables(M_SOS,&isInitHuffman)) {
    	err = ERR_BAD_TABLES;
    	goto error;
        }
        getword();
        info.ns = getbyte();
        if (!info.ns){
        printf("info ns %d/n",info.ns);
    	err = ERR_NOT_YCBCR_221111;
    	goto error;
        }
        for (i = 0; i < info.ns; i++) {
    	dscans[i].cid = getbyte();
    	tdc = getbyte();
    	tac = tdc & 15;
    	tdc >>= 4;
    	if (tdc > 1 || tac > 1) {
    	   err = ERR_QUANT_TABLE_SELECTOR;
    	   goto error;
    	}
    	for (j = 0; j < info.nc; j++)
    	   if (comps[j].cid == dscans[i].cid)
    		break;
    	if (j == info.nc) {
    	   err = ERR_UNKNOWN_CID_IN_SCAN;
    	   goto error;
    	}
    	dscans[i].hv = comps[j].hv;
    	dscans[i].tq = comps[j].tq;
    	dscans[i].hudc.dhuff = dec_huffdc + tdc;
    	dscans[i].huac.dhuff = dec_huffac + tac;
        }
     
        i = getbyte();
        j = getbyte();
        m = getbyte();
     
        if (i != 0 || j != 63 || m != 0) {
        	printf("hmm FW error,not seq DCT ??\n");
        }
       // printf("ext huffman table %d \n",isInitHuffman);
        if(!isInitHuffman) {
        	if(huffman_init() < 0)
    		return -ERR_BAD_TABLES;
    	}
    /*
        if (dscans[0].cid != 1 || dscans[1].cid != 2 || dscans[2].cid != 3) {
    	err = ERR_NOT_YCBCR_221111;
    	goto error;
        }
     
        if (dscans[1].hv != 0x11 || dscans[2].hv != 0x11) {
    	err = ERR_NOT_YCBCR_221111;
    	goto error;
        }
    */    
        /* if internal width and external are not the same or heigth too 
           and pic not allocated realloc the good size and mark the change 
           need 1 macroblock line more ?? */
        if (intwidth != *width || intheight != *height || *pic == NULL) {
    	*width = intwidth;
    	*height = intheight;
    	// BytesperPixel 2 yuyv , 3 rgb24 
    	*pic =
    	   (unsigned char *) realloc((unsigned char *) *pic,
    				     (size_t) intwidth * (intheight +
    							  8) * 2);
        }
     
     
        switch (dscans[0].hv) {
        case 0x22: // 411
        	mb=6;
    	mcusx = *width >> 4;
    	mcusy = *height >> 4;
    	bpp=2;
    	xpitch = 16 * bpp;
    	pitch = *width * bpp; // YUYV out
    	ypitch = 16 * pitch;
    	convert = yuv420pto422;	
    	break;
        case 0x21: //422
     
        	mb=4;
    	mcusx = *width >> 4;
    	mcusy = *height >> 3;
    	bpp=2;	
    	xpitch = 16 * bpp;
    	pitch = *width * bpp; // YUYV out
    	ypitch = 8 * pitch;
        convert = yuv422pto422;
    	break;
         printf("MY VALUES: %dx%d\n",*width,*height);
        case 0x11: //444
    	mcusx = *width >> 3;
    	mcusy = *height >> 3;
    	bpp=2;
    	xpitch = 8 * bpp;
    	pitch = *width * bpp; // YUYV out
    	ypitch = 8 * pitch;
    	if (info.ns==1) {
        		mb = 1;
    		convert = yuv400pto422;
    	} else {
    		mb=3;
            convert = yuv422pto422;
    	}
            break;
        default:
    	err = ERR_NOT_YCBCR_221111;
    	goto error;
    	break;
        }
     
        idctqtab(quant[dscans[0].tq], decdata->dquant[0]);
        idctqtab(quant[dscans[1].tq], decdata->dquant[1]);
        idctqtab(quant[dscans[2].tq], decdata->dquant[2]);
        setinput(&in, datap);
        dec_initscans();
     
        dscans[0].next = 2;
        dscans[1].next = 1;
        dscans[2].next = 0;	/* 4xx encoding */
        for (my = 0,y=0; my < mcusy; my++,y+=ypitch) {
    	for (mx = 0,x=0; mx < mcusx; mx++,x+=xpitch) {
    	   if (info.dri && !--info.nm)
    		if (dec_checkmarker()) {
    		   err = ERR_WRONG_MARKER;
    		   goto error;
    		}
    	switch (mb){
    	   case 6: {
    		decode_mcus(&in, decdata->dcts, mb, dscans, max);
     
            idct(decdata->dcts, decdata->out, decdata->dquant[0],
                 IFIX(128.5), max[0]);
            idct(decdata->dcts + 64, decdata->out + 64,
                 decdata->dquant[0], IFIX(128.5), max[1]);
            idct(decdata->dcts + 128, decdata->out + 128,
                 decdata->dquant[0], IFIX(128.5), max[2]);
            idct(decdata->dcts + 192, decdata->out + 192,
                 decdata->dquant[0], IFIX(128.5), max[3]);
            idct(decdata->dcts + 256, decdata->out + 256,
                 decdata->dquant[1], IFIX(0.5), max[4]);
            idct(decdata->dcts + 320, decdata->out + 320,
                 decdata->dquant[2], IFIX(0.5), max[5]);
        }
        break;
     
    	   case 4:
    	   {
            decode_mcus(&in, decdata->dcts, mb, dscans, max);
     
    [B]
            ret = clSetKernelArg(kernel, 0,  sizeof(cl_int), (void*)&decdata->dcts);
          ret = clSetKernelArg(kernel, 1,  sizeof(cl_int), (void*)&decdata->out);
          ret = clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&decdata->dquant[0]);
          ret = clSetKernelArg(kernel, 3, sizeof(cl_long), IFIX(128.5));
           ret = clSetKernelArg(kernel, 4, sizeof(int), &max[0]);
              ret = clEnqueueNDRangeKernel(command_queue, kernel, 2, GWS, NULL,  NULL, 0, NULL, NULL);
    //ret= clWaitForEvents(1, &kernelDone);
    //
           ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*)&decdata->dcts + 64);
            ret = clSetKernelArg(kernel, 1,  sizeof(cl_int), (void*)&decdata->out + 64);
            ret = clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&decdata->dquant[0]);
            ret = clSetKernelArg(kernel, 3, sizeof(cl_long), IFIX(128.5));
      ret = clSetKernelArg(kernel, 4, sizeof(int), &max[1]);
              ret = clEnqueueNDRangeKernel(command_queue, kernel, 2, GWS, NULL, NULL, 0, NULL, NULL);
    //ret= clWaitForEvents(2, &kernelDone);
    //
            ret = clSetKernelArg(kernel, 0,  sizeof(cl_int), (void*)&decdata->dcts + 128);
            ret = clSetKernelArg(kernel, 1,  sizeof(cl_int), (void*)&decdata->out + 256);
            ret = clSetKernelArg(kernel, 2,  sizeof(cl_int), (void*)&decdata->dquant[1]);
            ret = clSetKernelArg(kernel, 3, sizeof(cl_long), IFIX(0.5));
            ret = clSetKernelArg(kernel, 4, sizeof(int), &max[4]);
            ret = clEnqueueNDRangeKernel(command_queue, kernel, 2, GWS, NULL,  NULL, 0, NULL, NULL);
    //ret= clWaitForEvents(3, &kernelDone);
    //
            ret = clSetKernelArg(kernel, 0,  sizeof(cl_int), (void*)&decdata->dcts + 192);
            ret = clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&decdata->out + 320);
            ret = clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&decdata->dquant[2]);
            ret = clSetKernelArg(kernel, 3, sizeof(cl_long), IFIX(0.5));
            ret = clSetKernelArg(kernel, 4, sizeof(int), &max[5]);
            ret = clEnqueueNDRangeKernel(command_queue, kernel, 2, GWS, NULL,  NULL, 0, NULL, NULL);
           //ret= clWaitForEvents(4, &kernelDone);
    //
    //        ret = clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL, GWS, NULL, 0, NULL, NULL);
    //
    //                ret = clEnqueueReadBuffer(command_queue, clOutput, CL_TRUE, 0, 8 * 8 * sizeof(int), decdata->out, 0, NULL, NULL);
    //                clFinish(command_queue);
    [/B]
    //
    //        idct(decdata->dcts, decdata->out, decdata->dquant[0],
    //             IFIX(128.5), max[0]);
    //        idct(decdata->dcts + 64, decdata->out + 64,
    //             decdata->dquant[0], IFIX(128.5), max[1]);
    //        idct(decdata->dcts + 128, decdata->out + 256,
    //             decdata->dquant[1], IFIX(0.5), max[4]);
    //        idct(decdata->dcts + 192, decdata->out + 320,
    //             decdata->dquant[2], IFIX(0.5), max[5]);
     
    	   }
    	   break;
    	   case 3:
    	   	decode_mcus(&in, decdata->dcts, mb, dscans, max);
     
    //             ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void *)&decdata->dcts);
    //             ret = clSetKernelArg(kernel, 1, sizeof(cl_int), (void *)&decdata->out);
    //             ret = clSetKernelArg(kernel, 2, sizeof(cl_int), (void *)&decdata->dquant[0]);
    //             ret = clSetKernelArg(kernel, 3, sizeof(cl_long), (void*)IFIX(128.5));
    //             ret = clSetKernelArg(kernel, 4, sizeof(cl_int), (void *)&max[0]);
    //
    //
    //             ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void *)&decdata->dcts + 64);
    //             ret = clSetKernelArg(kernel, 1, sizeof(cl_int), (void *)&decdata->out + 256);
    //             ret = clSetKernelArg(kernel, 2, sizeof(cl_int), (void *)&decdata->dquant[1]);
    //            ret = clSetKernelArg(kernel, 3, sizeof(cl_long), (void*)IFIX(0.5));
    //             ret = clSetKernelArg(kernel, 4, sizeof(cl_int), (void *)&max[4]);
    //
    //             ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void *)&decdata->dcts + 128);
    //             ret = clSetKernelArg(kernel, 1, sizeof(cl_int), (void *)&decdata->out + 320);
    //             ret = clSetKernelArg(kernel, 2, sizeof(cl_int), (void *)&decdata->dquant[2]);
    //             ret = clSetKernelArg(kernel, 3, sizeof(cl_long), (void*)IFIX(0.5));
    //             ret = clSetKernelArg(kernel, 4, sizeof(cl_int), (void *)&max[5]);
     
    //        idct(decdata->dcts, decdata->out, decdata->dquant[0],
    //             IFIX(128.5), max[0]);
    //        idct(decdata->dcts + 64, decdata->out + 256,
    //             decdata->dquant[1], IFIX(0.5), max[4]);
    //        idct(decdata->dcts + 128, decdata->out + 320,
    //             decdata->dquant[2], IFIX(0.5), max[5]);
     
     
    	   break;
    	   case 1:
    	   	decode_mcus(&in, decdata->dcts, mb, dscans, max);
     
    //             ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void *)&decdata->dcts);
    //             ret = clSetKernelArg(kernel, 1, sizeof(cl_int), (void *)&decdata->out);
    //             ret = clSetKernelArg(kernel, 2, sizeof(cl_int), (void *)&decdata->dquant[0]);
    //            ret = clSetKernelArg(kernel, 3, sizeof(cl_long), (void*)IFIX(128.5));
    //             ret = clSetKernelArg(kernel, 4, sizeof(cl_int), (void *)&max[0]);
     
     
    //        idct(decdata->dcts, decdata->out, decdata->dquant[0],
    //             IFIX(128.5), max[0]);
     
    	   break;
     
    	} // switch enc411
     
    [B]ret= clReleaseKernel(kernel);[/B]
     
    	convert(decdata->out,*pic+y+x,pitch); 
    	}
        }
     
        m = dec_readmarker(&in);
        if (m != M_EOI) {
    	err = ERR_NO_EOI;
    	goto error;
        }
        if (decdata)
    	free(decdata);
        return 0;
      error:
        if (decdata)
    	free(decdata);
        return err;
     
     
     
    }

    I wrote the host code in the main file called
    Code :
    test.c
    . It is as follows:

    Code :
    int main()
    {
    FILE *fr = fopen ("/root/Documents/lucmotion/mjpeg-master/test.mjpeg", "rb");
        unsigned char *pic=NULL;
     
        unsigned char *buf=malloc(63247);
        fread(buf, 1, 63247, fr);
        int width=0;
        int height=0;
     
        int out;
        cl_platform_id platform_id;
        cl_uint ret_num_platforms;
        cl_device_id device_id;
        cl_uint ret_num_devices;
        cl_context context;
        cl_command_queue command_queue;
        cl_program program;
        size_t kernel_code_size;
        char *kernel_src_str;
        int *result;
        cl_int ret;
        int storeResult;
       FILE *fp;
     
        /* Get Platform */
        ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
     
        /* Get Device */
        ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id,
            &ret_num_devices);
     
        /* Create Context */
        context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
     
        /* Create Command Queue */
        command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
     
        /* Read Kernel Code */
        fp = fopen("/root/Documents/lucmotion/mjpeg-master/jpegDecode.cl", "r");
        kernel_code_size = fread(kernel_src_str, 1, MAX_SOURCE_SIZE, fp);
        fclose(fp);
    //
        /* Create Program Object */
        program = clCreateProgramWithSource(context, 1, (const char **)&kernel_src_str,
            (const size_t *)&kernel_code_size, &ret);
        /* Compile kernel */
        ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
     
    //working with idct
     
     
        storeResult = jpeg_decode(&pic, buf, &width, &height);
          //printf("ret %d\n", ret);
          printf("ptr %ld\n", (long) pic);
          printf("height %d\n", height);
          printf("width %d\n", width);
     
     
          ret = clFlush(command_queue);
          ret = clFinish(command_queue);
        ret = clReleaseProgram(program);
        ret = clReleaseCommandQueue(command_queue);
        ret = clReleaseContext(context);
     
    	//Write greyscale (Y) information as PPM image
    	if (pic != NULL)
    	{
            WritePPMImage("HalfImage.ppm", pic, height, width);
            printf("Written nWimage.ppm\n");
    	}
     
    	return 0;
    }

    I would greatly appreciate it if someone can tell me what I am doing wrong with my code.

    Waiting eagerly for your response.

  2. #2
    Junior Member
    Join Date
    May 2017
    Posts
    7
    The forum is not allowing me to edit my original post. Hence, I have posted additional details in this reply.

    The images the codes have generated are in the following links:

    (This link contains the ppm image I get by running the original github code)

    https://www.flickr.com/photos/501757...n/photostream/

    (This link contains the image I get by running the OpenCL code)
    https://www.flickr.com/photos/50175764@N06/34540668762/

    Both of the images are originally
    .ppm
    images. Since I cannot directly attach ppm images to my post (and I somehow can't attach jpg images to my posts either), I have only attached links to my output images.

    I am also attaching a concise version of my jpeg_decode function:

    Code :
    int jpeg_decode(unsigned char **pic, unsigned char *buf, int *width, int *height)
    {
     
     cl_int ret;
       cl_event kernelDone;
     
       cl_kernel kernel;
     
       kernel = clCreateKernel(program, "idct", &ret);
     
     
            size_t GWS[2];
                GWS[0]= 8;
                GWS[1] = 8;
     
     
     
    	switch (mb){
     
    	   case 4:
    	   {
            decode_mcus(&in, decdata->dcts, mb, dscans, max);
     
            ret = clSetKernelArg(kernel, 0,  sizeof(cl_int), (void*)&decdata->dcts);
          ret = clSetKernelArg(kernel, 1,  sizeof(cl_int), (void*)&decdata->out);
          ret = clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&decdata->dquant[0]);
          ret = clSetKernelArg(kernel, 3, sizeof(cl_long), IFIX(128.5));
           ret = clSetKernelArg(kernel, 4, sizeof(int), &max[0]);
              ret = clEnqueueNDRangeKernel(command_queue, kernel, 2, GWS, NULL,  NULL, 0, NULL, NULL);
    //ret= clWaitForEvents(1, &kernelDone);
    //
           ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*)&decdata->dcts + 64);
            ret = clSetKernelArg(kernel, 1,  sizeof(cl_int), (void*)&decdata->out + 64);
            ret = clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&decdata->dquant[0]);
            ret = clSetKernelArg(kernel, 3, sizeof(cl_long), IFIX(128.5));
      ret = clSetKernelArg(kernel, 4, sizeof(int), &max[1]);
              ret = clEnqueueNDRangeKernel(command_queue, kernel, 2, GWS, NULL, NULL, 0, NULL, NULL);
    //ret= clWaitForEvents(2, &kernelDone);
    //
            ret = clSetKernelArg(kernel, 0,  sizeof(cl_int), (void*)&decdata->dcts + 128);
            ret = clSetKernelArg(kernel, 1,  sizeof(cl_int), (void*)&decdata->out + 256);
            ret = clSetKernelArg(kernel, 2,  sizeof(cl_int), (void*)&decdata->dquant[1]);
            ret = clSetKernelArg(kernel, 3, sizeof(cl_long), IFIX(0.5));
            ret = clSetKernelArg(kernel, 4, sizeof(int), &max[4]);
            ret = clEnqueueNDRangeKernel(command_queue, kernel, 2, GWS, NULL,  NULL, 0, NULL, NULL);
    //ret= clWaitForEvents(3, &kernelDone);
    //
            ret = clSetKernelArg(kernel, 0,  sizeof(cl_int), (void*)&decdata->dcts + 192);
            ret = clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&decdata->out + 320);
            ret = clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&decdata->dquant[2]);
            ret = clSetKernelArg(kernel, 3, sizeof(cl_long), IFIX(0.5));
            ret = clSetKernelArg(kernel, 4, sizeof(int), &max[5]);
            ret = clEnqueueNDRangeKernel(command_queue, kernel, 2, GWS, NULL,  NULL, 0, NULL, NULL);
           //ret= clWaitForEvents(4, &kernelDone);
    //
    //        ret = clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL, GWS, NULL, 0, NULL, NULL);
    //
    //                ret = clEnqueueReadBuffer(command_queue, clOutput, CL_TRUE, 0, 8 * 8 * sizeof(int), decdata->out, 0, NULL, NULL);
    //                clFinish(command_queue);
     
     
    ret= clReleaseKernel(kernel);
     
     
    }
    Last edited by ASiddiq; 05-16-2017 at 03:38 PM.

  3. #3
    Junior Member
    Join Date
    May 2017
    Posts
    7
    I would greatly appreciate it if someone can AT LEAST tell me WHAT IS WRONG WITH MY CODE.

  4. #4
    Senior Member
    Join Date
    Apr 2015
    Posts
    275
    You don't use workitem's global ID, so there is no way for a thread to figure out which tile it is supposed to decode.

  5. #5
    Junior Member
    Join Date
    May 2017
    Posts
    7
    Quote Originally Posted by Salabar View Post
    You don't use workitem's global ID, so there is no way for a thread to figure out which tile it is supposed to decode.

    Thank you for your response. I am going to work on it.

    What do you think is the cause of the dirty image?

  6. #6
    Senior Member
    Join Date
    Apr 2015
    Posts
    275
    This is some garbage uninitialized memory your GPU never touched and therefore it is undefined. Unless I misunderstood you program's flow.

  7. #7
    Junior Member
    Join Date
    May 2017
    Posts
    7
    Quote Originally Posted by Salabar View Post
    This is some garbage uninitialized memory your GPU never touched and therefore it is undefined. Unless I misunderstood you program's flow.
    Yes. I realized that when I ran the code after commenting out the "clSetKernelArg" statements.

    Could you tell me if a function that is called in normal C in the following way:

    Code :
      idct(decdata->dcts, decdata->out, decdata->dquant[0],
                 IFIX(128.5), max[0]);

    Can be called in OpenCL in this way:

    Code :
     ret = clSetKernelArg(kernel, 0,  sizeof(cl_int), (void*)&decdata->dcts);
          ret = clSetKernelArg(kernel, 1,  sizeof(cl_int), (void*)&decdata->out);
          ret = clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&decdata->dquant[0]);
          ret = clSetKernelArg(kernel, 3, sizeof(cl_long), IFIX(128.5));
           ret = clSetKernelArg(kernel, 4, sizeof(int), &max[0]);
              ret = clEnqueueNDRangeKernel(command_queue, kernel, 2, GWS, NULL,  NULL, 0, NULL, NULL);

    In other words, is my OpenCL declaration equivalent to the C declaration? How can I replace the standard C function (like the one above) with an OpenCL equivalent of it? Is it even possible?

    Waiting eagerly for your response.

  8. #8
    Senior Member
    Join Date
    Apr 2015
    Posts
    275
    What is IFIX? Also, you're technically supposed to use cl_int when working with the 4th. And "__global long off" in the kernel itself doesn't make sense, remove __global. Other than than, it seems about right.

  9. #9
    Junior Member
    Join Date
    May 2017
    Posts
    7
    Quote Originally Posted by Salabar View Post
    What is IFIX? Also, you're technically supposed to use cl_int when working with the 4th. And "__global long off" in the kernel itself doesn't make sense, remove __global. Other than than, it seems about right.
    IFIX is defined in the code in the following way:

    Code :
    #define IFIX(a) ((int)((a) * (1 << ISHIFT) + .5))

    where ISHIFT is a constant.

    Whenever I build the code using IFIX with SetKernelArg, I get the warning
    Code :
    "assignment from incompatible pointer type"
    .

  10. #10
    Senior Member
    Join Date
    Apr 2015
    Posts
    275
    This is SO incorrect. clSetKernelArg only accepts valid pointers and you're trying to give it a value. Do this instead.
    Code :
    cl_long t = IFIX(a);
          ret = clSetKernelArg(kernel, 3, sizeof(cl_long), &t);
    I have literally no clue how come this code even compiles.

Page 1 of 2 12 LastLast

Tags for this Thread

Posting Permissions

  • You may not post new threads
  • You may not post replies
  • You may not post attachments
  • You may not edit your posts
  •  
Proudly hosted by Digital Ocean