Page 2 of 2 FirstFirst 12
Results 11 to 13 of 13

Thread: Incorrect output for JPEG decoding with OpenCL on Eclipse (RHEL 6.6)

  1. #11
    Junior Member
    Join Date
    May 2017
    Posts
    7
    Quote Originally Posted by Salabar View Post
    This is SO incorrect. clSetKernelArg only accepts valid pointers and you're trying to give it a value. Do this instead.
    Code :
    cl_long t = IFIX(a);
          ret = clSetKernelArg(kernel, 3, sizeof(cl_long), &t);
    I have literally no clue how come this code even compiles.
    Thank you for your suggestion. I followed your advice and made changes to my code. However, I am still getting the dirty image. It seems like the kernel I have written is not even being accessed by the program (I checked this by writing printf statements in the kernel. The code still built and went on to produce the same dirty image).

    I changed my method of setting the arguments for my kernel. I did this in the following way in my "utils.c" file:

    Code :
     cl_int ret;
       cl_int error;
       cl_event kernelDone;
     
       cl_long t= IFIX(128.5);
     
       cl_long u= IFIX(0.5);
     
     
    cl_kernel kernel;
     
     size_t GWS[2];
                GWS[0]= 1;
                GWS[1] = 1;
     
     case 4:
    	   {
            decode_mcus(&in, decdata->dcts, mb, dscans, max);
     
            ret = clSetKernelArg(kernel, 0,  400 * sizeof(cl_int), (void*)&clInput);
          ret = clSetKernelArg(kernel, 1,  400 * sizeof(cl_int), (void*)&clOutput);
          ret = clSetKernelArg(kernel, 2, 64 * sizeof(cl_int), (void*)&clquant);
          ret = clSetKernelArg(kernel, 3, sizeof(cl_long), &t);
           ret = clSetKernelArg(kernel, 4, sizeof(cl_int), &max[0]);
              ret = clEnqueueNDRangeKernel(command_queue, kernel, 2, GWS, NULL,  NULL, 0, NULL, NULL);
              ret = clEnqueueReadBuffer(command_queue, clOutput, CL_TRUE, 0, 400 * sizeof(cl_mem), decdata->out, 0, NULL, NULL);
                             clFinish(command_queue);
    ret= clWaitForEvents(1, &kernelDone);
     
           ret = clSetKernelArg(kernel, 0, 400 * sizeof(cl_int), (void*)&decdata->dcts + 64);
            ret = clSetKernelArg(kernel, 1,  400 * sizeof(cl_int), (void*)&decdata->out + 64);
            ret = clSetKernelArg(kernel, 2, 64 * sizeof(cl_int), (void*)&decdata->dquant[0]);
            ret = clSetKernelArg(kernel, 3, sizeof(cl_long), &t);
      ret = clSetKernelArg(kernel, 4, sizeof(cl_int), &max[1]);
              ret = clEnqueueNDRangeKernel(command_queue, kernel, 2, GWS, NULL, NULL, 0, NULL, NULL);
              ret = clEnqueueReadBuffer(command_queue, clOutput, CL_TRUE, 0, 400 * sizeof(cl_int), decdata->out + 64, 0, NULL, NULL);
                             clFinish(command_queue);
    ret= clWaitForEvents(2, &kernelDone);
    //
            ret = clSetKernelArg(kernel, 0,  400 * sizeof(cl_int), (void*)&decdata->dcts + 128);
            ret = clSetKernelArg(kernel, 1,  400 * sizeof(cl_int), (void*)&decdata->out + 256);
            ret = clSetKernelArg(kernel, 2,  64 * sizeof(cl_int), (void*)&decdata->dquant[1]);
            ret = clSetKernelArg(kernel, 3, sizeof(cl_long), &u);
            ret = clSetKernelArg(kernel, 4, sizeof(cl_int), &max[4]);
            ret = clEnqueueNDRangeKernel(command_queue, kernel, 2, GWS, NULL,  NULL, 0, NULL, NULL);
            ret = clEnqueueReadBuffer(command_queue, clOutput, CL_TRUE, 0, 400 * sizeof(cl_int), decdata->out + 256, 0, NULL, NULL);
                           clFinish(command_queue);
    ret= clWaitForEvents(3, &kernelDone);
    ////
            ret = clSetKernelArg(kernel, 0,  400 * sizeof(cl_int), (void*)&decdata->dcts + 192);
            ret = clSetKernelArg(kernel, 1, 400 * sizeof(cl_int), (void*)&decdata->out + 320);
            ret = clSetKernelArg(kernel, 2, 64 * sizeof(cl_int), (void*)&decdata->dquant[2]);
            ret = clSetKernelArg(kernel, 3, sizeof(cl_long), &u);
            ret = clSetKernelArg(kernel, 4, sizeof(cl_int), &max[5]);
            ret = clEnqueueNDRangeKernel(command_queue, kernel, 2, GWS, NULL,  NULL, 0, NULL, NULL);
            ret = clEnqueueReadBuffer(command_queue, clOutput, CL_TRUE, 0, 400 * sizeof(cl_int), decdata->out+320, 0, NULL, NULL);
                           clFinish(command_queue);
           ret= clWaitForEvents(4, &kernelDone);
     
           ret = clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL, GWS, NULL, 0, NULL, NULL);
                   ret = clEnqueueReadBuffer(command_queue, clOutput, CL_TRUE, 0, 400 * sizeof(cl_int), decdata->out, 0, NULL, NULL);
                  clFinish(command_queue);
     
    	   }
    	   break;

    I essentially replaced every function declaration with "clSetKernelArg" statements. I then called "clEnqueueReadBuffer" to read the data back to the host. I called clEnqueueNDRangeKernel after every set of "clsetKernelArg" statements to execute each kernel on the device.
    I set my global work size to be a 2-dimensional array with both arrays containing 1 value each. I am not comfortable with doing this but I wanted to see how this would work out.

    My kernel file now looks like this:

    Code :
    __kernel inline static void idct(__global int *in, __global int *out, __global int *quant, long off, int max)
    {
     
    unsigned int gid= get_global_id(0);
    unsigned int nid= get_global_id(1);
    long t0, t1, t2, t3, t4, t5, t6, t7;	// t ;
    long tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
    long tmp[64], *tmpp;
    unsigned char *zig2p;
     
    t0 = off;
    if (max == 1) {
        t0 += in[0] * quant[0];
        for (i = 0; i < 64; i++)
            out[i] = ITOINT(t0);
        return;
    }
    zig2p = zig2;
    tmpp = tmp;
    for (gid = 0; gid < 8; gid++) {
        nid = *zig2p++;
        t0 += in[nid] * (long) quant[nid];
        nid = *zig2p++;
        t5 = in[nid] * (long) quant[nid];
        nid = *zig2p++;
        t2 = in[nid] * (long) quant[nid];
        nid = *zig2p++;
        t7 = in[nid] * (long) quant[nid];
        nid = *zig2p++;
        t1 = in[nid] * (long) quant[nid];
        nid = *zig2p++;
        t4 = in[nid] * (long) quant[nid];
        nid = *zig2p++;
        t3 = in[nid] * (long) quant[nid];
        nid = *zig2p++;
        t6 = in[nid] * (long) quant[nid];
     
     
        if ((t1 | t2 | t3 | t4 | t5 | t6 | t7) == 0) {
     
            tmpp[0 * 8] = t0;
            tmpp[1 * 8] = t0;
            tmpp[2 * 8] = t0;
            tmpp[3 * 8] = t0;
            tmpp[4 * 8] = t0;
            tmpp[5 * 8] = t0;
            tmpp[6 * 8] = t0;
            tmpp[7 * 8] = t0;
     
            tmpp++;
            t0 = 0;
            continue;
        }
        //IDCT;
        tmp0 = t0 + t1;
        t1 = t0 - t1;
        tmp2 = t2 - t3;
        t3 = t2 + t3;
        tmp2 = IMULT(tmp2, IC4) - t3;
        tmp3 = tmp0 + t3;
        t3 = tmp0 - t3;
        tmp1 = t1 + tmp2;
        tmp2 = t1 - tmp2;
        tmp4 = t4 - t7;
        t7 = t4 + t7;
        tmp5 = t5 + t6;
        t6 = t5 - t6;
        tmp6 = tmp5 - t7;
        t7 = tmp5 + t7;
        tmp5 = IMULT(tmp6, IC4);
        tmp6 = IMULT((tmp4 + t6), S22);
        tmp4 = IMULT(tmp4, (C22 - S22)) + tmp6;
        t6 = IMULT(t6, (C22 + S22)) - tmp6;
        t6 = t6 - t7;
        t5 = tmp5 - t6;
        t4 = tmp4 - t5;
     
        tmpp[0 * 8] = tmp3 + t7;	//t0;
        tmpp[1 * 8] = tmp1 + t6;	//t1;
        tmpp[2 * 8] = tmp2 + t5;	//t2;
        tmpp[3 * 8] = t3 + t4;	//t3;
        tmpp[4 * 8] = t3 - t4;	//t4;
        tmpp[5 * 8] = tmp2 - t5;	//t5;
        tmpp[6 * 8] = tmp1 - t6;	//t6;
        tmpp[7 * 8] = tmp3 - t7;	//t7;
        tmpp++;
        t0 = 0;
    }
    for (gid = 0, nid = 0; gid < 8; gid++) {
        t0 = tmp[nid + 0];
        t1 = tmp[nid + 1];
        t2 = tmp[nid + 2];
        t3 = tmp[nid + 3];
        t4 = tmp[nid + 4];
        t5 = tmp[nid + 5];
        t6 = tmp[nid + 6];
        t7 = tmp[nid + 7];
        if ((t1 | t2 | t3 | t4 | t5 | t6 | t7) == 0) {
            te = ITOINT(t0);
            out[nid + 0] = te;
            out[nid + 1] = te;
            out[nid + 2] = te;
            out[nid + 3] = te;
            out[nid + 4] = te;
            out[nid + 5] = te;
            out[nid + 6] = te;
            out[nid + 7] = te;
            nid += 8;
            continue;
        }
        //IDCT;
        tmp0 = t0 + t1;
        t1 = t0 - t1;
        tmp2 = t2 - t3;
        t3 = t2 + t3;
        tmp2 = IMULT(tmp2, IC4) - t3;
        tmp3 = tmp0 + t3;
        t3 = tmp0 - t3;
        tmp1 = t1 + tmp2;
        tmp2 = t1 - tmp2;
        tmp4 = t4 - t7;
        t7 = t4 + t7;
        tmp5 = t5 + t6;
        t6 = t5 - t6;
        tmp6 = tmp5 - t7;
        t7 = tmp5 + t7;
        tmp5 = IMULT(tmp6, IC4);
        tmp6 = IMULT((tmp4 + t6), S22);
        tmp4 = IMULT(tmp4, (C22 - S22)) + tmp6;
        t6 = IMULT(t6, (C22 + S22)) - tmp6;
        t6 = t6 - t7;
        t5 = tmp5 - t6;
        t4 = tmp4 - t5;
     
        out[nid + 0] = ITOINT(tmp3 + t7);
        out[nid + 1] = ITOINT(tmp1 + t6);
        out[nid + 2] = ITOINT(tmp2 + t5);
        out[nid + 3] = ITOINT(t3 + t4);
        out[nid + 4] = ITOINT(t3 - t4);
        out[nid + 5] = ITOINT(tmp2 - t5);
        out[nid + 6] = ITOINT(tmp1 - t6);
        out[nid + 7] = ITOINT(tmp3 - t7);
        nid += 8;
    }
     
     
    }

    You said in your previous answer that that global_id has been declared but was not being used in my previous kernel code. In order to get around that problem, I replaced i and j with gid and nid respectively. The global work size is only 1 in both dimensions so I figured I could let the loops stay.

    Finally, I set up my context and called the function from the c file containing main. My main is set up in the following way:

    Code :
    int main()
    {
        //Read input data
        FILE *fr = fopen ("/root/Documents/lucmotion/mjpeg-master/test.mjpeg", "rb");
        unsigned char *pic=NULL;
     
        unsigned char *buf=malloc(63247);
        fread(buf, 1, 63247, fr);
        int width=0;
        int height=0;
     
        int out;
        cl_platform_id platform_id;
        cl_uint ret_num_platforms;
        cl_device_id device_id;
        cl_uint ret_num_devices;
        cl_context context;
        cl_command_queue command_queue;
        cl_program program;
        size_t kernel_code_size;
        char *kernel_src_str;
        int *result;
        cl_int ret;
        int storeResult;
       FILE *fp;
     
     
     
        /* Get Platform */
        ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
     
        /* Get Device */
        ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id,
            &ret_num_devices);
     
        /* Create Context */
        context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
     
        /* Create Command Queue */
        command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
     
        /* Read Kernel Code */
        fp = fopen("/root/Documents/lucmotion/mjpeg-master/jpegDecode.cl", "r");
        kernel_code_size = fread(kernel_src_str, 1, MAX_SOURCE_SIZE, fp);
        fclose(fp);
     
        /* Create Program Object */
        program = clCreateProgramWithSource(context, 1, (const char **)&kernel_src_str,
            (const size_t *)&kernel_code_size, &ret);
        /* Compile kernel */
        ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
     
    //working with idct
     
        storeResult = jpeg_decode(&pic, buf, &width, &height);
          //printf("ret %d\n", ret);
          printf("ptr %ld\n", (long) pic);
          printf("height %d\n", height);
          printf("width %d\n", width);
     
     
        /* OpenCL Object Finalization */
     
          ret = clFlush(command_queue);
          ret = clFinish(command_queue);
        ret = clReleaseProgram(program);
        ret = clReleaseCommandQueue(command_queue);
        ret = clReleaseContext(context);
     
     
    	//Write greyscale (Y) information as PPM image
     
    	if (pic != NULL)
    	{
            WritePPMImage("HalfImage.ppm", pic, height, width);
            printf("Written nWimage.ppm\n");
    	}
     
    	return 0;
    }

    I think now my main problem is that my kernel is not even being accessed by the program. Could this be why code was compiling despite obvious errors in declaring the arguments of the kernel?

    How can I get around this problem? Any suggestions will be appreciated.

  2. #12
    Senior Member
    Join Date
    Apr 2015
    Posts
    316
    Simply try to fill to fill your image with some consistent pattern, like

    Code :
    for (int i = 0; i < 64; ++i)
    out[(gid * get_global_size(1) + nid) * 64  + i] = gid %256;
    and make sure everything else works.
    Last edited by Salabar; 05-29-2017 at 03:31 PM.

  3. #13
    Newbie
    Join Date
    Jul 2017
    Location
    Vietnam
    Posts
    1
    Quote Originally Posted by ASiddiq View Post
    Hello.
    I am new to OpenCL programming and am unable to figure out where I am going wrong with my code.

    I am trying to translate a portion of the code in this link:
    HTML Code:
    https://github.com/TimSC/mjpeg
    to OpenCL but am not getting the expected output. The expected output is a .ppm image which I have attached to my query (test.ppm). I cannot attach a ppm file to this question at the moment.

    I want to get identical output with my OpenCL code. I picked one function in the
    Code :
    utils.c
    file and put it AS IT IS in a .cl file which I called
    Code :
    jpeg.cl
    :

    Code :
    jpeg.cl

    Code :
    __kernel inline static void idct(__global int *in, __global int *out, __global int *quant, __global long off, int max)
    {
     
    unsigned int gid= get_global_id(0);
    long t0, t1, t2, t3, t4, t5, t6, t7;	// t ;
    long tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
    long tmp[64], *tmpp;
    int i, j, te;
    unsigned char *zig2p;
     
    t0 = off;
    if (max == 1) {
        t0 += in[0] * quant[0];
        for (i = 0; i < 64; i++)
            out[i] = ITOINT(t0);
        return;
    }
    zig2p = zig2;
    tmpp = tmp;
    for (i = 0; i < 8; i++) {
        j = *zig2p++;
        t0 += in[j] * (long) quant[j];
        j = *zig2p++;
        t5 = in[j] * (long) quant[j];
        j = *zig2p++;
        t2 = in[j] * (long) quant[j];
        j = *zig2p++;
        t7 = in[j] * (long) quant[j];
        j = *zig2p++;
        t1 = in[j] * (long) quant[j];
        j = *zig2p++;
        t4 = in[j] * (long) quant[j];
        j = *zig2p++;
        t3 = in[j] * (long) quant[j];
        j = *zig2p++;
        t6 = in[j] * (long) quant[j];
     
     
        if ((t1 | t2 | t3 | t4 | t5 | t6 | t7) == 0) {
     
            tmpp[0 * 8] = t0;
            tmpp[1 * 8] = t0;
            tmpp[2 * 8] = t0;
            tmpp[3 * 8] = t0;
            tmpp[4 * 8] = t0;
            tmpp[5 * 8] = t0;
            tmpp[6 * 8] = t0;
            tmpp[7 * 8] = t0;
     
            tmpp++;
            t0 = 0;
            continue;
        }
        //IDCT;
        tmp0 = t0 + t1;
        t1 = t0 - t1;
        tmp2 = t2 - t3;
        t3 = t2 + t3;
        tmp2 = IMULT(tmp2, IC4) - t3;
        tmp3 = tmp0 + t3;
        t3 = tmp0 - t3;
        tmp1 = t1 + tmp2;
        tmp2 = t1 - tmp2;
        tmp4 = t4 - t7;
        t7 = t4 + t7;
        tmp5 = t5 + t6;
        t6 = t5 - t6;
        tmp6 = tmp5 - t7;
        t7 = tmp5 + t7;
        tmp5 = IMULT(tmp6, IC4);
        tmp6 = IMULT((tmp4 + t6), S22);
        tmp4 = IMULT(tmp4, (C22 - S22)) + tmp6;
        t6 = IMULT(t6, (C22 + S22)) - tmp6;
        t6 = t6 - t7;
        t5 = tmp5 - t6;
        t4 = tmp4 - t5;
     
        tmpp[0 * 8] = tmp3 + t7;	//t0;
        tmpp[1 * 8] = tmp1 + t6;	//t1;
        tmpp[2 * 8] = tmp2 + t5;	//t2;
        tmpp[3 * 8] = t3 + t4;	//t3;
        tmpp[4 * 8] = t3 - t4;	//t4;
        tmpp[5 * 8] = tmp2 - t5;	//t5;
        tmpp[6 * 8] = tmp1 - t6;	//t6;
        tmpp[7 * 8] = tmp3 - t7;	//t7;
        tmpp++;
        t0 = 0;
    }
    for (i = 0, j = 0; i < 8; i++) {
        t0 = tmp[j + 0];
        t1 = tmp[j + 1];
        t2 = tmp[j + 2];
        t3 = tmp[j + 3];
        t4 = tmp[j + 4];
        t5 = tmp[j + 5];
        t6 = tmp[j + 6];
        t7 = tmp[j + 7];
        if ((t1 | t2 | t3 | t4 | t5 | t6 | t7) == 0) {
            te = ITOINT(t0);
            out[j + 0] = te;
            out[j + 1] = te;
            out[j + 2] = te;
            out[j + 3] = te;
            out[j + 4] = te;
            out[j + 5] = te;
            out[j + 6] = te;
            out[j + 7] = te;
            j += 8;
            continue;
        }
        //IDCT;
        tmp0 = t0 + t1;
        t1 = t0 - t1;
        tmp2 = t2 - t3;
        t3 = t2 + t3;
        tmp2 = IMULT(tmp2, IC4) - t3;
        tmp3 = tmp0 + t3;
        t3 = tmp0 - t3;
        tmp1 = t1 + tmp2;
        tmp2 = t1 - tmp2;
        tmp4 = t4 - t7;
        t7 = t4 + t7;
        tmp5 = t5 + t6;
        t6 = t5 - t6;
        tmp6 = tmp5 - t7;
        t7 = tmp5 + t7;
        tmp5 = IMULT(tmp6, IC4);
        tmp6 = IMULT((tmp4 + t6), S22);
        tmp4 = IMULT(tmp4, (C22 - S22)) + tmp6;
        t6 = IMULT(t6, (C22 + S22)) - tmp6;
        t6 = t6 - t7;
        t5 = tmp5 - t6;
        t4 = tmp4 - t5;
     
        out[j + 0] = ITOINT(tmp3 + t7);
        out[j + 1] = ITOINT(tmp1 + t6);
        out[j + 2] = ITOINT(tmp2 + t5);
        out[j + 3] = ITOINT(t3 + t4);
        out[j + 4] = ITOINT(t3 - t4);
        out[j + 5] = ITOINT(tmp2 - t5);
        out[j + 6] = ITOINT(tmp1 - t6);
        out[j + 7] = ITOINT(tmp3 - t7);
        j += 8;
    }
     
     
    }

    Please note that my goal FOR NOW is not to run the code efficiently on the GPU. I simply want to run it on my GPU (Tesla K20c) and get the same output which I am getting with the normal C code.

    I am calling this kernel in a function called
    Code :
    jpeg_decode
    which is located in the
    Code :
    utils.c file
    . I have seen many OpenCL codes but I was uncertain about calling a kernel in a function. I got support from this link
    Code :
    https://www.fixstars.com/en/opencl/book/OpenCLProgrammingBook/fft-fast-fourier-transform/
    and modified my
    Code :
     jpeg_decode
    function in the
    Code :
    utils.c
    file in the following way:

    Code :
    jpeg_decode()
    in
    Code :
    utils.c
    :

    Code :
    int jpeg_decode(unsigned char **pic, unsigned char *buf, int *width, int *height)
    {
     
     [B]  cl_int ret;
       cl_event kernelDone;[/B]
     
     
        struct jpeg_decdata *decdata;
        int i, j, m, tac, tdc;
        int intwidth, intheight;
        int mcusx, mcusy, mx, my;
        int ypitch ,xpitch,bpp,pitch,x,y;
        int mb;
        int max[6];
        ftopict convert;
        int err = 0;
        int isInitHuffman = 0;
     
        decdata = (struct jpeg_decdata *) malloc(sizeof(struct jpeg_decdata));
     
        //fixing= IFIX(a);
      [B] cl_kernel kernel;
     
       kernel = clCreateKernel(program, "idct", &ret);[/B]
     
     
            [B]size_t GWS[2];
                GWS[0]= 8;
                GWS[1] = 8;[/B]
     
     
        if (!decdata) {
    	err = -1;
    	goto error;
        }
        if (buf == NULL) {
    	err = -1;
    	goto error;
        }
        datap = buf;
        if (getbyte() != 0xff) {
    	err = ERR_NO_SOI;
    	goto error;
        }
        if (getbyte() != M_SOI) {
    	err = ERR_NO_SOI;
    	goto error;
        }
        if (readtables(M_SOF0, &isInitHuffman)) {
    	err = ERR_BAD_TABLES;
    	goto error;
        }
        getword();
        i = getbyte();
        if (i != 8) {
    	err = ERR_NOT_8BIT;
    	goto error;
        }
        intheight = getword();
        intwidth = getword();
     
        if ((intheight & 7) || (intwidth & 7)) {
    	err = ERR_BAD_WIDTH_OR_HEIGHT;
    	goto error;
        }
        info.nc = getbyte();
        if (info.nc > MAXCOMP) {
    	err = ERR_TOO_MANY_COMPPS;
    	goto error;
        }
        for (i = 0; i < info.nc; i++) {
    	int h, v;
    	comps[i].cid = getbyte();
    	comps[i].hv = getbyte();
    	v = comps[i].hv & 15;
    	h = comps[i].hv >> 4;
    	comps[i].tq = getbyte();
    	if (h > 3 || v > 3) {
    	   err = ERR_ILLEGAL_HV;
    	   goto error;
    	}
    	if (comps[i].tq > 3) {
    	   err = ERR_QUANT_TABLE_SELECTOR;
    	   goto error;
    	}
        }
        if (readtables(M_SOS,&isInitHuffman)) {
    	err = ERR_BAD_TABLES;
    	goto error;
        }
        getword();
        info.ns = getbyte();
        if (!info.ns){
        printf("info ns %d/n",info.ns);
    	err = ERR_NOT_YCBCR_221111;
    	goto error;
        }
        for (i = 0; i < info.ns; i++) {
    	dscans[i].cid = getbyte();
    	tdc = getbyte();
    	tac = tdc & 15;
    	tdc >>= 4;
    	if (tdc > 1 || tac > 1) {
    	   err = ERR_QUANT_TABLE_SELECTOR;
    	   goto error;
    	}
    	for (j = 0; j < info.nc; j++)
    	   if (comps[j].cid == dscans[i].cid)
    		break;
    	if (j == info.nc) {
    	   err = ERR_UNKNOWN_CID_IN_SCAN;
    	   goto error;
    	}
    	dscans[i].hv = comps[j].hv;
    	dscans[i].tq = comps[j].tq;
    	dscans[i].hudc.dhuff = dec_huffdc + tdc;
    	dscans[i].huac.dhuff = dec_huffac + tac;
        }
     
        i = getbyte();
        j = getbyte();
        m = getbyte();
     
        if (i != 0 || j != 63 || m != 0) {
        	printf("hmm FW error,not seq DCT ??\n");
        }
       // printf("ext huffman table %d \n",isInitHuffman);
        if(!isInitHuffman) {
        	if(huffman_init() < 0)
    		return -ERR_BAD_TABLES;
    	}
    /*
        if (dscans[0].cid != 1 || dscans[1].cid != 2 || dscans[2].cid != 3) {
    	err = ERR_NOT_YCBCR_221111;
    	goto error;
        }
     
        if (dscans[1].hv != 0x11 || dscans[2].hv != 0x11) {
    	err = ERR_NOT_YCBCR_221111;
    	goto error;
        }
    */    
        /* if internal width and external are not the same or heigth too 
           and pic not allocated realloc the good size and mark the change 
           need 1 macroblock line more ?? */
        if (intwidth != *width || intheight != *height || *pic == NULL) {
    	*width = intwidth;
    	*height = intheight;
    	// BytesperPixel 2 yuyv , 3 rgb24 
    	*pic =
    	   (unsigned char *) realloc((unsigned char *) *pic,
    				     (size_t) intwidth * (intheight +
    							  8) * 2);
        }
     
     
        switch (dscans[0].hv) {
        case 0x22: // 411
        	mb=6;
    	mcusx = *width >> 4;
    	mcusy = *height >> 4;
    	bpp=2;
    	xpitch = 16 * bpp;
    	pitch = *width * bpp; // YUYV out
    	ypitch = 16 * pitch;
    	convert = yuv420pto422;	
    	break;
        case 0x21: //422
     
        	mb=4;
    	mcusx = *width >> 4;
    	mcusy = *height >> 3;
    	bpp=2;	
    	xpitch = 16 * bpp;
    	pitch = *width * bpp; // YUYV out
    	ypitch = 8 * pitch;
        convert = yuv422pto422;
    	break;
         printf("MY VALUES: %dx%d\n",*width,*height);
        case 0x11: //444
    	mcusx = *width >> 3;
    	mcusy = *height >> 3;
    	bpp=2;
    	xpitch = 8 * bpp;
    	pitch = *width * bpp; // YUYV out
    	ypitch = 8 * pitch;
    	if (info.ns==1) {
        		mb = 1;
    		convert = yuv400pto422;
    	} else {
    		mb=3;
            convert = yuv422pto422;
    	}
            break;
        default:
    	err = ERR_NOT_YCBCR_221111;
    	goto error;
    	break;
        }
     
        idctqtab(quant[dscans[0].tq], decdata->dquant[0]);
        idctqtab(quant[dscans[1].tq], decdata->dquant[1]);
        idctqtab(quant[dscans[2].tq], decdata->dquant[2]);
        setinput(&in, datap);
        dec_initscans();
     
        dscans[0].next = 2;
        dscans[1].next = 1;
        dscans[2].next = 0;	/* 4xx encoding */
        for (my = 0,y=0; my < mcusy; my++,y+=ypitch) {
    	for (mx = 0,x=0; mx < mcusx; mx++,x+=xpitch) {
    	   if (info.dri && !--info.nm)
    		if (dec_checkmarker()) {
    		   err = ERR_WRONG_MARKER;
    		   goto error;
    		}
    	switch (mb){
    	   case 6: {
    		decode_mcus(&in, decdata->dcts, mb, dscans, max);
     
            idct(decdata->dcts, decdata->out, decdata->dquant[0],
                 IFIX(128.5), max[0]);
            idct(decdata->dcts + 64, decdata->out + 64,
                 decdata->dquant[0], IFIX(128.5), max[1]);
            idct(decdata->dcts + 128, decdata->out + 128,
                 decdata->dquant[0], IFIX(128.5), max[2]);
            idct(decdata->dcts + 192, decdata->out + 192,
                 decdata->dquant[0], IFIX(128.5), max[3]);
            idct(decdata->dcts + 256, decdata->out + 256,
                 decdata->dquant[1], IFIX(0.5), max[4]);
            idct(decdata->dcts + 320, decdata->out + 320,
                 decdata->dquant[2], IFIX(0.5), max[5]);
        }
        break;
     
    	   case 4:
    	   {
            decode_mcus(&in, decdata->dcts, mb, dscans, max);
     
    [B]
            ret = clSetKernelArg(kernel, 0,  sizeof(cl_int), (void*)&decdata->dcts);
          ret = clSetKernelArg(kernel, 1,  sizeof(cl_int), (void*)&decdata->out);
          ret = clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&decdata->dquant[0]);
          ret = clSetKernelArg(kernel, 3, sizeof(cl_long), IFIX(128.5));
           ret = clSetKernelArg(kernel, 4, sizeof(int), &max[0]);
              ret = clEnqueueNDRangeKernel(command_queue, kernel, 2, GWS, NULL,  NULL, 0, NULL, NULL);
    //ret= clWaitForEvents(1, &kernelDone);
    //
           ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*)&decdata->dcts + 64);
            ret = clSetKernelArg(kernel, 1,  sizeof(cl_int), (void*)&decdata->out + 64);
            ret = clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&decdata->dquant[0]);
            ret = clSetKernelArg(kernel, 3, sizeof(cl_long), IFIX(128.5));
      ret = clSetKernelArg(kernel, 4, sizeof(int), &max[1]);
              ret = clEnqueueNDRangeKernel(command_queue, kernel, 2, GWS, NULL, NULL, 0, NULL, NULL);
    //ret= clWaitForEvents(2, &kernelDone);
    //
            ret = clSetKernelArg(kernel, 0,  sizeof(cl_int), (void*)&decdata->dcts + 128);
            ret = clSetKernelArg(kernel, 1,  sizeof(cl_int), (void*)&decdata->out + 256);
            ret = clSetKernelArg(kernel, 2,  sizeof(cl_int), (void*)&decdata->dquant[1]);
            ret = clSetKernelArg(kernel, 3, sizeof(cl_long), IFIX(0.5));
            ret = clSetKernelArg(kernel, 4, sizeof(int), &max[4]);
            ret = clEnqueueNDRangeKernel(command_queue, kernel, 2, GWS, NULL,  NULL, 0, NULL, NULL);
    //ret= clWaitForEvents(3, &kernelDone);
    //
            ret = clSetKernelArg(kernel, 0,  sizeof(cl_int), (void*)&decdata->dcts + 192);
            ret = clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&decdata->out + 320);
            ret = clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&decdata->dquant[2]);
            ret = clSetKernelArg(kernel, 3, sizeof(cl_long), IFIX(0.5));
            ret = clSetKernelArg(kernel, 4, sizeof(int), &max[5]);
            ret = clEnqueueNDRangeKernel(command_queue, kernel, 2, GWS, NULL,  NULL, 0, NULL, NULL);
           //ret= clWaitForEvents(4, &kernelDone);
    //
    //        ret = clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL, GWS, NULL, 0, NULL, NULL);
    //
    //                ret = clEnqueueReadBuffer(command_queue, clOutput, CL_TRUE, 0, 8 * 8 * sizeof(int), decdata->out, 0, NULL, NULL);
    //                clFinish(command_queue);
    [/B]
    //
    //        idct(decdata->dcts, decdata->out, decdata->dquant[0],
    //             IFIX(128.5), max[0]);
    //        idct(decdata->dcts + 64, decdata->out + 64,
    //             decdata->dquant[0], IFIX(128.5), max[1]);
    //        idct(decdata->dcts + 128, decdata->out + 256,
    //             decdata->dquant[1], IFIX(0.5), max[4]);
    //        idct(decdata->dcts + 192, decdata->out + 320,
    //             decdata->dquant[2], IFIX(0.5), max[5]);
     
    	   }
    	   break;
    	   case 3:
    	   	decode_mcus(&in, decdata->dcts, mb, dscans, max);
     
    //             ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void *)&decdata->dcts);
    //             ret = clSetKernelArg(kernel, 1, sizeof(cl_int), (void *)&decdata->out);
    //             ret = clSetKernelArg(kernel, 2, sizeof(cl_int), (void *)&decdata->dquant[0]);
    //             ret = clSetKernelArg(kernel, 3, sizeof(cl_long), (void*)IFIX(128.5));
    //             ret = clSetKernelArg(kernel, 4, sizeof(cl_int), (void *)&max[0]);
    //
    //
    //             ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void *)&decdata->dcts + 64);
    //             ret = clSetKernelArg(kernel, 1, sizeof(cl_int), (void *)&decdata->out + 256);
    //             ret = clSetKernelArg(kernel, 2, sizeof(cl_int), (void *)&decdata->dquant[1]);
    //            ret = clSetKernelArg(kernel, 3, sizeof(cl_long), (void*)IFIX(0.5));
    //             ret = clSetKernelArg(kernel, 4, sizeof(cl_int), (void *)&max[4]);
    //
    //             ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void *)&decdata->dcts + 128);
    //             ret = clSetKernelArg(kernel, 1, sizeof(cl_int), (void *)&decdata->out + 320);
    //             ret = clSetKernelArg(kernel, 2, sizeof(cl_int), (void *)&decdata->dquant[2]);
    //             ret = clSetKernelArg(kernel, 3, sizeof(cl_long), (void*)IFIX(0.5));
    //             ret = clSetKernelArg(kernel, 4, sizeof(cl_int), (void *)&max[5]);
     
    //        idct(decdata->dcts, decdata->out, decdata->dquant[0],
    //             IFIX(128.5), max[0]);
    //        idct(decdata->dcts + 64, decdata->out + 256,
    //             decdata->dquant[1], IFIX(0.5), max[4]);
    //        idct(decdata->dcts + 128, decdata->out + 320,
    //             decdata->dquant[2], IFIX(0.5), max[5]);
     
     
    	   break;
    	   case 1:
    	   	decode_mcus(&in, decdata->dcts, mb, dscans, max);
     
    //             ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void *)&decdata->dcts);
    //             ret = clSetKernelArg(kernel, 1, sizeof(cl_int), (void *)&decdata->out);
    //             ret = clSetKernelArg(kernel, 2, sizeof(cl_int), (void *)&decdata->dquant[0]);
    //            ret = clSetKernelArg(kernel, 3, sizeof(cl_long), (void*)IFIX(128.5));
    //             ret = clSetKernelArg(kernel, 4, sizeof(cl_int), (void *)&max[0]);
     
     
    //        idct(decdata->dcts, decdata->out, decdata->dquant[0],
    //             IFIX(128.5), max[0]);
     
    	   break;
     
    	} // switch enc411
     
    [B]ret= clReleaseKernel(kernel);[/B]
     
    	convert(decdata->out,*pic+y+x,pitch); 
    	}
        }
     
        m = dec_readmarker(&in);
        if (m != M_EOI) {
    	err = ERR_NO_EOI;
    	goto error;
        }
        if (decdata)
    	free(decdata);
        return 0;
      error:
        if (decdata)
    	free(decdata);
        return err;
     
     
     
    }

    I wrote the host code in the main file called
    Code :
    test.c
    . It is as follows:

    Code :
    int main()
    {
    FILE *fr = fopen ("/root/Documents/lucmotion/mjpeg-master/test.mjpeg", "rb");
        unsigned char *pic=NULL;
     
        unsigned char *buf=malloc(63247);
        fread(buf, 1, 63247, fr);
        int width=0;
        int height=0;
     
        int out;
        cl_platform_id platform_id;
        cl_uint ret_num_platforms;
        cl_device_id device_id;
        cl_uint ret_num_devices;
        cl_context context;
        cl_command_queue command_queue;
        cl_program program;
        size_t kernel_code_size;
        char *kernel_src_str;
        int *result;
        cl_int ret;
        int storeResult;
       FILE *fp;
     
        /* Get Platform */
        ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
     
        /* Get Device */
        ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id,
            &ret_num_devices);
     
        /* Create Context */
        context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
     
        /* Create Command Queue */
        command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
     
        /* Read Kernel Code */
        fp = fopen("/root/Documents/lucmotion/mjpeg-master/jpegDecode.cl", "r");
        kernel_code_size = fread(kernel_src_str, 1, MAX_SOURCE_SIZE, fp);
        fclose(fp);
    //
        /* Create Program Object */
        program = clCreateProgramWithSource(context, 1, (const char **)&kernel_src_str,
            (const size_t *)&kernel_code_size, &ret);
        /* Compile kernel */
        ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
     
    //working with idct
     
     
        storeResult = jpeg_decode(&pic, buf, &width, &height);
          //printf("ret %d\n", ret);
          printf("ptr %ld\n", (long) pic);
          printf("height %d\n", height);
          printf("width %d\n", width);
     
     
          ret = clFlush(command_queue);
          ret = clFinish(command_queue);
        ret = clReleaseProgram(program);
        ret = clReleaseCommandQueue(command_queue);
        ret = clReleaseContext(context);
     
    	//Write greyscale (Y) information as PPM image
    	if (pic != NULL)
    	{
            WritePPMImage("HalfImage.ppm", pic, height, width);
            printf("Written nWimage.ppm\n");
    	}
     
    	return 0;
    }

    I would greatly blogchiem.netappreciate it if someone can tell me what I am doing wrong with my code.

    Waiting eagerly for your response.
    Let's just show me how to compress JPEG images in the best way possible?

Page 2 of 2 FirstFirst 12

Tags for this Thread

Posting Permissions

  • You may not post new threads
  • You may not post replies
  • You may not post attachments
  • You may not edit your posts
  •  
Proudly hosted by Digital Ocean