Hello.
I am new to OpenCL programming and am unable to figure out where I am going wrong with my code.
I am trying to translate a portion of the code in this link:
https://github.com/TimSC/mjpeg
to OpenCL but am not getting the expected output. The expected output is a .ppm image which I have attached to my query (test.ppm). I cannot attach a ppm file to this question at the moment.
I want to get identical output with my OpenCL code. I picked one function in the
utils.c
file and put it AS IT IS in a .cl file which I called
jpeg.cl
:
jpeg.cl
__kernel inline static void idct(__global int *in, __global int *out, __global int *quant, __global long off, int max)
{
unsigned int gid= get_global_id(0);
long t0, t1, t2, t3, t4, t5, t6, t7; // t ;
long tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
long tmp[64], *tmpp;
int i, j, te;
unsigned char *zig2p;
t0 = off;
if (max == 1) {
t0 += in[0] * quant[0];
for (i = 0; i < 64; i++)
out[i] = ITOINT(t0);
return;
}
zig2p = zig2;
tmpp = tmp;
for (i = 0; i < 8; i++) {
j = *zig2p++;
t0 += in[j] * (long) quant[j];
j = *zig2p++;
t5 = in[j] * (long) quant[j];
j = *zig2p++;
t2 = in[j] * (long) quant[j];
j = *zig2p++;
t7 = in[j] * (long) quant[j];
j = *zig2p++;
t1 = in[j] * (long) quant[j];
j = *zig2p++;
t4 = in[j] * (long) quant[j];
j = *zig2p++;
t3 = in[j] * (long) quant[j];
j = *zig2p++;
t6 = in[j] * (long) quant[j];
if ((t1 | t2 | t3 | t4 | t5 | t6 | t7) == 0) {
tmpp[0 * 8] = t0;
tmpp[1 * 8] = t0;
tmpp[2 * 8] = t0;
tmpp[3 * 8] = t0;
tmpp[4 * 8] = t0;
tmpp[5 * 8] = t0;
tmpp[6 * 8] = t0;
tmpp[7 * 8] = t0;
tmpp++;
t0 = 0;
continue;
}
//IDCT;
tmp0 = t0 + t1;
t1 = t0 - t1;
tmp2 = t2 - t3;
t3 = t2 + t3;
tmp2 = IMULT(tmp2, IC4) - t3;
tmp3 = tmp0 + t3;
t3 = tmp0 - t3;
tmp1 = t1 + tmp2;
tmp2 = t1 - tmp2;
tmp4 = t4 - t7;
t7 = t4 + t7;
tmp5 = t5 + t6;
t6 = t5 - t6;
tmp6 = tmp5 - t7;
t7 = tmp5 + t7;
tmp5 = IMULT(tmp6, IC4);
tmp6 = IMULT((tmp4 + t6), S22);
tmp4 = IMULT(tmp4, (C22 - S22)) + tmp6;
t6 = IMULT(t6, (C22 + S22)) - tmp6;
t6 = t6 - t7;
t5 = tmp5 - t6;
t4 = tmp4 - t5;
tmpp[0 * 8] = tmp3 + t7; //t0;
tmpp[1 * 8] = tmp1 + t6; //t1;
tmpp[2 * 8] = tmp2 + t5; //t2;
tmpp[3 * 8] = t3 + t4; //t3;
tmpp[4 * 8] = t3 - t4; //t4;
tmpp[5 * 8] = tmp2 - t5; //t5;
tmpp[6 * 8] = tmp1 - t6; //t6;
tmpp[7 * 8] = tmp3 - t7; //t7;
tmpp++;
t0 = 0;
}
for (i = 0, j = 0; i < 8; i++) {
t0 = tmp[j + 0];
t1 = tmp[j + 1];
t2 = tmp[j + 2];
t3 = tmp[j + 3];
t4 = tmp[j + 4];
t5 = tmp[j + 5];
t6 = tmp[j + 6];
t7 = tmp[j + 7];
if ((t1 | t2 | t3 | t4 | t5 | t6 | t7) == 0) {
te = ITOINT(t0);
out[j + 0] = te;
out[j + 1] = te;
out[j + 2] = te;
out[j + 3] = te;
out[j + 4] = te;
out[j + 5] = te;
out[j + 6] = te;
out[j + 7] = te;
j += 8;
continue;
}
//IDCT;
tmp0 = t0 + t1;
t1 = t0 - t1;
tmp2 = t2 - t3;
t3 = t2 + t3;
tmp2 = IMULT(tmp2, IC4) - t3;
tmp3 = tmp0 + t3;
t3 = tmp0 - t3;
tmp1 = t1 + tmp2;
tmp2 = t1 - tmp2;
tmp4 = t4 - t7;
t7 = t4 + t7;
tmp5 = t5 + t6;
t6 = t5 - t6;
tmp6 = tmp5 - t7;
t7 = tmp5 + t7;
tmp5 = IMULT(tmp6, IC4);
tmp6 = IMULT((tmp4 + t6), S22);
tmp4 = IMULT(tmp4, (C22 - S22)) + tmp6;
t6 = IMULT(t6, (C22 + S22)) - tmp6;
t6 = t6 - t7;
t5 = tmp5 - t6;
t4 = tmp4 - t5;
out[j + 0] = ITOINT(tmp3 + t7);
out[j + 1] = ITOINT(tmp1 + t6);
out[j + 2] = ITOINT(tmp2 + t5);
out[j + 3] = ITOINT(t3 + t4);
out[j + 4] = ITOINT(t3 - t4);
out[j + 5] = ITOINT(tmp2 - t5);
out[j + 6] = ITOINT(tmp1 - t6);
out[j + 7] = ITOINT(tmp3 - t7);
j += 8;
}
}
Please note that my goal FOR NOW is not to run the code efficiently on the GPU. I simply want to run it on my GPU (Tesla K20c) and get the same output which I am getting with the normal C code.
I am calling this kernel in a function called
jpeg_decode
which is located in the
utils.c file
. I have seen many OpenCL codes but I was uncertain about calling a kernel in a function. I got support from this link
https://www.fixstars.com/en/opencl/book/OpenCLProgrammingBook/fft-fast-fourier-transform/
and modified my
jpeg_decode
function in the
utils.c
file in the following way:
jpeg_decode()
in
utils.c
:
int jpeg_decode(unsigned char **pic, unsigned char *buf, int *width, int *height)
{
[b] cl_int ret;
cl_event kernelDone;[/b]
struct jpeg_decdata *decdata;
int i, j, m, tac, tdc;
int intwidth, intheight;
int mcusx, mcusy, mx, my;
int ypitch ,xpitch,bpp,pitch,x,y;
int mb;
int max[6];
ftopict convert;
int err = 0;
int isInitHuffman = 0;
decdata = (struct jpeg_decdata *) malloc(sizeof(struct jpeg_decdata));
//fixing= IFIX(a);
[b] cl_kernel kernel;
kernel = clCreateKernel(program, "idct", &ret);[/b]
[b]size_t GWS[2];
GWS[0]= 8;
GWS[1] = 8;[/b]
if (!decdata) {
err = -1;
goto error;
}
if (buf == NULL) {
err = -1;
goto error;
}
datap = buf;
if (getbyte() != 0xff) {
err = ERR_NO_SOI;
goto error;
}
if (getbyte() != M_SOI) {
err = ERR_NO_SOI;
goto error;
}
if (readtables(M_SOF0, &isInitHuffman)) {
err = ERR_BAD_TABLES;
goto error;
}
getword();
i = getbyte();
if (i != 8) {
err = ERR_NOT_8BIT;
goto error;
}
intheight = getword();
intwidth = getword();
if ((intheight & 7) || (intwidth & 7)) {
err = ERR_BAD_WIDTH_OR_HEIGHT;
goto error;
}
info.nc = getbyte();
if (info.nc > MAXCOMP) {
err = ERR_TOO_MANY_COMPPS;
goto error;
}
for (i = 0; i < info.nc; i++) {
int h, v;
comps[i].cid = getbyte();
comps[i].hv = getbyte();
v = comps[i].hv & 15;
h = comps[i].hv >> 4;
comps[i].tq = getbyte();
if (h > 3 || v > 3) {
err = ERR_ILLEGAL_HV;
goto error;
}
if (comps[i].tq > 3) {
err = ERR_QUANT_TABLE_SELECTOR;
goto error;
}
}
if (readtables(M_SOS,&isInitHuffman)) {
err = ERR_BAD_TABLES;
goto error;
}
getword();
info.ns = getbyte();
if (!info.ns){
printf("info ns %d/n",info.ns);
err = ERR_NOT_YCBCR_221111;
goto error;
}
for (i = 0; i < info.ns; i++) {
dscans[i].cid = getbyte();
tdc = getbyte();
tac = tdc & 15;
tdc >>= 4;
if (tdc > 1 || tac > 1) {
err = ERR_QUANT_TABLE_SELECTOR;
goto error;
}
for (j = 0; j < info.nc; j++)
if (comps[j].cid == dscans[i].cid)
break;
if (j == info.nc) {
err = ERR_UNKNOWN_CID_IN_SCAN;
goto error;
}
dscans[i].hv = comps[j].hv;
dscans[i].tq = comps[j].tq;
dscans[i].hudc.dhuff = dec_huffdc + tdc;
dscans[i].huac.dhuff = dec_huffac + tac;
}
i = getbyte();
j = getbyte();
m = getbyte();
if (i != 0 || j != 63 || m != 0) {
printf("hmm FW error,not seq DCT ??
");
}
// printf("ext huffman table %d
",isInitHuffman);
if(!isInitHuffman) {
if(huffman_init() < 0)
return -ERR_BAD_TABLES;
}
/*
if (dscans[0].cid != 1 || dscans[1].cid != 2 || dscans[2].cid != 3) {
err = ERR_NOT_YCBCR_221111;
goto error;
}
if (dscans[1].hv != 0x11 || dscans[2].hv != 0x11) {
err = ERR_NOT_YCBCR_221111;
goto error;
}
*/
/* if internal width and external are not the same or heigth too
and pic not allocated realloc the good size and mark the change
need 1 macroblock line more ?? */
if (intwidth != *width || intheight != *height || *pic == NULL) {
*width = intwidth;
*height = intheight;
// BytesperPixel 2 yuyv , 3 rgb24
*pic =
(unsigned char *) realloc((unsigned char *) *pic,
(size_t) intwidth * (intheight +
8) * 2);
}
switch (dscans[0].hv) {
case 0x22: // 411
mb=6;
mcusx = *width >> 4;
mcusy = *height >> 4;
bpp=2;
xpitch = 16 * bpp;
pitch = *width * bpp; // YUYV out
ypitch = 16 * pitch;
convert = yuv420pto422;
break;
case 0x21: //422
mb=4;
mcusx = *width >> 4;
mcusy = *height >> 3;
bpp=2;
xpitch = 16 * bpp;
pitch = *width * bpp; // YUYV out
ypitch = 8 * pitch;
convert = yuv422pto422;
break;
printf("MY VALUES: %dx%d
",*width,*height);
case 0x11: //444
mcusx = *width >> 3;
mcusy = *height >> 3;
bpp=2;
xpitch = 8 * bpp;
pitch = *width * bpp; // YUYV out
ypitch = 8 * pitch;
if (info.ns==1) {
mb = 1;
convert = yuv400pto422;
} else {
mb=3;
convert = yuv422pto422;
}
break;
default:
err = ERR_NOT_YCBCR_221111;
goto error;
break;
}
idctqtab(quant[dscans[0].tq], decdata->dquant[0]);
idctqtab(quant[dscans[1].tq], decdata->dquant[1]);
idctqtab(quant[dscans[2].tq], decdata->dquant[2]);
setinput(&in, datap);
dec_initscans();
dscans[0].next = 2;
dscans[1].next = 1;
dscans[2].next = 0; /* 4xx encoding */
for (my = 0,y=0; my < mcusy; my++,y+=ypitch) {
for (mx = 0,x=0; mx < mcusx; mx++,x+=xpitch) {
if (info.dri && !--info.nm)
if (dec_checkmarker()) {
err = ERR_WRONG_MARKER;
goto error;
}
switch (mb){
case 6: {
decode_mcus(&in, decdata->dcts, mb, dscans, max);
idct(decdata->dcts, decdata->out, decdata->dquant[0],
IFIX(128.5), max[0]);
idct(decdata->dcts + 64, decdata->out + 64,
decdata->dquant[0], IFIX(128.5), max[1]);
idct(decdata->dcts + 128, decdata->out + 128,
decdata->dquant[0], IFIX(128.5), max[2]);
idct(decdata->dcts + 192, decdata->out + 192,
decdata->dquant[0], IFIX(128.5), max[3]);
idct(decdata->dcts + 256, decdata->out + 256,
decdata->dquant[1], IFIX(0.5), max[4]);
idct(decdata->dcts + 320, decdata->out + 320,
decdata->dquant[2], IFIX(0.5), max[5]);
}
break;
case 4:
{
decode_mcus(&in, decdata->dcts, mb, dscans, max);
[b]
ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*)&decdata->dcts);
ret = clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&decdata->out);
ret = clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&decdata->dquant[0]);
ret = clSetKernelArg(kernel, 3, sizeof(cl_long), IFIX(128.5));
ret = clSetKernelArg(kernel, 4, sizeof(int), &max[0]);
ret = clEnqueueNDRangeKernel(command_queue, kernel, 2, GWS, NULL, NULL, 0, NULL, NULL);
//ret= clWaitForEvents(1, &kernelDone);
//
ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*)&decdata->dcts + 64);
ret = clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&decdata->out + 64);
ret = clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&decdata->dquant[0]);
ret = clSetKernelArg(kernel, 3, sizeof(cl_long), IFIX(128.5));
ret = clSetKernelArg(kernel, 4, sizeof(int), &max[1]);
ret = clEnqueueNDRangeKernel(command_queue, kernel, 2, GWS, NULL, NULL, 0, NULL, NULL);
//ret= clWaitForEvents(2, &kernelDone);
//
ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*)&decdata->dcts + 128);
ret = clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&decdata->out + 256);
ret = clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&decdata->dquant[1]);
ret = clSetKernelArg(kernel, 3, sizeof(cl_long), IFIX(0.5));
ret = clSetKernelArg(kernel, 4, sizeof(int), &max[4]);
ret = clEnqueueNDRangeKernel(command_queue, kernel, 2, GWS, NULL, NULL, 0, NULL, NULL);
//ret= clWaitForEvents(3, &kernelDone);
//
ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void*)&decdata->dcts + 192);
ret = clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&decdata->out + 320);
ret = clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&decdata->dquant[2]);
ret = clSetKernelArg(kernel, 3, sizeof(cl_long), IFIX(0.5));
ret = clSetKernelArg(kernel, 4, sizeof(int), &max[5]);
ret = clEnqueueNDRangeKernel(command_queue, kernel, 2, GWS, NULL, NULL, 0, NULL, NULL);
//ret= clWaitForEvents(4, &kernelDone);
//
// ret = clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL, GWS, NULL, 0, NULL, NULL);
//
// ret = clEnqueueReadBuffer(command_queue, clOutput, CL_TRUE, 0, 8 * 8 * sizeof(int), decdata->out, 0, NULL, NULL);
// clFinish(command_queue);
[/b]
//
// idct(decdata->dcts, decdata->out, decdata->dquant[0],
// IFIX(128.5), max[0]);
// idct(decdata->dcts + 64, decdata->out + 64,
// decdata->dquant[0], IFIX(128.5), max[1]);
// idct(decdata->dcts + 128, decdata->out + 256,
// decdata->dquant[1], IFIX(0.5), max[4]);
// idct(decdata->dcts + 192, decdata->out + 320,
// decdata->dquant[2], IFIX(0.5), max[5]);
}
break;
case 3:
decode_mcus(&in, decdata->dcts, mb, dscans, max);
// ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void *)&decdata->dcts);
// ret = clSetKernelArg(kernel, 1, sizeof(cl_int), (void *)&decdata->out);
// ret = clSetKernelArg(kernel, 2, sizeof(cl_int), (void *)&decdata->dquant[0]);
// ret = clSetKernelArg(kernel, 3, sizeof(cl_long), (void*)IFIX(128.5));
// ret = clSetKernelArg(kernel, 4, sizeof(cl_int), (void *)&max[0]);
//
//
// ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void *)&decdata->dcts + 64);
// ret = clSetKernelArg(kernel, 1, sizeof(cl_int), (void *)&decdata->out + 256);
// ret = clSetKernelArg(kernel, 2, sizeof(cl_int), (void *)&decdata->dquant[1]);
// ret = clSetKernelArg(kernel, 3, sizeof(cl_long), (void*)IFIX(0.5));
// ret = clSetKernelArg(kernel, 4, sizeof(cl_int), (void *)&max[4]);
//
// ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void *)&decdata->dcts + 128);
// ret = clSetKernelArg(kernel, 1, sizeof(cl_int), (void *)&decdata->out + 320);
// ret = clSetKernelArg(kernel, 2, sizeof(cl_int), (void *)&decdata->dquant[2]);
// ret = clSetKernelArg(kernel, 3, sizeof(cl_long), (void*)IFIX(0.5));
// ret = clSetKernelArg(kernel, 4, sizeof(cl_int), (void *)&max[5]);
// idct(decdata->dcts, decdata->out, decdata->dquant[0],
// IFIX(128.5), max[0]);
// idct(decdata->dcts + 64, decdata->out + 256,
// decdata->dquant[1], IFIX(0.5), max[4]);
// idct(decdata->dcts + 128, decdata->out + 320,
// decdata->dquant[2], IFIX(0.5), max[5]);
break;
case 1:
decode_mcus(&in, decdata->dcts, mb, dscans, max);
// ret = clSetKernelArg(kernel, 0, sizeof(cl_int), (void *)&decdata->dcts);
// ret = clSetKernelArg(kernel, 1, sizeof(cl_int), (void *)&decdata->out);
// ret = clSetKernelArg(kernel, 2, sizeof(cl_int), (void *)&decdata->dquant[0]);
// ret = clSetKernelArg(kernel, 3, sizeof(cl_long), (void*)IFIX(128.5));
// ret = clSetKernelArg(kernel, 4, sizeof(cl_int), (void *)&max[0]);
// idct(decdata->dcts, decdata->out, decdata->dquant[0],
// IFIX(128.5), max[0]);
break;
} // switch enc411
[b]ret= clReleaseKernel(kernel);[/b]
convert(decdata->out,*pic+y+x,pitch);
}
}
m = dec_readmarker(&in);
if (m != M_EOI) {
err = ERR_NO_EOI;
goto error;
}
if (decdata)
free(decdata);
return 0;
error:
if (decdata)
free(decdata);
return err;
}
I wrote the host code in the main file called
test.c
. It is as follows:
int main()
{
FILE *fr = fopen ("/root/Documents/lucmotion/mjpeg-master/test.mjpeg", "rb");
unsigned char *pic=NULL;
unsigned char *buf=malloc(63247);
fread(buf, 1, 63247, fr);
int width=0;
int height=0;
int out;
cl_platform_id platform_id;
cl_uint ret_num_platforms;
cl_device_id device_id;
cl_uint ret_num_devices;
cl_context context;
cl_command_queue command_queue;
cl_program program;
size_t kernel_code_size;
char *kernel_src_str;
int *result;
cl_int ret;
int storeResult;
FILE *fp;
/* Get Platform */
ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
/* Get Device */
ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id,
&ret_num_devices);
/* Create Context */
context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
/* Create Command Queue */
command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
/* Read Kernel Code */
fp = fopen("/root/Documents/lucmotion/mjpeg-master/jpegDecode.cl", "r");
kernel_code_size = fread(kernel_src_str, 1, MAX_SOURCE_SIZE, fp);
fclose(fp);
//
/* Create Program Object */
program = clCreateProgramWithSource(context, 1, (const char **)&kernel_src_str,
(const size_t *)&kernel_code_size, &ret);
/* Compile kernel */
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
//working with idct
storeResult = jpeg_decode(&pic, buf, &width, &height);
//printf("ret %d
", ret);
printf("ptr %ld
", (long) pic);
printf("height %d
", height);
printf("width %d
", width);
ret = clFlush(command_queue);
ret = clFinish(command_queue);
ret = clReleaseProgram(program);
ret = clReleaseCommandQueue(command_queue);
ret = clReleaseContext(context);
//Write greyscale (Y) information as PPM image
if (pic != NULL)
{
WritePPMImage("HalfImage.ppm", pic, height, width);
printf("Written nWimage.ppm
");
}
return 0;
}
I would greatly appreciate it if someone can tell me what I am doing wrong with my code.
Waiting eagerly for your response.