Nop, it did not work out.
Actually inside the kernel i am using ‘pos.x’ as height and ‘pos.y’ as width. That is why i used size_t workSize[2] = {h, w}
I think i might have done something wrong somewhere else, i am attaching the whole host code below. All the functions for allocation of the buffer are below the main function in the code
#include "yuv_reader.h"
#include "hypothesis_opencl.h"
#include "pugixml.hpp"
#include "config.h"
// Main function
// *********************************************************************
int main(int argc, char **argv)
{
cl_int retVal; //Return value
cl_int errNum;
// Check the command line params
if(argc != 2)
{
std::cout << "USAGE: OpenCLMatch <config_filename.xml>" << std::endl;
return 0;
}
// load config
pugi::xml_document confXml;
if(confXml.load_file(argv[1]) != pugi::status_ok)
{
std::cout << "ERROR: cannot open XML config file " << argv[1] << std::endl;
//return -2;
}
config conf(confXml);
if(!conf.initConfig())
{
std::cout << "ERROR: XML config file is not complete" << argv[1] << std::endl;
return -2;
}
//**************** Yuv file read *******************
int frameno = conf.start_frame;
const int h = conf.height,
w = conf.width,
format = 420,
radius = (conf.radius > RADIUS) ? MIN(conf.radius, MAXRADIUS) : RADIUS;
const float sigmaColor = (conf.sigma_color > 0) ? conf.sigma_color : SIGMA_COLOR,
sigmaDistance = (conf.sigma_distance > 0) ? conf.sigma_distance : SIGMA_DISTANCE,
searchLimit = (conf.search_limit > 0) ? conf.search_limit : SEARCH_LIMIT;
string filenameCL = conf.openCL_filename;
CIYuv CIYuv_col(h, w, format);
CIYuv CIYuv_col444(h, w, 444);
CIYuv CIYuv_depth(h, w, 400);
CIYuv CIYuv_filteredDepth(h, w, 400);
FILE *pf_read_col, *pf_read_depth, *pf_write, *errLog;
if((errLog = fopen("Debug/errLog.txt", "w")) == NULL)
{
std::cout << "Error::Failed to open errLog";
};
string fileName = conf.color_filename;
if((pf_read_col = fopen(fileName.c_str(), "rb")) == NULL)
{
std::cout << "main::fopen:: opening YUV color file failed
";
}
fileName = conf.depth_filename;
if((pf_read_depth = fopen(fileName.c_str(), "rb")) == NULL)
{
std::cout << "main::fopen:: opening YUV depth file failed
";
}
fileName = conf.filtered_filename;
if((pf_write = fopen(fileName.c_str(), "wb")) == NULL)
{
std::cout << "main::fopen:: opening YUV output depth file failed
";
}
// Get platforms
cl_uint nPlatform = 0;
retVal = clGetPlatformIDs(0, NULL, &nPlatform);
if(retVal != CL_SUCCESS)
{
std::cout << "Error::clGetPlatformIDs" << std::endl;
}
cl_platform_id* plInfos = (cl_platform_id*)malloc(nPlatform * sizeof(cl_platform_id));
retVal = clGetPlatformIDs(nPlatform, plInfos, NULL);
if(retVal != CL_SUCCESS)
{
std::cout << "Error::clGetPlatformIDs" << std::endl;
}
// Get device
cl_context GPUContext;
for (cl_uint i = 0; i < nPlatform; i++)
{
cl_uint nDev = 0;
retVal = clGetDeviceIDs(plInfos[i], CL_DEVICE_TYPE_GPU, 0, 0, &nDev);
if(retVal != CL_SUCCESS)
{
std::cout << "Error::clGetDeviceIDs" << std::endl;
}
cl_device_id* GPUDevices = (cl_device_id*)malloc(nDev * sizeof(cl_device_id));
retVal = clGetDeviceIDs(plInfos[i], CL_DEVICE_TYPE_GPU, nDev, GPUDevices, 0);
if(retVal != CL_SUCCESS)
{
std::cout << "Error::clGetDeviceIDs" << std::endl;
}
// Create a context to run OpenCL on our CUDA-enabled NVIDIA GPU
GPUContext = clCreateContext(NULL, nDev, GPUDevices, 0, 0, &errNum);
if(errNum != CL_SUCCESS)
{
std::cout << "ERROR: OpenCL::clCreateContext" << std::endl;
return -1;
}
}
// Get the list of GPU devices associated with this context
size_t ParmDataBytes;
retVal = clGetContextInfo(GPUContext, CL_CONTEXT_DEVICES, 0, NULL, &ParmDataBytes);
if(retVal != CL_SUCCESS)
{
std::cout << "Error::clGetContextInfo" << std::endl;
}
cl_device_id* GPUDevices = (cl_device_id*)malloc(ParmDataBytes);
retVal = clGetContextInfo(GPUContext, CL_CONTEXT_DEVICES, ParmDataBytes, GPUDevices, NULL);
if(retVal != CL_SUCCESS)
{
std::cout << "Error::clGetContextInfo" << std::endl;
}
// Create a command-queue on the first GPU device
cl_command_queue GPUCommandQueue = clCreateCommandQueue(GPUContext, GPUDevices[0], 0, &errNum);
if(errNum != CL_SUCCESS)
{
std::cout << "ERROR: OpenCL::clCreateCommandQueue" << std::endl;
return -1;
}
// Read OpenCL code from file
size_t sourceLength = 0;
char* sourceCL = readTextFile((char*)filenameCL.c_str(), &sourceLength);
if(sourceCL == NULL || sourceLength<10)
{
std::cout << "ERROR::main::readTextFile: Cannot open " << filenameCL << " file" << std::endl;
return -2;
}
// Create OpenCL program with source code
cl_program OpenCLProgram = clCreateProgramWithSource( GPUContext,
1,
(const char **)&sourceCL,
&sourceLength,
&errNum);
delete[] sourceCL;
if(errNum != CL_SUCCESS)
{
std::cout << "ERROR::main::clCreateProgramWithSource: Unable to create opencl program" << std::endl;
return -1;
}
// Build the program (OpenCL JIT compilation)
retVal = clBuildProgram(OpenCLProgram, 1, GPUDevices, NULL, NULL, NULL);
if(retVal != CL_SUCCESS)
{
std::cout << "ERROR::main::clBuildProgram: " << retVal << std::endl;
return -1;
}
size_t paramValueSize = 1024 * 1024,
param_value_size_ret;
char *paramValue;
paramValue = (char*)calloc(paramValueSize, sizeof(char));
retVal = clGetProgramBuildInfo( OpenCLProgram,
GPUDevices[0],
CL_PROGRAM_BUILD_LOG,
paramValueSize,
paramValue,
¶m_value_size_ret);
fprintf(errLog, paramValue);
fclose(errLog);
// Create a handle to the compiled OpenCL function (Kernel)
cl_kernel openCLKernel = clCreateKernel( OpenCLProgram,
"hypothesis_opencl",
&errNum);
if(errNum != CL_SUCCESS)
{
std::cout << "ERROR::main::clCreateKernel: " << errNum << std::endl;
return -1;
}
// Set kernel arguments
LocalBuffer buffLoc;
GPUBuffer buffGPU;
initLocalBuffer(&buffLoc, w, h);
retVal = initGPUBuffer(&buffGPU, GPUContext, w, h, searchLimit, radius);
if(retVal != CL_SUCCESS)
{
std::cout << "Error::initGPUBuffer" <<std::endl;
}
// Fill local buffer with appropriate values
// pre-calculation of color & distance weights
init_color_SAD(buffLoc.weightTable, sigmaColor, MAXDIFF);
init_distance(buffLoc.distTable, sigmaDistance, radius);
// Loop till the end of video file
while((CIYuv_depth.readOneFrame(pf_read_depth, frameno)) == true)
{
CIYuv_col.readOneFrame(pf_read_col, frameno);
std::cout << "Processing frmne no. " << frameno << "
";
// Set the YUV image format to 444
CIYuv_col444.setData444_inIYUV(&CIYuv_col);
int i = 0,
j = h * w,
k = h * w * 2;
// Initialize with some interesting data
// Copy Y U V data to the local buffer
for(int r = 0; r < h; r++)
{
for(int c = 0; c < w; c++)
{
buffLoc.frameCol[i] = CIYuv_col444.Y[r][c];
buffLoc.frameCol[j++] = CIYuv_col444.U[r][c];
buffLoc.frameCol[k++] = CIYuv_col444.V[r][c];
buffLoc.frameDepth[i++] = CIYuv_depth.Y[r][c];
}
}
// Copy the output in CPU memory TO GPU memory
retVal = fillGPUBuffer( GPUCommandQueue,
&CIYuv_col,
&CIYuv_depth,
&buffGPU,
&buffLoc );
if(retVal != CL_SUCCESS)
{
std::cout << "Error::fillGPUBuffer" <<std::endl;
}
retVal = initKernel( openCLKernel,
&CIYuv_col,
&CIYuv_depth,
&buffGPU);
if(retVal != CL_SUCCESS)
{
std::cout << "Error::initKernel" <<std::endl;
}
// Launch the Kernel on the GPU
size_t workSize[2] = {h, w}; // two dimensional Range
retVal = clEnqueueNDRangeKernel( GPUCommandQueue,
openCLKernel,
2,
NULL,
workSize,
NULL,
0,
NULL,
NULL);
if(retVal != CL_SUCCESS)
{
std::cout << "Error::clEnqueueNDRangeKernel" <<std::endl;
}
// Copy the output in GPU memory back to CPU memory
retVal = clEnqueueReadBuffer( GPUCommandQueue,
buffGPU.frameFilteredDepth,
CL_FALSE,
0,
(sizeof(float) * (w * h)),
buffLoc.frameFilteredDepth,
0,
NULL,
NULL);
if(retVal != CL_SUCCESS)
{
std::cout << "Error::clEnqueueReadBuffer" <<std::endl;
}
// Putting the output vector back to the image format
// Copy output vector back to Y
i = 0;
for(int r = 0; r < CIYuv_col.getHeight(); r++)
{
for(int c = 0; c < CIYuv_col.getWidth(); c++)
{
CIYuv_filteredDepth.Y[r][c] = (uchar)buffLoc.frameFilteredDepth[i++];
}
}
// Write to a YUV file
if((CIYuv_filteredDepth.writeOneFrame(pf_write)) != true)
{
std::cout << "ERROR:CIYuv_filteredDepth.writeOneFrame Write YUV file failed
";
}
// Increment the frame index
frameno ++;
}
// Cleanup
free(GPUDevices);
clReleaseKernel(openCLKernel);
clReleaseProgram(OpenCLProgram);
clReleaseCommandQueue(GPUCommandQueue);
clReleaseContext(GPUContext);
releaseGPUBuffer(&buffGPU);
releaseLocalBuffer(&buffLoc);
// Close all opened files
fclose(pf_read_col);
fclose(pf_read_depth);
fclose(pf_write);
_getch();
return 0;
}
int initGPUBuffer(GPUBuffer* buff, cl_context GPUContext, int width, int height, float searchLimit, int radius)
{
const int HW = height*width;
const int HW3 = HW*3;
cl_int errVal;
// Allocate GPU memory for source vectors AND initialize from CPU memory
buff->frameCol = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY , sizeof(uchar)*HW3, NULL, &errVal);
if(errVal != NULL)
{
std::cout << "Error::clCreateBuffer" << std::endl;
return -1;
}
buff->frameDepth = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY, sizeof(uchar)*HW, NULL, &errVal);
if(errVal != NULL)
{
std::cout << "Error::clCreateBuffer" << std::endl;
return -1;
}
buff->distTable = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY, sizeof(float)*MAXWND, NULL, &errVal);
if(errVal != NULL)
{
std::cout << "Error::clCreateBuffer" << std::endl;
return -1;
}
buff->weightTable = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY, sizeof(float)*MAXDIFF, NULL, &errVal);
if(errVal != NULL)
{
std::cout << "Error::clCreateBuffer" << std::endl;
return -1;
}
// Allocate output memory on GPU
buff->frameFilteredDepth = clCreateBuffer(GPUContext, CL_MEM_WRITE_ONLY, sizeof(float)*HW, NULL, &errVal);
if(errVal != NULL)
{
std::cout << "Error::clCreateBuffer" << std::endl;
return -1;
}
//Set constant scalar paremeters
buff->width = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, sizeof(int), &width, &errVal);
if(errVal != NULL)
{
std::cout << "Error::clCreateBuffer" << std::endl;
return -1;
}
buff->height = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, sizeof(int), &height, &errVal);
if(errVal != NULL)
{
std::cout << "Error::clCreateBuffer" << std::endl;
return -1;
}
buff->searchLimit = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, sizeof(float), &searchLimit, &errVal);
if(errVal != NULL)
{
std::cout << "Error::clCreateBuffer" << std::endl;
return -1;
}
buff->filterRadius = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, sizeof(int), &radius, &errVal);
if(errVal != NULL)
{
std::cout << "Error::clCreateBuffer" << std::endl;
return -1;
}
return CL_SUCCESS;
}
void releaseGPUBuffer(GPUBuffer* buff)
{
clReleaseMemObject(buff->frameCol);
clReleaseMemObject(buff->frameDepth);
clReleaseMemObject(buff->distTable);
clReleaseMemObject(buff->weightTable);
clReleaseMemObject(buff->frameFilteredDepth);
clReleaseMemObject(buff->width);
clReleaseMemObject(buff->height);
clReleaseMemObject(buff->searchLimit);
clReleaseMemObject(buff->filterRadius);
}
void initLocalBuffer(LocalBuffer* buff, int width, int height)
{
const int HW = height*width;
const int HW3 = HW*3;
buff->frameCol = new uchar[HW3];
buff->frameDepth = new uchar[HW];
buff->frameFilteredDepth = new float[HW];
buff->distTable = new float[MAXWND];
buff->weightTable = new float[MAXDIFF];
}
void releaseLocalBuffer(LocalBuffer* buff)
{
delete[] buff->frameCol;
delete[] buff->frameDepth;
delete[] buff->frameFilteredDepth;
delete[] buff->distTable;
delete[] buff->weightTable;
}
int fillGPUBuffer(cl_command_queue GPUCommandQueue, CIYuv* CIYuv_col, CIYuv* CIYuv_depth, GPUBuffer* buffGPU, LocalBuffer* buffLoc)
{
cl_int retVal;
retVal = clEnqueueWriteBuffer(GPUCommandQueue, buffGPU->frameCol, CL_TRUE, 0,
sizeof(uchar) * CIYuv_col->getSizeInByte(), buffLoc->frameCol, 0, NULL, NULL);
if(retVal != CL_SUCCESS)
{
std::cout << "Error::clEnqueueWriteBuffer" << std::endl;
return -1;
}
retVal = clEnqueueWriteBuffer(GPUCommandQueue, buffGPU->frameDepth, CL_TRUE, 0,
sizeof(uchar) * (CIYuv_depth->getHeight() * CIYuv_depth->getWidth()), buffLoc->frameDepth, 0, NULL, NULL);
if(retVal != CL_SUCCESS)
{
std::cout << "Error::clEnqueueWriteBuffer" << std::endl;
return -1;
}
retVal = clEnqueueWriteBuffer(GPUCommandQueue, buffGPU->frameFilteredDepth, CL_TRUE, 0,
sizeof(float) * (CIYuv_depth->getHeight() * CIYuv_depth->getWidth()), buffLoc->frameFilteredDepth, 0, NULL, NULL);
if(retVal != CL_SUCCESS)
{
std::cout << "Error::clEnqueueWriteBuffer" << std::endl;
return -1;
}
retVal = clEnqueueWriteBuffer(GPUCommandQueue, buffGPU->weightTable, CL_TRUE, 0,
sizeof(float) * MAXDIFF, buffLoc->weightTable, 0, NULL, NULL);
if(retVal != CL_SUCCESS)
{
std::cout << "Error::clEnqueueWriteBuffer" << std::endl;
return -1;
}
retVal = clEnqueueWriteBuffer(GPUCommandQueue, buffGPU->distTable, CL_TRUE, 0,
sizeof(float) * MAXWND, buffLoc->distTable, 0, NULL, NULL);
if(retVal != CL_SUCCESS)
{
std::cout << "Error::clEnqueueWriteBuffer" << std::endl;
return -1;
}
return CL_SUCCESS;
}
int initKernel(cl_kernel kernel, CIYuv* CIYuv_col, CIYuv* CIYuv_depth, GPUBuffer* buffGPU)
{
cl_int retVal;
retVal = clSetKernelArg(kernel, 0, sizeof(cl_mem), &buffGPU->frameCol);
if(retVal != CL_SUCCESS)
{
std::cout << "Error::clSetKernelArg" << std::endl;
return -1;
}
retVal = clSetKernelArg(kernel, 1, sizeof(cl_mem), &buffGPU->frameDepth);
if(retVal != CL_SUCCESS)
{
std::cout << "Error::clSetKernelArg" << std::endl;
return -1;
}
retVal = clSetKernelArg(kernel, 2, sizeof(cl_mem), &buffGPU->frameFilteredDepth);
if(retVal != CL_SUCCESS)
{
std::cout << "Error::clSetKernelArg" << std::endl;
return -1;
}
retVal = clSetKernelArg(kernel, 3, sizeof(cl_mem), &buffGPU->weightTable);
if(retVal != CL_SUCCESS)
{
std::cout << "Error::clSetKernelArg" << std::endl;
return -1;
}
retVal = clSetKernelArg(kernel, 4, sizeof(cl_mem), &buffGPU->distTable);
if(retVal != CL_SUCCESS)
{
std::cout << "Error::clSetKernelArg" << std::endl;
return -1;
}
retVal = clSetKernelArg(kernel, 5, sizeof(cl_mem), &buffGPU->height);
if(retVal != CL_SUCCESS)
{
std::cout << "Error::clSetKernelArg" << std::endl;
return -1;
}
retVal = clSetKernelArg(kernel, 6, sizeof(cl_mem), &buffGPU->width);
if(retVal != CL_SUCCESS)
{
std::cout << "Error::clSetKernelArg" << std::endl;
return -1;
}
retVal = clSetKernelArg(kernel, 7, sizeof(cl_mem), &buffGPU->filterRadius);
if(retVal != CL_SUCCESS)
{
std::cout << "Error::clSetKernelArg" << std::endl;
return -1;
}
retVal = clSetKernelArg(kernel, 8, sizeof(cl_mem), &buffGPU->searchLimit);
if(retVal != CL_SUCCESS)
{
std::cout << "Error::clSetKernelArg" << std::endl;
return -1;
}
return CL_SUCCESS;
}
char* readTextFile(char* filename, size_t* fileLength)
{
char* cSourceCL = NULL;
ifstream progin(filename);
if(progin.is_open())
{
progin.seekg(0, ios::end);
int srcLength = progin.tellg();
progin.seekg(0, ios::beg);
cSourceCL = new char[srcLength+1];
progin.read(cSourceCL, srcLength);
progin.close();
for(int idx = srcLength; idx > 0; idx--)
{
std::cout << cSourceCL[idx] <<" ";
if(cSourceCL[idx] == '}')
{
*fileLength = idx + 1;
cSourceCL[*fileLength] = '\0';
break;
}
}
}
return cSourceCL;
}
inline int int_SAD(int a, int b)
{
int sad = a - b;
return sad > 0 ? sad : -sad;
}
void init_color_SAD(float *weights_table, float sigmaColor, int max_cost)
{
for(int i=0; i<max_cost; i++)
{
weights_table[i] = exp(-((float)i)/sigmaColor);
}
}
void init_distance(float *distance, float sigmaDistance, int radius)
{
// pre-calculation of distance weights
for(int i=-radius, index = 0; i<=radius; i++)
{
for(int j=-radius; j<=radius; j++, index++)
{
distance[index] = exp(-sqrt((float)i*i+j*j)/sigmaDistance);
}
}
}