#include <stdio.h>
#include <conio.h>
#include <windows.h>
#include <assert.h>
#include <sys/stat.h>
#include <stdlib.h>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <cv.h>
#include <cxcore.h>
#include <highgui.h>
#include <oclUtils.h>
#include <string.h>
#include <math.h>
#include <time.h>
#include <sys/timeb.h>
#define MAX_SOURCE_SIZE (0x100000)
float *srcA, *srcB; // Host buffers for OpenCL test
float *CPUoutput, *GPUoutput; // Host buffer for GPU and CPU processing
// OpenCL Vars
cl_platform_id cpPlatform; // OpenCL platform
cl_device_id cdDevice = NULL,CPU = NULL; // OpenCL device
cl_context cxGPUContext; // OpenCL context
cl_command_queue cqCommandQueue; // OpenCL command que
cl_program cpProgram; // OpenCL program
cl_kernel ckKernel; // OpenCL kernel
cl_mem cmDevSrcA; // OpenCL device source buffer A
cl_mem cmDevSrcB; // OpenCL device source buffer B
cl_mem cmDevDst; // OpenCL device destination buffer
cl_uint ret_num_platforms;
size_t szGlobalWorkSize[2]; // Total # of work items in the 2D range
size_t szLocalWorkSize[2]; // # of work items in the 2D work group
cl_event GPUevent,transferevent; // OpenCL event
cl_ulong start; // To store start clock time
cl_ulong end; // To store end clock time
//cl_event ev;
//size_t szParmDataBytes; // Byte size of context information
//size_t szKernelLength; // Byte size of kernel code
cl_int ciErrNum = 0; // Error check variable
char* cPathAndName = NULL; // var for full paths to data, src, etc.
char* cSourceCL = NULL; // Buffer to hold source for compilation
size_t source_size;
char *source_str;
size_t buffer_size;
float *img,*img1, *mask;
// demo config variables
int iNumElements = 0; // Length of entire array to be created (Image size)
int iFilterWidth = 3; // Specifies the width of the Filter for convolution
int iWidth = 0; // Image Width Holding variable
int iHeight = 0; // Image Height Holding variable
shrBOOL bNoPrompt = shrFALSE;
IplImage *srcimg, *tempFrame; //Image storing elements
//Elements helps in creating the buffers
size_t mem_sizeImage;
unsigned int sizeKernel;
size_t mem_sizeKernel;
// Forward Declarations
// *************************************************************************************
float* CPUConvolution(float* pfData1, float* pfData2);
void GPUDevicePreparation();
void GPUConvolution(float *inimage, float *mask,float *outimage);
void Cleanup (int iExitCode);
double getclock();
void (*pCleanup)(int) = &Cleanup;
void readsourceimage();
//Function Definition
//=====================================================================================
// Function to read an image and make it to an two dimensional array
//=====================================================================================
void readsourceimage()
{
srcimg = cvLoadImage(“road_6000_800.jpg”,0);
tempFrame = cvLoadImage(“road_6000_800.jpg”,0);
iWidth = srcimg->width;
iHeight = srcimg->height;
iNumElements = iWidth * iHeight;
IplImage *dstimg = cvCreateImageHeader(cvSize(iWidth,iHeight),IPL_DEPTH_32F,1);
cvNamedWindow ("Inputimage",1);
cvShowImage ("Inputimage",srcimg);
cvWaitKey();
BYTE *inimg = (BYTE *) srcimg->imageData;
// ----------------------- Dynamic Mem in C++ -----------------------
img = new float [iNumElements];
img1 = new float [iNumElements];
for (int i=0; i< iNumElements; i++)
{
img[i] = (float) inimg[i];
}
for (int i=0; i< iNumElements; i++)
{
img1[i] = (float) inimg[i];
}
}
//$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
// Reading the kernel source file to access it
//$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
void loadprogramsource(const char *filename)
{
FILE *fp;
fp = fopen(filename,“r”);
if(!fp)
{
fprintf(stderr,"Failed to load kernel
");
exit(1);
}
source_str = (char *)malloc(MAX_SOURCE_SIZE);
source_size = fread(source_str,1,MAX_SOURCE_SIZE,fp);
fclose(fp);
}
//-------------------------------------------------------------------------------------------
// GPU Device Preparation and loading all the necessary variable and building the profgram
//-------------------------------------------------------------------------------------------
void GPUDevicePreparation()
{
shrLog("GPU Device Preparation for building and execution…
");
// Get the NVIDIA platform
shrLog("Get the NVIDIA platform...
");
ciErrNum = oclGetPlatformID(&cpPlatform);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
//Find GPU CL device, which is really needed for processing
shrLog("Getting the GPU device...
");
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &cdDevice, NULL);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
// Create the context
shrLog("clCreateContext...
");
cxGPUContext = clCreateContext(0, 1, &cdDevice, NULL, NULL, &ciErrNum);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
// Create a command-queue
shrLog("clCreateCommandQueue...
");
cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevice, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
// Place here some thing
const char fileName[] = "./Convolution.cl";
loadprogramsource(fileName);
// Create the program
shrLog("clCreateProgramWithSource...
");
cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char *)&source_str, (const size_t)&source_size, &ciErrNum);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
// Build the program with 'mad' Optimization option
//#ifdef MAC
// char* flags = "-cl-fast-relaxed-math -DMAC";
//#else
char* flags = "-cl-fast-relaxed-math";
//#endif
shrLog("clBuildProgram...
");
ciErrNum = clBuildProgram(cpProgram, 1, &cdDevice, flags, NULL, NULL);
if (ciErrNum != CL_SUCCESS)
{
// write out standard error, Build Log and PTX, then cleanup and exit
shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext));
oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclConvolution.ptx");
Cleanup(EXIT_FAILURE);
}
// Create the kernel
shrLog("clCreateKernel (Convolution)...
");
ckKernel = clCreateKernel(cpProgram, “Convolution”, &ciErrNum);
}
//§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§
// GPU Convolution starts in this function
//§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§§
void GPUConvolution(float *inimage, float *mask, float *outimage)
{
long ts,te,tst,tet;
mem_sizeImage = sizeof(float) * iNumElements;
sizeKernel = iFilterWidth * iFilterWidth;
mem_sizeKernel = sizeof(float) * sizeKernel;
unsigned char * out1;
//ts = GetTickCount();
// Allocate the OpenCL buffer memory objects for source and result on the device GMEM
//shrLog("clCreateBuffer (SrcA, SrcB and GPUoutput in Device GMEM)...
");
cmDevSrcA = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, mem_sizeImage, NULL, &ciErrNum);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
cmDevSrcB = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, mem_sizeKernel, NULL, &ciErrNum);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
cmDevDst = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, mem_sizeImage, NULL, &ciErrNum);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
//te = GetTickCount();
//printf("Create Buffer: %d [Milli sec]
",(te - ts));
//ts = GetTickCount();
// Asynchronous write of data to GPU device
//shrLog("clEnqueueWriteBuffer (SrcA and SrcB)...
");
ciErrNum = clEnqueueWriteBuffer(cqCommandQueue, cmDevSrcA, CL_FALSE, 0, mem_sizeImage,(void*) inimage, 0, NULL, &GPUevent);
ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue, cmDevSrcB, CL_FALSE, 0, mem_sizeKernel,(void*) mask, 0, NULL, &GPUevent);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
clFinish(cqCommandQueue);
//te = GetTickCount();
//printf("Write Buffer: %d [Milli sec]
",(te - ts));
//ts = GetTickCount();
// Set the Argument values
//shrLog("clSetKernelArg 0 - 4...
");
ciErrNum = clSetKernelArg(ckKernel, 0, sizeof(cl_mem), &cmDevSrcA);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
ciErrNum = clSetKernelArg(ckKernel, 1, sizeof(cl_mem), &cmDevSrcB);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
ciErrNum = clSetKernelArg(ckKernel, 2, sizeof(cl_mem), &cmDevDst);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
ciErrNum = clSetKernelArg(ckKernel, 3, sizeof(cl_int), &iWidth);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
ciErrNum = clSetKernelArg(ckKernel, 4, sizeof(cl_int), &iFilterWidth);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
//te = GetTickCount();
//printf("Set Kernel Arguments: %d [Milli sec]
",(te - ts));
//Setting the local and global parameters
szGlobalWorkSize[0] = iWidth;
szGlobalWorkSize[1] = iHeight;
szLocalWorkSize[0] = 1;
szLocalWorkSize[1] = 1;
//ts = GetTickCount();
//clGetEventProfilingInfo(GPUevent, CL_PROFILING_COMMAND_START, sizeof(cl_ulong),&start,NULL);
// Launch kernel
//shrLog("clEnqueueNDRangeKernel (Convolution)...
");
//ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckKernel, 2, NULL, szGlobalWorkSize, szLocalWorkSize, 0, NULL, &GPUevent);
ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckKernel, 2, NULL, szGlobalWorkSize, szLocalWorkSize, 0, NULL, NULL);
oclCheckErrorEX(ciErrNum,CL_SUCCESS, pCleanup); //CL_INVALID_KERNEL,CL_SUCCESS
//tst = GetTickCount();
//clWaitForEvents(1,&GPUevent);
//tet = GetTickCount();
//clFinish(cqCommandQueue);
//clGetEventProfilingInfo(GPUevent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong),&end,NULL);
//te = GetTickCount();
//printf("Launch Kernel: %d [Milli sec]
",(te - ts));
//printf("Event Wait Time for Kernel Execution: %d [Milli sec]
",(tet - tst));
//printf("Kernel Execution time: %ld [nano sec]
",(end-start));
//ts = GetTickCount();
//clGetEventProfilingInfo(transferevent, CL_PROFILING_COMMAND_START, sizeof(cl_ulong),&start,NULL);
// Read back results and check accumulated errors
//shrLog("clEnqueueReadBuffer (GPU Output)...
");
//ciErrNum = clEnqueueReadBuffer(cqCommandQueue, cmDevDst, CL_FALSE, 0, mem_sizeImage, outimage, 0, NULL, &transferevent);
ciErrNum = clEnqueueReadBuffer(cqCommandQueue, cmDevDst, CL_TRUE, 0, mem_sizeImage, outimage, 0, NULL,NULL);
//Sleep(50);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
//clWaitForEvents(1,&transferevent);
//clGetEventProfilingInfo(transferevent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong),&end,NULL);
//te = GetTickCount();
//printf("Read Buffer: %d [Milli sec]
",(te - ts));
//printf("Data copying time back from GPU: %ld [nano sec]
",(end-start));
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
//clFinish(cqCommandQueue);
}
//**********************************************************************
// “CPU” Host processing Convolution function for comparison purposes
// *********************************************************************
float* CPUConvolution(float *inimage, float *mask)
{
//printf("
Entered into the CPU Convolution Function
");
int k = 0;
float *pfResult;
// ----------------------- Dynamic Mem in C++ -----------------------
pfResult = new float [iNumElements];
int idx = 0;
int jdx = 0;
for (int i = 0; i < iHeight; i++)
{
for (int j = 0; j < iWidth; j++)
{
float sum = 0;
for (int m = 0; m < iFilterWidth; m++)
{
for (int n = 0; n < iFilterWidth; n++)
{
idx = i - m;
jdx = j - n;
if (idx >= 0 && jdx >= 0)
sum += img[(i * iWidth) + j + n] * mask[(m * iFilterWidth) + n];
}
}
if (sum > 255)
sum = 255;
if (sum < 0)
sum = 0;
pfResult[k] = sum;
k++;
}
}
return(pfResult);
}
// **********************************************************************************************
// Main function
// **********************************************************************************************
int main(int argc, char **argv)
{
printf("The program execution has started(Entered into main)…
");
//float kernelconv[] = {1,1,1,1,1,1,0,1,0,1,1,1,-4,1,1,1,0,1,0,1,1,1,1,1,1};
float kernelconv[] = {0,1,0,1,-4,1,0,1,0,};
long ts,te;
mask = kernelconv; //Creating the mask to execute the kernel
// Calling the function to load image for processing
readsourceimage();
// Creating a memory location to store the output of the GPU
GPUoutput = (float *)malloc(sizeof(cl_float) * iNumElements);
// Calling of GPU Device Preparation function
GPUDevicePreparation();
ts = GetTickCount();
// Calling of GPU convolution
//for(int i = 0; i < 10; i++)
//{
GPUConvolution(img,mask,GPUoutput);
//}
te = GetTickCount();
printf("
Execution time on GPU: %d [Milli sec]
",(te - ts));
ts = GetTickCount();
//Calling of CPU convolution
//for(int i = 0; i < 10; i++)
//{
CPUoutput = CPUConvolution(img,mask);
//}
te = GetTickCount();
printf("
Execution time on CPU: %d [Milli sec]
",(te - ts));
byte *tmp1, *tmp2;
tmp1 = new byte[iNumElements];
tmp2 = new byte[iNumElements];
//Retriving the results back from GPU
for(int i = 0;i < iNumElements; i++)
{
// printf("%f ",GPUoutput[i]);
tmp1[i] = (byte) floor(abs(GPUoutput[i]));
}
srcimg->imageData = (char *) tmp1;
srcimg->imageDataOrigin = srcimg->imageData;
//Retriving the results back from CPU
for(int i = 0;i < iNumElements; i++)
{
//printf("%f ",CPUoutput[i]);
tmp2[i] = (byte) floor(abs(CPUoutput[i]));
}
tempFrame->imageData = (char *) tmp2;
tempFrame->imageDataOrigin = tempFrame->imageData;
cvNamedWindow("GPU_output",1);
cvNamedWindow("CPU_output",1);
cvShowImage("GPU_output",srcimg);
cvShowImage("CPU_output",tempFrame);
clFinish(cqCommandQueue);
shrLog("
Press any Key to EXIT
");
cvWaitKey();
//cvReleaseImage(&srcimg);
//cvReleaseImage(&tempFrame);
//shrLog(LOGFILE, 0, "%f ", GPUoutput[i]);
} // End of MAIN
// Cleanup and exit code
// *********************************************************************
void Cleanup(int iExitCode)
{
// Cleanup allocated objects
shrLog("Starting Cleanup...
");
getchar();
if(cPathAndName)free(cPathAndName);
if(cSourceCL)free(cSourceCL);
if(ckKernel)clReleaseKernel(ckKernel);
if(cpProgram)clReleaseProgram(cpProgram);
if(cqCommandQueue)clReleaseCommandQueue(cqCommandQueue);
if(cxGPUContext)clReleaseContext(cxGPUContext);
if (cmDevSrcA)clReleaseMemObject(cmDevSrcA);
if (cmDevSrcB)clReleaseMemObject(cmDevSrcB);
if (cmDevDst)clReleaseMemObject(cmDevDst);
// Free host memory
free(srcA);
free(srcB);
free(GPUoutput);
free(CPUoutput);
// finalize logs and leave
if (bNoPrompt)
{
shrLogEx(LOGBOTH | CLOSELOG, 0, "oclConvolution.exe Exiting...
");
}
else
{
shrLogEx(LOGBOTH | CLOSELOG, 0, "oclConvolution.exe Exiting…
Press <Enter> to Quit
");
getchar();
}
//cvWaitKey();
getchar();
exit (iExitCode);
}