OpenCL memory allocation problem

I’m using OpenCL on Android and am processing video’s.
I can process images and frame’s from video’s. When I use my inverse filter (no for loops in it), I can process all the frames without initializing the OpenCL objects again. But when I use a filter with a for loop, only the first frame will be processed and all the others will be black.

Example:
1)
init function
execute kernel with for loops
remove opencl function
=> all goes fine

init function
execute kernel without for loops
execute kernel without for loops on next frame
remove opencl function
=> all goes fine

init function
execute kernel with for loops
execute kernel with for loops on next frame
remove opencl function
=> first frame is processed, 2th frame is black

I’m using OpenCL1.1 and am looking for a way to solve this issue. As far as my knowledge goes, it must be something with memory allocation?

Init code:


struct OpenCLObjects
{
	cl_platform_id platform;
	cl_device_id device;
	cl_context context;
	cl_command_queue queue;
	cl_program program;
	cl_kernel kernel;
	bool isInputBufferInitialized;
	cl_mem inputBuffer;
	cl_mem outputBuffer;
};

static OpenCLObjects openCLObjects;

void initOpenCL
(
		JNIEnv* env,
		jobject thisObject,
		jstring kernelName,
		cl_device_type required_device_type,
		OpenCLObjects& openCLObjects
)
{

	using namespace std;


	openCLObjects.isInputBufferInitialized = false;

	cl_int err = CL_SUCCESS;

	/* 
	 * Step 1: Get the first platform
	 */
	cl_platform_id platform;
	err = clGetPlatformIDs(1, &platform, NULL);
	SAMPLE_CHECK_ERRORS(err);

	cl_uint i = 0;
	size_t platform_name_length = 0;
	err = clGetPlatformInfo(
			platform,
			CL_PLATFORM_NAME,
			0,
			0,
			&platform_name_length
	);
	SAMPLE_CHECK_ERRORS(err);

	openCLObjects.platform = platform;
	/* 
	 * Step 2: Create context with a device of the specified type (required_device_type).
	 */

	cl_context_properties context_props[] = {
			CL_CONTEXT_PLATFORM,
			cl_context_properties(openCLObjects.platform),
			0
	};

	openCLObjects.context =
			clCreateContextFromType
			(
					context_props,
					required_device_type,
					0,
					0,
					&err
			);
	SAMPLE_CHECK_ERRORS(err);
	/* 
	 * Step 3: Query for OpenCL device that was used for context creation.
	 */
	err = clGetContextInfo
			(
					openCLObjects.context,
					CL_CONTEXT_DEVICES,
					sizeof(openCLObjects.device),
					&openCLObjects.device,
					0
			);
	SAMPLE_CHECK_ERRORS(err);

	/*  
	 * Step 4: Create OpenCL program from its source code.
	 * The file name is passed by java.
	 * Convert the jstring to const char* and append the needed directory path.
	 */
	const char* fileName = env->GetStringUTFChars(kernelName, 0);
	std::string fileDir;
	fileDir.append("/data/data/com.denayer.ovsr/app_execdir/");
	fileDir.append(fileName);
	fileDir.append(".cl");
	std::string kernelSource = loadProgram(fileDir);
	const char* kernelSourceChar = kernelSource.c_str();

	openCLObjects.program =
			clCreateProgramWithSource
			(
					openCLObjects.context,
					1,
					&kernelSourceChar,
					0,
					&err
			);

	SAMPLE_CHECK_ERRORS(err);

	/*
	 * Build the program with defined BUILDOPT (build optimizations).
	 */
	err = clBuildProgram(openCLObjects.program, 0, 0, BUILDOPT, 0, 0);
	jstring JavaString = (*env).NewStringUTF("Code compiled succesful.");
	if(err == CL_BUILD_PROGRAM_FAILURE)
	{
		size_t log_length = 0;
		err = clGetProgramBuildInfo(
				openCLObjects.program,
				openCLObjects.device,
				CL_PROGRAM_BUILD_LOG,
				0,
				0,
				&log_length
		);
		SAMPLE_CHECK_ERRORS(err);

		vector<char> log(log_length);

		err = clGetProgramBuildInfo(
				openCLObjects.program,
				openCLObjects.device,
				CL_PROGRAM_BUILD_LOG,
				log_length,
				&log[0],
				0
		);
		SAMPLE_CHECK_ERRORS(err);

		LOGE
		(
				"Error happened during the build of OpenCL program.
Build log: %s",
				&log[0]
		);
		return;
	}

	/* 
	 * Step 6: Extract kernel from the built program.
	 */
	fileName = env->GetStringUTFChars(kernelName, 0);
	char result[100];   // array to hold the result.
	std::strcpy(result,fileName); // copy string one into the result.
	std::strcat(result,"Kernel"); // append string two to the result.
	openCLObjects.kernel = clCreateKernel(openCLObjects.program, result, &err);
	SAMPLE_CHECK_ERRORS(err);

	/* 
	 * Step 7: Create command queue.
	 */

	openCLObjects.queue =
			clCreateCommandQueue
			(
					openCLObjects.context,
					openCLObjects.device,
					0,     
					&err
			);
	SAMPLE_CHECK_ERRORS(err);

}

Execution code:

void nativeImage2DOpenCL
(
		JNIEnv* env,
		jobject thisObject,
		OpenCLObjects& openCLObjects,
		jobject inputBitmap,
		jobject outputBitmap
)
{
	using namespace std;

	timeval start;
	timeval end;

	gettimeofday(&start, NULL);

	AndroidBitmapInfo bitmapInfo;
	AndroidBitmap_getInfo(env, inputBitmap, &bitmapInfo);

	size_t bufferSize = bitmapInfo.height * bitmapInfo.stride;
	cl_uint rowPitch = bitmapInfo.stride / 4;

	cl_int err = CL_SUCCESS;

	void* inputPixels = 0;
	AndroidBitmap_lockPixels(env, inputBitmap, &inputPixels);

	cl_image_format image_format;
	image_format.image_channel_data_type=CL_UNORM_INT8;
	image_format.image_channel_order=CL_RGBA;

	openCLObjects.inputBuffer =
			clCreateImage2D(openCLObjects.context,
					CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
					&image_format,
					bitmapInfo.width,
					bitmapInfo.height,
					0,
					inputPixels,
					&err);
	SAMPLE_CHECK_ERRORS(err);

	openCLObjects.isInputBufferInitialized = true;

	AndroidBitmap_unlockPixels(env, inputBitmap);

	void* outputPixels = 0;
	AndroidBitmap_lockPixels(env, outputBitmap, &outputPixels);

	cl_mem outputBuffer =
			clCreateImage2D(openCLObjects.context,
					CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR,
					&image_format,
					bitmapInfo.width,
					bitmapInfo.height,
					0,
					outputPixels,
					&err);
	SAMPLE_CHECK_ERRORS(err);
	err = clSetKernelArg(openCLObjects.kernel, 0, sizeof(openCLObjects.inputBuffer), &openCLObjects.inputBuffer);
	SAMPLE_CHECK_ERRORS(err);
	err = clSetKernelArg(openCLObjects.kernel, 1, sizeof(outputBuffer), &outputBuffer);
	SAMPLE_CHECK_ERRORS(err);

	size_t globalSize[2] = { bitmapInfo.width, bitmapInfo.height };

	err = clEnqueueNDRangeKernel
			(
					openCLObjects.queue,
					openCLObjects.kernel,
					2,
					0,
					globalSize,
					0,
					0, 0, 0
			);
	SAMPLE_CHECK_ERRORS(err);

	err = clFinish(openCLObjects.queue);
	SAMPLE_CHECK_ERRORS(err);

    const size_t origin[3] = {0, 0, 0};
    const size_t region[3] = {bitmapInfo.width, bitmapInfo.height, 1};

	err = clEnqueueReadImage(
			openCLObjects.queue,
			outputBuffer,
			true,
			origin,
			region,
			0,
			0,
			outputPixels,
			0,
			0,
			0);
	SAMPLE_CHECK_ERRORS(err);


	// Call clFinish to guarantee that the output region is updated.
	err = clFinish(openCLObjects.queue);
	SAMPLE_CHECK_ERRORS(err);

	err = clReleaseMemObject(outputBuffer);
	SAMPLE_CHECK_ERRORS(err);

	// Make the output content be visible at the Java side by unlocking
	// pixels in the output bitmap object.
	AndroidBitmap_unlockPixels(env, outputBitmap);

}

Kernel code inverse:


_kernel void inverseKernel(__read_only  image2d_t  srcImage,
                          __write_only image2d_t  dstImage)
{ 
    const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
                               CLK_ADDRESS_REPEAT        |
                               CLK_FILTER_NEAREST;
     int x = get_global_id(0);
     int y = get_global_id(1);
     int2 coords = (int2) (x,y);

    float4 centerPixel = read_imagef(srcImage,sampler,coords);
    centerPixel.x = 1-centerPixel.x;
    centerPixel.y = 1-centerPixel.y;
    centerPixel.z = 1-centerPixel.z;
    write_imagef(dstImage,coords,centerPixel);	
}

Edge kernel (with for loop):


__kernel void edgeKernel(__read_only  image2d_t  srcImage,
                          __write_only image2d_t  dstImage)
{    
    const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
                               CLK_ADDRESS_REPEAT         |
                               CLK_FILTER_NEAREST;
    int x = get_global_id(0);
    int y = get_global_id(1);
	int2 coords = (int2) (x,y);

	int i = 0;
	int j = 0;
	float4 bufferPixel;
	float4 currentPixel;
	float sum = 0;
	int counter = 0;
	const float edgeKernel[9] = {0.0f,1.0f,0.0f,1.0f,-4.0f,1.0f,0.0f,1.0f,0.0f};
	currentPixel = read_imagef(srcImage,sampler,coords);
	for(i=-1;i<=1;i++)
	{
		for(j=-1;j<=1;j++)
		{
		coords = (int2)((x+i),(y+j));
	    bufferPixel = read_imagef(srcImage,sampler,coords);
	    //sum = sum + (bufferPixel.y * edgeKernel[counter]);
	    sum = mad(bufferPixel.y,edgeKernel[counter],sum);
	    counter++;
		}
	}
	if(sum>255) sum=255;
	if(sum<0) sum=0;

	currentPixel.x=sum;
	currentPixel.y=sum;
	currentPixel.z=sum;

	write_imagef(dstImage,coords,currentPixel);	                          

}

All code can be found here:

OpenCL code is in the JNI folder, the kernels are in the assets folder

You are using unnormalized integer coordinates with read_imagef(), so your sampler should be

const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |
CLK_ADDRESS_CLAMP_TO_EDGE |
CLK_FILTER_NEAREST;