Qualcomm MSM ION (Zero Memory Copy) with OpenCL, and profiling tools

I am trying to use MSM ION (zero memory copy) on the Qcom 801 with an Adreno GPU (622, maybe).
I consulted the extension document provided by Khronos: https://www.khronos.org/registry/cl/extensions/qcom/cl_qcom_ion_host_ptr.txt
But when I use clEnqueueMapBuffer, I cannot map the pointer on the host side, and an error occurs while running the kernel.
When I try to use clEnqueueReadBuffer, a segmentation fault occurs.
When I use “mmap”, everything is OK: I can use the pointer provided by mmap to read and write the data.
But I’m afraid that the result may not be correct, because the data might not have been written back from the cache.

One more question: I want to profile my program on the Qcom 801 running a Linaro Linux system. Could you suggest some good tools?

Code:



        cl_mem_ion_host_ptr imageI_ion = {0};
	size_t		buffer_size_in_bytes     = 0;
	size_t               buffer_size_with_padding = 0;
	size_t               ext_mem_padding_in_bytes = 0;
	size_t               device_page_size         = 0;
	clGetDeviceInfo(device_id, CL_DEVICE_PAGE_SIZE_QCOM, sizeof(device_page_size), &device_page_size, NULL);
	clGetDeviceInfo(device_id, CL_DEVICE_EXT_MEM_PADDING_IN_BYTES_QCOM, sizeof(ext_mem_padding_in_bytes), &ext_mem_padding_in_bytes, NULL);

	buffer_size_in_bytes = sizeof(unsigned char) * WIDTH * HEIGHT *  3;
	buffer_size_with_padding = buffer_size_in_bytes + ext_mem_padding_in_bytes;
	//ION memory set up
	int rc;
	int ion_fd = open("/dev/ion", O_RDONLY );

	struct ion_allocation_data imageI_data;
	struct ion_fd_data imageI_fd;
	imageI_data.len = buffer_size_with_padding; 
	imageI_data.align = device_page_size; 
	imageI_data.heap_mask = ION_HEAP(ION_SYSTEM_HEAP_ID);
	imageI_data.flags = ION_HEAP(ION_CP_MM_HEAP_ID);
	
        rc = ioctl(ion_fd, ION_IOC_ALLOC, &imageI_data);
	if (rc < 0) { printf( "Failed to allocate uspace ion buffer!
 "); }
	imageI_fd.handle = imageI_data.handle;
	rc = ioctl(ion_fd, ION_IOC_SHARE, &imageI_fd);
	if (rc < 0) { printf( "unable to ion map buffer!
"); };

	/*Map and Load Data*/
	int map_fd;

	map_fd = imageI_fd.fd;
	unsigned char *imageI;
	imageI = (unsigned char *)mmap(NULL, imageI_data.len, PROT_READ | PROT_WRITE, MAP_SHARED , map_fd, 0);
	if (!imageI ) { printf( "mmap failed
"); }
	LoadImage( imageI, (size_t)WIDTH*HEIGHT*3, "color.bin");

	imageI_ion.ext_host_ptr.allocation_type    = CL_MEM_ION_HOST_PTR_QCOM;
	imageI_ion.ext_host_ptr.host_cache_policy  = CL_MEM_HOST_UNCACHED_QCOM; 
	imageI_ion.ion_filedesc                    = imageI_fd.fd;
	imageI_ion.ion_hostptr                     = imageI_fd.handle; 

	cl_color = clCreateBuffer(context, CL_MEM_USE_HOST_PTR | CL_MEM_EXT_HOST_PTR_QCOM, sizeof(unsigned char)*WIDTH*HEIGHT*3, &imageI_ion, &status);
	if((CL_SUCCESS != status) || NULL == cl_color){ fprintf(stderr, "Error: create buffer. %d
", status); exit(EXIT_FAILURE); }
        ...
        ...
        ...
// Disabled attempt: map cl_color on the host for writing, print the mapped
// pointer, then unmap it again.
//	unsigned char *color =  (unsigned char *)clEnqueueMapBuffer(command_queue, cl_color, CL_TRUE, CL_MAP_WRITE, 0,
//																sizeof(unsigned char) * WIDTH * HEIGHT * 3, 0,  NULL, NULL, &status );
//	if((CL_SUCCESS != status)){ fprintf(stderr, "Error: Kernel run %d\n", status); exit(EXIT_FAILURE); }
//	cout << "Pointer: " << (void *) color << endl;
//
//	clEnqueueUnmapMemObject(command_queue, cl_color, color, 0, NULL, NULL);
        ...
        ...
        ...
        status = clEnqueueNDRangeKernel( command_queue, kernel, work_dim, NULL, global_work_size, local_work_size, 0, NULL, &prof_event);


//      unsigned char * filterColor = (unsigned char *)clEnqueueMapBuffer(command_queue, cl_filterColor, CL_TRUE, CL_MAP_READ, 0,
//										sizeof(unsigned char) * WIDTH * HEIGHT * 3, 0,  NULL, NULL, &status );
//	if ((CL_SUCCESS != status) || (NULL == filterColor)){ cout << "Error map buffer: " << status << endl; }
//	status = clEnqueueReadBuffer(command_queue, cl_filterColor, CL_TRUE, 0, sizeof(unsigned char) * width * height * 3, ImgSrc, 0, NULL, NULL);
//	if((CL_SUCCESS != status)){ fprintf(stderr, "Error: read buffer result %d
", status); exit(EXIT_FAILURE); }
//



Whether it is called before or after clEnqueueNDRangeKernel, clEnqueueMap(Buffer, Image) returns without error, but the result is wrong. When I try to use clEnqueueReadBuffer, a segmentation fault occurs.
I printed the mapped pointer: it is 0x01, which is not a valid address, so I could not write data through it.
Could you please help me figure out the problem?
If you could also suggest a profiling tool, that would be even better!
Thank you very much!