Does a non-blocking clEnqueueWriteBuffer really return immediately?

I tried both the blocking and the non-blocking version of clEnqueueWriteBuffer, and it seems that the non-blocking version is not faster than the blocking one.

#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>

#include <CL/cl.h>

/*
 * Return the current wall-clock time in microseconds, as reported by
 * gettimeofday().
 *
 * The seconds field is widened to long long before it is scaled so the
 * multiplication cannot overflow on platforms where time_t is 32 bits wide.
 */
long long get_time() {
    struct timeval tv;

    gettimeofday(&tv, NULL);
    return (long long)tv.tv_sec * 1000000LL + (long long)tv.tv_usec;
}

/*
 * Measure how long a non-blocking clEnqueueWriteBuffer() call takes to
 * return to the host.
 *
 * Sets up the first platform's first GPU device, creates a device buffer of
 * 2,500,000 floats, enqueues a non-blocking write of a host buffer into it,
 * and prints the time spent inside the enqueue call (in seconds). Only the
 * enqueue call itself is timed; clFinish() runs afterwards so the transfer
 * has completed before the host buffer is released.
 *
 * Returns EXIT_SUCCESS on success, EXIT_FAILURE if allocation or any OpenCL
 * call fails (a diagnostic is written to stderr).
 */
int main(int argc, char *argv[])
{
    (void)argc;
    (void)argv;

    int rc = EXIT_FAILURE;
    cl_int status = CL_SUCCESS; /* Result of the most recent OpenCL call */
    cl_platform_id platform;
    cl_device_id device;
    cl_context context = NULL;
    cl_command_queue queue = NULL;
    cl_mem device_buffer = NULL;

    long long time0, time1;
    const size_t n = 2500000;
    const size_t mem_size = sizeof(float) * n;

    /* calloc so the data uploaded to the device is initialised. */
    float *host_buffer = calloc(n, sizeof *host_buffer);
    if (host_buffer == NULL) {
        fprintf(stderr, "failed to allocate %zu bytes of host memory\n", mem_size);
        goto cleanup;
    }

    status = clGetPlatformIDs(1, &platform, NULL);
    if (status != CL_SUCCESS) {
        fprintf(stderr, "clGetPlatformIDs failed (%d)\n", (int)status);
        goto cleanup;
    }

    status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
    if (status != CL_SUCCESS) {
        fprintf(stderr, "clGetDeviceIDs failed (%d)\n", (int)status);
        goto cleanup;
    }

    context = clCreateContext(NULL, 1, &device, NULL, NULL, &status);
    if (status != CL_SUCCESS) {
        fprintf(stderr, "clCreateContext failed (%d)\n", (int)status);
        goto cleanup;
    }

    queue = clCreateCommandQueue(context, device, 0, &status);
    if (status != CL_SUCCESS) {
        fprintf(stderr, "clCreateCommandQueue failed (%d)\n", (int)status);
        goto cleanup;
    }

    /*
     * host_ptr must be NULL here: passing a host pointer without
     * CL_MEM_USE_HOST_PTR or CL_MEM_COPY_HOST_PTR is rejected with
     * CL_INVALID_HOST_PTR. The data is transferred by the explicit
     * write below instead.
     */
    device_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, mem_size, NULL, &status);
    if (status != CL_SUCCESS) {
        fprintf(stderr, "clCreateBuffer failed (%d)\n", (int)status);
        goto cleanup;
    }

    /* Time only the enqueue call; CL_FALSE requests a non-blocking write. */
    time0 = get_time();
    status = clEnqueueWriteBuffer(queue, device_buffer, CL_FALSE, 0, mem_size, host_buffer, 0, NULL, NULL);
    time1 = get_time();
    if (status != CL_SUCCESS) {
        fprintf(stderr, "clEnqueueWriteBuffer failed (%d)\n", (int)status);
        goto cleanup;
    }

    /* Wait for the transfer to finish before host_buffer is freed. */
    status = clFinish(queue);
    if (status != CL_SUCCESS) {
        fprintf(stderr, "clFinish failed (%d)\n", (int)status);
        goto cleanup;
    }

    /* Elapsed microseconds converted to seconds, in double precision. */
    printf("%.10f\n\n", (double)(time1 - time0) / 1000000.0);

    rc = EXIT_SUCCESS;

cleanup:
    if (device_buffer != NULL) {
        clReleaseMemObject(device_buffer);
    }
    if (queue != NULL) {
        clReleaseCommandQueue(queue);
    }
    if (context != NULL) {
        clReleaseContext(context);
    }
    free(host_buffer);

    return rc;
}

I changed the 3rd argument of clEnqueueWriteBuffer to try the blocking version, but both of them take about 0.0037 sec. (I tested on a Tesla M2050.)

Why does the non-blocking call take as long as the blocking one? Is my code incorrect?

Thank you

I once read that the buffer contents do not need to be available in OpenCL until they are needed by a kernel. You should try calling a kernel after writing the data to make sure all the data is available in the OpenCL implementation. This might give you a different timing.