cl_mem objects only work the first time

Hey, this has me stumped.

I create a set of cl_mem objects via calls such as…

// (Re)creates the OpenCL buffer objects used by this class. Only the
// m_CLBuffer_NumPointsArr buffer is shown; the remaining buffers are elided
// by the "..." below.
void CClass::ConstructOpenCLBuffers()
{
cl_int ciErrNum;

// One per z-plane
// Release any buffer left over from a previous call before creating a new one.
if( 0 != m_CLBuffer_NumPointsArr )
    clReleaseMemObject( m_CLBuffer_NumPointsArr );
// Buffer of m_MaxDepthCount ints, allocated from host-accessible memory.
// NOTE(review): CL_MEM_WRITE_ONLY means kernels may only write this buffer;
// RefreshData() fills it from the host as if it were kernel input. If any
// kernel reads it, CL_MEM_READ_ONLY or CL_MEM_READ_WRITE is needed - confirm.
m_CLBuffer_NumPointsArr = clCreateBuffer( m_CLGPUContext, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR,  m_MaxDepthCount*sizeof(int), NULL, &ciErrNum );
// Any non-zero code (0 is CL_SUCCESS) is reported, then OpenCL state is torn
// down and the assertion fires.
if( 0 != ciErrNum )
{
    HandleCreateBufferErrors( ciErrNum );    
    CleanupOpenCL();
    ASSERT( 0 );
}
...

}

I attempt to refresh the contents of the input memory buffers via something like

// Uploads the host-side m_NumPointsArr (m_MaxDepthCount ints) into
// m_CLBuffer_NumPointsArr: maps the buffer for writing (blocking), copies the
// host array into the mapped region, issues a blocking write from that region,
// and finally unmaps it. Each OpenCL status code is passed to the matching
// Handle*Errors helper.
// NOTE(review): ciErrNum is not declared in this excerpt - presumably a member
// or an elided local; confirm.
void CClass::RefreshData()
{
// Copy the numPoints data into the pinned memory buffer for it
int* NumPointsPinnedBufferData = (int*)clEnqueueMapBuffer( m_CLCommandQueue,
m_CLBuffer_NumPointsArr,
CL_TRUE,
CL_MAP_WRITE,
0,
m_MaxDepthCount*sizeof(int),
0, NULL,
NULL,
&ciErrNum );
HandleEnqueueMapBufferErrors( ciErrNum );

memcpy( NumPointsPinnedBufferData, m_NumPointsArr, m_MaxDepthCount*sizeof(int) );
// NOTE(review): the buffer is still mapped at this point and the source
// pointer is the mapped region itself. The OpenCL specification leaves
// enqueued writes to a mapped region of a memory object undefined; the memcpy
// above plus the unmap below already update the buffer, so this call looks
// redundant - confirm and consider removing it.
ciErrNum = clEnqueueWriteBuffer( m_CLCommandQueue, m_CLBuffer_NumPointsArr, CL_TRUE, 0, m_MaxDepthCount*sizeof(int), NumPointsPinnedBufferData, 0, NULL, NULL );
HandleEnqueueWriteBufferErrors( ciErrNum );

// The unmap is only enqueued here; nothing in this function waits for it.
ciErrNum = clEnqueueUnmapMemObject( m_CLCommandQueue,
m_CLBuffer_NumPointsArr,
(void*)NumPointsPinnedBufferData,
0,
NULL,
NULL );
HandleEnqueueUnmapBufferErrors( ciErrNum );

}

Then I run a series of kernels that write their output into one of these cl_mem objects (created with CL_MEM_READ_WRITE), and read it back via something very much like…

// Reads the kernel output back to the host: maps m_CLBuffer_resultBuffer for
// reading, copies numElements DWORDs into pCPUSideDataBuffer, then unmaps.
// NOTE(review): ciErrNum, numElements and pCPUSideDataBuffer are not declared
// in this excerpt - presumably members or elided locals; confirm.
void CClass::GetResult()
{
// Blocking map (CL_TRUE) of numElements*sizeof(DWORD) bytes from offset 0.
DWORD* GPUBuffer = (DWORD*)clEnqueueMapBuffer( m_CLCommandQueue,
m_CLBuffer_resultBuffer,
CL_TRUE,
CL_MAP_READ,
0,
numElements*sizeof(DWORD),
0, NULL,
NULL,
&ciErrNum );
HandleEnqueueMapBufferErrors( ciErrNum );

// Copy the mapped contents into the CPU-side destination buffer.
memcpy( pCPUSideDataBuffer, GPUBuffer, numElements*sizeof(DWORD) );

// The unmap is only enqueued here; nothing in this function waits for it.
ciErrNum = clEnqueueUnmapMemObject( m_CLCommandQueue, 
                                        m_CLBuffer_resultBuffer, 
                                        (void*)GPUBuffer, 
                                        0, 
                                        NULL, 
                                        NULL );
HandleEnqueueUnmapBufferErrors( ciErrNum );

}

Unfortunately this only works on the first iteration. Any attempt to get a reasonable result after the first invocation of the kernels fails UNLESS I call ConstructOpenCLBuffers again, which ultimately releases every cl_mem object and creates new ones.

Since this is costly, I would very much like to avoid doing this. Does anyone have any suggestions what might be going wrong? (And yes I am handling every single error I have access to, and no it never returns an error code)

[QUOTE=krazanmp;30568]Hey, this has me stumped.

I create a set of cl_mem objects via calls such as…

// (Re)creates the OpenCL buffer objects used by this class. Only the
// m_CLBuffer_NumPointsArr buffer is shown; the remaining buffers are elided
// by the "..." below.
void CClass::ConstructOpenCLBuffers()
{
cl_int ciErrNum;

// One per z-plane
// Release any buffer left over from a previous call before creating a new one.
if( 0 != m_CLBuffer_NumPointsArr )
    clReleaseMemObject( m_CLBuffer_NumPointsArr );
// Buffer of m_MaxDepthCount ints, allocated from host-accessible memory.
// NOTE(review): CL_MEM_WRITE_ONLY means kernels may only write this buffer;
// RefreshData() fills it from the host as if it were kernel input. If any
// kernel reads it, CL_MEM_READ_ONLY or CL_MEM_READ_WRITE is needed - confirm.
m_CLBuffer_NumPointsArr = clCreateBuffer( m_CLGPUContext, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR,  m_MaxDepthCount*sizeof(int), NULL, &ciErrNum );
// Any non-zero code (0 is CL_SUCCESS) is reported, then OpenCL state is torn
// down and the assertion fires.
if( 0 != ciErrNum )
{
    HandleCreateBufferErrors( ciErrNum );    
    CleanupOpenCL();
    ASSERT( 0 );
}
...

}

I attempt to refresh the contents of the input memory buffers via something like

// Uploads the host-side m_NumPointsArr (m_MaxDepthCount ints) into
// m_CLBuffer_NumPointsArr: maps the buffer for writing (blocking), copies the
// host array into the mapped region, issues a blocking write from that region,
// and finally unmaps it. Each OpenCL status code is passed to the matching
// Handle*Errors helper.
// NOTE(review): ciErrNum is not declared in this excerpt - presumably a member
// or an elided local; confirm.
void CClass::RefreshData()
{
// Copy the numPoints data into the pinned memory buffer for it
int* NumPointsPinnedBufferData = (int*)clEnqueueMapBuffer( m_CLCommandQueue,
m_CLBuffer_NumPointsArr,
CL_TRUE,
CL_MAP_WRITE,
0,
m_MaxDepthCount*sizeof(int),
0, NULL,
NULL,
&ciErrNum );
HandleEnqueueMapBufferErrors( ciErrNum );

memcpy( NumPointsPinnedBufferData, m_NumPointsArr, m_MaxDepthCount*sizeof(int) );
// NOTE(review): the buffer is still mapped at this point and the source
// pointer is the mapped region itself. The OpenCL specification leaves
// enqueued writes to a mapped region of a memory object undefined; the memcpy
// above plus the unmap below already update the buffer, so this call looks
// redundant - confirm and consider removing it.
ciErrNum = clEnqueueWriteBuffer( m_CLCommandQueue, m_CLBuffer_NumPointsArr, CL_TRUE, 0, m_MaxDepthCount*sizeof(int), NumPointsPinnedBufferData, 0, NULL, NULL );
HandleEnqueueWriteBufferErrors( ciErrNum );

// The unmap is only enqueued here; nothing in this function waits for it.
ciErrNum = clEnqueueUnmapMemObject( m_CLCommandQueue,
m_CLBuffer_NumPointsArr,
(void*)NumPointsPinnedBufferData,
0,
NULL,
NULL );
HandleEnqueueUnmapBufferErrors( ciErrNum );

}

Then I run a series of kernels that write their output into one of these cl_mem objects (created with CL_MEM_READ_WRITE), and read it back via something very much like…

// Reads the kernel output back to the host: maps m_CLBuffer_resultBuffer for
// reading, copies numElements DWORDs into pCPUSideDataBuffer, then unmaps.
// NOTE(review): ciErrNum, numElements and pCPUSideDataBuffer are not declared
// in this excerpt - presumably members or elided locals; confirm.
void CClass::GetResult()
{
// Blocking map (CL_TRUE) of numElements*sizeof(DWORD) bytes from offset 0.
DWORD* GPUBuffer = (DWORD*)clEnqueueMapBuffer( m_CLCommandQueue,
m_CLBuffer_resultBuffer,
CL_TRUE,
CL_MAP_READ,
0,
numElements*sizeof(DWORD),
0, NULL,
NULL,
&ciErrNum );
HandleEnqueueMapBufferErrors( ciErrNum );

// Copy the mapped contents into the CPU-side destination buffer.
memcpy( pCPUSideDataBuffer, GPUBuffer, numElements*sizeof(DWORD) );

// The unmap is only enqueued here; nothing in this function waits for it.
ciErrNum = clEnqueueUnmapMemObject( m_CLCommandQueue, 
                                        m_CLBuffer_resultBuffer, 
                                        (void*)GPUBuffer, 
                                        0, 
                                        NULL, 
                                        NULL );
HandleEnqueueUnmapBufferErrors( ciErrNum );

}

Unfortunately this only works on the first iteration. Any attempt to get a reasonable result after the first invocation of the kernels fails UNLESS I call ConstructOpenCLBuffers again, which ultimately releases every cl_mem object and creates new ones.

Since this is costly, I would very much like to avoid doing this. Does anyone have any suggestions what might be going wrong? (And yes I am handling every single error I have access to, and no it never returns an error code)[/QUOTE]

Update:
It turns out it is a kernel that fails… again, only on its first invocation. The only information I could gather is:
It is the 5th out of 6 kernels to be called in a chain
One of its cl_mem parameters is the largest buffer being used.
It works fine IF the first invocation is skipped

i.e. The following DOES work
for( i = 0 to n )
{
clEnqueueNDRangeKernel( OK_kernel_1… )
clEnqueueNDRangeKernel( OK_kernel_2… )
clEnqueueNDRangeKernel( OK_kernel_3… )
clEnqueueNDRangeKernel( OK_kernel_4… )
clEnqueueNDRangeKernel( OK_kernel_5… )
if( 0 == i ) continue; // But doesn’t if this line is removed
clEnqueueNDRangeKernel( OK_kernel_6… ) // Uses largest buffer via a cl_mem… doesn’t produce an error anywhere
clEnqueueNDRangeKernel( OK_kernel_7… )
}

My question is…
Is it possible that clCreateBuffer isn’t complete by the time I attempt to use its cl_mem result? There don’t seem to be any synchronization tools for use with clCreateBuffer (other than clFinish(…)). Since the kernel behaves for all invocations with i != 0, this seems to imply an initialization issue. Unfortunately there is no other host-side initialization occurring after this call, and all invocations for i > 0 work as expected.

Is there some error handling/querying code that is not obvious from the documentation?

Try putting clFinish calls after every clEnqueue call to narrow down the specific call that is causing a problem.