EnDoerner

11-21-2012, 11:37 AM

First of all I want to present myself, I am from Chile and this is my first post on the forum. I am Ph.D. in Physics and some time ago I started this adventure on OpenCL.

The question is the following. I am studying an example of a Histogram from AMD webpage ( http://developer.amd.com/tools/heteroge ... les-demos/ (http://developer.amd.com/tools/heterogeneous-computing/amd-accelerated-parallel-processing-app-sdk/samples-demos/) ) and I have an "inner conflict" with the dimensions of the data array on the kernel, the code is the following:

#define LINEAR_MEM_ACCESS

#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable

#define BIN_SIZE 256

/**

* @brief Calculates block-histogram bin whose bin size is 256

* @param data input data pointer

* @param sharedArray shared array for thread-histogram bins

* @param binResult block-histogram array

*/

__kernel

void histogram256(__global const uint* data,

__local uchar* sharedArray,

__global uint* binResult)

{

size_t localId = get_local_id(0);

size_t globalId = get_global_id(0);

size_t groupId = get_group_id(0);

size_t groupSize = get_local_size(0);

/* initialize shared array to zero */

for(int i = 0; i < BIN_SIZE; ++i)

sharedArray[localId * BIN_SIZE + i] = 0;

barrier(CLK_LOCAL_MEM_FENCE);

/* calculate thread-histograms */

for(int i = 0; i < BIN_SIZE; ++i)

{

#ifdef LINEAR_MEM_ACCESS

uint value = data[groupId * groupSize * BIN_SIZE + i * groupSize + localId];

#else

uint value = data[globalId * BIN_SIZE + i];

#endif // LINEAR_MEM_ACCESS

sharedArray[localId * BIN_SIZE + value]++;

}

barrier(CLK_LOCAL_MEM_FENCE);

/* merge all thread-histograms into block-histogram */

for(int i = 0; i < BIN_SIZE / groupSize; ++i)

{

uint binCount = 0;

for(int j = 0; j < groupSize; ++j)

binCount += sharedArray[j * BIN_SIZE + i * groupSize + localId];

binResult[groupId * BIN_SIZE + i * groupSize + localId] = binCount;

}

}

In the "/*calculate thread-histograms*/" part each element of the data array is called in the following way:

data[groupId * groupSize * BIN_SIZE + i * groupSize + localId]

Now comes my conflict. The dimensions of the data array is 1024*1024=1048576, the number of workgroups is 4096 and the local size is 128 (as defined by the host program). If one calculates the index for the "last thread" of groupId=4095 and localId=127 one would have an index of 4095*128*256+i*128+127=134185087+i*128, which clearly exceeds the dimension of the data array. So how does the program handle this issue?

Well, a last question, what does LINEAR_MEM_ACCESS mean?, I have been trying to find the answer without luck... thanks for your help!

The question is the following. I am studying an example of a Histogram from AMD webpage ( http://developer.amd.com/tools/heteroge ... les-demos/ (http://developer.amd.com/tools/heterogeneous-computing/amd-accelerated-parallel-processing-app-sdk/samples-demos/) ) and I have an "inner conflict" with the dimensions of the data array on the kernel, the code is the following:

#define LINEAR_MEM_ACCESS

#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable

#define BIN_SIZE 256

/**

* @brief Calculates block-histogram bin whose bin size is 256

* @param data input data pointer

* @param sharedArray shared array for thread-histogram bins

* @param binResult block-histogram array

*/

__kernel

void histogram256(__global const uint* data,

__local uchar* sharedArray,

__global uint* binResult)

{

size_t localId = get_local_id(0);

size_t globalId = get_global_id(0);

size_t groupId = get_group_id(0);

size_t groupSize = get_local_size(0);

/* initialize shared array to zero */

for(int i = 0; i < BIN_SIZE; ++i)

sharedArray[localId * BIN_SIZE + i] = 0;

barrier(CLK_LOCAL_MEM_FENCE);

/* calculate thread-histograms */

for(int i = 0; i < BIN_SIZE; ++i)

{

#ifdef LINEAR_MEM_ACCESS

uint value = data[groupId * groupSize * BIN_SIZE + i * groupSize + localId];

#else

uint value = data[globalId * BIN_SIZE + i];

#endif // LINEAR_MEM_ACCESS

sharedArray[localId * BIN_SIZE + value]++;

}

barrier(CLK_LOCAL_MEM_FENCE);

/* merge all thread-histograms into block-histogram */

for(int i = 0; i < BIN_SIZE / groupSize; ++i)

{

uint binCount = 0;

for(int j = 0; j < groupSize; ++j)

binCount += sharedArray[j * BIN_SIZE + i * groupSize + localId];

binResult[groupId * BIN_SIZE + i * groupSize + localId] = binCount;

}

}

In the "/*calculate thread-histograms*/" part each element of the data array is called in the following way:

data[groupId * groupSize * BIN_SIZE + i * groupSize + localId]

Now comes my conflict. The dimensions of the data array is 1024*1024=1048576, the number of workgroups is 4096 and the local size is 128 (as defined by the host program). If one calculates the index for the "last thread" of groupId=4095 and localId=127 one would have an index of 4095*128*256+i*128+127=134185087+i*128, which clearly exceeds the dimension of the data array. So how does the program handle this issue?

Well, a last question, what does LINEAR_MEM_ACCESS mean?, I have been trying to find the answer without luck... thanks for your help!