Thanks for your quick reply David. I am confused about Mapping vs copying. In your earlier reply you mentioned that using clEnqueueReadBuffer() involves copying where clEnqueueMapBuffer() does not. I suppose this is only true as long as device can write on some portion of host memory.
The description of CL_MEM_USE_HOST_PTR on AMD OpenCL programming guide Table 4.3 says this:
------ Location ------ ------- Map Mode------ ------Map Location—
----Device Memory---- ----Copy---- ---- Pinned Host Memory—
What is the significance of map location here? If the data is in the device memory as mentioned by Location then I have to copy it over to the host side.
The manual also says that:
Like regular host memory, the CPU uses caching when accessing pinned host memory. Thus, GPU accesses must use the CPU cache coherency protocol when accessing. For discrete devices, the GPU access to this memory is through the PCIe bus, which also limits bandwidth.
I wrote a code that is doing a Depth First Search in a binary tree. My kernel that takes around 4s to run on CPU takes 22s on GPU so could it be because of pinned host memory? I also ran AMD APP profiler on it which shows 119 scratch register usage and 27 General Purpose register usage. How to reduce it? I have an array of floats of size 4x4 and two float vectors of size 4x1 that are read only variable and that I am using across all the threads. Could putting them in local store reduce the time drastically to less than 4s? Another thing is that there are lots of write operations. Could you please give me some suggestions on improving the run time??
I am also posting my entire kernel here if that helps.
Regards,
Richeek
/*!
* Sample kernel which multiplies every element of the input array with
* a constant and stores it at the corresponding output array
*/
#pragma OPENCL EXTENSION cl_amd_printf : enable
#define Mt 4
#define Mr 4
#define MOD_SCHEME 16
#define bitSize 4
#define STACKSIZE 100
#define initial_SC 100000.0f
#define CLIP 100000.0f
/* Complex Number Struct */
typedef struct my_comp
{
float2 data;
} comp;
inline comp mul(comp num1, comp num2)
{
comp temp;
temp.data.x = num1.data.x*num2.data.x - num1.data.y*num2.data.y;
temp.data.y = num1.data.x*num2.data.y + num1.data.y*num2.data.x;
return temp;
}
inline comp div(comp num1, comp num2)
{
comp temp;
float mag = num2.data.x*num2.data.x + num2.data.y*num2.data.y;
temp.data.x = (num1.data.x*num2.data.x + num1.data.y*num2.data.y)/mag;
temp.data.y = (num1.data.y*num2.data.x - num1.data.x*num2.data.y)/mag;
return temp;
}
inline comp add(comp num1, comp num2)
{
comp temp;
temp.data.x = num1.data.x + num2.data.x;
temp.data.y = num1.data.y + num2.data.y;
return temp;
}
inline comp sub(comp num1, comp num2)
{
comp temp;
temp.data.x = num1.data.x - num2.data.x;
temp.data.y = num1.data.y - num2.data.y;
return temp;
}
void equal(comp *num1, comp num2)
{
num1->data.x = num2.data.x;
num1->data.y = num2.data.y;
}
typedef struct _node
{
int level, index;
float ped;
}node;
typedef struct stack
{
node n[STACKSIZE];
int top;
}stack_class;
void init_stack(stack_class *s)
{
s->top = -1;
}
void push(stack_class *s, node _n)
{
s->top = s->top+1;
if(s->top == STACKSIZE)
printf("PROBLEM STACK IS FULL
");
s->n[s->top] = _n;
}
node pop(stack_class *s)
{
if(s->top>=0)
{
node nn = s->n[s->top];
s->top = s->top - 1;
return nn;
}
else
{
node temp;
temp.index = -456;
printf("STACK IS EMPTY CANT POP
");
return temp;
}
}
bool is_empty(stack_class *s)
{
if(s->top == -1)
return true;
return false;
}
/* Only for 16QAM */
void get_cord_val(int *qam_sym, comp *x)
{
int xVal=0, yVal=0;
float xCor, yCor;
for(int i=0;i<bitSize/2;i++)
yVal = 2*yVal + qam_sym[i];
for(int i=bitSize/2;i<bitSize;i++)
xVal = 2*xVal + qam_sym[i];
switch(xVal)
{
case 0:
xCor=-3;
break;
case 1:
xCor=-1;
break;
case 3:
xCor=1;
break;
case 2:
xCor=3;
break;
}
switch(yVal)
{
case 0:
yCor=-3;
break;
case 1:
yCor=-1;
break;
case 3:
yCor=1;
break;
case 2:
yCor=3;
break;
}
x->data.x = xCor;
x->data.y = yCor;
}
void get_symbol(comp y, comp *parent_sig, __global float *R_re, __global float *R_im, int level, int tx,
comp *cor)
{
int yRe, yIm;
int min_x, max_x, min_y, max_y, j=0, i=0;
comp y_scaled;
equal(&y_scaled, y);
comp temp, temp2;
for(int i=level+1; i<tx; ++i)
{
temp.data.x = 1.0f*parent_sig[i].data.x;
temp.data.y = 1.0f*parent_sig[i].data.y;
temp2.data.x = R_re[i];
temp2.data.y = R_im[i];
y_scaled = sub(y_scaled,mul(temp2,temp));
}
temp2.data.x = R_re[level];
temp2.data.y = R_im[level];
y_scaled = div(y_scaled, temp2);
float y_re = y_scaled.data.x, y_im = y_scaled.data.y;
if(MOD_SCHEME == 16) //16 QAM
{
if(y_re < -2.0f)
yRe = -3;
else if(y_re >= -2.0f && y_re < 0.0f)
yRe = -1;
else if(y_re >= 0.0f && y_re < 2.0f)
yRe = 1;
else
yRe = 3;
if(y_im < -2.0f)
yIm = -3;
else if(y_im >= -2.0f && y_im < 0.0f)
yIm = -1;
else if(y_im >= 0.0f && y_im < 2.0f)
yIm = 1;
else
yIm = 3;
cor[0].data.x = yRe;
cor[0].data.y = yIm;
j=1;
min_x = min_y = -3;
max_x = max_y = 3;
i = 1;
int k, flag = 0;
while(j<16)
{
//cout<<"get symbol 2
";
min_x = yRe-2*i;
max_x = yRe+2*i;
min_y = yIm-2*i;
max_y = yIm+2*i;
int min = min_x, max=max_x, x, y=max_y;
k=min;
flag = 0;
//while box is not complete
for(unsigned int l=0;l<4;++l)
{
while(k!=max)
{
if(!flag || flag == 2)
{
if(k >= -3 && k <= 3 && y >= -3 && y<= 3)
{
cor[j].data.x = k*1.0f;
cor[j].data.y = y*1.0f;
++j;
}
}
else if (flag == 1 || flag == 3)
{
if(k >= -3 && k <= 3 && x >= -3 && x<= 3)
{
cor[j].data.x = x;
cor[j].data.y = k;
++j;
}
}
if(!flag || flag == 3)
k+= 2;
else
k-= 2;
}
if(!flag)
{
flag = 1;
min = max_y;
max = min_y;
x = max_x;
}
else if(flag == 1)
{
flag = 2;
min = max_x;
max = min_x;
y = min_y;
}
else if(flag == 2)
{
flag = 3;
min = min_y;
max = max_y;
x = min_x;
}
k = min;
}
++i;
}
}
}
float getPed(comp y, __global float *R_re, __global float *R_im, comp *parent_sig, comp sym, int curr_level, float ped_parent)
{
comp b, e;
b.data.x = 0;
b.data.y = 0;
e.data.x = 0;
e.data.y = 0;
comp sig, temp2;
for(int i=curr_level+1; i<Mt; ++i)
{
sig.data.x = 1.0f*parent_sig[i].data.x;
sig.data.y = 1.0f*parent_sig[i].data.y;
temp2.data.x = R_re[i];
temp2.data.y = R_im[i];
b = add(b, mul(temp2,sig));
}
b = sub(y,b);
temp2.data.x = R_re[curr_level];
temp2.data.y = R_im[curr_level];
sig = mul(temp2,sym);
e = sub(b,sig);
float val = ped_parent + (e.data.x*e.data.x + e.data.y*e.data.y);
return val;
}
void get_bits(comp qam_sym, int *bits)
{
if(MOD_SCHEME == 16)
{
if(qam_sym.data.y == -3)
{
bits[0] = bits[1] = 0;
}
else if(qam_sym.data.y == -1)
{
//bits[0] = 1; bits[1] = 0;
bits[0] = 0; bits[1] = 1;
}
else if(qam_sym.data.y == 1)
{
bits[0] = bits[1] = 1;
}
else //if(qam_sym.imag() == 3)
{
//bits[0] = 0; bits[1] = 1;
bits[0] = 1; bits[1] = 0;
}
if(qam_sym.data.x == -3)
{
bits[2] = bits[3] = 0;
}
else if(qam_sym.data.x == -1)
{
//bits[2] = 1; bits[3] = 0;
bits[2] = 0; bits[3] = 1;
}
else if(qam_sym.data.x == 1)
{
bits[2] = bits[3] = 1;
}
else //if(qam_sym.real() == 3)
{
//bits[2] = 0; bits[3] = 1;
bits[2] = 1; bits[3] = 0;
}
}
}
__kernel void sphere_decoder(const int block_length,
const float noise_power,
__global float *block_data,
const int LIST_SIZE,
__global float *llr,
__global float *cand_dist,
__global float *cand_sym,
__global float *R_re,
__global float *R_im,
__global float *qr_noise_re,
__global float *qr_noise_im)
{
//printf("f hello
");
uint tid = get_global_id(0);
//printf("f hello %d
",tid);
int bitstream[Mt*bitSize], stride = Mt*bitSize;
for(int i=0; i<stride; ++i)
bitstream[i] = block_data[tid*stride + i];
comp x[Mt], y[Mr], best_sig[Mr], parents[Mr];
int level = Mt-1, index_level[Mr];
for(int i = 0; i<Mt; ++i)
{
get_cord_val(bitstream+i*Mt, &x[i]);
}
//// generate the y vector
//
comp temp_const0, temp2;
temp_const0.data.x = 0.0f;
temp_const0.data.y = 0.0f;
for(int i=0; i<Mr; ++i)
{
equal(&y[i],temp_const0);
equal(&best_sig[i],temp_const0);
equal(&parents[i],temp_const0);
index_level[i] = 0;
for(int j=0; j<Mt; ++j)
{
temp2.data.x = R_re[i*Mt+j];
temp2.data.y = R_im[i*Mt+j];
y[i] = add(y[i],mul(temp2, x[j]));
// printf("IN LOOP %f+j%f
",y[i].data.x, y[i].data.y);
}
// adding noise here
temp2.data.x = qr_noise_re[i];
temp2.data.y = qr_noise_im[i];
//y[i] = add(y[i], temp2); //FIXME disabling the noise
}
stack_class s;
init_stack(&s);
node nn;
nn.level = level+1;
nn.index = -1;
nn.ped = 0.0f;
push(&s, nn);
float ped_parent = 0.0f, SC_ML = initial_SC, max_cand_dist = 0.0f,
bit_plus_1_dist, bit_minus_1_dist;
comp cor[Mt*MOD_SCHEME];
int list_size = 0, max_cand_index = 0, i, j,
curr_level = nn.level, temp_bits[bitSize];
while(!is_empty(&s))
{
for(j=curr_level-1; j>=0; --j)
{
get_symbol(y[j], parents, R_re+j*Mt, R_im+j*Mt, j, Mt, cor+j*MOD_SCHEME);
nn.ped = getPed(y[j], R_re+j*Mt, R_im+j*Mt, parents, cor[j*MOD_SCHEME + index_level[j]], j, ped_parent);
equal(&parents[j],cor[j*MOD_SCHEME+index_level[j]]);
nn.index = index_level[j]++;
nn.level = j;
ped_parent = nn.ped;
if(j == 0 && nn.ped < SC_ML) //reached the leaf node
{
if(list_size < LIST_SIZE)
{
cand_dist[tid*LIST_SIZE + list_size] = nn.ped;
if(cand_dist[tid*LIST_SIZE + list_size] > max_cand_dist)
{
max_cand_dist = cand_dist[tid*LIST_SIZE + list_size];
max_cand_index = list_size;
}
for(int k=0; k<Mt; k++)
{
get_bits(cor[k*MOD_SCHEME+index_level[k]-1],temp_bits);
for(int l=0;l<bitSize;++l)
{
cand_sym[list_size*block_length+ tid*stride + k*bitSize+l] = (float)temp_bits[l];
if(temp_bits[l]==1)
cand_sym[list_size*block_length+ tid*stride + k*bitSize+l] = -1.0f;
else if(temp_bits[l]==0)
cand_sym[list_size*block_length+ tid*stride + k*bitSize+l] = 1.0f;
}
}
list_size++;
}
else if(nn.ped < max_cand_dist)
{
cand_dist[max_cand_index+tid*LIST_SIZE] = nn.ped;
/* Replace this candidate */
for(int k=0; k<Mt; ++k)
{
get_bits(cor[k*MOD_SCHEME+index_level[k]-1],temp_bits);
for(int l=0;l<bitSize;++l)
{
cand_sym[max_cand_index*block_length+ tid*stride + k*bitSize+l] = (float)temp_bits[l];
if(temp_bits[l]==1)
cand_sym[max_cand_index*block_length+ tid*stride + k*bitSize+l] = -1.0f;
if(temp_bits[l]==0)
cand_sym[max_cand_index*block_length+ tid*stride + k*bitSize+l] = 1.0f;
}
}
/* find the next max candidate */
max_cand_dist = -1.0f;
for(int k=0; k<LIST_SIZE; ++k)
{
if(cand_dist[tid*LIST_SIZE + k] > max_cand_dist)
{
max_cand_dist = cand_dist[tid*LIST_SIZE + k];
max_cand_index = k;
}
}
}
if(list_size == LIST_SIZE)
SC_ML = max_cand_dist;
}
else if(nn.ped >= SC_ML && list_size == LIST_SIZE) //tree pruning
{
break;
}
if(j>=1)
push(&s, nn);
}//end for curr_level
nn = pop(&s);
curr_level = nn.level;
ped_parent = nn.ped;
//
while(nn.index >= MOD_SCHEME)
{
index_level[nn.level] = 0;
nn = pop(&s);
}
//
///* going to the next child */
///* set the level for the children below curr level to zero */
for(i=curr_level-2;i>=0; --i)
index_level[i] = 0;
//
if(index_level[curr_level-1] < MOD_SCHEME-1 || (nn.index == -1 && index_level[curr_level-1] == MOD_SCHEME-1))
push(&s,nn);
} //end of while
/* calculate LLRs here */
for(int l=0; l<stride; ++l) //for each bit of the MIMO symbol
{
bit_plus_1_dist = bit_minus_1_dist = INT_MAX*1.0f;
//if(tid == 0)
//printf("INT MAX is %f
", INT_MAX*1.0f);
for(int k = 0; k<LIST_SIZE; ++k)
{
if(cand_sym[k*block_length+ tid*stride +l] == -1.0f && cand_dist[tid*LIST_SIZE+k] < bit_minus_1_dist)
{
bit_minus_1_dist = cand_dist[tid*LIST_SIZE+k];
}
else if(cand_sym[k*block_length+ tid*stride +l] == 1.0f && cand_dist[tid*LIST_SIZE+k] < bit_plus_1_dist )
{
bit_plus_1_dist = cand_dist[tid*LIST_SIZE+k];
}
}
//if(bit_minus_1_dist != 0.0f)
// printf("got a problem
");
llr[tid*stride + l] = (1/noise_power)*(bit_minus_1_dist - bit_plus_1_dist);
if(tid == 0)
{
printf("LLRs:%d %f %f %f %f
",l, bit_plus_1_dist, bit_minus_1_dist, llr[tid*stride+l], cand_sym[1920]);
}
if(llr[tid*stride+l] > CLIP)
llr[tid*stride + l] = CLIP;
else if(llr[tid*stride+l]<-CLIP)
llr[tid*stride+l] = -CLIP;
}
}