[QUOTE=OferRosenberg;29488]If your kernel code uses float data type, you’re using only 1/4 of the possible SC processing power. If the kernel uses float4, you’re on the right track… Keep in mind that there’s always the issue of register spill, where using too much registers will cause the machine to store registers in memory during wavefront switch - a performance killer. this is why using float8 and float16 is not recommanded, as they increase the register pressure and may cause a spill. [/QUOTE]Yes, i know that and therefore i investigated the register spill with KernelAnalyzer / CodeAnalyst. Both show me no register pressure if i use float8 or float16.
The loop inside the Kernel runs 2 times faster with float8 and about 3-4 times faster with float16 comparing to float4…
[QUOTE=OferRosenberg;29488]
So:
- Use float4 and not float (and not float8/float16)
- Use workgroups which are a multiply of 64 (wavefront). [/QUOTE]A part of my Program(s) is a piece of code which discovers the fastest number of Workgroups/LocalWorksize.
But this is not the point!
The point is the absolutely slow addition of 8 or 16 floats. In the Assemlercode i found only lines with mov´s and add´s with registers. Nothing unusual.
When i change the line (not in a loop ! )
sumFy = Fy.s0+Fy.s1+Fy.s2+Fy.s3+Fy.s4+Fy.s5+Fy.s6+Fy.s7+Fy.s8+Fy.s9+Fy.sa+Fy.sb+Fy.sc+Fy.sd+Fy.se+Fy.sf;
to
sumFy = Fy.s0+Fy.s1+Fy.s2+Fy.s3+Fy.s4+Fy.s5+Fy.s6+Fy.s7;
the whole kerneltime is only half!
[QUOTE=OferRosenberg;29488]BTW, the form <vectorname>.s[<index>] is illegal in the spec. only <vectorname>.s0, <vectorname>.s1, etc. Why? the compiler needs to know upfront the vector elements, to code the command. Supporting a runtime defined index is a performance hit. [/QUOTE]Supporting a runtime index (an array) is a fundamental property of every computing language.
Sorry, but having to encode a union with an embedded struct to get the function of an array is definitely a performance hit (during coding ^^). But you could take a look at my Kernel and give me some hints to avoid the array respectively the index.
[QUOTE=OferRosenberg;29488]Good Luck ![/QUOTE]Thank you very much for your helpful answer!
I would like to mention that I’m programming C/C++ only in OpenCL-kernels. C was not my favourite language since the early 80´s.
Here is my Kernel, i tried to index (and add) the components in a for/to loop (after the barrier), but that does not work, i got wrong results!
If i put the printf-command in the line after the addition, i get correct results but a cough slow runtime…
#pragma OPENCL EXTENSION cl_amd_printf : enable
#define __FAST_RELAXED_MATH__
__kernel void nBody_vector ( __global float16* x,
__global float16* y,
__global float16* z,
__global float* vx,
__global float* vy,
__global float* vz,
const float ausgleich,
const float dt
)
{
union // weil x.s[0] bei nvidia und AMD nicht geht, aber bei Apple schon :(
{
float s[16];
float16 v;
} x_union,y_union,z_union;
int tid = get_global_id(0);
int particles = get_global_size(0) / 16;
float16 aus = (float16)ausgleich;
float16 x_tid;
float16 y_tid;
float16 z_tid;
uint tid_mod = tid % 16;
uint tid_index = tid / 16;
x_union.v = x[tid_index]; // alle 16 x-Koordinaten nach x_union.v
y_union.v = y[tid_index];
z_union.v = z[tid_index];
x_tid = (float16) x_union.s[tid_mod]; // 16 mal die koordinate an Position tid_mod in x_tid
y_tid = (float16) y_union.s[tid_mod];
z_tid = (float16) z_union.s[tid_mod];
float16 Fx=0.0f;
float16 Fy=0.0f;
float16 Fz=0.0f;
for (int j=0 ;j<particles; j++) //16 kräfte gleichzeitig berechnen SIMD
{
float16 dx = x[j] - x_tid;
float16 dy = y[j] - y_tid;
float16 dz = z[j] - z_tid;
float16 drSquared = dx*dx + dy*dy + dz*dz + aus;
float16 drPowerN32 = native_rsqrt(drSquared); //1.0f / (drSquared * native_sqrt (drSquared));
drPowerN32 = drPowerN32 * drPowerN32 * drPowerN32;
Fx += dx * drPowerN32;
Fy += dy * drPowerN32;
Fz += dz * drPowerN32;
}
x_union.v = Fx; // alle 16 x-Koordinaten nach x_union.v
y_union.v = Fy;
z_union.v = Fz;
float sumFx=0.0f;
float sumFy=0.0f;
float sumFz=0.0f;
barrier(CLK_LOCAL_MEM_FENCE);
/*
//#pragma unroll 16
for (int r=0 ;r<16; r++)
{
sumFx += x_union.s[r];
// printf("sumFx = %f tid = %i r = %i x_union.s[r] = %f
",sumFx,tid,r,x_union.s[r] );
sumFy += y_union.s[r];
sumFz += z_union.s[r];
}
// printf( "sumFx = %f tid = %i
",sumFx,tid);
*/
float sumFx1 = Fx.s0+Fx.s1+Fx.s2+Fx.s3; //
float sumFx2 = Fx.s4+Fx.s5+Fx.s6+Fx.s7;
float sumFx3 = Fx.s8+Fx.s9+Fx.sa+Fx.sb;
float sumFx4 = Fx.sc+Fx.sd+Fx.se+Fx.sf;
sumFx = sumFx1 + sumFx2 + sumFx3 + sumFx4; //sloooooow
sumFy = Fy.s0+Fy.s1+Fy.s2+Fy.s3+Fy.s4+Fy.s5+Fy.s6+Fy.s7+Fy.s8+Fy.s9+Fy.sa+Fy.sb+Fy.sc+Fy.sd+Fy.se+Fy.sf;
sumFz = Fz.s0+Fz.s1+Fz.s2+Fz.s3+Fz.s4+Fz.s5+Fz.s6+Fz.s7+Fz.s8+Fz.s9+Fz.sa+Fz.sb+Fz.sc+Fz.sd+Fz.se+Fz.sf;
vx[tid] += dt * sumFx; //(Fx.s0+Fx.s1+Fx.s2+Fx.s3+Fx.s4+Fx.s5+Fx.s6+Fx.s7+Fx.s8+Fx.s9+Fx.sa+Fx.sb+Fx.sc+Fx.sd+Fx.se+Fx.sf);
vy[tid] += dt * sumFy; //(Fy.s0+Fy.s1+Fy.s2+Fy.s3+Fy.s4+Fy.s5+Fy.s6+Fy.s7+Fy.s8+Fy.s9+Fy.sa+Fy.sb+Fy.sc+Fy.sd+Fy.se+Fy.sf);
vz[tid] += dt * sumFz; //(Fz.s0+Fz.s1+Fz.s2+Fz.s3+Fz.s4+Fz.s5+Fz.s6+Fz.s7+Fz.s8+Fz.s9+Fz.sa+Fz.sb+Fz.sc+Fz.sd+Fz.se+Fz.sf);
/*
vx[tid] += dt * (Fx.s0+Fx.s1+Fx.s2+Fx.s3+Fx.s4+Fx.s5+Fx.s6+Fx.s7+Fx.s8+Fx.s9+Fx.sa+Fx.sb+Fx.sc+Fx.sd+Fx.se+Fx.sf);
vy[tid] += dt * (Fy.s0+Fy.s1+Fy.s2+Fy.s3+Fy.s4+Fy.s5+Fy.s6+Fy.s7+Fy.s8+Fy.s9+Fy.sa+Fy.sb+Fy.sc+Fy.sd+Fy.se+Fy.sf);
vz[tid] += dt * (Fz.s0+Fz.s1+Fz.s2+Fz.s3+Fz.s4+Fz.s5+Fz.s6+Fz.s7+Fz.s8+Fz.s9+Fz.sa+Fz.sb+Fz.sc+Fz.sd+Fz.se+Fz.sf);
*/
barrier(CLK_GLOBAL_MEM_FENCE); //auf Berechnungen ALLER Threads warten
switch (tid_mod) { //ich habe keine Ahnung, wie man das verkürzen könnte....
case 0:
x[tid_index].s0 += vx[tid] * dt;
y[tid_index].s0 += vy[tid] * dt;
z[tid_index].s0 += vz[tid] * dt;
break;
case 1:
x[tid_index].s1 += vx[tid] * dt;
y[tid_index].s1 += vy[tid] * dt;
z[tid_index].s1 += vz[tid] * dt;
break;
case 2:
x[tid_index].s2 += vx[tid] * dt;
y[tid_index].s2 += vy[tid] * dt;
z[tid_index].s2 += vz[tid] * dt;
break;
case 3:
x[tid_index].s3 += vx[tid] * dt;
y[tid_index].s3 += vy[tid] * dt;
z[tid_index].s3 += vz[tid] * dt;
break;
case 4:
x[tid_index].s4 += vx[tid] * dt;
y[tid_index].s4 += vy[tid] * dt;
z[tid_index].s4 += vz[tid] * dt;
break;
case 5:
x[tid_index].s5 += vx[tid] * dt;
y[tid_index].s5 += vy[tid] * dt;
z[tid_index].s5 += vz[tid] * dt;
break;
case 6:
x[tid_index].s6 += vx[tid] * dt;
y[tid_index].s6 += vy[tid] * dt;
z[tid_index].s6 += vz[tid] * dt;
break;
case 7:
x[tid_index].s7 += vx[tid] * dt;
y[tid_index].s7 += vy[tid] * dt;
z[tid_index].s7 += vz[tid] * dt;
break;
case 8:
x[tid_index].s8 += vx[tid] * dt;
y[tid_index].s8 += vy[tid] * dt;
z[tid_index].s8 += vz[tid] * dt;
break;
case 9:
x[tid_index].s9 += vx[tid] * dt;
y[tid_index].s9 += vy[tid] * dt;
z[tid_index].s9 += vz[tid] * dt;
break;
case 0xa:
x[tid_index].sa += vx[tid] * dt;
y[tid_index].sa += vy[tid] * dt;
z[tid_index].sa += vz[tid] * dt;
break;
case 0xb:
x[tid_index].sb += vx[tid] * dt;
y[tid_index].sb += vy[tid] * dt;
z[tid_index].sb += vz[tid] * dt;
break;
case 0xc:
x[tid_index].sc += vx[tid] * dt;
y[tid_index].sc += vy[tid] * dt;
z[tid_index].sc += vz[tid] * dt;
break;
case 0xd:
x[tid_index].sd += vx[tid] * dt;
y[tid_index].sd += vy[tid] * dt;
z[tid_index].sd += vz[tid] * dt;
break;
case 0xe:
x[tid_index].se += vx[tid] * dt;
y[tid_index].se += vy[tid] * dt;
z[tid_index].se += vz[tid] * dt;
break;
case 0xf:
x[tid_index].sf += vx[tid] * dt;
y[tid_index].sf += vy[tid] * dt;
z[tid_index].sf += vz[tid] * dt;
break;
}
}