Hi all,
My kernel works well on Nvidea GTX 560 GPU device but works error on AMD A7970 device. The source code list bellow.
A7970’s outputs is 0xd7 eb 6a d7 d7 eb 6a d7, but GTX560’s is 0xd7 eb 6a d7 05 b5 30 ad.
Where the problem appear.
// kernel source code -------------------------------------------
typedef union USHA1_type
{
unsigned int sha1uint[5];
unsigned char sha1uchar[20];
}USHA1_t;
inline uint SWAP32(uint x)
{
x = rotate(x, 16U);
return ((x & 0x00FF00FF) << 8) + ((x >> 8) & 0x00FF00FF);
}
//sha1 ==================
#define K1 0x5A827999
#define K2 0x6ED9EBA1
#define K3 0x8F1BBCDC
#define K4 0xCA62C1D6
#define H1 0x67452301
#define H2 0xEFCDAB89
#define H3 0x98BADCFE
#define H4 0x10325476
#define H5 0xC3D2E1F0
#define F1(x,y,z) (z ^ (x & (y ^ z)))
#define F2(x,y,z) (x ^ y ^ z)
#define F3(x,y,z) ((x & y) | (z & (x | y)))
#define F4(x,y,z) (x ^ y ^ z)
#define R(t) (temp = W[(t - 3) & 0x0F] ^ W[(t - 8) & 0x0F] ^ W[(t - 14) & 0x0F] ^ W[t & 0x0F], ( W[t & 0x0F] = rotate((int)temp,1) ) )
#define P1(a,b,c,d,e,x)
{
e += rotate((int)a,5) + F1(b,c,d) + K1 + x; b = rotate((int)b,30);
}
#define P2(a,b,c,d,e,x)
{
e += rotate((int)a,5) + F2(b,c,d) + K2 + x; b = rotate((int)b,30);
}
#define P3(a,b,c,d,e,x)
{
e += rotate((int)a,5) + F3(b,c,d) + K3 + x; b = rotate((int)b,30);
}
#define P4(a,b,c,d,e,x)
{
e += rotate((int)a,5) + F4(b,c,d) + K4 + x; b = rotate((int)b,30);
}
//1-63 BYTES sha1
inline void sha1_crypt(__private unsigned char *plain, unsigned int plainlen, __private unsigned int *digest)
{
int t;
int stop, mmod;
unsigned int i, ulen;
unsigned int W[16] = {0};
unsigned int temp, A,B,C,D,E;
A = H1;
B = H2;
C = H3;
D = H4;
E = H5;
for (t = 1; t < 15; t++)
{
W[t] = 0x00000000;
}
i = plainlen;
stop = i / 4 ;
for (t = 0 ; t < stop ; t++){
W[t] = ((unsigned char) plain[t * 4]) << 24;
W[t] |= ((unsigned char) plain[t * 4 + 1]) << 16;
W[t] |= ((unsigned char) plain[t * 4 + 2]) << 8;
W[t] |= (unsigned char) plain[t * 4 + 3];
}
mmod = i % 4;
if ( mmod == 3){
W[t] = ((unsigned char) plain[t * 4]) << 24;
W[t] |= ((unsigned char) plain[t * 4 + 1]) << 16;
W[t] |= ((unsigned char) plain[t * 4 + 2]) << 8;
W[t] |= ((unsigned char) 0x80) ;
} else if (mmod == 2) {
W[t] = ((unsigned char) plain[t * 4]) << 24;
W[t] |= ((unsigned char) plain[t * 4 + 1]) << 16;
W[t] |= 0x8000 ;
} else if (mmod == 1) {
W[t] = ((unsigned char) plain[t * 4]) << 24;
W[t] |= 0x800000 ;
} else /*if (mmod == 0)*/ {
W[t] = 0x80000000 ;
}
ulen = (i * 8) & 0xFFFFFFFF;
W[15] = ulen ;
P1( A, B, C, D, E, W[0] );
P1( E, A, B, C, D, W[1] );
P1( D, E, A, B, C, W[2] );
P1( C, D, E, A, B, W[3] );
P1( B, C, D, E, A, W[4] );
P1( A, B, C, D, E, W[5] );
P1( E, A, B, C, D, W[6] );
P1( D, E, A, B, C, W[7] );
P1( C, D, E, A, B, W[8] );
P1( B, C, D, E, A, W[9] );
P1( A, B, C, D, E, W[10] );
P1( E, A, B, C, D, W[11] );
P1( D, E, A, B, C, W[12] );
P1( C, D, E, A, B, W[13] );
P1( B, C, D, E, A, W[14] );
P1( A, B, C, D, E, W[15] );
P1( E, A, B, C, D, R(16) );
P1( D, E, A, B, C, R(17) );
P1( C, D, E, A, B, R(18) );
P1( B, C, D, E, A, R(19) );
P2( A, B, C, D, E, R(20) );
P2( E, A, B, C, D, R(21) );
P2( D, E, A, B, C, R(22) );
P2( C, D, E, A, B, R(23) );
P2( B, C, D, E, A, R(24) );
P2( A, B, C, D, E, R(25) );
P2( E, A, B, C, D, R(26) );
P2( D, E, A, B, C, R(27) );
P2( C, D, E, A, B, R(28) );
P2( B, C, D, E, A, R(29) );
P2( A, B, C, D, E, R(30) );
P2( E, A, B, C, D, R(31) );
P2( D, E, A, B, C, R(32) );
P2( C, D, E, A, B, R(33) );
P2( B, C, D, E, A, R(34) );
P2( A, B, C, D, E, R(35) );
P2( E, A, B, C, D, R(36) );
P2( D, E, A, B, C, R(37) );
P2( C, D, E, A, B, R(38) );
P2( B, C, D, E, A, R(39) );
P3( A, B, C, D, E, R(40) );
P3( E, A, B, C, D, R(41) );
P3( D, E, A, B, C, R(42) );
P3( C, D, E, A, B, R(43) );
P3( B, C, D, E, A, R(44) );
P3( A, B, C, D, E, R(45) );
P3( E, A, B, C, D, R(46) );
P3( D, E, A, B, C, R(47) );
P3( C, D, E, A, B, R(48) );
P3( B, C, D, E, A, R(49) );
P3( A, B, C, D, E, R(50) );
P3( E, A, B, C, D, R(51) );
P3( D, E, A, B, C, R(52) );
P3( C, D, E, A, B, R(53) );
P3( B, C, D, E, A, R(54) );
P3( A, B, C, D, E, R(55) );
P3( E, A, B, C, D, R(56) );
P3( D, E, A, B, C, R(57) );
P3( C, D, E, A, B, R(58) );
P3( B, C, D, E, A, R(59) );
P4( A, B, C, D, E, R(60) );
P4( E, A, B, C, D, R(61) );
P4( D, E, A, B, C, R(62) );
P4( C, D, E, A, B, R(63) );
P4( B, C, D, E, A, R(64) );
P4( A, B, C, D, E, R(65) );
P4( E, A, B, C, D, R(66) );
P4( D, E, A, B, C, R(67) );
P4( C, D, E, A, B, R(68) );
P4( B, C, D, E, A, R(69) );
P4( A, B, C, D, E, R(70) );
P4( E, A, B, C, D, R(71) );
P4( D, E, A, B, C, R(72) );
P4( C, D, E, A, B, R(73) );
P4( B, C, D, E, A, R(74) );
P4( A, B, C, D, E, R(75) );
P4( E, A, B, C, D, R(76) );
P4( D, E, A, B, C, R(77) );
P4( C, D, E, A, B, R(78) );
P4( B, C, D, E, A, R(79) );
digest[0] = SWAP32(A + H1);
digest[1] = SWAP32(B + H2);
digest[2] = SWAP32(C + H3);
digest[3] = SWAP32(D + H4);
digest[4] = SWAP32(E + H5);
}
__kernel void test_sha1_kernel(__global unsigned int* gout)
{
unsigned int id = get_global_id(0);
unsigned int i = 0;
//two input data
unsigned char InData1[16] = {0};
unsigned char InData2[16] = {0};
//two calout data
USHA1_t sha1out1;
USHA1_t sha1out2;
//init data
for(i = 0; i < 5; i ++)
{
sha1out1.sha1uint[i] = 0;
sha1out2.sha1uint[i] = 0;
}
for(i = 0; i < 16; i ++)
{
InData1[i] = 0x03;
InData2[i] = 0x38;
}
//two out temp
unsigned char out1[4] = {0};
unsigned char out2[4] = {0};
for(i = 0; i < 4; i++)
{
out1[i] = 0;
out2[i] = 0;
}
//cal 1
unsigned int *psha1out1 = (unsigned int *)(sha1out1.sha1uint);
sha1_crypt(InData1, 8, psha1out1);
sha1_crypt(InData1, 8, psha1out1);
//save output1
for(i = 0; i < 4; i++)
{
out1[i] = sha1out1.sha1uchar[i];
}
//cal 2
unsigned int *psha1out2 = (unsigned int *)(sha1out2.sha1uint);
sha1_crypt(InData2, 8, psha1out2);
//save output2
for(i = 0; i < 4; i++)
{
out2[i] = sha1out2.sha1uchar[i];
}
//out to cpu
if(id == 0)
{
gout[0] = (unsigned int)out1[0];
gout[1] = (unsigned int)out1[1];
gout[2] = (unsigned int)out1[2];
gout[3] = (unsigned int)out1[3];
gout[4] = (unsigned int)out2[0];
gout[5] = (unsigned int)out2[1];
gout[6] = (unsigned int)out2[2];
gout[7] = (unsigned int)out2[3];
}
}