Different result between AMD and Nvidea devices.

Hi all,
My kernel works well on Nvidea GTX 560 GPU device but works error on AMD A7970 device. The source code list bellow.
A7970’s outputs is 0xd7 eb 6a d7 d7 eb 6a d7, but GTX560’s is 0xd7 eb 6a d7 05 b5 30 ad.
Where the problem appear. :confused:

// kernel source code -------------------------------------------

typedef union USHA1_type
{
unsigned int sha1uint[5];
unsigned char sha1uchar[20];
}USHA1_t;

inline uint SWAP32(uint x)
{
x = rotate(x, 16U);
return ((x & 0x00FF00FF) << 8) + ((x >> 8) & 0x00FF00FF);
}

//sha1 ==================
#define K1 0x5A827999
#define K2 0x6ED9EBA1
#define K3 0x8F1BBCDC
#define K4 0xCA62C1D6

#define H1 0x67452301
#define H2 0xEFCDAB89
#define H3 0x98BADCFE
#define H4 0x10325476
#define H5 0xC3D2E1F0

#define F1(x,y,z) (z ^ (x & (y ^ z)))
#define F2(x,y,z) (x ^ y ^ z)
#define F3(x,y,z) ((x & y) | (z & (x | y)))
#define F4(x,y,z) (x ^ y ^ z)

#define R(t) (temp = W[(t - 3) & 0x0F] ^ W[(t - 8) & 0x0F] ^ W[(t - 14) & 0x0F] ^ W[t & 0x0F], ( W[t & 0x0F] = rotate((int)temp,1) ) )

#define P1(a,b,c,d,e,x)
{
e += rotate((int)a,5) + F1(b,c,d) + K1 + x; b = rotate((int)b,30);
}

#define P2(a,b,c,d,e,x)
{
e += rotate((int)a,5) + F2(b,c,d) + K2 + x; b = rotate((int)b,30);
}
#define P3(a,b,c,d,e,x)
{
e += rotate((int)a,5) + F3(b,c,d) + K3 + x; b = rotate((int)b,30);
}
#define P4(a,b,c,d,e,x)
{
e += rotate((int)a,5) + F4(b,c,d) + K4 + x; b = rotate((int)b,30);
}

//1-63 BYTES sha1
inline void sha1_crypt(__private unsigned char *plain, unsigned int plainlen, __private unsigned int *digest)
{
int t;
int stop, mmod;
unsigned int i, ulen;
unsigned int W[16] = {0};
unsigned int temp, A,B,C,D,E;

A = H1;
B = H2;
C = H3;
D = H4;
E = H5;

for (t = 1; t &lt; 15; t++)
{
	W[t] = 0x00000000;
}

i = plainlen;

stop = i / 4 ;
for (t = 0 ; t &lt; stop ; t++){
	W[t] =	((unsigned char) plain[t * 4])		&lt;&lt; 24;
	W[t] |= ((unsigned char) plain[t * 4 + 1])	&lt;&lt; 16;
	W[t] |= ((unsigned char) plain[t * 4 + 2])	&lt;&lt; 8;
	W[t] |= (unsigned char)  plain[t * 4 + 3];
}
mmod = i % 4;
if ( mmod == 3){
	W[t] = ((unsigned char)  plain[t * 4]) &lt;&lt; 24;
	W[t] |= ((unsigned char) plain[t * 4 + 1]) &lt;&lt; 16;
	W[t] |= ((unsigned char) plain[t * 4 + 2]) &lt;&lt; 8;
	W[t] |=  ((unsigned char) 0x80) ;
} else if (mmod == 2) {
	W[t] = ((unsigned char)  plain[t * 4]) &lt;&lt; 24;
	W[t] |= ((unsigned char) plain[t * 4 + 1]) &lt;&lt; 16;
	W[t] |=  0x8000 ;
} else if (mmod == 1) {
	W[t] = ((unsigned char)  plain[t * 4]) &lt;&lt; 24;
	W[t] |=  0x800000 ;
} else /*if (mmod == 0)*/ {
	W[t] =  0x80000000 ;
}
ulen = (i * 8) & 0xFFFFFFFF;
W[15] =  ulen ;


P1( A, B, C, D, E, W[0]  );
P1( E, A, B, C, D, W[1]  );
P1( D, E, A, B, C, W[2]  );
P1( C, D, E, A, B, W[3]  );
P1( B, C, D, E, A, W[4]  );
P1( A, B, C, D, E, W[5]  );
P1( E, A, B, C, D, W[6]  );
P1( D, E, A, B, C, W[7]  );
P1( C, D, E, A, B, W[8]  );
P1( B, C, D, E, A, W[9]  );
P1( A, B, C, D, E, W[10] );
P1( E, A, B, C, D, W[11] );
P1( D, E, A, B, C, W[12] );
P1( C, D, E, A, B, W[13] );
P1( B, C, D, E, A, W[14] );
P1( A, B, C, D, E, W[15] );
P1( E, A, B, C, D, R(16) );
P1( D, E, A, B, C, R(17) );
P1( C, D, E, A, B, R(18) );
P1( B, C, D, E, A, R(19) );

P2( A, B, C, D, E, R(20) );
P2( E, A, B, C, D, R(21) );
P2( D, E, A, B, C, R(22) );
P2( C, D, E, A, B, R(23) );
P2( B, C, D, E, A, R(24) );
P2( A, B, C, D, E, R(25) );
P2( E, A, B, C, D, R(26) );
P2( D, E, A, B, C, R(27) );
P2( C, D, E, A, B, R(28) );
P2( B, C, D, E, A, R(29) );
P2( A, B, C, D, E, R(30) );
P2( E, A, B, C, D, R(31) );
P2( D, E, A, B, C, R(32) );
P2( C, D, E, A, B, R(33) );
P2( B, C, D, E, A, R(34) );
P2( A, B, C, D, E, R(35) );
P2( E, A, B, C, D, R(36) );
P2( D, E, A, B, C, R(37) );
P2( C, D, E, A, B, R(38) );
P2( B, C, D, E, A, R(39) );

P3( A, B, C, D, E, R(40) );
P3( E, A, B, C, D, R(41) );
P3( D, E, A, B, C, R(42) );
P3( C, D, E, A, B, R(43) );
P3( B, C, D, E, A, R(44) );
P3( A, B, C, D, E, R(45) );
P3( E, A, B, C, D, R(46) );
P3( D, E, A, B, C, R(47) );
P3( C, D, E, A, B, R(48) );
P3( B, C, D, E, A, R(49) );
P3( A, B, C, D, E, R(50) );
P3( E, A, B, C, D, R(51) );
P3( D, E, A, B, C, R(52) );
P3( C, D, E, A, B, R(53) );
P3( B, C, D, E, A, R(54) );
P3( A, B, C, D, E, R(55) );
P3( E, A, B, C, D, R(56) );
P3( D, E, A, B, C, R(57) );
P3( C, D, E, A, B, R(58) );
P3( B, C, D, E, A, R(59) );


P4( A, B, C, D, E, R(60) );
P4( E, A, B, C, D, R(61) );
P4( D, E, A, B, C, R(62) );
P4( C, D, E, A, B, R(63) );
P4( B, C, D, E, A, R(64) );
P4( A, B, C, D, E, R(65) );
P4( E, A, B, C, D, R(66) );
P4( D, E, A, B, C, R(67) );
P4( C, D, E, A, B, R(68) );
P4( B, C, D, E, A, R(69) );
P4( A, B, C, D, E, R(70) );
P4( E, A, B, C, D, R(71) );
P4( D, E, A, B, C, R(72) );
P4( C, D, E, A, B, R(73) );
P4( B, C, D, E, A, R(74) );
P4( A, B, C, D, E, R(75) );
P4( E, A, B, C, D, R(76) );
P4( D, E, A, B, C, R(77) );
P4( C, D, E, A, B, R(78) );
P4( B, C, D, E, A, R(79) );

digest[0] = SWAP32(A + H1);
digest[1] = SWAP32(B + H2);
digest[2] = SWAP32(C + H3);
digest[3] = SWAP32(D + H4);
digest[4] = SWAP32(E + H5);	

}

__kernel void test_sha1_kernel(__global unsigned int* gout)
{
unsigned int id = get_global_id(0);
unsigned int i = 0;

//two input data
unsigned char InData1[16] = {0};
unsigned char InData2[16] = {0};

//two calout data
USHA1_t sha1out1;
USHA1_t sha1out2;

//init data
for(i = 0; i &lt; 5; i ++)
{
	sha1out1.sha1uint[i] = 0;
	sha1out2.sha1uint[i] = 0;
}
for(i = 0; i &lt; 16; i ++)
{
	InData1[i] = 0x03;
	InData2[i] = 0x38;
}

//two out temp
unsigned char out1[4] = {0};
unsigned char out2[4] = {0};
for(i = 0; i &lt; 4; i++)
{
	out1[i] = 0;
	out2[i] = 0;
}

//cal 1
unsigned int *psha1out1 = (unsigned int *)(sha1out1.sha1uint);
sha1_crypt(InData1, 8, psha1out1);
sha1_crypt(InData1, 8, psha1out1);

//save output1
for(i = 0; i &lt; 4; i++)
{
	out1[i] = sha1out1.sha1uchar[i];
}

//cal 2
unsigned int *psha1out2 = (unsigned int *)(sha1out2.sha1uint);			
sha1_crypt(InData2, 8, psha1out2);

//save output2
for(i = 0; i &lt; 4; i++)
{
	out2[i] = sha1out2.sha1uchar[i];
}

//out to cpu
if(id == 0)
{
	gout[0] = (unsigned int)out1[0];
	gout[1] = (unsigned int)out1[1];
	gout[2] = (unsigned int)out1[2];
	gout[3] = (unsigned int)out1[3];

	gout[4] = (unsigned int)out2[0];
	gout[5] = (unsigned int)out2[1];
	gout[6] = (unsigned int)out2[2];
	gout[7] = (unsigned int)out2[3];
}

}

Try to cast plain to unsigned int instead of unsigned char, such as:

W[t] = ((unsigned int) plain[t * 4]) << 24;

and so on…

[QUOTE=utnapishtim;30322]Try to cast plain to unsigned int instead of unsigned char, such as:

W[t] = ((unsigned int) plain[t * 4]) << 24;

and so on…[/QUOTE]

Thank you very much, I tried it.

for (t = 0 ; t &lt; stop ; t++){
	W[t] =	((unsigned int)plain[t * 4])		&lt;&lt; 24;
	W[t] |= ((unsigned int)plain[t * 4 + 1])	&lt;&lt; 16;
	W[t] |= ((unsigned int)plain[t * 4 + 2])	&lt;&lt; 8;
	W[t] |= (unsigned int) plain[t * 4 + 3];
}
mmod = i % 4;
if ( mmod == 3){
	W[t] = ((unsigned int) plain[t * 4]) &lt;&lt; 24;
	W[t] |= ((unsigned int)plain[t * 4 + 1]) &lt;&lt; 16;
	W[t] |= ((unsigned int)plain[t * 4 + 2]) &lt;&lt; 8;
	W[t] |=  ((unsigned int)0x80) ;
} else if (mmod == 2) {
	W[t] = ((unsigned int) plain[t * 4]) &lt;&lt; 24;
	W[t] |= ((unsigned int)plain[t * 4 + 1]) &lt;&lt; 16;
	W[t] |=  0x8000 ;
} else if (mmod == 1) {
	W[t] = ((unsigned int) plain[t * 4]) &lt;&lt; 24;
	W[t] |=  0x800000 ;
} else /*if (mmod == 0)*/ {
	W[t] =  0x80000000 ;
}

But no change happen.

I debug the kernel using codexl.

On device A7970 the screenshot shows when it runs to the breakpoint “if(id == 0)” , the value of out2 is 0x05 but actually I gets 0xd7 in the cpu memory.

[ATTACH=CONFIG]40[/ATTACH]

I’ve checked on NVIDIA GPU, AMD GPU and Intel CPU and your kernel is fine.

How do you get the result from the device buffer on the host side?

[QUOTE=utnapishtim;30326]I’ve checked on NVIDIA GPU, AMD GPU and Intel CPU and your kernel is fine.

How do you get the result from the device buffer on the host side?[/QUOTE]

Thanks, my host side code:

#include “stdafx.h”

#include <CL/cl.h>
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <cstring>
#include <string>
#include <fstream>

#define SUCCESS 0
#define FAILURE 1
#define EXPECTED_FAILURE 2

#define GlobalThreadSize 256
#define GroupSize 64

#define OPENCLBUILDOPTIONS “-cl-opt-disable”
//#define OPENCLBUILDOPTIONS NULL

/* convert the kernel file into a string */
int convertToString(const char filename, std::string& s)
{
size_t size;
char
str;
std::fstream f(filename, (std::fstream::in | std::fstream::binary));

if(f.is_open())
{
	size_t fileSize;
	f.seekg(0, std::fstream::end);
	size = fileSize = (size_t)f.tellg();
	f.seekg(0, std::fstream::beg);
	str = new char[size+1];
	if(!str)
	{
		f.close();
		return SUCCESS;
	}

	f.read(str, fileSize);
	f.close();
	str[size] = '\0';
	s = str;
	delete[] str;
	return SUCCESS;
}
std::cout&lt;&lt;"Error: failed to open file

:"<<filename<<std::endl;
return FAILURE;
}

int _tmain(int argc, char* argv[])
{

cl_int status = 0;//store the return status

cl_uint numPlatforms;//store the number of platforms query by clGetPlatformIDs()

cl_platform_id platform = NULL;//store the chosen platform

//get platform 
status = clGetPlatformIDs(0, NULL, &numPlatforms);
if (status != CL_SUCCESS)
{
	std::cout&lt;&lt;"Error: Getting platforms!"&lt;&lt;std::endl;
	return FAILURE;
}

if (numPlatforms &gt; 0)
{
	cl_platform_id* platforms = (cl_platform_id* )malloc(numPlatforms* sizeof(cl_platform_id));
	status=clGetPlatformIDs(numPlatforms,platforms,NULL);
	platform=platforms[0];
	free(platforms);
}

if (NULL == platform)
{
	std::cout&lt;&lt;"Error: No available platform found!"&lt;&lt;std::endl;
	return FAILURE;
}


/* Query the context and get the available devices */
cl_uint numDevice=0;
status=clGetDeviceIDs(platform,CL_DEVICE_TYPE_ALL,0,NULL,&numDevice);
cl_device_id *devices=(cl_device_id*)malloc(numDevice*sizeof(cl_device_id));
if (devices == 0) 
{
    std::cout &lt;&lt; "No device available

";
return FAILURE;
}
clGetDeviceIDs(platform,CL_DEVICE_TYPE_ALL,numDevice,devices,NULL);

/* Create Context using the platform selected above */   
cl_context context=clCreateContext(NULL,numDevice,devices,NULL,NULL,NULL);

if (status != CL_SUCCESS)
{
	std::cout&lt;&lt;"Error: Creating context failed!"&lt;&lt;std::endl;
	return FAILURE;
}

/*
*The API clCreateCommandQueue creates a command-queue on a specific device.
*/
cl_command_queue commandQueue = clCreateCommandQueue(context, devices[0], 0, &status);
if (status != CL_SUCCESS)
{
	std::cout&lt;&lt;"Error: Creating command queue failed!"&lt;&lt;std::endl;
	return FAILURE;
}

//set input data
cl_uint *output = (cl_uint *) malloc( sizeof(cl_uint) * GlobalThreadSize);

//create output buffer
cl_mem outputBuffer = clCreateBuffer(
                  context, 
                  CL_MEM_WRITE_ONLY,
                  sizeof(cl_uint) * GlobalThreadSize,
                  NULL, 
                  &status);
if (status != CL_SUCCESS)
{
	std::cout&lt;&lt;"Error: Creating output buffer failed!"&lt;&lt;std::endl;
	return FAILURE;
}

//get the printf kernel
const char* filename = "./sha1_Kernel.cl";
std::string sourceStr;
status = convertToString(filename, sourceStr);
const char *source = sourceStr.c_str();
size_t sourceSize[] ={strlen(source)};
//
//create program
cl_program program = clCreateProgramWithSource(context, 1, &source, sourceSize, &status);
if (status != CL_SUCCESS)
{
	std::cout&lt;&lt;"Error: Creating program object failed!"&lt;&lt;std::endl;
	return FAILURE;
}

//build program with the command line option '-g' so we can debug kernel
status = clBuildProgram(program,1, devices, OPENCLBUILDOPTIONS, NULL, NULL);

char opencl_log[65536];
clGetProgramBuildInfo(program, *devices, CL_PROGRAM_BUILD_LOG, sizeof(opencl_log), (void *) opencl_log, NULL);

if (status != CL_SUCCESS)
{
	std::cout&lt;&lt;"Error: Building program failed!"&lt;&lt;std::endl;
	return FAILURE;
}

//create printf kernel	
cl_kernel kernel = clCreateKernel(program, "test_sha1_kernel", &status);
if (status != CL_SUCCESS)
{
	std::cout&lt;&lt;"Error: Creating kernel failed!"&lt;&lt;std::endl;
	return FAILURE;
}

//set args
status = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&outputBuffer);

size_t global_threads[1];
size_t local_threads[1];
global_threads[0] = GlobalThreadSize;
local_threads[0] = GroupSize;

//execute the kernel
status = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, global_threads, local_threads, 0, NULL, NULL);
if (status != CL_SUCCESS)
{
	std::cout&lt;&lt;"Error: Enqueue kernel onto command queue failed!"&lt;&lt;std::endl;
	return FAILURE;
}
status = clFinish(commandQueue);

memset(output, 0, sizeof(cl_uint) * GlobalThreadSize);

status = clEnqueueReadBuffer(commandQueue, outputBuffer, CL_TRUE, 0, sizeof(cl_uint)*GlobalThreadSize, output, 0, NULL, NULL);

for(int i = 0; i &lt; 8; i++)
	printf("%02X ", output[i]);

// Clean the resources.
status = clReleaseKernel(kernel);//Release kernel.
status = clReleaseMemObject(outputBuffer);
status = clReleaseProgram(program);//Release program.
status = clReleaseCommandQueue(commandQueue);//Release command queue.
status = clReleaseContext(context);//Release context.

if (devices != NULL)
{
	free(devices);
	devices = NULL;
}

free(output);

scanf_s("%c");
return SUCCESS;

}

If do not use build option “-cl-opt-disable”, the result seems to be right.

In my mind disable-opt will be slow but more stable, it’s a strange case.

But in the case of enable-opt, there are also many problems can not be understood. Development is really very tough on the AMD 7970 device.

When using the AMD device development,whether there is a need for special attention?