Abysmal performance on HD4870 / Snow Leopard.. why?

Hi all,

I’ve written a basic SHA1 hash brute-forcer for OpenCL. Unfortunately the performance is way below what I was anticipating. Tens of millions of hashes per second should be typical for a half-decent GPU, yet this is taking 14.7 seconds just to burn through 2.6 million.

Any advice would be much appreciated…


#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <OpenCL/opencl.h>

void nextkey();

#define NUM_BLOCKS 10240
#define BLOCK_SIZE 256
#define CHARSET "abcdefghijklmnopqrstuvwxyz1234567890"

const char *KernelSource = "
" \
"#define K0	0x5A827999
" \
"#define K1	0x6ED9EBA1
" \
"#define K2	0x8F1BBCDC
" \
"#define K3	0xCA62C1D6
" \
"
" \
"#define H1 0x67452301
" \
"#define H2 0xEFCDAB89
" \
"#define H3 0x98BADCFE
" \
"#define H4 0x10325476
" \
"#define H5 0xC3D2E1F0
" \
"
" \
"#define uchar unsigned char
" \
"
" \
"uint rotateLeft(uint x, int n)
" \
"{
" \
"		return	(x << n) | (x >> (32-n));
" \
"}
" \
"
" \
"__kernel void sha1(__global char *msg, __global const unsigned int *len, __global char *digest)
" \
"{
" \
"		int t, i, j, gid, x;
" \
"		uint W[80], A[5], temp, number;
" \
"		char hexChars[16] = {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'};
" \
"		gid = get_global_id(0);
" \
"		int item_pad = gid * 64;
" \
"		uint ulen = (len[gid]*8) & 0xFFFFFFFF;
" \
"
" \
"		for (i=0;i<64-len[gid];i++) {
" \
"			msg[item_pad+len[gid]+i] = 0;
" \
"		}
" \
"
" \
"		msg[item_pad + len[gid]] = (char) 0x80;
" \
"
" \
"	 msg[item_pad + 60] = ulen >> 24;
" \
"	 msg[item_pad + 61] = ulen >> 16;
" \
"	 msg[item_pad + 62] = ulen >> 8;
" \
"	 msg[item_pad + 63] = ulen;
" \
"
" \
"		A[0] = H1;
" \
"		A[1] = H2;
" \
"		A[2] = H3;
" \
"		A[3] = H4;
" \
"		A[4] = H5;
" \
"
" \
"		for (t = 0; t < 16; t++)
" \
"		{
" \
"				W[t] = ((uchar) msg[item_pad + (t * 4)]);
" \
"				W[t] = W[t] << 24;
" \
"				temp = ((uchar) msg[item_pad + (t * 4 + 1)]);
" \
"				temp = temp << 16;
" \
"				W[t] |= temp;
" \
"				temp = ((uchar) msg[item_pad + (t * 4 + 2)]);
" \
"				temp = temp << 8;
" \
"				W[t] |= temp;
" \
"				W[t] |= (uchar) msg[item_pad + (t * 4 + 3)];
" \
"		}
" \
"
" \
"		for(i = 16; i < 80; i++)
" \
"		{
" \
"				W[i] = rotateLeft(W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16], 1);
" \
"		}
" \
"
" \
"		for(i = 0; i < 20; i++)
" \
"		{
" \
"				temp = rotateLeft(A[0],5) + ((A[1] & A[2]) | ((~ A[1]) & A[3])) + A[4] + W[i] + K0;
" \
"				A[4] = A[3];
" \
"				A[3] = A[2];
" \
"				A[2] = rotateLeft(A[1], 30);
" \
"				A[1] = A[0];
" \
"				A[0] = temp;
" \
"		}
" \
"
" \
"		for(i = 20; i < 40; i++)
" \
"		{
" \
"				temp = rotateLeft(A[0], 5) + (A[1] ^ A[2] ^ A[3]) + A[4] + W[i] + K1;
" \
"				A[4] = A[3];
" \
"				A[3] = A[2];
" \
"				A[2] = rotateLeft(A[1], 30);
" \
"				A[1] = A[0];
" \
"				A[0] = temp;
" \
"		}
" \
"
" \
"		for(i = 40; i < 60; i++)
" \
"		{
" \
"				temp = rotateLeft(A[0], 5) + ((A[1] & A[2]) | (A[1] & A[3]) | (A[2] & A[3])) + A[4] + W[i] + K2;
" \
"				A[4] = A[3];
" \
"				A[3] = A[2];
" \
"				A[2] = rotateLeft(A[1], 30);
" \
"				A[1] = A[0];
" \
"				A[0] = temp;
" \
"		}
" \
"
" \
"		for(i = 60; i < 80; i++)
" \
"		{
" \
"				temp = rotateLeft(A[0], 5) + (A[1] ^ A[2] ^ A[3])  + A[4] + W[i] + K3;
" \
"				A[4] = A[3];
" \
"				A[3] = A[2];
" \
"				A[2] = rotateLeft(A[1], 30);
" \
"				A[1] = A[0];
" \
"				A[0] = temp;
" \
"		}
" \
"		A[0] += H1;
" \
"		A[1] += H2;
" \
"		A[2] += H3;
" \
"		A[3] += H4;
" \
"		A[4] += H5;
" \
"
" \
"		for(j = 0; j < 5; j++)
" \
"		{
" \
"				number = A[j];
" \
"				for(i = 0; i < 8; i++)
" \
"				{
" \
"						digest[item_pad + (j*8 + 7-i)] = hexChars[number%16];
" \
"						number /= 16;
" \
"				}
" \
"		}
" \
"
" \
"		digest[item_pad + 40] = '\\0';
" \
"}
" \
"
";

char keybuf[64+1];

int main(int argc, char **argv) {
	char c;
	unsigned int i=0, j, x;
	char *textstring = (char *)malloc(64*BLOCK_SIZE);
	char *result = (char *)malloc(64*BLOCK_SIZE);
	time_t tt,tt2,tt3;

	memset(&keybuf,0,sizeof(keybuf));
	strncpy(keybuf,CHARSET,1);

	if (textstring == NULL || result == NULL) {
		printf("Couldn't allocate memory.
");
		return 0;
	}

	int err;
	unsigned int length[BLOCK_SIZE];

	size_t global = BLOCK_SIZE;
	size_t local = BLOCK_SIZE;

	cl_device_id device_id;
	cl_context context;
	cl_command_queue commands;
	cl_program program;
	cl_kernel kernel;

	cl_mem input;
	cl_mem devlen;
	cl_mem output;

		int gpu = 1;
		err = clGetDeviceIDs(NULL, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL);
		if (err != CL_SUCCESS)
		{
				printf("Error: Failed to create a device group!
");
				return EXIT_FAILURE;
		}

		context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
		if (!context)
		{
				printf("Error: Failed to create a compute context!
");
				return EXIT_FAILURE;
		}

		commands = clCreateCommandQueue(context, device_id, 0, &err);
		if (!commands)
		{
				printf("Error: Failed to create a command commands!
");
				return EXIT_FAILURE;
		}

		program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource, NULL, &err);
		if (!program)
		{
				printf("Error: Failed to create compute program! (error %d)
",err);
				switch (err) {
					case CL_INVALID_CONTEXT: printf("context is not a valid context.
"); break;
					case CL_INVALID_VALUE: printf("count is zero or if strings or any entry in strings is NULL
"); break;
					case CL_OUT_OF_HOST_MEMORY: printf("there is a failure to allocate resources required by the OpenCL implementation on the host
"); break;
				}
				return EXIT_FAILURE;
		}

		err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
		if (err != CL_SUCCESS)
		{
				size_t len;
				char buffer[2048];

				printf("Error: Failed to build program executable!
");
				clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
				printf("%s
", buffer);
				exit(1);
		}

		kernel = clCreateKernel(program, "sha1", &err);
		if (!kernel || err != CL_SUCCESS)
		{
				printf("Error: Failed to create compute kernel!
");
				switch (err) {
					case CL_INVALID_PROGRAM: printf("if program is not a valid program object
"); break;
					case CL_INVALID_PROGRAM_EXECUTABLE: printf("if there is no successfully built executable for program
"); break;
					case CL_INVALID_KERNEL_NAME: printf("if kernel_name is not found in program
"); break;
					case CL_INVALID_KERNEL_DEFINITION: printf("if the function definition for __kernel function given by kernel_name such as the number of arguments, the argument types are not the same for all devices for which the program executable has been built
"); break;
					case CL_INVALID_VALUE: printf("is kernel_name is NULL
"); break;
					case CL_OUT_OF_HOST_MEMORY: printf("if there is a failure to allocate resources required by the OpenCL implementation on the host
"); break;
				}
				exit(1);
		}

		input = clCreateBuffer(context,  CL_MEM_READ_ONLY,	64*BLOCK_SIZE, NULL, NULL);
		output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, 64*BLOCK_SIZE, NULL, NULL);
		devlen = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(unsigned int)*BLOCK_SIZE, NULL, NULL);

		if (!input || !output)
		{
				printf("Error: Failed to allocate device memory!
");
				exit(1);
		}

		err = 0;
		err  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input);
		err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &devlen);
		err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &output);
		if (err != CL_SUCCESS)
		{
				printf("Error: Failed to set kernel arguments! %d
", err);
				switch (err) {
					case CL_INVALID_KERNEL: printf("kernel is not a valid kernel object
"); break;
					case CL_INVALID_ARG_INDEX: printf("arg_index is not a valid argument index
"); break;
					case CL_INVALID_ARG_VALUE: printf("arg_value specified is NULL for an argument that is not declared with the __local qualifier or vice-versa
"); break;
					case CL_INVALID_MEM_OBJECT: printf("an argument declared to be a memory object when the specified arg_value is not a valid memory object
"); break;
					case CL_INVALID_SAMPLER: printf("an argument declared to be of type sampler_t when the specified arg_value is not a valid sampler object
"); break;
					case CL_INVALID_ARG_SIZE: printf("arg_size does not match the size of the data type for an argument that is not a memory object or if the argument is a memory object and arg_size != sizeof(cl_mem) or if arg_size is zero and the argument is declared with the __local qualifier or if the argument is a sampler and arg_size != sizeof(cl_sampler)
"); break;
				}
				exit(1);
		}

		for (x = 0; x<NUM_BLOCKS; x++) {
			for (i=0; i<BLOCK_SIZE; i++) {
				strcpy(&textstring[i*64],keybuf);
				length[i] = strlen(&textstring[i*64]);
				nextkey();
			}

			err = clEnqueueWriteBuffer(commands, input, CL_TRUE, 0, 64*BLOCK_SIZE, textstring, 0, NULL, NULL);
			if (err != CL_SUCCESS)
			{
					printf("Error: Failed to write to source array!
");
					exit(1);
			}

			err = clEnqueueWriteBuffer(commands, devlen, CL_TRUE, 0, sizeof(unsigned int)*BLOCK_SIZE, length, 0, NULL, NULL);
			if (err != CL_SUCCESS)
			{
					printf("Error: Failed to write to source array!
");
					exit(1);
			}

			err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
			if (err)
			{
					printf("Error: Failed to execute kernel!
");
					return EXIT_FAILURE;
			}

			clFinish(commands);

			err = clEnqueueReadBuffer( commands, output, CL_TRUE, 0, 64*BLOCK_SIZE, result, 0, NULL, NULL );
			if (err != CL_SUCCESS)
			{
					printf("Error: Failed to read output array! %d
", err);
					exit(1);
			}
		}

		printf("Done, last result was: %s - %s
",&textstring[(i-1)*64],&result[(i-1)*64]);
		printf("Computed %d hashes.
",NUM_BLOCKS*BLOCK_SIZE);

		clReleaseMemObject(input);
		clReleaseMemObject(output);
		clReleaseProgram(program);
		clReleaseKernel(kernel);
		clReleaseCommandQueue(commands);
		clReleaseContext(context);
	
		return 0;
	}

	void nextkey() {
		int i,j,k,found;

		if (keybuf[strlen(keybuf)-1] == CHARSET[strlen(CHARSET)-1])
		{
			found=0;
			for (i=strlen(keybuf)-1;i>=0;i--)
			{
				if (keybuf[i] != CHARSET[strlen(CHARSET)-1])
				{
					for (j=0;j<strlen(CHARSET);j++)
					{
						if (keybuf[i] == CHARSET[j])
						{
							keybuf[i] = CHARSET[j+1];
							for (k=i+1;k<strlen(keybuf);k++)
							{
								keybuf[k] = CHARSET[0];
							}
							found=1;
							break;
						}
					}
					i=-1;
				}
			}
			if (!found)
			{
				for (i=0;i<strlen(keybuf);i++)
				{
					keybuf[i] = CHARSET[0];
				}
				strncat(keybuf,CHARSET,1);
			}
		}
		else
		{
			for (i=0;i<strlen(CHARSET);i++)
			{
				if (keybuf[strlen(keybuf)-1] == CHARSET[i]) break;
			}
			keybuf[strlen(keybuf)-1] = CHARSET[i+1];
		}
	}

Answering that question is difficult. :slight_smile: Here are a few comments:

  1. The 4870 is a 4-way SIMD machine, which means you’ll only get 1/4 of the performance (at max) if you don’t use float4 or int4 data types. However, it seems like you’re getting far less than that.

  2. The impression I’ve had is that OpenCL on AMD devices isn’t as fast as it is on Nvidia (yet). I suspect it will get better as the compilers improve, but that hardly helps.

  3. I’d also suggest using local memory for the tables, but I don’t think that really exists on the 4870.

  4. Make sure you don’t include the compilation and setup times in your timing loop as they can be hundreds of ms, but that’s small compared to 14.7s.

  5. Calling clFinish() inside your loop is very bad for performance. The clEnqueueRead with blocking=CL_TRUE will take care of this for you on an in-order queue with much less overhead. In general you want to queue up another kernel while you are reading back the results from this one so work can be overlapped as much as possible. I suspect you have a lot of time where the GPU is idle since you’re doing kernel/finish/read. You should queue up at least two kernels at a time if possible so

What are the global and local sizes you end up running with? The AMD parts want a local size that is an even multiple of 64 for best performance.

I’m sure you’ll get better performance if you change those things, but I don’t know if it will be near what you expect. I suspect #2 above is the real issue, and there’s not much you can do about that. You can get a good feeling for how much that matters (modulo #1) if you try running your code on an Nvidia part and see how well you do. Good luck!

I get around 8 sec with my 8800GTX, Mac OS X 10.6.2, C2D 3.0 GHz

Editing source clFinish() to //clFinish() doesnt give any changes in time !

I’ve made some progress with optimising this, still needs a lot of work though. I need to understand what uint4’s are before I can get much further (see other post I just made).