Hi all,
I’ve written a basic SHA1 hash brute-forcer for OpenCL. Unfortunately the performance is way below what I was anticipating. Tens of millions of hashes per second should be typical for a half-decent GPU, yet this is taking 14.7 seconds just to burn through 2.6 million.
Any advice would be much appreciated…
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <OpenCL/opencl.h>
void nextkey();
#define NUM_BLOCKS 10240
#define BLOCK_SIZE 256
#define CHARSET "abcdefghijklmnopqrstuvwxyz1234567890"
const char *KernelSource = "
" \
"#define K0 0x5A827999
" \
"#define K1 0x6ED9EBA1
" \
"#define K2 0x8F1BBCDC
" \
"#define K3 0xCA62C1D6
" \
"
" \
"#define H1 0x67452301
" \
"#define H2 0xEFCDAB89
" \
"#define H3 0x98BADCFE
" \
"#define H4 0x10325476
" \
"#define H5 0xC3D2E1F0
" \
"
" \
"#define uchar unsigned char
" \
"
" \
"uint rotateLeft(uint x, int n)
" \
"{
" \
" return (x << n) | (x >> (32-n));
" \
"}
" \
"
" \
"__kernel void sha1(__global char *msg, __global const unsigned int *len, __global char *digest)
" \
"{
" \
" int t, i, j, gid, x;
" \
" uint W[80], A[5], temp, number;
" \
" char hexChars[16] = {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'};
" \
" gid = get_global_id(0);
" \
" int item_pad = gid * 64;
" \
" uint ulen = (len[gid]*8) & 0xFFFFFFFF;
" \
"
" \
" for (i=0;i<64-len[gid];i++) {
" \
" msg[item_pad+len[gid]+i] = 0;
" \
" }
" \
"
" \
" msg[item_pad + len[gid]] = (char) 0x80;
" \
"
" \
" msg[item_pad + 60] = ulen >> 24;
" \
" msg[item_pad + 61] = ulen >> 16;
" \
" msg[item_pad + 62] = ulen >> 8;
" \
" msg[item_pad + 63] = ulen;
" \
"
" \
" A[0] = H1;
" \
" A[1] = H2;
" \
" A[2] = H3;
" \
" A[3] = H4;
" \
" A[4] = H5;
" \
"
" \
" for (t = 0; t < 16; t++)
" \
" {
" \
" W[t] = ((uchar) msg[item_pad + (t * 4)]);
" \
" W[t] = W[t] << 24;
" \
" temp = ((uchar) msg[item_pad + (t * 4 + 1)]);
" \
" temp = temp << 16;
" \
" W[t] |= temp;
" \
" temp = ((uchar) msg[item_pad + (t * 4 + 2)]);
" \
" temp = temp << 8;
" \
" W[t] |= temp;
" \
" W[t] |= (uchar) msg[item_pad + (t * 4 + 3)];
" \
" }
" \
"
" \
" for(i = 16; i < 80; i++)
" \
" {
" \
" W[i] = rotateLeft(W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16], 1);
" \
" }
" \
"
" \
" for(i = 0; i < 20; i++)
" \
" {
" \
" temp = rotateLeft(A[0],5) + ((A[1] & A[2]) | ((~ A[1]) & A[3])) + A[4] + W[i] + K0;
" \
" A[4] = A[3];
" \
" A[3] = A[2];
" \
" A[2] = rotateLeft(A[1], 30);
" \
" A[1] = A[0];
" \
" A[0] = temp;
" \
" }
" \
"
" \
" for(i = 20; i < 40; i++)
" \
" {
" \
" temp = rotateLeft(A[0], 5) + (A[1] ^ A[2] ^ A[3]) + A[4] + W[i] + K1;
" \
" A[4] = A[3];
" \
" A[3] = A[2];
" \
" A[2] = rotateLeft(A[1], 30);
" \
" A[1] = A[0];
" \
" A[0] = temp;
" \
" }
" \
"
" \
" for(i = 40; i < 60; i++)
" \
" {
" \
" temp = rotateLeft(A[0], 5) + ((A[1] & A[2]) | (A[1] & A[3]) | (A[2] & A[3])) + A[4] + W[i] + K2;
" \
" A[4] = A[3];
" \
" A[3] = A[2];
" \
" A[2] = rotateLeft(A[1], 30);
" \
" A[1] = A[0];
" \
" A[0] = temp;
" \
" }
" \
"
" \
" for(i = 60; i < 80; i++)
" \
" {
" \
" temp = rotateLeft(A[0], 5) + (A[1] ^ A[2] ^ A[3]) + A[4] + W[i] + K3;
" \
" A[4] = A[3];
" \
" A[3] = A[2];
" \
" A[2] = rotateLeft(A[1], 30);
" \
" A[1] = A[0];
" \
" A[0] = temp;
" \
" }
" \
" A[0] += H1;
" \
" A[1] += H2;
" \
" A[2] += H3;
" \
" A[3] += H4;
" \
" A[4] += H5;
" \
"
" \
" for(j = 0; j < 5; j++)
" \
" {
" \
" number = A[j];
" \
" for(i = 0; i < 8; i++)
" \
" {
" \
" digest[item_pad + (j*8 + 7-i)] = hexChars[number%16];
" \
" number /= 16;
" \
" }
" \
" }
" \
"
" \
" digest[item_pad + 40] = '\\0';
" \
"}
" \
"
";
char keybuf[64+1];
int main(int argc, char **argv) {
char c;
unsigned int i=0, j, x;
char *textstring = (char *)malloc(64*BLOCK_SIZE);
char *result = (char *)malloc(64*BLOCK_SIZE);
time_t tt,tt2,tt3;
memset(&keybuf,0,sizeof(keybuf));
strncpy(keybuf,CHARSET,1);
if (textstring == NULL || result == NULL) {
printf("Couldn't allocate memory.
");
return 0;
}
int err;
unsigned int length[BLOCK_SIZE];
size_t global = BLOCK_SIZE;
size_t local = BLOCK_SIZE;
cl_device_id device_id;
cl_context context;
cl_command_queue commands;
cl_program program;
cl_kernel kernel;
cl_mem input;
cl_mem devlen;
cl_mem output;
int gpu = 1;
err = clGetDeviceIDs(NULL, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL);
if (err != CL_SUCCESS)
{
printf("Error: Failed to create a device group!
");
return EXIT_FAILURE;
}
context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
if (!context)
{
printf("Error: Failed to create a compute context!
");
return EXIT_FAILURE;
}
commands = clCreateCommandQueue(context, device_id, 0, &err);
if (!commands)
{
printf("Error: Failed to create a command commands!
");
return EXIT_FAILURE;
}
program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource, NULL, &err);
if (!program)
{
printf("Error: Failed to create compute program! (error %d)
",err);
switch (err) {
case CL_INVALID_CONTEXT: printf("context is not a valid context.
"); break;
case CL_INVALID_VALUE: printf("count is zero or if strings or any entry in strings is NULL
"); break;
case CL_OUT_OF_HOST_MEMORY: printf("there is a failure to allocate resources required by the OpenCL implementation on the host
"); break;
}
return EXIT_FAILURE;
}
err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
if (err != CL_SUCCESS)
{
size_t len;
char buffer[2048];
printf("Error: Failed to build program executable!
");
clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
printf("%s
", buffer);
exit(1);
}
kernel = clCreateKernel(program, "sha1", &err);
if (!kernel || err != CL_SUCCESS)
{
printf("Error: Failed to create compute kernel!
");
switch (err) {
case CL_INVALID_PROGRAM: printf("if program is not a valid program object
"); break;
case CL_INVALID_PROGRAM_EXECUTABLE: printf("if there is no successfully built executable for program
"); break;
case CL_INVALID_KERNEL_NAME: printf("if kernel_name is not found in program
"); break;
case CL_INVALID_KERNEL_DEFINITION: printf("if the function definition for __kernel function given by kernel_name such as the number of arguments, the argument types are not the same for all devices for which the program executable has been built
"); break;
case CL_INVALID_VALUE: printf("is kernel_name is NULL
"); break;
case CL_OUT_OF_HOST_MEMORY: printf("if there is a failure to allocate resources required by the OpenCL implementation on the host
"); break;
}
exit(1);
}
input = clCreateBuffer(context, CL_MEM_READ_ONLY, 64*BLOCK_SIZE, NULL, NULL);
output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, 64*BLOCK_SIZE, NULL, NULL);
devlen = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(unsigned int)*BLOCK_SIZE, NULL, NULL);
if (!input || !output)
{
printf("Error: Failed to allocate device memory!
");
exit(1);
}
err = 0;
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input);
err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &devlen);
err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &output);
if (err != CL_SUCCESS)
{
printf("Error: Failed to set kernel arguments! %d
", err);
switch (err) {
case CL_INVALID_KERNEL: printf("kernel is not a valid kernel object
"); break;
case CL_INVALID_ARG_INDEX: printf("arg_index is not a valid argument index
"); break;
case CL_INVALID_ARG_VALUE: printf("arg_value specified is NULL for an argument that is not declared with the __local qualifier or vice-versa
"); break;
case CL_INVALID_MEM_OBJECT: printf("an argument declared to be a memory object when the specified arg_value is not a valid memory object
"); break;
case CL_INVALID_SAMPLER: printf("an argument declared to be of type sampler_t when the specified arg_value is not a valid sampler object
"); break;
case CL_INVALID_ARG_SIZE: printf("arg_size does not match the size of the data type for an argument that is not a memory object or if the argument is a memory object and arg_size != sizeof(cl_mem) or if arg_size is zero and the argument is declared with the __local qualifier or if the argument is a sampler and arg_size != sizeof(cl_sampler)
"); break;
}
exit(1);
}
for (x = 0; x<NUM_BLOCKS; x++) {
for (i=0; i<BLOCK_SIZE; i++) {
strcpy(&textstring[i*64],keybuf);
length[i] = strlen(&textstring[i*64]);
nextkey();
}
err = clEnqueueWriteBuffer(commands, input, CL_TRUE, 0, 64*BLOCK_SIZE, textstring, 0, NULL, NULL);
if (err != CL_SUCCESS)
{
printf("Error: Failed to write to source array!
");
exit(1);
}
err = clEnqueueWriteBuffer(commands, devlen, CL_TRUE, 0, sizeof(unsigned int)*BLOCK_SIZE, length, 0, NULL, NULL);
if (err != CL_SUCCESS)
{
printf("Error: Failed to write to source array!
");
exit(1);
}
err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
if (err)
{
printf("Error: Failed to execute kernel!
");
return EXIT_FAILURE;
}
clFinish(commands);
err = clEnqueueReadBuffer( commands, output, CL_TRUE, 0, 64*BLOCK_SIZE, result, 0, NULL, NULL );
if (err != CL_SUCCESS)
{
printf("Error: Failed to read output array! %d
", err);
exit(1);
}
}
printf("Done, last result was: %s - %s
",&textstring[(i-1)*64],&result[(i-1)*64]);
printf("Computed %d hashes.
",NUM_BLOCKS*BLOCK_SIZE);
clReleaseMemObject(input);
clReleaseMemObject(output);
clReleaseProgram(program);
clReleaseKernel(kernel);
clReleaseCommandQueue(commands);
clReleaseContext(context);
return 0;
}
void nextkey() {
int i,j,k,found;
if (keybuf[strlen(keybuf)-1] == CHARSET[strlen(CHARSET)-1])
{
found=0;
for (i=strlen(keybuf)-1;i>=0;i--)
{
if (keybuf[i] != CHARSET[strlen(CHARSET)-1])
{
for (j=0;j<strlen(CHARSET);j++)
{
if (keybuf[i] == CHARSET[j])
{
keybuf[i] = CHARSET[j+1];
for (k=i+1;k<strlen(keybuf);k++)
{
keybuf[k] = CHARSET[0];
}
found=1;
break;
}
}
i=-1;
}
}
if (!found)
{
for (i=0;i<strlen(keybuf);i++)
{
keybuf[i] = CHARSET[0];
}
strncat(keybuf,CHARSET,1);
}
}
else
{
for (i=0;i<strlen(CHARSET);i++)
{
if (keybuf[strlen(keybuf)-1] == CHARSET[i]) break;
}
keybuf[strlen(keybuf)-1] = CHARSET[i+1];
}
}