Hey all
I'm trying to set up an application to do some calculations on my video card. The problem is that my CPU is much faster than the GPU.
When I start the program, I get the following message:
Connecting to NVIDIA GeForce 320M,
max_compute_units: 6
max_work_groub_size: 512
max_work_item_dimensions: 3
Is it working on any of your systems? If so, where is my mistake?
The Kernel:
/*
 * add: OpenCL kernel operating element-wise on three float buffers.
 *
 * Each work-item handles the single index returned by get_global_id(0).
 * No bounds check is performed on that index, so the host must not launch
 * more work-items than the buffers have elements.
 *
 * a, b    input buffers (only read here)
 * answer  output buffer; element gid is written repeatedly
 *
 * The kernel first stores a[gid] + b[gid], then runs 12 passes that each
 * overwrite answer[gid] with a trigonometric expression and immediately
 * divide it by the same expression. The passes exist purely as arithmetic
 * load; the expression is deliberately written out twice per pass, exactly
 * as in the unrolled original, rather than hoisted into a temporary.
 */
__kernel void
add(__global const float *a,
    __global const float *b,
    __global float *answer)
{
    int gid = get_global_id(0);

    answer[gid] = a[gid] + b[gid];

    for (int pass = 0; pass < 12; ++pass) {
        answer[gid] = 0.46 * 0.48 * 6.54 * 4.21 * (10.56 * sin(a[gid]) + 3.47 * cos(b[gid]) * b[gid] * a[gid]);
        answer[gid] /= 0.46 * 0.48 * 6.54 * 4.21 * (10.56 * sin(a[gid]) + 3.47 * cos(b[gid]) * b[gid] * a[gid]);
    }
}
My main.c:
#ifdef APPLE
#include <OpenGL/OpenGL.h>
#include <GLUT/glut.h>
//#include <OpenGL/glu.h>
#else
#include <GL/glut.h>
//#include <GL/glu.h>
#endif
#include <OpenCL/OpenCL.h>
#include <iostream>
#include <assert.h>
#include <sys/sysctl.h>
#include <sys/stat.h>
#include <stdlib.h>
#include <stdio.h>
#pragma mark -
#pragma mark Utilities
/*
 * Read the entire file `filename` into a newly allocated, NUL-terminated
 * buffer.
 *
 * Returns the buffer on success; the caller owns it and must free() it.
 * Returns NULL if the file cannot be opened, its size cannot be obtained,
 * the buffer cannot be allocated, or a read error occurs.
 */
char * load_program_source(const char *filename)
{
    struct stat statbuf;
    FILE *fh;
    char *source;
    size_t size;
    size_t bytes_read;

    fh = fopen(filename, "r");
    if (fh == NULL)
        return NULL;

    if (stat(filename, &statbuf) != 0 || statbuf.st_size < 0) {
        fclose(fh);
        return NULL;
    }
    size = (size_t)statbuf.st_size;

    /* Cast kept so the file also builds as C++ (it includes <iostream>). */
    source = (char *)malloc(size + 1);
    if (source == NULL) {
        fclose(fh);
        return NULL;
    }

    /* Item size 1 so the return value is the number of bytes actually read. */
    bytes_read = fread(source, 1, size, fh);
    if (ferror(fh)) {
        free(source);
        fclose(fh);
        return NULL;
    }
    fclose(fh);

    /* Terminate at what was read, which may be less than st_size. */
    source[bytes_read] = '\0';
    return source;
}
#pragma mark -
#pragma mark Main OpenCL Routine
int runCL(float * a, float * b, float * results, int n)
{
cl_program program[1];
cl_kernel kernel[1];
cl_command_queue cmd_queue;
cl_context context;
cl_device_id cpu = NULL, device = NULL;
cl_int err = 0;
size_t returned_size = 0;
size_t buffer_size;
cl_mem a_mem, b_mem, ans_mem;
#pragma mark Device Information
{
// Find the CPU CL device, as a fallback
//26:00
err = clGetDeviceIDs(NULL, CL_DEVICE_TYPE_CPU, 1, &cpu, NULL);
assert(err == CL_SUCCESS);// Find the GPU CL device, this is what we really want // If there is no GPU device is CL capable, fall back to CPU err = clGetDeviceIDs(NULL, CL_DEVICE_TYPE_GPU, 1, &device, NULL); //if (err != CL_SUCCESS) device = cpu; assert(device); // Get some information about the returned device cl_char vendor_name[1024] = {0}; cl_char device_name[1024] = {0}; cl_uint max_compute_units = 0; size_t max_work_groub_size = 0; cl_uint max_work_item_dimensions = 0; //27:00 err = clGetDeviceInfo(device, CL_DEVICE_VENDOR, sizeof(vendor_name), vendor_name, &returned_size); err |= clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_name), device_name, &returned_size);
err |= clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(max_compute_units), &max_compute_units, &returned_size);
err |= clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(max_work_groub_size), &max_work_groub_size, &returned_size);
err |= clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(max_work_item_dimensions), &max_work_item_dimensions, &returned_size);
assert(err == CL_SUCCESS); printf("Connecting to %s %s,
max_compute_units: %d
max_work_groub_size: %zu
max_work_item_dimensions: %d…
", vendor_name, device_name, max_compute_units, max_work_groub_size, max_work_item_dimensions);
}
#pragma mark Context and Command Queue
{
// Now create a context to perform our calculation with the
// specified device
context = clCreateContext(0, 1, &device, NULL, NULL, &err);
assert(err == CL_SUCCESS);// And also a command queue for the context cmd_queue = clCreateCommandQueue(context, device, 0, NULL);
}
#pragma mark Program and Kernel Creation
{
// Load the program source from disk
// The kernel/program is the project directory and in Xcode the executable
// is set to launch from that directory hence we use a relative path
const char * filename = “example.cl”;
char program_source = load_program_source(filename);
program[0] = clCreateProgramWithSource(context, 1, (const char*)&program_source,
NULL, &err);assert(err == CL_SUCCESS); // 28:40 err = clBuildProgram(program[0], 0, NULL, NULL, NULL, NULL); assert(err == CL_SUCCESS); // Now create the kernel "objects" that we want to use in the example file kernel[0] = clCreateKernel(program[0], "add", &err);
}
#pragma mark Memory Allocation
{// Allocate memory on the device to hold our data and store the results into buffer_size = sizeof(float) * n; // Input array a //30:10 a_mem = clCreateBuffer(context, CL_MEM_READ_ONLY, buffer_size, NULL, NULL); //32:20 err = clEnqueueWriteBuffer(cmd_queue, a_mem, CL_TRUE, 0, buffer_size, (void*)a, 0, NULL, NULL); // Input array b b_mem = clCreateBuffer(context, CL_MEM_READ_ONLY, buffer_size, NULL, NULL); err |= clEnqueueWriteBuffer(cmd_queue, b_mem, CL_TRUE, 0, buffer_size, (void*)b, 0, NULL, NULL); assert(err == CL_SUCCESS); // Results array ans_mem= clCreateBuffer(context, CL_MEM_READ_WRITE, buffer_size, NULL, NULL); // Get all of the stuff written and allocated clFinish(cmd_queue);
}
#pragma mark Kernel Arguments
{// Now setup the arguments to our kernel //33:48 err = clSetKernelArg(kernel[0], 0, sizeof(cl_mem), &a_mem); err |= clSetKernelArg(kernel[0], 1, sizeof(cl_mem), &b_mem); err |= clSetKernelArg(kernel[0], 2, sizeof(cl_mem), &ans_mem); assert(err == CL_SUCCESS);
}
#pragma mark Execution and Read
{// Run the calculation by enqueuing it and forcing the // command queue to complete the task size_t global_work_size = n; //33:59 err = clEnqueueNDRangeKernel(cmd_queue, kernel[0], 1, NULL, &global_work_size, NULL, 0, NULL, NULL); assert(err == CL_SUCCESS); clFinish(cmd_queue); // Once finished read back the results from the answer // array into the results array //35:35 err = clEnqueueReadBuffer(cmd_queue, ans_mem, CL_TRUE, 0, buffer_size, results, 0, NULL, NULL); assert(err == CL_SUCCESS); clFinish(cmd_queue);
}
#pragma mark Teardown
{
clReleaseMemObject(a_mem);
clReleaseMemObject(b_mem);
clReleaseMemObject(ans_mem);clReleaseCommandQueue(cmd_queue); clReleaseContext(context);
}
return CL_SUCCESS;
}
int main(int argc, char **argv) {
// Problem size
// int n = 20481616164;
int n = 40;// Allocate some memory and a place for the results
float * a = (float )malloc(nsizeof(float));
float * b = (float )malloc(nsizeof(float));
float * results = (float )malloc(nsizeof(float));// Fill in the values
for(int i=0;i<n;i++) {
a[i] = (float)i;
b[i] = (float)n-i;
results[i] = 0.f;
}// Do the OpenCL calculation
runCL(a, b, results, n);// Print out some results.
// for(int i=0;i<n;i++)
//if (i+1 != results[i])
// printf("%f
",results[i]);
printf("%f
",results[n-1]);// Free up memory
free(a);
free(b);
free(results);return 0;
}
thank you