Double type problem

thamerhatem · August 9, 2011, 8:28am

Hi, I’m trying to use the double type in OpenCL, but it doesn’t work. I want to use double for more precision; if any other type can provide this, please tell me.

If you don’t have time to read my code, the summary is: I use double inside the kernel source and in the main code. I tried other things like double_t, float_t, double2, half… nothing works.

My code:
#pragma OPENCL EXTENSION cl_amd_fp64 : enable

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <OpenCL/opencl.h>

////////////////////////////////////////////////////////////////////////////////

// Use a static data size for simplicity: the number of series terms computed
// on the device (one work-item per term).
//
#define DATA_SIZE (1000000)
////////////////////////////////////////////////////////////////////////////////
// Host-side element type of the buffers. It must stay in sync with the
// `double` used in the kernel source below.
#define TIPO double
// OpenCL C source of the compute kernel, passed to clCreateProgramWithSource().
//
// calcpi(input, output, count) writes output[i] = 4 / (2*i + 1) for every
// work-item index i < count. `input` is not read by the kernel; it is kept so
// the argument indices used by the host code stay the same.
//
// Double precision is an optional feature in OpenCL 1.x and has to be enabled
// with a pragma *inside the kernel source*. The standard extension is
// cl_khr_fp64; cl_amd_fp64 is a vendor-specific alternative. An OpenCL C
// compiler defines a macro named after each extension it supports, so the
// source picks whichever one is available and fails the build with a clear
// message otherwise.
//
const char *KernelSource =
    "\n"
    "#if defined(cl_khr_fp64)\n"
    "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
    "#elif defined(cl_amd_fp64)\n"
    "#pragma OPENCL EXTENSION cl_amd_fp64 : enable\n"
    "#else\n"
    "#error double precision (fp64) is not supported by this OpenCL device\n"
    "#endif\n"
    "\n"
    "__kernel void calcpi(\n"
    "    __global double *input,\n"
    "    __global double *output,\n"
    "    const unsigned int count)\n"
    "{\n"
    "    size_t i = get_global_id(0);\n"
    "    if (i < count)\n"
    "        output[i] = 4.0 / (double)(2 * i + 1);\n"
    "}\n"
    "\n";

////////////////////////////////////////////////////////////////////////////////

int main(int argc, char** argv)
{
int err; // error code returned from api calls
//printf("%d",sizeof(TIPO));
//scanf("%d",&err);
TIPO data[2]; // original data set given to device
TIPO *results = malloc(sizeof(TIPO)*DATA_SIZE); // results returned from device
//unsigned int correct; // number of correct results returned
//printf(“TESTE”);

size_t global;                    // global domain size for our calculation
size_t local;                     // local domain size for our calculation

cl_device_id device_id;           // device ID
cl_context context;               // context
cl_command_queue queue;           // command queue
cl_program program;               // program
cl_kernel kernel;                 // kernel

cl_mem input;                     // device memory used for the input array
cl_mem output;                    // device memory used for the output array

// Get data on which to operate
//

//int i = 0;
//int n = 3;
unsigned int count = DATA_SIZE;
//for(i = 0; i &lt; count; i+=2) {
    //data[i] = n;
	//n += 2;
//}
//printf("TESTE");
// Get an ID for the device                                    [2]
int gpu = 1;
err = clGetDeviceIDs(NULL, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1,&device_id, NULL);
if (err != CL_SUCCESS)
	printf("ERROR CLGETDEVICEIDS!

"); // [3]

// Create a context                                            [4]
//
context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
if (!context) {
	printf("ERROR CONTEXT

");
}

// Create a command queue                                              [5]
//
queue = clCreateCommandQueue(context, device_id, 0, &err);
if (!queue) {
	printf("ERROR QUEUE

");
}

// Create the compute program from the source buffer                   [6]
//
program = clCreateProgramWithSource(context, 1,(const char **) & KernelSource, NULL, &err);
if ( !program) {
	printf("ERROR PROGRAM

");
}

// Build the program executable                                        [7]
//
err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
if (err != CL_SUCCESS)
{
    size_t len;
    char buffer[2048];
	
    printf("Error: Failed to build program executable

“); //[8]
clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG,sizeof(buffer), buffer, &len);
printf(”%s
", buffer);
exit(1);
}

// Create the compute kernel in the program we wish to run            [9]
//
kernel = clCreateKernel(program, "calcpi", &err);
if (!kernel || err != CL_SUCCESS) {
	printf("ERROR KERNEL OR CL_SUCESS

");
}

// Create the input and output arrays in device memory for our calculation
//                                                                   [10]
input = clCreateBuffer(context,  CL_MEM_READ_ONLY,  sizeof(TIPO) *count,NULL, NULL);
output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(TIPO) *count,NULL, NULL);
if (!input || !output) {
	printf("ERROR !INPUT OR !OUTPUT

");
}

// Write our data set into the input array in device memory          [11]
//
err = clEnqueueWriteBuffer(queue, input, CL_TRUE, 0,sizeof(TIPO) *2, data, 0, NULL, NULL);
if (err != CL_SUCCESS) {
	printf("ERROR WRITE OUR DATA

");
}

// Set the arguments to our compute kernel                           [12]
//
err = 0;
err  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input);
err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
err |= clSetKernelArg(kernel, 2, sizeof(unsigned int), &count);
if (err != CL_SUCCESS) {
	printf("ERROR ARGUMENTS COMPUTE KERNEL - ERROR NUMBER: %d

",err);
exit(1);
}

// Get the maximum work-group size for executing the kernel on the device
//                                                                   [13]
err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE,sizeof(size_t), &local, NULL);
if (err != CL_SUCCESS) {
	printf("ERROR MAXIMUM WORK-GROUP - ERROR NUMBER: %d

",err);
exit(1);
}

// Execute the kernel over the entire range of the data set          [14]
//
global = count;
//printf("TESTE");
err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, NULL,0, NULL, NULL);
if (err) {
	printf("ERROR EXECUTE KERNEL - ERROR NUMBER: %d

",err);
printf("ERROS: CL_INVALID_PROGRAM_EXECUTABLE %d
", CL_INVALID_PROGRAM_EXECUTABLE);
printf("ERROS: CL_INVALID_COMMAND_QUEUE %d
", CL_INVALID_COMMAND_QUEUE);
printf("ERROS: CL_INVALID_KERNEL %d
", CL_INVALID_KERNEL);
printf("ERROS: CL_INVALID_CONTEXT %d
", CL_INVALID_CONTEXT);
printf("ERROS: CL_INVALID_KERNEL_ARGS %d
", CL_INVALID_KERNEL_ARGS);
printf("ERROS: CL_INVALID_WORK_DIMENSION %d
", CL_INVALID_WORK_DIMENSION);
printf("ERROS: CL_INVALID_WORK_GROUP_SIZE %d
", CL_INVALID_WORK_GROUP_SIZE);
printf("ERROS: CL_MEM_OBJECT_ALLOCATION_FAILURE %d
", CL_MEM_OBJECT_ALLOCATION_FAILURE);
printf("ERROS: CL_INVALID_WORK_ITEM_SIZE %d
", CL_INVALID_WORK_ITEM_SIZE);
printf("ERROS: CL_INVALID_GLOBAL_OFFSET %d
", CL_INVALID_GLOBAL_OFFSET);
printf("ERROS: CL_OUT_OF_RESOURCES %d
", CL_OUT_OF_RESOURCES);
printf("ERROS: CL_INVALID_EVENT_WAIT_LIST %d
", CL_INVALID_EVENT_WAIT_LIST);
printf("ERROS: CL_OUT_OF_HOST_MEMORY %d
", CL_OUT_OF_HOST_MEMORY);
exit(1);
}

// Wait for the command queue to get serviced before reading back results
//                                                                   [15]
clFinish(queue);

// Read the results from the device                                  [16]
//
err = clEnqueueReadBuffer(queue, output, CL_TRUE, 0,sizeof(TIPO) *count, results, 0, NULL, NULL );
if (err != CL_SUCCESS) {
	printf("ERROR READ RESULTS - ERROR NUMBER: %d

“,err);
}
//printf(“TESTE”);
TIPO pi = 0.0;
int i;
for (i=0;i<count-1;i++) {
//printf(”%f",results[i]);
pi += (pow(-1.0,i)) * (TIPO) results[i];
//pi = (TIPO) results[i];
//printf(“casa %d deu: %1.50f
“,i,pi);
//printf(”%f”,(pow(-1,i)));
//pi += (pow(-1.0,i));
}
printf(“PI: %1.50f”,pi);

// Shut down and clean up
//
clReleaseMemObject(input);
clReleaseMemObject(output);
clReleaseProgram(program);
clReleaseKernel(kernel);
clReleaseCommandQueue(queue);
clReleaseContext(context);

scanf("%d",&i);
return 0;

}

david.garcia · August 9, 2011, 4:07pm

If you don’t have time to read my code, the summary is: I use double inside the kernel source and in the main code. I tried other things like double_t, float_t, double2, half… nothing works.

Can you be more specific? Do you get an error code when you call clBuildProgram()? A crash? Something else?

#pragma OPENCL EXTENSION cl_amd_fp64 : enable

This is an AMD-exclusive extension. Don’t be offended, but are you running this on an AMD machine? Have you confirmed whether your device supports that extension? Use clGetDeviceInfo(…, CL_DEVICE_EXTENSIONS, …) to find out.

You can also try this instead:

#pragma OPENCL EXTENSION cl_khr_fp64 : enable

thamerhatem · August 9, 2011, 7:20pm

Thank you for your answer, and…
The problem occurs when I try to use double instead of float; with float it works correctly.
I don’t get any error, but the number returned is wrong. When I set output[i] = 1.0 with output being float, there are no problems… when I do the same with double, I have a problem.
The problem is… a wrong number, not errors or anything like that.
Do you understand? =)

My processor is Intel, and the GPU is an ATI Radeon HD 6750M in a MacBook Pro.

ajs2 · August 10, 2011, 11:28am

The AMD extension cl_amd_fp64 is not supported by Apple’s OpenCL framework. On that configuration, the Khronos extension cl_khr_fp64 is available for the CPU.

thamerhatem · August 10, 2011, 8:11pm

pts… do you know any alternative?