OpenCL Addition Example (Mac OS X 10.6 Snow  Leopard)

/*
 
 File: compute_sumints.c
 
 Abstract: source for compute_sum kernel and initialization and runtime 
           code for summing integers in and OpenCL kernel
 
 Version: 1.0
 
 Disclaimer: IMPORTANT:  This Apple software is supplied to you by 
 Apple Inc. ("Apple") in consideration of your agreement to the
 following terms, and your use, installation, modification or
 redistribution of this Apple software constitutes acceptance of these
 terms.  If you do not agree with these terms, please do not use,
 install, modify or redistribute this Apple software.
 
 In consideration of your agreement to abide by the following terms, and
 subject to these terms, Apple grants you a personal, non-exclusive
 license, under Apple's copyrights in this original Apple software (the
 "Apple Software"), to use, reproduce, modify and redistribute the Apple
 Software, with or without modifications, in source and/or binary forms;
 provided that if you redistribute the Apple Software in its entirety and
 without modifications, you must retain this notice and the following
 text and disclaimers in all such redistributions of the Apple Software. 
 Neither the name, trademarks, service marks or logos of Apple Inc. 
 may be used to endorse or promote products derived from the Apple
 Software without specific prior written permission from Apple.  Except
 as expressly stated in this notice, no other rights or licenses, express
 or implied, are granted by Apple herein, including but not limited to
 any patent rights that may be infringed by your derivative works or by
 other works in which the Apple Software may be incorporated.
 
 The Apple Software is provided by Apple on an "AS IS" basis.  APPLE
 MAKES NO WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION
 THE IMPLIED WARRANTIES OF NON-INFRINGEMENT, MERCHANTABILITY AND FITNESS
 FOR A PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND
 OPERATION ALONE OR IN COMBINATION WITH YOUR PRODUCTS.
 
 IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL
 OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 INTERRUPTION) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION,
 MODIFICATION AND/OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED
 AND WHETHER UNDER THEORY OF CONTRACT, TORT (INCLUDING NEGLIGENCE),
 STRICT LIABILITY OR OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE
 POSSIBILITY OF SUCH DAMAGE.
 
 Copyright (C) 2008 Apple Inc. All Rights Reserved.
 
 */

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include <stdbool.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <OpenCL/opencl.h>
#include <mach/mach_time.h>

static cl_device		device;
static cl_context		context;

const char *sum_kernel_code = 
"__kernel void compute_sum(__global int *a, int n, __local long *tmp_sum, __global long *sum)
"
"{
"
"    int  tid = get_local_thread_id(0);
"
"    int  lsize = get_local_thread_size(0);
"
"    int  i;
"
"
"
"    tmp_sum[tid] = 0;
"
"    for (i=tid; i<n; i+=lsize)
"
"        tmp_sum[tid] += a[i];
"
"
"
"    for (i=lsize/2; i>0; i/=2)
"
"    {
"
"        barrier(CL_GLOBAL_MEM_FENCE);
"
"        if (tid < i)
"
"            tmp_sum[tid] += tmp_sum[tid + i];
"
"    }
"
"
"
"    if (tid == 0)
"
"        *sum = tmp_sum[0];
"
"}
";


static int
verify_sum(int *inptr, long long *outptr, int n)
{
    long long	r = 0;
    int         i;
    
    for (i=0; i<n; i++)
    {
		r += inptr[i];
    }
	
	if (r != outptr[0])
	{
		printf("sum of ints test failed
");
		return -1;
	}
    
    printf("sum of ints test passed
");
    return 0;
}

int
compute_sumints(int num_elements, long long *compute_sum, float *compute_time)
{
	cl_mem						streams[2];
	long long					sum;
	int							*input_ptr;
	cl_program					program;
	cl_kernel					kernel;
	void						*values[4];
	size_t						sizes[4] = { sizeof(cl_mem), sizeof(int), 0, sizeof(cl_mem) };
	size_t						lengths[1];
	unsigned int				global_threads[1];
	unsigned int				local_threads[1];
	int							err;
	unsigned int				max_threadgroup_size;
	int							i;
	cl_device_id				device_id;
	uint64_t					t0, t1;
	struct mach_timebase_info	info;
	
	mach_timebase_info(&info);
	
	printf( "computing sum for %d randomly generated ints
", num_elements );
	input_ptr = malloc(sizeof(int) * num_elements);
	for (i=0; i<num_elements; i++)
		input_ptr[i] = (int)rand();
	
	err = clGetDeviceGroupInfo(device, CL_DEVICE_IDS, &device_id, sizeof(cl_device_id), NULL);
	if (err != CL_SUCCESS) {
		printf( "clGetDeviceGroupInfo failed
" );
		return -1;
	}

	clGetDeviceConfigInfo(device_id, CL_DEVICE_MAX_THREAD_GROUP_SIZE, &max_threadgroup_size, sizeof(unsigned int), NULL);

	lengths[0] = strlen(sum_kernel_code);
    program = clCreateProgramWithSource(device, 1, &sum_kernel_code, lengths);
	if (!program)
	{
		printf("clCreateProgramWithSource failed
");
		return -1;
	}
	
	err = clBuildProgramExecutable(program, false, NULL, NULL);
	if (err != CL_SUCCESS)
	{
		printf("clBuildProgramExecutable failed
");
		return -1;
	}
	
	kernel = clCreateKernel(program, "compute_sum");
	if (!kernel)
	{
		printf("clCreateKernel failed
");
		return -1;
	}

	streams[1] = clCreateArray(device, CL_MEM_ALLOC_GLOBAL_POOL, sizeof(long long), 1, NULL);
	if (!streams[1])
	{
		printf("clCreateArray failed
");
		return -1;
	}
	
	// begin timing
	t0 = mach_absolute_time();
	streams[0] = clCreateArray(device, CL_MEM_ALLOC_GLOBAL_POOL | CL_MEM_COPY_HOST_PTR, sizeof(int), num_elements, input_ptr);
	if (!streams[0])
	{
		printf("clCreateArray failed
");
		return -1;
	}
		
	sizes[0] = sizeof(cl_mem);  values[0] = streams[0];
	sizes[1] = sizeof(int);     values[1] = (void *)&num_elements;
	sizes[2] = max_threadgroup_size*sizeof(long long); values[2] = NULL;
	sizes[3] = sizeof(cl_mem);  values[3] = streams[1];
	err = clSetKernelArgs(context, kernel, 4, NULL, values, sizes);
	if (err != CL_SUCCESS)
	{
		printf("clSetKernelArgs failed
");
		return -1;
	}

	global_threads[0] = max_threadgroup_size;
	local_threads[0] = max_threadgroup_size;
    err = clExecuteKernel(context, kernel, NULL, global_threads, local_threads, 1, NULL, 0, NULL);
    if (err != CL_SUCCESS)
    {
        printf("clExecuteKernel failed
");
        return -1;
    }
	
    err = clReadArray(context, streams[1], false, 0, sizeof(long long), (void *)&sum, NULL);
    if (err != CL_SUCCESS)
    {
        printf("clReadArray failed
");
        return -1;
    }
    // end timing
	t1 = mach_absolute_time();
	
	{
		
		if (compute_time) *compute_time = 1e-9 * (t1 - t0) * info.numer / info.denom;
	}
    err = verify_sum(input_ptr, &sum, num_elements);
	
	if (compute_sum) *compute_sum = sum;
    
	// cleanup
	clReleaseMemObject(streams[0]);
	clReleaseMemObject(streams[1]);
	clReleaseKernel(kernel);
	clReleaseProgram(program);
	free(input_ptr);
	
	return err;
}

int
init_compute()
{
	cl_device_id compute_device_id[2];
	unsigned int num_devices = 0;
	int return_value = 0;
	
    return_value = clGetComputeDevices(CL_DEVICE_TYPE_GPU, 2, compute_device_id, &num_devices);
    if(return_value || 0 == num_devices) {
		printf( "clGetComputeDevices failed (with %d devices available)
", num_devices );
        return -1;
	}
	
    device = clCreateDeviceGroup(1, &compute_device_id[0]);
	if (!device)
	{
		printf("clCreateDeviceGroup failed
");
		return -1;
	}
	
	context = clCreateContext(0, device);
	if (!context)
	{
		printf("clCreateContext failed
");
		return -1;
	}
	
	return 0;
}

void release_compute()
{
	clReleaseContext(context);
	clReleaseDeviceGroup(device);
}

#if 0
int
main(int argc, char *argv[])
{
	if (init_compute())
		return -1;
	
	int r = compute_sumints(1024*1024, NULL, NULL);
	release_compute();
	return r;
}
#endif



Because this forum is not support attachment, so if u want the whole Sample code, pls PM with ur email :slight_smile:

Thanks - nice to see some Mac OS X 10.6 devs here !
I pm éd you for some little demo sources.

Anyone other has an OpenCL Example (with Xcode project file) for me ?
Didnt get an answer from above (perhaps he is in holidays :wink: )
Thanks


#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include <stdbool.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <OpenCL/opencl.h>
#include <mach/mach_time.h>

const char * sProgramSource = 
"__kernel void vectorAdd(              
" \
"__global const float * a,             
" \
"__global const float * b,             
" \
"__global	float * c)                 
" \
"{                                     
" \
"	// Vector element index            
" \
"	int nIndex = get_global_id(0);     
" \
"	c[nIndex] = a[nIndex] + b[nIndex]; 
" \
"}                                     
";

int main (int argc, const char * argv[])
{
	const unsigned int cnBlockSize= 512;
	const unsigned int cnBlocks =3;
	size_t cnDimension = cnBlocks * cnBlockSize;
	int err;
	cl_device_id device_id;
	size_t local;
	size_t len;
	char buffer[2048];

	int gpu = 1;
	err = clGetDeviceIDs(NULL, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL);
	if (err != CL_SUCCESS)
	{
		printf("Error: Failed to get device ID
");
		exit(1);
	}

	err = clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(buffer), buffer, &len);
	printf("CL_DEVICE_NAME: %s
", buffer);
	err = clGetDeviceInfo(device_id, CL_DEVICE_VENDOR, sizeof(buffer), buffer, &len);
	printf("CL_DEVICE_VENDOR: %s
", buffer);

	// create OpenCL device & context
	cl_context hContext;
	hContext = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
	if (err != CL_SUCCESS)
	{
		printf("Error: Failed to create context
");
		exit(1);
	}
	
	// create a command queue for our device
	cl_command_queue hCmdQueue;
	hCmdQueue = clCreateCommandQueue(hContext, device_id, 0, 0);

	// create & compile program
	cl_program hProgram;
	hProgram = clCreateProgramWithSource(hContext, 1, (const char **) &sProgramSource, NULL, &err);
	if (!hProgram || err != CL_SUCCESS)
	{
		printf("Error: Failed to Create program with source
");
		exit(1);
	}

	err = clBuildProgram(hProgram, 0, NULL, NULL, NULL, NULL);
	if (err != CL_SUCCESS)
	{
		printf("Error: Failed to build program executable
");
		clGetProgramBuildInfo(hProgram, device_id, CL_PROGRAM_BUILD_LOG,
							  sizeof(buffer), buffer, &len);
		printf("%s
", buffer);
		exit(1);
	}

	// create kernel
	cl_kernel hKernel;
	hKernel = clCreateKernel(hProgram, "vectorAdd", &err);
	if (!hKernel || err != CL_SUCCESS)
	{
		printf("Error: Failed to create kernel
");
		exit(1);
	}

	// allocate host vectors
	float * pA = new float[cnDimension];
	float * pB = new float[cnDimension];
	float * pC = new float[cnDimension];
	float * pC1 = new float[cnDimension];
	
	memset(pC, 0, cnDimension * sizeof(float));
	memset(pC1, 0, cnDimension * sizeof(float));

	// initialize host memory
	int i;
	for(i=0; i < cnDimension; i++)
	{
		pA[i] = pC[i] = pC1[i] = 0;
		pB[i] = i;
//		pA[i] = rand() % 10 + 1;
//		pB[i] = rand() % 10 + 1;
	}

	// allocate device memory
	cl_mem hDeviceMemA, hDeviceMemB, hDeviceMemC;
	hDeviceMemA = clCreateBuffer(hContext,
								 CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, cnDimension * sizeof(cl_float), pA, 0);
	hDeviceMemB = clCreateBuffer(hContext,
								 CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, cnDimension * sizeof(cl_float), pB, 0);
	hDeviceMemC = clCreateBuffer(hContext,
								 CL_MEM_WRITE_ONLY, cnDimension * sizeof(cl_float), 0, 0);

	// setup parameter values
	err = 0;
	err  = clSetKernelArg(hKernel, 0, sizeof(cl_mem), (void *)&hDeviceMemA);
	err |= clSetKernelArg(hKernel, 1, sizeof(cl_mem), (void *)&hDeviceMemB);
	err |= clSetKernelArg(hKernel, 2, sizeof(cl_mem), (void *)&hDeviceMemC);
	if (err != CL_SUCCESS)
	{
		printf("Error: Failed to set kernel args
");
		exit(1);
	}

	// Get the maximum work-group size for executing the kernel on the device
	err = clGetKernelWorkGroupInfo(hKernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);
	if (err != CL_SUCCESS)
	{
		printf("Error: clGetKernelWorkGroupInfo Failed
");
		exit(1);
	}
	
	// execute kernel
	err = clEnqueueNDRangeKernel(hCmdQueue, hKernel, 1, NULL, (size_t*)(&cnDimension), &local, 0, NULL, NULL);
	if (err != CL_SUCCESS)
	{
		printf("Error: clEnqueueNDRangeKernel Failed
");
		exit(1);
	}
	// copy results from device back to host
	clEnqueueReadBuffer(hCmdQueue, hDeviceMemC, CL_TRUE, 0, cnDimension * sizeof(cl_float),
						pC, 0, NULL, NULL);

	// wait for command queue
	clFinish(hCmdQueue);

	bool valid = true;
	for(i=0; i < cnDimension; i++)
	{
		pC1[i] = pA[i] + pB[i];
		if (pC[i] != pC1[i])
		{
			printf("Error: %0.2f != %0.2f
", pC[i], pC1[i]);
			valid = false;
		}
	}

	printf("Number of elements : %d
", cnDimension);
	printf("First Element: %0.2f
", pC[0]);
	printf("Last Element : %0.2f

", pC[cnDimension-1]);
	if (valid) {
		printf("Test passed
");
	}
	else {
		printf("Test failed
");
	}

	
	delete[] pA;
	delete[] pB; 
	delete[] pC;
	delete[] pC1;
	
	clReleaseMemObject(hDeviceMemA); 
	clReleaseMemObject(hDeviceMemB); 
	clReleaseMemObject(hDeviceMemC);
	clReleaseProgram(hProgram);
	clReleaseKernel(hKernel);
	clReleaseCommandQueue(hCmdQueue);
	clReleaseContext(hContext);
    return 0;
}

Thanks !

How about an thing like that (raytracing on GPU, with sample source code buts CUDA).

http://cg.alexandra.dk/2009/08/10/trier … -tutorial/

Sorry,
i tried to compile both OpenCL .c examples with Xode (10.6, as command line projects).
Both cant be compiled.
Different errors - some definition errors (CL… not declared), some compile errors
float *pa = new float (xyz):

Can someone upload that examples as .xcodeproject files (zipped, really small!) which then will work (right settings for Librarys/ compiler + code fixes) ?

Thanks

hey guys, I’m sorry for reply so late…

I just send out the code to you, pls check ur inbox

THANKS.
I will look on it.

I got hat example to run with that

const char *[b] sProgramSource[/b] = 
"__kernel void vectorAdd(              
" \
"__global const float * a,             
" \
"__global const float * b,             
" \
"__global   float * c)                 
" \
"{                                     
" \
"   // Vector element index            
" \
"   int loop;            
" \
"   int nIndex = get_global_id(0);     
" \
"   [b]for (loop=1; loop< 5000; loop++)[/b]   
" \      *** changed by me to run longer on GPU  ***
"{                                     
" \


"   c[nIndex] = a[nIndex] + b[nIndex]; 
" \

"}                                     
" \

"}                                     
";

But it will run only on NVIDIAs - from 9600M GT up to GTX 285 - no problems.
9600M GT = 15 sec, GTX285 = 0,8 sec
CPU from 3,8 Sec i7 920@4 GHZ down to 100 sec C2Mobile 2 GHZ.

ATI Users (OS X, 10.6) reported that OS X complete freezes when they run the Bench.

I posted the code (V020) and xcodeproject here:

Would be fine if i will get some help to fix that freeze problem with ATI.

So, OpenCL is not an “fire & forget” , i must do some extra coding for GPU differences ?
Any help would be fine !

Also, i dont know what to do with that CL.hpp - if i include it, i get > 400 compiler errors.

I changed some code of the source part (smaller loop for the vector adds).
Much error handling added.
Works now on ATI 4870 /OS X 10.6), but runs way slower.
ATI 4870 : 4 sec, Geforce 285 : 0,17 sec , Geforce 9600GT : 0,93 sec, Geforce 9600M : 5 sec, Geforce 9400M: 15 sec
Now V025.
Sourcecode same link as post before.

I thried also the OpenCL Example1 from the kind user out of china.
But i get lots of errors at compiling, even if i used you complete “pack” as Xcode project.
Normally should work.
Some cl OpenCL calls are definitly NOT found in the OpenCL Standard ( i checked that).
Also some CL_MAX… constants are not defined and cant be found also in the OpenCL documentation too.
And some cl OpenCL calls have to less values given with.
Question: Did you compiled that own, or do you only have the source and shared that.
If you get that compiled for OS X 10.6, please pm that small execute (zipped) to me.
Example2 gave much less errors like undfinded constants and undefined OpenCL calls,

Question:
I am a bit confused , because the bech works so good an all Nvidias and near not on any ATIs.

Examples of errors (OpenCL example1)
compute_sumints.c

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include <stdbool.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <OpenCL/opencl.h>
#include <mach/mach_time.h>
...
static cl_device		device;  
> no cl_device defined, i must use cl_device[b]_id[/b]

err = clGetDeviceGroupInfo(device, CL_DEVICE_IDS, &device_id, sizeof(cl_device_id), NULL);
> clGetDevice[b]Group[/b]Info call doesnt exist, only the clGetDeviceInfo, [b]CL_DEVICE_IDS[/b] doesnt exist in OpenCL

clGetDeviceConfigInfo(device_id, CL_DEVICE_MAX_THREAD_GROUP_SIZE, &max_threadgroup_size, sizeof(unsigned int), NULL);
> clGetDevice[b]Config[/b]Info doesnt exist in OpenCL, CL_DEVICE_MAX_[b]THREAD[/b]_GROUP_SIZE doesnt exist ,only ..._MAX_[b]WORK[/b]_GROUP_....


and so on. Was easy to fix cl calls with to less values, but how fix complete unknow openCl calls ?

Thanks for any help.

mitchde,
The performance difference comes mainly from the fact that you are not fully utilizing the ATI architecture. The ATI architecture is a 5-way VLIW, where as Nvidia is a scalar architecture. If you want to see performance on ATI hardware, you need to program using the vector types that OpenCL provides.

Thanks !

I now will try the float4 insted of float:

/*const char * sProgramSource = 
"__kernel void vectorAdd(              
" \
"__global const float * a,             
" \
"__global const float * b,             
" \
"__global   float * c)                 
" \
"{
" \
"   // Vector element index            
" \
"   int loop;            
" \
"   int test1;            
" \
"   int nIndex = get_global_id(0);     
" \
"   for (loop=1; loop< 1000; loop++)
" \
"{
" \
"   c[nIndex] = a[nIndex] + b[nIndex]; 
" \
"   c[nIndex] = c[nIndex] * (a[nIndex] + b[nIndex]); 
" \
"   c[nIndex] = c[nIndex] * (a[nIndex] / 2.0 ); 
" \
"}
" \

"}
"; 
 */

const char * sProgramSource = 
 "__kernel void vectorAdd(              
" \
 "__global const float4 * a,             
" \
 "__global const float4 * b,             
" \
 "__global   float4 * c)                 
" \
 "{
" \
 "   // Vector element index            
" \
 "   int loop;            
" \
 "   int test1;            
" \
 "   int nIndex = get_global_id(0);     
" \
 "
 "   c[nIndex] = a[nIndex] + b[nIndex]; 
" \
 
 "}
"; 

I changed also the OpenCL calls from float to float4.

hDeviceMemA = clCreateBuffer(hContext,
CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, cnDimension * sizeof(cl_float4), pA, &err);

(for all 3 : pA, pB, pC)
and
err = clEnqueueReadBuffer(hCmdQueue, hDeviceMemC, CL_TRUE, 0, cnDimension * sizeof(cl_float4),
pC, 0, NULL, NULL);

I will report what changed, at least it compiled :wink:

Upps.
I have seen that AMD itself uses float or float2 , not float4.
i now will remove that loop and go back to very basic gpu source code for first steps in OpenCL :slight_smile:

http://ati.amd.com/technology/streamcom … pencl.html ) also uses float and not float4 in their own example.


Does it actual use the GPU already?
I’m quite surprise, nVidia and ATI OpenCL drivers does seem really so I actually wonder how Apple could have OpenCL working

Yes indeed, the Example, i modded to an “bench” is running well and valid at least on Nvidia GPUs / Mac OS X Snow Leopard 10.6.

Here some resullts i got posted:

NEW V025 test results !

ATIs (no freezes of the bench anymore , ATI 4870 works now):
Number of OpenCL devices found: 3
OpenCL Device # 0 = Radeon HD 4870
Device 0 is an: GPU with max. 750 MHz and 4 units/cores // 4 cores are wrong !!! //
Now computing - please be patient…
time used: 4.126 seconds

Number of OpenCL devices found: 2
OpenCL Device # 0 = Radeon HD 4870
Device 0 is an: GPU with max. 750 MHz and 4 units/cores
time used: 3.997 seconds
At least with actual drivers and my benchmark ATI benches are useless.
Seems to be that either OpenCL isnt sooo universal (same code run on all GPUs optimized) or bugs in ATI OpenCL part. Maybe some OpenCL PRAGMA settings must set for ATI to get better performance.

NVIDIAs:
Number of OpenCL devices found: 2
OpenCL Device # 0 = GeForce GTX 285
Device 0 is an: GPU with max. 1584 MHz and 240 units/cores
time used: 0.231 seconds
OpenCL Device # 1 = Intel® Core™ i7 CPU 920 4,3GHz
time used: 1.296 seconds

by grue:
Number of OpenCL devices found: 3
OpenCL Device # 0 = GeForce 8800 GT
Device 0 is an: GPU with max. 1500 MHz and 112 units/cores
time used: 0.683 seconds
OpenCL Device # 1 = GeForce GTX 260
Device 1 is an: GPU with max. 1400 MHz and 216 units/cores
time used: 0.365 seconds
OpenCL Device # 2 = Intel® Xeon® CPU X5365 @ 3.00GHz
time used: 3.094 seconds

by moondark
Number of OpenCL devices found: 3
OpenCL Device # 0 = GeForce 9600M GT
Device 0 is an: GPU with max. 1250 MHz and 32 units/cores
time used: 2.798 seconds
OpenCL Device # 1 = GeForce 9400M
Device 1 is an: GPU with max. 1100 MHz and 16 units/cores
time used: 9.549 seconds
OpenCL Device # 2 = Intel® Core™2 Duo CPU P8600 @ 2.40GHz
time used: 15.800 seconds

by antic
Number of OpenCL devices found: 2
OpenCL Device # 0 = GeForce 9500 GT
Device 0 is an: GPU with max. 1350 MHz and 32 units/cores
time used: 3.053 seconds
OpenCL Device # 1 = Intel® Core™2 CPU 6600 @ 3.80GHz
time used: 15.188 seconds

by ricola
Number of OpenCL devices found: 2
OpenCL Device # 0 = GeForce 9400 GT
Device 0 is an: GPU with max. 1375 MHz and 16 units/cores
time used: 3.992 seconds
OpenCL Device # 1 = Intel® Core™2 CPU E7500 @ 3,66 GHz
time used: 12.048 seconds

Niceeeeee!
Does it even support the connection with OpenGL?

I now tried an orig. Apple OpenCL Demo .

There is an OpenCL + OpenGL (GLUT) togehter, when you mean that.

Looks like :
CPU
http://www.insanelymac.com/forum/index.php?act=attach&type=post&id=54948

GPU
http://www.insanelymac.com/forum/index.php?act=attach&type=post&id=54949

static cl_device device;
> no cl_device defined, i must use cl_device_id

err = clGetDeviceGroupInfo(device, CL_DEVICE_IDS, &device_id, sizeof(cl_device_id), NULL);
> clGetDeviceGroupInfo call doesnt exist, only the clGetDeviceInfo, CL_DEVICE_IDS doesnt exist in OpenCL

clGetDeviceConfigInfo(device_id, CL_DEVICE_MAX_THREAD_GROUP_SIZE, &max_threadgroup_size, sizeof(unsigned int), NULL);
> clGetDeviceConfigInfo doesnt exist in OpenCL, CL_DEVICE_MAX_THREAD_GROUP_SIZE doesnt exist ,only …MAXWORKGROUP

These are historical interfaces. I believe they are left over from the WWDC 2008 release. They’ve been removed from the standard and Apple’s OpenCL implementation. Rough translations follow:

cl_device -> cl_device_id
clGetDeviceGroupInfo -> clGetContextInfo( CL_CONTEXT_DEVICES)
clGetDeviceConfigInfo(CL_DEVICE_MAX_THREAD_GROUP_SIZE) -> clGetDeviceInfo(CL_DEVICE_MAX_WORK_GROUP_SIZE)

The last one might also be translated as clGetKernelWorkGroupInfo, depending on what you are doing.

All and all, a benchmark that looks at array addition is pretty weak. You are mostly just benchmarking memory bandwidth. Even if you have all the data you need in some equivalent of a L1 cache, its still 3 LSU ops for each arithmetic instruction. You folks should work on some more real world examples.

“All and all, a benchmark that looks at array addition is pretty weak. You are mostly just benchmarking memory bandwidth. Even if you have all the data you need in some equivalent of a L1 cache, its still 3 LSU ops for each arithmetic instruction. You folks should work on some more real world examples.”

You are absoulte right - but for starting+understanding OpenCL coding (not the OpenCL sourcepart !) an weak OpenCL source part is OK :wink:

What do you think about that Apple OpenCL example, coding qJulia on GPU ?
I think that OpenCL source part is much more “real parallel gpu programming” than an simple vector add - also i7 CPU´s can do really fast.

I compiled that also and got around 30 FPS in the starting szenes , 10-60 fps in the animation with an fixed 800x800 window.

//
// File:       qjulia.c
//
// Abstract:   This example shows how to use OpenCL to raytrace a 4d Quaternion Julia-Set 
//             Fractal and intermix the results of a compute kernel with OpenGL for rendering.
//
// Version:    <1.0>
//
/
// Copyright ( C ) 2008 Apple Inc. All Rights Reserved.
//
////////////////////////////////////////////////////////////////////////////////////////////////////

#ifndef WIDTH
#define WIDTH                       (512)
#endif
#ifndef HEIGHT
#define HEIGHT                      (512)
#endif
#define ASPECT                      ((float)WIDTH / (float)HEIGHT)
#define SQR(x)                      ((x)*(x))
#define BOUNDING_RADIUS             (2.0f)
#define BOUNDING_RADIUS_SQR         (SQR(BOUNDING_RADIUS))
#define ESCAPE_THRESHOLD            (BOUNDING_RADIUS * 1.5f)
#define DELTA                       (1e-5f)
#define ITERATIONS                  (10)
#define EPSILON                     (0.003f)
#define SHADOWS                     (0)

////////////////////////////////////////////////////////////////////////////////////////////////////

// Note that __float3_SPI is an unsupported vector type.  It is not part of the 
// OpenCL specification, and is not officially supported by any platform or vendor
// and it should not be used.

#define FLOAT3_TYPE                 __float3_SPI
#define FLOAT3_CONSTRUCTOR(x,y,z)   ((__float3_SPI){(x),(y),(z)})

////////////////////////////////////////////////////////////////////////////////////////////////////

#ifndef FLOAT3_TYPE
#define FLOAT3_TYPE                 float4
#endif

#ifndef FLOAT3_CONSTRUCTOR(x,y,z)
#define FLOAT3_CONSTRUCTOR(x,y,z)   ((float4){(x),(y),(z),(0.0f)})
#endif

////////////////////////////////////////////////////////////////////////////////////////////////////

#define float3                      FLOAT3_TYPE
#define make_float3(x,y,z)          FLOAT3_CONSTRUCTOR(x,y,z)

////////////////////////////////////////////////////////////////////////////////////////////////////

float3 cross3(float3 b, float3 c)
{
    return make_float3(mad(b.y, c.z,  -b.z * c.y),
                       mad(b.z, c.x,  -b.x * c.z),
                       mad(b.x, c.y,  -b.y * c.x));

}

float3 normalize3(float3 v)
{
    return v * half_rsqrt(dot(v, v));
}

////////////////////////////////////////////////////////////////////////////////////////////////////

float4 qmult( float4 q1, float4 q2 )
{
    float4 r;
    float3 t;

    float3 q1yzw = make_float3(q1.y, q1.z, q1.w);
    float3 q2yzw = make_float3(q2.y, q2.z, q2.w);
    float3 c = cross3( q1yzw, q2yzw );

    t = q2yzw * q1.x + q1yzw * q2.x + c;
    r.x = q1.x * q2.x - dot( q1yzw, q2yzw );
    r.yzw = t.xyz;

    return r;
}

float4 qsqr( float4 q )
{
    float4 r;
    float3 t;
    
    float3 qyzw = make_float3(q.y, q.z, q.w);

    t     = 2.0f * q.x * qyzw;
    r.x   = q.x * q.x - dot( qyzw, qyzw );
    r.yzw = t.xyz;

    return r;
}

////////////////////////////////////////////////////////////////////////////////////////////////////

float3 
EstimateNormalQJulia(
    float3 p,
    float4 c,
    int iterations )
{
    float4 qp = make_float4( p.x, p.y, p.z, 0.0f );
    float4 gx1 = qp - make_float4( DELTA, 0.0f, 0.0f, 0.0f );
    float4 gx2 = qp + make_float4( DELTA, 0.0f, 0.0f, 0.0f );
    float4 gy1 = qp - make_float4( 0.0f, DELTA, 0.0f, 0.0f );
    float4 gy2 = qp + make_float4( 0.0f, DELTA, 0.0f, 0.0f );
    float4 gz1 = qp - make_float4( 0.0f, 0.0f, DELTA, 0.0f );
    float4 gz2 = qp + make_float4( 0.0f, 0.0f, DELTA, 0.0f );

    for ( int i = 0; i < iterations; i++ )
    {
        gx1 = qsqr( gx1 ) + c;
        gx2 = qsqr( gx2 ) + c;
        gy1 = qsqr( gy1 ) + c;
        gy2 = qsqr( gy2 ) + c;
        gz1 = qsqr( gz1 ) + c;
        gz2 = qsqr( gz2 ) + c;
    }

    float nx = fast_length(gx2) - fast_length(gx1);
    float ny = fast_length(gy2) - fast_length(gy1);
    float nz = fast_length(gz2) - fast_length(gz1);

    float3 normal = normalize3(make_float3( nx, ny, nz ));

    return normal;
}

////////////////////////////////////////////////////////////////////////////////////////////////////

float4 
IntersectQJulia(
    float3 rO,
    float3 rD,
    float4 c,
    float epsilon,
    float escape)
{
    float rd = 0.0f;
    float dist = epsilon;
    while ( dist >= epsilon && rd < escape)
    {
        float4 z = make_float4( rO.x, rO.y, rO.z, 0.0f );
        float4 zp = make_float4( 1.0f, 0.0f, 0.0f, 0.0f );
        float zd = 0.0f;
        uint count = 0;
        while(zd < escape && count < ITERATIONS)
        {
            zp = 2.0f * qmult(z, zp);
            z = qsqr(z) + c;
            zd = dot(z, z);
            count++;
        }

        float normZ = fast_length( z );
        dist = 0.5f * normZ * half_log( normZ ) / fast_length( zp );
        rO += rD * dist;
        rd = dot(rO, rO);
    }

    return make_float4(rO.x, rO.y, rO.z, dist);
}

////////////////////////////////////////////////////////////////////////////////////////////////////

float3
Phong(
    float3 light,
    float3 eye,
    float3 pt,
    float3 normal,
    float3 base)
{
    const float SpecularExponent = 10.0f; 
    const float Specularity = 0.45f;

    float3 light_dir = normalize3( light - pt );
    float3 eye_dir = normalize3( eye - pt );
    float NdotL = dot( normal, light_dir );
    float3 reflect_dir = light_dir - 2.0f * NdotL * normal;

    base += fabs(normal) * 0.5f;
    float3 diffuse = base * fmax(NdotL, 0.0f);
    float3 specular = Specularity * half_powr( fmax( dot(eye_dir, reflect_dir), 0.0f), SpecularExponent );
    return diffuse + specular;
}

////////////////////////////////////////////////////////////////////////////////////////////////////

float
IntersectSphere(
    float3 rO,
    float3 rD,
    float radius )
{
    float fB = 2.0f * dot( rO, rD );
    float fB2 = fB * fB;
    float fC = dot( rO, rO ) - radius;
    float fT = (fB2 - 4.0f * fC);
    if (fT <= 0.0f)
        return 0.0f;
    float fD = half_sqrt( fT );
    float fT0 = ( -fB + fD ) * 0.5f;
    float fT1 = ( -fB - fD ) * 0.5f;
    fT = fmin(fT0, fT1);
    return fT;
}

////////////////////////////////////////////////////////////////////////////////////////////////////

float4 
RaytraceQJulia(
    float3 rO,
    float3 rD,
    float4 mu,
    float epsilon,
    float3 eye,
    float3 light,
    float3 diffuse,
    float radius,
    bool shadows,
    int iterations )
{
    const float4 background = make_float4( 0.15f, 0.15f, 0.15f, 0.0f );
    float4 color = background;

    rD = normalize3( rD );
    float t = IntersectSphere( rO, rD, radius );
    if ( t <= 0.0f )
        return color;

    rO += rD * t;
    float4 hit = IntersectQJulia( rO, rD, mu, epsilon, ESCAPE_THRESHOLD );
    float dist = hit.w;
    if (dist >= epsilon)
        return color;

    rO.xyz = hit.xyz;
    float3 normal = EstimateNormalQJulia( rO, mu, iterations );

    float3 rgb = Phong( light, rD, rO, normal, diffuse );
    color.xyz = rgb.xyz;
    color.w = 1.0f;

    if (SHADOWS)
    {
        float3 light_dir = normalize3( light - rO );
        rO += normal * epsilon * 2.0f;
        hit = IntersectQJulia( rO, light_dir, mu, epsilon, ESCAPE_THRESHOLD );
        dist = hit.w;
        color.xyz *= (dist < epsilon) ? (0.4f) : (1.0f);
    }

    return color;
}

///////////////////////////////////////////////////////////////////////////////////////////

float4 
QJulia(
    float4 coord,
    float4 mu,
    float4 diffuse,
    float epsilon,
    float iterations,
    int shadows,
    uint width,
    uint height)
{
    float zoom = BOUNDING_RADIUS_SQR;
    float radius = BOUNDING_RADIUS_SQR;

    float2 size = make_float2((float)width, (float)height);
    float scale = max(size.x, size.y);
    float2 half = make_float2(0.5f, 0.5f);
    float2 position = (coord.xy - half * size) / scale;
    float2 frame = (position) * zoom;

    float3 light = make_float3(1.5f, 0.5f, 4.0f);
    float3 eye = make_float3(0.0f, 0.0f, 4.0f);
    float3 ray = make_float3(frame.x, frame.y, 0.0f);
    float3 base = make_float3(diffuse.x, diffuse.y, diffuse.z);    

    float3 rO = eye;
    float3 rD = (ray - rO);
    
    float4 color = RaytraceQJulia( rO, rD, mu, epsilon, eye, light, base, radius, shadows, iterations);

    return color;
}

////////////////////////////////////////////////////////////////////////////////////////////////////

__kernel void
QJuliaKernel(
    __global uchar4 *result,
    const float4 mu,
    const float4 diffuse,
    const float epsilon)
{
    int tx = get_global_id(0);
    int ty = get_global_id(1);
    int sx = get_global_size(0);
    int sy = get_global_size(1);
    int index = ty * WIDTH + tx;
    bool valid = (tx < WIDTH) && (ty < HEIGHT);

    float4 coord = make_float4((float)tx, (float)ty, 0.0f, 0.0f);
    
    if(valid)
    {
        float4 color = QJulia(coord, mu, diffuse, epsilon, ITERATIONS, SHADOWS, WIDTH, HEIGHT);
        uchar4 output = convert_uchar4_sat_rte(color * 255.0f);
        result[index] = output;
    }
}

////////////////////////////////////////////////////////////////////////////////////////////////////

hey, guys, I found some examples!

http://developer.apple.com/mac/library/ … index.html

http://developer.apple.com/mac/library/ … index.html

http://developer.apple.com/mac/library/ … index.html

http://developer.apple.com/mac/library/ … index.html

http://developer.apple.com/mac/library/ … index.html

http://developer.apple.com/mac/library/ … index.html

http://developer.apple.com/mac/library/ … index.html

http://developer.apple.com/mac/library/ … index.html

http://developer.apple.com/mac/library/ … index.html

http://developer.apple.com/mac/library/ … index.html