PDA

View Full Version : vectorization

chiraga213
01-21-2013, 11:22 PM
hello,
i want to do vectorization using opencl like this
float 4 C=(float 4)(a[0],a[1],a[2],a[3])+(float 4)(b[0],b[1],b[2],b[3])

clint3112
01-22-2013, 12:28 AM
// OpenCL C writes a vector literal as a parenthesised type followed by the component list.
float4 c = (float4)(a[0]+b[0], a[1]+b[1], a[2]+b[2], a[3]+b[3]);

Or you can change a and b to float4:

// Pack four consecutive elements of each array into a vector, then add component-wise.
// The vectors get their own names so they do not shadow the arrays they are built from.
float4 va = (float4)(a[0], a[1], a[2], a[3]);
float4 vb = (float4)(b[0], b[1], b[2], b[3]);
float4 c = va + vb;

chiraga213
01-22-2013, 01:35 AM
--------------------------------------------------------------------------
#include "stdafx.h"
#include <iostream>
#include "CL\cl.h"
#include <stdio.h>
using namespace std;

// OpenCL C source text for the vec_add kernel; the host code hands this string
// to clCreateProgramWithSource and compiles it at run time.
// NOTE(review): the kernel body declares int16 locals named a, b and c, the
// same names as the three __global pointer parameters. The body never reads
// from or writes to the buffers passed in, and the sum is not stored anywhere.
// Redeclaring a parameter name in the function's outermost block is presumably
// rejected by the OpenCL C compiler -- confirm against the program build log.
const char *source =
"__kernel void vec_add (__global int *a, \n"
" __global const int *b, \n"
" __global int *c) \n"
"{ \n"

"int16 a=(int16)(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15); \n"
"int16 b=(int16)(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15); \n"
"int16 c = a+b; \n"
"} \n";
int _tmain(int argc, _TCHAR* argv[])
{
int N = 16;
// Get the first available platform
// Example: AMD Accelerated Parallel Processing
cl_platform_id platform;
clGetPlatformIDs(1,&platform,NULL); // number of platforms available

// Get the first GPU device the platform provides
cl_device_id device;
clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device,NULL); // number of devices available

// Create a context and command queue on that device
cl_context context = clCreateContext(0, // optional (context properties)
1, // number of devices
&device, // pointer to device list
NULL, NULL, // optional (callback function for reporting errors)
NULL); // no error code returned

cl_command_queue queue = clCreateCommandQueue(context, // valid context
device, // device associated with context
CL_QUEUE_PROFILING_ENABLE, // optional (command queue properties)
NULL); // no error code returned

// Create program object and load source code into program object
cl_program program = clCreateProgramWithSource(context,
1, // number of strings
&source, // strings
NULL, // string length or NULL terminated
NULL); // no error code returned

// Build program executable from program source
clBuildProgram(program,
1, // number of devices
&device, // pointer to device list
NULL, // optional (build options)
NULL, NULL); // optional (callback function, argument)

// Build program executable from program source

// Create kernel object
cl_kernel kernel = clCreateKernel(program, // program object
"vec_add", // kernel name in program
NULL); // no error code returned

// Initialize arrays
cl_float *a = (cl_float *) malloc(N*sizeof(cl_float));
cl_float *b = (cl_float *) malloc(N*sizeof(cl_float));
// int i;
//for(i=0;i<N;i++){
// a[i] = i;
// b[i] = i;
//}
// A buffer object is a handle to a region of memory
cl_mem a_buffer = clCreateBuffer(context,
CL_MEM_COPY_HOST_PTR, // copy data from memory referenced
// by host pointer
N*sizeof(cl_float), // size in bytes of buffer object
a, // host pointer
NULL); // no error code returned
cl_mem b_buffer = clCreateBuffer(context,
N*sizeof(cl_float), b, NULL);

cl_mem c_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
N*sizeof(cl_float), NULL, NULL);
//size_t global_work_size = 4;

// Set the kernel arguments
clSetKernelArg(kernel, 0, sizeof(a_buffer), (void*) &a_buffer);
clSetKernelArg(kernel, 1, sizeof(b_buffer), (void*) &b_buffer);
clSetKernelArg(kernel, 2, sizeof(c_buffer), (void*) &c_buffer);
cl_event timeEvent;
//const size_t m=4;
size_t szGlobalWorkSize[1];
//size_t szLocalWorkSize[2];
szGlobalWorkSize[0]=1;
//szLocalWorkSize[0]=2;
//szLocalWorkSize[1]=2;
// Enqueue a command to execute the kernel on the GPU device
cl_int error = clEnqueueNDRangeKernel(queue, kernel,
1, NULL, // global work items dimensions and offset
szGlobalWorkSize, // number of global work items
NULL, // number of work items in a work group
0, NULL, // don't wait on any events to complete
&timeEvent); // no event object returned

// Block until all commands in command-queue have completed
clFinish(queue);
cl_float *c = (cl_float *) malloc(N*sizeof(cl_float));
queue, // command queue in which read command will be queued
c_buffer, // buffer object to read back
CL_TRUE, // blocking read - doesn't return until buffer copied
0, // offset in bytes in buffer object to read from
N * sizeof(cl_float), // size in bytes of data being read
c, // pointer to host memory where data is to be read into
0, NULL, // don't wait on any even
NULL); // no event object returned
for(int i=0;i<N;i++)
cout<<"\n"<<c[i];

cl_ulong startBuf;
// size_t a = 1000;
cl_int x= clGetEventProfilingInfo ( timeEvent,
CL_PROFILING_COMMAND_START ,
sizeof(cl_ulong),
&startBuf,
NULL);

cl_ulong endBuf;
// size_t a = 1000;
cl_int y= clGetEventProfilingInfo ( timeEvent,
CL_PROFILING_COMMAND_END ,
sizeof(cl_ulong),
&endBuf,
NULL);
long diff = endBuf - startBuf;
double values_in_second = (double)diff/(double)1000000000;

cout<<"Total GPU time:"<<values_in_second<<"\n";
free(a);
free(b);
free(c);
clReleaseMemObject(a_buffer);
clReleaseMemObject(b_buffer);
clReleaseMemObject(c_buffer);
clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseContext(context);
clReleaseCommandQueue(queue);
getchar();
return 0;
}

clint3112
01-22-2013, 05:03 AM
a, b and c are local variables hiding the global pointers, I think. I don't know if that's allowed in OpenCL kernels.
If you want to access the global buffers, it has to be something like:

// OpenCL C source for vec_add: one work-item loads 16 consecutive ints from each
// input buffer, adds them as int16 vectors and stores the 16 sums to the output.
// The vectors have their own names (la, lb) so the pointer parameters stay visible.
const char *source =
"__kernel void vec_add (__global int *a, \n"
" __global const int *b, \n"
" __global int *c) \n"
"{ \n"
"int16 la = vload16(0, a); \n"   // a[0] .. a[15]
"int16 lb = vload16(0, b); \n"   // b[0] .. b[15]
"int16 lc = la + lb; \n"         // component-wise sum of the two vectors
"vstore16(lc, 0, c); \n"         // c[0] .. c[15]
"} \n";

But this would be the same as using:

c[i] = a[i] + b[i]; // with i being your work-item ID