vectorization

hello,
i want to do vectorization using opencl like this
float 4 C=(float 4)(a[0],a[1],a[2],a[3])+(float 4)(b[0],b[1],b[2],b[3])
please help me out how to do this in opencl

float4 c = float4(a[0]+b[0],a[1]+b[1],a[2]+b[2],a[3]+b[3])

or you can change a and b to float 4

float4 a = (float4)(a[0],…);
float4 b = (float4)(b[0],…);
float4 c = a+b;

hey thanks for reply
please check this code and please correct me

#include “stdafx.h”
#include <iostream>
#include “CL\cl.h”
#include <stdio.h>
using namespace std;

const char *source =
"__kernel void vec_add (__global int *a,
"
" __global const int *b,
"
" __global int *c)
"
"{
"

"int16 a=(int16)(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15);
"
"int16 b=(int16)(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15);
"
"int16 c = a+b;
"
"}
";
int _tmain(int argc, _TCHAR* argv[])
{
int N = 16;
// Get the first available platform
// Example: AMD Accelerated Parallel Processing
cl_platform_id platform;
clGetPlatformIDs(1,&platform,NULL); // number of platforms available

// Get the first GPU device the platform provides
cl_device_id device;
clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device,NULL); // number of devices available

// Create a context and command queue on that device
cl_context context = clCreateContext(0, // optional (context properties)
                                      1, // number of devices
                                      &device, // pointer to device list
                                      NULL, NULL, // optional (callback function for reporting errors)
                                      NULL); // no error code returned

cl_command_queue queue = clCreateCommandQueue(context, // valid context
                                               device, // device associated with context
                                               CL_QUEUE_PROFILING_ENABLE, // optional (command queue properties)
                                               NULL); // no error code returned

// Create program object and load source code into program object
cl_program program = clCreateProgramWithSource(context,
	                                           1, // number of strings
                                               &source, // strings
                                               NULL, // string length or NULL terminated
                                               NULL); // no error code returned


// Build program executable from program source
clBuildProgram(program,
               1, // number of devices
               &device, // pointer to device list
               NULL, // optional (build options)
               NULL, NULL); // optional (callback function, argument)

// Build program executable from program source

// Create kernel object
cl_kernel kernel = clCreateKernel(program, // program object
                                  "vec_add", // kernel name in program
                                  NULL); // no error code returned

// Initialize arrays
cl_float *a = (cl_float *) malloc(N*sizeof(cl_float));
cl_float *b = (cl_float *) malloc(N*sizeof(cl_float));

// int i;
//for(i=0;i<N;i++){
// a[i] = i;
// b[i] = i;
//}
// A buffer object is a handle to a region of memory
cl_mem a_buffer = clCreateBuffer(context,
CL_MEM_READ_ONLY | // buffer object read only for kernel
CL_MEM_COPY_HOST_PTR, // copy data from memory referenced
// by host pointer
Nsizeof(cl_float), // size in bytes of buffer object
a, // host pointer
NULL); // no error code returned
cl_mem b_buffer = clCreateBuffer(context,
CL_MEM_READ_ONLY |CL_MEM_COPY_HOST_PTR,
N
sizeof(cl_float), b, NULL);

cl_mem c_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
                                 N*sizeof(cl_float), NULL, NULL);
//size_t global_work_size = 4;

// Set the kernel arguments
clSetKernelArg(kernel, 0, sizeof(a_buffer), (void*) &a_buffer);
clSetKernelArg(kernel, 1, sizeof(b_buffer), (void*) &b_buffer);
clSetKernelArg(kernel, 2, sizeof(c_buffer), (void*) &c_buffer);
	cl_event timeEvent;
	//const size_t m=4;
	size_t szGlobalWorkSize[1];
	//size_t szLocalWorkSize[2];
	szGlobalWorkSize[0]=1;
	 //szLocalWorkSize[0]=2;
     //szLocalWorkSize[1]=2;
// Enqueue a command to execute the kernel on the GPU device
cl_int error = clEnqueueNDRangeKernel(queue, kernel,
                       1, NULL, // global work items dimensions and offset
                       szGlobalWorkSize, // number of global work items
                       NULL, // number of work items in a work group
                       0, NULL, // don't wait on any events to complete
                       &timeEvent); // no event object returned

// Block until all commands in command-queue have completed
clFinish(queue);

// Read back the results
cl_float *c = (cl_float ) malloc(Nsizeof(cl_float));
clEnqueueReadBuffer(
queue, // command queue in which read command will be queued
c_buffer, // buffer object to read back
CL_TRUE, // blocking read - doesn’t return until buffer copied
0, // offset in bytes in buffer object to read from
N * sizeof(cl_float), // size in bytes of data being read
c, // pointer to host memory where data is to be read into
0, NULL, // don’t wait on any even
NULL); // no event object returned
for(int i=0;i<N;i++)
cout<<"
"<<c[i];

cl_ulong startBuf;

// size_t a = 1000;
cl_int x= clGetEventProfilingInfo ( timeEvent,
CL_PROFILING_COMMAND_START ,
sizeof(cl_ulong),
&startBuf,
NULL);

cl_ulong endBuf;
// size_t a = 1000;
cl_int y= clGetEventProfilingInfo ( timeEvent,
CL_PROFILING_COMMAND_END ,
sizeof(cl_ulong),
&endBuf,
NULL);
long diff = endBuf - startBuf;
double values_in_second = (double)diff/(double)1000000000;

cout&lt;&lt;"Total GPU time:"&lt;&lt;values_in_second&lt;&lt;"

";
free(a);
free(b);
free©;
clReleaseMemObject(a_buffer);
clReleaseMemObject(b_buffer);
clReleaseMemObject(c_buffer);
clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseContext(context);
clReleaseCommandQueue(queue);
getchar();
return 0;
}

a, b and c are local variables hiding the global in pointers i think. i dont know if thats allowed in openCL kernels.
if you want to access the global variables it has to be something like


const char *source =
"__kernel void vec_add (__global int *a, 
"
" __global const int *b, 
"
" __global int *c) 
"
"{ 
"
 
"int16 la=(int16)(a[0],a[1]...); 
"
"int16 lb=(int16)(b[0],b[1]...); 
"
"int16 lc = a+b; 
"
c[0] = lc.0;
c[1] = lc.1;
...
"} 
";

But this would be the same as using:


c[i] = a[i] + b[i]; //With i beeing your workitem ID

in your kernel code.

thanks for reply me.
actually i want to know that is there anyways to do vectorization which i want to do via this program…
if you have any code regarding this.can you please give me…because the code which you have given to me is not providing proper solution of vectorization. that is vector addition.

You might find it worthwhile to look up the vload* and vstore* functions. In your case, replace * with 16.