Passing data from one kernel to another

Hey, I am fairly new to OpenCL and I am currently writing a program using the C++ bindings and “OpenCLUtilities/openCLUtilities.hpp”. I know that data stays on the device unless the enqueueReadBuffer method is called, but how do I refer to that data from another kernel? I’d like to perform some computation in one kernel (grad) and use its output (ang, mag) in a second kernel without copying the buffers back to the host and then to the device again. If you could tell me where and what I am doing wrong in my code, or provide a relevant example, I would be very grateful.

Context context = createCLContextFromArguments(argc, argv);
    Program program = buildProgramFromSource(context, "/Users/Mateusz/Desktop/grad.cl");
    std::vector<Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
    CommandQueue queue = CommandQueue(context, devices[1], CL_QUEUE_PROFILING_ENABLE);
    
    cl::Device dev0 = devices[1];
    std::string name;
    dev0.getInfo(CL_DEVICE_NAME, &name);
    std::cout << "Used device: " << name << std::endl;
    
    Image2D clImage1 = Image2D(context,
                               CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                               ImageFormat(CL_RGBA, CL_UNSIGNED_INT8),
                               image1.columns(), image1.rows(), 0, image1_pixels);
    
    
    // Create a buffer for the result
    Buffer clResultAng = Buffer(context, CL_MEM_READ_WRITE, sizeof(float)*image1.rows()*image1.columns());
    Buffer clResultMag = Buffer(context, CL_MEM_READ_WRITE, sizeof(double)*image1.rows()*image1.columns());
    Buffer clResultOut = Buffer(context, CL_MEM_READ_WRITE, sizeof(float)*image1.rows()*image1.columns());

    
    Kernel gradients = Kernel(program, "grad");
    gradients.setArg(0, clImage1);
    gradients.setArg(1, clResultAng);
    gradients.setArg(2, clResultMag);
    
    
    Event kernel_event, read_event;
    queue.enqueueNDRangeKernel(gradients, NullRange,
                               NDRange(image1.columns(), image1.rows() ),
                               NullRange, NULL, &kernel_event);
    
    
    //    Program program1 = buildProgramFromSource(context, "/Users/Mateusz/Desktop/histograms.cl");
        Kernel histograms = Kernel(program, "histograms");
        histograms.setArg(0, clResultAng);
        histograms.setArg(1, clResultMag);
        histograms.setArg(2, clResultOut);
    
    
    //Synchronize?
    kernel_event.wait();
    
    // Transfer image back to host
    //queue.enqueueReadBuffer(clResultMag, CL_TRUE, 0, sizeof(double)*image1.columns()*image1.rows(), mag);
    queue.enqueueReadBuffer(clResultOut, CL_TRUE, 0, sizeof(float)*image1.columns()*image1.rows(), test);
    queue.enqueueReadBuffer(clResultAng, CL_TRUE, 0, sizeof(float)*image1.columns()*image1.rows(), ang);

    
    //          DEBUGGING OUTPUT
    // std::cout<<std::endl;
    
     for (int i=0; i<image1_size/4; i++) {
     if(ang[i]>0)
     std::cout << "Test: " << test[i] << " vs. Angle: " << ang[i] << std::endl;
     }

// The grad kernel below stores magnitudes as double. Where double support is
// provided through the cl_khr_fp64 extension, it has to be enabled before the
// first use of the type. The guard keeps the source unchanged on platforms
// that do not define the extension macro.
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#endif

// Integer (unnormalized) pixel coordinates, out-of-range reads are clamped to
// the nearest edge pixel, and no interpolation between pixels.
__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;

/*
 * grad: per-pixel gradient orientation and magnitude.
 *
 * One work-item handles one pixel of a 2D NDRange; results are stored at
 * index x + y * get_global_size(0).
 *
 *   input_image  RGBA image, sampled through the file-scope `sampler`.
 *   ang          out: gradient orientation in degrees, range [-180, 180].
 *   mag          out: gradient magnitude sqrt(x^2 + y^2).
 *
 * NOTE(review): read_imagef only returns defined values for normalized or
 * floating-point channel types (e.g. CL_UNORM_INT8) - confirm that the host
 * creates the image with such a format.
 */
__kernel void grad(
                   __read_only image2d_t input_image,
                   __global float * ang,
                   __global double * mag
                   ) {

    const int2 pos = (int2)((int)get_global_id(0), (int)get_global_id(1));
    const size_t idx = get_global_id(0) + get_global_id(1) * get_global_size(0);

    // Horizontal derivative with the [1, 0, -1] kernel: the centre tap has
    // weight 0, so only the left and right neighbours contribute.
    const float4 dx = read_imagef(input_image, sampler, (int2)(pos.x - 1, pos.y))
                    - read_imagef(input_image, sampler, (int2)(pos.x + 1, pos.y));

    // Vertical derivative with the transposed [1, 0, -1] kernel.
    const float4 dy = read_imagef(input_image, sampler, (int2)(pos.x, pos.y - 1))
                    - read_imagef(input_image, sampler, (int2)(pos.x, pos.y + 1));

    /*
     Dalal and Triggs suggested:
     "For colour images, we calculate separate gradients for
     each colour channel, and take the one with the largest norm
     as the pixel's gradient vector."

     So a single channel is selected by its squared norm dx^2 + dy^2, and both
     components are taken from that same channel. Comparing the signed
     derivatives, or choosing x and y independently, would mix channels.
     The alpha component (.w) is ignored.
     */
    const float4 norm2 = dx * dx + dy * dy;

    float x;
    float y;
    if (norm2.x > norm2.y && norm2.x > norm2.z) {
        x = dx.x;  y = dy.x;    // red has the strongest gradient
    } else if (norm2.y > norm2.z) {
        x = dx.y;  y = dy.y;    // green
    } else {
        x = dx.z;  y = dy.z;    // blue (also taken on ties)
    }

    // atan2pi already returns atan2(y, x) / pi, i.e. a value in [-1, 1];
    // multiplying by 180 converts it to degrees.
    ang[idx] = atan2pi(y, x) * 180.0f;

    // Magnitude is accumulated and stored in double precision.
    mag[idx] = sqrt((double)x * x + (double)y * y);

}

/*
 * histograms: second-stage kernel; currently a pass-through that copies each
 * element of `ang` to the same position in `out`.
 *
 * One work-item handles one element of a 2D NDRange, addressed as
 * x + y * get_global_size(0).
 *
 *   ang  in:  per-element values to copy.
 *   mag  in:  accepted but not read by this kernel.
 *   out  out: receives the copy of `ang`.
 */
__kernel void histograms(
                         __global float * ang,
                         __global double * mag,
                         __global float * out
                         ) {
    const int2 gid = {get_global_id(0), get_global_id(1)};

    // Linear index of this work-item, computed once and used for both accesses.
    const size_t cell = gid.x + gid.y * get_global_size(0);

    out[cell] = ang[cell];
}

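You can use the same buffer object as an argument in both kernels. There is no need to transfer the data to the host unless you need to do host-side computations on it. Just make sure the first kernel has finished before the second one starts: with a default (in-order) command queue, kernels execute in the order in which they are enqueued, so this happens automatically; with an out-of-order queue, pass the first kernel's event in the second kernel's event wait list. Note that setting a kernel's arguments does not run it; each kernel still has to be enqueued with enqueueNDRangeKernel.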