here is my kernel. I changed some of the variables names, due to the fact that the project is under IP. but it is essentially a convolution algorithm applied using weights that vary depending on the y values;
__kernel void Convo(
global uchar* input,
global float* output,
global float* weightsIn,
private const int numDataPerRow,
private const int numDataPerCol,
global const int* numWeightsX,
global const int* numWeightsY,
global const int* indices)
{
int index = get_global_id(0);
if(input[index] < 255)
{
bool flipped = true;
int x = index%numDataPerRow;
int y = index/numDataPerRow;
int yVal = ((75-y*.625)+2.5)/5;
if (yVal <= 0)
{
yVal *= -1;
flipped = false;
}
else
flipped = true;
int halfValRowWeights = numWeightsX[yVal]/2;
int halfValColWeights = numWeightsY[yVal]/2;
int left = x- halfValRowWeights;
int right = x+ halfValRowWeights;
int top = y- halfValColWeights;
int bottom = y+ halfValColWeights;
float weight = 0;
float total = 0;
//the weight indices
int weightRow = 0;
int weightCol = 0;
//if we are in the negative, then we need to adjust
//the Y index in the weight grid, and the dataGrid
if (top < 0)
{
//adjust the weight coordinates
weightRow = abs(top);
//set top of data to 0
top = 0;
}
else if (y > numDataPerCol - 1 - halfValColWeights)
{
//dont have to change weights here because we access all of them
bottom = numDataPerCol - 1;
}
if (flipped)
{
y = numWeightsY[yVal]-1;
}
//the values from the array
float weightVal = 0;
float inputVal = 0;
for (int yIn = top; yIn <= bottom; yIn++)
{
weightCol = 0;
for (int xIn = left; xIn <= right; xIn++)
{
//used if we have a wrapped X value
int indexWaveDataX = 0;
if (xIn < 0)
{
//we need to wrap in the X
indexWaveDataX = numDataPerRow + xIn;
}
else if (xIn > numDataPerRow - 1)
{
indexWaveDataX = xIn - numDataPerRow;
}
else
{
indexWaveDataX = xIn;
}
inputVal = input[yIn*numDataPerRow + indexWaveDataX];
weightVal = weightsIn[weightCol + weightRow * numWeightsX[lat] + indices[lat]];
if (inputVal < 255)
{
weight = weight + (weightVal * inputVal);
total = total + weightVal;
}
weightCol++;
}
if (flipped)
weightRow--;
else
weightRow++;
}
float value = (float)(weight/total);
output[index] = value;
}
else
output[index] = -1;
}
The syntax is correct, I know that. Here is the code for my host.
myWatch.Start();
float[] convert;
IEnumerable<float> we;
ComputeEventList eventList = new ComputeEventList();
ComputeCommandQueue commands = new ComputeCommandQueue(context, context.Devices[0], ComputeCommandQueueFlags.None);
ComputeKernel kernel = null;
for (int i = 0; i < totalRuns; i++)
{
numWeightsXS = new ComputeBuffer<int>(context,
ComputeMemoryFlags.ReadOnly | ComputeMemoryFlags.CopyHostPointer, numWeightsX[i]);
numWeightsYS = new ComputeBuffer<int>(context,
ComputeMemoryFlags.ReadOnly | ComputeMemoryFlags.CopyHostPointer, numWeightsY[i]);
//indices buffer is used to determine where we are in convert
//convert is a 3d array, flattened at the second level
indicesBuffer = new ComputeBuffer<int>(context,
ComputeMemoryFlags.ReadOnly | ComputeMemoryFlags.CopyHostPointer, indices[i]);
we = weights[i].SelectMany(x => x);
convert = we.ToArray();
weightsIn = new ComputeBuffer<float>(context,
ComputeMemoryFlags.ReadOnly | ComputeMemoryFlags.CopyHostPointer, convert);
Console.WriteLine(convert.Length);
kernel = program.CreateKernel("Convo");
inBuffer = new ComputeBuffer<byte>(context,
ComputeMemoryFlags.ReadOnly | //used to make sure that we use a const
ComputeMemoryFlags.CopyHostPointer, //so we cant change the original data, but we have it
dataArray[i]);
outBuffer = new ComputeBuffer<float>(context,
ComputeMemoryFlags.WriteOnly, // make sure that we arent reading and writing to this object
dataOut[i]);
kernel.SetMemoryArgument(0, inBuffer); // make sure you sync these with the kernel arguments
kernel.SetMemoryArgument(1, outBuffer);
kernel.SetMemoryArgument(2, weightsIn);
kernel.SetValueArgument<int>(3, globalData[i].XGrids);
kernel.SetValueArgument<int>(4, globalData[i].YGrids);
kernel.SetMemoryArgument(5, numWeightsXS);
kernel.SetMemoryArgument(6, numWeightsYS);
kernel.SetMemoryArgument(7, indicesBuffer);
commands.Execute(kernel, null, new long[] { dataArray[i].Length }, null, eventList);
commands.ReadFromBuffer<float>(outBuffer, ref dataOut[i], true, eventList);
eventList.Wait();
}
commands.Finish();
eventList.Clear();
if(DEBUG)
for(int a = 0; a < dataOut[i].Length; a++)
{
Console.WriteLine(dataOut[i][a]);
}
}
myWatch.Stop();
//printData("GPU: ");
myWatch.Reset();
kernel.Dispose();
inBuffer.Dispose();
outBuffer.Dispose();
commands.Dispose();
weightsIn.Dispose();
numWeightsXS.Dispose();
numWeightsYS.Dispose();
indicesBuffer.Dispose();
program.Dispose();
context.Dispose();
Sorry its alot of code. The data that is getting to the kernel is correct. I output it and it is giving the right numbers. Note that I had to cast from bytes to chars, due to memory alignment problems between the host and GPU.
So I dont know if im killing something I shouldnt be too soon, or if its an index out of bounds thing. I am assuming the index out of bounds, because if I get rid of the weights line that uses the indices, then it stops failing. But it may be something else. Any ideas?