[newbie] LU decompozition, Crout algorithm

I have a simple OpenCL project and I hit a dead end.
I have to parallelize Crout algorithm.

Here is the seqvential algorithm:

void crout(double **A, double **L, double **U, int n) {
	int i, j, k;
	double sum = 0;
 
	for (i = 0; i < n; i++) {
		U[i][i] = 1;
	}
 
	for (j = 0; j < n; j++) {
		for (i = j; i < n; i++) {
			sum = 0;
			for (k = 0; k < j; k++) {
				sum = sum + L[i][k] * U[k][j];	
			}
			L[i][j] = A[i][j] - sum;
		}
 
		for (i = j; i < n; i++) {
			sum = 0;
			for(k = 0; k < j; k++) {
				sum = sum + L[j][k] * U[k][i];
			}
			if (L[j][j] == 0) {
				printf("det(L) close to 0!
 Can't divide by 0...
");
				exit(EXIT_FAILURE);
			}
			U[j][i] = (A[j][i] - sum) / L[j][j];
		}
	}
}

Here is the host program:

#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <CL/cl.h>

#define M 4

#define CROUT_CL "crout.cl"

/* Allocates a matrix with random float entries. */
void rand_matrix(float *data, int size)
{
	int i = 0;
	for (; i < size; i++)
		data[i] = rand() / (float)RAND_MAX;
}

int
main(int argc, char** argv)
{
	/* Local matrix data */
	unsigned int i = 0, j = 0;
	unsigned int size_U = 0;
	unsigned int mem_size_U = 0;
	float* host_U = NULL;
	unsigned int size_A = 0;
	unsigned int mem_size_A = 0;
	float* host_A = NULL;
	unsigned int size_L = 0;
	unsigned int mem_size_L = 0;
	float* host_L = NULL;

	FILE *log = NULL;
	
	/* OpenCL specific variables */
	cl_program program = NULL;
	cl_kernel kernel = NULL;

	size_t dataBytes;

	cl_int result = CL_SUCCESS;
	cl_platform_id platform;
	cl_device_id device;
	cl_context_properties props[3] = {CL_CONTEXT_PLATFORM, 0, 0};
	cl_context ctx = NULL;
	cl_command_queue queue = NULL;
	cl_uint ndevices = 0;

	/* OpenCL device memory for matrices */
	cl_mem d_A;
	cl_mem d_L;
	cl_mem d_U;

	cl_device_id *clDevices = NULL;
	FILE *matrix_1_file = NULL;
	size_t srcsz = 0;
	char *matrix_1_src = NULL;

	size_t localWorkSize[2], globalWorkSize[2];
     
     /* XXX: Ugly hack for setting the CL arguments as pointer references */
	int m = M; 

	/* Host memory for matrix A*/
	size_A = M * M;
	mem_size_A = sizeof(float) * size_A;
	host_A = (float*) malloc(mem_size_A);
	if (host_A == NULL) {
		printf("Matrix A malloc() failed, bailing out!
");
		goto err;
	}
	/* Generate matrices */
	srand(2006);
	rand_matrix(host_A, size_A);

	/* Log A*/
	log = fopen("crout.log", "wb");
	if (log == NULL) {
		printf("Failed to create log file!
");
		goto err;
	}
	fprintf(log, "

Matrix A
");
	for(i = 0; i < M; i++) {
		for (j = 0; j < M ; j++) {
			fprintf(log, "%f ", host_A[j * M + i]);
		}
		fprintf(log, "
");
	}
	/* Host memory for the result matrice L and U */
	size_U = M * M;
	mem_size_U = sizeof(float) * size_U;
	host_U = (float*) malloc(mem_size_U);
	if (host_U == NULL) {
		printf("Matrix U malloc() failed, bailing out!
");
		goto err;
	}
     	size_L = M * M;
	mem_size_L = sizeof(float) * size_L;
	host_L = (float*) malloc(mem_size_L);
	if (host_L == NULL) {
		printf("Matrix L malloc() failed, bailing out!
");
		goto err;
	} 

	/* 
	 * Initialize OpenCL.
	 */
	
	result = clGetPlatformIDs(1, &platform, NULL);
	if (result != CL_SUCCESS) {
		printf ("Failed getting platform ID
");
		goto err;
	}

	result = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL,
	    &ndevices);
	if (result != CL_SUCCESS) {
		printf("Failed fetching number of devices
");
		goto err;
	}
	fprintf(log, "Number of devices: %d
", ndevices);
	result = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
	if(result != CL_SUCCESS) {
		printf("Failed fetching devices
");
		goto err;
	}

	props[1] = (cl_context_properties)platform;

	ctx = clCreateContext(props, 1, &device, NULL, NULL, &result);
	if(result != CL_SUCCESS) {
		printf("Failed to create context
");
		goto err;
	}

	/* fetch the list of devices associated with context */
	result = clGetContextInfo(ctx, CL_CONTEXT_DEVICES, 0, NULL, 
		&dataBytes);
	clDevices = (cl_device_id *)malloc(dataBytes);
	if (clDevices == NULL) {
		printf("clDevices malloc() failed!
");
		goto err;
	}
	result |= clGetContextInfo(ctx, CL_CONTEXT_DEVICES, dataBytes, 
	    clDevices, NULL);
	if (result != CL_SUCCESS) {
		printf("clGetContextInfo() failed!
");
		goto err;
	}

	/* Create a command-queue */
	queue = clCreateCommandQueue(ctx, clDevices[0], 0, 
	    &result);
	if (result != CL_SUCCESS) {
		printf("clGetContextInfo() failed!
");
		goto err;
	}

	/* Setup device memory */
     d_L = clCreateBuffer(ctx, CL_MEM_READ_WRITE, mem_size_L, NULL, &result); 
	d_U = clCreateBuffer(ctx, CL_MEM_READ_WRITE, mem_size_U, NULL, &result);
	d_A = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
	    mem_size_A, host_A, &result);

	/* 
	 * Compile and link the OpenCL kernel.
	 */

	/* Read-in the source code */
	matrix_1_file = fopen(CROUT_CL, "rb");
	if (matrix_1_file == NULL) {
		printf("fopen() failed!
");
		goto err;
	}
	fseek(matrix_1_file, 0, SEEK_END);
	srcsz = ftell(matrix_1_file);
	fseek(matrix_1_file, 0, SEEK_SET);
	matrix_1_src = (char *)malloc(srcsz + 1);
	if (matrix_1_src == NULL) {
		printf("matrix_1_src malloc() failed!
");
		goto err;
	}
	fread(matrix_1_src, 1, srcsz, matrix_1_file);
	matrix_1_src[srcsz] = 0;

	fprintf(log, "
FILE DUMP BEGINS
");
	fprintf(log, "%s", matrix_1_src);
	fprintf(log, "FILE DUMP ENDS
");
	
	program = clCreateProgramWithSource(ctx, 1,
	    (const char **)&matrix_1_src, &srcsz, &result);
	if (result != CL_SUCCESS) {
		printf("clCreateProgamWithSource() failed!
");
		goto err;
	}

	/* Build the kernel */
	result = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
	if (result != CL_SUCCESS) {
		/* Print out the build log in case of failure */
		char programLog[10000] = {0};
		printf("clBuildProgram() failed!
");
		result = clGetProgramBuildInfo(program, device,
		    CL_PROGRAM_BUILD_LOG, 10000, programLog, 0);
		printf("%s
", programLog);
		goto err;
	}

	/* Fetch the kernel for the crout program */
	kernel = clCreateKernel(program, "crout", &result);
	if (result != CL_SUCCESS) {
		printf("clCreateKernel() failed!
");
		goto err;
	}
	
	/* Push arguments down the stack*/
	result |= clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&d_A);
	result = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&d_L);
	result = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&d_U);
     	result |= clSetKernelArg(kernel, 3, sizeof(int), (void *)&m); 
	if (result != CL_SUCCESS) {
		printf("clSetKernelArg() failed!
");
		goto err;
	}

	localWorkSize[0] = 1;
     localWorkSize[1] = 1;

	globalWorkSize[0] = M;
	globalWorkSize[1] = M;
      

	result = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, globalWorkSize, 
	    localWorkSize, 0, NULL, NULL);
	if (result != CL_SUCCESS) {
		printf("clEnqueueNDRangeKernel() failed!
");
		goto err;
	}

	/* 
	 * Results.
	 */
     result = clEnqueueReadBuffer(queue, d_L, CL_TRUE, 0, mem_size_L, host_L,
	    0, NULL, NULL);
	if (result != CL_SUCCESS) {
		printf("clEnqueueReadBuffer() failed!
");
		goto err;
	}
	result = clEnqueueReadBuffer(queue, d_U, CL_TRUE, 0, mem_size_U, host_U,
	    0, NULL, NULL);
	if (result != CL_SUCCESS) {
		printf("clEnqueueReadBuffer() failed!
");
		goto err;
	}

	/* Log the results */
     fprintf(log, "

Matrix L (Results)
");
	for(i = 0; i < M; i++) {
		for (j = 0; j < M; j++) {
			fprintf(log, "%f ", host_L[j * M + i]);
		}
		fprintf(log, "
");
	}
	fprintf(log, "
"); 
	fprintf(log, "

Matrix U (Results)
");
	for(i = 0; i < M; i++) {
		for (j = 0; j < M; j++) {
			fprintf(log, "%f ", host_U[j * M + i]);
		}
		fprintf(log, "
");
	}
	fprintf(log, "
");

err:
	if (host_A)
		free(host_A);
	if (host_L)
		free(host_L);
	if (host_U)
		free(host_U);
	if (matrix_1_file)
		fclose(matrix_1_file);
	if (matrix_1_src)
		free(matrix_1_src);
	if (clDevices)
		free(clDevices);
	if (ctx)
		clReleaseContext(ctx);
	if (kernel)
		clReleaseKernel(kernel);
	if (program)
		clReleaseProgram(program);
	if (queue)
		clReleaseCommandQueue(queue);
	if (log)
		fclose(log);

	return 0;
}

And here is the kernel. What I have done so far. I only done it for the L matrix because U is almost similar.

__kernel void crout (__global const float *a,
    __global float *l, __global float *u, const int m)
{
	int i = get_global_id(0);
	int j = get_global_id(1);
	int k=0;
	float ll =0;
	float uu =0;/*
	for (k=0;k<m;k++){
		if (i>=k && j>k){
			l[i+k*m]=a[i+k*m]-l[i+(k-1)*m]*l[k*m+k];  // this line needs work. 
						   // k desc         k desc    
			uu-=a[m*j+k];
		}

	}
	/* This is just for the first two columns 
	if(i>=2)
	l[i+2*m]=a[i+2*m]-a[i+1*m]*a[2*m+1]-a[i+0*m]*a[2*m+0];
         */
}

I don’t now how to make the partial sums without a loop.

You can’t, but there’s nothing wrong with a loop in a kernel.

Yes it can be done. I found the CUDA solution here: https://smartech.gatech.edu/bitstream/handle/1853/33980/virk_bikram_j_201005_mast.pdf Page 34. But I don’t know know how to implement it in OpenCL.