Parallel breadth-first search

hi, i want to do a program that visit a graph with a parallel BFS in OpenCL but it is my first time i used this library. I write a code for the host and a code for the kernel but, when i run the program, the error code for the function clEnqueueReadBuffer return sometimes CL_OUT_OF_RESOURCES and sometimes CL_INVALID_COMMAND_QUEUE. this is the code for the host

#include <stdio.h>
#include <stdlib.h>
#include <CL/cl.h>
#include "head.h"


int main(void) {
	printf("


");
	//calculating the size of the matrix
	const int MATR=grand_matr("matrice2.txt"); 
	
	//create an array of lists that represent my array
	int *visitati=(int*)malloc(sizeof(int)*MATR);
	int *grafo=(int*)malloc(sizeof(int)*MATR*MATR);
	int *vicini=(int*)malloc(sizeof(int)*MATR*MATR);
	int *nodi=(int*)malloc(sizeof(int)*MATR);
	int *stack;
	FILE *matrice;
	int i=0,j=0,giro=0;
	int *sem_nodi=(int*)malloc(sizeof(int));
	sem_nodi[0]=0;
	int *sem_vis=(int*)malloc(sizeof(int));
	sem_vis[0]=0;
	int *lato=(int*)malloc(sizeof(int));
	lato[0]=MATR;
	int x,start,due;         //x rappresenta il carrattere che leggo dal file
	for(i=0;i<MATR;i++){
		nodi[i]=0;
		visitati[i]=MATR+1;
	}
	for(i=0;i<MATR*MATR;i++)
		grafo[i]=0;
	i=0;
	
		matrice = fopen("matrice2.txt", "r");
	if (!matrice) {
		fprintf(stderr, "Failed to load matrice2.txt
");
		exit(1);
	}
	
	
	else{
		while (!feof (matrice))
		{
			fscanf(matrice,"%d", &x);
			grafo[i]=x;
			i++;
			/*if (i!=j){
				grafo[j][i]=x;
			}
			i=(i+1)%MATR;
			if ((i%MATR)==0)
				j++;*/
		}
	}
	fclose(matrice);
	// Load the kernel source code into the array source_str
	FILE *fp;
	char *source_str;
	size_t source_size;
	 
	fp = fopen("kernel.cl", "r");
	if (!fp) {
		fprintf(stderr, "Failed to load kernel.
");
		exit(1);
	}
	source_str = (char*)malloc(MAX_SOURCE_SIZE);
	source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp);
	fclose( fp );
	
	// Get platform and device information
	cl_platform_id platform_id = NULL;

	cl_device_id device_id = NULL;   

	cl_uint ret_num_devices;

	cl_uint ret_num_platforms;

	cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
	printf("clGetPlatformIDs %d
",ret);
	if (ret!=0) exit(0);

	ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_GPU, 1, 
		&device_id, &ret_num_devices);
	printf("clGetDeviceIDs %d
",ret); 
	if (ret!=0) exit(0);

	// Create an OpenCL context
	cl_context context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
	printf("clCreateContext %d
",ret);
	if (ret!=0) exit(0);
	
        // Create a command queue
        cl_command_queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
	printf("clCreateCommandQueue %d
",ret);
	if (ret!=0) exit(0);
	// Insert into stack the start node
	start=rand()%MATR;
	nodi[start]=2;
		
	// Create memory buffers on the device for each vector 

	cl_mem visitati_mem_obj = clCreateBuffer(context, CL_MEM_READ_WRITE,
		MATR * sizeof(int), NULL, &ret);
	printf("clCreateBuffer visitati %d
",ret);
	if (ret!=0) exit(0);

	cl_mem nodi_mem_obj = clCreateBuffer(context, CL_MEM_READ_WRITE,
		MATR * sizeof(int), NULL, &ret);
	printf("clCreateBuffer nodi%d
",ret);
	if (ret!=0) exit(0);

	cl_mem stack_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
		MATR * sizeof(int), NULL, &ret);
	printf("clCreateBuffer stack%d
",ret);
	if (ret!=0) exit(0);

	cl_mem grafo_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
		MATR * MATR * sizeof(int), NULL, &ret);
	printf("clCreateBuffer grafo%d
",ret);
	if (ret!=0) exit(0);

	cl_mem vicini_mem_obj = clCreateBuffer(context, CL_MEM_READ_WRITE,
		MATR * MATR * sizeof(int), NULL, &ret);
	printf("clCreateBuffer vicini%d
",ret);
	if (ret!=0) exit(0);
	
	// Create a program from the kernel source
	cl_program program = clCreateProgramWithSource(context, 1, 
	(const char **)&source_str, (const size_t *)&source_size, &ret);
	printf("clCreateProgramWithSource %d
",ret);
	if (ret!=0) exit(0);	
	// Build the program
	if((ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL))!=CL_SUCCESS){
		printf("clBuildProgram %d
",ret);
		size_t log_size;
		clGetProgramBuildInfo(program,device_id,CL_PROGRAM_BUILD_LOG,0,NULL,&log_size);
		char build_log [log_size+1];
		clGetProgramBuildInfo(program,device_id,CL_PROGRAM_BUILD_LOG,log_size,build_log,NULL);
		build_log[log_size]='\0';
		printf("ale %s 
",build_log);
		exit(0);
	}
	// Create the OpenCL kernel
	cl_kernel kernel = clCreateKernel(program, "bfs", &ret);
	printf("clCreateKernel %d
",ret);
	if (ret!=0) exit(0);

	while((due=howtwo(nodi,MATR))!=0){
		giro++;
		printf("

Giro num %d
",giro);
		// Create the stack
		stack=(int*)malloc(sizeof(int)*MATR);
		j=0;
		for(i=0;i<MATR;i++){
			if(nodi[i]==2){
				stack[j]=i;
				j++;
			}
		}
		// Copy the arrays to their respective memory buffers
		ret = clEnqueueWriteBuffer(command_queue, visitati_mem_obj, CL_TRUE, 0, 
			MATR * sizeof(int), visitati, 0, NULL, NULL);
		printf("clEnqueueWriteBuffer visitati%d
",ret);
		if (ret!=0) exit(0);
		
		ret = clEnqueueWriteBuffer(command_queue, nodi_mem_obj, CL_TRUE, 0, 
			MATR * sizeof(int), nodi, 0, NULL, NULL);
		printf("clEnqueueWriteBuffer visitati%d
",ret);
		if (ret!=0) exit(0);
		
		ret = clEnqueueWriteBuffer(command_queue, stack_mem_obj, CL_TRUE, 0, 
			MATR * sizeof(int), stack, 0, NULL, NULL);
		printf("clEnqueueWriteBuffer stack %d
",ret);
		if (ret!=0) exit(0);	
		
		ret = clEnqueueWriteBuffer(command_queue, grafo_mem_obj, CL_TRUE, 0, 
			MATR * MATR * sizeof(int), grafo, 0, NULL, NULL);
		printf("clEnqueueWriteBuffer grafo %d
",ret);
		if (ret!=0) exit(0);		
	
		ret = clEnqueueWriteBuffer(command_queue, vicini_mem_obj, CL_TRUE, 0, 
			MATR * MATR * sizeof(int), vicini, 0, NULL, NULL);
		printf("clEnqueueWriteBuffer vicini %d
",ret);
		if (ret!=0) exit(0);		

		// Set the arguments of the kernel
		ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&visitati_mem_obj);
		printf("clSetKernelArg visitati%d
",ret);
		if (ret!=0) exit(0);
		ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&nodi_mem_obj);
		printf("clSetKernelArg nodi%d
",ret);
		if (ret!=0) exit(0);
		ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&stack_mem_obj);
		printf("clSetKernelArg stack%d
",ret);
		if (ret!=0) exit(0);
		ret = clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&grafo_mem_obj);
		printf("clSetKernelArg grafo%d
",ret);
		if (ret!=0) exit(0);
		ret = clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *)&vicini_mem_obj);
		printf("clSetKernelArg vicini_vis%d
",ret);
		if (ret!=0) exit(0);
		ret = clSetKernelArg(kernel, 5, sizeof(int),&lato);
		printf("clSetKernelArg lato%d
",ret);
		if (ret!=0) exit(0);
		
		// Execute the OpenCL kernel on the list
		size_t global_item_size = due;
		size_t local_item_size = due;
		ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, 
			&global_item_size, &local_item_size, 0, NULL, NULL);
		printf("clEnqueueNDRangeKernel %d
",ret);
		if (ret!=0) exit(0);

		// Read the memory buffer vicini_mem_obj on the device to the local variable vicini
		
                
		ret = clEnqueueReadBuffer(command_queue, vicini_mem_obj, CL_TRUE, 0, 
		MATR * MATR * sizeof(int), vicini, 0, NULL, NULL);
		printf("clEnqueueReadBuffer vicini%d
",ret);
		if (ret!=0) exit(0);  //HERE IS WHERE THE ERROR IS
		for(i=0;i<MATR*due;i++){
			if (vicini[i]==2){
				nodi[i%MATR]=2;
			}
		}
	}
	// Display the result to the screen
	printf("
Ordine di visita ");	
	for(i=0;i<MATR;i++){
		printf(" %d ",visitati[i]);
	}
	printf("
Verifico che ogni nodo sia stato visitato");
	for(i=0;i<MATR;i++){
		printf("i %d ",nodi[i]);
	}
	printf("
");	
	ret = clFlush(command_queue);
	ret = clFinish(command_queue);
	ret = clReleaseKernel(kernel);
	ret = clReleaseProgram(program);
	ret = clReleaseMemObject(visitati_mem_obj);
	ret = clReleaseMemObject(nodi_mem_obj);
	ret = clReleaseMemObject(stack_mem_obj);
	ret = clReleaseMemObject(grafo_mem_obj);
	ret = clReleaseCommandQueue(command_queue);
	ret = clReleaseContext(context);
	free(visitati);
	free(nodi);
	free(stack);
	free(grafo);


}

and this is the code for the kernel

__kernel void bfs( 	__global int *visitati,
				__global int *nodi,
				__global int *stack,
				__global int *grafo,
				__global int *vicini,
				int lato) {
	
	// prendo l'indice dell'elemento da processare
	int id = get_global_id(0); 

	// prendo l'id del nodo che devo visitare
	int exp=stack[id];
	
	int i,j;
	int sez=exp*lato;
	
	//Ciclo la sezione di matrice che riguarda il work-item
	for(i=sez;i<sez+lato;i++){
	
		//Controllo che ci sia una vicinanza tra i due nodi 
		if(grafo[i]==1){
			// se il nodo vicino non è stato ancora aggiunto allo stack o visitato lo aggiungo a nodi
			if (nodi[grafo[i]]==0){
				//scrivo su nodi
				vicini[(id*lato)+i]=2;
			}
		}
	}
		
}

thanks for the help! Are 2 days that i’m completely blocked!!! And sorry for my english!!!

Those errors indicate that it’s probably crashed - mostly likely from reading/writing beyond the memory bounds. Try using some printf stuff and/or using a gpu driver to debug your memory accesses.

Given the code is so simple, you might want to try writing it in a cpu language and simulating the parallelism with loops, to get the algorithm correct.