Strange problem with python + ctypes + opencl

Does anyone here have much experience with python/ctypes? I have a very simple program,which when directly ported to python via ctypes sporadically segfaults. It’s driving me quite insane. The program initializes opencl and then enters a loop where it repeatedly compiles a blank kernel & executes it. For some reason, repeated compilation & execution of a kernel in python (at least on my system) quite rapidly segfaults, and I have no idea why. Any ideas at all would be appreciated. I’m running Ubuntu 11.10 amd64 & using a nvidia 485m card, with the 285.05.09 driver. Thanks!

The simple c program:

#include <CL/cl.h>
#include <stdio.h>

cl_context ctx; // OpenCL context
cl_command_queue queue; // OpenCL command que
cl_platform_id platform; // OpenCL platform
cl_device_id device; // OpenCL device
cl_program program; // OpenCL program
cl_kernel kernel; // OpenCL kernel
cl_event event;
cl_int err1, err2; // Error code var
size_t global_size; // 1D var for Total # of work items
size_t local_size; // 1D var for # of work items in the work group

int main(){
global_size = 256;
local_size = 16;

// initialize opencl
err1 = clGetPlatformIDs(1, &platform, NULL);
err1 = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
ctx = clCreateContext(0, 1, &device, NULL, NULL, &err1);
queue = clCreateCommandQueue(ctx, device, 0, &err1);

printf("queue: %ld
", (long)queue);

char* source = “__kernel void main(){}”;

// repeatedly compile & execute a kernel
while(1){

program = clCreateProgramWithSource(ctx, 1, (const char **)&source, 0, &err1);
printf("program: %ld

", (long)program);

err1 = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);

kernel = clCreateKernel(program, "main", &err1);
printf("kernel: %ld

", (long)kernel);

err1 = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_size, &local_size, 0, NULL, NULL);

if(err1 == CL_SUCCESS)
  printf("successfully executed

");
}
}

and the python port:

#! /usr/bin/python

from ctypes import *
openCL = CDLL(“libOpenCL.so”)

block_size = 16
kernel_dim = 256

DEVICE_TYPE_GPU = 1<<2

intalize opencl

platforms = create_string_buffer(4)
openCL.clGetPlatformIDs (1, platforms, None)
platform = cast(platforms, POINTER(c_int))[0]

device = create_string_buffer(4)
openCL.clGetDeviceIDs(platform, DEVICE_TYPE_GPU, 1, device, None);
device = cast(device, POINTER(c_int))[0]

ctx = openCL.clCreateContext(0, 1, (c_int * 1)(device), None, None, None);

queue = openCL.clCreateCommandQueue(ctx, device, 0, None);

contents = ‘’’ __kernel void main(){}’’’
contents = c_char_p(contents)
contents = (c_char_p * 1)(contents)

repeatedly build & execute kernel

while(True):

program = openCL.clCreateProgramWithSource(ctx, 1, contents, 0, None)   
print "PROGRAM:", program

openCL.clBuildProgram(program, 0, None, None, None, None)

main_kernel = openCL.clCreateKernel(program, "main", None)
print "MAIN KERNEL:", main_kernel

openCL.clEnqueueNDRangeKernel(queue, main_kernel, 1, None, (c_long * 1)(kernel_dim), (c_long * 1)(block_size), 0, None, None) 

#none of these seem to help
#openCL.clReleaseKernel(main_kernel)
#openCL.clReleaseProgram(program)

#openCL.clFinish(queue)
#openCL.clFlush(queue)