Compilation bug?

Dear all,

I am using OpenCL to run a procedure in my graphical board, which is yielding strange results. I’ve debugged the procedure the most I could, and I noticed that the error starts when I call the function C6_pow_CMP (see the attached code, marked with “// This is the line where the error starts”), which for some reason, is returning 1.#IND00e+000 when it shouldn’t.

I launched a simpler procedure in my graphical board, where I also call the function C6_pow_CMP, but in this case the obtained results are the expected.

May this be some compilation bug of the graphical board? I would appreciate any comment or suggestion.

Later, I can send the C code that launches the procedure on the GPU and sets up all the variables to whoever may be interested.

Thank you

This is the code that is yielding the wrong results (it is a bit too long):



//// GPU code
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#define type float
#define N_DOF_SVP 10
#define N_DOF_SH 5
#define N_HOR_COORD 5
#define N_VBEM_HOR_COORD 0
#define N_X 5
#define N_VERT_BOUND 0
#define N_NODE_BEM 3
#define N_25D_BEM_ELEM 3
#define N_THIN_LAYER 5
#define K_ORDER_TLM 1
#define N_PSR 0
#define K_BEM_ORDER 1
#define N_DOF_1 9
#define N_DOF_2 0
#define N_DOF_F 0

void C6_add_CMP(type*, type*, type*);
void C6_sub_CMP(type*, type*, type*);
void C6_mult_CMP(type*, type*, type*);
void C6_div_CMP(type*, type*, type*);
void C6_exp_CMP(type*, type*);
void C6_pow_CMP(type*, type*, type);

__kernel void CalcTLM_25D_Integrals_GPU_SVP
(
	int nX,
	int nDofSVP,
	type fKy,
	__global type* ad1X,
	__global type* ad1Im3_SVP,
	__global type* ad1Im2_SVP,
	__global type* ad1Im1_SVP,
	__global type* ad1I0_SVP,
	__global type* ad1I1_SVP,
	__global type* ad1I2_SVP,
	__global type* ad1I3_SVP,
	__global type* ad1SVP_Poles
)
{
	int iMode = get_global_id(0) + 1;
	int jMode;
	int iX = get_global_id(1) + 1;
	int jX;
	int kSign;
	type fX;
	type fAux;
	type cmpAux1[3];
	type cmpAux2[3];
	type cmpAux3[3];
	type cmpIm[3];
	type cmpKj[3];
	type cmpKj2[3];
	type cmpKj2Ky2[3];
	type cmpKy[3];
	type cmpKy2[3];
	type cmpKy3[3];
	type cmpKy4[3];
	type cmpOne[3];
	type cmp_Exp_Ky_x[3];
	type cmp_Exp_sKj2Ky2_x[3];
	type cmp_sKj2Ky2[3];
	__global type* cmpIm3_RE;
	__global type* cmpIm3_IM;
	__global type* cmpIm2_RE;
	__global type* cmpIm2_IM;
	__global type* cmpIm1_RE;
	__global type* cmpIm1_IM;
	__global type* cmpI0_RE;
	__global type* cmpI0_IM;
	__global type* cmpI1_RE;
	__global type* cmpI1_IM;
	__global type* cmpI2_RE;
	__global type* cmpI2_IM;
	__global type* cmpI3_RE;
	__global type* cmpI3_IM;

	__global type* acmp1SVP_Poles[3];
	__global type* acmp1Im3_SVP[2*N_X*N_DOF_SVP+1];
	__global type* acmp1Im2_SVP[2*N_X*N_DOF_SVP+1];
	__global type* acmp1Im1_SVP[2*N_X*N_DOF_SVP+1];
	__global type* acmp1I0_SVP[2*N_X*N_DOF_SVP+1];
	__global type* acmp1I1_SVP[2*N_X*N_DOF_SVP+1];
	__global type* acmp1I2_SVP[2*N_X*N_DOF_SVP+1];
	__global type* acmp1I3_SVP[2*N_X*N_DOF_SVP+1];
	__global type** acmp2Im3_SVP[2*N_X+1];
	__global type** acmp2Im2_SVP[2*N_X+1];
	__global type** acmp2Im1_SVP[2*N_X+1];
	__global type** acmp2I0_SVP[2*N_X+1];
	__global type** acmp2I1_SVP[2*N_X+1];
	__global type** acmp2I2_SVP[2*N_X+1];
	__global type** acmp2I3_SVP[2*N_X+1];
	__global type*** acmp3Im3_SVP[3];
	__global type*** acmp3Im2_SVP[3];
	__global type*** acmp3Im1_SVP[3];
	__global type*** acmp3I0_SVP[3];
	__global type*** acmp3I1_SVP[3];
	__global type*** acmp3I2_SVP[3];
	__global type*** acmp3I3_SVP[3];

	/* Calculate pointers */
	acmp1SVP_Poles[1] = &ad1SVP_Poles[0];
	acmp1SVP_Poles[1]--;
	acmp1SVP_Poles[2] = &ad1SVP_Poles[nDofSVP];
	acmp1SVP_Poles[2]--;

	acmp3Im3_SVP[1] = &acmp2Im3_SVP[1];
	acmp3Im3_SVP[1]--;
	acmp3Im3_SVP[2] = &acmp2Im3_SVP[1+nX];
	acmp3Im3_SVP[2]--;
	for (jX = 1; jX <= nX; jX++)
	{
		acmp3Im3_SVP[1][jX] = &acmp1Im3_SVP[(jX-1)*nDofSVP + 1];
		acmp3Im3_SVP[1][jX]--;
		acmp3Im3_SVP[2][jX] = &acmp1Im3_SVP[(nX+jX-1)*nDofSVP + 1];
		acmp3Im3_SVP[2][jX]--;
		for (jMode = 1; jMode <= nDofSVP; jMode++)
		{
			acmp3Im3_SVP[1][jX][jMode] =
							&ad1Im3_SVP[(jX-1)*nDofSVP*6+(jMode-1)*6];
			acmp3Im3_SVP[1][jX][jMode]--;
			acmp3Im3_SVP[2][jX][jMode] =
							&ad1Im3_SVP[(nX+jX-1)*nDofSVP*6+(jMode-1)*6];
			acmp3Im3_SVP[2][jX][jMode]--;
		} /* for (jMode) */
	} /* for (jX) */

	acmp3Im2_SVP[1] = &acmp2Im2_SVP[1];
	acmp3Im2_SVP[1]--;
	acmp3Im2_SVP[2] = &acmp2Im2_SVP[1+nX];
	acmp3Im2_SVP[2]--;
	for (jX = 1; jX <= nX; jX++)
	{
		acmp3Im2_SVP[1][jX] = &acmp1Im2_SVP[(jX-1)*nDofSVP + 1];
		acmp3Im2_SVP[1][jX]--;
		acmp3Im2_SVP[2][jX] = &acmp1Im2_SVP[(nX+jX-1)*nDofSVP + 1];
		acmp3Im2_SVP[2][jX]--;
		for (jMode = 1; jMode <= nDofSVP; jMode++)
		{
			acmp3Im2_SVP[1][jX][jMode] =
							&ad1Im2_SVP[(jX-1)*nDofSVP*6+(jMode-1)*6];
			acmp3Im2_SVP[1][jX][jMode]--;
			acmp3Im2_SVP[2][jX][jMode] =
							&ad1Im2_SVP[(nX+jX-1)*nDofSVP*6+(jMode-1)*6];
			acmp3Im2_SVP[2][jX][jMode]--;
		} /* for (jMode) */
	} /* for (jX) */

	acmp3Im1_SVP[1] = &acmp2Im1_SVP[1];
	acmp3Im1_SVP[1]--;
	acmp3Im1_SVP[2] = &acmp2Im1_SVP[1+nX];
	acmp3Im1_SVP[2]--;
	for (jX = 1; jX <= nX; jX++)
	{
		acmp3Im1_SVP[1][jX] = &acmp1Im1_SVP[(jX-1)*nDofSVP + 1];
		acmp3Im1_SVP[1][jX]--;
		acmp3Im1_SVP[2][jX] = &acmp1Im1_SVP[(nX+jX-1)*nDofSVP + 1];
		acmp3Im1_SVP[2][jX]--;
		for (jMode = 1; jMode <= nDofSVP; jMode++)
		{
			acmp3Im1_SVP[1][jX][jMode] =
							&ad1Im1_SVP[(jX-1)*nDofSVP*6+(jMode-1)*6];
			acmp3Im1_SVP[1][jX][jMode]--;
			acmp3Im1_SVP[2][jX][jMode] =
							&ad1Im1_SVP[(nX+jX-1)*nDofSVP*6+(jMode-1)*6];
			acmp3Im1_SVP[2][jX][jMode]--;
		} /* for (jMode) */
	} /* for (jX) */

	acmp3I0_SVP[1] = &acmp2I0_SVP[1];
	acmp3I0_SVP[1]--;
	acmp3I0_SVP[2] = &acmp2I0_SVP[1+nX];
	acmp3I0_SVP[2]--;
	for (jX = 1; jX <= nX; jX++)
	{
		acmp3I0_SVP[1][jX] = &acmp1I0_SVP[(jX-1)*nDofSVP + 1];
		acmp3I0_SVP[1][jX]--;
		acmp3I0_SVP[2][jX] = &acmp1I0_SVP[(nX+jX-1)*nDofSVP + 1];
		acmp3I0_SVP[2][jX]--;
		for (jMode = 1; jMode <= nDofSVP; jMode++)
		{
			acmp3I0_SVP[1][jX][jMode] =
							&ad1I0_SVP[(jX-1)*nDofSVP*6+(jMode-1)*6];
			acmp3I0_SVP[1][jX][jMode]--;
			acmp3I0_SVP[2][jX][jMode] =
							&ad1I0_SVP[(nX+jX-1)*nDofSVP*6+(jMode-1)*6];
			acmp3I0_SVP[2][jX][jMode]--;
		} /* for (jMode) */
	} /* for (jX) */

	acmp3I1_SVP[1] = &acmp2I1_SVP[1];
	acmp3I1_SVP[1]--;
	acmp3I1_SVP[2] = &acmp2I1_SVP[1+nX];
	acmp3I1_SVP[2]--;
	for (jX = 1; jX <= nX; jX++)
	{
		acmp3I1_SVP[1][jX] = &acmp1I1_SVP[(jX-1)*nDofSVP + 1];
		acmp3I1_SVP[1][jX]--;
		acmp3I1_SVP[2][jX] = &acmp1I1_SVP[(nX+jX-1)*nDofSVP + 1];
		acmp3I1_SVP[2][jX]--;
		for (jMode = 1; jMode <= nDofSVP; jMode++)
		{
			acmp3I1_SVP[1][jX][jMode] =
							&ad1I1_SVP[(jX-1)*nDofSVP*6+(jMode-1)*6];
			acmp3I1_SVP[1][jX][jMode]--;
			acmp3I1_SVP[2][jX][jMode] =
							&ad1I1_SVP[(nX+jX-1)*nDofSVP*6+(jMode-1)*6];
			acmp3I1_SVP[2][jX][jMode]--;
		} /* for (jMode) */
	} /* for (jX) */

	acmp3I2_SVP[1] = &acmp2I2_SVP[1];
	acmp3I2_SVP[1]--;
	acmp3I2_SVP[2] = &acmp2I2_SVP[1+nX];
	acmp3I2_SVP[2]--;
	for (jX = 1; jX <= nX; jX++)
	{
		acmp3I2_SVP[1][jX] = &acmp1I2_SVP[(jX-1)*nDofSVP + 1];
		acmp3I2_SVP[1][jX]--;
		acmp3I2_SVP[2][jX] = &acmp1I2_SVP[(nX+jX-1)*nDofSVP + 1];
		acmp3I2_SVP[2][jX]--;
		for (jMode = 1; jMode <= nDofSVP; jMode++)
		{
			acmp3I2_SVP[1][jX][jMode] =
							&ad1I2_SVP[(jX-1)*nDofSVP*6+(jMode-1)*6];
			acmp3I2_SVP[1][jX][jMode]--;
			acmp3I2_SVP[2][jX][jMode] =
							&ad1I2_SVP[(nX+jX-1)*nDofSVP*6+(jMode-1)*6];
			acmp3I2_SVP[2][jX][jMode]--;
		} /* for (jMode) */
	} /* for (jX) */

	acmp3I3_SVP[1] = &acmp2I3_SVP[1];
	acmp3I3_SVP[1]--;
	acmp3I3_SVP[2] = &acmp2I3_SVP[1+nX];
	acmp3I3_SVP[2]--;
	for (jX = 1; jX <= nX; jX++)
	{
		acmp3I3_SVP[1][jX] = &acmp1I3_SVP[(jX-1)*nDofSVP + 1];
		acmp3I3_SVP[1][jX]--;
		acmp3I3_SVP[2][jX] = &acmp1I3_SVP[(nX+jX-1)*nDofSVP + 1];
		acmp3I3_SVP[2][jX]--;
		for (jMode = 1; jMode <= nDofSVP; jMode++)
		{
			acmp3I3_SVP[1][jX][jMode] =
							&ad1I3_SVP[(jX-1)*nDofSVP*6+(jMode-1)*6];
			acmp3I3_SVP[1][jX][jMode]--;
			acmp3I3_SVP[2][jX][jMode] =
							&ad1I3_SVP[(nX+jX-1)*nDofSVP*6+(jMode-1)*6];
			acmp3I3_SVP[2][jX][jMode]--;
		} /* for (jMode) */
	} /* for (jX) */

	cmpKy[1] = fKy;
	cmpKy[2] = 0.0;
	cmpKy2[1] = fKy * fKy;
	cmpKy2[2] = 0.0;
	cmpKy3[1] = fabs(fKy * fKy * fKy);
	cmpKy3[2] = 0.0;
	cmpKy4[1] = fKy * fKy * fKy * fKy;
	cmpKy4[2] = 0.0;
	cmpOne[1] = 1.0;
	cmpOne[2] = 0.0;
	cmpIm[1] = 0.0;
	cmpIm[2] = 1.0;

	cmpKj[1] = acmp1SVP_Poles[1][iMode];
	cmpKj[2] = acmp1SVP_Poles[2][iMode];
	C6_mult_CMP(cmpKj2, cmpKj, cmpKj);
	C6_sub_CMP(cmpKj2Ky2, cmpKj2, cmpKy2);
	fAux = 0.5;
	C6_pow_CMP(cmp_sKj2Ky2, cmpKj2Ky2, fAux); // This is the line where the error starts
	if (cmp_sKj2Ky2[2] > 0.0)
	{
		cmp_sKj2Ky2[1] = -cmp_sKj2Ky2[1];
		cmp_sKj2Ky2[2] = -cmp_sKj2Ky2[2];
	}

	/* Calculate SVP integrals */
	cmpIm3_RE = acmp3Im3_SVP[1][iX][iMode];
	cmpIm3_IM = acmp3Im3_SVP[2][iX][iMode];
	cmpIm2_RE = acmp3Im2_SVP[1][iX][iMode];
	cmpIm2_IM = acmp3Im2_SVP[2][iX][iMode];
	cmpIm1_RE = acmp3Im1_SVP[1][iX][iMode];
	cmpIm1_IM = acmp3Im1_SVP[2][iX][iMode];
	cmpI0_RE = acmp3I0_SVP[1][iX][iMode];
	cmpI0_IM = acmp3I0_SVP[2][iX][iMode];
	cmpI1_RE = acmp3I1_SVP[1][iX][iMode];
	cmpI1_IM = acmp3I1_SVP[2][iX][iMode];
	cmpI2_RE = acmp3I2_SVP[1][iX][iMode];
	cmpI2_IM = acmp3I2_SVP[2][iX][iMode];
	cmpI3_RE = acmp3I3_SVP[1][iX][iMode];
	cmpI3_IM = acmp3I3_SVP[2][iX][iMode];

	fX = ad1X[iX - 1];
	if (fX > 0.0) kSign = 1;
	else if (fX == 0.0) kSign = 0;
	else if (fX < 0.0) kSign = -1;
	fX = fabs(fX);

	cmpAux1[1] = fX * cmp_sKj2Ky2[2];
	cmpAux1[2] = -fX * cmp_sKj2Ky2[1];
	C6_exp_CMP(cmp_Exp_sKj2Ky2_x, cmpAux1);

	cmpAux1[1] = -fabs(fX * fKy);
	cmpAux1[2] = 0.0;
	C6_exp_CMP(cmp_Exp_Ky_x, cmpAux1);

	/* Im3,1 */
	cmpAux3[1] = fX * fX / 2.0;
	cmpAux3[2] = 0.0;
	C6_div_CMP(cmpAux1, cmpOne, cmpKj2Ky2);
	cmpAux3[1] -= cmpAux1[1];
	cmpAux3[2] -= cmpAux1[2];
	C6_div_CMP(cmpAux1, cmp_Exp_sKj2Ky2_x, cmpKj2Ky2);
	cmpAux3[1] += cmpAux1[1];
	cmpAux3[2] += cmpAux1[2];
	cmpAux1[1] = 2.0 * cmpKj2Ky2[2];
	cmpAux1[2] = -2.0 * cmpKj2Ky2[1];
	C6_div_CMP(cmpAux2, cmpAux3, cmpAux1);
	cmpAux3[1] = -kSign * cmpAux2[1];
	cmpAux3[2] = -kSign * cmpAux2[2];
	cmpIm3_RE[1] = cmpKj[1];
	cmpIm3_IM[1] = cmpKj[2];

	/* Im3,2 */
	cmpAux1[1] = fX;
	cmpAux1[2] = 0.0;
	C6_div_CMP(cmpAux2, cmpAux1, cmpKj2Ky2);
	cmpAux1[1] = fabs(fKy);
	cmpAux1[2] = 0.0;
	C6_mult_CMP(cmpAux3, cmpAux1, cmpKj2);
	C6_div_CMP(cmpAux1, cmp_Exp_Ky_x, cmpAux3);
	cmpAux2[1] += cmpAux1[1];
	cmpAux2[2] += cmpAux1[2];
	cmpAux1[1] = 0.0;
	cmpAux1[2] = -fKy * fKy;
	C6_mult_CMP(cmpAux3, cmpAux1, cmp_Exp_sKj2Ky2_x);
	C6_div_CMP(cmpAux1, cmpAux3, cmpKj2);
	C6_div_CMP(cmpAux3, cmpAux1, cmpKj2Ky2);
	C6_div_CMP(cmpAux1, cmpAux3, cmp_sKj2Ky2);
	cmpAux2[1] += cmpAux1[1];
	cmpAux2[2] += cmpAux1[2];
	C6_div_CMP(cmpAux3, cmpAux2, cmpKy);
	cmpAux3[1] /= 2.0;
	cmpAux3[2] /= 2.0;
	cmpIm3_RE[2] = cmpKj2[1];
	cmpIm3_IM[2] = cmpKj2[2];
	
	/* Im3,3 */
	C6_div_CMP(cmpAux1, cmpOne, cmpKj2Ky2);
	cmpAux1[1] = -cmpAux1[1];
	cmpAux1[2] = -cmpAux1[2];
	C6_div_CMP(cmpAux2, cmp_Exp_Ky_x, cmpKj2);
	cmpAux1[1] += cmpAux2[1];
	cmpAux1[2] += cmpAux2[2];
	C6_mult_CMP(cmpAux2, cmpKy2, cmp_Exp_sKj2Ky2_x);
	C6_div_CMP(cmpAux3, cmpAux2, cmpKj2);
	C6_div_CMP(cmpAux2, cmpAux3, cmpKj2Ky2);
	cmpAux1[1] += cmpAux2[1];
	cmpAux1[2] += cmpAux2[2];
	cmpAux2[1] = 0.0;
	cmpAux2[2] = 2.0 * fKy * fKy;
	C6_div_CMP(cmpAux3, cmpAux1, cmpAux2);
	cmpIm3_RE[3] = cmpKj2Ky2[1];
	cmpIm3_IM[3] = cmpKj2Ky2[2];

	/* Im3,4 */
	cmpAux1[1] = -fX * fX / 2.0;
	cmpAux1[2] = 0.0;
	C6_div_CMP(cmpAux2, cmpAux1, cmpKj2Ky2);
	cmpAux1[1] = 2.0 * cmpKy2[1] - cmpKj2[1];
	cmpAux1[2] = 2.0 * cmpKy2[2] - cmpKj2[2];
	C6_div_CMP(cmpAux3, cmpAux1, cmpKy2);
	C6_div_CMP(cmpAux1, cmpAux3, cmpKj2Ky2);
	C6_div_CMP(cmpAux3, cmpAux1, cmpKj2Ky2);
	cmpAux2[1] += cmpAux3[1];
	cmpAux2[2] += cmpAux3[2];
	C6_div_CMP(cmpAux1, cmp_Exp_Ky_x, cmpKy2);
	C6_div_CMP(cmpAux3, cmpAux1, cmpKj2);
	cmpAux2[1] += cmpAux3[1];
	cmpAux2[2] += cmpAux3[2];
	C6_mult_CMP(cmpAux1, cmpKy2, cmp_Exp_sKj2Ky2_x);
	C6_div_CMP(cmpAux3, cmpAux1, cmpKj2);
	C6_div_CMP(cmpAux1, cmpAux3, cmpKj2Ky2);
	C6_div_CMP(cmpAux3, cmpAux1, cmpKj2Ky2);
	cmpAux2[1] -= cmpAux3[1];
	cmpAux2[2] -= cmpAux3[2];
	C6_div_CMP(cmpAux3, cmpAux2, cmpIm);
	cmpAux3[1] *= -kSign / 2.0;
	cmpAux3[2] *= -kSign / 2.0;
	cmpIm3_RE[4] = cmp_sKj2Ky2[1];
	cmpIm3_IM[4] = cmp_sKj2Ky2[2];

	/* Im3,5 */
	cmpAux1[1] = fX;
	cmpAux1[2] = 0.0;
	C6_div_CMP(cmpAux2, cmp_Exp_sKj2Ky2_x, cmp_sKj2Ky2);
	C6_mult_CMP(cmpAux3, cmpAux2, cmpIm);
	cmpAux1[1] -= cmpAux3[1];
	cmpAux1[2] -= cmpAux3[2];
	C6_mult_CMP(cmpAux2, cmpKj, cmpKj2Ky2);
	C6_div_CMP(cmpAux3, cmpAux1, cmpAux2);
	cmpIm3_RE[5] = cmp_Exp_sKj2Ky2_x[1];
	cmpIm3_IM[5] = cmp_Exp_sKj2Ky2_x[2];

	/* Im3,6 */
	cmpAux1[1] = fKy * cmpIm3_RE[1];
	cmpAux1[2] = fKy * cmpIm3_IM[1];
	C6_div_CMP(cmpAux2, cmpAux1, cmpKj);
	cmpIm3_RE[6] = cmp_Exp_Ky_x[1];
	cmpIm3_IM[6] = cmp_Exp_Ky_x[2];

	if (fKy == 0)
	{
		cmpIm3_RE[2] = 0.0;
		cmpIm3_IM[2] = 0.0;
		cmpIm3_RE[4] = 0.0;
		cmpIm3_IM[4] = 0.0;
		cmpIm3_RE[6] = 0.0;
		cmpIm3_IM[6] = 0.0;
		cmpIm3_RE[3] = cmpIm3_RE[1];
		cmpIm3_IM[3] = cmpIm3_IM[1];
	} /* if */

	/* Im2,1 */
	cmpAux1[1] = cmpIm3_RE[5];
	cmpAux1[2] = cmpIm3_IM[5];
	C6_mult_CMP(cmpAux2, cmpAux1, cmpKj);
	cmpIm2_RE[1] = cmpAux2[1];
	cmpIm2_IM[1] = cmpAux2[2];

	/* Im2,2 */
	cmpIm2_RE[2] = cmpIm3_RE[3] *fKy;
	cmpIm2_IM[2] = cmpIm3_IM[3] *fKy;

	/* Im2,3 */
	cmpAux1[1] = cmp_Exp_Ky_x[1] / fabs(fKy);
	cmpAux1[2] = cmp_Exp_Ky_x[2] / fabs(fKy);
	C6_div_CMP(cmpAux2, cmp_Exp_sKj2Ky2_x, cmp_sKj2Ky2);
	cmpAux1[1] -= cmpAux2[2];
	cmpAux1[2] += cmpAux2[1];
	C6_div_CMP(cmpAux2, cmpAux1, cmpKj2);
	cmpIm2_RE[3] = -cmpAux2[1] / 2.0;
	cmpIm2_IM[3] = -cmpAux2[2] / 2.0;

	/* Im2,4 */
	cmpIm2_RE[4] = cmpIm3_RE[2] *fKy;
	cmpIm2_IM[4] = cmpIm3_IM[2] *fKy;

	/* Im2,5 */
	C6_sub_CMP(cmpAux1, cmpOne, cmp_Exp_sKj2Ky2_x);
	C6_div_CMP(cmpAux2, cmpAux1, cmpKj2Ky2);
	C6_div_CMP(cmpAux1, cmpAux2, cmpKj);
	C6_div_CMP(cmpAux2, cmpAux1, cmpIm);
	cmpIm2_RE[5] = -kSign * cmpAux2[1] / 2.0;
	cmpIm2_IM[5] = -kSign * cmpAux2[2] / 2.0;

	/* Im2,6 */
	cmpIm2_RE[6] = cmpIm3_RE[5] *fKy;
	cmpIm2_IM[6] = cmpIm3_IM[5] *fKy;

	if (fKy == 0)
	{
		cmpIm2_RE[2] = 0.0;
		cmpIm2_IM[2] = 0.0;
		cmpIm2_RE[4] = 0.0;
		cmpIm2_IM[4] = 0.0;
		cmpIm2_RE[6] = 0.0;
		cmpIm2_IM[6] = 0.0;
		cmpIm2_RE[3] = cmpIm2_RE[1];
		cmpIm2_IM[3] = cmpIm2_IM[1];
	} /* if */

	/* Im1,1 */
	cmpAux1[1] = cmpIm2_RE[5];
	cmpAux1[2] = cmpIm2_IM[5];
	C6_mult_CMP(cmpAux2, cmpAux1, cmpKj);
	cmpIm1_RE[1] = cmpAux2[1];
	cmpIm1_IM[1] = cmpAux2[2];

	/* Im1,2 */
	cmpIm1_RE[2] = cmpIm2_RE[3] *fKy;
	cmpIm1_IM[2] = cmpIm2_IM[3] *fKy;

	/* Im1,3 */
	C6_sub_CMP(cmpAux1, cmp_Exp_Ky_x, cmp_Exp_sKj2Ky2_x);
	C6_div_CMP(cmpAux2, cmpAux1, cmpKj2);
	C6_div_CMP(cmpAux1, cmpAux2, cmpIm);
	cmpIm1_RE[3] = -kSign * cmpAux1[1] / 2.0;
	cmpIm1_IM[3] = -kSign * cmpAux1[2] / 2.0;

	/* Im1,4 */
	cmpIm1_RE[4] = cmpIm2_RE[2] *fKy;
	cmpIm1_IM[4] = cmpIm2_IM[2] *fKy;

	/* Im1,5 */
	C6_div_CMP(cmpAux1, cmp_Exp_sKj2Ky2_x, cmp_sKj2Ky2);
	C6_div_CMP(cmpAux2, cmpAux1, cmpKj);
	C6_div_CMP(cmpAux1, cmpAux2, cmpIm);
	cmpIm1_RE[5] = cmpAux1[1] / 2.0;
	cmpIm1_IM[5] = cmpAux1[2] / 2.0;

	/* Im1,6 */
	cmpIm1_RE[6] = cmpIm2_RE[5] *fKy;
	cmpIm1_IM[6] = cmpIm2_IM[5] *fKy;

	/* I0,1 */
	cmpAux1[1] = cmpIm1_RE[5];
	cmpAux1[2] = cmpIm1_IM[5];
	C6_mult_CMP(cmpAux2, cmpAux1, cmpKj);
	cmpI0_RE[1] = cmpAux2[1];
	cmpI0_IM[1] = cmpAux2[2];

	/* I0,2 */
	cmpI0_RE[2] = cmpIm1_RE[3] * fKy;
	cmpI0_IM[2] = cmpIm1_IM[3] * fKy;

	/* I0,3 */
	C6_mult_CMP(cmpAux1, cmp_sKj2Ky2, cmp_Exp_sKj2Ky2_x);
	cmpAux2[1] = cmp_Exp_Ky_x[1] * fabs(fKy);
	cmpAux2[2] = cmp_Exp_Ky_x[2] * fabs(fKy);
	cmpAux1[1] -= cmpAux2[2];
	cmpAux1[2] += cmpAux2[1];
	C6_div_CMP(cmpAux2, cmpAux1, cmpKj2);
	C6_div_CMP(cmpAux1, cmpAux2, cmpIm);
	cmpI0_RE[3] = cmpAux1[1] / 2.0;
	cmpI0_IM[3] = cmpAux1[2] / 2.0;

	/* I0,4 */
	cmpI0_RE[4] = cmpIm1_RE[2] * fKy;
	cmpI0_IM[4] = cmpIm1_IM[2] * fKy;

	/* I0,5 */
	C6_div_CMP(cmpAux1, cmp_Exp_sKj2Ky2_x, cmpKj);
	C6_div_CMP(cmpAux2, cmpAux1, cmpIm);
	cmpI0_RE[5] = kSign * cmpAux2[1] / 2.0;
	cmpI0_IM[5] = kSign * cmpAux2[2] / 2.0;

	/* I0,6 */
	cmpI0_RE[6] = cmpIm1_RE[5] * fKy;
	cmpI0_IM[6] = cmpIm1_IM[5] * fKy;

	/* I1,1 */
	cmpAux1[1] = cmpI0_RE[5];
	cmpAux1[2] = cmpI0_IM[5];
	C6_mult_CMP(cmpAux2, cmpAux1, cmpKj);
	cmpI1_RE[1] = cmpAux2[1];
	cmpI1_IM[1] = cmpAux2[2];

	/* I1,2 */
	cmpI1_RE[2] = cmpI0_RE[3] * fKy;
	cmpI1_IM[2] = cmpI0_IM[3] * fKy;

	/* I1,3 */
	C6_mult_CMP(cmpAux1, cmpKj2Ky2, cmp_Exp_sKj2Ky2_x);
	C6_mult_CMP(cmpAux2, cmpKy2, cmp_Exp_Ky_x);
	cmpAux1[1] += cmpAux2[1];
	cmpAux1[2] += cmpAux2[2];
	C6_div_CMP(cmpAux2, cmpAux1, cmpKj2);
	C6_div_CMP(cmpAux1, cmpAux2, cmpIm);
	cmpI1_RE[3] = kSign * cmpAux1[1] / 2.0;
	cmpI1_IM[3] = kSign * cmpAux1[2] / 2.0;

	/* I1,4 */
	cmpI1_RE[4] = cmpI0_RE[2] * fKy;
	cmpI1_IM[4] = cmpI0_IM[2] * fKy;

	/* I1,5 */
	C6_mult_CMP(cmpAux1, cmp_sKj2Ky2, cmp_Exp_sKj2Ky2_x);
	C6_div_CMP(cmpAux2, cmpAux1, cmpKj);
	C6_div_CMP(cmpAux1, cmpAux2, cmpIm);
	cmpI1_RE[5] = cmpAux1[1] / 2.0;
	cmpI1_IM[5] = cmpAux1[2] / 2.0;

	/* I1,6 */
	cmpI1_RE[6] = cmpI0_RE[5] * fKy;
	cmpI1_IM[6] = cmpI0_IM[5] * fKy;

	/* I2,1 */
	cmpAux1[1] = cmpI1_RE[5];
	cmpAux1[2] = cmpI1_IM[5];
	C6_mult_CMP(cmpAux2, cmpAux1, cmpKj);
	cmpI2_RE[1] = cmpAux2[1];
	cmpI2_IM[1] = cmpAux2[2];

	/* I2,2 */
	cmpI2_RE[2] = cmpI1_RE[3] * fKy;
	cmpI2_IM[2] = cmpI1_IM[3] * fKy;

	/* I2,3 */
	C6_mult_CMP(cmpAux1, cmp_sKj2Ky2, cmpKj2Ky2);
	C6_mult_CMP(cmpAux2, cmpAux1, cmp_Exp_sKj2Ky2_x);
	C6_mult_CMP(cmpAux1, cmp_Exp_Ky_x, cmpKy2);
	cmpAux1[1] *= fabs(fKy);
	cmpAux1[2] *= fabs(fKy);
	cmpAux2[1] += cmpAux1[2];
	cmpAux2[2] -= cmpAux1[1];
	C6_div_CMP(cmpAux1, cmpAux2, cmpKj2);
	C6_div_CMP(cmpAux2, cmpAux1, cmpIm);
	cmpI2_RE[3] = cmpAux2[1] / 2.0;
	cmpI2_IM[3] = cmpAux2[2] / 2.0;

	/* I2,4 */
	cmpI2_RE[4] = cmpI1_RE[2] * fKy;
	cmpI2_IM[4] = cmpI1_IM[2] * fKy;

	/* I2,5 */
	C6_mult_CMP(cmpAux1, cmpKj2Ky2, cmp_Exp_sKj2Ky2_x);
	C6_div_CMP(cmpAux2, cmpAux1, cmpKj);
	C6_div_CMP(cmpAux1, cmpAux2, cmpIm);
	cmpI2_RE[5] = kSign * cmpAux1[1] / 2.0;
	cmpI2_IM[5] = kSign * cmpAux1[2] / 2.0;

	/* I2,6 */
	cmpI2_RE[6] = cmpI1_RE[5] * fKy;
	cmpI2_IM[6] = cmpI1_IM[5] * fKy;

	/* I3,1 */
	cmpAux1[1] = cmpI2_RE[5];
	cmpAux1[2] = cmpI2_IM[5];
	C6_mult_CMP(cmpAux2, cmpAux1, cmpKj);
	cmpI3_RE[1] = cmpAux2[1];
	cmpI3_IM[1] = cmpAux2[2];

	/* I3,2 */
	cmpI3_RE[2] = cmpI2_RE[3] * fKy;
	cmpI3_IM[2] = cmpI2_IM[3] * fKy;

	/* I3,3 */
	C6_mult_CMP(cmpAux1, cmpKj2Ky2, cmpKj2Ky2);
	C6_mult_CMP(cmpAux2, cmpAux1, cmp_Exp_sKj2Ky2_x);
	C6_mult_CMP(cmpAux1, cmp_Exp_Ky_x, cmpKy4);
	cmpAux2[1] -= cmpAux1[1];
	cmpAux2[2] -= cmpAux1[2];
	C6_div_CMP(cmpAux1, cmpAux2, cmpKj2);
	C6_div_CMP(cmpAux2, cmpAux1, cmpIm);
	cmpI3_RE[3] = kSign * cmpAux2[1] / 2.0;
	cmpI3_IM[3] = kSign * cmpAux2[2] / 2.0;

	/* I3,4 */
	cmpI3_RE[4] = cmpI2_RE[2] * fKy;
	cmpI3_IM[4] = cmpI2_IM[2] * fKy;

	/* I3,5 */
	C6_mult_CMP(cmpAux2, cmp_sKj2Ky2, cmpKj2Ky2);
	C6_mult_CMP(cmpAux1, cmpAux2, cmp_Exp_sKj2Ky2_x);
	C6_div_CMP(cmpAux2, cmpAux1, cmpKj);
	C6_div_CMP(cmpAux1, cmpAux2, cmpIm);
	cmpI3_RE[5] = cmpAux1[1] / 2.0;
	cmpI3_IM[5] = cmpAux1[2] / 2.0;

	/* I3,6 */
	cmpI3_RE[6] = cmpI2_RE[5] * fKy;
	cmpI3_IM[6] = cmpI2_IM[5] * fKy;
} /* CalcTLM_25D_Integrals_GPU_SVP */

__kernel void CalcTLM_25D_Integrals_GPU_SH
(
	int nX,
	int nDofSH,
	type fKy,
	__global type* ad1X,
	__global type* ad1Im3_SH,
	__global type* ad1Im2_SH,
	__global type* ad1Im1_SH,
	__global type* ad1I0_SH,
	__global type* ad1I1_SH,
	__global type* ad1I2_SH,
	__global type* ad1I3_SH,
	__global type* ad1SH_Poles
)
{
	int iMode = get_global_id(0) + 1;
	int jMode;
	int iX = get_global_id(1) + 1;
	int jX;
	int kSign;
	type fX;
	type fAux;
	type cmpAux1[3];
	type cmpAux2[3];
	type cmpAux3[3];
	type cmpIm[3];
	type cmpKj[3];
	type cmpKj2[3];
	type cmpKj2Ky2[3];
	type cmpKy[3];
	type cmpKy2[3];
	type cmpKy3[3];
	type cmpKy4[3];
	type cmpOne[3];
	type cmp_Exp_Ky_x[3];
	type cmp_Exp_sKj2Ky2_x[3];
	type cmp_sKj2Ky2[3];
	__global type* cmpIm3_RE;
	__global type* cmpIm3_IM;
	__global type* cmpIm2_RE;
	__global type* cmpIm2_IM;
	__global type* cmpIm1_RE;
	__global type* cmpIm1_IM;
	__global type* cmpI0_RE;
	__global type* cmpI0_IM;
	__global type* cmpI1_RE;
	__global type* cmpI1_IM;
	__global type* cmpI2_RE;
	__global type* cmpI2_IM;
	__global type* cmpI3_RE;
	__global type* cmpI3_IM;

	__global type* acmp1SH_Poles[3];
	__global type* acmp1Im3_SH[2*N_X*N_DOF_SH+1];
	__global type* acmp1Im2_SH[2*N_X*N_DOF_SH+1];
	__global type* acmp1Im1_SH[2*N_X*N_DOF_SH+1];
	__global type* acmp1I0_SH[2*N_X*N_DOF_SH+1];
	__global type* acmp1I1_SH[2*N_X*N_DOF_SH+1];
	__global type* acmp1I2_SH[2*N_X*N_DOF_SH+1];
	__global type* acmp1I3_SH[2*N_X*N_DOF_SH+1];
	__global type** acmp2Im3_SH[2*N_X+1];
	__global type** acmp2Im2_SH[2*N_X+1];
	__global type** acmp2Im1_SH[2*N_X+1];
	__global type** acmp2I0_SH[2*N_X+1];
	__global type** acmp2I1_SH[2*N_X+1];
	__global type** acmp2I2_SH[2*N_X+1];
	__global type** acmp2I3_SH[2*N_X+1];
	__global type*** acmp3Im3_SH[3];
	__global type*** acmp3Im2_SH[3];
	__global type*** acmp3Im1_SH[3];
	__global type*** acmp3I0_SH[3];
	__global type*** acmp3I1_SH[3];
	__global type*** acmp3I2_SH[3];
	__global type*** acmp3I3_SH[3];

	/* Calculate pointers */
	acmp1SH_Poles[1] = &ad1SH_Poles[0];
	acmp1SH_Poles[1]--;
	acmp1SH_Poles[2] = &ad1SH_Poles[nDofSH];
	acmp1SH_Poles[2]--;

	acmp3Im3_SH[1] = &acmp2Im3_SH[1];
	acmp3Im3_SH[1]--;
	acmp3Im3_SH[2] = &acmp2Im3_SH[1+nX];
	acmp3Im3_SH[2]--;
	for (jX = 1; jX <= nX; jX++)
	{
		acmp3Im3_SH[1][jX] = &acmp1Im3_SH[(jX-1)*nDofSH + 1];
		acmp3Im3_SH[1][jX]--;
		acmp3Im3_SH[2][jX] = &acmp1Im3_SH[(nX+jX-1)*nDofSH + 1];
		acmp3Im3_SH[2][jX]--;
		for (jMode = 1; jMode <= nDofSH; jMode++)
		{
			acmp3Im3_SH[1][jX][jMode] =
							&ad1Im3_SH[(jX-1)*nDofSH*4+(jMode-1)*4];
			acmp3Im3_SH[1][jX][jMode]--;
			acmp3Im3_SH[2][jX][jMode] =
							&ad1Im3_SH[(nX+jX-1)*nDofSH*4+(jMode-1)*4];
			acmp3Im3_SH[2][jX][jMode]--;
		} /* for (jMode) */
	} /* for (jX) */

	acmp3Im2_SH[1] = &acmp2Im2_SH[1];
	acmp3Im2_SH[1]--;
	acmp3Im2_SH[2] = &acmp2Im2_SH[1+nX];
	acmp3Im2_SH[2]--;
	for (jX = 1; jX <= nX; jX++)
	{
		acmp3Im2_SH[1][jX] = &acmp1Im2_SH[(jX-1)*nDofSH + 1];
		acmp3Im2_SH[1][jX]--;
		acmp3Im2_SH[2][jX] = &acmp1Im2_SH[(nX+jX-1)*nDofSH + 1];
		acmp3Im2_SH[2][jX]--;
		for (jMode = 1; jMode <= nDofSH; jMode++)
		{
			acmp3Im2_SH[1][jX][jMode] =
							&ad1Im2_SH[(jX-1)*nDofSH*4+(jMode-1)*4];
			acmp3Im2_SH[1][jX][jMode]--;
			acmp3Im2_SH[2][jX][jMode] =
							&ad1Im2_SH[(nX+jX-1)*nDofSH*4+(jMode-1)*4];
			acmp3Im2_SH[2][jX][jMode]--;
		} /* for (jMode) */
	} /* for (jX) */

	acmp3Im1_SH[1] = &acmp2Im1_SH[1];
	acmp3Im1_SH[1]--;
	acmp3Im1_SH[2] = &acmp2Im1_SH[1+nX];
	acmp3Im1_SH[2]--;
	for (jX = 1; jX <= nX; jX++)
	{
		acmp3Im1_SH[1][jX] = &acmp1Im1_SH[(jX-1)*nDofSH + 1];
		acmp3Im1_SH[1][jX]--;
		acmp3Im1_SH[2][jX] = &acmp1Im1_SH[(nX+jX-1)*nDofSH + 1];
		acmp3Im1_SH[2][jX]--;
		for (jMode = 1; jMode <= nDofSH; jMode++)
		{
			acmp3Im1_SH[1][jX][jMode] =
							&ad1Im1_SH[(jX-1)*nDofSH*4+(jMode-1)*4];
			acmp3Im1_SH[1][jX][jMode]--;
			acmp3Im1_SH[2][jX][jMode] =
							&ad1Im1_SH[(nX+jX-1)*nDofSH*4+(jMode-1)*4];
			acmp3Im1_SH[2][jX][jMode]--;
		} /* for (jMode) */
	} /* for (jX) */

	acmp3I0_SH[1] = &acmp2I0_SH[1];
	acmp3I0_SH[1]--;
	acmp3I0_SH[2] = &acmp2I0_SH[1+nX];
	acmp3I0_SH[2]--;
	for (jX = 1; jX <= nX; jX++)
	{
		acmp3I0_SH[1][jX] = &acmp1I0_SH[(jX-1)*nDofSH + 1];
		acmp3I0_SH[1][jX]--;
		acmp3I0_SH[2][jX] = &acmp1I0_SH[(nX+jX-1)*nDofSH + 1];
		acmp3I0_SH[2][jX]--;
		for (jMode = 1; jMode <= nDofSH; jMode++)
		{
			acmp3I0_SH[1][jX][jMode] =
							&ad1I0_SH[(jX-1)*nDofSH*4+(jMode-1)*4];
			acmp3I0_SH[1][jX][jMode]--;
			acmp3I0_SH[2][jX][jMode] =
							&ad1I0_SH[(nX+jX-1)*nDofSH*4+(jMode-1)*4];
			acmp3I0_SH[2][jX][jMode]--;
		} /* for (jMode) */
	} /* for (jX) */

	acmp3I1_SH[1] = &acmp2I1_SH[1];
	acmp3I1_SH[1]--;
	acmp3I1_SH[2] = &acmp2I1_SH[1+nX];
	acmp3I1_SH[2]--;
	for (jX = 1; jX <= nX; jX++)
	{
		acmp3I1_SH[1][jX] = &acmp1I1_SH[(jX-1)*nDofSH + 1];
		acmp3I1_SH[1][jX]--;
		acmp3I1_SH[2][jX] = &acmp1I1_SH[(nX+jX-1)*nDofSH + 1];
		acmp3I1_SH[2][jX]--;
		for (jMode = 1; jMode <= nDofSH; jMode++)
		{
			acmp3I1_SH[1][jX][jMode] =
							&ad1I1_SH[(jX-1)*nDofSH*4+(jMode-1)*4];
			acmp3I1_SH[1][jX][jMode]--;
			acmp3I1_SH[2][jX][jMode] =
							&ad1I1_SH[(nX+jX-1)*nDofSH*4+(jMode-1)*4];
			acmp3I1_SH[2][jX][jMode]--;
		} /* for (jMode) */
	} /* for (jX) */

	acmp3I2_SH[1] = &acmp2I2_SH[1];
	acmp3I2_SH[1]--;
	acmp3I2_SH[2] = &acmp2I2_SH[1+nX];
	acmp3I2_SH[2]--;
	for (jX = 1; jX <= nX; jX++)
	{
		acmp3I2_SH[1][jX] = &acmp1I2_SH[(jX-1)*nDofSH + 1];
		acmp3I2_SH[1][jX]--;
		acmp3I2_SH[2][jX] = &acmp1I2_SH[(nX+jX-1)*nDofSH + 1];
		acmp3I2_SH[2][jX]--;
		for (jMode = 1; jMode <= nDofSH; jMode++)
		{
			acmp3I2_SH[1][jX][jMode] =
							&ad1I2_SH[(jX-1)*nDofSH*4+(jMode-1)*4];
			acmp3I2_SH[1][jX][jMode]--;
			acmp3I2_SH[2][jX][jMode] =
							&ad1I2_SH[(nX+jX-1)*nDofSH*4+(jMode-1)*4];
			acmp3I2_SH[2][jX][jMode]--;
		} /* for (jMode) */
	} /* for (jX) */

	acmp3I3_SH[1] = &acmp2I3_SH[1];
	acmp3I3_SH[1]--;
	acmp3I3_SH[2] = &acmp2I3_SH[1+nX];
	acmp3I3_SH[2]--;
	for (jX = 1; jX <= nX; jX++)
	{
		acmp3I3_SH[1][jX] = &acmp1I3_SH[(jX-1)*nDofSH + 1];
		acmp3I3_SH[1][jX]--;
		acmp3I3_SH[2][jX] = &acmp1I3_SH[(nX+jX-1)*nDofSH + 1];
		acmp3I3_SH[2][jX]--;
		for (jMode = 1; jMode <= nDofSH; jMode++)
		{
			acmp3I3_SH[1][jX][jMode] =
							&ad1I3_SH[(jX-1)*nDofSH*4+(jMode-1)*4];
			acmp3I3_SH[1][jX][jMode]--;
			acmp3I3_SH[2][jX][jMode] =
							&ad1I3_SH[(nX+jX-1)*nDofSH*4+(jMode-1)*4];
			acmp3I3_SH[2][jX][jMode]--;
		} /* for (jMode) */
	} /* for (jX) */

	cmpKy[1] = fKy;
	cmpKy[2] = 0.0;
	cmpKy2[1] = fKy * fKy;
	cmpKy2[2] = 0.0;
	cmpKy3[1] = fabs(fKy * fKy * fKy);
	cmpKy3[2] = 0.0;
	cmpKy4[1] = fKy * fKy * fKy * fKy;
	cmpKy4[2] = 0.0;
	cmpOne[1] = 1.0;
	cmpOne[2] = 0.0;
	cmpIm[1] = 0.0;
	cmpIm[2] = 1.0;

	cmpKj[1] = acmp1SH_Poles[1][iMode];
	cmpKj[2] = acmp1SH_Poles[2][iMode];
	C6_mult_CMP(cmpKj2, cmpKj, cmpKj);
	C6_sub_CMP(cmpKj2Ky2, cmpKj2, cmpKy2);
	fAux = 0.5;
	C6_pow_CMP(cmp_sKj2Ky2, cmpKj2Ky2, fAux);
	if (cmp_sKj2Ky2[2] > 0.0)
	{
		cmp_sKj2Ky2[1] = -cmp_sKj2Ky2[1];
		cmp_sKj2Ky2[2] = -cmp_sKj2Ky2[2];
	}

	/* Calculate SH integrals */
	cmpIm3_RE = acmp3Im3_SH[1][iX][iMode];
	cmpIm3_IM = acmp3Im3_SH[2][iX][iMode];
	cmpIm2_RE = acmp3Im2_SH[1][iX][iMode];
	cmpIm2_IM = acmp3Im2_SH[2][iX][iMode];
	cmpIm1_RE = acmp3Im1_SH[1][iX][iMode];
	cmpIm1_IM = acmp3Im1_SH[2][iX][iMode];
	cmpI0_RE = acmp3I0_SH[1][iX][iMode];
	cmpI0_IM = acmp3I0_SH[2][iX][iMode];
	cmpI1_RE = acmp3I1_SH[1][iX][iMode];
	cmpI1_IM = acmp3I1_SH[2][iX][iMode];
	cmpI2_RE = acmp3I2_SH[1][iX][iMode];
	cmpI2_IM = acmp3I2_SH[2][iX][iMode];
	cmpI3_RE = acmp3I3_SH[1][iX][iMode];
	cmpI3_IM = acmp3I3_SH[2][iX][iMode];

	fX = ad1X[iX - 1];
	if (fX > 0.0) kSign = 1;
	else if (fX == 0.0) kSign = 0;
	else if (fX < 0.0) kSign = -1;
	fX = fabs(fX);

	cmpAux1[1] = fX * cmp_sKj2Ky2[2];
	cmpAux1[2] = -fX * cmp_sKj2Ky2[1];
	C6_exp_CMP(cmp_Exp_sKj2Ky2_x, cmpAux1);

	cmpAux1[1] = -fabs(fX * fKy);
	cmpAux1[2] = 0.0;
	C6_exp_CMP(cmp_Exp_Ky_x, cmpAux1);

	/* Im3,1 */
	cmpAux3[1] = fX * fX / 2.0;
	cmpAux3[2] = 0.0;
	C6_div_CMP(cmpAux1, cmpOne, cmpKj2Ky2);
	cmpAux3[1] -= cmpAux1[1];
	cmpAux3[2] -= cmpAux1[2];
	C6_div_CMP(cmpAux1, cmp_Exp_sKj2Ky2_x, cmpKj2Ky2);
	cmpAux3[1] += cmpAux1[1];
	cmpAux3[2] += cmpAux1[2];
	cmpAux1[1] = 2.0 * cmpKj2Ky2[2];
	cmpAux1[2] = -2.0 * cmpKj2Ky2[1];
	C6_div_CMP(cmpAux2, cmpAux3, cmpAux1);
	cmpAux3[1] = -kSign * cmpAux2[1];
	cmpAux3[2] = -kSign * cmpAux2[2];
	cmpIm3_RE[1] = cmpAux3[1];
	cmpIm3_IM[1] = cmpAux3[2];

	/* Im3,2 */
	cmpAux1[1] = fX;
	cmpAux1[2] = 0.0;
	C6_div_CMP(cmpAux2, cmpAux1, cmpKj2Ky2);
	cmpAux1[1] = fabs(fKy);
	cmpAux1[2] = 0.0;
	C6_mult_CMP(cmpAux3, cmpAux1, cmpKj2);
	C6_div_CMP(cmpAux1, cmp_Exp_Ky_x, cmpAux3);
	cmpAux2[1] += cmpAux1[1];
	cmpAux2[2] += cmpAux1[2];
	cmpAux1[1] = 0.0;
	cmpAux1[2] = -fKy * fKy;
	C6_mult_CMP(cmpAux3, cmpAux1, cmp_Exp_sKj2Ky2_x);
	C6_div_CMP(cmpAux1, cmpAux3, cmpKj2);
	C6_div_CMP(cmpAux3, cmpAux1, cmpKj2Ky2);
	C6_div_CMP(cmpAux1, cmpAux3, cmp_sKj2Ky2);
	cmpAux2[1] += cmpAux1[1];
	cmpAux2[2] += cmpAux1[2];
	C6_div_CMP(cmpAux3, cmpAux2, cmpKy);
	cmpAux3[1] /= 2.0;
	cmpAux3[2] /= 2.0;
	cmpIm3_RE[2] = cmpAux3[1];
	cmpIm3_IM[2] = cmpAux3[2];
	
	/* Im3,3 */
	C6_div_CMP(cmpAux1, cmpOne, cmpKj2Ky2);
	cmpAux1[1] = -cmpAux1[1];
	cmpAux1[2] = -cmpAux1[2];
	C6_div_CMP(cmpAux2, cmp_Exp_Ky_x, cmpKj2);
	cmpAux1[1] += cmpAux2[1];
	cmpAux1[2] += cmpAux2[2];
	C6_mult_CMP(cmpAux2, cmpKy2, cmp_Exp_sKj2Ky2_x);
	C6_div_CMP(cmpAux3, cmpAux2, cmpKj2);
	C6_div_CMP(cmpAux2, cmpAux3, cmpKj2Ky2);
	cmpAux1[1] += cmpAux2[1];
	cmpAux1[2] += cmpAux2[2];
	cmpAux2[1] = 0.0;
	cmpAux2[2] = 2.0 * fKy * fKy;
	C6_div_CMP(cmpAux3, cmpAux1, cmpAux2);
	cmpIm3_RE[3] = kSign * cmpAux3[1];
	cmpIm3_IM[3] = kSign * cmpAux3[2];

	/* Im3,4 */
	cmpAux1[1] = -fX * fX / 2.0;
	cmpAux1[2] = 0.0;
	C6_div_CMP(cmpAux2, cmpAux1, cmpKj2Ky2);
	cmpAux1[1] = 2.0 * cmpKy2[1] - cmpKj2[1];
	cmpAux1[2] = 2.0 * cmpKy2[2] - cmpKj2[2];
	C6_div_CMP(cmpAux3, cmpAux1, cmpKy2);
	C6_div_CMP(cmpAux1, cmpAux3, cmpKj2Ky2);
	C6_div_CMP(cmpAux3, cmpAux1, cmpKj2Ky2);
	cmpAux2[1] += cmpAux3[1];
	cmpAux2[2] += cmpAux3[2];
	C6_div_CMP(cmpAux1, cmp_Exp_Ky_x, cmpKy2);
	C6_div_CMP(cmpAux3, cmpAux1, cmpKj2);
	cmpAux2[1] += cmpAux3[1];
	cmpAux2[2] += cmpAux3[2];
	C6_mult_CMP(cmpAux1, cmpKy2, cmp_Exp_sKj2Ky2_x);
	C6_div_CMP(cmpAux3, cmpAux1, cmpKj2);
	C6_div_CMP(cmpAux1, cmpAux3, cmpKj2Ky2);
	C6_div_CMP(cmpAux3, cmpAux1, cmpKj2Ky2);
	cmpAux2[1] -= cmpAux3[1];
	cmpAux2[2] -= cmpAux3[2];
	C6_div_CMP(cmpAux3, cmpAux2, cmpIm);
	cmpAux3[1] *= -kSign / 2.0;
	cmpAux3[2] *= -kSign / 2.0;
	cmpIm3_RE[4] = cmpAux3[1];
	cmpIm3_IM[4] = cmpAux3[2];

	if (fKy == 0)
	{
		cmpIm3_RE[2] = 0.0;
		cmpIm3_IM[2] = 0.0;
		cmpIm3_RE[4] = 0.0;
		cmpIm3_IM[4] = 0.0;
		cmpIm3_RE[3] = cmpIm3_RE[1];
		cmpIm3_IM[3] = cmpIm3_IM[1];
	} /* if */

	/* Im2,1 */
	cmpAux1[1] = cmpIm3_RE[5];
	cmpAux1[2] = cmpIm3_IM[5];
	C6_mult_CMP(cmpAux2, cmpAux1, cmpKj);
	cmpIm2_RE[1] = cmpAux2[1];
	cmpIm2_IM[1] = cmpAux2[2];

	/* Im2,2 */
	cmpIm2_RE[2] = cmpIm3_RE[3] *fKy;
	cmpIm2_IM[2] = cmpIm3_IM[3] *fKy;

	/* Im2,3 */
	cmpAux1[1] = cmp_Exp_Ky_x[1] / fabs(fKy);
	cmpAux1[2] = cmp_Exp_Ky_x[2] / fabs(fKy);
	C6_div_CMP(cmpAux2, cmp_Exp_sKj2Ky2_x, cmp_sKj2Ky2);
	cmpAux1[1] -= cmpAux2[2];
	cmpAux1[2] += cmpAux2[1];
	C6_div_CMP(cmpAux2, cmpAux1, cmpKj2);
	cmpIm2_RE[3] = -cmpAux2[1] / 2.0;
	cmpIm2_IM[3] = -cmpAux2[2] / 2.0;

	/* Im2,4 */
	cmpIm2_RE[4] = cmpIm3_RE[2] *fKy;
	cmpIm2_IM[4] = cmpIm3_IM[2] *fKy;

	if (fKy == 0)
	{
		cmpIm2_RE[2] = 0.0;
		cmpIm2_IM[2] = 0.0;
		cmpIm2_RE[4] = 0.0;
		cmpIm2_IM[4] = 0.0;
		cmpIm2_RE[3] = cmpIm2_RE[1];
		cmpIm2_IM[3] = cmpIm2_IM[1];
	} /* if */

	/* Im1,2 */
	cmpIm1_RE[2] = cmpIm2_RE[3] *fKy;
	cmpIm1_IM[2] = cmpIm2_IM[3] *fKy;

	/* Im1,3 */
	C6_sub_CMP(cmpAux1, cmp_Exp_Ky_x, cmp_Exp_sKj2Ky2_x);
	C6_div_CMP(cmpAux2, cmpAux1, cmpKj2);
	C6_div_CMP(cmpAux1, cmpAux2, cmpIm);
	cmpIm1_RE[3] = -kSign * cmpAux1[1] / 2.0;
	cmpIm1_IM[3] = -kSign * cmpAux1[2] / 2.0;

	/* Im1,4 */
	cmpIm1_RE[4] = cmpIm2_RE[2] *fKy;
	cmpIm1_IM[4] = cmpIm2_IM[2] *fKy;

	/* I0,2 */
	cmpI0_RE[2] = cmpIm1_RE[3] * fKy;
	cmpI0_IM[2] = cmpIm1_IM[3] * fKy;

	/* I0,3 */
	C6_mult_CMP(cmpAux1, cmp_sKj2Ky2, cmp_Exp_sKj2Ky2_x);
	cmpAux2[1] = cmp_Exp_Ky_x[1] * fabs(fKy);
	cmpAux2[2] = cmp_Exp_Ky_x[2] * fabs(fKy);
	cmpAux1[1] -= cmpAux2[2];
	cmpAux1[2] += cmpAux2[1];
	C6_div_CMP(cmpAux2, cmpAux1, cmpKj2);
	C6_div_CMP(cmpAux1, cmpAux2, cmpIm);
	cmpI0_RE[3] = cmpAux1[1] / 2.0;
	cmpI0_IM[3] = cmpAux1[2] / 2.0;

	/* I0,4 */
	cmpI0_RE[4] = cmpIm1_RE[2] * fKy;
	cmpI0_IM[4] = cmpIm1_IM[2] * fKy;

	/* I1,2 */
	cmpI1_RE[2] = cmpI0_RE[3] * fKy;
	cmpI1_IM[2] = cmpI0_IM[3] * fKy;

	/* I1,3 */
	C6_mult_CMP(cmpAux1, cmpKj2Ky2, cmp_Exp_sKj2Ky2_x);
	C6_mult_CMP(cmpAux2, cmpKy2, cmp_Exp_Ky_x);
	cmpAux1[1] += cmpAux2[1];
	cmpAux1[2] += cmpAux2[2];
	C6_div_CMP(cmpAux2, cmpAux1, cmpKj2);
	C6_div_CMP(cmpAux1, cmpAux2, cmpIm);
	cmpI1_RE[3] = kSign * cmpAux1[1] / 2.0;
	cmpI1_IM[3] = kSign * cmpAux1[2] / 2.0;

	/* I1,4 */
	cmpI1_RE[4] = cmpI0_RE[2] * fKy;
	cmpI1_IM[4] = cmpI0_IM[2] * fKy;

	/* I2,2 */
	cmpI2_RE[2] = cmpI1_RE[3] * fKy;
	cmpI2_IM[2] = cmpI1_IM[3] * fKy;

	/* I2,3 */
	C6_mult_CMP(cmpAux1, cmp_sKj2Ky2, cmpKj2Ky2);
	C6_mult_CMP(cmpAux2, cmpAux1, cmp_Exp_sKj2Ky2_x);
	C6_mult_CMP(cmpAux1, cmp_Exp_Ky_x, cmpKy2);
	cmpAux1[1] *= fabs(fKy);
	cmpAux1[2] *= fabs(fKy);
	cmpAux2[1] += cmpAux1[2];
	cmpAux2[2] -= cmpAux1[1];
	C6_div_CMP(cmpAux1, cmpAux2, cmpKj2);
	C6_div_CMP(cmpAux2, cmpAux1, cmpIm);
	cmpI2_RE[3] = cmpAux2[1] / 2.0;
	cmpI2_IM[3] = cmpAux2[2] / 2.0;

	/* I2,4 */
	cmpI2_RE[4] = cmpI1_RE[2] * fKy;
	cmpI2_IM[4] = cmpI1_IM[2] * fKy;

	/* I3,2 */
	cmpI3_RE[2] = cmpI2_RE[3] * fKy;
	cmpI3_IM[2] = cmpI2_IM[3] * fKy;

	/* I3,3 */
	C6_mult_CMP(cmpAux1, cmpKj2Ky2, cmpKj2Ky2);
	C6_mult_CMP(cmpAux2, cmpAux1, cmp_Exp_sKj2Ky2_x);
	C6_mult_CMP(cmpAux1, cmp_Exp_Ky_x, cmpKy4);
	cmpAux2[1] -= cmpAux1[1];
	cmpAux2[2] -= cmpAux1[2];
	C6_div_CMP(cmpAux1, cmpAux2, cmpKj2);
	C6_div_CMP(cmpAux2, cmpAux1, cmpIm);
	cmpI3_RE[3] = kSign * cmpAux2[1] / 2.0;
	cmpI3_IM[3] = kSign * cmpAux2[2] / 2.0;

	/* I3,4 */
	cmpI3_RE[4] = cmpI2_RE[2] * fKy;
	cmpI3_IM[4] = cmpI2_IM[2] * fKy;
} /* CalcTLM_25D_Integrals_GPU_SH */


void C6_add_CMP
(
	type* cmp_c,
	type* cmp_a,
	type* cmp_b
)
{
	cmp_c[1] = cmp_a[1] + cmp_b[1];
	cmp_c[2] = cmp_a[2] + cmp_b[2];

} /* C6_add_CMP */

void C6_sub_CMP
(
	type* cmp_c,
	type* cmp_a,
	type* cmp_b
)
{
	cmp_c[1] = cmp_a[1] - cmp_b[1];
	cmp_c[2] = cmp_a[2] - cmp_b[2];

} /* C6_sub_CMP */

void C6_mult_CMP
(
	type* cmp_c,
	type* cmp_a,
	type* cmp_b
)
{
	cmp_c[1] = cmp_a[1] * cmp_b[1] - cmp_a[2] * cmp_b[2];
	cmp_c[2] = cmp_a[2] * cmp_b[1] + cmp_a[1] * cmp_b[2];

} /* C6_div_CMP */

void C6_div_CMP
(
	type* cmp_c,
	type* cmp_a,
	type* cmp_b
)
{
	type fNorm;

	fNorm = cmp_b[1] * cmp_b[1] + cmp_b[2] * cmp_b[2];

	cmp_c[1] = (cmp_a[1] * cmp_b[1] + cmp_a[2] * cmp_b[2]) / fNorm;
	cmp_c[2] = (cmp_a[2] * cmp_b[1] - cmp_a[1] * cmp_b[2]) / fNorm;

} /* C6_div_CMP */

void C6_exp_CMP
(
	type* cmp_c,
	type* cmp_a
)
{
	type fNorm;
	type fCos;
	type fSin;

	fCos = cos(cmp_a[2]);
	fSin = sin(cmp_a[2]);

	fNorm = exp(cmp_a[1]);
	cmp_c[1] = fNorm * fCos;
	cmp_c[2] = fNorm * fSin;

} /* C6_exp_CMP */

void C6_pow_CMP
(
	type* cmp_c,
	type* cmp_a,
	type fb
)
{
	type fNorm;
	type fAngle;
	type fCos;
	type fSin;

	fNorm = sqrt(cmp_a[1] * cmp_a[1] + cmp_a[2] * cmp_a[2]);
	fAngle = atan2(cmp_a[2], cmp_a[1]);
	fNorm = pow(fNorm, fb);
	fAngle = fAngle * fb;
	fCos = cos(fAngle);
	fSin = sin(fAngle);

	cmp_c[1] = fNorm * fCos;
	cmp_c[2] = fNorm * fSin;

} /* C6_pow_CMP */
/* end_of_code */

Hi all,

Still concerning my question, is it possible that there is a maximum size for the code above which things go wrong, even if the code is correct?
I’ve been adding and removing parts of the code and I realized that in some cases things go as expected, while in other cases don’t.

For example, I divided the code into parts A, B and C, each part independent of the others. If then I include only parts A and B, or B and C, or A and C, everything works fine. If on the other hand I include everything, i.e., A, B and C, then the program returns 1.#IND00e+000.

Does anyone have an idea of what may be going on? If so, can anyone give me a suggestion on how to overcome this problem!
Any suggestion is welcome.

Thank you!