LLVM compiler has failed to compile a function

Hi, I try to porting function on the OpenCL kernel, but compiler gave me next error:
cvmsErrorCompilerFailure: LLVM compiler has failed to compile a function.
Here is my kernel code:


//#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable

//##############################################################################
// Utils function
//##############################################################################
void mmemcpy(unsigned char *dst, const unsigned char *src, size_t len) {
    for (size_t i = 0; i < len; i++)
        dst[i] = src[i];
}


//##############################################################################
// Tree hashing constants definition
//##############################################################################
/* tweak word T[1]: bit field starting positions */
#define SKEIN_T1_BIT(BIT)       ((BIT) - 64)            /* offset 64 because it's the second word  */

#define SKEIN_T1_POS_TREE_LVL   SKEIN_T1_BIT(112)       /* bits 112..118: level in hash tree       */
#define SKEIN_T1_POS_BIT_PAD    SKEIN_T1_BIT(119)       /* bit  119     : partial final input byte */
#define SKEIN_T1_POS_BLK_TYPE   SKEIN_T1_BIT(120)       /* bits 120..125: type field               */
#define SKEIN_T1_POS_FIRST      SKEIN_T1_BIT(126)       /* bits 126     : first block flag         */
#define SKEIN_T1_POS_FINAL      SKEIN_T1_BIT(127)       /* bit  127     : final block flag         */

/* tweak word T[1]: flag bit definition(s) */
#define SKEIN_T1_FLAG_FIRST     (((unsigned long)  1 ) << SKEIN_T1_POS_FIRST)
#define SKEIN_T1_FLAG_FINAL     (((unsigned long)  1 ) << SKEIN_T1_POS_FINAL)
#define SKEIN_T1_FLAG_BIT_PAD   (((unsigned long)  1 ) << SKEIN_T1_POS_BIT_PAD)

//##############################################################################
// Skein macros
//##############################################################################
#ifndef RotL_64
#define RotL_64(x, N)    (((x) << (N)) | ((x) >> (64-(N))))
#endif

//##############################################################################
// Skein block
//##############################################################################
#ifndef SKEIN_USE_ASM
#define SKEIN_USE_ASM   (0)                     /* default is all C code (no ASM) */
#endif

#ifndef SKEIN_LOOP
#define SKEIN_LOOP 001                          /* default: unroll 256 and 512, but not 1024 */
#endif

#define BLK_BITS        (WCNT*64)               /* some useful definitions for code here */
#define KW_TWK_BASE     (0)
#define KW_KEY_BASE     (3)
#define ks              (kw + KW_KEY_BASE)                
#define ts              (kw + KW_TWK_BASE)

//##############################################################################
// Port settings
//##############################################################################
/* Platform settings */
#define IS_BIG_ENDIAN      4321 /* byte 0 is most significant (mc68k) */
#define IS_LITTLE_ENDIAN   1234 /* byte 0 is least significant (i386) */

/* Define platfor here */
#define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN

#if PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN
/* here for big-endian */
#define SKEIN_NEED_SWAP (1)
#elif PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
/* here for little-endian */
#define SKEIN_NEED_SWAP (0)
#if PLATFORM_MUST_ALIGN == 0              /* ok to use "fast" versions? */
#define Skein_Put64_LSB_First(dst08, src64, bCnt) mmemcpy(dst08, src64, bCnt)
#define Skein_Get64_LSB_First(dst64, src08, wCnt) mmemcpy(dst64, src08, 8*(wCnt))
#endif
#endif

//##############################################################################
// Skein.h
//##############################################################################
enum {   
    /* Skein_256 round rotation constants */
    R_256_0_0=14, R_256_0_1=16,
    R_256_1_0=52, R_256_1_1=57,
    R_256_2_0=23, R_256_2_1=40,
    R_256_3_0= 5, R_256_3_1=37,
    R_256_4_0=25, R_256_4_1=33,
    R_256_5_0=46, R_256_5_1=12,
    R_256_6_0=58, R_256_6_1=22,
    R_256_7_0=32, R_256_7_1=32,
    
    /* Skein_512 round rotation constants */
    R_512_0_0=46, R_512_0_1=36, R_512_0_2=19, R_512_0_3=37,
    R_512_1_0=33, R_512_1_1=27, R_512_1_2=14, R_512_1_3=42,
    R_512_2_0=17, R_512_2_1=49, R_512_2_2=36, R_512_2_3=39,
    R_512_3_0=44, R_512_3_1= 9, R_512_3_2=54, R_512_3_3=56,
    R_512_4_0=39, R_512_4_1=30, R_512_4_2=34, R_512_4_3=24,
    R_512_5_0=13, R_512_5_1=50, R_512_5_2=10, R_512_5_3=17,
    R_512_6_0=25, R_512_6_1=29, R_512_6_2=39, R_512_6_3=43,
    R_512_7_0= 8, R_512_7_1=35, R_512_7_2=56, R_512_7_3=22,
    
    /* Skein1024 round rotation constants */
    R1024_0_0=24, R1024_0_1=13, R1024_0_2= 8, R1024_0_3=47, R1024_0_4= 8, R1024_0_5=17, R1024_0_6=22, R1024_0_7=37,
    R1024_1_0=38, R1024_1_1=19, R1024_1_2=10, R1024_1_3=55, R1024_1_4=49, R1024_1_5=18, R1024_1_6=23, R1024_1_7=52,
    R1024_2_0=33, R1024_2_1= 4, R1024_2_2=51, R1024_2_3=13, R1024_2_4=34, R1024_2_5=41, R1024_2_6=59, R1024_2_7=17,
    R1024_3_0= 5, R1024_3_1=20, R1024_3_2=48, R1024_3_3=41, R1024_3_4=47, R1024_3_5=28, R1024_3_6=16, R1024_3_7=25,
    R1024_4_0=41, R1024_4_1= 9, R1024_4_2=37, R1024_4_3=31, R1024_4_4=12, R1024_4_5=47, R1024_4_6=44, R1024_4_7=30,
    R1024_5_0=16, R1024_5_1=34, R1024_5_2=56, R1024_5_3=51, R1024_5_4= 4, R1024_5_5=53, R1024_5_6=42, R1024_5_7=41,
    R1024_6_0=31, R1024_6_1=44, R1024_6_2=47, R1024_6_3=46, R1024_6_4=19, R1024_6_5=42, R1024_6_6=44, R1024_6_7=25,
    R1024_7_0= 9, R1024_7_1=48, R1024_7_2=35, R1024_7_3=52, R1024_7_4=23, R1024_7_5=31, R1024_7_6=37, R1024_7_7=20
};

#ifndef SKEIN_ID_STRING_LE      /* allow compile-time personalization */
#define SKEIN_ID_STRING_LE      (0x33414853)            /* "SHA3" (little-endian)*/
#endif

#define SKEIN_MK_64(hi32,lo32)  ((lo32) + (((unsigned long) (hi32)) << 32))
#define SKEIN_SCHEMA_VER        SKEIN_MK_64(SKEIN_VERSION,SKEIN_ID_STRING_LE)
#define SKEIN_KS_PARITY         SKEIN_MK_64(0x1BD11BDA,0xA9FC1A22)

//##############################################################################
// Skein structures
//##############################################################################
enum {
    SKEIN_SUCCESS         =      0,          /* return codes from Skein calls */
    SKEIN_FAIL            =      1,
    SKEIN_BAD_HASHLEN     =      2
};

#define  SKEIN_MODIFIER_WORDS  ( 2)          /* number of modifier (tweak) words */

#define  SKEIN_256_STATE_WORDS ( 4)
#define  SKEIN_512_STATE_WORDS ( 8)
#define  SKEIN1024_STATE_WORDS (16)
#define  SKEIN_MAX_STATE_WORDS (16)

#define  SKEIN_256_STATE_BYTES ( 8*SKEIN_256_STATE_WORDS)
#define  SKEIN_512_STATE_BYTES ( 8*SKEIN_512_STATE_WORDS)
#define  SKEIN1024_STATE_BYTES ( 8*SKEIN1024_STATE_WORDS)

#define  SKEIN_256_STATE_BITS  (64*SKEIN_256_STATE_WORDS)
#define  SKEIN_512_STATE_BITS  (64*SKEIN_512_STATE_WORDS)
#define  SKEIN1024_STATE_BITS  (64*SKEIN1024_STATE_WORDS)

#define  SKEIN_256_BLOCK_BYTES ( 8*SKEIN_256_STATE_WORDS)
#define  SKEIN_512_BLOCK_BYTES ( 8*SKEIN_512_STATE_WORDS)
#define  SKEIN1024_BLOCK_BYTES ( 8*SKEIN1024_STATE_WORDS)

#define SKEIN_256_ROUNDS_TOTAL (72)          /* number of rounds for the different block sizes */
#define SKEIN_512_ROUNDS_TOTAL (72)
#define SKEIN1024_ROUNDS_TOTAL (80)

typedef struct {
    unsigned long  hashBitLen;                             /* size of hash result, in bits */
    unsigned long  bCnt;                                   /* current byte count in buffer b[] */
    unsigned long  T[SKEIN_MODIFIER_WORDS];         /* tweak words: T[0]=byte cnt, T[1]=flags */
} Skein_Ctxt_Hdr_t;

typedef struct {                                    /*  256-bit Skein hash context structure */
    Skein_Ctxt_Hdr_t h;                             /* common header context variables */
    unsigned long  X[SKEIN_256_STATE_WORDS];        /* chaining variables */
    unsigned char  b[SKEIN_256_BLOCK_BYTES];        /* partial block buffer (8-byte aligned) */
} Skein_256_Ctxt_t;

//##############################################################################
// 256-bit Skein 
//##############################################################################
/*****************************  Skein_256 ******************************/
void Skein_256_Process_Block(Skein_256_Ctxt_t *ctx, const unsigned *blkPtr, unsigned long blkCnt, unsigned long byteCntAdd) { /* do it in C */
    
    enum {
        WCNT = SKEIN_256_STATE_WORDS
    };
#undef  RCNT
#define RCNT (SKEIN_256_ROUNDS_TOTAL/8)
    
#ifdef  SKEIN_LOOP                              /* configure how much to unroll the loop */
#define SKEIN_UNROLL_256 (((SKEIN_LOOP)/100)%10)
#else
#define SKEIN_UNROLL_256 (0)
#endif
    
#if SKEIN_UNROLL_256
#if (RCNT % SKEIN_UNROLL_256)
#error "Invalid SKEIN_UNROLL_256"               /* sanity check on unroll count */
#endif
    unsigned long  r;
    unsigned long  kw[WCNT+4+RCNT*2];           /* key schedule words : chaining vars + tweak + "rotation"*/
#else
    unsigned long  kw[WCNT+4];                         /* key schedule words : chaining vars + tweak */
#endif
    unsigned long  X0, X1, X2, X3;                        /* local copy of context vars, for speed */
    unsigned long  w[WCNT];                           /* local copy of input block */
    
   /* never call with blkCnt == 0! */
    if (!(blkCnt != 0))
        return;
    
    ts[0] = ctx->h.T[0];
    ts[1] = ctx->h.T[1];
    do  {
        /* this implementation only supports 2**64 input bytes (no carry out here) */
        ts[0] += byteCntAdd;                    /* update processed length */
        
        /* precompute the key schedule for this block */
        ks[0] = ctx->X[0];     
        ks[1] = ctx->X[1];
        ks[2] = ctx->X[2];
        ks[3] = ctx->X[3];
        ks[4] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^ SKEIN_KS_PARITY;
        
        ts[2] = ts[0] ^ ts[1];
        
        Skein_Get64_LSB_First(w, blkPtr, WCNT);   /* get input block in little-endian format */
        
        X0 = w[0] + ks[0];                      /* do the first full key injection */
        X1 = w[1] + ks[1] + ts[0];
        X2 = w[2] + ks[2] + ts[1];
        X3 = w[3] + ks[3];
        
        blkPtr += SKEIN_256_BLOCK_BYTES;

        /* run the rounds */
        
// IN THIS MACRO WHEN I EXCHANGED X##p1 and X##p in RotL_64 for numbers 1, 2, THEN COMPILATION IS OK
#define Round256(p0, p1, p2, p3, ROT, rNum)                     \
X##p0 += X##p1; X##p1 = RotL_64(X##p1, ROT##_0); X##p1 ^= X##p0; \
X##p2 += X##p3; X##p3 = RotL_64(X##p3, ROT##_1); X##p3 ^= X##p2; \

#if SKEIN_UNROLL_256 == 0                       
#define R256(p0, p1, p2, p3, ROT, rNum)  /* fully unrolled */   \
Round256(p0, p1, p2, p3, ROT, rNum)
        
#define I256(R)                                                 \
X0   += ks[((R)+1) % 5];    /* inject the key schedule value */ \
X1   += ks[((R)+2) % 5] + ts[((R)+1) % 3];                      \
X2   += ks[((R)+3) % 5] + ts[((R)+2) % 3];                      \
X3   += ks[((R)+4) % 5] +     (R)+1;
#else                                       /* looping version */
#define R256(p0, p1, p2, p3, ROT, rNum)                         \
Round256(p0, p1, p2, p3, ROT, rNum)                             \
        
#define I256(R)                                                 \
X0   += ks[r+(R)+0];        /* inject the key schedule value */ \
X1   += ks[r+(R)+1] + ts[r+(R)+0];                              \
X2   += ks[r+(R)+2] + ts[r+(R)+1];                              \
X3   += ks[r+(R)+3] +    r+(R)   ;                              \
ks[r + (R)+4    ]   = ks[r+(R)-1];     /* rotate key schedule */\
ts[r + (R)+2    ]   = ts[r+(R)-1];                              
        
        for (r = 1; r < 2*RCNT; r += 2*SKEIN_UNROLL_256)  /* loop thru it */
#endif  
        {    
#define R256_8_rounds(R)          \
R256(0,1,2,3,R_256_0,8*(R) + 1);  \
R256(0,3,2,1,R_256_1,8*(R) + 2);  \
R256(0,1,2,3,R_256_2,8*(R) + 3);  \
R256(0,3,2,1,R_256_3,8*(R) + 4);  \
I256(2*(R));                      \
R256(0,1,2,3,R_256_4,8*(R) + 5);  \
R256(0,3,2,1,R_256_5,8*(R) + 6);  \
R256(0,1,2,3,R_256_6,8*(R) + 7);  \
R256(0,3,2,1,R_256_7,8*(R) + 8);  \
I256(2*(R)+1);
            
            R256_8_rounds( 0);
            
#define R256_Unroll_R(NN) ((SKEIN_UNROLL_256 == 0 && SKEIN_256_ROUNDS_TOTAL/8 > (NN)) || (SKEIN_UNROLL_256 > (NN)))
            
#if   R256_Unroll_R( 1)
            R256_8_rounds( 1);
#endif
#if   R256_Unroll_R( 2)
            R256_8_rounds( 2);
#endif
#if   R256_Unroll_R( 3)
            R256_8_rounds( 3);
#endif
#if   R256_Unroll_R( 4)
            R256_8_rounds( 4);
#endif
#if   R256_Unroll_R( 5)
            R256_8_rounds( 5);
#endif
#if   R256_Unroll_R( 6)
            R256_8_rounds( 6);
#endif
#if   R256_Unroll_R( 7)
            R256_8_rounds( 7);
#endif
#if   R256_Unroll_R( 8)
            R256_8_rounds( 8);
#endif
#if   R256_Unroll_R( 9)
            R256_8_rounds( 9);
#endif
#if   R256_Unroll_R(10)
            R256_8_rounds(10);
#endif
#if   R256_Unroll_R(11)
            R256_8_rounds(11);
#endif
#if   R256_Unroll_R(12)
            R256_8_rounds(12);
#endif
#if   R256_Unroll_R(13)
            R256_8_rounds(13);
#endif
#if   R256_Unroll_R(14)
            R256_8_rounds(14);
#endif
#if  (SKEIN_UNROLL_256 > 14)
#error  "need more unrolling in Skein_256_Process_Block"
#endif
        }

        // WHEN I COMMENT NEXT 4 LINES, THEN COMPILATION IS OK
        ctx->X[0] = X0 ^ w[0]; 
        ctx->X[1] = X1 ^ w[1];
        ctx->X[2] = X2 ^ w[2];
        ctx->X[3] = X3 ^ w[3];
        
        ts[1] &= ~SKEIN_T1_FLAG_FIRST;
    }
    while (--blkCnt);
    ctx->h.T[0] = ts[0];
    ctx->h.T[1] = ts[1];
}

__kernel void update( __global Skein_256_Ctxt_t *gctx) {
    int glId = get_global_id(0);
    int lId = get_local_id(0);
    int grId = get_group_id(0);
    
    // TODO now is one hashState per block, try one hs per thread to accelerate computing
    gctx += grId;
    
    Skein_256_Ctxt_t ctx;
    ctx.h.hashBitLen = 256;
    Skein_256_Process_Block(&ctx, &ctx.b, 1, SKEIN_256_BLOCK_BYTES);
    *gctx = ctx;
}

When I comment marked line (I wrote this line in capital letters) or change the values (not both) the compilation is ok. Can anyone help me?

These overly generic compilation errors usually mean that the compiler crashed due to an internal bug. The fact that commenting some apparently valid code makes the error go away supports this hypothesis.

Try contacting the vendor’s developer support and show them the same code you showed us. They are more likely to look at the bug if you can reduce the number of lines of code.

Can anyone compile this code on another platform or different graphic card. I’m using MAC OS 10.6.7, graphic card NVIDIA GeForce 9400M. Its would be helpful for me.