#include <stdint.h>
#include <string.h>
#include <emmintrin.h>

/* Byte type used for both the pattern and the text. */
#define CHARTYPE unsigned char
/* Capacity, in bytes, of the `pattern` buffer below. */
#define MAXPAT 51000
/* Number of entries in table B: one per possible 16-bit result of
 * _mm_movemask_epi8, which is what B is indexed with. */
#define LIM 65536
/* Upper bound on the number of q-gram groups tracked per table entry;
 * matches the 64 bits of a `bv` word. */
#define W 64
/* Length in bytes of one q-gram, i.e. of one 16-byte vector load. */
#define Q 16
/* Left-shift count applied to every loaded vector before its byte mask is
 * taken. NOTE(review): with a per-64-bit-lane shift this places bit 1 of
 * each byte in the sign-bit position that movemask samples. */
#define BS 6

/* Short aliases for the SSE2 intrinsics and vector type used in this file. */
#define movemask_epi8 _mm_movemask_epi8
#define loadu_si _mm_loadu_si128
#define mi __m128i

/* 64-bit bit-vector word. */
typedef uint64_t bv;

/* pre: the first 8 bytes of the current pattern, used by exec as a cheap
 *      pre-check before the full comparison.
 * B:   filter table built by prep and read by exec, indexed by the 16-bit
 *      fingerprint of a q-gram. */
bv pre, B[LIM];
/* Private copy of the current pattern, filled in by prep. */
CHARTYPE pattern[MAXPAT];
/* patlen: length of the current pattern.
 * a, b:   step sizes computed by prep and used by exec (a = backward step
 *         inside a candidate window, b = forward step between windows). */
int patlen, a, b;
/* Scratch vector holding the most recently loaded 16 bytes. */
mi y_ptr;

void prep(const CHARTYPE *pat, register int m) {
    int i, j, r, x, y;

    memcpy(pattern, pat, m);
    patlen = m;
    pre = *((uint64_t*) (pat));

    for (i=0; i<LIM; i++) B[i]=0;

    x=0; if (m<=4096) x=1;
    a=(m-Q+1-x)/W+x; 
    r=(m-Q+1)/a; 
    if (r>W) r=W;
    y=W-r;
    b=r*a;

    i=m-Q;
    while (i>m-b-Q) {
       for (j=1; j<=a; j++) {
          y_ptr = loadu_si(( mi *)(pattern+i));
          B[movemask_epi8(y_ptr<<BS)] |= (bv)1 << y;
          i--;  
       }
       y++;
    }
}

/*
 * exec - count the occurrences of the pattern set up by prep() in a text.
 *
 *   t  text bytes; only t[0 .. n-1] is read, and the text is not modified
 *   n  text length in bytes
 *
 * Returns the number of start positions at which the whole pattern occurs
 * inside t[0 .. n-1]. Returns 0 if no usable pattern has been prepared or
 * if the text is shorter than the pattern.
 *
 * The text is examined through 16-byte q-grams ending at position i. The
 * fingerprint of each q-gram (bit 1 of every byte) selects an entry of B
 * whose set bits name the pattern groups containing such a q-gram. While
 * the running mask d stays non-zero the window steps back by `a` and the
 * mask is shifted and intersected with the next entry. Only when the mask
 * survives until its top bit is shifted out (i ends up back at the window's
 * starting position) are the `a` possible alignments verified byte by byte.
 */
int exec(CHARTYPE *t, register int n) {
    const int q = (int)sizeof(mi);      /* bytes per q-gram (16) */
    bv d, head;
    int i, j, k, start, m = patlen, count = 0;

    /* Nothing prepared, degenerate step sizes (would never advance), or
     * text too short to contain the pattern. */
    if (m < q || a <= 0 || b <= 0 || n < m) {
        return 0;
    }

    i = m - 1;
    while (i < n) {
        y_ptr = loadu_si((const mi *)(t + i - (q - 1)));
        d = B[movemask_epi8(_mm_slli_epi64(y_ptr, BS))];
        if (d == 0) {
            /* This q-gram occurs nowhere in the covered pattern part. */
            i += b;
            continue;
        }

        j = i;
        do {
            i -= a;
            d <<= 1;
            if (d == 0) {
                /* The mask ran out by shifting alone; the next table
                 * lookup could not change that, so skip the load. This
                 * also keeps the load from starting before t[0]. */
                break;
            }
            y_ptr = loadu_si((const mi *)(t + i - (q - 1)));
            d &= B[movemask_epi8(_mm_slli_epi64(y_ptr, BS))];
        } while (d);
        i += b;

        if (i == j) {
            /* Full chain matched: the pattern may end at i .. i+a-1. */
            for (k = 0; k < a; k++) {
                start = i - m + 1 + k;
                if (start + m > n) {
                    break;      /* would extend past the end of the text */
                }
                /* Cheap 8-byte pre-check; memcpy because t + start has no
                 * alignment guarantee. */
                memcpy(&head, t + start, sizeof head);
                if (head == pre && memcmp(t + start, pattern, (size_t)m) == 0) {
                    count++;
                }
            }
            i += a;
        }
    }
    return count;
}

