#include <limits.h>
#include <stdint.h>
#include <string.h>

#include <emmintrin.h>

/* Byte type used for both the pattern and the searched text. */
#define CHARTYPE unsigned char
/* Capacity, in bytes, of the internal `pattern` copy made by prep(). */
#define MAXPAT 51000
/* Number of entries in the fingerprint tables B/low/high.  Fingerprints are
 * produced by _mm_movemask_epi8, which yields a 16-bit mask, so 65536 entries
 * cover every possible index. */
#define LIM 65536
/* Divisor used by prep() to derive the stride `a`, and upper bound on the
 * number of strides that make up the shift `b`. */
#define W 64
/* Width, in bytes, of one fingerprint window (one 128-bit load). */
#define Q 16
/* Left-shift count applied to each loaded vector before the movemask. */
#define BS 6

/* Short aliases for the SSE2 intrinsics and vector type used below. */
#define movemask_epi8 _mm_movemask_epi8
#define loadu_si _mm_loadu_si128
#define mi __m128i

/* 64-bit word type; used for `pre` and for the table lookups in exec(). */
typedef uint64_t bv;

/* First 8 bytes of the pattern; exec() compares it against a candidate
 * position before running the full memcmp. */
bv pre;
/* B[f] is 1 when fingerprint f was produced by one of the pattern windows
 * visited in prep(), 0 otherwise. */
uint8_t B[LIM];
/* For fingerprint f, low[f]..high[f] is the inclusive range of offsets that
 * exec() verifies.  prep() initialises them to a+1 / a, i.e. an empty range. */
int low[LIM], high[LIM];
/* Private copy of the pattern passed to prep(). */
CHARTYPE pattern[MAXPAT];
/* patlen: pattern length stored by prep().
 * a: stride between successive probes in exec(), computed in prep().
 * b: a multiple of `a` computed in prep(); exec() advances by it when a
 *    fingerprint is not present in B. */
int patlen, a, b;
/* Scratch vector holding the most recent 16-byte load. */
mi y_ptr;

void prep(const CHARTYPE *pat, register int m) {
    int i, j, r, x, ch;

    memcpy(pattern, pat, m);
    patlen = m;
    pre = *((uint64_t*) (pat));

    x=0; if (m<=4096) x=1;
    a=(m-Q+1-x)/W+x; 
    r=(m-Q+1)/a; 
    if (r>W) r=W;
    b=r*a;

    for (i=0; i<LIM; i++) {B[i]=0; low[i]=a+1; high[i]=a;}

    i=m-Q;
    while (i>m-b-Q) {
       y_ptr = loadu_si(( mi *)(pattern+i));
       ch = movemask_epi8(y_ptr<<BS);
       B[ch] = 1;
       if (i>m-Q-a) {
          j=a-(i-(m-Q-a));
          if (low[ch]==a+1) {low[ch]=j; high[ch]=j;}
          if (j>high[ch]) high[ch]=j;
       }
       i--;  
    }
}

/*
 * Counts the occurrences of the pattern prepared by prep() in t[0..n-1].
 *
 * The text is scanned with a window whose last byte is at index i.  The
 * fingerprint of the Q bytes ending at i is looked up in B:
 *   - not present: the window advances by b;
 *   - present: further windows ending at i-a, i-2a, ... (strictly above i-b)
 *     are probed.  The first miss at position p moves the window to p+b.
 *     If every probe hits, each offset k in low[s]..high[s] is verified as
 *     an occurrence ending at i+k, and the window advances by a.
 *
 * t: text buffer.  t[n] is overwritten with 0, as the original
 *    implementation did, so the buffer must hold at least n+1 bytes.
 * n: number of text bytes.
 *
 * Returns the number of occurrences lying entirely inside t[0..n-1].
 * Returns 0 when the stored pattern length is below Q, when the strides
 * are not positive, or when the text is shorter than the pattern.
 */
int exec(CHARTYPE *t, register int n) {
    const int m = patlen;
    int i, j, k, s, p, last, count = 0;
    uint64_t head;

    t[n] = 0;

    /* m < Q would make the first load start before t; non-positive strides
     * would keep the scan from making progress. */
    if (m < Q || a < 1 || b < 1 || n < m) {
        return 0;
    }

    i = m - 1;
    while (i < n) {
        y_ptr = loadu_si((const mi *)(t + i - (Q - 1)));
        s = movemask_epi8(_mm_slli_epi64(y_ptr, BS));
        if (!B[s]) {
            i += b;
            continue;
        }

        /* Probe only positions strictly above j-b.  The original loop also
         * loaded the window ending at j-b, ignored the result, and could
         * read one byte before t while doing so. */
        j = i;
        p = j - a;
        while (p > j - b) {
            y_ptr = loadu_si((const mi *)(t + p - (Q - 1)));
            if (!B[movemask_epi8(_mm_slli_epi64(y_ptr, BS))]) {
                break;
            }
            p -= a;
        }
        if (p > j - b) {
            i = p + b;             /* a probe missed: shift past it */
            continue;
        }

        /* Only verify candidates that end inside the text. */
        last = high[s];
        if (last > n - 1 - j) {
            last = n - 1 - j;
        }
        for (k = low[s]; k <= last; k++) {
            const CHARTYPE *cand = t + j - m + 1 + k;

            /* Cheap 8-byte prefilter before the full comparison. */
            memcpy(&head, cand, sizeof head);
            if (head == pre && memcmp(cand, pattern, (size_t)m) == 0) {
                count++;
            }
        }
        i = j + a;
    }
    return count;
}

