#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include <xmmintrin.h>
#include <immintrin.h>
#include <x86intrin.h>

#include <ctype.h>
#include <getopt.h>
#include <stdlib.h>
#include <sys/times.h>
#include <unistd.h>

#include <limits.h>
/* Number of distinct unsigned char values; row count of the `mask` table. */
#define ASIZE (UCHAR_MAX + 1)
/* Element count of the `pi` and `pj` arrays. */
#define PSIZE 4100

/*
 * Vector geometry.  ALPHA is the number of bytes in one 256-bit vector and
 * LOG_ALPHA is log2(ALPHA); `x << LOG_ALPHA` turns a row number into a byte
 * offset inside `mask`.
 */
#define ALPHA 32
#define LOG_ALPHA 5
/* Short aliases for the AVX2 (256-bit) integer intrinsics and vector type. */
#define movemask_epi8 _mm256_movemask_epi8
#define cmpeq_epi8 _mm256_cmpeq_epi8
#define cmpgt_epi8 _mm256_cmpgt_epi8
#define and_si _mm256_and_si256
#define loadu_si _mm256_loadu_si256
#define mi __m256i

/* NOTE: MIN and MAX evaluate one of their arguments twice; do not pass
   expressions with side effects. */
#define MIN(a,b) ((a) < (b) ? (a) : (b))
#define MAX(a,b) ((a) > (b) ? (a) : (b))
#define FALSE      0
#define TRUE       1
/* General-purpose constants; none of XSIZE, WSIZE, SIGMA, UNDEFINED,
   HALFDEFINED or WORD is referenced in the visible part of this file. */
#define XSIZE    66000          //maximal length of the pattern
#define WSIZE    256            //greater int value fitting in a computer word
#define SIGMA       256         //constant alphabet size
#define UNDEFINED       -1
#define HALFDEFINED     -2
#define WORD	    32          //computer word size (in bit)
/* Records one occurrence by incrementing a variable named `count` that must
   exist in the expanding scope; the position argument j is ignored. */
#define OUTPUT(j)   count++

/* Phase markers; all four expand to nothing here. */
#define   BEGIN_PREPROCESSING
#define   END_PREPROCESSING
#define   BEGIN_SEARCHING
#define   END_SEARCHING

/* Character type of pattern and text; may be overridden by defining
   CHARTYPE before this point (e.g. on the compiler command line). */
#ifndef CHARTYPE
#define CHARTYPE        unsigned char
#endif
/* Capacity, in characters, of the pattern buffer `pat.pat`. */
#define MAXPAT  66000

/*
 * The current search pattern, shared between prep() (which fills it) and
 * exec() (which reads it).
 */
static struct
{
  int patlen;                   /* pattern length, as stored by prep() */
  CHARTYPE pat[MAXPAT];         /* copy of the pattern characters */
  int lastchar;                 /* not read or written in the visible code */
} pat;

/* mask: ASIZE rows of ALPHA bytes; prep() fills row j with the byte value j,
   so a row can be loaded as a vector that broadcasts j. */
static unsigned char mask[ASIZE * ALPHA];
/* pz is cleared by prep() and loaded by exec() as the all-zero comparison
   vector.  pi and pj are not referenced in the visible code. */
static unsigned int pi[PSIZE], pj[PSIZE], pz[ALPHA];
/* t:  symbol -> 4-bit nucleotide-set code, indexed by an unsigned char value.
       It needs one entry for every possible byte (ASIZE of them): prep()
       clears indices 0..255 and looks up arbitrary pattern bytes.
   pp: per-position codes of the current pattern, written by prep(). */
int t[ASIZE], pp[4100];


/* Maps a comparison step number to the pattern position examined at that
   step.  It is the identity here, so exec() checks positions in order. */
#define PI(a) (a)

/*
 * prep - preprocessing phase: remember the pattern and build the tables
 * used by exec().
 *
 *   P  pattern characters (may be NULL, which is treated as an empty pattern)
 *   m  number of characters in P (values <= 0 are treated as 0)
 *
 * Effects on file-level state:
 *   pat.pat / pat.patlen  copy of the pattern and its length.  The copy is
 *                         limited to the capacity of pat.pat.
 *   pz                    cleared to zero.
 *   mask                  row j (ALPHA bytes) is filled with the byte value j.
 *   t                     symbol -> 4-bit set code.  The bit assignments
 *                         (a=1, c=2, g=4, t=8 and their unions for
 *                         r, y, s, w, k, m, b, d, h, v, n) match the IUPAC
 *                         nucleotide ambiguity codes; every other symbol
 *                         maps to 0.  Only lowercase letters are mapped.
 *   pp                    pp[j] = t[pattern[j]] for each stored pattern
 *                         position that fits in pp.  Positions beyond the
 *                         capacity of pp are not encoded.
 *
 * Every write is bounded by the size of its destination array, taken from
 * the array itself with sizeof.
 */
void
prep (const CHARTYPE * P, register int m)
{
  static const struct
  {
    unsigned char symbol;
    int bits;
  } codes[] = {
    {'a', 1}, {'c', 2}, {'g', 4}, {'t', 8},
    {'r', 5}, {'y', 10}, {'s', 6}, {'w', 9},
    {'k', 12}, {'m', 3}, {'b', 14}, {'d', 13},
    {'h', 11}, {'v', 7}, {'n', 15}
  };
  const size_t pat_cap = sizeof pat.pat / sizeof pat.pat[0];
  const size_t code_cap = sizeof pp / sizeof pp[0];
  const size_t table_len = sizeof t / sizeof t[0];
  size_t len = (P != NULL && m > 0) ? (size_t) m : 0;
  size_t j;

  /* Never copy more than the pattern buffer can hold. */
  if (len > pat_cap)
    len = pat_cap;
  if (len > 0)
    memcpy (pat.pat, P, len);
  pat.patlen = (int) len;

  memset (pz, 0, sizeof pz);

  /* Row j of mask broadcasts the value j across one vector. */
  for (j = 0; j < ASIZE; j++)
    memset (mask + j * ALPHA, (int) j, ALPHA);

  /* Rebuild the symbol table; symbols that do not fit in t are skipped. */
  memset (t, 0, sizeof t);
  for (j = 0; j < sizeof codes / sizeof codes[0]; j++)
    if (codes[j].symbol < table_len)
      t[codes[j].symbol] = codes[j].bits;

  /* Encode the pattern.  A byte value outside t is treated as an unmapped
     symbol (code 0) instead of indexing past the end of the table. */
  for (j = 0; j < len && j < code_cap; j++)
    {
      const size_t symbol = (unsigned char) pat.pat[j];

      pp[j] = symbol < table_len ? t[symbol] : 0;
    }
}

int
exec (CHARTYPE * y, register int n)
{
  __m256i mask1 = _mm256_set_epi8 (0, 15, 3, 0, 12, 0, 0, 11,
                                   4, 0, 0, 13, 2, 14, 1, 0,
                                   0, 15, 3, 0, 12, 0, 0, 11,
                                   4, 0, 0, 13, 2, 14, 1, 0);
  __m256i mask2 = _mm256_set_epi8 (0, 0, 0, 0, 0, 0, 10, 0,
                                   9, 7, 0, 8, 6, 5, 0, 0,
                                   0, 0, 0, 0, 0, 0, 10, 0,
                                   9, 7, 0, 8, 6, 5, 0, 0);
  __m256i test = _mm256_set_epi8 ('o', 'o', 'o', 'o', 'o', 'o', 'o', 'o',
                                  'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o',
                                  'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o',
                                  'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o');

  static const unsigned char *x;
  x = pat.pat;
  int m = pat.patlen;

  int i, j, id, s, count = 0;
  uint32_t k;
  mi x_ptr, y_ptr, z_ptr, ap, bp;
  static const unsigned char *yy;
  yy = y;


  z_ptr = loadu_si ((mi *) (pz));

  for (j = 0; j < n; j += ALPHA)
    {
      id = PI (0);
      ap = loadu_si ((mi *) (yy + id));
      bp = _mm256_blendv_epi8 (_mm256_shuffle_epi8 (mask1, ap),
                               _mm256_shuffle_epi8 (mask2, ap),
                               _mm256_cmpgt_epi8 (ap, test));
      x_ptr = loadu_si ((mi *) (mask + ((pp[id]) << LOG_ALPHA)));
      k = movemask_epi8 (cmpgt_epi8 (and_si (bp, x_ptr), z_ptr));

      id = PI (1);
      ap = loadu_si ((mi *) (yy + id));
      bp = _mm256_blendv_epi8 (_mm256_shuffle_epi8 (mask1, ap),
                               _mm256_shuffle_epi8 (mask2, ap),
                               _mm256_cmpgt_epi8 (ap, test));
      x_ptr = loadu_si ((mi *) (mask + ((pp[id]) << LOG_ALPHA)));
      k &= movemask_epi8 (cmpgt_epi8 (and_si (bp, x_ptr), z_ptr));

      id = PI (2);
      ap = loadu_si ((mi *) (yy + id));
      bp = _mm256_blendv_epi8 (_mm256_shuffle_epi8 (mask1, ap),
                               _mm256_shuffle_epi8 (mask2, ap),
                               _mm256_cmpgt_epi8 (ap, test));
      x_ptr = loadu_si ((mi *) (mask + ((pp[id]) << LOG_ALPHA)));
      k &= movemask_epi8 (cmpgt_epi8 (and_si (bp, x_ptr), z_ptr));

      id = PI (3);
      ap = loadu_si ((mi *) (yy + id));
      bp = _mm256_blendv_epi8 (_mm256_shuffle_epi8 (mask1, ap),
                               _mm256_shuffle_epi8 (mask2, ap),
                               _mm256_cmpgt_epi8 (ap, test));
      x_ptr = loadu_si ((mi *) (mask + ((pp[id]) << LOG_ALPHA)));
      k &= movemask_epi8 (cmpgt_epi8 (and_si (bp, x_ptr), z_ptr));

      id = PI (4);
      ap = loadu_si ((mi *) (yy + id));
      bp = _mm256_blendv_epi8 (_mm256_shuffle_epi8 (mask1, ap),
                               _mm256_shuffle_epi8 (mask2, ap),
                               _mm256_cmpgt_epi8 (ap, test));
      x_ptr = loadu_si ((mi *) (mask + ((pp[id]) << LOG_ALPHA)));
      k &= movemask_epi8 (cmpgt_epi8 (and_si (bp, x_ptr), z_ptr));

      if (k == 0)
        goto out;

      for (i = 5; i < m; i++)
        {
          id = PI (i);
          ap = loadu_si ((mi *) (yy + id));
          bp = _mm256_blendv_epi8 (_mm256_shuffle_epi8 (mask1, ap),
                                   _mm256_shuffle_epi8 (mask2, ap),
                                   _mm256_cmpgt_epi8 (ap, test));
          x_ptr = loadu_si ((mi *) (mask + ((pp[id]) << LOG_ALPHA)));
          k &= movemask_epi8 (cmpgt_epi8 (and_si (bp, x_ptr), z_ptr));

          if (k == 0)
            goto out;
        }
      count += _mm_popcnt_u32 (k);

    out:yy += ALPHA;
    }

  return count;
}
