#ifndef MYBITSTRING_HEADER_
#define MYBITSTRING_HEADER_

/*! @file
 * @brief
 * Bit-array class (dense representation) whose bit operations are
 * implemented with i386 inline assembler.
 */


#include <new>
using std::nothrow;

#include "utils.H"

/*!
 * @brief Growable array of bits, stored densely in an array of unsigned int words.
 *
 * The member functions address bit position pos through word pos>>5, i.e. they
 * treat every word as holding 32 bits.  Writing a bit beyond the current end
 * enlarges the word array.
 */
class myBitString
{
protected:
  int size;            //!< number of words currently addressed through data
  unsigned int *data;  //!< heap array of words holding the bits (NULL after default construction)
  /*!
   * Enlarge the word array to @p newsize words.
   *
   * The existing words are kept and the added words are zero-filled.
   * A request smaller than the current size is ignored (this function
   * never shrinks).  If the allocation fails, a message is written to
   * cerr and the program terminates via exit(1).
   */
  inline void resize(int newsize)
    {
      if (newsize < size) return; // shrinking is not done here

      unsigned int * const previous = data;
      data = new(nothrow) unsigned int[newsize];
      if (!data)
	{
	  cerr << "myBitString.resize(" << newsize << ") out of memory" << endl;
	  exit(1);
	}

      int w = 0;
      while (w < size) { data[w] = previous[w]; ++w; }  // carry over the old contents
      while (w < newsize) data[w++] = 0;                // clear the newly added words
      size = newsize;
      delete [] previous; // deleting a null pointer is a no-op
    }
public:
  /*!
   * Strip trailing all-zero words.
   *
   * If the array currently has fewer than 128 words, only the size member is
   * reduced and the heap block is left untouched.  Otherwise the block is
   * released (when every word is zero) or replaced by a smaller one.
   * If the smaller allocation fails, a message is written to cerr and the
   * program terminates via exit(1).
   */
  inline void optisize(void)
    {
      // locate the highest non-zero word; top becomes -1 if there is none
      int top = size-1;
      for (; top >= 0; --top)
	if (data[top] != 0) break;

      const int used = top+1; // number of words that have to be kept
      if (used == size) return; // nothing to do...

      if (size<128)
       {
         // to avoid heap pollution for small sizes, simply decrease
         // the size without reallocating any heap memory
#ifdef VERBOSE
         cout << "MyBitString::Optisize, no reallocate " << size << " to " << used << endl;
#endif
         size = used;
         return;
       }

#ifdef VERBOSE
      cout << "MyBitString::Optisize, reallocate" << size << " to " << used << endl;
#endif
      if (used == 0)
       {
         // now empty
         delete [] data;
         data = NULL;
         size = 0;
         return;
       }

      unsigned int * const previous = data;
      size = used; // new size
      data = new(nothrow) unsigned int[size];
      if (!data)
	{
	  cerr << "myBitString.optisize(" << size << ") out of memory" << endl;
	  exit(1);
	}
      for (int w = 0; w < used; ++w) data[w] = previous[w];
      delete [] previous; // deleting a null pointer is a no-op
    }

  //! Creates an empty bit string: zero words and no allocated storage.
  inline myBitString(void)
    : size(0), data(NULL)
    {
    }
  //! Releases the word array and leaves the members in the empty state.
  inline ~myBitString(void)
    {
      delete [] data; // deleting a null pointer is a no-op
      data = NULL;
      size = 0;
    }
  /*!
   * Copy constructor: allocates a separate array of rhs.size words and
   * copies the words of @p rhs into it.
   *
   * If the allocation fails, a message is written to cerr and the program
   * terminates via exit(1).
   */
  inline myBitString(const myBitString &rhs) : size(rhs.size), data(NULL)
   {
     data = new(nothrow) unsigned int[size];
     if (!data)
       {
         // name the copy constructor (not operator=) as the failing call
         cerr << "myBitString.myBitString(" << size << ") out of memory" << endl;
         exit(1);
       }
     for (int i=0; i<size; ++i) data[i] = rhs.data[i];
   }
  /*!
   * Copy assignment: replaces the contents by a copy of the words of @p rhs.
   *
   * Assigning an object to itself is a no-op.  The new array is allocated
   * and filled before the old one is released, so @p rhs is never read
   * after its storage could have been freed.
   * If the allocation fails, a message is written to cerr and the program
   * terminates via exit(1).
   *
   * @return reference to this object
   */
  inline myBitString& operator= (const myBitString &rhs)
   {
     // self-assignment: releasing data first would destroy the source as well
     if (this == &rhs) return *this;

     unsigned int * const fresh = new(nothrow) unsigned int[rhs.size];
     if (!fresh)
       {
         cerr << "myBitString.operator=(" << rhs.size << ") out of memory" << endl;
         exit(1);
       }
     for (int i=0; i<rhs.size; ++i) fresh[i] = rhs.data[i];

     delete [] data; // deleting a null pointer is a no-op
     data = fresh;
     size = rhs.size;
     return *this;
   }
  /*!
   * Count the set bits.
   *
   * @param b  must be true; counting cleared bits is not supported
   * @return number of set bits in all words, or -1 if @p b is false
   */
  inline int count(const bool b = true) const
    {
      if (!b) return -1;

      int ones = 0;
      for (int w = 0; w < size; ++w)
	for (unsigned int rest = data[w]; rest != 0; rest >>= 1)
	  ones += static_cast<int>(rest & 1u); // add the lowest bit, then shift it out
      return ones;
    }
  /*!
   * Position of the lowest set bit.
   *
   * @param b  must be true; for false a message is written to cerr and the
   *           program terminates via exit(1)
   * @return bit position, or -1 if no bit is set
   */
  inline int first(const bool b = true) const
    {
      if (!b)
	{
	  cerr << "myBitString.first(0) not implemented" << endl;
	  exit(1);
	}

      for (int w = 0; w < size; ++w)
	{
	  unsigned int word = data[w];
	  if (word == 0) continue;
	  int bit;
	  // bsfl (bit scan forward) yields the index of the lowest set bit of word
	  asm("bsfl %0,%1" : "+g" (word), "=c" (bit) );
	  return 32*w+bit;
	}
      return -1;
    }
  /*!
   * Position of the lowest set bit strictly above @p pos.
   *
   * @param pos  reference position; a negative value delegates to first(b)
   * @param b    must be true; for false (and pos >= 0) a message is written
   *             to cerr and the program terminates via exit(1)
   * @return bit position, or -1 if there is no set bit above @p pos
   */
  inline int next(int pos, const bool b = true) const
    {
      if (pos<0) return first(b);
      ++pos; // the search starts one position above the given one

      if (!b)
	{
	  cerr << "myBitString.next(0) not implemented" << endl;
	  exit(1);
	}

      int w = pos>>5;
      if (w >= size) return -1;

      int bit;
      // remaining bits of the word that contains pos
      unsigned int word = data[w] >> (pos&31);
      if (word)
	{
	  asm("bsfl %0,%1" : "+g" (word), "=c" (bit) );
	  return pos+bit;
	}

      // all following words
      while (++w < size)
	{
	  word = data[w];
	  if (word)
	    {
	      asm("bsfl %0,%1" : "+g" (word), "=c" (bit) );
	      return 32*w+bit;
	    }
	}
      return -1;
    }
  /*!
   * Position of the highest set bit.
   *
   * @param b  must be true; for false a message is written to cerr and the
   *           program terminates via exit(1)
   * @return bit position, or -1 if no bit is set
   */
  inline int last(const bool b = true) const
    {
      if (!b)
	{
	  cerr << "myBitString.last(0) not implemented" << endl;
	  exit(1);
	}

      int w = size;
      while (--w >= 0)
	{
	  unsigned int word = data[w];
	  if (word == 0) continue;
	  int bit;
	  // bsrl (bit scan reverse) yields the index of the highest set bit of word
	  asm("bsrl %0,%1" : "+g" (word), "=c" (bit) );
	  return 32*w+bit;
	}
      return -1;
    }
  /*!
   * Position of the highest set bit strictly below @p pos.
   *
   * @param pos  reference position; values above 32*size delegate to last(b),
   *             values <= 0 yield -1
   * @param b    must be true; for false (and 0 < pos <= 32*size) a message is
   *             written to cerr and the program terminates via exit(1)
   * @return bit position, or -1 if there is no set bit below @p pos
   */
  inline int prev(int pos, const bool b = true) const
    {
      if (pos>32*size) return last(b);
      if (pos<=0) return -1;
      --pos; // the search starts one position below the given one

      if (!b)
	{
	  cerr << "myBitString.prev(0) not implemented" << endl;
	  exit(1);
	}

      int w = pos>>5;
      const int shift = 31-(pos&31);
      int bit;
      // move bit pos to the top of the word so that all higher bits drop out
      unsigned int word = data[w] << shift;
      if (word)
	{
	  asm("bsrl %0,%1" : "+g" (word), "=c" (bit) );
	  return pos-(31-bit);
	}

      // all preceding words
      while (--w >= 0)
	{
	  word = data[w];
	  if (word)
	    {
	      asm("bsrl %0,%1" : "+g" (word), "=c" (bit) );
	      return 32*w+bit;
	    }
	}
      return -1;
    }
  /*!
   * Toggle the bit at position @p pos, enlarging the array if the position
   * lies beyond the current end.
   *
   * @param pos  bit position (assumed to be non-negative — TODO confirm with callers)
   */
  inline void invert(const int pos)
    {
      //cerr << "invert("<<pos<<")"<<endl;
      int i=pos>>5; // 32 bits = 2^5
      if (i>=size) resize(i+1);
      // bit-test and complement: with a memory operand the instruction selects
      // the word data[pos/32] by itself.  The "memory" clobber tells the compiler
      // that the asm writes to data[], so cached values of the array are not
      // reused across this statement.
      asm volatile ("btcl %1,(%0)" : : "q" (data), "r" (pos) : "memory", "cc");
    }
  /*!
   * Toggle the bit at position @p pos and report its previous value.
   * The array is enlarged if the position lies beyond the current end.
   *
   * @param pos  bit position (assumed to be non-negative — TODO confirm with callers)
   * @return value of the bit before it was toggled
   */
  inline bool test_and_invert(const int pos)
    {
      //cerr << "invert("<<pos<<")"<<endl;
      int i=pos>>5; // 32 bits = 2^5
      if (i>=size) resize(i+1);
      bool ret;
      // bit-test and complement; setc stores the old bit value (carry flag).
      //  - volatile: the toggle is a side effect and must not be removed or
      //    merged with another call when the result is unused or repeated
      //  - "memory": data[pos/32] is modified behind the compiler's back
      //  - "=qm": setc needs a byte register or a memory destination
      asm volatile ("btcl %[pos],(%[data])\n\tsetc %[flag]" : [flag] "=qm" (ret) : [data] "q" (data), [pos] "r" (pos): "memory", "cc");
      return ret;
    }
  /*!
   * Read the bit at position @p pos.
   *
   * @param pos  bit position (assumed to be non-negative — TODO confirm with callers)
   * @return value of the bit; false for positions beyond the current end
   */
  inline bool test(const int pos) const
    {
      if ((pos>>5)>=size) return false;
      bool r;
      // bit-test and r=CarryFlag.
      // The asm reads data[pos/32], which is not listed as an operand; volatile
      // and the "memory" clobber keep the compiler from reusing or moving the
      // result across writes to the array.  "=q" gives setc a byte register.
      asm volatile ("btl %2,(%1)\n\tsetc %0" : "=q" (r) : "q" (data), "r" (pos) : "memory", "cc");
      return r;
    }
  /*!
   * Set the bit at position @p pos to the value @p b.
   *
   * Setting a bit beyond the current end enlarges the array; clearing such a
   * bit does nothing, because test() already reports false for it.
   *
   * @param pos  bit position (assumed to be non-negative — TODO confirm with callers)
   * @param b    new value of the bit
   */
  inline void set(const int pos, const bool b)
    {
      //cerr << "set("<<pos<< "," << b << ")" << endl;
      const int i=pos>>5;
      if (i>=size)
	{
	  if (!b) return;
	  resize(i+1);
	}
      // The instructions modify data[pos/32]; the "memory" clobber informs the
      // compiler about this write.
      if (b)
	asm volatile ("btsl %1,(%0)" : : "q" (data), "r" (pos) : "memory", "cc"); //data[i] |= bitmask[j];
      else
	asm volatile ("btrl %1,(%0)" : : "q" (data), "r" (pos) : "memory", "cc"); //data[i] &= ~bitmask[j];
    }
  /*!
   * Set the bit at position @p pos to 1, enlarging the array if the position
   * lies beyond the current end.
   *
   * @param pos  bit position (assumed to be non-negative — TODO confirm with callers)
   */
  inline void set(const int pos)
    {
      const int i=pos>>5;
      if (i>=size) resize(i+1);
      // bit-test and set; data[pos/32] is modified, which the "memory" clobber
      // makes known to the compiler.
      asm volatile ("btsl %1,(%0)" : : "q" (data), "r" (pos) : "memory", "cc"); //data[i] |= bitmask[j];
    }
  /*!
   * XOR the words of @p s into this bit string.
   *
   * Trailing zero words of @p s are skipped.  If the remaining words exceed
   * the current size, a notice is written to cerr and the array is enlarged.
   */
  inline void _xor(const myBitString &s)
    {
      // ignore the trailing zero words of the second bit string
      int top = s.size-1;
      for (; top >= 0; --top)
	if (s.data[top] != 0) break;

      const int words = top+1; // words of s that actually contribute
      if (words > size)
	{
	  cerr << "myBitString.xor calls resize" << endl;
	  resize(words);
	}
      for (int w = 0; w < words; ++w)
	data[w] ^= s.data[w];
    }
  //! XORs @p s into this bit string (see _xor) and returns this object.
  inline const myBitString& operator ^= (const myBitString &s)
    {
      this->_xor(s);
      return *this;
    }

  /*!
   * Overwrite this bit string with the component-wise AND of @p s1 and @p s2.
   *
   * Only the first MIN(s1.size,s2.size) words can be non-zero in the result.
   * If the current array is too small, it is replaced by a new one of exactly
   * that many words; otherwise the surplus words are cleared.
   * This object may be identical to @p s1 or @p s2.
   */
  inline void _and(const myBitString &s1, const myBitString &s2)
    {
      const int words = MIN(s1.size,s2.size);
      if (words > size)
	{
	  // this == &s1 or this == &s2 would imply words <= size, so neither
	  // operand can share storage with the array that is replaced here.
	  // Allocate before releasing: if new throws, data and size are still
	  // valid and the destructor does not free the old array twice.
	  // The new words need no initialization; the AND below writes all of them.
	  unsigned int * const fresh = new unsigned int[words];
	  delete [] data; // deleting a null pointer is a no-op
	  data = fresh;
	  size = words;
	}
      else
	{
	  // clear the surplus words of the result
	  for (int j = words; j < size; ++j) data[j] = 0;
	}
      for (int i = words-1; i >= 0; --i)
	data[i] = s1.data[i] & s2.data[i];
    }

  /*!
   * Generic version, but still fast (used whenever none of the non-template
   * overloads below matches the counter type).
   *
   * Forms the component-wise AND of this bit string and @p s2 without storing
   * it, and adds each resulting bit to the entry of @p CarryVec that has the
   * same position.  Only the first MIN(size,s2.size) words are processed, and
   * bytes of the AND result that are zero are skipped.
   *
   * @param s2        second operand of the AND
   * @param CarryVec  counter container indexed by bit position via operator[]
   */
  template<typename T> void test_and_add_carry(const myBitString &s2, T &CarryVec) const
    {
      const int words = MIN(size,s2.size);
      const int Bits = 8*sizeof(unsigned int);
      for (int w = 0; w < words; ++w)
       {
         unsigned int rest = data[w] & s2.data[w];
         // walk through the word byte by byte until no set bits remain
         for (int base = Bits*w; rest != 0; rest >>= 8, base += 8)
          {
            if ((rest & 0xff) == 0) continue; // no common bits in this byte
            for (int k = 0; k < 8; ++k)
              CarryVec[base+k] += (rest>>k)&1;
          }
       }
    }

#if 1 && defined(ASM_SSE2)
 #if defined(VERBOSE)
  #warning "support for test_and_add_carry(...), (uint16, using SSE2)"
 #endif
  /*!
   * Overload for unsigned short counters, implemented with SSE2 inline assembly.
   *
   * Only the first MIN(size,s2.size) words are processed, from the highest
   * word index down to 0.  Each word updates a group of 32 consecutive
   * counters (64 bytes) of @p CarryVec.
   *
   * @param s2        second operand of the AND
   * @param CarryVec  counter array; must be 16-byte aligned (see note below)
   */
  void test_and_add_carry(const myBitString &s2, unsigned short int CarryVec[]) const
  // temporary (component-wise) AND of this bit string with s2;
  // the entries of CarryVec at the positions of the set bits
  // are incremented.
    {
      //cout << "specialized variant uint16 (using SSE2)" << endl;
      static const unsigned short int PackedMultipl[8] __attribute__ ((aligned (16))) = { 128,64,32,16,8,4,2,1 };
       // these multipliers are needed to shift the bits of the packed words to the desired position

      int s = MIN(size,s2.size)-1; // doublewords (minus 1) to process
      // int s = MIN(last(),s2.last()); if (s>0) s/=32;

      // important:
      //  (i) we preload -4(%[data1],%[i],4) and -4(%[data2],%[i],4);
      //      fencepost error (reading -4(%[data]), resp. -4(%[data2]) is avoided by using %%eax instead of %[i]
      //  (ii) CarryVec MUST be 16-byte aligned!! (otherwise the program segfaults!)

      // Register usage inside the asm statement: %[i] is the word index counting
      // down, %edx points to the counter group of the current word, and %eax is
      // the index of the word that is preloaded for the next iteration.
      asm volatile ( \
       "test %[i],%[i] \n\t" \
       "js 9f \n\t" \
       "movl $0x00010001,%%eax \n\t" \
       "movl %[i],%%edx \n\t" \
       "movd %%eax,%%xmm7 \n\t" \
       "movdqa %[PM],%%xmm6 \n\t" \
       "movl (%[data1],%[i],4),%%eax \n\t" \
       "pshufd $0x00,%%xmm7,%%xmm7 \n\t" \
       "shll $6,%%edx \n\t" \
       "andl (%[data2],%[i],4),%%eax \n\t" \
       "addl %[V],%%edx \n\t" \
       "movd %%eax,%%xmm4 \n\t" \
       "mov %[i],%%eax \n\t" \
       "1: \n\t" \
       "dec %%eax \n\t" \
       "cmovs %[i],%%eax \n\t" \
       "prefetchnta -72(%[data2],%[i],4) \n\t" \
       "pshuflw $0x00,%%xmm4,%%xmm1 \n\t" \
       "pshuflw $0x55,%%xmm4,%%xmm3 \n\t" \
       "movd (%[data1],%%eax,4),%%xmm5 \n\t" \
       "pshufd $0x00,%%xmm1,%%xmm1 \n\t" \
       "pshufd $0x00,%%xmm3,%%xmm2 \n\t" \
       "pmullw %%xmm6,%%xmm1 \n\t" \
       "movd (%[data2],%%eax,4),%%xmm4 \n\t" \
       "movdqa %%xmm1,%%xmm0 \n\t" \
       "psrlw $15,%%xmm1 \n\t" \
       "pmullw %%xmm6,%%xmm2 \n\t" \
       "psrlw $7,%%xmm0 \n\t" \
       "movdqa %%xmm2,%%xmm3 \n\t" \
       "psrlw $7,%%xmm2 \n\t" \
       "pand %%xmm7,%%xmm0 \n\t" \
       "psrlw $15,%%xmm3 \n\t" \
       "pand %%xmm7,%%xmm2 \n\t" \
       "paddw (%%edx),%%xmm0 \n\t" \
       "paddw 16(%%edx),%%xmm1 \n\t" \
       "pand %%xmm5,%%xmm4 \n\t" \
       "paddw 32(%%edx),%%xmm2 \n\t" \
       "paddw 48(%%edx),%%xmm3 \n\t" \
       "movdqa %%xmm0,(%%edx) \n\t" \
       "movdqa %%xmm1,16(%%edx) \n\t" \
       "movdqa %%xmm2,32(%%edx) \n\t" \
       "movdqa %%xmm3,48(%%edx) \n\t" \
       "3: subl $64,%%edx \n\t" \
       "decl %[i] \n\t" \
       "jns 1b \n\t" \
       "9: " \
       : [i] "+r" (s)
       : [PM] "m" (PackedMultipl[0]), [data1] "r" (data), [data2] "r" (s2.data), [V] "D" (CarryVec)
       : "memory", "cc", "eax", "edx", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
    }
#elif defined(ASM_ATHLON) || (defined (ASM_MMX) && defined(ASM_SSE))
#ifdef DEBUG
 #warning "support for test_and_add_carry(...), (uint16, using MMX/MMX-extensions)"
#endif
  /*!
   * Overload for unsigned short counters, implemented with MMX inline assembly.
   *
   * Only the first MIN(size,s2.size) words are processed, from the highest
   * word index down to 0.  The inner loop (label 2) handles 16 bits of the
   * ANDed word per pass and ends as soon as the remaining bits are zero.
   *
   * @param s2        second operand of the AND
   * @param CarryVec  counter array, indexed by bit position
   */
  void test_and_add_carry(const myBitString &s2, unsigned short int CarryVec[]) const
  // temporary (component-wise) AND of this bit string with s2;
  // the entries of CarryVec at the positions of the set bits
  // are incremented.
    {
      //cout << "specialized variant uint16 (using MMX/3DNow!)" << endl;
      static const unsigned short int PackedMultipl[4] __attribute__ ((aligned (8))) = { 8,4,2,1 };
      int s = MIN(size,s2.size)-1;
      // Register usage inside the asm statement: %[i] is the word index counting
      // down, %eax holds the not yet processed bits of the ANDed word and %edx is
      // the index of the first counter for those bits.
      asm volatile ( \
       "test %[i],%[i] \n\t" \
       "js 9f \n\t" \
       "movl $0x00010001,%%eax \n\t" \
       "movd %%eax,%%mm7 \n\t" \
       "movq %[PM],%%mm6 \n\t" \
       "punpckldq %%mm7,%%mm7 \n\t" \
       "1: movl (%[data1],%[i],4),%%eax \n\t" \
       "movl %[i],%%edx \n\t" \
       "andl (%[data2],%[i],4),%%eax \n\t" \
       "shll $5,%%edx \n\t" \
       "prefetchnta -64(%[data2],%[i],4) \n\t" \
       "2: \n\t" \
       "movd %%eax,%%mm0 \n\t" \
       "pshufw $0x00,%%mm0,%%mm0 \n\t" \
       "pmullw %%mm6,%%mm0 \n\t" \
       "movq %%mm0,%%mm1 \n\t" \
       "movq %%mm0,%%mm2 \n\t" \
       "movq %%mm0,%%mm3 \n\t" \
       "psrlw $3,%%mm0 \n\t" \
       "psrlw $7,%%mm1 \n\t" \
       "psrlw $11,%%mm2 \n\t" \
       "psrlw $15,%%mm3 \n\t" \
       "pand %%mm7,%%mm0 \n\t" \
       "pand %%mm7,%%mm1 \n\t" \
       "pand %%mm7,%%mm2 \n\t" \
       "paddw (%[V],%%edx,2),%%mm0 \n\t" \
       "paddw 8(%[V],%%edx,2),%%mm1 \n\t" \
       "paddw 16(%[V],%%edx,2),%%mm2 \n\t" \
       "paddw 24(%[V],%%edx,2),%%mm3 \n\t" \
       "movq %%mm0,(%[V],%%edx,2) \n\t" \
       "movq %%mm1,8(%[V],%%edx,2) \n\t" \
       "movq %%mm2,16(%[V],%%edx,2) \n\t" \
       "movq %%mm3,24(%[V],%%edx,2) \n\t" \
       "addl $16,%%edx \n\t" \
       "shrl $16,%%eax \n\t" \
       "jnz 2b \n\t" \
       "decl %[i] \n\t" \
       "jns 1b \n\t" \
       "emms \n\t" \
       "9: " \
       : [i] "+r" (s)
       : [PM] "m" (PackedMultipl[0]), [data1] "r" (data), [data2] "r" (s2.data), [V] "D" (CarryVec)
       : "memory", "cc", "eax", "edx", "mm0", "mm1", "mm2", "mm3", "mm6", "mm7");
    }
#else
  /*!
   * Overload for unsigned short counters, implemented with plain i386
   * inline assembly (used when neither SIMD variant above is selected).
   *
   * Only the first MIN(size,s2.size) words are processed, from the highest
   * word index down to 0.  The inner loop (label 2) shifts the ANDed word
   * right bit by bit and adds the shifted-out bit (carry flag) to the
   * corresponding counter; it handles 8 bits per pass and ends as soon as
   * the remaining bits are zero.
   *
   * @param s2        second operand of the AND
   * @param CarryVec  counter array, indexed by bit position
   */
  void test_and_add_carry(const myBitString &s2, unsigned short int CarryVec[]) const
  // temporary (component-wise) AND of this bit string with s2;
  // the entries of CarryVec at the positions of the set bits
  // are incremented.
    {
      //cout << "specialized variant uint16" << endl;
      int s = MIN(size,s2.size)-1;
      // Register usage inside the asm statement: %[i] is the word index counting
      // down, %eax holds the not yet processed bits of the ANDed word and %edx is
      // the index of the first counter for those bits.
      asm volatile ( \
       "test %[i],%[i] \n\t" \
       "js 9f \n\t" \
       "1: movl (%[data1],%[i],4),%%eax \n\t" \
       "movl %[i],%%edx \n\t" \
       "andl (%[data2],%[i],4),%%eax \n\t" \
       "shll $5,%%edx \n\t" \
       "2: shrl %%eax \n\t" \
       "adcw $0,(%[V],%%edx,2) \n\t" \
       "shrl %%eax \n\t" \
       "adcw $0,2(%[V],%%edx,2) \n\t" \
       "shrl %%eax \n\t" \
       "adcw $0,4(%[V],%%edx,2) \n\t" \
       "shrl %%eax \n\t" \
       "adcw $0,6(%[V],%%edx,2) \n\t" \
       "shrl %%eax \n\t" \
       "adcw $0,8(%[V],%%edx,2) \n\t" \
       "shrl %%eax \n\t" \
       "adcw $0,10(%[V],%%edx,2) \n\t" \
       "shrl %%eax \n\t" \
       "adcw $0,12(%[V],%%edx,2) \n\t" \
       "shrl %%eax \n\t" \
       "adcw $0,14(%[V],%%edx,2) \n\t" \
       "addl $8,%%edx \n\t" \
       "test %%eax,%%eax \n\t" \
       "jnz 2b \n\t" \
       "decl %[i] \n\t" \
       "jns 1b \n\t" \
       "9: " \
       : [i] "+r" (s)
       : [data1] "r" (data), [data2] "r" (s2.data), [V] "D" (CarryVec)
       : "memory", "cc", "eax", "edx");
    }
#endif


#if 1
  /*!
   * Overload for unsigned int counters, implemented with plain i386
   * inline assembly.
   *
   * Only the first MIN(size,s2.size) words are processed, from the highest
   * word index down to 0.  The inner loop (label 2) tests the lowest 8 bits
   * of the ANDed word one by one and adds each tested bit (carry flag) to
   * the corresponding counter; it then shifts the word right by 8 and ends
   * as soon as the remaining bits are zero.
   *
   * @param s2        second operand of the AND
   * @param CarryVec  counter array, indexed by bit position
   */
  void test_and_add_carry(const myBitString &s2, unsigned int CarryVec[]) const
  // temporary (component-wise) AND of this bit string with s2;
  // the entries of CarryVec at the positions of the set bits
  // are incremented.
    {
      //cout << "specialized variant uint32" << endl;
      int s = MIN(size,s2.size)-1;
      // Register usage inside the asm statement: %[i] is the word index counting
      // down, %eax holds the not yet processed bits of the ANDed word and %edx is
      // the index of the first counter for those bits.
      asm volatile ( \
       "test %[i],%[i] \n\t" \
       "js 9f \n\t" \
       "1: movl (%[data1],%[i],4),%%eax \n\t" \
       "movl %[i],%%edx \n\t" \
       "andl (%[data2],%[i],4),%%eax \n\t" \
       "shll $5,%%edx \n\t" \
       "2: btl $0,%%eax \n\t" \
       "adcl $0,(%[V],%%edx,4) \n\t" \
       "btl $1,%%eax \n\t" \
       "adcl $0,4(%[V],%%edx,4) \n\t" \
       "btl $2,%%eax \n\t" \
       "adcl $0,8(%[V],%%edx,4) \n\t" \
       "btl $3,%%eax \n\t" \
       "adcl $0,12(%[V],%%edx,4) \n\t" \
       "btl $4,%%eax \n\t" \
       "adcl $0,16(%[V],%%edx,4) \n\t" \
       "btl $5,%%eax \n\t" \
       "adcl $0,20(%[V],%%edx,4) \n\t" \
       "btl $6,%%eax \n\t" \
       "adcl $0,24(%[V],%%edx,4) \n\t" \
       "btl $7,%%eax \n\t" \
       "adcl $0,28(%[V],%%edx,4) \n\t" \
       "addl $8,%%edx \n\t" \
       "shrl $8,%%eax \n\t" \
       "jnz 2b \n\t" \
       "decl %[i] \n\t" \
       "jns 1b \n\t" \
       "9: " \
       : [i] "+r" (s)
       : [data1] "r" (data), [data2] "r" (s2.data), [V] "D" (CarryVec)
       : "memory", "cc", "eax", "edx");
    }
#endif

};

#endif
