/*! @file
 * @brief
 * mpqs sieving stuff; module that is compiled separately
 */

#include "Sieving.H"
#include "qsieve.H"
#include "modulo.H"
using namespace numtheory;

#include "mpqsPolynom.H"
extern CmpqsPolynom Polynom;


int SieveOffset = -LogicalSieveSize;
/* In general the logical sieve interval for each mpqs polynomial will exceed
   the size of the physical sieve. The logical sieve has to be sieved in
   several parts/steps. SieveOffset is the offset of the logical sieve at
   the starting point of a physical sieve.
 */


// "virtual" forward declaration (will be resolved at link time!)
// IMPORTANT: never ever link against the wrong version, this would result
//            in strange behaviour and a glorious mess!!
class StaticRelations
{
 public:
  static void insert(CRelation *GL, const bool do_multi_combine_init=true);
};

TSieveElement SieveArray_[PhysicalSieveSize+64+8] __attribute__ ((aligned (64))); // sieve memory

TSieveElement log_Primzahl_of_PrimePowers[StaticFactorbase::max_additional_Powers];
// this is the place for converted logarithms of further prime-powers

TSieveElement SieveControl::log_PrimeNumbers[StaticFactorbase::MaxSize] __attribute__ ((aligned (16)));
TSieveElement SieveControl::log_PrimeNumbers_dirty[StaticFactorbase::MaxSize] __attribute__ ((aligned (16)));
int SieveControl::myFBStartIndex = SieveControl::FBLowestStartIndex;
int SieveControl::myFBStartIndexHint = SieveControl::FBLowestStartIndex;
signed int SieveControl::DirtinessPegel = 0;

// for efficient access the values (per sieve interval) are placed here.
int CDeltaComputations::Delta_of_PrimeNumbers[StaticFactorbase::MaxSize][2] __attribute__ ((aligned (16)));
int CDeltaComputations::Delta_of_PrimePowers[StaticFactorbase::max_additional_Powers][2] __attribute__ ((aligned (16)));

int CDeltaComputations::D_mod_FBPrime[StaticFactorbase::MaxSize];
signed int CDeltaComputations::D_difference_to_last = 0;
CDeltaComputations::CmpqsDmodPUpdater CDeltaComputations::mpqsDmodPUpdater;

void CDeltaComputations::CmpqsDmodPUpdater::update()
{
  //cout << "-MPQS PolyD update- " << Polynom.get_D() << endl;
  mpz_sub(act_D,Polynom.get_D(),act_D);
  if (mpz_fits_uint_p(act_D))
   {
     const int d = D_difference_to_last = mpz_get_ui(act_D);
#ifdef VERBOSE
     MARK; cout<< "D update difference is " << d << ", #FB=" << StaticFactorbase::Size() << endl;
#endif
     // start with element 1 (instead of element 0, which is expected to be -1)
     int i;
     for (i=2; i+1<StaticFactorbase::Size() && PrimeNumbers[i+1]<46340; i+=2)
      {
        D_mod_FBPrime[i+1]+=d;
        while (D_mod_FBPrime[i+1]>=D_mod_FBPrime[i]) D_mod_FBPrime[i+1]-=D_mod_FBPrime[i];
      }
     for (; i<StaticFactorbase::Size(); ++i)
      {
        D_mod_FBPrime[i]+=d;
        while (D_mod_FBPrime[i]>=PrimeNumbers[i]) D_mod_FBPrime[i]-=PrimeNumbers[i];
      }
     mpz_set(act_D,Polynom.get_D());
   }
  else
   {
#ifdef VERBOSE
     MARK; cout<< "D update difference is " << act_D << ", #FB=" << StaticFactorbase::Size() << endl;
#endif 
     mpz_set(act_D,Polynom.get_D());
     int i;
     for (i=2; i+1<StaticFactorbase::Size() && PrimeNumbers[i+1]<46340; i+=2)
      {
        D_mod_FBPrime[i]=PrimeNumbers[i]*PrimeNumbers[i+1];
        D_mod_FBPrime[i+1]=mpz_remainder_ui(act_D,D_mod_FBPrime[i]);
      }
     for (; i<StaticFactorbase::Size(); ++i) D_mod_FBPrime[i]=mpz_remainder_ui(act_D,PrimeNumbers[i]);
   }

#ifdef DEBUG
  int i;
  for (i=2; i+1<StaticFactorbase::Size() && PrimeNumbers[i+1]<46340; i+=2)
    if (mpz_remainder_ui(Polynom.get_D(),D_mod_FBPrime[i]) != D_mod_FBPrime[i+1])
     {
       MARK; cerr << "sanity check failed for i=" << i << ": "
                  << mpz_remainder_ui(Polynom.get_D(),D_mod_FBPrime[i])
                  << " != " << D_mod_FBPrime[i+1] << endl;
       exit(1);
     }
   for (; i<StaticFactorbase::Size(); ++i)
    if (mpz_remainder_ui(Polynom.get_D(),PrimeNumbers[i]) != D_mod_FBPrime[i])
     {
       MARK; cerr << "sanity check failed for i=" << i << ": "
                  << mpz_remainder_ui(Polynom.get_D(),PrimeNumbers[i])
                  << " != " << D_mod_FBPrime[i] << endl;
       exit(1);
     }
#endif
}


#if (TSIEVEELEMENTSIZE == 1)
short unsigned int CSieveStaticFactors::LogValRuns[128];
short unsigned int CSieveStaticFactors::PrimeNumberDiffs[CSieveStaticFactors::MaxSize];
#endif

int* CSieveStaticFactors::DefaultHelperTable[2] = { NULL, NULL };

unsigned int SieveControl::PhysicalSieveIntervalIndex; // number of the current physical interval to sieve with
// starts with 0 for the start of each new logical sieve interval
// and is increased by 1 for each new physical sieve interval

unsigned int SieveControl::BestPSI[2];
// indices of the two best PhysicalSieveIntervals

TSieveElement SieveControl::PhysicalSieveIntervalThresholdCorrector[1024] = { 0 };
// idea: the logical sieve interval differs slightly in quality for different
// physical sieve intervals. So let's roughly correct the threshold for
// each physical sieve interval from time to time. This may increase the quality of
// detected relations.
// SieveControl::GetLogThreshold() will always add this correction value to its
// computed value. So it behaves totally neutral. You may change these values on the fly...
// initial default values: 0, therefore neutral and no correction.


void SieveControl::set_logVal_for_Primenumber(const int pos, const TSieveElement val, const TSieveElement dirtyval)
{
  log_PrimeNumbers[pos]=val;
  log_PrimeNumbers_dirty[pos]=dirtyval;
}

void SieveControl::set_logVal_for_Primepower(const int pos, const TSieveElement val)
{
  log_Primzahl_of_PrimePowers[pos]=val;
}

const bool SieveControl::Hit_after_DirtyCorrection(const signed int SievePos, TSieveElement Value)
{
  // An dieser Stelle im Sieb wurde ein Treffer gefunden;
  // aber wegen der Dirty-Siebfaktoren, die fr alle Siebelemente als Treffer gezhlt werden,
  // knnte es tatschlich gar kein Treffer sein.
  // Wir mssen daher den Dirty-Siebwert um die nicht-teilenden Dirty-Faktorbasiselemente korrigieren.
  // Dieser Korrekturwert wird hier bestimmt (allerdings mit shortcut).

  // Wenn diese Routine aufgerufen wird, ist mglicherweise ein Siebtreffer gelandet worden;
  // Da es sich um ein Regelungssystem handelt, ist dies (zwar kein perfekter, aber)
  // ein sinnvoller Indikator, um nun wieder etwas weniger Dirtiness zu verlangen,
  // wenn doch kein Siebtreffer vorlag...

  for (int i=FBStartIndex()-1; i>=FBLowestStartIndex; --i)
   {
     register int d = normalized_signed_mod(SievePos,StaticFactorbase::PrimeNumbers[i],StaticFactorbase::PrimeNumberReciprocals[i]);
     // same as: SievePos%PrimeNumbers[i]; if (d<0) d+=PrimeNumbers[i];
     // remark: %-operation does *not* eliminate the sign!
     //         ugly: -1=1 (mod 2), but (-1)%2=-1, 1%2=1
     // -> so we have to eliminate the sign!!!
     const bool Treffer = (d==CDeltaComputations::Delta_of_PrimeNumbers[i][0] || d==CDeltaComputations::Delta_of_PrimeNumbers[i][1]);
     if (Treffer) // Treffer means: hit
      {
        // Value is decreasing
        Value+=PrimzahlSieveWeight_dirty(i); // remove dirty-Weight
	Value-=PrimzahlSieveWeight(i); // and replace it by the real Weight
      }
     else
      {
        // no hit
        // Value is increasing
        Value+=PrimzahlSieveWeight_dirty(i); // remove dirty-weight, because we haven't had a hit

        // shortcut "return false" on value>=0 doesn't work anymore, since it's not monotone increasing
        //  2004-03-12: maybe we should revert to the old behaviour?
         // the more dirty the sieving is -> the more values to analyse
         // the more expensive divisions we need to take and we cannot shortcut! :-(
      }
   }

  if (Value>=0)
   {
     SieveControl::RequestLessDirtiness(); // "steer in the opposite direction"
     return false; // Threshold exceeded, no sieve hit
   }
  return true; // Threshold not exceeded -> sieve hit
}


TSieveElement SieveControl::log_Threshold = 0; // will be initialized with the "real" value while the whole sieving stuff is initialized.
TSieveElement SieveControl::raw_log_Threshold = 0; // will be initialized with the "real" value while the whole sieving stuff is initialized.
TSieveElement SieveControl::log_ThresholdDeltaHint = 0; // initial value

void SieveControl::compute_SieveThreshold(void)
{
  // Here we determine/estimate the Threshold
  
  // Constraints for calling this initialization:
  // the values for 
  // 1. LogicalSieveSize (for interval [-LogicalSieveSize,LogicalSieveSize])
  // 2. kN (pre-multiplier of N to increase quality of static factorbase)
  // 3. and the static factorbase itself
  // have to be already initialized.
 
  // computing logarithmic threshold:
  mpz_t x; mpz_init(x);
  mpz_div_ui(x,kN,2); mpz_sqrt(x,kN); mpz_mul_ui(x,x,LogicalSieveSize);
  double d = log(fabs(mpz_get_d(x))) - (Factor_Threshold*log(StaticFactorbase::BiggestPrime()));
  mpz_clear(x);
  if (StaticFactorbase::PrimeNumbers[1]==2) d-=log(2); // If 2 is in the factorbase, then 2 divides all sieve entries.
  d*=log_SieveEntryMultiplier;
#ifdef VERBOSE_INFO
  cout << "Threshold (initial sieve entries) for detecting relations set to " << d << endl;
#endif
  raw_log_Threshold=log_Threshold=static_cast<TSieveElement>(floor(d));
  if (log_Threshold!=floor(d)) 
   {
     cerr << "With the compiled-in types and constants this" << endl;
     cerr << "factorization cannot be done, sorry." << endl; 
     cerr << "TSieveElement (compile-time type) needs to be increased to fit the threshold!" << endl;
     cerr << "Please modify log_SieveEntryMultiplier and/or TSieveElement" << endl;
     cerr << "and compile the program again." << endl;
     exit(1);
   }

#if TSIEVEELEMENTSIZE > 0
  // sanity check...
  if (sizeof(TSieveElement)!=TSIEVEELEMENTSIZE)
   {
     cerr << "Compiled types and values for " << endl
          << "TSieveElement (Size=" << sizeof(TSieveElement) << ")"
          << " and TSIEVEELEMENTSIZE (=" << TSIEVEELEMENTSIZE << ")" << endl
          << "do not match. Please correct them!" << endl;
     exit(1);
   }
#endif
}




#ifdef ASM_386
#define SIEBASM_386

unsigned int clobbered_int; // dummy

#if TSIEVEELEMENTSIZE == 1
#if defined(ASM_SSE2)
 #ifdef DEBUG
  #warning "Initialize Sieve: SSE2 Sieve initialization activated."
 #endif
#define asm_sieb_init(logValue) \
  asm volatile(\
   "movd %%eax,%%xmm0 # -> (x,x,x,x,x,x,x,x,x,x,x,x,x,x,x,a) \n\t" \
   "punpcklbw %%xmm0,%%xmm0 # (x,x,x,x,x,x,x,x,x,x,x,x,x,x,a,a) \n\t" \
   "movl %[count],%%eax \n\t" \
   "punpcklbw %%xmm0,%%xmm0 # (x,x,x,x,x,x,x,x,x,x,x,x,a,a,a,a) \n\t" \
   "punpcklbw %%xmm0,%%xmm0 # (x,x,x,x,x,x,x,x,a,a,a,a,a,a,a,a) \n\t" \
   "punpcklbw %%xmm0,%%xmm0 # (a,a,a,a,a,a,a,a,a,a,a,a,a,a,a,a) \n\t" \
   ".balign 16 \n\t" \
   "1: \n\t" \
   "movdqa %%xmm0,(%[Sieve]) \n\t" \
   "movdqa %%xmm0,0x10(%[Sieve]) \n\t" \
   "movdqa %%xmm0,0x20(%[Sieve]) \n\t" \
   "movdqa %%xmm0,0x30(%[Sieve]) \n\t" \
   "add $0x40,%[Sieve] \n\t" \
   "sub $0x40,%%eax \n\t" \
   "jnz 1b \n\t" \
   "emms \n" \
   : "=D" (clobbered_int), "=a" (clobbered_int) : [threshold] "a" (logValue), [Sieve] "D" (SieveArray), [count] "i" (PhysicalSieveSize) : "cc", "memory", "xmm0");
#elif defined(ASM_MMX)
 #ifdef DEBUG
  #warning "Initialize Sieve: MMX Sieve initialization activated."
 #endif
#define asm_sieb_init(logValue) \
  asm volatile(\
   "movd %%eax,%%mm0 # -> (x,x,x,x,x,x,x,a) \n\t" \
   "punpcklbw %%mm0,%%mm0 # (x,x,x,x,x,x,a,a) \n\t" \
   "movl %[count],%%eax \n\t" \
   "punpcklbw %%mm0,%%mm0 # (x,x,x,x,a,a,a,a) \n\t" \
   "punpcklbw %%mm0,%%mm0 # (a,a,a,a,a,a,a,a) \n\t" \
   ".balign 16 \n\t" \
   "1: \n\t" \
   "movq %%mm0,(%[Sieve]) \n\t" \
   "movq %%mm0,8(%[Sieve]) \n\t" \
   "movq %%mm0,16(%[Sieve]) \n\t" \
   "movq %%mm0,24(%[Sieve]) \n\t" \
   "movq %%mm0,32(%[Sieve]) \n\t" \
   "movq %%mm0,40(%[Sieve]) \n\t" \
   "movq %%mm0,48(%[Sieve]) \n\t" \
   "movq %%mm0,56(%[Sieve]) \n\t" \
   "add $64,%[Sieve] \n\t" \
   "sub $64,%%eax \n\t" \
   "jnz 1b \n\t" \
   "emms \n" \
   : "=D" (clobbered_int), "=a" (clobbered_int) : [threshold] "a" (logValue), [Sieve] "D" (SieveArray), [count] "i" (PhysicalSieveSize) : "cc", "memory", "mm0");
#else
#define asm_sieb_init(logValue) \
  asm volatile ( \
       "movb %%al,%%ah\n\t" \
       "pushw %%ax\n\t" \
       "shll $16,%%eax\n\t" \
       "popw %%ax\n\t" \
       "shrl $2,%%ecx\n\t" \
       "cld\n\t" \
       "rep\n\t" \
       "stosl" \
       : "=c" (clobbered_int), "=D" (clobbered_int) : "a" (logValue), "D" (SieveArray), "c" (PhysicalSieveSize) : "cc", "memory");
#endif

#if defined(ASM_ATHLON) || defined(ASM_SSE)
// ATHLON-optimized code
#ifdef DEBUG
#warning "using IA-32/ATHLON version for macro: asm_search_sieb"
#endif
#define asm_search_sieb(offset) \
  asm volatile ( \
       "0: # calibrate alignment \n\t" \
       "testl $7,%[pos] \n\t" \
       "jz 1f # is aligned \n\t" \
       "decl %[pos] \n\t" \
       "testb $0x80,(%[sieb],%[pos]) \n\t" \
       "jns 0b \n\t" \
       "js 4f # hit detected \n\t" \
       "1: # search for hit, (big steps) loop \n\t" \
       "prefetcht2 -256(%[sieb],%[pos]) # tuning cache!! \n\t" \
       "subl $64,%[pos] \n\t" \
       "movq  (%[sieb],%[pos]),%%mm0 \n\t" \
       "movq 8(%[sieb],%[pos]),%%mm1 \n\t" \
       "por 16(%[sieb],%[pos]),%%mm0 \n\t" \
       "por 24(%[sieb],%[pos]),%%mm1 \n\t" \
       "por 32(%[sieb],%[pos]),%%mm0 \n\t" \
       "por 40(%[sieb],%[pos]),%%mm1 \n\t" \
       "por 48(%[sieb],%[pos]),%%mm0 \n\t" \
       "por 56(%[sieb],%[pos]),%%mm1 \n\t" \
       "por %%mm1,%%mm0 \n\t" \
       "pmovmskb %%mm0,%%eax # highest bits of packed bytes to %%ah \n\t" \
       "testl %%eax,%%eax \n\t" \
       "jz 1b \n\t" \
       "emms # clear MMX state \n\t" \
       "2: # search for hit, (little steps) init \n\t" \
       "addl $64,%[pos] \n\t" \
       "3: # search for hit, (little steps) loop \n\t" \
       "subl $4,%[pos] \n\t" \
       "testl $0x80808080,(%[sieb],%[pos]) \n\t" \
       "jz 3b \n\t" \
       "movl (%[sieb],%[pos]),%%eax \n\t" \
       "addl $3,%[pos] \n\t" \
       "testl $0x80000000,%%eax \n\t" \
       "jnz 4f \n\t" \
       "decl %[pos] \n\t" \
       "testl $0x00800000,%%eax \n\t" \
       "jnz 4f \n\t" \
       "decl %[pos] \n\t" \
       "testl $0x00008000,%%eax \n\t" \
       "jnz 4f \n\t" \
       "decl %[pos] \n\t" \
       "4: # hit detected" \
       : [pos] "=q" (offset) : "[pos]" (offset), [sieb] "r" (SieveArray) : "cc", "eax", "mm0", "mm1");
#else
// optimized i386 code
#define asm_search_sieb(offset) \
  asm volatile ( \
       "0: # calibrate alignment \n\t" \
       "testl $3,%[pos] \n\t" \
       "jz 1f # is aligned \n\t" \
       "decl %[pos] \n\t" \
       "testb $0x80,(%[sieb],%[pos]) \n\t" \
       "jns 0b \n\t" \
       "js 2f # hit detected \n\t" \
       "1: # start search \n\t" \
       "subl $4,%[pos] \n\t" \
       "testl $0x80808080,(%[sieb],%[pos]) \n\t" \
       "jz 1b \n\t" \
       "addl $3,%[pos] \n\t" \
       "testb $0x80,(%[sieb],%[pos]) \n\t" \
       "js 2f \n\t" \
       "decl %[pos] \n\t" \
       "testb $0x80,(%[sieb],%[pos]) \n\t" \
       "js 2f \n\t" \
       "decl %[pos] \n\t" \
       "testb $0x80,(%[sieb],%[pos]) \n\t" \
       "js 2f \n\t" \
       "decl %[pos] \n\t" \
       "2: # hit detected" \
       : [pos] "=r" (offset) : "[pos]" (offset), [sieb] "q" (SieveArray) : "cc");
#endif
#if defined(ASM_CMOV)
// code containing cmov optimizations
 #ifdef DEBUG
  #warning "using CMOV version for macro: asm_sieb"
 #endif
#define asm_sieb(lp, d0, d1, P) \
  asm volatile ( \
       "cmpl %[disp1],%[disp0]\n\t" \
       "cmova %[disp0],%%eax # conditional swap part1\n\t" \
       "cmova %[disp1],%[disp0] # part2 \n\t" \
       "cmova %%eax,%[disp1] # part3 \n\t" \
       "movl %[limit],%%eax\n\t" \
       "jmp 1f \n\t" \
       ".balign 16 \n\t" \
       "0: subb %[val],(%[sieb],%[disp1])\n\t" \
       "addl %[step],%[disp1]\n\t" \
       "subb %[val],(%[sieb],%[disp0])\n\t" \
       "addl %[step],%[disp0]\n\t" \
       "1: cmpl %%eax,%[disp1]\n\t" \
       "jb 0b\n\t" \
       "cmpl %%eax,%[disp0] \n\t" \
       "cmovb %[disp0],%%eax \n\t" \
       "subb %[val],(%[sieb],%%eax)\n\t" \
       "xorl %%eax,%%eax \n\t" \
       "cmpl %[limit],%[disp0] \n\t" \
       "cmovb %[step],%%eax \n\t" \
       "addl %%eax,%[disp0] \n" \
       : [disp0] "+q" (d0), [disp1] "+q" (d1) : [step] "r" (P), [val] "r" (lp), [sieb] "r" (&SieveArray[0]), [limit] "i" (PhysicalSieveSize) : "cc", "eax");

// do the same as for asm_sieb (but optimzed for small Ps)
#define asm_sieb_small(lp, d0, d1, P) \
  asm volatile ( \
       "cmpl %[disp1],%[disp0]\n\t" \
       "cmova %[disp0],%%eax # conditional swap part1\n\t" \
       "cmova %[disp1],%[disp0] # part2 \n\t" \
       "cmova %%eax,%[disp1] # part3 \n\t" \
       "movl %[limit],%%eax\n\t" \
       "jmp 0f \n\t" \
       ".balign 16 \n\t" \
       "0: subb %[val],(%[sieb],%[disp1])\n\t" \
       "addl %[step],%[disp1]\n\t" \
       "subb %[val],(%[sieb],%[disp0])\n\t" \
       "addl %[step],%[disp0]\n\t" \
       "1: cmpl %%eax,%[disp1]\n\t" \
       "jb 0b\n\t" \
       "cmpl %%eax,%[disp0] \n\t" \
       "cmovb %[disp0],%%eax \n\t" \
       "subb %[val],(%[sieb],%%eax)\n\t" \
       "xorl %%eax,%%eax \n\t" \
       "cmpl %[limit],%[disp0] \n\t" \
       "cmovb %[step],%%eax \n\t" \
       "addl %%eax,%[disp0] \n" \
       : [disp0] "+q" (d0), [disp1] "+q" (d1) : [step] "r" (P), [val] "r" (lp), [sieb] "r" (&SieveArray[0]), [limit] "i" (PhysicalSieveSize) : "cc", "eax");
#else
#define asm_sieb(lp, d0, d1, P) \
  asm volatile ( \
       "cmpl %[disp1],%[disp0]\n\t" \
       "jbe 1f \n\t" \
       "xchgl %[disp0],%[disp1]\n\t" \
       "jmp 1f \n\t" \
       "0: subb %[val],(%[sieb],%[disp1])\n\t" \
       "addl %[step],%[disp1]\n\t" \
       "subb %[val],(%[sieb],%[disp0])\n\t" \
       "addl %[step],%[disp0]\n\t" \
       "1: cmpl %[limit],%[disp1]\n\t" \
       "jb 0b\n\t" \
       "cmpl %[limit],%[disp0] \n\t" \
       "jae 2f \n\t \n\t" \
       "subb %[val],(%[sieb],%[disp0])\n\t" \
       "addl %[step],%[disp0] \n\t" \
       "2: #done \n" \
       : [disp0] "+q" (d0), [disp1] "+q" (d1) : [step] "r" (P), [val] "r" (lp), [sieb] "r" (&SieveArray[0]), [limit] "i" (PhysicalSieveSize) : "cc");

#define asm_sieb_small(lp, d0, d1, P) \
  asm volatile ( \
       "cmpl %[disp1],%[disp0]\n\t" \
       "jbe 0f \n\t" \
       "xchgl %[disp0],%[disp1]\n\t" \
       "0: subb %[val],(%[sieb],%[disp1])\n\t" \
       "addl %[step],%[disp1]\n\t" \
       "subb %[val],(%[sieb],%[disp0])\n\t" \
       "addl %[step],%[disp0]\n\t" \
       "1: cmpl %[limit],%[disp1]\n\t" \
       "jb 0b\n\t" \
       "cmpl %[limit],%[disp0] \n\t" \
       "jae 2f \n\t \n\t" \
       "subb %[val],(%[sieb],%[disp0])\n\t" \
       "addl %[step],%[disp0] \n\t" \
       "2: #done \n" \
       : [disp0] "+q" (d0), [disp1] "+q" (d1) : [step] "r" (P), [val] "r" (lp), [sieb] "r" (&SieveArray[0]), [limit] "i" (PhysicalSieveSize) : "cc");
#endif
#elif TSIEVEELEMENTSIZE == 2
#define asm_sieb_init(logValue) \
  asm volatile ( \
       "pushw %%ax\n\t" \
       "shll $16,%%eax\n\t" \
       "popw %%ax\n\t" \
       "shrl $1,%%ecx\n\t" \
       "cld\n\t" \
       "rep\n\t" \
       "stosl" \
       : "=c" (clobbered_int), "=D" (clobbered_int) : "a" (logValue), "D" (SieveArray), "c" (PhysicalSieveSize) : "cc", "memory");
#define asm_search_sieb(offset) \
  asm volatile ( \
       "0: # calibrate alignment \n\t" \
       "testl $1,%[pos] \n\t" \
       "jz 1f # is aligned \n\t" \
       "decl %[pos] \n\t" \
       "testw $0x8000,(%[sieb],%[pos],2) \n\t" \
       "js 2f # hit detected \n\t" \
       "1: # Treffer suchen \n\t" \
       "subl $2,%[pos] \n\t" \
       "testl $0x80008000,(%[sieb],%[pos],2) \n\t" \
       "jz 1b \n\t" \
       "incl %[pos] \n\t" \
       "testw $0x8000,(%[sieb],%[pos],2) \n\t" \
       "js 2f \n\t" \
       "decl %[pos] \n\t" \
       "2: # hit detected" \
       : [pos] "=r" (offset) : "[pos]" (offset), [sieb] "q" (SieveArray) : "cc");
#define asm_sieb(lp, d0, d1, P) \
  asm volatile ( \
       "cmpl %[disp1],%[disp0]\n\t" \
       "jbe 1f \n\t" \
       "xchgl %[disp0],%[disp1]\n\t" \
       "jmp 1f \n\t" \
       "0: subw %[val],(%[sieb],%[disp1],2)\n\t" \
       "subw %[val],(%[sieb],%[disp0],2)\n\t" \
       "addl %[step],%[disp1]\n\t" \
       "addl %[step],%[disp0]\n\t" \
       "1: cmpl %[limit],%[disp1]\n\t" \
       "jb 0b\n\t" \
       "cmpl %[limit],%[disp0] \n\t" \
       "jae 2f \n\t \n\t" \
       "subw %[val],(%[sieb],%[disp0],2)\n\t" \
       "addl %[step],%[disp0] \n\t" \
       "2: \n" \
       : [disp0] "+q" (d0), [disp1] "+q" (d1) : [step] "r" (P), [val] "r" (lp), [sieb] "r" (&SieveArray[0]), [limit] "i" (PhysicalSieveSize) : "cc");
#elif TSIEVEELEMENTSIZE == 4
#define asm_sieb_init(logValue) \
  asm volatile ( \
       "cld\n\t" \
       "rep\n\t" \
       "stosl" \
       : "=c" (clobbered_int), "=D" (clobbered_int) : "a" (logValue), "D" (SieveArray), "c" (PhysicalSieveSize) : "memory");
#define asm_search_sieb(offset) \
  asm volatile ( \
       "1: # start search \n\t" \
       "decl %[pos] \n\t" \
       "testl $0x80008000,(%[sieb],%[pos],4) \n\t" \
       "jz 1b \n\t" \
       "2: # hit detected" \
       : [pos] "=r" (offset) : "[pos]" (offset), [sieb] "q" (SieveArray) : "cc");
#define asm_sieb(lp, d0, d1, P) \
  asm volatile ( \
       "cmpl %[disp1],%[disp0]\n\t" \
       "jbe 1f \n\t" \
       "xchgl %[disp0],%[disp1]\n\t" \
       "jmp 1f \n\t" \
       "0: subl %[val],(%[sieb],%[disp1],4)\n\t" \
       "subl %[val],(%[sieb],%[disp0],4)\n\t" \
       "addl %[step],%[disp1]\n\t" \
       "addl %[step],%[disp0]\n\t" \
       "1: cmpl %[limit],%[disp1]\n\t" \
       "jb 0b\n\t" \
       "cmpl %[limit],%[disp0] \n\t" \
       "jae 2f \n\t \n\t" \
       "subl %[val],(%[sieb],%[disp0],4)\n\t" \
       "addl %[step],%[disp0] \n\t" \
       "2: \n" \
       : [disp0] "+q" (d0), [disp1] "+q" (d1) : [step] "r" (P), [val] "r" (lp), [sieb] "r" (&SieveArray[0]), [limit] "i" (PhysicalSieveSize) : "cc");
#else
#undef SIEBASM_386
#warning "No optimized assemblercode available for sieving with TSieveElement!"
#endif
 #if !defined(asm_sieb_small)
  #warning "no optimized asm_sieb_small, therefore using asm_sieb"
  #define asm_sieb_small asm_sieb
 #endif
#endif


void SieveControl::RecalibrateThresholds()
{
#ifdef VERBOSE_NOTICE
  cout << "recalibrating MPQS thresholds" << endl;
#endif

  const double FB_Threshold = log_SieveEntryMultiplier*Factor_Threshold*log(StaticFactorbaseSettings::BiggestPrime());
  TSieveElement Min=std::numeric_limits<TSieveElement>::max(), Max=std::numeric_limits<TSieveElement>::min();
  unsigned int I=0;
  for (signed int mySieveOffset=-LogicalSieveSize; mySieveOffset<LogicalSieveSize; mySieveOffset+=PhysicalSieveSize)
   {
    double wMin = log_SieveEntryMultiplier*Polynom.get_logval(mySieveOffset);
    double wMax = wMin;
    double dAvg = 0.0; unsigned int samplecount = 0;
    for (int i=0; i<PhysicalSieveSize; i+=8) // or use "i+=1" for finest granularity
     {
       const double w = log_SieveEntryMultiplier*Polynom.get_logval(mySieveOffset+i);
       if (w<wMin) wMin=w; // find minimum of the samples
       if (w>wMax) wMax=w; // find maximum of the samples
       dAvg+=w; ++samplecount;
     }
    dAvg /= samplecount;
    //cout << "pre I: " << I << ": " << dAvg << ", " << wMin << ", " << wMax << endl;

    // now we are able to set a reasonable threshold for this interval:
    register double d = dAvg; // you can try wAvg, wMin, wMax, (wMin+wMax)/2 or something else...
    //cout << "d=" << d << endl;
    d-=FB_Threshold;
    if (StaticFactorbaseSettings::StaticPrime(1)==2) d-=log_SieveEntryMultiplier*log(2);
    d-=GetRawLogThreshold();
    if ( d>std::numeric_limits<TSieveElement>::max() || d<std::numeric_limits<TSieveElement>::min() )
     {
       // threshold does not fit in TSieveElement...
       MARK; cerr << "numeric_limits<TSieveElement> exceeded. Out of range." << endl;
       exit(1);
     }
    register const TSieveElement w = static_cast<TSieveElement>(d); // convert to SieveElement type
    PhysicalSieveIntervalThresholdCorrector[I++]=w;
    //cout << "I: " << I-1 << ": " << (int)PhysicalSieveIntervalThresholdCorrector[I-1] << ", " << d << endl;

    #if TSIEVEELEMENTSIZE == 1
       Min=MIN(Min,w); Max=MAX(Max,w);
    #endif
   }

  // and now find out the two best physical intervals
  BestPSI[0]=0; BestPSI[1]=I-1;
  for (unsigned int i=1; i<I-i; ++i)
   {
     //cout << (int)(PhysicalSieveIntervalThresholdCorrector[i]) << " " << (int)(PhysicalSieveIntervalThresholdCorrector[I-1-i]) << endl;
     if (PhysicalSieveIntervalThresholdCorrector[i]<PhysicalSieveIntervalThresholdCorrector[BestPSI[0]]) BestPSI[0]=i;
     if (PhysicalSieveIntervalThresholdCorrector[I-1-i]<PhysicalSieveIntervalThresholdCorrector[BestPSI[1]]) BestPSI[1]=I-1-i;
   }
  cout << "Best two physical sieve intervals are " << BestPSI[0] << " and " << BestPSI[1] << endl;

#if defined(VERBOSE_INFO)
 #if TSIEVEELEMENTSIZE == 1
  // do you want to get the picture?
  cout << "MPQS polynomial thresholds over physical intervals:" << endl;
  cout << "+";  for (unsigned int j=0; j<I; ++j) cout << "-"; cout << "+" << endl;
  for (signed int i=Max; i>=Min; --i)
   {
     if (i) cout << "|"; else cout << "+";
     const char ch = i? ' ' : '-'; 
     for (unsigned int j=0; j<I; ++j)
      if (PhysicalSieveIntervalThresholdCorrector[j]==i) cout << "*"; else cout << ch;
     cout << "|" << endl;
   }
  cout << "+";  for (unsigned int j=0; j<I; ++j) cout << "-"; cout << "+" << endl;
 #endif
#endif /* VERBOSE_INFO */
}

void initialize_Sieve()
{
  //cout << "Initializing sieve from " << SieveOffset
  //     << " to " << SieveOffset+PhysicalSieveSize << endl;

  // for each "false positive hit" the dirtiness gets decreased,
  // and for each physical interval the dirtiness is increased,
  // -> adaptive control system for dirtiness
  SieveControl::RequestMoreDirtiness();

#if 1
  // static version using an overall precomputed threshold
  register TSieveElement w = SieveControl::GetLogThreshold();
#else
  // should only be activated for debugging sessions, since the above code
  // can already make use of recalibrating MPQS interval thresholds! (Thorsten 2005-01-21)

  // dynamic version: compute an adaptive threshold for each physical sieve interval
    const TSieveElement FB_Threshold
      = static_cast<TSieveElement>(log_SieveEntryMultiplier*Factor_Threshold*log(StaticFactorbaseSettings::BiggestPrime()));
    register TSieveElement w = static_cast<TSieveElement>(log_SieveEntryMultiplier*Polynom.get_logval(SieveOffset));

    w-=FB_Threshold;
    if (StaticFactorbaseSettings::StaticPrime(1)==2) w-=static_cast<TSieveElement>(log_SieveEntryMultiplier*log(2));
    w-=SieveControl::GetLogDirtyCorrection();
#endif
  /* log_Threshold denotes the threshold for detecting a relation.
     Instead of adding a logarithm at each place where a static factor hits
     the sieve (and comparing the sieve-entry with the threshold
     afterwards), we proceed the other way round: We initialize each sieve
     entry by using the threshold, then we're going subtract logarithms for
     each hit. This eases detection of hits, because the only thing to do
     is checking for negative sieve entries after the sieving is finished
     for a certain interval.
   */
  
  // Initialize sieve with the roughly calculated threshold
  // (hint: -> never calculate separate thresholds for each entry, as this slows down performance!)
#ifdef SIEBASM_386
  asm_sieb_init(w); // confer the appropriate Assembler-Macro
#else
  register int i=PhysicalSieveSize; do SieveArray[--i]=w; while (i);
#endif

  // activate this block to compare the polynomial value and its logval against the actually used threshold
#if 0
  {
    TSieveElement w_test = static_cast<TSieveElement>(log_SieveEntryMultiplier*Polynom.get_logval(SieveOffset));
    TSieveElement FB_Threshold
      = static_cast<TSieveElement>(log_SieveEntryMultiplier*Factor_Threshold*log(StaticFactorbaseSettings::BiggestPrime()));
    w_test-=FB_Threshold;
    if (StaticFactorbaseSettings::StaticPrime(1)==2) w_test-=static_cast<TSieveElement>(log_SieveEntryMultiplier*log(2));
    w_test-=SieveControl::GetLogDirtyCorrection();

    static unsigned long int overall_intervals = 0;
    static unsigned long int differing_intervals = 0;

    ++overall_intervals;
    if (w_test!=w)
     {
       ++differing_intervals;
       cout << "Thresholds bad/all: " << differing_intervals << "/" << overall_intervals << " ";
       cout << "Offset: " << SieveOffset;
    #if TSIEVEELEMENTSIZE == 1
       // chars are not printed as numbers, so we cast them:
       cout << " -> logval: " << static_cast<signed int>(w_test)
            << " <-> threshold: " << static_cast<signed int>(w) << endl;
    #else
       cout << " -> logval: " << w_test << " <-> threshold: " << w << endl;
    #endif
     }
  }
#endif
}


void SieveControl::create_sieve_image(int* &Image, int mySieveOffset)
{
#ifdef VERBOSE
  cout << "creating MPQS physical sieve image for SieveOffset " << SieveOffset << endl;
#endif
  if (Image==NULL) Image = new int[PhysicalSieveSize];
  const TSieveElement FB_Threshold
    = static_cast<TSieveElement>(log_SieveEntryMultiplier*Factor_Threshold*log(StaticFactorbaseSettings::BiggestPrime()));
  for (int i=0; i<PhysicalSieveSize; ++i)
   {
     register TSieveElement w = static_cast<TSieveElement>(log_SieveEntryMultiplier*Polynom.get_logval(mySieveOffset+i));
     w-=FB_Threshold;
     if (StaticFactorbaseSettings::StaticPrime(1)==2) w-=static_cast<TSieveElement>(log_SieveEntryMultiplier*log(2));
     w-=GetRawLogThreshold();
     Image[i]=w;
   }
}


void initialize_Sieve(const int Image[])
{
  //cout << "Initializing sieve from " << SieveOffset
  //     << " to " << SieveOffset+PhysicalSieveSize << endl;

  // for each "false positive hit" the dirtiness gets decreased,
  // and for each physical interval the dirtiness is increased,
  // -> adaptive control system for dirtiness
  SieveControl::RequestMoreDirtiness();

#if defined(ASM_MMX) && (TSIEVEELEMENTSIZE == 1)
 #ifdef DEBUG
  #warning "Initialize Sieve: MMX Sieve Image copy activated."
 #endif
 // about using prefetchnta: it's not a pure MMX instruction, but a streaming extension;
 // also if the sieve image will fit into cache, "prefetchnta" is counterproductive.
  asm volatile(\
   "movd %%eax,%%mm0 # -> (x,x,x,x,x,x,x,a) \n\t" \
   "punpcklbw %%mm0,%%mm0 # (x,x,x,x,x,x,a,a) \n\t" \
   "movl %[count],%%eax \n\t" \
   "punpcklbw %%mm0,%%mm0 # (x,x,x,x,a,a,a,a) \n\t" \
   "punpcklbw %%mm0,%%mm0 # (a,a,a,a,a,a,a,a) \n\t" \
   ".balign 16 \n\t" \
   "1: \n\t" \
   "#prefetchnta 200(%[Image]) # this is an extension, not pure MMX \n\t" \
   "movq (%[Image]),%%mm1 \n\t" \
   "movq 8(%[Image]),%%mm2 \n\t" \
   "movq 16(%[Image]),%%mm3 \n\t" \
   "movq 24(%[Image]),%%mm4 \n\t" \
   "paddb %%mm0,%%mm1 \n\t" \
   "paddb %%mm0,%%mm2 \n\t" \
   "paddb %%mm0,%%mm3 \n\t" \
   "paddb %%mm0,%%mm4 \n\t" \
   "movq %%mm1,(%[Sieve]) \n\t" \
   "movq %%mm2,8(%[Sieve]) \n\t" \
   "movq %%mm3,16(%[Sieve]) \n\t" \
   "movq %%mm4,24(%[Sieve]) \n\t" \
   "movq 32(%[Image]),%%mm1 \n\t" \
   "movq 40(%[Image]),%%mm2 \n\t" \
   "movq 48(%[Image]),%%mm3 \n\t" \
   "movq 56(%[Image]),%%mm4 \n\t" \
   "paddb %%mm0,%%mm1 \n\t" \
   "paddb %%mm0,%%mm2 \n\t" \
   "paddb %%mm0,%%mm3 \n\t" \
   "paddb %%mm0,%%mm4 \n\t" \
   "movq %%mm1,32(%[Sieve]) \n\t" \
   "movq %%mm2,40(%[Sieve]) \n\t" \
   "movq %%mm3,48(%[Sieve]) \n\t" \
   "movq %%mm4,56(%[Sieve]) \n\t" \
   "add $64,%[Image] \n\t" \
   "add $64,%[Sieve] \n\t" \
   "sub $64,%%eax \n\t" \
   "jnz 1b \n\t" \
   "emms \n" \
   : "=D" (clobbered_int), "=S" (clobbered_int), "=a" (clobbered_int) /* these registers are clobbered, but we needed the input... */
   : [threshold] "a" (SieveControl::log_Threshold), [Image] "S" (Image),
     [Sieve] "D" (SieveArray), [count] "i" (PhysicalSieveSize)
   : "cc", "memory", "mm0", "mm1", "mm2", "mm3", "mm4");
#else
  // generic version
  register TSieveElement w = SieveControl::log_Threshold;
  register int i=PhysicalSieveSize-1;
  do
   {
     register TSieveElement x=w+Image[i];
     SieveArray[i]=x;
   } while (--i>=0);
#endif
}


int CDeltaComputations::LocalPhysInterval_Delta_of_PrimeNumbers[StaticFactorbase::MaxSize][2] __attribute__ ((aligned (16)));
int CDeltaComputations::LocalPhysInterval_Delta_of_PrimePowers[StaticFactorbase::max_additional_Powers][2] __attribute__ ((aligned (16)));


void CSieveStaticFactors::init_LocalDeltas()
{
  SieveControl::RenewThresholds();
  // determine the first hit for each primenumber in the first physical sieve interval
  for (int nr=StaticFactorbase::Size()-1; nr>=SieveControl::FBStartIndex(); --nr) 
    {
      register int Primzahl=PrimeNumbers[nr];
      register int d0,d1;
      {
        register int h=normalized_signed_mod(SieveOffset,Primzahl,PrimeNumberReciprocals[nr]); // damned signed %-Operation...
	d0=Delta_of_PrimeNumbers[nr][0]-h;
	d1=Delta_of_PrimeNumbers[nr][1]-h;
	if (d1<0)
          d1+=Primzahl;
	if (d0<0)
          d0+=Primzahl;
      }
      LocalPhysInterval_Delta_of_PrimeNumbers[nr][0]=d0;
      LocalPhysInterval_Delta_of_PrimeNumbers[nr][1]=d1;
    }

  // higher powers of primes
  for (int nr=0; nr<NumberOf_more_PrimePowers; ++nr)
    {
      register int PPotenz = PrimePowers[nr];
      register int d0,d1;
      {
        register int h=normalized_signed_mod(SieveOffset,PPotenz,PrimePowerReciprocals[nr]); // damned signed %-Operation...
	d0=Delta_of_PrimePowers[nr][0]-h;
	d1=Delta_of_PrimePowers[nr][1]-h;
	if (d1<0)
          d1+=PPotenz;
	if (d0<0)
          d0+=PPotenz;
      }
      LocalPhysInterval_Delta_of_PrimePowers[nr][0]=d0;
      LocalPhysInterval_Delta_of_PrimePowers[nr][1]=d1;
    }
}


void CSieveStaticFactors::create_Tables_for_init_LocalDeltas(int* &Table1, int* &Table2, const int SieveOffset)
{
  if (Table1==NULL) Table1 = new int[StaticFactorbase::Size()];
  if (Table2==NULL) Table2 = new int[StaticFactorbase::Size()];
  for (int nr=StaticFactorbase::Size()-1; nr>=SieveControl::FBStartIndex(); --nr) 
    Table1[nr]=normalized_signed_mod(SieveOffset,PrimeNumbers[nr],PrimeNumberReciprocals[nr]); // damned signed %-Operation...

  // higher powers of primes
  for (int nr=0; nr<NumberOf_more_PrimePowers; ++nr)
    Table2[nr]=normalized_signed_mod(SieveOffset,PrimePowers[nr],PrimePowerReciprocals[nr]); // damned signed %-Operation...
}


void CSieveStaticFactors::init_LocalDeltas(int const Table1[], int const Table2[])
{
  SieveControl::RenewThresholds();
  // determine the first hit for each primenumber in the first physical sieve interval
  for (int nr=StaticFactorbase::Size()-1; nr>=SieveControl::FBStartIndex(); --nr) 
    {
      register int Primzahl=PrimeNumbers[nr];
      register int d0,d1;
      {
        register int h=Table1[nr]; // = normalized_signed_mod(SieveOffset,Primzahl);
	d0=Delta_of_PrimeNumbers[nr][0]-h;
	d1=Delta_of_PrimeNumbers[nr][1]-h;
	if (d1<0)
          d1+=Primzahl;
	if (d0<0)
          d0+=Primzahl;
      }
      LocalPhysInterval_Delta_of_PrimeNumbers[nr][0]=d0;
      LocalPhysInterval_Delta_of_PrimeNumbers[nr][1]=d1;
    }

  // higher powers of primes
  for (int nr=0; nr<NumberOf_more_PrimePowers; ++nr)
    {
      register int PPotenz = PrimePowers[nr];
      register int d0,d1;
      {
        register int h=Table2[nr]; // = normalized_signed_mod(SieveOffset,PPotenz);
	d0=Delta_of_PrimePowers[nr][0]-h;
	d1=Delta_of_PrimePowers[nr][1]-h;
	if (d1<0)
          d1+=PPotenz;
	if (d0<0)
          d0+=PPotenz;
      }
      LocalPhysInterval_Delta_of_PrimePowers[nr][0]=d0;
      LocalPhysInterval_Delta_of_PrimePowers[nr][1]=d1;
    }
}


void CSieveStaticFactors::do_sieving_StaticFactors()
{
  //cout << "Sieving with primes of static FB: " << SieveOffset<< endl;
  
  /* We do not sieve with the first couples of primes of the static factorbase.
     We want to sieve fast, quick and dirty! 

     The lower the first sieving prime, the slower the sieving! But this
     introduces errors which must be compensated!
       -> increase Factor_Threshold slightly if necessary.
       -> sieve with powers of primes (so that the interesting hits can be corrected anyway)

     CAUTION: Do not define a too high value for shortcut! Otherwise many relations remain undetected!
     Attention! Starting value needs to be at least "2" to guarantee that -1 and 2 will be ignored! (For any other
                elements of the static factorbase there are exactly two Deltas defined. But not for -1 and 2...)
                -> relevant for looping the smaller prime numbers
  */

  // standard code, but uses also various assembler optimizations

  register int nr = StaticFactorbase::Size()-1;

#if 1 && defined(SIEBASM_386) && (TSIEVEELEMENTSIZE == 1) && defined(ASM_ATHLON)
// code containing cmov optimizations plus prefetching for Athlon
// Hint: MMX code (with prefetching) for Athlon seems to be very fast, too!
//       So you may want to skip this block and use MMX instead.

// FIXME: prefetch need to be tuned according to cpu speed and memory timing,
// optimal settings can speed up sieving for more than 10% !
// Athlon 1.2 Ghz, SD-RAM:
//   prefetcht2 %[Deltas]-256(,%[nr],8) # tuning cache!!
//   prefetcht2 %[PrimeDiffs]-128(,%[nr],2)# tuning cache!!
// Athlon-XP 1.75 Ghz, Athlon64 (32bit mode) 1.8 Ghz, DDR-RAM:
//   prefetch %[Deltas]-128(,%[nr],8) # tuning cache!!
//   prefetch %[PrimeDiffs]-64(,%[nr],2)# tuning cache!!
// P3,P4:
//   this routine also works on these processors (using prefetcht2; better: not using software prefetch at all),
//   but it runs much slower than a specialized version!
 #ifdef DEBUG
  #warning "experimental (cmov) static factor sieving code Athlon (and Athlon XP/Athlon-64)"
 #endif
  asm volatile ( \
       "movb %[SieveWeight](%[nr]),%%al \n\t" \
       "movb $1,%%ah # position of next run length \n\t" \
       "bswap %%eax # eax holds 3 different values\n\t" \
       "movw %[LogValRuns],%%ax # first run-length \n\t" \
       "movl %[PrimeNumbers](,%[nr],4),%%edi \n\t" \
       "incw %%ax \n\t" \
       "jmp 2f \n\t" \
       ".balign 16 \n\t" \
       "1: movl %[Deltas](,%[nr],8),%%esi\n\t" \
       "prefetchw %[Deltas]-128(,%[nr],8) # tuning cache!! \n\t" \
       "prefetch %[PrimeDiffs]-64(,%[nr],2)# tuning cache!! \n\t" \
       "bswap %%eax \n\t" \
       "movl %%esi,%%edx\n\t" \
       "subl %%ecx,%%esi\n\t" \
       "cmovnsl %%ecx,%%edx\n\t" \
       "movzx %%cl,%%ecx # assuming cl=0\n\t" \
       "cmovsl %%edi,%%ecx\n\t" \
       "subb %%al,%[feld](%%edx)\n\t" \
       "addl %%ecx,%%esi\n\t" \
       "movl %[PhSiSi],%%ecx\n\t" \
       "movl %%esi,%[Deltas](,%[nr],8)\n\t" \
       "movl %[Deltas]+4(,%[nr],8),%%esi \n\t" \
       "movl %%esi,%%edx\n\t" \
       "subl %%ecx,%%esi\n\t" \
       "cmovnsl %%ecx,%%edx\n\t" \
       "movzx %%cl,%%ecx # assuming cl=0\n\t" \
       "cmovsl %%edi,%%ecx\n\t" \
       "subb %%al,%[feld](%%edx)\n\t" \
       "addl %%ecx,%%esi\n\t" \
       "movl %%esi,%[Deltas]+4(,%[nr],8)\n\t" \
       "bswap %%eax \n\t" \
       "movzwl %[PrimeDiffs](,%[nr],2),%%edx\n\t" \
       "dec %[nr] \n\t" \
       "subl %%edx,%%edi \n\t" \
       "2: movl %[PhSiSi],%%ecx\n\t" \
       "decw %%ax \n\t " \
       "jnz 1b \n\t" \
       "bswap %%eax \n\t" \
       "movzx %%ah,%%edx \n\t" \
       "dec %%al # LogVal for run \n\t" \
       "inc %%ah # next run \n\t" \
       "bswap %%eax \n\t" \
       "movw %[LogValRuns](,%%edx,2),%%ax # run length \n\t" \
       "test %%ax,%%ax \n\t" \
       "jnz 1b \n" \
       : [nr] "+r" (nr)
       : [SieveWeight] "o" (SieveControl::log_PrimeNumbers[0]),
         [PrimeNumbers] "o" (PrimeNumbers[0]),
         [PrimeDiffs] "o" (PrimeNumberDiffs[0]),
         [Deltas] "o" (LocalPhysInterval_Delta_of_PrimeNumbers[0][0]),
         [feld] "o" (SieveArray[0]),
         [LogValRuns] "o" (LogValRuns[0]),
         [PhSiSi] "i" (PhysicalSieveSize)
       : "cc", "memory", "eax","ecx", "edx", "esi", "edi");

#elif 1 && defined(SIEBASM_386) && (TSIEVEELEMENTSIZE == 1) && (defined(ASM_3DNOW) && !defined(ASM_ATHLON))
//   cpu type ------------------------------------------------------>       -- AMD K6 3D --
// MMX variant using specialized 3DNow prefetching code
 #ifdef DEBUG
  #warning "experimental static factor sieving code (3DNow!) for K6 3D"
 #endif
  asm volatile ( \
       "movl %[PhSiSi],%%edx \n\t" \
       "movd %%edx,%%mm7 \n\t" \
       "punpckldq %%mm7,%%mm7 # lo/hi->lo/lo \n\t" \
       "jmp 2f \n\t" \
       ".balign 32,,8 \n\t" \
       "1: movq %[Deltas]-8(,%[nr],8),%%mm0\n\t" \
       "movq %[Deltas](,%[nr],8),%%mm1\n\t" \
       "pcmpgtd %%mm4,%%mm4 # set mm4 to 0xffff... \n\t" \
       "pcmpgtd %%mm5,%%mm5 # set mm5 to 0xffff... \n\t" \
       "movq %[PrimeNumbers]-4(,%[nr],4),%%mm2 \n\t" \
       "psubd %%mm7,%%mm0 \n\t" \
       "movw %[SieveWeight]-1(%[nr]),%%ax\n\t" \
       "sub $2,%[nr] \n\t" \
       "movq %%mm2,%%mm3 \n\t" \
       "psubd %%mm7,%%mm1 \n\t" \
       "punpckldq %%mm2,%%mm2 # lo/hi->lo/lo \n\t" \
       "pcmpgtd %%mm0,%%mm4 \n\t" \
       "punpckhdq %%mm3,%%mm3 # lo/hi->hi/hi \n\t" \
       "pcmpgtd %%mm1,%%mm5 \n\t" \
       "pand %%mm4,%%mm2 \n\t" \
       "pand %%mm5,%%mm3 \n\t" \
       "pand %%mm0,%%mm4 \n\t" \
       "pand %%mm1,%%mm5 \n\t" \
       "prefetchw %[Deltas]-128(,%[nr],8) # tuning cache!! \n\t" \
       "prefetch %[PrimeNumbers]-64(,%[nr],4)# tuning cache!! \n\t" \
       "movd %%mm4,%%esi \n\t" \
       "movd %%mm5,%%edi \n\t" \
       "paddd %%mm2,%%mm0 \n\t" \
       "punpckhdq %%mm4,%%mm4 # get hi to lo \n\t" \
       "subb %%al,%[feldEnd](%%esi)\n\t" \
       "paddd %%mm3,%%mm1 \n\t" \
       "punpckhdq %%mm5,%%mm5 # get hi to lo \n\t" \
       "subb %%ah,%[feldEnd](%%edi)\n\t" \
       "movd %%mm4,%%esi \n\t" \
       "movd %%mm5,%%edi \n\t" \
       "movq %%mm0,%[Deltas]+8(,%[nr],8)\n\t" \
       "movq %%mm1,%[Deltas]+16(,%[nr],8)\n\t" \
       "subb %%al,%[feldEnd](%%esi)\n\t" \
       "subb %%ah,%[feldEnd](%%edi)\n\t" \
       "2: cmpl %%edx,%[PrimeNumbers]-4(,%[nr],4)\n\t" \
       "jge 1b \n\t" \
       "cmpl %%edx,%[PrimeNumbers](,%[nr],4)\n\t" \
       "jl 3f \n\t" \
       "movq %[Deltas](,%[nr],8),%%mm1\n\t" \
       "pcmpgtd %%mm5,%%mm5 # set mm5 to 0xffff... \n\t" \
       "movq %[PrimeNumbers](,%[nr],4),%%mm3 \n\t" \
       "movb %[SieveWeight](%[nr]),%%al\n\t" \
       "psubd %%mm7,%%mm1 \n\t" \
       "punpckldq %%mm3,%%mm3 # lo/hi->lo/lo \n\t" \
       "pcmpgtd %%mm1,%%mm5 \n\t" \
       "pand %%mm5,%%mm3 \n\t" \
       "pand %%mm1,%%mm5 \n\t" \
       "movd %%mm5,%%esi \n\t" \
       "paddd %%mm3,%%mm1 \n\t" \
       "punpckhdq %%mm5,%%mm5 # get hi to lo \n\t" \
       "movd %%mm5,%%edi \n\t" \
       "movq %%mm1,%[Deltas](,%[nr],8)\n\t" \
       "subb %%al,%[feldEnd](%%esi)\n\t" \
       "subb %%al,%[feldEnd](%%edi)\n\t" \
       "sub $1,%[nr] \n\t" \
       "3: femms \n" \
       : [nr] "+r" (nr)
       : [SieveWeight] "o" (SieveControl::log_PrimeNumbers[0]),
         [PrimeNumbers] "o" (PrimeNumbers[0]),
         [Deltas] "o" (LocalPhysInterval_Delta_of_PrimeNumbers[0][0]),
         [feldEnd] "o" (SieveArray[PhysicalSieveSize]),
         [PhSiSi] "i" (PhysicalSieveSize)
       : "cc", "memory", "eax", "edx", "esi", "edi", "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7");

#elif 1 && defined(SIEBASM_386) && (TSIEVEELEMENTSIZE == 1) && (defined(ASM_ATHLON) || defined(ASM_SSE))
// code containing optimizations for MMX (plus prefetching) machines
 #ifdef DEBUG
  #warning "experimental static factor sieving code for MMX (plus prefetching MMX-ext)"
 #endif
  asm volatile ( \
       "movl %[PhSiSi],%%edx \n\t" \
       "movd %%edx,%%mm7 \n\t" \
       "punpckldq %%mm7,%%mm7 # lo/hi->lo/lo \n\t" \
       "jmp 2f \n\t" \
       ".balign 16 \n\t" \
       "1: movq %[Deltas]-8(,%[nr],8),%%mm0\n\t" \
       "movq %[Deltas](,%[nr],8),%%mm1\n\t" \
       "pcmpgtd %%mm4,%%mm4 # set mm4 to 0xffff... \n\t" \
       "pcmpgtd %%mm5,%%mm5 # set mm5 to 0xffff... \n\t" \
       "movq %[PrimeNumbers]-4(,%[nr],4),%%mm2 \n\t" \
       "movw %[SieveWeight]-1(%[nr]),%%ax\n\t" \
       "sub $2,%[nr] \n\t" \
       "movq %%mm2,%%mm3 \n\t" \
       "psubd %%mm7,%%mm0 \n\t" \
       "psubd %%mm7,%%mm1 \n\t" \
       "punpckldq %%mm2,%%mm2 # lo/hi->lo/lo \n\t" \
       "punpckhdq %%mm3,%%mm3 # lo/hi->hi/hi \n\t" \
       "pcmpgtd %%mm0,%%mm4 \n\t" \
       "pcmpgtd %%mm1,%%mm5 \n\t" \
       "pand %%mm4,%%mm2 \n\t" \
       "pand %%mm5,%%mm3 \n\t" \
       "pand %%mm0,%%mm4 \n\t" \
       "pand %%mm1,%%mm5 \n\t" \
       "prefetcht2 %[Deltas]-256(,%[nr],8) # tuning cache!! \n\t" \
       "prefetcht2 %[PrimeNumbers]-128(,%[nr],4)# tuning cache!! \n\t" \
       "movd %%mm4,%%esi \n\t" \
       "movd %%mm5,%%edi \n\t" \
       "paddd %%mm2,%%mm0 \n\t" \
       "punpckhdq %%mm4,%%mm4 # get hi to lo \n\t" \
       "paddd %%mm3,%%mm1 \n\t" \
       "punpckhdq %%mm5,%%mm5 # get hi to lo \n\t" \
       "subb %%al,%[feldEnd](%%esi)\n\t" \
       "subb %%ah,%[feldEnd](%%edi)\n\t" \
       "movd %%mm4,%%esi \n\t" \
       "movd %%mm5,%%edi \n\t" \
       "movq %%mm0,%[Deltas]+8(,%[nr],8)\n\t" \
       "movq %%mm1,%[Deltas]+16(,%[nr],8)\n\t" \
       "subb %%al,%[feldEnd](%%esi)\n\t" \
       "subb %%ah,%[feldEnd](%%edi)\n\t" \
       "2: cmpl %%edx,%[PrimeNumbers]-4(,%[nr],4)\n\t" \
       "jge 1b \n\t" \
       "cmpl %%edx,%[PrimeNumbers](,%[nr],4)\n\t" \
       "jl 3f \n\t" \
       "movq %[Deltas](,%[nr],8),%%mm1\n\t" \
       "pcmpgtd %%mm5,%%mm5 # set mm5 to 0xffff... \n\t" \
       "movq %[PrimeNumbers](,%[nr],4),%%mm3 \n\t" \
       "movb %[SieveWeight](%[nr]),%%al\n\t" \
       "psubd %%mm7,%%mm1 \n\t" \
       "punpckldq %%mm3,%%mm3 # lo/hi->lo/lo \n\t" \
       "pcmpgtd %%mm1,%%mm5 \n\t" \
       "pand %%mm5,%%mm3 \n\t" \
       "pand %%mm1,%%mm5 \n\t" \
       "movd %%mm5,%%esi \n\t" \
       "paddd %%mm3,%%mm1 \n\t" \
       "punpckhdq %%mm5,%%mm5 # get hi to lo \n\t" \
       "movd %%mm5,%%edi \n\t" \
       "movq %%mm1,%[Deltas](,%[nr],8)\n\t" \
       "subb %%al,%[feldEnd](%%esi)\n\t" \
       "subb %%al,%[feldEnd](%%edi)\n\t" \
       "sub $1,%[nr] \n\t" \
       "3: emms \n" \
       : [nr] "+r" (nr)
       : [SieveWeight] "o" (SieveControl::log_PrimeNumbers[0]),
         [PrimeNumbers] "o" (PrimeNumbers[0]),
         [Deltas] "o" (LocalPhysInterval_Delta_of_PrimeNumbers[0][0]),
         [feldEnd] "o" (SieveArray[PhysicalSieveSize]),
         [PhSiSi] "i" (PhysicalSieveSize)
       : "cc", "memory", "eax", "edx", "esi", "edi", "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7");

#elif 1 && defined(SIEBASM_386) && (TSIEVEELEMENTSIZE == 1) && defined(ASM_MMX)
// code containing optimizations for MMX compatible machines
 #ifdef DEBUG
  #warning "experimental static factor sieving code for MMX compatible machines"
 #endif
  asm volatile ( \
       "movl %[PhSiSi],%%edx \n\t" \
       "movd %%edx,%%mm7 \n\t" \
       "punpckldq %%mm7,%%mm7 # lo/hi->lo/lo \n\t" \
       "jmp 2f \n\t" \
       "1: movq %[Deltas]-8(,%[nr],8),%%mm0\n\t" \
       "movq %[Deltas](,%[nr],8),%%mm1\n\t" \
       "pcmpgtd %%mm4,%%mm4 # set mm4 to 0xffff... \n\t" \
       "pcmpgtd %%mm5,%%mm5 # set mm5 to 0xffff... \n\t" \
       "movq %[PrimeNumbers]-4(,%[nr],4),%%mm2 \n\t" \
       "movw %[SieveWeight]-1(%[nr]),%%ax\n\t" \
       "sub $2,%[nr] \n\t" \
       "movq %%mm2,%%mm3 \n\t" \
       "psubd %%mm7,%%mm0 \n\t" \
       "psubd %%mm7,%%mm1 \n\t" \
       "punpckldq %%mm2,%%mm2 # lo/hi->lo/lo \n\t" \
       "punpckhdq %%mm3,%%mm3 # lo/hi->hi/hi \n\t" \
       "pcmpgtd %%mm0,%%mm4 \n\t" \
       "pcmpgtd %%mm1,%%mm5 \n\t" \
       "pand %%mm4,%%mm2 \n\t" \
       "pand %%mm5,%%mm3 \n\t" \
       "pand %%mm0,%%mm4 \n\t" \
       "pand %%mm1,%%mm5 \n\t" \
       "#prefetcht2 %[Deltas]-256(,%[nr],8) # tuning cache!! \n\t" \
       "#prefetcht2 %[PrimeNumbers]-128(,%[nr],4)# tuning cache!! \n\t" \
       "movd %%mm4,%%esi \n\t" \
       "movd %%mm5,%%edi \n\t" \
       "paddd %%mm2,%%mm0 \n\t" \
       "punpckhdq %%mm4,%%mm4 # get hi to lo \n\t" \
       "paddd %%mm3,%%mm1 \n\t" \
       "punpckhdq %%mm5,%%mm5 # get hi to lo \n\t" \
       "subb %%al,%[feldEnd](%%esi)\n\t" \
       "subb %%ah,%[feldEnd](%%edi)\n\t" \
       "movd %%mm4,%%esi \n\t" \
       "movd %%mm5,%%edi \n\t" \
       "movq %%mm0,%[Deltas]+8(,%[nr],8)\n\t" \
       "movq %%mm1,%[Deltas]+16(,%[nr],8)\n\t" \
       "subb %%al,%[feldEnd](%%esi)\n\t" \
       "subb %%ah,%[feldEnd](%%edi)\n\t" \
       "2: cmpl %%edx,%[PrimeNumbers]-4(,%[nr],4)\n\t" \
       "jge 1b \n\t" \
       "cmpl %%edx,%[PrimeNumbers](,%[nr],4)\n\t" \
       "jl 3f \n\t" \
       "movq %[Deltas](,%[nr],8),%%mm1\n\t" \
       "pcmpgtd %%mm5,%%mm5 # set mm5 to 0xffff... \n\t" \
       "movq %[PrimeNumbers](,%[nr],4),%%mm3 \n\t" \
       "movb %[SieveWeight](%[nr]),%%al\n\t" \
       "psubd %%mm7,%%mm1 \n\t" \
       "punpckldq %%mm3,%%mm3 # lo/hi->lo/lo \n\t" \
       "pcmpgtd %%mm1,%%mm5 \n\t" \
       "pand %%mm5,%%mm3 \n\t" \
       "pand %%mm1,%%mm5 \n\t" \
       "movd %%mm5,%%esi \n\t" \
       "paddd %%mm3,%%mm1 \n\t" \
       "punpckhdq %%mm5,%%mm5 # get hi to lo \n\t" \
       "movd %%mm5,%%edi \n\t" \
       "movq %%mm1,%[Deltas](,%[nr],8)\n\t" \
       "subb %%al,%[feldEnd](%%esi)\n\t" \
       "subb %%al,%[feldEnd](%%edi)\n\t" \
       "sub $1,%[nr] \n\t" \
       "3: emms \n" \
       : [nr] "+r" (nr)
       : [SieveWeight] "o" (SieveControl::log_PrimeNumbers[0]),
         [PrimeNumbers] "o" (PrimeNumbers[0]),
         [Deltas] "o" (LocalPhysInterval_Delta_of_PrimeNumbers[0][0]),
         [feldEnd] "o" (SieveArray[PhysicalSieveSize]),
         [PhSiSi] "i" (PhysicalSieveSize)
       : "cc", "memory", "eax", "edx", "esi", "edi", "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7");

#else
  // generic version
  // larger prime numbers (maximal 1 hit per Delta) 
  for (;; --nr) 
    {
      register int Primzahl=PrimeNumbers[nr];
      if (Primzahl<PhysicalSieveSize) break;

      register TSieveElement log_Primzahl = SieveControl::PrimzahlSieveWeight(nr);
      /* i.e. log_Primzahl gets the value (TSieveElement) (log_SieveEntryMultiplier*log(Primzahl)) */

      // d0/d1-calculation
      register int d0,d1;
      
      d0=LocalPhysInterval_Delta_of_PrimeNumbers[nr][0];
      d1=LocalPhysInterval_Delta_of_PrimeNumbers[nr][1];

      // hint: PhysicalSieveSize is also a dummy position inside the sieve;
      //       writing to a dummy avoids expensive branch instructions.
      SieveArray[d0<PhysicalSieveSize ? d0 : PhysicalSieveSize]-=log_Primzahl;
      SieveArray[d1<PhysicalSieveSize ? d1 : PhysicalSieveSize]-=log_Primzahl;

      d0-=PhysicalSieveSize; if (d0<0) d0+=Primzahl;
      d1-=PhysicalSieveSize; if (d1<0) d1+=Primzahl;

      LocalPhysInterval_Delta_of_PrimeNumbers[nr][0]=d0;
      LocalPhysInterval_Delta_of_PrimeNumbers[nr][1]=d1;
    }
#endif

  // smaller primes (many hits per Delta)
  const int NonDirtyIndex = SieveControl::FBStartIndex();
  for (; nr>=NonDirtyIndex; --nr) 
    {
      register int Primzahl=PrimeNumbers[nr];
      register TSieveElement log_Primzahl = SieveControl::PrimzahlSieveWeight(nr);
      /* i.e. log_Primzahl gets the value (TSieveElement) (log_SieveEntryMultiplier*log(Primzahl)) */

      // d0/d1-calculation
      register int d0,d1;
      
      d0=LocalPhysInterval_Delta_of_PrimeNumbers[nr][0];
      d1=LocalPhysInterval_Delta_of_PrimeNumbers[nr][1];

#ifdef SIEBASM_386
      asm_sieb_small(log_Primzahl, d0, d1, Primzahl);
#else
      if (d0>d1) { register int h=d0; d0=d1; d1=h; }
      // now: d0<=d1
      while (d1<PhysicalSieveSize)
       {
         SieveArray[d0]-=log_Primzahl; d0+=Primzahl;
         SieveArray[d1]-=log_Primzahl; d1+=Primzahl;
       }
      if (d0<PhysicalSieveSize)
       {
         SieveArray[d0]-=log_Primzahl; d0+=Primzahl;
       }
#endif
      d0-=PhysicalSieveSize; if (d0<0) d0+=Primzahl;
      d1-=PhysicalSieveSize; if (d1<0) d1+=Primzahl;

      LocalPhysInterval_Delta_of_PrimeNumbers[nr][0]=d0;
      LocalPhysInterval_Delta_of_PrimeNumbers[nr][1]=d1;
    }

  // higher powers of primes
  for (nr=0; nr<NumberOf_more_PrimePowers; nr++)
    {
      register int PPotenz = PrimePowers[nr];
      register TSieveElement log_Primzahl = log_Primzahl_of_PrimePowers[nr];
      /* i.e. log_Primzahl gets the value (TSieveElement) (log_SieveEntryMultiplier*log(Primzahl)) */
      // notice, that: if we sieve with these primes, then not log(Primzahl^k),
      // but only log(Primzahl) has to be subtracted!
      // Analogy: If you have already divided by p, then you would ending in dividing by p^3 all together,
      //          should you decide to divide by p^2 instead of p!

      // d0/d1-calculation
      register int d0,d1;
      
      d0=LocalPhysInterval_Delta_of_PrimePowers[nr][0];
      d1=LocalPhysInterval_Delta_of_PrimePowers[nr][1];

#ifdef SIEBASM_386
      asm_sieb(log_Primzahl, d0, d1, PPotenz);
#else
      if (d0>d1) { register int h=d0; d0=d1; d1=h; }
      // now: d0<=d1
      while (d1<PhysicalSieveSize)
       {
         SieveArray[d0]-=log_Primzahl; d0+=PPotenz;
         SieveArray[d1]-=log_Primzahl; d1+=PPotenz;
       }
      if (d0<PhysicalSieveSize)
       {
         SieveArray[d0]-=log_Primzahl; d0+=PPotenz;
       }
#endif
      d0-=PhysicalSieveSize; if (d0<0) d0+=PPotenz;
      d1-=PhysicalSieveSize; if (d1<0) d1+=PPotenz;

      LocalPhysInterval_Delta_of_PrimePowers[nr][0]=d0;
      LocalPhysInterval_Delta_of_PrimePowers[nr][1]=d1;
    }

}


void do_scanning_Sieve(void)
{
  //cout << "Auslesen des (gesiebten) Siebintervalls" << endl;

  // only a small fraction of sieve entries are expected to be hits!
  //  --> search for hits can be accelerated!
  // Constraint: the 64 positions below the sieve (SieveArray[-64]..SieveArray[-1]) are already set to -1.
  register int d=PhysicalSieveSize;
  do
   {
#ifdef SIEBASM_386
     asm_search_sieb(d); // search for hits...
     //if (d>=0 && SieveArray[d]<0) cout << "hit for position " << d << endl;
     //else cout << "??? for " << d << endl; // d==-1 is okay...
#else
     while (SieveArray[--d]>=0);; // search for hit...
#endif
     if (d>=0) // check whether the detected hit is a provoked dummy-hit to exit the loop.
      {
        // a regular hit
        if (SieveControl::Hit_after_DirtyCorrection(SieveOffset+d,SieveArray[d]))
          StaticRelations::insert(new CRelation(SieveOffset+d)); // insert a possible relation
      }
   } while (d>0); // as long as more hits are possible: continue search
// Werteverteilung_im_Sieb_ausgeben();
}


#if defined(ASM_SSE2) /* special code using SSE2 instructions (P4,Athlon-64) */

void CDeltaComputations::GetStaticFactorHits(std::vector<unsigned int> &Factors, const signed int SievePos)
{
  // find factors in static factorbase
  // (starting with the 3. prime of the FB (see above) )

  // (Pentium4) SSE2 specific code
 #ifdef DEBUG
  #warning "Pentium4/SSE2 specific code to detect static factors"
 #endif
  typedef int v4sf __attribute__ ((vector_size (16))); // 16 bytes -> 4 short floats
  register v4sf XMM6 asm("xmm6"); // XMM6 is now bound to %xmm6 (intention: signedness correction)
  register v4sf XMM7 asm("xmm7"); // XMM7 is now bound to %xmm7

   // refer GCC manual, "Specifying registers for local variables";
   // but remember: that does not necessarily mean, that %xmm7 cannot be
   // used for other variables as well! (It is only guaranteed, that if we access
   // XMM7, this access is synonym to accessing %xmm7; but accessing %xmm7 is not necessarily
   // synonym to accessing XMM7 (because if the compiler assumes, that XMM7 is dead, %xmm7 could
   // be used for other variables; and the compiler does *not* know, that we want
   // XMM7 to be alive by accessing %xmm7!).

  asm volatile ( \
       "emms                       # enter mmx\n\t" \
       "movd %[Sieboffs],%%mm0   # -      mm0=(0,Sieboffs)\n\t" \
       "xorps %[_xmm6],%[_xmm6]    # L2AM   clear mm6 \n\t" \
       "punpckldq %%mm0,%%mm0  # L2AM   (0,Sieboffs) -> (Sieboffs,Sieboffs) \n\t" \
       "cvtpi2ps %%mm0,%[_xmm7]  # packed int to float \n\t" \
       "shufps $0x44,%[_xmm7],%[_xmm7] \n\t" \
       "cmpnleps %[_xmm7],%[_xmm6] # L2AM   set bits if signed, clear bits if unsigned" \
       : [_xmm7] "=x" (XMM7), [_xmm6] "=x" (XMM6) : [Sieboffs] "rm" (SievePos) : "mm0");

  // PrimeNumbers[0] and PrimeNumbers[1] get left out (since special treatment is necessary)
  // PrimeNumbers[2] und PrimeNumbers[3] get left out (so that the remaining number of primes is divisible by 4)
  {
     register int d = normalized_signed_mod(SievePos,PrimeNumbers[2],PrimeNumberReciprocals[2]);
     if (d==Delta_of_PrimeNumbers[2][0] || d==Delta_of_PrimeNumbers[2][1]) Factors.push_back(2);
     d = normalized_signed_mod(SievePos,PrimeNumbers[3],PrimeNumberReciprocals[3]);
     if (d==Delta_of_PrimeNumbers[3][0] || d==Delta_of_PrimeNumbers[3][1]) Factors.push_back(3);
  }
  
  int nr;
  for (nr=4; nr<StaticFactorbase::Size() && PrimeNumbers[nr]<PhysicalSieveSize; nr+=4) 
    {
      register unsigned int pflags;
      asm ( \
       "movaps %[_xmm7],%%xmm2 \n\t" \
       "cvtdq2ps %[divisors](,%[i],4),%%xmm1 # convert 4 packed int to float \n\t" \
       "mulps %[FloatRecips](,%[i],4),%%xmm2 \n\t" \
       "movaps %%xmm1,%%xmm3 \n\t" \
       "cvttps2dq %%xmm2,%%xmm0 \n\t" \
       "movaps %[_xmm7],%%xmm2 \n\t" \
       "cvtdq2ps %%xmm0,%%xmm0 \n\t" \
       "mulps %%xmm1,%%xmm0 \n\t" \
       "subps %%xmm0,%%xmm2 \n\t" \
       "andps %[_xmm6],%%xmm1     # correct sign (part 1) \n\t" \
       "addps %%xmm1,%%xmm2       # correct sign (part 2) \n\t" \
       "cmpneqps %%xmm2,%%xmm3    #  assure modp=0 instead of modp=p \n\t" \
       "andps %%xmm3,%%xmm2   \n\t" \
       "cvttps2dq %%xmm2,%%xmm3 \n\t" \
       "pshufd $0x50,%%xmm3,%%xmm0 \n\t" \
       "pshufd $0xfa,%%xmm3,%%xmm1 \n\t" \
       "pcmpeqd %[Deltas](,%[i],8),%%xmm0 \n\t" \
       "pcmpeqd 16+%[Deltas](,%[i],8),%%xmm1 \n\t" \
       "packssdw %%xmm1,%%xmm0 \n\t" \
       "pmovmskb %%xmm0,%[pflags] # create bytemask" \
       : [pflags] "=r" (pflags)
       : [divisors] "o" (PrimeNumbers[0]), [Deltas] "o" (Delta_of_PrimeNumbers[0][0]),
         [FloatRecips] "o" (PrimeNumberFloatReciprocals[0]),
         [_xmm7] "x" (XMM7), [_xmm6] "x" (XMM6), [i] "r" (nr)
       : "xmm0", "xmm1", "xmm2", "xmm3");

      //cout << "SievePos: " << SievePos << ": " << PackedResult[0] << "," << PackedResult[1] << endl;
 
#if defined(DEBUG)
       {
         // check pflags for correct byte mask
         int d=normalized_signed_mod(SievePos,PrimeNumbers[nr]);
         if ( (d==Delta_of_PrimeNumbers[nr][0]) != ((pflags&0x0003)==0x0003) ) { MARK; cerr << nr << "!!" << (pflags&0x0003) << endl; exit(7); }
         if ( (d==Delta_of_PrimeNumbers[nr][1]) != ((pflags&0x000c)==0x000c) ) { MARK; cerr << nr << "!!" << (pflags&0x000c) << endl; exit(7); }
         d=normalized_signed_mod(SievePos,PrimeNumbers[nr+1]);
         if ( (d==Delta_of_PrimeNumbers[nr+1][0]) != ((pflags&0x0030)==0x0030) ) { MARK; cerr << nr+1 << "!!" << (pflags&0x0030) << endl; exit(7); }
         if ( (d==Delta_of_PrimeNumbers[nr+1][1]) != ((pflags&0x00c0)==0x00c0) ) { MARK; cerr << nr+1 << "!!" << (pflags&0x00c0) << endl; exit(7); }
         d=normalized_signed_mod(SievePos,PrimeNumbers[nr+2]);
         if ( (d==Delta_of_PrimeNumbers[nr+2][0]) != ((pflags&0x0300)==0x0300) ) { MARK; cerr << nr+2 << "!!" << (pflags&0x0300) << endl; exit(7); }
         if ( (d==Delta_of_PrimeNumbers[nr+2][1]) != ((pflags&0x0c00)==0x0c00) ) { MARK; cerr << nr+2 << "!!" << (pflags&0x0c00) << endl; exit(7); }
         d=normalized_signed_mod(SievePos,PrimeNumbers[nr+3]);
         if ( (d==Delta_of_PrimeNumbers[nr+3][0]) != ((pflags&0x3000)==0x3000) ) { MARK; cerr << nr+3 << "!!" << (pflags&0x3000) << endl; exit(7); }
         if ( (d==Delta_of_PrimeNumbers[nr+3][1]) != ((pflags&0xc000)==0xc000) ) { MARK; cerr << nr+3 << "!!" << (pflags&0xc000) << endl; exit(7); }
       }
#endif
      if ( pflags & 0x000f ) Factors.push_back(nr);
      if ( pflags & 0x00f0 ) Factors.push_back(nr+1);
      if ( pflags & 0x0f00 ) Factors.push_back(nr+2);
      if ( pflags & 0xf000 ) Factors.push_back(nr+3);

    }

  const signed int mySievePos = SievePos-SieveOffset-PhysicalSieveSize;
  asm volatile ( \
       "movd %[mySievePos],%[_xmm7]   # -      mm0=(0,mySiebPos)\n\t" \
       "shufps $0x00,%[_xmm7],%[_xmm7] \n\t" \
       : [_xmm7] "=x" (XMM7) : [mySievePos] "r" (mySievePos));

  for (; nr<StaticFactorbase::Size(); nr+=4)
   {
      register unsigned int pflags;
      asm ( \
       "movdqa %[PrimeNumbers](,%[i],4),%%xmm2   # L2AMS  get y : x (as packed int)\n\t" \
       "paddd %[_xmm7],%%xmm2        # L2AM \n\t" \
       "movdqa 16+%[LocDeltas](,%[i],8),%%xmm3 \n\t" \
       "pshufd $0x50,%%xmm2,%%xmm0 \n\t" \
       "pshufd $0xfa,%%xmm2,%%xmm1 \n\t" \
       "pcmpeqd %[LocDeltas](,%[i],8),%%xmm0 \n\t" \
       "pcmpeqd %%xmm3,%%xmm1 \n\t" \
       "packssdw %%xmm1,%%xmm0 \n\t" \
       "pmovmskb %%xmm0,%[pflags] # create bytemask" \
       : [pflags] "=r" (pflags)
       : [PrimeNumbers] "o" (PrimeNumbers[0]), [LocDeltas] "o" (LocalPhysInterval_Delta_of_PrimeNumbers[0][0]),
         [_xmm7] "x" (XMM7), [i] "r" (nr)
       : "xmm0", "xmm1", "xmm2", "xmm3");

#if 0 || defined(DEBUG)
       {
         // check pflags for correct byte mask
         int d = mySievePos+PrimeNumbers[nr];
         if ( (d==LocalPhysInterval_Delta_of_PrimeNumbers[nr][0]) != ((pflags&0x0003)==0x0003) ) { MARK; cerr << nr << "!!" << (pflags&0x0003) << endl; exit(7); }
         if ( (d==LocalPhysInterval_Delta_of_PrimeNumbers[nr][1]) != ((pflags&0x000c)==0x000c) ) { MARK; cerr << nr << "!!" << (pflags&0x000c) << endl; exit(7); }
         d=mySievePos+PrimeNumbers[nr+1];
         if ( (d==LocalPhysInterval_Delta_of_PrimeNumbers[nr+1][0]) != ((pflags&0x0030)==0x0030) ) { MARK; cerr << nr+1 << "!!" << (pflags&0x0030) << endl; exit(7); }
         if ( (d==LocalPhysInterval_Delta_of_PrimeNumbers[nr+1][1]) != ((pflags&0x00c0)==0x00c0) ) { MARK; cerr << nr+1 << "!!" << (pflags&0x00c0) << endl; exit(7); }
         d=mySievePos+PrimeNumbers[nr+2];
         if ( (d==LocalPhysInterval_Delta_of_PrimeNumbers[nr+2][0]) != ((pflags&0x0300)==0x0300) ) { MARK; cerr << nr+2 << "!!" << (pflags&0x0300) << endl; exit(7); }
         if ( (d==LocalPhysInterval_Delta_of_PrimeNumbers[nr+2][1]) != ((pflags&0x0c00)==0x0c00) ) { MARK; cerr << nr+2 << "!!" << (pflags&0x0c00) << endl; exit(7); }
         d=mySievePos+PrimeNumbers[nr+3];
         if ( (d==LocalPhysInterval_Delta_of_PrimeNumbers[nr+3][0]) != ((pflags&0x3000)==0x3000) ) { MARK; cerr << nr+3 << "!!" << (pflags&0x3000) << endl; exit(7); }
         if ( (d==LocalPhysInterval_Delta_of_PrimeNumbers[nr+3][1]) != ((pflags&0xc000)==0xc000) ) { MARK; cerr << nr+3 << "!!" << (pflags&0xc000) << endl; exit(7); }
       }
#endif
      if (pflags==0) continue;
      if ( pflags & 0x000f ) Factors.push_back(nr);
      if ( pflags & 0x00f0 ) Factors.push_back(nr+1);
      if ( pflags & 0x0f00 ) Factors.push_back(nr+2);
      if ( pflags & 0xf000 ) Factors.push_back(nr+3);
   }
  asm volatile ("emms              # exit mmx");

}
// end of SSE2 optimized version


#elif defined(ASM_SSE) /* special code using SSE instructions (P3,Athlon-XP) */

void CDeltaComputations::GetStaticFactorHits(std::vector<unsigned int> &Factors, const signed int SievePos)
{
  // find factors in static factorbase
  // (starting with the 3. prime of the FB (see above) )

  // (Pentium3) SSE specific code
 #ifdef DEBUG
  #warning "Pentium3/SSE specific code to detect static factors"
 #endif
  typedef int v4sf __attribute__ ((vector_size (16))); // 16 bytes -> 4 short floats
  register v4sf XMM6 asm("xmm6"); // XMM6 is now bound to %xmm6 (intention: signedness correction)
  register v4sf XMM7 asm("xmm7"); // XMM7 is now bound to %xmm7

   // refer GCC manual, "Specifying registers for local variables";
   // but remember: that does not necessarily mean, that %xmm7 cannot be
   // used for other variables as well! (It is only guaranteed, that if we access
   // XMM7, this access is synonym to accessing %xmm7; but accessing %xmm7 is not necessarily
   // synonym to accessing XMM7 (because if the compiler assumes, that XMM7 is dead, %xmm7 could
   // be used for other variables; and the compiler does *not* know, that we want
   // XMM7 to be alive by accessing %xmm7!).

  asm volatile ( \
       "emms                       # enter mmx\n\t" \
       "movd %[Sieboffs],%%mm0   # -      mm0=(0,Sieboffs)\n\t" \
       "xorps %[_xmm6],%[_xmm6]    # L2AM   clear mm6 \n\t" \
       "punpckldq %%mm0,%%mm0  # L2AM   (0,Sieboffs) -> (Sieboffs,Sieboffs) \n\t" \
       "cvtpi2ps %%mm0,%[_xmm7]  # packed int to float \n\t" \
       "shufps $0x44,%[_xmm7],%[_xmm7] \n\t" \
       "cmpnleps %[_xmm7],%[_xmm6] # L2AM   set bits if signed, clear bits if unsigned" \
       : [_xmm7] "=x" (XMM7), [_xmm6] "=x" (XMM6) : [Sieboffs] "rm" (SievePos) : "mm0");

  // PrimeNumbers[0] and PrimeNumbers[1] get left out (since special treatment is necessary)
  // PrimeNumbers[2] und PrimeNumbers[3] get left out (so that the remaining number of primes is divisible by 4)
  {
     register int d = normalized_signed_mod(SievePos,PrimeNumbers[2],PrimeNumberReciprocals[2]);
     if (d==Delta_of_PrimeNumbers[2][0] || d==Delta_of_PrimeNumbers[2][1]) Factors.push_back(2);
     d = normalized_signed_mod(SievePos,PrimeNumbers[3],PrimeNumberReciprocals[3]);
     if (d==Delta_of_PrimeNumbers[3][0] || d==Delta_of_PrimeNumbers[3][1]) Factors.push_back(3);
  }
  
  int nr;
  for (nr=4; nr<StaticFactorbase::Size() && PrimeNumbers[nr]<PhysicalSieveSize; nr+=4) 
    {
      register unsigned int pflags, pflags2;
      asm ( \
       "movaps %[_xmm7],%%xmm2 \n\t" \
       "movq %[divisors](,%[i],4),%%mm4   # L2AMS  get y : x (as packed int) \n\t" \
       "movq 8+%[divisors](,%[i],4),%%mm5   # L2AMS  get y : x (as packed int) \n\t" \
       "cvtpi2ps %%mm4,%%xmm0 # convert packed int to float \n\t" \
       "cvtpi2ps %%mm5,%%xmm1 # convert packed int to float \n\t" \
       "mulps %[FloatRecips](,%[i],4),%%xmm2 \n\t" \
       "movlhps %%xmm1,%%xmm0 \n\t" \
       "movaps %%xmm0,%%xmm1 \n\t" \
       "movaps %%xmm0,%%xmm3 \n\t" \
       "movhlps %%xmm2,%%xmm4 \n\t" \
       "cvttps2pi %%xmm2,%%mm0 \n\t" \
       "cvttps2pi %%xmm4,%%mm1 \n\t" \
       "movaps %[_xmm7],%%xmm2 \n\t" \
       "cvtpi2ps %%mm0,%%xmm0 \n\t" \
       "cvtpi2ps %%mm1,%%xmm4 \n\t" \
       "movlhps %%xmm4,%%xmm0 \n\t" \
       "mulps %%xmm1,%%xmm0 \n\t" \
       "subps %%xmm0,%%xmm2 \n\t" \
       "andps %[_xmm6],%%xmm1      # correct sign (part 1) \n\t" \
       "addps %%xmm1,%%xmm2        # correct sign (part 2) \n\t" \
       "movaps %%xmm2,%%xmm0 \n\t" \
       "cmpneqps %%xmm3,%%xmm0      #  assure modp=0 instead of modp=p \n\t" \
       "andps %%xmm0,%%xmm2   \n\t" \
       "movhlps %%xmm2,%%xmm4 \n\t" \
       "cvttps2pi %%xmm2,%%mm0 \n\t" \
       "pshufw $0x44,%%mm0,%%mm1   # L2AM \n\t" \
       "pshufw $0xee,%%mm0,%%mm2   # L2AM \n\t" \
       "pcmpeqd %[Deltas](,%[i],8),%%mm1   # L2AM \n\t" \
       "pcmpeqd 8+%[Deltas](,%[i],8),%%mm2 # L2AM \n\t" \
       "packssdw %%mm2,%%mm1       # L2AM  pack comparison results together \n\t" \
       "pmovmskb %%mm1,%[pflags]   # L6+ (VectorPath)  into bytemask \n\t" \
       "cvttps2pi %%xmm4,%%mm0 \n\t" \
       "pshufw $0x44,%%mm0,%%mm1   # L2AM \n\t" \
       "pshufw $0xee,%%mm0,%%mm2   # L2AM \n\t" \
       "pcmpeqd 16+%[Deltas](,%[i],8),%%mm1 # L2AM \n\t" \
       "pcmpeqd 24+%[Deltas](,%[i],8),%%mm2 # L2AM \n\t" \
       "packssdw %%mm2,%%mm1       # L2AM  pack comparison results together \n\t" \
       "pmovmskb %%mm1,%[pflags2]  # L6+ (VectorPath)  into bytemask" \
       : [pflags] "=r" (pflags), [pflags2] "=r" (pflags2)
       : [divisors] "o" (PrimeNumbers[0]), [Deltas] "o" (Delta_of_PrimeNumbers[0][0]),
         [FloatRecips] "o" (PrimeNumberFloatReciprocals[0]),
         [_xmm7] "x" (XMM7), [_xmm6] "x" (XMM6), [i] "r" (nr)
       : "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");

      //cout << "SievePos: " << SievePos << ": " << PackedResult[0] << "," << PackedResult[1] << endl;
 
#if defined(DEBUG)
       {
         // check pflags for correct byte mask
         const int d=normalized_signed_mod(SievePos,PrimeNumbers[nr]);
         if ( (d==Delta_of_PrimeNumbers[nr][0]) != ((pflags&0x03)==0x03) ) { MARK; cerr << nr << "!!" << (pflags&0x03) << endl; exit(7); }
         if ( (d==Delta_of_PrimeNumbers[nr][1]) != ((pflags&0x0c)==0x0c) ) { MARK; cerr << nr << "!!" << (pflags&0x0c) << endl; exit(7); }
       }
#endif
      if ( pflags & 0x0f ) Factors.push_back(nr);

#if defined(DEBUG)
       {
         // check pflags for correct byte mask
         const int d=normalized_signed_mod(SievePos,PrimeNumbers[nr+1]);
         if ( (d==Delta_of_PrimeNumbers[nr+1][0]) != ((pflags&0x30)==0x30) ) { MARK; cerr << nr+1 << "!!" << (pflags&0x30) << endl; exit(7); }
         if ( (d==Delta_of_PrimeNumbers[nr+1][1]) != ((pflags&0xc0)==0xc0) ) { MARK; cerr << nr+1 << "!!" << (pflags&0xc0) << endl; exit(7); }
       }
#endif
      if ( pflags & 0xf0 ) Factors.push_back(nr+1);

#if defined(DEBUG)
       {
         // check pflags2 for correct byte mask
         const int d=normalized_signed_mod(SievePos,PrimeNumbers[nr+2]);
         if ( (d==Delta_of_PrimeNumbers[nr+2][0]) != ((pflags2&0x03)==0x03) ) { MARK; cerr << nr+2 << "!!" << (pflags2&0x03) << endl; exit(7); }
         if ( (d==Delta_of_PrimeNumbers[nr+2][1]) != ((pflags2&0x0c)==0x0c) ) { MARK; cerr << nr+2 << "!!" << (pflags2&0x0c) << endl; exit(7); }
       }
#endif
      if ( pflags2 & 0x0f ) Factors.push_back(nr+2);

#if defined(DEBUG)
       {
         // check pflags2 for correct byte mask
         const int d=normalized_signed_mod(SievePos,PrimeNumbers[nr+3]);
         if ( (d==Delta_of_PrimeNumbers[nr+3][0]) != ((pflags2&0x30)==0x30) ) { MARK; cerr << nr+3 << "!!" << (pflags2&0x30) << endl; exit(7); }
         if ( (d==Delta_of_PrimeNumbers[nr+3][1]) != ((pflags2&0xc0)==0xc0) ) { MARK; cerr << nr+3 << "!!" << (pflags2&0xc0) << endl; exit(7); }
       }
#endif
      if ( pflags2 & 0xf0 ) Factors.push_back(nr+3);
    }

  const signed int mySievePos = SievePos-SieveOffset-PhysicalSieveSize;
  typedef int v2di __attribute__ ((vector_size (8))); // 8 bytes -> 2 double words
  register v2di MM7 asm("mm7"); // MM7 is now bound to %mm7

  asm volatile ( \
       "movd %[mySievePos],%%mm7   # -      mm0=(0,mySiebPos)\n\t" \
       "punpckldq %%mm7,%%mm7  # L2AM   (0,mySievePos) -> (mySievePos,mySievePos) \n\t"
       : [_mm7] "=y" (MM7) : [mySievePos] "r" (mySievePos));

  for (; nr<StaticFactorbase::Size(); nr+=2)
   {
      register unsigned int pflags;
      asm ( \
       "movq %[PrimeNumbers](,%[i],4),%%mm0   # L2AMS  get y : x (as packed int)\n\t" \
       "paddd %[_mm7],%%mm0        # L2AM \n\t" \
       "pshufw $0x44,%%mm0,%%mm1   # L2AM \n\t" \
       "pshufw $0xee,%%mm0,%%mm2   # L2AM \n\t" \
       "pcmpeqd %[LocDeltas](,%[i],8),%%mm1   # L2AM \n\t" \
       "pcmpeqd 8+%[LocDeltas](,%[i],8),%%mm2 # L2AM \n\t" \
       "packssdw %%mm2,%%mm1       # L2AM  pack comparison results together \n\t" \
       "pmovmskb %%mm1,%[pflags]   # L6+ (VectorPath)  into bytemask" \
       : [pflags] "=r" (pflags)
       : [PrimeNumbers] "o" (PrimeNumbers[0]), [LocDeltas] "o" (LocalPhysInterval_Delta_of_PrimeNumbers[0][0]),
         [_mm7] "y" (MM7), [i] "r" (nr)
       : "mm0", "mm1", "mm2");

#if 0 || defined(DEBUG)
       {
         // check pflags for correct byte mask
         register const int d = mySievePos+PrimeNumbers[nr];
         if ( (d==LocalPhysInterval_Delta_of_PrimeNumbers[nr][0]) != ((pflags&0x03)==0x03) ) { MARK; exit(7); }
         if ( (d==LocalPhysInterval_Delta_of_PrimeNumbers[nr][1]) != ((pflags&0x0c)==0x0c) ) { MARK; exit(7); }
       }
#endif
      if ( pflags & 0x0f ) Factors.push_back(nr);

#if 0 || defined(DEBUG)
       {
         // check pflags for correct byte mask
         register const int d = mySievePos+PrimeNumbers[nr+1];
         if ( (d==LocalPhysInterval_Delta_of_PrimeNumbers[nr+1][0]) != ((pflags&0x30)==0x30) ) { MARK; exit(7); }
         if ( (d==LocalPhysInterval_Delta_of_PrimeNumbers[nr+1][1]) != ((pflags&0xc0)==0xc0) ) { MARK; exit(7); }
       }
#endif
      if ( pflags & 0xf0 ) Factors.push_back(nr+1);

   }
  asm volatile ("emms              # exit mmx");

}
// end of SSE optimized version


#elif defined(ASM_ATHLON) /* special code for AMD Athlon 3DNow! */

void CDeltaComputations::GetStaticFactorHits(std::vector<unsigned int> &Factors, const signed int SievePos)
{
  // find factors in static factorbase
  // (beginnend mit der 3. Primzahl der FB (s.o) )

  // Athlon 3DNow! specific code
 #ifdef DEBUG
  #warning "AMD Athlon 3DNow! specific code to detect static factors"
 #endif
  typedef int v2sf __attribute__ ((vector_size (8))); // 8 bytes -> 2 short floats
  register v2sf MM7 asm("mm7"); // MM7 is now bound to %mm7
  register v2sf MM6 asm("mm6"); // MM6 is now bound to %mm6 (intention: signedness correction)

   // refer GCC manual, "Specifying registers for local variables";
   // but remember: that does not necessarily mean, that %mm7 cannot be
   // used for other variables as well! (It is only guaranteed, that if we access
   // MM7, this access is synonym to accessing %mm7; but accessing %mm7 is not necessarily
   // synonym to accessing MM7 (because if the compiler assumes, that MM7 is dead, %mm7 could
   // be used for other variables; and the compiler does *not* know, that we want
   // MM7 to be alive by accessing %mm7!).

  asm volatile ( \
       "femms                      # L2AMS  fast enter mmx\n\t" \
       "movd %[Sieboffs],%[_mm7]   # -      MM7=(0,Sieboffs)\n\t" \
       "punpckldq %[_mm7],%[_mm7]  # L2AM   (0,Sieboffs) -> (Sieboffs,Sieboffs) \n\t" \
       "movq %[_mm7],%%mm0         # L2AM \n\t" \
       "pxor %[_mm6],%[_mm6]       # L2AM   clear mm6 \n\t" \
       "pi2fd %[_mm7],%[_mm7]      # L4A    int to float \n\t" \
       "pcmpgtd %%mm0,%%mm6        # L2AM   set bits if signed, clear bits if unsigned" \
       : [_mm7] "=y" (MM7), [_mm6] "=y" (MM6) : [Sieboffs] "rm" (SievePos) : "mm0");

  // PrimeNumbers[0] und PrimeNumbers[1] are left out (because they need special treatment)
  int nr;
  for (nr=2; nr<StaticFactorbase::Size() && PrimeNumbers[nr]<PhysicalSieveSize; nr+=2) 
    {
      register unsigned int pflags;
      asm ( \
       "movq %[_mm7],%%mm2         # L2AMS \n\t" \
       "movq %[divisors](,%[i],4),%%mm4   # L2AMS  get y : x (as packed int)\n\t" \
       "#prefetch %[FloatRecips]+64(,%[i],4) \n\t" \
       "pfmul %[FloatRecips](,%[i],4),%%mm2 # L4M    z/y : w/x \n\t" \
       "pi2fd %%mm4,%%mm0          # L4A    get y : x (as packed float)\n\t" \
       "movq %%mm4,%%mm5           # L2AMS \n\t" \
       "pf2id %%mm2,%%mm1          # L4A    packed float to int \n\t" \
       "pand %%mm6,%%mm4           # L2AM   correct sign (part 1) \n\t" \
       "pi2fd %%mm1,%%mm1          # L4A    and back (->floor) \n\t" \
       "pfmul %%mm0,%%mm1          # L4M    multiply \n\t" \
       "pfsubr %[_mm7],%%mm1       # L4A    subtract \n\t" \
       "pf2id %%mm1,%%mm0          # L4A    ->modulo \n\t" \
       "paddd %%mm4,%%mm0          # L2AM   correct sign (part 2) \n\t" \
       "pcmpeqd %%mm0,%%mm5        # L2AM  assure modp=0 instead of modp=p \n\t" \
       "pandn %%mm0,%%mm5          # L2AM \n\t" \
       "pshufw $0x44,%%mm5,%%mm1   # L2AM \n\t" \
       "pshufw $0xee,%%mm5,%%mm2   # L2AM \n\t" \
       "pcmpeqd %[Deltas](,%[i],8),%%mm1   # L2AM \n\t" \
       "pcmpeqd 8+%[Deltas](,%[i],8),%%mm2 # L2AM \n\t" \
       "packssdw %%mm2,%%mm1       # L2AM  pack comparison results together \n\t" \
       "pmovmskb %%mm1,%[pflags]   # L6+ (VectorPath)  into bytemask" \
       : [pflags] "=r" (pflags)
       : [divisors] "o" (PrimeNumbers[0]), [FloatRecips] "o" (PrimeNumberFloatReciprocals[0]),
         [Deltas] "o" (Delta_of_PrimeNumbers[0][0]),
         [_mm7] "y" (MM7), [_mm6] "y" (MM6), [i] "r" (nr)
       : "mm0", "mm1", "mm2", "mm3", "mm4", "mm5");

      //cout << "SievePos: " << SievePos << ": " << PackedResult[0] << "," << PackedResult[1] << endl;
 
#ifdef DEBUG
       {
         // check pflags for correct byte mask
         const int d=normalized_signed_mod(SievePos,PrimeNumbers[nr]);
         if ( (d==Delta_of_PrimeNumbers[nr][0]) != ((pflags&0x03)==0x03) ) { MARK; exit(7); }
         if ( (d==Delta_of_PrimeNumbers[nr][1]) != ((pflags&0x0c)==0x0c) ) { MARK; exit(7); }
       }
#endif
      if ( pflags & 0x0f ) Factors.push_back(nr);

#ifdef DEBUG
       {
         // check pflags for correct byte mask
         const int d=normalized_signed_mod(SievePos,PrimeNumbers[nr+1]);
         if ( (d==Delta_of_PrimeNumbers[nr+1][0]) != ((pflags&0x30)==0x30) ) { MARK; exit(7); }
         if ( (d==Delta_of_PrimeNumbers[nr+1][1]) != ((pflags&0xc0)==0xc0) ) { MARK; exit(7); }
       }
#endif
      if ( pflags & 0xf0 ) Factors.push_back(nr+1);
    }

  const signed int mySievePos = SievePos-SieveOffset-PhysicalSieveSize;

  asm volatile ( \
       "movd %[mySievePos],%%mm7   # -      mm0=(0,mySiebPos)\n\t" \
       "punpckldq %%mm7,%%mm7  # L2AM   (0,mySievePos) -> (mySievePos,mySievePos) \n\t"
       : [_mm7] "=y" (MM7) : [mySievePos] "r" (mySievePos));

  for (; nr<StaticFactorbase::Size(); nr+=2)
   {
      register unsigned int pflags;
      asm ( \
       "movq %[PrimeNumbers](,%[i],4),%%mm0   # L2AMS  get y : x (as packed int)\n\t" \
       "paddd %[_mm7],%%mm0        # L2AM \n\t" \
       "pshufw $0x44,%%mm0,%%mm1   # L2AM \n\t" \
       "pshufw $0xee,%%mm0,%%mm2   # L2AM \n\t" \
       "pcmpeqd %[LocDeltas](,%[i],8),%%mm1   # L2AM \n\t" \
       "pcmpeqd 8+%[LocDeltas](,%[i],8),%%mm2 # L2AM \n\t" \
       "packssdw %%mm2,%%mm1       # L2AM  pack comparison results together \n\t" \
       "pmovmskb %%mm1,%[pflags]   # L6+ (VectorPath)  into bytemask" \
       : [pflags] "=r" (pflags)
       : [PrimeNumbers] "o" (PrimeNumbers[0]), [LocDeltas] "o" (LocalPhysInterval_Delta_of_PrimeNumbers[0][0]),
         [_mm7] "y" (MM7), [i] "r" (nr)
       : "mm0", "mm1", "mm2");

#if 0 || defined(DEBUG)
       {
         // check pflags for correct byte mask
         register const int d = mySievePos+PrimeNumbers[nr];
         if ( (d==LocalPhysInterval_Delta_of_PrimeNumbers[nr][0]) != ((pflags&0x03)==0x03) ) { MARK; exit(7); }
         if ( (d==LocalPhysInterval_Delta_of_PrimeNumbers[nr][1]) != ((pflags&0x0c)==0x0c) ) { MARK; exit(7); }
       }
#endif
      if ( pflags & 0x0f ) Factors.push_back(nr);

#if 0 || defined(DEBUG)
       {
         // check pflags for correct byte mask
         register const int d = mySievePos+PrimeNumbers[nr+1];
         if ( (d==LocalPhysInterval_Delta_of_PrimeNumbers[nr+1][0]) != ((pflags&0x30)==0x30) ) { MARK; exit(7); }
         if ( (d==LocalPhysInterval_Delta_of_PrimeNumbers[nr+1][1]) != ((pflags&0xc0)==0xc0) ) { MARK; exit(7); }
       }
#endif
      if ( pflags & 0xf0 ) Factors.push_back(nr+1);

   }
  asm volatile ("femms             # L2AMS  fast exit mmx");

}
// end of AMD Athlon 3DNow! optimized version

#elif defined(ASM_3DNOW) /* special code for 3DNow! (without extensions) */

void CDeltaComputations::GetStaticFactorHits(std::vector<unsigned int> &Factors, const signed int SievePos)
{
  // find factors in static factorbase
  // (beginnend mit der 3. Primzahl der FB (s.o) )

  // AMD-K6 3DNow! specific code (without using MMX/3DNow! extensions)
 #ifdef DEBUG
  #warning "AMD K6 3DNow! specific code to detect static factors"
 #endif
  typedef int v2sf __attribute__ ((vector_size (8))); // 8 bytes -> 2 short floats
  register v2sf MM7 asm("mm7"); // MM7 is now bound to %mm7
  register v2sf MM6 asm("mm6"); // MM6 is now bound to %mm6 (intention: signedness correction)

   // refer GCC manual, "Specifying registers for local variables";
   // but remember: that does not necessarily mean, that %mm7 cannot be
   // used for other variables as well! (It is only guaranteed, that if we access
   // MM7, this access is synonym to accessing %mm7; but accessing %mm7 is not necessarily
   // synonym to accessing MM7 (because if the compiler assumes, that MM7 is dead, %mm7 could
   // be used for other variables; and the compiler does *not* know, that we want
   // MM7 to be alive by accessing %mm7!).

  asm volatile ( \
       "femms                      # L2AMS  fast enter mmx\n\t" \
       "movd %[Sieboffs],%[_mm7]   # -      MM7=(0,Sieboffs)\n\t" \
       "punpckldq %[_mm7],%[_mm7]  # L2AM   (0,Sieboffs) -> (Sieboffs,Sieboffs) \n\t" \
       "movq %[_mm7],%%mm0         # L2AM \n\t" \
       "pxor %[_mm6],%[_mm6]       # L2AM   clear mm6 \n\t" \
       "pi2fd %[_mm7],%[_mm7]      # L4A    int to float \n\t" \
       "pcmpgtd %%mm0,%%mm6        # L2AM   set bits if signed, clear bits if unsigned" \
       : [_mm7] "=y" (MM7), [_mm6] "=y" (MM6) : [Sieboffs] "rm" (SievePos) : "mm0");

  // PrimeNumbers[0] und PrimeNumbers[1] are left out (because they need special treatment)
  int nr;
  for (nr=2; nr<StaticFactorbase::Size() && PrimeNumbers[nr]<PhysicalSieveSize; nr+=2) 
    {
      register unsigned int pflags;
      asm ( \
       "movq %[_mm7],%%mm2         # L2AMS \n\t" \
       "movq %[divisors](,%[i],4),%%mm4   # L2AMS  get y : x (as packed int)\n\t" \
       "pfmul %[FloatRecips](,%[i],4),%%mm2 # L4M    z/y : w/x \n\t" \
       "pi2fd %%mm4,%%mm0          # L4A    get y : x (as packed float)\n\t" \
       "movq %%mm4,%%mm5           # L2AMS \n\t" \
       "pf2id %%mm2,%%mm1          # L4A    packed float to int \n\t" \
       "pand %%mm6,%%mm4           # L2AM   correct sign (part 1) \n\t" \
       "pi2fd %%mm1,%%mm1          # L4A    and back (->floor) \n\t" \
       "pfmul %%mm0,%%mm1          # L4M    multiply \n\t" \
       "pfsubr %[_mm7],%%mm1       # L4A    subtract \n\t" \
       "pf2id %%mm1,%%mm0          # L4A    ->modulo \n\t" \
       "prefetch %[FloatRecips]+96(,%[i],4) \n\t" \
       "paddd %%mm4,%%mm0          # L2AM   correct sign (part 2) \n\t" \
       "pcmpeqd %%mm0,%%mm5        # L2AM  assure modp=0 instead of modp=p \n\t" \
       "pandn %%mm0,%%mm5          # L2AM \n\t" \
       "movq %%mm5,%%mm1 \n\t" \
       "punpckldq %%mm5,%%mm5 \n\t" \
       "punpckhdq %%mm1,%%mm1 \n\t" \
       "pcmpeqd %[Deltas](,%[i],8),%%mm5   # L2AM \n\t" \
       "pcmpeqd 8+%[Deltas](,%[i],8),%%mm1 # L2AM \n\t" \
       "packssdw %%mm1,%%mm5 \n\t" \
       "packsswb %%mm0,%%mm5 # mm0 is not relevant\n\t" \
       "movd %%mm5,%[pflags]   # L5+ (VectorPath)" \
       : [pflags] "=r" (pflags)
       : [divisors] "o" (PrimeNumbers[0]), [FloatRecips] "o" (PrimeNumberFloatReciprocals[0]),
         [Deltas] "o" (Delta_of_PrimeNumbers[0][0]),
         [_mm7] "y" (MM7), [_mm6] "y" (MM6), [i] "r" (nr)
       : "mm0", "mm1", "mm2", "mm3", "mm4", "mm5");

      //cout << "SievePos: " << SievePos << ": " << PackedResult[0] << "," << PackedResult[1] << endl;
 
#if 0 || defined(DEBUG)
       {
         // check pflags for correct byte mask
         const int d=normalized_signed_mod(SievePos,PrimeNumbers[nr]);
         if ( (d==Delta_of_PrimeNumbers[nr][0]) != ((pflags&0x000000ffU)==0x000000ffU) ) { MARK; exit(7); }
         if ( (d==Delta_of_PrimeNumbers[nr][1]) != ((pflags&0x0000ff00U)==0x0000ff00U) ) { MARK; exit(7); }
       }
#endif
      if ( pflags & 0x0000ffffU ) Factors.push_back(nr);

#if 0 || defined(DEBUG)
       {
         // check pflags for correct byte mask
         const int d=normalized_signed_mod(SievePos,PrimeNumbers[nr+1]);
         if ( (d==Delta_of_PrimeNumbers[nr+1][0]) != ((pflags&0x00ff0000U)==0x00ff0000U) ) { MARK; exit(7); }
         if ( (d==Delta_of_PrimeNumbers[nr+1][1]) != ((pflags&0xff000000U)==0xff000000U) ) { MARK; exit(7); }
       }
#endif
      if ( pflags & 0xffff0000U ) Factors.push_back(nr+1);
    }

  const signed int mySievePos = SievePos-SieveOffset-PhysicalSieveSize;

  asm volatile ( \
       "movd %[mySievePos],%%mm7   # -      mm0=(0,mySiebPos)\n\t" \
       "punpckldq %%mm7,%%mm7  # L2AM   (0,mySievePos) -> (mySievePos,mySievePos) \n\t"
       : [_mm7] "=y" (MM7) : [mySievePos] "r" (mySievePos));

  for (; nr<StaticFactorbase::Size(); nr+=2)
   {
      register unsigned int pflags;
      asm ( \
       "movq %[PrimeNumbers](,%[i],4),%%mm0   # L2AMS  get y : x (as packed int)\n\t" \
       "paddd %[_mm7],%%mm0        # L2AM \n\t" \
       "movq %%mm0,%%mm1 \n\t" \
       "punpckldq %%mm0,%%mm0 \n\t" \
       "punpckhdq %%mm1,%%mm1 \n\t" \
       "pcmpeqd %[LocDeltas](,%[i],8),%%mm0   # L2AM \n\t" \
       "pcmpeqd 8+%[LocDeltas](,%[i],8),%%mm1 # L2AM \n\t" \
       "packssdw %%mm1,%%mm0 \n\t" \
       "packsswb %%mm1,%%mm0 \n\t" \
       "movd %%mm0,%[pflags]   # L5+ (VectorPath)" \
       : [pflags] "=r" (pflags)
       : [PrimeNumbers] "o" (PrimeNumbers[0]), [LocDeltas] "o" (LocalPhysInterval_Delta_of_PrimeNumbers[0][0]),
         [_mm7] "y" (MM7), [i] "r" (nr)
       : "mm0", "mm1");

#if 0 || defined(DEBUG)
       {
         // check pflags for correct byte mask
         register const int d = mySievePos+PrimeNumbers[nr];
         if ( (d==LocalPhysInterval_Delta_of_PrimeNumbers[nr][0]) != ((pflags&0x000000ffU)==0x000000ffU) ) { MARK; exit(7); }
         if ( (d==LocalPhysInterval_Delta_of_PrimeNumbers[nr][1]) != ((pflags&0x0000ff00U)==0x0000ff00U) ) { MARK; exit(7); }
       }
#endif
      if ( pflags & 0x0000ffffU ) Factors.push_back(nr);

#if 0 || defined(DEBUG)
       {
         // check pflags for correct byte mask
         register const int d = mySievePos+PrimeNumbers[nr+1];
         if ( (d==LocalPhysInterval_Delta_of_PrimeNumbers[nr+1][0]) != ((pflags&0x00ff0000U)==0x00ff0000U) ) { MARK; exit(7); }
         if ( (d==LocalPhysInterval_Delta_of_PrimeNumbers[nr+1][1]) != ((pflags&0xff000000U)==0xff000000U) ) { MARK; exit(7); }
       }
#endif
      if ( pflags & 0xffff0000U ) Factors.push_back(nr+1);

   }
  asm volatile ("femms             # L2AMS  fast exit mmx");

}
// end of AMD 3DNow! optimized version

#else /* generic version */

void CDeltaComputations::GetStaticFactorHits(std::vector<unsigned int> &Factors, const signed int SievePos)
{
  // find factors in static factorbase
  // (beginnend mit der 3. Primzahl der FB (s.o) )

  int i;
  for (i=2; i<StaticFactorbase::Size() && PrimeNumbers[i]<PhysicalSieveSize; ++i)
   {
     // shortcut:
     /* avoid expensive mpz-divisions!
        We can re-check for sieved hits at SievePos. This is cheaper!
      */

     register int d = normalized_signed_mod(SievePos,PrimeNumbers[i],PrimeNumberReciprocals[i]);
     // same as: SievePos%PrimeNumbers[i]; if (d<0) d+=PrimeNumbers[i];
     // Bemerkung: %-operation does *not* eliminate the sign!
     //            ugly: -1=1 (mod 2), but (-1)%2=-1, 1%2=1
     // -> so we have to eliminate the sign!!!
     // and don't try to use unsigned int d, this won't work!

     if (d==Delta_of_PrimeNumbers[i][0] || d==Delta_of_PrimeNumbers[i][1]) // hit?
       Factors.push_back(i); // these are the divisors!
   }

  const signed int mySievePos = SievePos-SieveOffset-PhysicalSieveSize;
  for (; i<StaticFactorbase::Size(); ++i)
   {
     // shortcut:
     /* avoid expensive mpz-divisions!
        We can re-check for sieved hits at SievePos. This is cheaper!
        For the bigger primefactors this is even easier and needs no division:
        Simply repeat the same procedure as done in the sieving code; then
        compare whether the result will fit...
      */

     register int d = mySievePos+PrimeNumbers[i];
     if (d==LocalPhysInterval_Delta_of_PrimeNumbers[i][0] || d==LocalPhysInterval_Delta_of_PrimeNumbers[i][1]) // hit?
       Factors.push_back(i); // these are the divisors!
   }


}
#endif /* generic code */


#include "Sieving-inc.cc"
