/*! @file
 * @brief
 * implementation of base class used in DLP-MPQS
 */

#include "Tfactor.H"
#include "modulo.H"
#include <cmath>
#include <cstdlib>
#include "qsieve-fwd.H"

using std::setw;
using std::setprecision;


// static class members

// Number of rejected DLP candidates. It is only read in this file (by the
// periodic statistics output of DLP_get()); it is not modified here.
double CmpqsFactor::rejected_dlp_counter = 0.0;

// Upper bound for numbers handed to the Pollard-rho splitter:
// DLP_get_using_pollard_rho() gives up immediately for any n above it.
mpz_t CmpqsFactor::DLP_Threshold; // Double-Large-Prime-Threshold; will be initialized in main()


// Pollard-rho to factorize Double-Large-Primes
const bool CmpqsFactor::DLP_get_using_pollard_rho(const mpz_t n)
{
  if (mpz_cmp(n,DLP_Threshold)>0)
   {
     //cout << "DLP_get: Threshold exceeded..." << endl;
     return false;
   }

  int runden=50000; // maximum #rounds for pollard-rho
  // hint: high value -> costly, low value: decreasing rate of detection

  mpz_t x,a,a2;
  mpz_init_set(x,n); mpz_init(a); mpz_init(a2);
  mpz_set_ui(a,1); mpz_set(a2,a);

  p1=0; p2=0; // failsafe defaults

  do
    {
      mpz_mul(a,a,a); mpz_add_ui(a,a,1); mpz_mod(a,a,n);
      mpz_mul(a2,a2,a2); mpz_add_ui(a2,a2,1); mpz_mod(a2,a2,n);
      mpz_mul(a2,a2,a2); mpz_add_ui(a2,a2,1); mpz_mod(a2,a2,n);
      mpz_sub(x,a2,a);
      mpz_gcd(x,x,n);
    } while (--runden && mpz_cmp_ui(x,1)==0);
  if (mpz_cmp_ui(x,1)!=0)
    {
      // check, whether both factors are prime and small enough
      if (mpz_cmp_ui(x,SingleLargePrime_Threshold)>=0) goto done;
      p1=mpz_get_ui(x);
      mpz_divexact(a,n,x);
      if (mpz_cmp_ui(a,SingleLargePrime_Threshold)>=0) goto done;
      p2=mpz_get_ui(a);
      // Hint: if we check here for primality, then we save the
      //       check later (when a factor is used for splitting).
      //       To be on the safe side, we do it now...
      if (!numtheory::probab_prime(p1)) { p1=0; goto done; }
      if (!numtheory::probab_prime(p2)) { p2=0; goto done; }
      if (p1>p2) std::swap(p1,p2);
    };
done:
  mpz_clear(a); mpz_clear(a2); mpz_clear(x);
  //if (runden==0) cout << "DLP_get: rounds exceeded." << endl;
#ifdef VERBOSE_INFO
  cout << "DLP_prho_get (" << 50000-runden << ") for " << n << ": " << p1 << "," << p2 << endl;
#endif
  
  //static unsigned int runden_ges = 0;
  //runden_ges+=50000-runden;
  //cout << "DLP_get: rounds needed: " << runden_ges << endl;

  return (p1!=0 && p2!=0);
}


// Shank's square forms to factorize Double-Large-Primes
const bool CmpqsFactor::DLP_get(const mpz_t n)
{
  const unsigned int runden = 20000; // maximum rounds which will be tried, needs to be an odd value!
  // hint for tuning: more rounds cost more time, less rounds decrease rate of detection

  mpz_t mx,mr;
  mpz_init(mx); mpz_init(mr);

  p1=0; p2=0; // by default: both factors are invalid
  short unsigned int SQUFOF_Multiplier = 1; // we assume this to be 1 or a very small prime number!

try_new_Multiplier:
  if (SQUFOF_Multiplier==1) mpz_sqrtrem(mx,mr,n);
  // else:  mx will be set where the SQUFOF_Multiplier is determined 

  const unsigned int sq = mpz_get_ui(mx);
  const unsigned int d = mpz_get_ui(mr);
  mpz_mul_ui(mx,mx,2); mpz_add_ui(mx,mx,1); mpz_sqrt(mx,mx);
  const unsigned int sq2sqN = mpz_get_ui(mx);

  unsigned int lindex=0;
  const unsigned int bound = 64;
  short unsigned int list[bound];
  bool square_found=false;

  unsigned int r=0;
  unsigned int runde1=runden, runde2=0;
  unsigned int Q0,Q1,Q2,P1;

  if (d==0)
   {
     // perfect square!!
     p1=p2=sq; goto done;
   }

  P1=d-sq;
  Q1=1+sq-P1; // in case of (temporary) overflow, this remains correct!
  if (sq+sq-d>=d) // do we need an expensive division?
   {
     Q1=(sq+sq-d)/d +1;
     P1=sq-((sq+sq-d)%d);
     if (sq>=P1) Q1=1+Q1*(sq-P1); else Q1=1-Q1*(P1-sq);
   }
  Q0=d;
  Q2 = (Q1&1) ? Q1 : Q1>>1;
  if (Q2<sq2sqN)
   {
     //cout << "SQUFOF: list[" << lindex <<"]=" << Q2 << endl;
     list[lindex++]=static_cast<short unsigned int>(Q2);
   }
  if ( (0x2030213U>>(Q1&0x1fU))&1U )
   {
     r=static_cast<unsigned int>(sqrt(Q1));
     if (Q1==r*r) // is a square
      {
        square_found=(r>1); runde1=1;
        //cout << "SQUFOF early hit!!" << endl;
      }
   }


  while (--runde1)
   {
     register unsigned int u;

#if 1 && defined(ASM_CMOV)
     {
       asm ( \
        "# first \n\t" \
        "mov %[sq],%%eax \n\t" \
        "mov %[Q1],%%edx \n\t" \
        "add %[P1],%%eax \n\t" \
        "sub %[P1],%%edx \n\t" \
        "sub %[Q1],%%eax \n\t" \
        "cmp %[Q1],%%eax \n\t" \
        "jb 2f \n\t" \
        "1: xor %%edx,%%edx \n\t" \
        "divl %[Q1] \n\t" \
        "sub %[sq],%%edx # edx is P2! \n\t" \
        "incl %%eax \n\t" \
        "add %%edx,%[P1] \n\t" \
        "imull %[P1],%%eax # eax is Q2!\n\t" \
        "neg %%edx \n\t" \
        "jmp 3f \n\t" \
        "2: mov %[P1],%%eax \n\t" \
        "sub %%edx,%%eax \n\t" \
        "3: add %[Q0],%%eax \n\t" \
        "mov %[Q1],%[Q0] \n\t" \
        "mov %%eax,%[Q1] \n\t" \
        "mov %%edx,%[P1] \n\t" \
        "# second \n\t" \
        "mov %[sq],%%eax \n\t" \
        "mov %[Q1],%%edx \n\t" \
        "add %[P1],%%eax \n\t" \
        "sub %[P1],%%edx \n\t" \
        "sub %[Q1],%%eax \n\t" \
        "cmp %[Q1],%%eax \n\t" \
        "jb 2f \n\t" \
        "1: xor %%edx,%%edx \n\t" \
        "divl %[Q1] \n\t" \
        "sub %[sq],%%edx # edx is P2! \n\t" \
        "incl %%eax \n\t" \
        "add %%edx,%[P1] \n\t" \
        "imull %[P1],%%eax # eax is Q2!\n\t" \
        "neg %%edx \n\t" \
        "jmp 3f \n\t" \
        "2: mov %[P1],%%eax \n\t" \
        "sub %%edx,%%eax \n\t" \
        "3: add %[Q0],%%eax \n\t" \
        "mov %%edx,%[P1] \n\t" \
        "mov %[Q1],%[Q0] \n\t" \
        "mov %[Q1],%%edx \n\t" \
        "shr %%edx \n\t" \
        "mov %%eax,%[Q1] \n\t" \
        "cmovc %[Q0],%%edx \n" \
        : [P1] "+r" (P1), [Q0] "+r" (Q0), [Q1] "+r" (Q1), "=&d" (u)
        : [sq] "r" (sq)
        : "cc", "eax");
     }
#elif 1 && defined(ASM_386)
     {
       asm ( \
        "# first \n\t" \
        "mov %[sq],%%eax \n\t" \
        "mov %[Q1],%%edx \n\t" \
        "add %[P1],%%eax \n\t" \
        "sub %[P1],%%edx \n\t" \
        "sub %[Q1],%%eax \n\t" \
        "cmp %[Q1],%%eax \n\t" \
        "jb 2f \n\t" \
        "1: xor %%edx,%%edx \n\t" \
        "divl %[Q1] \n\t" \
        "sub %[sq],%%edx # edx is P2! \n\t" \
        "incl %%eax \n\t" \
        "add %%edx,%[P1] \n\t" \
        "imull %[P1],%%eax # eax is Q2!\n\t" \
        "neg %%edx \n\t" \
        "jmp 3f \n\t" \
        "2: mov %[P1],%%eax \n\t" \
        "sub %%edx,%%eax \n\t" \
        "3: add %[Q0],%%eax \n\t" \
        "mov %[Q1],%[Q0] \n\t" \
        "mov %%eax,%[Q1] \n\t" \
        "mov %%edx,%[P1] \n\t" \
        "# second \n\t" \
        "mov %[sq],%%eax \n\t" \
        "mov %[Q1],%%edx \n\t" \
        "add %[P1],%%eax \n\t" \
        "sub %[P1],%%edx \n\t" \
        "sub %[Q1],%%eax \n\t" \
        "cmp %[Q1],%%eax \n\t" \
        "jb 2f \n\t" \
        "1: xor %%edx,%%edx \n\t" \
        "divl %[Q1] \n\t" \
        "sub %[sq],%%edx # edx is P2! \n\t" \
        "incl %%eax \n\t" \
        "add %%edx,%[P1] \n\t" \
        "imull %[P1],%%eax # eax is Q2!\n\t" \
        "neg %%edx \n\t" \
        "jmp 3f \n\t" \
        "2: mov %[P1],%%eax \n\t" \
        "sub %%edx,%%eax \n\t" \
        "3: add %[Q0],%%eax \n\t" \
        "mov %%edx,%[P1] \n\t" \
        "mov %[Q1],%[Q0] \n\t" \
        "mov %[Q1],%%edx \n\t" \
        "shr %%edx \n\t" \
        "mov %%eax,%[Q1] \n\t" \
        "jnc 4f \n\t" \
        "mov %[Q0],%%edx \n\t" \
        "4: \n" \
        : [P1] "+r" (P1), [Q0] "+r" (Q0), [Q1] "+r" (Q1), "=&d" (u)
        : [sq] "r" (sq)
        : "cc", "eax");
     }
#else
     {
       // we expect sq+P1>=Q1;
       // furthermore, in about half of the cases (sq+P1)/Q1 == 1, so it is acceptable to
       // trade expensive division with a branch...
       // (found this nice idea in msieve-0.88 written by Jason Papadopoulos)

       register unsigned int P2=Q1-P1;
       Q2=Q0+P1-P2; // in case of (temporary) overflow, this remains correct!
       if (sq+P1-Q1>=Q1) // do we need an expensive division?
        {
          Q2=(sq+P1)/Q1;
          P2=sq-((sq+P1)%Q1); // equivalent to P2=Q2*Q1-P1, but "/" and "%" probably use the same cpu instruction!
          if (P1>=P2) Q2=Q0+Q2*(P1-P2); else Q2=Q0-Q2*(P2-P1);
           // "unsigned int" has one more bit than "signed int", so overflows
           // should not occur at all,
           // but therefore now we have to take care for the sign!
           // Anyway: seldom overflows will cause no harm at all,
           // they only waste computing time (resulting in pollard rho doing
           // the job again...)
           // -> see below: "this line shouldn't be..."
        }
       P1=P2; Q0=Q1; Q1=Q2;
       P2=Q1-P1;
       Q2=Q0+P1-P2; // in case of (temporary) overflow, this remains correct!
       if (sq+P1-Q1>=Q1) // do we need an expensive division?
        {
          Q2=(sq+P1)/Q1;
          P2=sq-((sq+P1)%Q1); // equivalent to P2=Q2*Q1-P1, but "/" and "%" probably use the same cpu instruction!
          if (P1>=P2) Q2=Q0+Q2*(P1-P2); else Q2=Q0-Q2*(P2-P1);
           // "unsigned int" has one more bit than "signed int", so overflows
           // should not occur at all,
           // but therefore now we have to take care for the sign!
           // Anyway: seldom overflows will cause no harm at all,
           // they only waste computing time (resulting in pollard rho doing
           // the job again...)
           // -> see below: "this line shouldn't be..."
        }
       P1=P2; Q0=Q1; Q1=Q2;
       u = (Q0&1) ? Q0 : Q0>>1;
     }
#endif

     if (u<sq2sqN)
      {
        //cout << "SQUFOF: list[" << lindex <<"]=" << u << endl;
        list[lindex++]=static_cast<short unsigned int>(u);
        if (lindex>=bound)
         {
           cout << "SQUFOF: list exceeded (out of bound)..." << endl;
           break;
         }
      }
     u = (Q1&1) ? Q1 : Q1>>1;
     if (u<sq2sqN)
      {
        //cout << "SQUFOF: list[" << lindex <<"]=" << u << endl;
        list[lindex++]=static_cast<short unsigned int>(u);
        if (lindex>=bound)
         {
           cout << "SQUFOF: list exceeded (out of bound)..." << endl;
           break;
         }
      }
     if ( (0x2030213U>>(Q1&0x1fU))&1U )
      {
        // try to avoid entering this block, if Q1 cannot be a square!
        // This VOODOO avoids expensive sqrt-check for 78.125% of all cases!
        // now let's explain our VOODOO:
        //  Q1 mod 32 in [0,1,4,9,16,17,25] -> Q1 can be a square
        //  Q1 mod 32 not in [0,1,4,9,16,17,25] -> Q1 cannot be a square
        // (prove by induction or by simply using the modulo-32 ring!)
        // in binary/hexadecimal notation:
        // (1<<0 | 1<<1 | 1<<4 | 1<<9 | 1<<16 | 1<<17 | 1<<25) = 0x2030213,
        // Q1 mod 32 = Q1 & 0x1f
        // therefore:  if ((1<<(Q1&0x1f))&0x2030213) is ZERO, then Q1 cannot be a square!
        r=static_cast<unsigned int>(sqrt(Q1)); if (Q1!=r*r) continue; // not a square
        if (r<=1) { square_found=false; break; }
        bool fl=false;
        for (unsigned int k=0; k<lindex; ++k)
         if (r==list[k]) { fl=true; break; }
        if (fl) continue; // Square not useful
        square_found=true; break;
      }
   }

  runde1=runden-runde1;

choose_new_Multiplier:
  if (!square_found)
   {
     //if (SQUFOF_Multiplier>1)
     // cout << "DLP-SQUFOF, Multiplier " << SQUFOF_Multiplier << ": No success." << endl;
     switch (SQUFOF_Multiplier)
      {
        case 1: SQUFOF_Multiplier=2; break;
        case 2: SQUFOF_Multiplier=3; break;
        case 3: SQUFOF_Multiplier=5; break;
        case 5: SQUFOF_Multiplier=7; break;
        case 7: SQUFOF_Multiplier=10; break;
        case 10: SQUFOF_Multiplier=11; break;
        case 11: SQUFOF_Multiplier=13; break;
        case 13: SQUFOF_Multiplier=15; break;
        default:
           cout << "DLP-SQUFOF: Fallback to pollard rho method." << endl;
           DLP_get_using_pollard_rho(n);
           goto done;
      }

     mpz_mul_ui(mx,n,SQUFOF_Multiplier);
     if (mpz_sizeinbase(mx,2)>62)
      {
        //cout << "DLP-SQUFOF limits exceeded. Fallback to pollard rho method." << endl;
        DLP_get_using_pollard_rho(n);
        goto done;
      }
     mpz_sqrtrem(mx,mr,mx);

     goto try_new_Multiplier;
   }

  //cout << "DLP-SQUFOF: square found in round " << runde1 << ": " << r << endl;

  double DQ,DQ0,DQ1,DQ2,DP1,DP2;
  mpz_set_ui(mr,P1); mpz_mul_ui(mr,mr,P1);

  if (SQUFOF_Multiplier==1) mpz_sub(mx,n,mr);
  else { mpz_mul_ui(mx,n,SQUFOF_Multiplier); mpz_sub(mx,mx,mr); }
  mpz_div_ui(mx,mx,r);
  DQ1=mpz_get_d(mx);
  DQ0=r; DP1=P1;

  for (runde2=2*runden; runde2; --runde2)
   {
     DQ=floor((sq+DP1)/DQ1); DP2=DQ*DQ1-DP1;
     DQ2=DQ0+DQ*(DP1-DP2); DQ0=DQ1; DQ1=DQ2;
     if (DP1==DP2)
      {
        // factor found

        // eliminate common factors of SQUFOF_Multiplier
        if ((DQ0/2)==floor(DQ0/2)) DQ0/=2;
        mpz_set_d(mx,DQ0);
        DQ0/=mpz_gcd_ui(mr,mx,SQUFOF_Multiplier);

        if (DQ0==1.0)
         {
           //cout << "DLP-SQUFOF: Multiplier " << SQUFOF_Multiplier << " rediscovered..." << endl;
           square_found=false; goto choose_new_Multiplier;
         }

        // check, whether both factors are prime and small enough

        if (DQ0>=SingleLargePrime_Threshold) goto done;
        p1=static_cast<unsigned int>(DQ0);
        if (mpz_div_ui(mx,n,p1))
         {
           cerr << "DLP_get-SQUFOF: weird factors! remainder??" << p1 << endl;
           cerr << "DLP-SQUFOF: SQUFOF_Multiplier=" << SQUFOF_Multiplier << endl;
           exit(1);
         }
        if (mpz_cmp_ui(mx,SingleLargePrime_Threshold)>=0) goto done;
        p2=mpz_get_ui(mx);
        // Hint: if we check here for primality, then we save the
        //       check later (when a factor is used for splitting).
        //       To be on the safe side, we do it now...
        if (!numtheory::probab_prime(p1)) { p1=0; goto done; }
        if (!numtheory::probab_prime(p2)) { p2=0; goto done; }
        if (p1>p2) std::swap(p1,p2);
        //cout << "DLP-SQUFOF: factorization found!" << endl;
        goto done;
      }
     DP1=DP2;
   }
  MARK;
  cerr << "SQUFOF_Multiplier=" << SQUFOF_Multiplier << endl;
  cerr << "This line should'nt be executed!!! Overflow in unsigned int?? Negative values??" << endl;
  DLP_get_using_pollard_rho(n); // fallback...

done:
  mpz_clear(mx); mpz_clear(mr);

#if 0 || defined(VERBOSE)
  runde2=2*runden-runde2;
  cout << "DLP_get (SQUFOF," << SQUFOF_Multiplier << "): (" << runde1 << "," << runde2 << ") "
       << p1 << "," << p2 << endl;
#endif

#if 1 /* some statistics */
 static double good_dlp_counter = 0.0;
 static double bad_dlp_counter  = 0.0;
 if (p1&&p2) good_dlp_counter+=1.0; else bad_dlp_counter+=1.0;
 static time_t lastout = 0;
 if (time(NULL)>lastout+60)
  {
    lastout=time(NULL);
    cout << "DLP-SQUFOF: "
         << setw(6) << setprecision(5)
         << 100.0*rejected_dlp_counter/(rejected_dlp_counter+good_dlp_counter+bad_dlp_counter)
         << "% of DLP-candidates were rejected!" << endl;
    cout << "DLP-SQUFOF: "
         << setw(10) << setprecision(0) << good_dlp_counter 
         << " [" << setw(6) << setprecision(5)
         << 100.0*good_dlp_counter/(good_dlp_counter+bad_dlp_counter)
         << "%] of non-rejected DLP are good!" << endl;
  }
#endif

  return (p1&&p2);
}


/*!
 * @brief Read a CmpqsFactor from a stream.
 *
 * One whitespace-delimited token of at most 49 characters is extracted.
 * Accepted forms:
 *  - "<p1>*<p2>" : both values are taken from the token,
 *  - "<p2>"      : token without '*'; p1 is set to 0.
 * The values are parsed as decimal numbers; parsing stops at the first
 * character that is not part of a number.
 *
 * If nothing can be extracted (stream failure or end of input), both p1 and
 * p2 are set to 0, "Reading DLP failed!" is written to cerr, and the stream
 * is returned in its failed state so that the caller can detect the problem.
 *
 * @param istr stream to read from
 * @param x    object receiving the values
 * @return @p istr
 */
istream& operator>> (istream &istr, CmpqsFactor &x)
{
   char s[50];
   s[0]=0; // keep the buffer well-defined even if the extraction stores nothing
   x.p1=0; x.p2=0;

   // setw limits the extraction to sizeof(s)-1 characters plus terminating 0
   if (!(istr >> setw(sizeof(s)) >> s))
    {
      cerr << "Reading DLP failed!" << endl;
      return istr;
    }

   // look for the separator between the two values
   int i=0;
   while (s[i]!=0 && s[i]!='*') ++i;

   if (s[i]=='*')
    {
      // two values: split the token at the separator
      s[i]=0;
      x.p1=std::strtoul(s,NULL,10);
      x.p2=std::strtoul(&s[i+1],NULL,10);
      //cout << "DLP: read two values" << endl;
    }
   else
    {
      // a single value
      x.p2=std::strtoul(s,NULL,10);
      //cout << "DLP: read a value" << endl;
    }
   //cout << "DLP:" << x.p1 << "*" << x.p2 << endl;
   return istr;
}
