// Discrete Fast Fourier Transform
// written by Thorsten Reinecke, 2003-07-04
// last change: 2005-05-24

/*! @file
 * @brief
 * Discrete Fast Fourier Transform
 */


/*
   A(x)=a[0]+a[1]*x^1+a[2]*x^2+...+a[m]*x^m  mod(N)
   B(x)=b[0]+b[1]*x^1+b[2]*x^2+...+b[n]*x^n  mod(N)
   -> C(x)=A(x)*B(x)=c[0]+c[1]*x+...+c[m+n]*x^(m+n)   mod(N)

   Problem: Quadratwurzeln können wir modulo N nicht ziehen,
   jedenfalls nicht in erträglicher Zeit... (da N im allgemeinen eine
   zusammengesetzte Zahl ist, deren Teiler uns nicht bekannt sind)
   ... also können wir die diskrete FFT nicht durchführen!

   Lösung:
   Beobachtung, dass die einzelnen Koeffizienten des Polynoms A(x)*B(x)
   nicht größer werden können als (m+n)*N^2, wenn die einzelnen
   Koeffizienten von A(x) und B(x) im Bereich { 0 ...N-1 } liegen.
   (Beweis: Worst-Case-Überschlagsrechnung bei naiver Polynommultiplikation)

   [ Nebenbei: Das ist auch der Grund dafür, warum sich
     mpz_mod innerhalb der Rekursion bei der 
     Karatsuba-Polynommultiplikation nicht lohnt... ]

   Wir können uns also eine Primzahl P > (m+n)*N^2 suchen,
   für die wir Quadratwurzeln ziehen können!
   Wir führen die DFT dann modulo P durch.
   Das Ergebnis sollte immer mit dem Ergebnis ohne Modulorechnung
   übereinstimmen.
   -> Also  C(x) mod P = C(x) mod (m+n)*N^2
   -> (C(x) mod P) mod N ist dann unser gesuchtes Polynom!

   So viel zum Prinzipiellen...

   Leider wird die skalare Multiplikation dann unheimlich teuer
   und die Koeffizienten ziemlich groß, so dass man diese
   mod P reduzieren muß (allein schon, um Speicher zu sparen).
   Die Laufzeitgewinne schrumpfen also bei größeren Zahlen
   schnell wieder ein...
    
   Alternativ können wir aber statt eines großen P viele kleine P's
   verwenden und die Teilergebnisse dann mit dem chinesischen Restsatz 
   zum Gesamtergebnis zusammensetzen.

   eventuell noch implementierbar:

   Da die Teilmultiplikationen nach dem SIMD-Prinzip (Single Instruction
   Multiple Data) durchgeführt werden können, eignet sich das gut für
   Vektorrechner und dürfte auch gut für eine spätere MMX/3dnow-Optimierung
   geeignet sein.

   32-bittige unsigned ints sind dafür allerdings nicht unbedingt zu empfehlen,
   da es im Intervall [2^31,2^32] nur 199 Primzahlen gibt mit (p-1)=2^20*rest,
   wie nachfolgende Prozedur zeigt:

     c:=0;
     for i from 2^31+1 to 2^32 by 2^20 do 
       if isprime(i) then c:=c+1; print(c,i); fi;
     od;

   Damit ließen sich dann grob überschlagen Dezimalzahlen der Stelligkeit
   lg(((2^31)^(199/2))/(2^20))=(31*99.5-20)*lg(2)=3064.5*0.30103=922.5
   beackern, da wir die 199 Primzahlen über den chinesischen Restsatz
   kombinieren und die Größe der Polynome auf maximal 2^20 Koeffizienten
   begrenzen müssen. (Koeffizientengröße von etwa 2^20*N^2 muß noch korrekt
   rekonstruierbar sein!)
   Hier heißt es also auf 64bit umzusteigen oder gleich in Fließkomma
   zu rechnen.
*/


#include "modulo.H"
#include "mpz_wrapper.H"

namespace polynomial
{

using namespace my_mpz_wrapper;

/*!
 * Common base of the DFT classes: it fixes the maximum transform length
 * (always a power of two, at least 2) and maps requested lengths onto
 * the transform length that has to be used for them.
 */
class CDFT_base0 : private ForbidAssignment
{
 private:
  // Returns the smallest power of two that is >= x and >= 2.
  // Returns 0 if that power of two does not fit into an unsigned int
  // (without this check the doubling loop would wrap to 0 and never end).
  static unsigned int ceil_pow2(const unsigned int x)
   {
     unsigned int i=2;
     while (i!=0 && i<x) i<<=1; // i becomes 0 once the top bit is shifted out
     return i;
   }

 public:
  const unsigned int max_size; // maximum transform length (power of two)

  // static helper function (for constructor to initialize max_size):
  // rounds x_size up to a power of two (at least 2).
  // Terminates the program if x_size is too large to be rounded up.
  static const unsigned int calc_max_size(const unsigned int x_size)
   {
     const unsigned int i = ceil_pow2(x_size);
     if (i==0)
      {
        MARK;
        cerr << "x_size is invalid!" << endl;
        exit(1);
      }
     return i;
   }

  // Returns the transform length (power of two, at least 2) needed for
  // input_size coefficients. Terminates the program if that length
  // exceeds max_size; this also catches negative sizes that were
  // converted to huge unsigned values by the caller.
  inline const unsigned int use_size(const unsigned int input_size) const
  {
    const unsigned int i = ceil_pow2(input_size);
    if (i==0 || i>max_size)
     {
       MARK;
       cerr << "input_size is invalid!" << endl;
       exit(1);
     }
    return i;
  }

  // x_size: number of coefficients that must be supported;
  // max_size becomes x_size rounded up to a power of two.
  explicit CDFT_base0(const unsigned int x_size)
   : max_size(calc_max_size(x_size))
   {
   }
};

/*!
 * DFT over the residues modulo M. The object owns the table of powers
 * of the root of unity (w) and the inverses of the powers of two that
 * calc_roots_and_inverse() fills in.
 */
class CDFT_base : public CDFT_base0
{
 public:
  // Computes "count" valid primes (searching upwards from "Start") that are
  // suitable for doing a dft with recursion depth "Depth", and returns them
  // in a newly created polynomial "primes".
  // "primes" must be NULL on entry.
  static void get_valid_primes_for(TPolynom &primes, const unsigned int count,
                                   const mpz_t Start, const unsigned Depth);

 private:
  mpz_t h;           // scratch variable that any method may use
  mpz_t inverse[32]; // inverse[k] = inverse of 2^k (mod M)
  TPolynom w;        // w[i] = i-th power of the root of unity (mod M)

  // shared part of both constructors: initializes every mpz_t member
  void init_storage()
   {
#if defined(VERBOSE_INFO)
     MARK; cout << "CDFT_base-constructor for maximum size=" << max_size << endl;
#endif
     mpz_init(M);
     mpz_init(h);
     for (unsigned int k=0; k!=max_size; ++k) mpz_init(w[k]);
     for (unsigned int k=0; k!=32; ++k) mpz_init(inverse[k]);
   }

 protected:
  int size; // transform length of the current multiplication
  mpz_t M;  // coset: we compute all results (mod M)!

  // inverse of 2^i (mod M); valid for 0 <= i < 32
  inline const mpz_t& invpow2(const unsigned int i) const { return inverse[i]; }

  void calc_roots_and_inverse();
  void convolute(const TPolynom p, const unsigned int n);

 public:
  // Allocates the tables only; M, the roots and the inverses still
  // have to be set up by the derived class.
  explicit CDFT_base(const unsigned int x_size)
   : CDFT_base0(x_size), w(new mpz_t[max_size]), size(0)
   {
     init_storage();
   }

  // Allocates the tables and prepares them for the modulus x_M.
  CDFT_base(const unsigned int x_size, const mpz_t x_M)
   : CDFT_base0(x_size), w(new mpz_t[max_size]),  size(0)
   {
     init_storage();
     mpz_set(M,x_M);
     calc_roots_and_inverse();
   }

  // Releases everything in reverse order of construction.
  virtual ~CDFT_base()
   {
     for (unsigned int k=32; k-- > 0; ) mpz_clear(inverse[k]);

     for (unsigned int k=max_size; k-- > 0; ) mpz_clear(w[k]);
     delete [] w;

     mpz_clear(h);
     mpz_clear(M);
   }

  // R = P1*P2 with all coefficients reduced (mod M);
  // returns the number of result coefficients (k1+k2-1).
  const int dftmul(const TPolynom R, const int kR,
                   const TconstPolynom P1, const int k1,
                   const TconstPolynom P2, const int k2);

  friend class CDFT_chinrem;
};



class CDFT : public CDFT_base
{
 private:
  mpz_t N; // this is our underlying number!

  // helper function for constructor 
  void calc_field_and_roots_and_inverse();

 protected:
  const int internal_mul(const TPolynom R, const int kR, 
                 const TconstPolynom P1, const int k1,
                 const TconstPolynom P2, const int k2,
                 const bool reduce_result_modN);

 public:

  // DFT (discrete fourier transform) to multiply to polynomials,
  // whose product have no more than <size> coefficients.
  // x_N denotes the coset, in which the result must be still correct.

  CDFT(const unsigned int x_size, const mpz_t x_N)
   : CDFT_base(x_size)
   {
#if defined(VERBOSE_INFO)
     MARK; cout << "CDFT-constructor for maximum size=" << max_size << endl;
#endif
     mpz_init_set(N,x_N);
     calc_field_and_roots_and_inverse();
   }

  virtual ~CDFT()
   {
     mpz_clear(N);
   }

 inline const mpz_t& get_N(void) const { return N; }

 inline const int mul(const TPolynom R, const int kR, 
                      const TconstPolynom P1, const int k1,
                      const TconstPolynom P2, const int k2)
  {
    return internal_mul(R,kR,P1,k1,P2,k2,false);
  }

 inline const int mulmod(const TPolynom R, const int kR,
                         const TconstPolynom P1, const int k1,
                         const TconstPolynom P2, const int k2)
  {
    return internal_mul(R,kR,P1,k1,P2,k2,true);
  }

 inline const int square(const TPolynom R, const int kR,
                         const TconstPolynom P, const int k)
  {
    return internal_mul(R,kR,P,k,P,k,false);
  }

 inline const int squaremod(const TPolynom R, const int kR,
                            const TconstPolynom P, const int k)
  {
    return internal_mul(R,kR,P,k,P,k,true);
  }

};


class CDFT_chinrem : public CDFT_base0
{
 private:
  mpz_t N; // this is our underlying number!

  // helper function for constructor 
  void calc_field_and_roots_and_inverse();

 protected:

  struct tnode
   {
     mpz_t M;
     mpz_t inv_first_M_mod_second_M;
     tnode *left, *right;
     CDFT_base *first_dft, *second_dft;
   };
  void create_nodes(tnode &node, int &count, int depth = 1);
  void delete_nodes(tnode &node);
  void recurse_dftmul(tnode &node,
               const TPolynom R, const int kR,
               const TconstPolynom P1, const int k1,
               const TconstPolynom P2, const int k2);
  tnode root_node;

  int anz_dft;
  TPolynom Ms;
  CDFT_base** dft;
  const int internal_mul(const TPolynom R, const int kR, 
                 const TconstPolynom P1, const int k1,
                 const TconstPolynom P2, const int k2,
                 const bool reduce_result_modN);

 public:

  // DFT (discrete fourier transform) to multiply to polynomials,
  // whose product have no more than <size> coefficients.
  // x_N denotes the coset, in which the result must be still correct.

  CDFT_chinrem(const unsigned int x_size, const mpz_t x_N)
   : CDFT_base0(x_size), anz_dft(0), Ms(NULL), dft(NULL)
   {
#if defined(VERBOSE_INFO)
     MARK; cout << "CDFT-constructor for maximum size=" << max_size << endl;
#endif
     mpz_init_set(N,x_N);
     calc_field_and_roots_and_inverse();
   }

  virtual ~CDFT_chinrem()
   {
     delete_nodes(root_node);
     for (int i=0; i<anz_dft; ++i) delete dft[i];
     delete [] dft;
     for (int i=0; i<anz_dft; ++i) mpz_clear(Ms[i]);
     delete [] Ms;
     mpz_clear(N);
   }

 inline const mpz_t& get_N(void) const { return N; }

 inline const int mul(const TPolynom R, const int kR, 
                      const TconstPolynom P1, const int k1,
                      const TconstPolynom P2, const int k2)
  {
    return internal_mul(R,kR,P1,k1,P2,k2,false);
  }

 inline const int mulmod(const TPolynom R, const int kR,
                         const TconstPolynom P1, const int k1,
                         const TconstPolynom P2, const int k2)
  {
    return internal_mul(R,kR,P1,k1,P2,k2,true);
  }

 inline const int square(const TPolynom R, const int kR,
                         const TconstPolynom P, const int k)
  {
    return internal_mul(R,kR,P,k,P,k,false);
  }

 inline const int squaremod(const TPolynom R, const int kR,
                            const TconstPolynom P, const int k)
  {
    return internal_mul(R,kR,P,k,P,k,true);
  }

};



// ----------------- Implementation ---------------------------------------


// Calculates "count" (probable) primes of the form M0+i*Depth, where
// M0 = max(2^(bits(Start)+1),Depth)+1, in ascending order, and returns them
// in a newly created polynomial "primes".
//
//  primes: must be NULL on entry (otherwise the program terminates);
//          on return it points to a new array of "count" initialized mpz_t.
//  count:  number of primes to compute.
//  Start:  lower bound; every returned prime has at least one bit more.
//  Depth:  step width of the arithmetic progression that is searched;
//          must not be zero (otherwise the program terminates).
void CDFT_base::get_valid_primes_for(TPolynom &primes, const unsigned int count,
                                     const mpz_t Start, const unsigned Depth)
{
 if (primes!=NULL)
  {
    cout << __FILE__ << ", " << __FUNCTION__ << ": line " <<  __LINE__ << endl;
    cerr << "First parameter is a call by reference," << endl;
    cerr << "it should initially point to NULL (to avoid memory-leaks)," << endl;
    cerr << "because a new pointer to new data will be generated and" << endl;
    cerr << "there is no need for initially data pointed by \"primes!\"" << endl;
    exit(1);
  }

 if (Depth==0)
  {
    // with a step width of 0 the progression consists of one single
    // number; the search below would never end if it is composite
    cout << __FILE__ << ", " << __FUNCTION__ << ": line " <<  __LINE__ << endl;
    cerr << "Depth must not be zero!" << endl;
    exit(1);
  }

 primes = new mpz_t[count];
 for (unsigned int i=0; i<count; ++i) mpz_init(primes[i]);

  mpz_t x,M;
  mpz_init(x); mpz_init(M);

  const size_t bits = mpz_sizeinbase(Start,2)+1; // magnitude ld(Start)+1
  // + "safety-bits", in case that any coefficients of polynomials are slightly too big...
  mpz_set_ui(M,1); mpz_mul_2exp(M,M,bits);
#if 1
  // paranoid
  if (mpz_cmp_ui(M,Depth)<0) mpz_set_ui(M,Depth);
#endif
  mpz_add_ui(M,M,1);

  const unsigned int interval = 10000;

#ifdef VERBOSE
  MARK;
#endif

  for (unsigned int bisher=0; bisher<count; ++bisher)
   {
#ifdef VERBOSE
    cerr << bisher+1 << "/" << count << ": ";
#endif
    do
     {
      // sieve[i] -> true, if M+i*Depth is divisible by a prime below 1000
      // sieve[i] -> false: unknown
      bool sieve[interval] = { false };
      for (unsigned int p=3; p<1000; p+=2) if (numtheory::is_prime(p))
       {
         const unsigned int step = Depth%p; // reduced first, so that r+step cannot overflow
         unsigned int r = mpz_fdiv_ui(M,p);
         // If p divides Depth but not M, then p divides no M+i*Depth at all
         // and the stepping loop below would never reach r==0.
         if (r!=0 && step==0) continue;
         unsigned int i = 0;
         while (r) { r=(r+step)%p; ++i; } // first i with p | M+i*Depth
         while (i<interval) { sieve[i]=true; i+=p; }
       }
      sieve[interval-1]=false; // sentinel: stops the skip loop below
      unsigned int i=0;
      while(i<interval)
       {
         while(sieve[i]) ++i;
         mpz_set_ui(x,Depth); mpz_mul_ui(x,x,i); mpz_add(x,x,M);
#ifdef VERBOSE
         cerr << i << " ";
#endif
         if (mpz_probab_prime_p(x,10)) break;
         ++i;
       }
      // advance M to the candidate found (or past the interval, if none was found)
      mpz_set_ui(x,Depth); mpz_mul_ui(x,x,i); mpz_add(M,M,x);
#ifdef VERBOSE
      cerr << " # +" << i << endl;
#endif
     } while (mpz_probab_prime_p(M,10)==0);
    mpz_set(primes[bisher],M); mpz_add_ui(M,M,Depth); // continue behind the prime just found
   }

  mpz_clear(M); mpz_clear(x);
}


void CDFT::calc_field_and_roots_and_inverse()
{
  mpz_mul(M,N,N); mpz_mul_ui(M,M,max_size);
  mpz_mul_ui(M,M,4); // + "Safety-Bits", in case that any coefficients are slightly too big...

  TPolynom MyField = NULL;
  get_valid_primes_for(MyField,1,M,max_size);
  mpz_set(M,MyField[0]);
  mpz_clear(MyField[0]); delete [] MyField;
  calc_roots_and_inverse();
}


// Releases the subtree below "node" (children first) and the numbers
// stored in "node" itself. "node" is not deleted; that is up to the caller.
void CDFT_chinrem::delete_nodes(tnode &node)
{
  tnode* const left_child  = node.left;
  tnode* const right_child = node.right;

  if (left_child!=NULL)  delete_nodes(*left_child);
  if (right_child!=NULL) delete_nodes(*right_child);

  mpz_clear(node.inv_first_M_mod_second_M);
  mpz_clear(node.M);

  delete right_child;
  delete left_child;
}

// Builds the binary tree for chinese remaindering below "node".
//  count: index of the next unused entry of dft[]; each leaf consumes two.
//  depth: current depth; a leaf is created as soon as 2^depth >= anz_dft.
// For every node, M becomes the product of its two moduli, and the inverse
// of the first modulus modulo the second one is precomputed.
void CDFT_chinrem::create_nodes(tnode &node, int &count, int depth)
{
  const bool is_leaf = ( (1<<depth) >= anz_dft );

  if (is_leaf)
   {
     // leaf is reached: attach the next two dft objects
     node.left=node.right=NULL;
     node.first_dft=dft[count++];
#if defined(VERBOSE_INFO)
     cout << "created dft " << count << ": " << node.first_dft->M << endl;
#endif
     node.second_dft=dft[count++];
#if defined(VERBOSE_INFO)
     cout << "created dft " << count << ": " << node.second_dft->M << endl;
#endif
   }
  else
   {
     // inner node: build both subtrees first
     node.first_dft=node.second_dft=NULL;
     node.left  = new tnode;
     create_nodes(*node.left,count,depth+1);
     node.right = new tnode;
     create_nodes(*node.right,count,depth+1);
   }

  // the two moduli that are combined at this node
  const mpz_t &first_M  = is_leaf ? node.first_dft->M  : node.left->M;
  const mpz_t &second_M = is_leaf ? node.second_dft->M : node.right->M;

  mpz_init(node.M);
  mpz_mul(node.M,first_M,second_M);
  mpz_init(node.inv_first_M_mod_second_M);
  if (mpz_invert(node.inv_first_M_mod_second_M,first_M,second_M)==0)
   {
     MARK; cerr << "BUG!! Inverse MUST exist!" << endl;
     exit(1);
   }

#if defined(VERBOSE_INFO)
  cout << "node value: " << node.M << endl;
#endif
}

void CDFT_chinrem::calc_field_and_roots_and_inverse()
{
  typedef CDFT_base* Pdft; 
  mpz_t M;
  mpz_init(M);
  mpz_mul(M,N,N); mpz_mul_ui(M,M,max_size);
  mpz_sqrt(M,M); // chinese remaindering using two primenumbers -> p*q > max_size*N
  anz_dft=2;
  while (mpz_sizeinbase(M,10)>125)
   {
     // increase depth of binary tree for chinese remaindering until
     // modulo value is below a given threshold
     mpz_sqrt(M,M); anz_dft*=2;
   }
  Ms = NULL; CDFT_base::get_valid_primes_for(Ms,anz_dft,M,max_size);
  dft = new Pdft[anz_dft];
  for (int i=0; i<anz_dft; ++i) dft[i] = new CDFT_base(max_size,Ms[i]);
#if defined(VERBOSE_INFO)
  cout << "N = " << N << endl;
  cout << anz_dft << " nodes have been prepared for chinese remaindering." << endl;
#endif
  int count=0; create_nodes(root_node,count); 
  mpz_clear(M);
}


// Prepares the tables for the current modulus M:
//  w[i]       = i-th power of a root of unity w[1] with w[1]^(max_size/2) = -1 (mod M)
//  inverse[k] = inverse of 2^k (mod M), k=0..31
// Terminates the program if M is not a (probable) prime, if max_size does
// not divide M-1, or if no suitable root is found.
void CDFT_base::calc_roots_and_inverse()
{
  mpz_t x,e;
  mpz_init(x); mpz_init(e);

  if (mpz_probab_prime_p(M,10)==0)
   {
     MARK;
     cerr << "invalid M for dft!" << endl;
     exit(1);
   }

  // e = (M-1)/max_size; the division must leave no remainder
  mpz_sub_ui(e,M,1);
  if (mpz_div_ui(e,e,max_size)!=0)
   {
     MARK;
     cerr << "invalid M for dft!" << endl;
     exit(1);
   }

  // try the odd bases r=911,913,... (below 2000) until
  // w[1] = r^e satisfies w[1]^(max_size/2) + 1 = 0 (mod M)
  unsigned int r=911;
  for (;;)
   {
     mpz_set_ui(x,r);
     mpz_powm(w[1],x,e,M);
     mpz_powm_ui(x,w[1],max_size/2,M);
     mpz_add_ui(x,x,1);
     mpz_mod(x,x,M);
     if (mpz_cmp_ui(x,0)==0) break; // found

     r+=2;
     if (r>=2000)
      {
        cerr << "unable to find valid roots..." << endl;
        exit(1);
      }
   }

  // w[1] is the first root of unity; fill in the remaining powers
  mpz_set_ui(w[0],1); // w^0 = 1
  for (unsigned int k=2; k<max_size; ++k)
   {
     mpz_mul(x,w[k-1],w[1]);
     mpz_mod(w[k],x,M);
     if (mpz_cmp_ui(w[k],1)==0)
      {
        // a power below max_size must not be 1 again
        MARK;
        cerr << "invalid roots..." << endl;
        exit(1);
      }
   }

  mpz_clear(x); mpz_clear(e);

  // finally precalculate inverse of 2^k (mod M), k=0..31
  mpz_set_ui(inverse[0],1);
  mpz_set_ui(h,2);
  if (mpz_invert(h,h,M)==0)
   {
     MARK; cerr << "inverse of 2 does not exist!" << endl; exit(1);
   }
  mpz_mod(h,h,M);
  mpz_set(inverse[1],h);
  for (int k=2; k<32; ++k)
   {
     mpz_mul(h,inverse[k-1],inverse[1]);
     mpz_mod(h,h,M);
     mpz_set(inverse[k],h);
   }
}

// In-place transform of the n coefficients p[0..n-1], using the
// precomputed powers w[] of the root of unity and the modulus M.
//
//  p: coefficients; overwritten with the transformed values.
//     Only the products with the roots are reduced (mod M); the sums and
//     differences are not, so the results are correct modulo M but are
//     not normalized to the range 0..M-1.
//  n: number of coefficients; must be a power of two and not exceed max_size.
//
// n==1 and n==2 are handled explicitly: use_size() can deliver a length
// of 2 for very small products, and without these base cases the
// recursion below would descend to n==0 and never terminate.
void CDFT_base::convolute(const TPolynom p, const unsigned int n)
{
  if (n<2) return; // a single coefficient is its own transform

  if (n==2)
   {
     // single butterfly with w^0 = 1
     mpz_mod(h,p[1],M);
     mpz_sub(p[1],p[0],h);
     mpz_add(p[0],p[0],h);
   }
  else if (n==4)
   {
     // unrolled transform of length 4; only one real multiplication
     // (by w[max_size/4]) is needed
     mpz_add(p[0],p[0],p[2]); mpz_mul_2exp(p[2],p[2],1); mpz_sub(p[2],p[0],p[2]);
     mpz_add(p[1],p[1],p[3]); mpz_add(p[0],p[0],p[1]); mpz_mul_2exp(p[3],p[3],1); mpz_sub(p[3],p[1],p[3]);
     mpz_mul(h,p[3],w[max_size>>2]); mpz_mod(h,h,M); mpz_sub(p[3],p[2],h); mpz_add(p[2],p[2],h);
     mpz_mul_2exp(p[1],p[1],1); mpz_sub(p[1],p[0],p[1]);
     mpz_swap(p[1],p[2]);
   }
  else
   {
     const unsigned int nh = n>>1;

     // reorder: even-indexed coefficients to the lower half,
     // odd-indexed coefficients to the upper half
#if 1
     {
       mpz_t* const temp = new mpz_t[nh]; // will be only used for swapping, so no initialization is needed...
       for (unsigned int i=0, j=0; i<nh; ++i)
        {
          mpz_swap(p[i],p[j++]);    // even coefficient to its final place
          mpz_swap(temp[i],p[j++]); // odd coefficient is parked in temp
        }
       for (unsigned int i=0; i<nh; ++i) mpz_swap(p[i+nh],temp[i]);
       delete [] temp;
     }
#else
     // alternative method: using more temporary memory, but less movements...
     {
       mpz_t* const temp = new mpz_t[n]; // will be only used for swapping, so no initialization is needed...
       memcpy(temp,p,n*sizeof(mpz_t));
       for (unsigned int i=0, j=0; i<nh; ++i)
        {
          memcpy(&p[i],&temp[j++],sizeof(mpz_t));
          memcpy(&p[i+nh],&temp[j++],sizeof(mpz_t));
        }
       delete [] temp;
     }
#endif

     // transform both halves
     convolute(p,nh);
     convolute(&p[nh],nh);

     // combine the halves: butterflies with the roots w[0], w[dj], w[2*dj], ...
     const unsigned int dj = max_size/n;
     for (unsigned int i=0,j=0; i<nh; ++i,j+=dj)
      {
        mpz_mul(h,p[i+nh],w[j]); mpz_mod(h,h,M);
        mpz_sub(p[i+nh],p[i],h);
        mpz_add(p[i],p[i],h);
      }
   }
}


const int CDFT_base::dftmul(const TPolynom R, const int kR,
                            const TconstPolynom P1, const int k1,
                            const TconstPolynom P2, const int k2)
{
  const unsigned int estimated_memusage_in_bits = mpz_sizeinbase(M,2)+32; // for optimizing mpz-heap allocation
  const int result_size = k1+k2-1;
#if defined(VERBOSE)
  cout << "dftmul: " << k1 << ", " << k2 << ", " << result_size << endl;
#endif
  size = use_size(result_size);

  // sanity checks
  if (result_size>size)
   {
     MARK; cerr << "(result_size>size)" << endl;
     exit(1);
   }
  if (kR<result_size)
   {
     MARK; cerr << "destination polynomial is too small!" << endl;
     exit(1);
   }

  // use R, if memory in R suffices for convolution;
  // otherwise uses temporary memory...
  const TPolynom p = (kR>=size) ? R : new mpz_t[size];

  if (p!=R) for (int i=0; i<size; ++i) mpz_init2(p[i],estimated_memusage_in_bits);
  else for (int i=k1; i<size; ++i) mpz_set_ui(p[i],0); // padding with zeros

  for (int i=0; i<k1; ++i) mpz_mod(p[i],P1[i],M); // get first multiplicand
  convolute(p,size); // do fft

  const TPolynom q = (P1==P2 && k1==k2) ? p : new mpz_t[size]; // for special case p*q = p^2
  if (p!=q)
   {
     for (int i=0; i<size; ++i) mpz_init2(q[i],estimated_memusage_in_bits);
     // is done already by init2!! for (int i=k2; i<size; ++i) mpz_init(q[i]); // padding with zeros

     for (int i=0; i<k2; ++i) mpz_mod(q[i],P2[i],M); // get second multiplicand
     convolute(q,size); // do fft
   }

  // IMPORTANT: store result for last fft in p (to save memory space)
  for (int i=0; i<size; ++i)
   {
     mpz_mul(h,p[i],q[i]);
     mpz_mod(p[i],h,M);
   }
   // the result will be in p now!!!

  if (q!=p)
   {
     // we can delete the temporary polynomial q
     for (int i=0; i<size; ++i) mpz_clear(q[i]);
     delete [] q;
   }

  convolute(p,size); // do fft
  for (int i=1; i<size/2; ++i) mpz_swap(p[i],p[size-i]);

  int inv_index=0;
  for (int i=1; i<size; i<<=1) ++inv_index;

  for (int i=0; i<result_size; ++i)
   {
     mpz_mul(h,p[i],invpow2(inv_index));
     mpz_mod(p[i],h,M);
   }

  if (p!=R)
   {
     // copy result and release temporary polynomial
     for (int i=result_size-1; i>=0; --i) mpz_set(R[i],p[i]); // faster would be mpz_swap, but: memory fragmentation?
     if (size-result_size>10) cout << size-result_size << " computations saved..." << endl;
     // release temporary polynomial
     for (int i=0; i<size; ++i) mpz_clear(p[i]);
     delete [] p;
   }
  else
   for (int i=result_size; i<kR; ++i) mpz_set_ui(R[i],0); // leading zeroes seem apparently necessary (in spite of returning result_size!)
  return result_size; // return size of result
}


const int CDFT::internal_mul(const TPolynom R, const int kR,
               const TconstPolynom P1, const int k1,
               const TconstPolynom P2, const int k2,
               const bool reduce_result_modN)
{
  const size_t ld_N = mpz_sizeinbase(N,2); // ld(any input coefficient)>ld_N -> result could be wrong!!
  const unsigned int estimated_memusage_in_bits = mpz_sizeinbase(M,2)*2+5; // for optimizing mpz-heap allocation
  const int result_size = k1+k2-1;
#if defined(VERBOSE_INFO)
  cout << "_mul (dft): ";
  if (P1==P2) cout << "(SQUARE) ";
  cout << k1 << ", " << k2 << ", " << result_size << endl; 
#endif
  size = use_size(result_size);

  // sanity checks
  if (result_size>size)
   {
     MARK; cerr << "(result_size>size)" << endl;
     exit(1);
   }
  if (kR<result_size)
   {
     MARK; cerr << "destination polynomial is too small!" << endl;
     exit(1);
   }

  // use R, if memory in R suffices for convolution;
  // otherwise uses temporary memory...
  const TPolynom p = (kR>=size && mpz_sizeinbase(R[0],2)>=estimated_memusage_in_bits) ? R : new mpz_t[size];

  // IMPORTANT!!! all coefficients of P1 and P2 must be
  // between 0 and N-1!
  // This is *very* important for DFFT!!

  if (p!=R) for (int i=0; i<size; ++i) mpz_init2(p[i],estimated_memusage_in_bits);
  for (int i=0; i<k1; ++i) mpz_set(p[i],P1[i]); // get first multiplicand

  // and this is done by mpz_init already :-)
  // for (int i=k1; i<size; ++i) mpz_set_ui(p[i],0); // padding with zeros


#if 0
  for (int i=0; i<k1; ++i) mpz_mod(p[i],p[i],N); // just to be on the safe side
#else
  {
   int j=0;
   for (int i=0; i<k1; ++i)
    {
     if ( mpz_sgn(p[i])<0 || mpz_sizeinbase(p[i],2)>ld_N )
      {
        ++j; mpz_mod(p[i],p[i],N); // just to be on the safe side
      }
    }
#if defined(VERBOSE)
   if (j) cout << "P1: " << j << " out of " << k1 << " coefficients corrected." << endl;
#endif
  }
#endif

  convolute(p,size); // do fft

  const TPolynom q = (P1==P2 && k1==k2) ? p : new mpz_t[size]; // for special case p*q = p^2
  if (p!=q)
   {
     // IMPORTANT!!! all input coefficients of P1 and P2 must be
     // between 0 and N-1!
     // This is *very* important for DFFT!!

     for (int i=0; i<size; ++i) mpz_init2(q[i],estimated_memusage_in_bits);
     for (int i=0; i<k2; ++i) mpz_set(q[i],P2[i]); // get second multiplicand
     // is done already by init2!! for (int i=k2; i<size; ++i) mpz_init(q[i]); // padding with zeros

#if 0
     for (int i=0; i<k2; ++i) mpz_mod(q[i],q[i],N); // just to be on the safe side
#else
     {
      int j=0;
      for (int i=0; i<k2; ++i)
       {
        if (mpz_sgn(q[i])<0 || mpz_sizeinbase(q[i],2)>ld_N )
         {
           ++j; mpz_mod(q[i],q[i],N); // just to be on the safe side
         }
       }
#if defined(VERBOSE)
      if (j) cout << "P2: " << j << " out of " << k2 << " coefficients corrected." << endl;
#endif
     }
#endif

      convolute(q,size); // do fft
    }

  // IMPORTANT: store result for last fft in p (to save memory space)
  for (int i=0; i<size; ++i)
   {
     mpz_mul(p[i],p[i],q[i]);
     mpz_mod(p[i],p[i],M);
   }
   // the result will be in p now!!!

  if (q!=p)
   {
     // we can delete the temporary polynomial q
     for (int i=0; i<size; ++i) mpz_clear(q[i]);
     delete [] q;
   }

  convolute(p,size); // do fft
  for (int i=1; i<size/2; ++i) mpz_swap(p[i],p[size-i]);

  int inv_index=0;
  for (int i=1; i<size; i<<=1) ++inv_index;

  for (int i=0; i<result_size; ++i)
   {
     mpz_mul(p[i],p[i],invpow2(inv_index));
     mpz_mod(p[i],p[i],M);
     if (reduce_result_modN) mpz_mod(p[i],p[i],N);
   }

  if (p!=R)
   {
     // copy result and release temporary polynomial
     for (int i=result_size-1; i>=0; --i) mpz_set(R[i],p[i]); // mpz_swap would be faster, but: memory fragmentation?
     if (size-result_size>10) cout << size-result_size << " computations saved..." << endl;
#ifdef DEBUG
     // sanity check
     for (int i=result_size; i<size; ++i)
      {
        mpz_mul(p[i],p[i],invpow2(inv_index));
        mpz_mod(p[i],p[i],M);
        if (mpz_cmp_ui(p[i],0)!=0)
         {
           MARK;
           cerr << "These values should be ZERO!" << endl;
         }
      }
#endif
     // release temporary polynomial
     for (int i=0; i<size; ++i) mpz_clear(p[i]);
     delete [] p;
   }
  //for (int i=result_size; i<kR; ++i) mpz_set_ui(R[i],0); // leading zeroes
  return result_size; // return size of result
}



// Computes R = P1*P2 modulo node.M.
// The product is computed twice - modulo the first and modulo the second
// modulus of the node (by recursion for an inner node, by the two dft
// objects for a leaf) - and both results are combined by chinese remaindering.
//
//  R, kR:  destination and its number of coefficients; the first k1+k2-1
//          coefficients receive the result.
//  P1, k1: first factor and its number of coefficients.
//  P2, k2: second factor and its number of coefficients;
//          P1==P2 is treated as squaring.
void CDFT_chinrem::recurse_dftmul(tnode &node,
               const TPolynom R, const int kR,
               const TconstPolynom P1, const int k1,
               const TconstPolynom P2, const int k2)
{
  const bool is_inner_node = (node.left!=NULL || node.right!=NULL);

  // private copies of the factors, reduced modulo node.M
  const TPolynom a = new mpz_t[k1];
  for (int i=0; i<k1; ++i)
   {
     mpz_init(a[i]);
     mpz_mod(a[i],P1[i],node.M);
   }
  const bool same_factor = (P1==P2);
  const TPolynom b = same_factor ? a : new mpz_t[k2];
  if (!same_factor)
   {
     for (int i=0; i<k2; ++i)
      {
        mpz_init(b[i]);
        mpz_mod(b[i],P2[i],node.M);
      }
   }

  // product modulo the first modulus -> R
  if (is_inner_node) recurse_dftmul(*node.left,R,kR,a,k1,b,k2);
  else node.first_dft->dftmul(R,kR,a,k1,b,k2);

  // product modulo the second modulus -> R2
  TTempPolynom R2(kR);
  if (is_inner_node) recurse_dftmul(*node.right,R2,kR,a,k1,b,k2);
  else node.second_dft->dftmul(R2,kR,a,k1,b,k2);

  // the copies are not needed anymore
  if (!same_factor)
   {
     for (int i=0; i<k2; ++i) mpz_clear(b[i]);
     delete [] b;
   }
  for (int i=0; i<k1; ++i) mpz_clear(a[i]);
  delete [] a;

  // chinese remaindering:
  //   R += ((R2-R) * first^(-1) mod second) * first
  const mpz_t &first_M  = is_inner_node ? node.left->M  : node.first_dft->M;
  const mpz_t &second_M = is_inner_node ? node.right->M : node.second_dft->M;
  const int result_size = k1+k2-1;

  mpz_t t;
  mpz_init(t);
  for (int i=0; i<result_size; ++i)
   {
     mpz_sub(t,R2[i],R[i]);
     mpz_mul(t,t,node.inv_first_M_mod_second_M);
     mpz_mod(t,t,second_M);
     mpz_addmul(R[i],t,first_M);
   }
  mpz_clear(t);
}


const int CDFT_chinrem::internal_mul(const TPolynom R, const int kR,
               const TconstPolynom P1, const int k1,
               const TconstPolynom P2, const int k2,
               const bool reduce_result_modN)
{
  const int result_size = k1+k2-1;
#if defined(VERBOSE_INFO)
  cout << "_mul (dft,chinrem): ";
  if (P1==P2) cout << "(SQUARE) ";
  cout << k1 << ", " << k2 << ", " << result_size << endl; 
#endif
  // sanity check
  if (kR<result_size)
   {
     MARK; cerr << "destination polynomial is too small!" << endl;
     exit(1);
   }

  // use R, if memory in R suffices for convolution;
  // otherwise uses temporary memory...
  const TPolynom p1 = new mpz_t[k1];

  // IMPORTANT!!! all coefficients of P1 and P2 must be
  // between 0 and N-1!
  // This is *very* important for DFFT!!

  for (int i=0; i<k1; ++i) mpz_init(p1[i]); // get first multiplicand
  for (int i=0; i<k1; ++i) mpz_mod(p1[i],P1[i],N); // just to be on the safe side

  const TPolynom p2 = (P1==P2 && k1==k2) ? p1 : new mpz_t[k2]; // special case p*q = p^2
  if (p1!=p2)
   {
     // IMPORTANT!!! all input coefficients of P1 and P2 must be
     // between 0 and N-1!
     // This is *very* important for DFFT!!

     for (int i=0; i<k2; ++i) mpz_init(p2[i]); // get second multiplicand
     for (int i=0; i<k2; ++i) mpz_mod(p2[i],P2[i],N); // just to be on the safe side
   }

  const int size = use_size(result_size);
  if (kR>=size)
   {
     recurse_dftmul(root_node,R,kR,p1,k1,p2,k2);
     if (reduce_result_modN)
      for (int i=0; i<result_size; ++i) mpz_mod(R[i],R[i],N);
   }
  else
   {
     TTempPolynom myR(size);
     recurse_dftmul(root_node,myR,size,p1,k1,p2,k2);
     if (reduce_result_modN)
      for (int i=0; i<result_size; ++i) mpz_mod(R[i],myR[i],N);
     else
      for (int i=0; i<result_size; ++i) mpz_set(R[i],myR[i]);
   }

  if (p1!=p2)
   {
     for (int i=0; i<k2; ++i) mpz_clear(p2[i]);
     delete [] p2;
   }
  for (int i=0; i<k1; ++i) mpz_clear(p1[i]);
  delete [] p1;

  //for (int i=result_size; i<kR; ++i) mpz_set_ui(R[i],0); // leading zeroes
  return result_size; // return size of result
}



// ------------------------------------------------------------------------



// the following lines are more specific for our application...

// TDFT selects the DFT implementation used below: exactly one of the
// two typedefs must be active.
//typedef CDFT TDFT;// dft without chinese remaindering
typedef CDFT_chinrem TDFT; // dft with chinese remaindering
typedef TDFT* PDFT; // pointer to an object of the selected implementation


#if 1
// Tuning heuristic: tells whether multiplying polynomials with k1 and k2
// coefficients should be done by dft.
// Recommended, if both factors have at least 14000 coefficients and
// either k1 is at most 16384 or both have more than 25000 coefficients.
inline bool dft_mul_is_recommended(const int k1, const int k2)
{
  // tune...
  const bool both_large_enough = (k1>=14000 && k2>=14000);
  const bool first_fits_16k    = (k1<=16384);
  const bool both_above_25000  = (k1>25000 && k2>25000);
  return both_large_enough && (first_fits_16k || both_above_25000);
}

// Tuning heuristic: tells whether squaring a polynomial with k
// coefficients should be done by dft (from 8192 coefficients on).
inline bool dft_square_is_recommended(const int k)
{
  // tune...
  const int threshold = 8192;
  return !(k<threshold);
}

#else

// variant with tiny thresholds (alternative thresholds: 512)
inline bool dft_mul_is_recommended(const int k1, const int k2)
{
  const int threshold = 8;
  return !(k1+k2<threshold);
}

// variant with tiny thresholds (alternative thresholds: 256)
inline bool dft_square_is_recommended(const int k)
{
  const int threshold = 4;
  return !(k<threshold);
}

#endif



// Returns the shared dft object for products of up to n coefficients
// and the number m; the object is created on demand and kept in a
// function-local static pointer.
//
//  n: required maximum size; n==0 is a request to release the object.
//  m: the number (N) the dft object has to be built for.
//
// The object is replaced, if it is too small for n or was built for a
// different m. The first object is created for at least 32768
// coefficients; a replacement is created for exactly n.
// Returns NULL, if no object exists after the call (n==0).
const PDFT get_dft(const unsigned int n, const mpz_t m)
{
 static PDFT pdft = NULL;

 if (pdft==NULL)
  {
    if (n<=0) return NULL; // nothing to release, nothing to create
    pdft = new TDFT(n>32768 ? n : 32768,m);
  }

 const bool too_small      = (n > pdft->max_size);             // resize is necessary!
 const bool other_modulus  = (mpz_cmp(pdft->get_N(),m)!=0);    // modulo-base has changed!
 const bool release_wanted = (n==0);                           // request to clear the temporary object

 if (!too_small && !other_modulus && !release_wanted) return pdft; // current object can be used

 delete pdft; pdft=NULL;
 if (n>0)
  {
    cout << "renewing dft-object..." << endl;
    pdft = new TDFT(n,m);
  }
 else
  {
    // if n<=0, then no new dft-object will be created...
    cout << "dft-object is released..." << endl;
  }
 return pdft;
}

void clear_dft_tempmemory()
{
  mpz_t x;
  mpz_init(x);
  get_dft(0,x); // trigger releasing of the DFT-object...
  mpz_clear(x);
}

} // namespace polynomial

