// polynomial arithmetic
// written by Thorsten Reinecke, Bernd Edler & Frank Heyder, 1999-12-10
// some changes: 1999-12-30
// quick review: 2003-07-02 by Thorsten Reinecke
// last change: 2005-03-30


/*! @file
 * @brief
 * contains implementation of polynomial arithmetic for multiple precision numbers
 *
 * references and related literature:
 *
 *  [1] Alfred V. Aho, John E. Hopcroft, Jeffrey D. Ullman: 
 *      "The Design and Analysis of Computer Algorithms",
 *      Addison-Wesley, 1974
 *   
 *  [2] Robert Sedgewick: "Algorithmen",
 *      Addison-Wesley, 1991
 *
 *  [3] Peter L. Montgomery: 
 *      "An FFT Extension of the Elliptic Curve Method of Factorization",
 *      (dissertation, University of California), 1992,
 *      ftp.cwi.nl/pub/pmontgom/ucladissertation.psl.gz
 */

   
#ifndef POLYNOMIAL_cc_
#define POLYNOMIAL_cc_

#include <cstdlib>
#include <iostream>
#include <new>
#include "utils.H"

#include "polynomial.H"


//! contains polynomial arithmetic concepts for mpz_t
namespace polynomial
{

//#define DEBUG    /* for explicit DEBUGGING-sessions */
//#define VERBOSE  /* be more verbose */

// this define should be triggered by Makefile
// #define USE_DFT /* enable use of discrete fourier transform */

using std::cout;
using std::cerr;
using std::endl;
using std::cin;


void print (const TconstPolynom P, int k)
// Writes the k coefficients of P to standard output as a sum of monomials,
// highest power first ("c*x^i + ... + c0").
// For k<=0 (no coefficients) a single "0" is written instead.
// The numbers go through mpz_out_str(stdout,...), the separators through cout.
{
  for (int power=k-1; power>0; --power)
   {
     mpz_out_str(stdout,10,P[power]);
     cout << "*x^" << power << " + ";
   }

  if (k>=1)
    mpz_out_str(stdout,10,P[0]); // constant term closes the sum
  else
    cout << "0";
}

void eval(mpz_t res, const TconstPolynom P, const int k, const mpz_t x, const mpz_t m)
// computes res = P(x) mod m  using the Horner scheme
//  P : coefficients P[0]..P[k-1] (P[i] belongs to x^i)
//  k : number of coefficients; for k<=0 the polynomial is treated as empty
//      and res is set to 0
//  res may be the same variable as x or m
{
  if (k<=0)
   {
     // empty polynomial: do not touch P[-1]
     mpz_set_ui(res,0);
     return;
   }

  mpz_t y; // temporary variable needed for &res=&x or &res=&m
  mpz_init_set(y,P[k-1]);
  for (int i=k-2; i>=0; --i)
   {
     mpz_mul(y,y,x); mpz_add(y,y,P[i]); mpz_mod(y,y,m);
   }
  // for k==1 the loop above never runs, so reduce here to keep the
  // documented "mod m" contract also for constant polynomials
  mpz_mod(y,y,m);
  mpz_set(res,y);
  mpz_clear(y);
}


// small helper template: truncated base-2 logarithm
template<typename T> inline T ld(T n)
{
  // counts how often n can be halved before it becomes zero,
  // not counting the last halving (ld(1)=0, ld(8)=3, ld(9)=3, ld(0)=0)
  T halvings = 0;
  for (n>>=1; n; n>>=1) ++halvings;
  return halvings;
}

} // namespace polynomial
// leave the namespace to include optional polynomial implementations
// possibly including header files in other namespaces...


#ifdef USE_DFT
 #include "dft.cc" // fast discrete fourier transformation
#endif


// enter the namespace again
namespace polynomial
{

int classic_mul(const TPolynom __restrict__ Pr, const int kr,
                const TconstPolynom P1, const int k1,
                const TconstPolynom P2, const int k2)
// Schoolbook product Pr = P1*P2 over the integers (no modular reduction).
// Pr must not overlap P1 or P2.
// kr is the number of coefficients available in Pr; all of them are cleared
// first. If kr is smaller than k1+k2-1 only a message is written to cerr.
// returns k1+k2-1, the number of coefficients of the product
// complexity: O(k1*k2)
{
  const int product_size = k1+k2-1;
  if (product_size>kr) cerr << "classic_mul: Not enough memory for result polynomial" << endl;

  // start from the zero polynomial ...
  for (int n=0; n<kr; ++n)
    mpz_set_ui(Pr[n],0);

  // ... and accumulate every pairwise coefficient product
  for (int a=0; a<k1; ++a)
   {
     for (int b=0; b<k2; ++b)
       mpz_addmul(Pr[a+b],P1[a],P2[b]);
   }

  return product_size;
}

int classic_mul(const TPolynom __restrict__ Pr, const int kr,
                const TconstPolynom P1, const int k1,
                const TconstPolynom P2, const int k2, const mpz_t m)
// Schoolbook product Pr = P1*P2 with all result coefficients reduced mod m.
// Pr must not overlap P1 or P2.
// kr is the number of coefficients available in Pr; all of them are cleared
// first. If kr is smaller than k1+k2-1 only a message is written to cerr.
// returns k1+k2-1, the number of coefficients of the product
// complexity: O(k1*k2)
{
  const int product_size = k1+k2-1;
  if (product_size>kr) cerr << "classic_mul: Not enough memory for result polynomial" << endl;

  for (int n=0; n<kr; ++n)
    mpz_set_ui(Pr[n],0);

  // accumulate the unreduced products first ...
  for (int a=0; a<k1; ++a)
   {
     for (int b=0; b<k2; ++b)
       mpz_addmul(Pr[a+b],P1[a],P2[b]);
   }

  // ... and reduce every coefficient once at the end
  // (the reduction would better be factored out)
  for (int n=0; n<product_size; ++n)
    mpz_mod(Pr[n],Pr[n],m);

  return product_size;
}


static int square_rek(const TPolynom R, const int kR,
                      const TconstPolynom P, const int k,
                      const TPolynom temp)
// R = P^2 (no modular reduction), &R must be different from &P
//  R, kR : result polynomial and the number of its coefficients that get
//          cleared before the partial results are written
//  P, k  : operand with k coefficients
//  temp  : scratch polynomial; the callers in this file pass 4*k entries
// returns the number of coefficients of the result (2*k-1, i.e. degree+1)
// R itself is used as temporary storage during the computation!
// complexity: O(k^1.59) -> Karatsuba
{
#if 1
  if (k==2) // manually optimized: (a0+a1*x)^2 = a0^2 + 2*a0*a1*x + a1^2*x^2
   {
     mpz_mul(R[0],P[0],P[0]);
     mpz_mul(R[1],P[0],P[1]); mpz_mul_2exp(R[1],R[1],1);
     mpz_mul(R[2],P[1],P[1]);
     return 3;
   }
#endif
  if (k==1) // this is the real base case for the recursion
   {
     mpz_mul(R[0],P[0],P[0]);
     return 1;
   }


  const int Middle=k/2; // Middle separates LO/HI for divide & conquer

  // R=P^2
  // P is split into PL=P[0]..P[Middle-1], PH=P[Middle]..P[k-1]
  //                 L->Lower part         H->Higher part

  const TconstPolynom PL=&P[0]; const int kPL=Middle;
  const TconstPolynom PH=&P[kPL]; const int kPH=k-Middle;

  // H1 = PL+PH, built in the (not yet needed) low part of R
  const int kH1=MAX(kPL,kPH);
  const TPolynom H1=&R[0];
  for (int i=0; i<kPL; ++i) mpz_add(H1[i],PH[i],PL[i]);
  for (int i=kPL; i<kH1; ++i) mpz_set(H1[i], PH[i]);

  // temp[0..kM) = (PL+PH)^2; the recursion gets the rest of temp as scratch
  int kM=2*kH1-1;
  const TPolynom M=&R[Middle]; // middle polynomial (window of R starting at x^Middle)
  const TPolynom temp2 = &temp[2*kH1-1];
  kM=square_rek(temp,kM,H1,kH1,temp2);

  // H1 is consumed now, so R can be cleared for the real result
  for (int i=kR-1; i>=0; --i) mpz_set_ui(R[i],0); // result polynomial: fill coefficients with zeros.

  // L = PL^2 goes to the low end of R
  int kL=kPL+kPL-1;
  const TPolynom L=R;
  kL=square_rek(L,kL,PL,kPL,temp2);

  // H = PH^2 goes to R starting at x^(2*Middle)
  int kH=kPH+kPH-1;
  const TPolynom H=&R[Middle*2];
  kH=square_rek(H,kH,PH,kPH,temp2);

  // temp = (PL+PH)^2 - PL^2 - PH^2 = 2*PL*PH, added into R at x^Middle
  for (int i=0; i<kL; ++i) { mpz_sub(temp[i], temp[i], L[i]); }
  for (int i=0; i<kH; ++i) { mpz_sub(temp[i], temp[i], H[i]); }
  for (int i=0; i<kM; ++i) { mpz_add(M[i], M[i], temp[i]); }

  return 2*k-1;
}


static int mul_rek(const TPolynom R, const int kR,
                   const TconstPolynom P1, const int k1, const TconstPolynom P2, const int k2,
                   const TPolynom temp)
// R = P1*P2 (no modular reduction), &R must be different from &P1,&P2
//  R, kR  : result polynomial and the number of its coefficients that get
//           cleared before the partial results are written
//  P1, k1 : first operand with k1 coefficients
//  P2, k2 : second operand with k2 coefficients
//  temp   : scratch polynomial; the callers in this file pass 4*max(k1,k2) entries
// returns the number of coefficients of the result (k1+k2-1 in the general case)
// R itself is used as temporary storage during the computation!
// complexity: O(max(k1,k2)^1.59) -> Karatsuba
{
  if (k1==2)
   {
    if (k2==2) // hand-optimized code
     {
       // (a0+a1*x)(b0+b1*x)=a0*b0+((a0+a1)(b0+b1)-a0*b0-a1*b1)*x+(a1*b1)*x^2
       mpz_add(temp[0],P1[0],P1[1]); mpz_add(temp[1],P2[0],P2[1]);
       mpz_mul(R[0],P1[0],P2[0]); mpz_mul(R[2],P1[1],P2[1]);
       mpz_mul(R[1],temp[0],temp[1]);
       mpz_sub(R[1],R[1],R[0]); mpz_sub(R[1],R[1],R[2]);
       return 3;
     }
    if (k2==3)
     {
       // toom-cook-like (without recursion)
       // result should be: P1*P2=R=w0+w1*x+w2*x^2+w3*x^3
       // (the product is evaluated at x=1 and x=-1; w0 and w3 are computed directly)

       mpz_add(temp[0],P1[0],P1[1]); mpz_add(temp[1],P2[0],P2[2]);
       mpz_add(temp[2],temp[1],P2[1]); 
       mpz_mul(R[1],temp[0],temp[2]); // R[1]=w0+w1+w2+w3
       mpz_sub(temp[1],temp[1],P2[1]); mpz_sub(temp[0],P1[0],P1[1]);
       mpz_mul(R[2],temp[0],temp[1]); // R[2]=w0-w1+w2-w3
       mpz_mul(R[0],P1[0],P2[0]); // R[0]=w0
       mpz_mul(R[3],P1[1],P2[2]); // R[3]=w3

       mpz_add(R[2],R[2],R[1]); // R[2]=2*(w0+w2);
       mpz_tdiv_q_2exp(R[2],R[2],1); // R[2]=w0+w2
       mpz_sub(R[1],R[1],R[2]); // R[1]=w1+w3
       mpz_sub(R[2],R[2],R[0]); // R[2]=w2
       mpz_sub(R[1],R[1],R[3]); // R[1]=w1

       return 4;
     }
#ifdef VERBOSE
    cout << __FUNCTION__ << ": no hand-optimized code for " << k1 << "," << k2 << "." << endl;
#endif
   } // k1==2

  // base cases of the recursion: at least one operand is a constant
  if (k1==1 || k2==1)
   {
    if (k1==1 && k2==1)
     {
       mpz_mul(R[0],P1[0],P2[0]);
       return 1;
     }
    if (k1==1)
     {
       // scale P2 by the constant P1[0]
       for (int i=0; i<k2; ++i) mpz_mul(R[i],P1[0],P2[i]);
       return k2;
     }
    if (k2==1)
     {
       // scale P1 by the constant P2[0]
       for (int i=0; i<k1; ++i) mpz_mul(R[i],P2[0],P1[i]);
       return k1;
     }
   } // k1==1 || k2==1


#ifdef VERBOSE
  if (k1<4 || k2<4)
   {
     if (k1==3 && k2==3) 
      cout << __FUNCTION__ << " : 3x3!" << endl;
     else
      cout << __FUNCTION__  << ": " << k1 << ", " << k2 << endl;
   }
#endif

  const int Middle=MIN(k1,k2)/2; // Middle separates LO/HI for divide & conquer
  // important: Middle must be lower than minimum!

  // R=P1*P2
  // P1 is split into P1L=P1[0]..P1[Middle-1], P1H=P1[Middle]..P1[k1-1]
  // P2 is split into P2L=P2[0]..P2[Middle-1], P2H=P2[Middle]..P2[k2-1]

  //if (k1!=k2) cout << "k1=" << k1 << " , " << "k2=" << k2 << endl;

  const TconstPolynom P1L=&P1[0]; const int kP1L=Middle;
  const TconstPolynom P1H=&P1[kP1L]; const int kP1H=k1-Middle;
  const TconstPolynom P2L=&P2[0]; const int kP2L=Middle;
  const TconstPolynom P2H=&P2[kP2L]; const int kP2H=k2-Middle;

  // H1 = P1L+P1H and H2 = P2L+P2H, built side by side in the
  // (not yet needed) result polynomial R
  const int kH1=MAX(kP1L,kP1H), kH2=MAX(kP2L,kP2H);
  const TPolynom H1=&R[0], H2=&R[kH1];
  for (int i=0; i<kP1L; ++i) mpz_add(H1[i],P1H[i],P1L[i]);
  for (int i=kP1L; i<kH1; ++i) mpz_set(H1[i], P1H[i]);
  for (int i=0; i<kP2L; ++i) mpz_add(H2[i],P2H[i],P2L[i]);
  for (int i=kP2L; i<kH2; ++i) mpz_set(H2[i], P2H[i]);

  // temp[0..kM) = H1*H2; the recursion gets the rest of temp as scratch
  int kM=kH1+kH2-1;
  const TPolynom M=&R[Middle]; // middle polynomial (window of R starting at x^Middle)
  const TPolynom temp2 = &temp[kH1+kH2-1];
  kM=mul_rek(temp,kM,H1,kH1,H2,kH2,temp2);

  // H1 and H2 are consumed now, so R can be cleared for the real result
  for (int i=kR-1; i>=0; --i) mpz_set_ui(R[i],0); // result polynomial: fill with zeroes.

  // L = P1L*P2L goes to the low end of R
  int kL=kP1L+kP2L-1;
  const TPolynom L=R;
  kL=mul_rek(L,kL,P1L,kP1L,P2L,kP2L,temp2);

  // H = P1H*P2H goes to R starting at x^(2*Middle)
  int kH=kP1H+kP2H-1;
  const TPolynom H=&R[Middle*2];
  kH=mul_rek(H,kH,P1H,kP1H,P2H,kP2H,temp2);

  // temp = H1*H2 - L - H (the mixed terms), added into R at x^Middle
  for (int i=0; i<kL; ++i) { mpz_sub(temp[i], temp[i], L[i]); }
  for (int i=0; i<kH; ++i) { mpz_sub(temp[i], temp[i], H[i]); }
  for (int i=0; i<kM; ++i) { mpz_add(M[i], M[i], temp[i]); }

  return k1+k2-1;
}


// forward declarations: mul(...,m) below dispatches to monic_mul() before
// its definition appears in this file; monic_square() is currently only
// referenced from commented-out code in square(...,m).
int monic_mul(const TPolynom R, const int kR,
              const TconstPolynom P1, int k1,
              const TconstPolynom P2, int k2, const mpz_t m);
int monic_square(const TPolynom R, const int kR,
                 const TconstPolynom P, int k, const mpz_t m);


int square(const TPolynom R, const int kR, const TconstPolynom P, const int k, const mpz_t m)
// R = P^2 with all coefficients reduced mod m, &R must be different from &P
//  kR : number of coefficients available in R (2*k-1 are needed; a too
//       small kR is only reported on cerr)
// returns the number of coefficients of the result
// complexity: O(k^1.59) -> Karatsuba (or the dft variant, if enabled and recommended)
{
  const int needed = 2*k-1;
  if (needed>kR)
   {
     MARK;
     cerr << "Not enough memory for target polynomial!" << endl;
     cerr << "-> k=" << k << ", kR=" << kR << endl;
   }

  // note: dispatching monic operands to monic_square() is disabled here

#ifdef USE_DFT
  if (dft_square_is_recommended(k))
    return get_dft(2*k-1,m)->squaremod(R,kR,P,k);
#endif

  // scratch space for the recursion; the coefficients are preallocated
  // large enough for unreduced products of residues mod m
  const int estimated_operand_size = 2*mpz_sizeinbase(m,2)+5;
  const int scratch_entries = 4*k;
  const TTempPolynom temp(scratch_entries,estimated_operand_size);

  const int result_size = square_rek(R,kR,P,k,temp);

  // normalize result polynomial
  for (int idx=0; idx<result_size; ++idx)
   {
     mpz_mod(R[idx],R[idx],m);
   }

  return result_size;
}


int square(const TPolynom R, const int kR, const TconstPolynom P, const int k)
// R = P^2 over the integers (no reduction), &R must be different from &P
//  kR : number of coefficients available in R (2*k-1 are needed; a too
//       small kR is only reported on cerr)
// returns the number of coefficients of the result
// complexity: O(k^1.59) -> Karatsuba
{
  const int needed = 2*k-1;
  if (needed>kR)
   {
     MARK;
     cerr << "Not enough memory for result polynomial!" << endl;
     cerr << "-> k=" << k << ", kR=" << kR << endl;
   }

  // The dft variant is deliberately not used here: it needs a valid
  // "modulo m" value, and deriving one from the coefficients of P was
  // judged to take too much time.

  const int scratch_entries = 4*k;
  const TTempPolynom temp(scratch_entries);

  return square_rek(R,kR,P,k,temp);
}


int mul(const TPolynom R, const int kR,
        TconstPolynom P1, int k1,
        TconstPolynom P2, int k2, const mpz_t m)
// R = P1*P2 with all coefficients reduced mod m,
// &R must be different from &P1,&P2
//  kR : number of coefficients available in R (k1+k2-1 are needed; a too
//       small kR is only reported on cerr)
// returns the number of coefficients of the result
// complexity: O(max(k1,k2)^1.59) -> Karatsuba
// resp. complexity: O((k1+k2)*ld(k1+k2)) -> using (optimal) dft
{
  const int product_size = k1+k2-1;

  if (product_size>kR)
   {
     MARK; cerr << "Not enough memory for result polynomial!" << endl;
     cerr << "-> k1=" << k1 << ", k2=" << k2 << ", kR=" << kR << endl;
   }

  // two monic operands with an odd-sized product: monic_mul performs better
  if ( (product_size&1)!=0 && mpz_cmp_ui(P1[k1-1],1)==0 && mpz_cmp_ui(P2[k2-1],1)==0 )
    return monic_mul(R,kR,P1,k1,P2,k2,m);

#ifdef USE_DFT
  if (dft_mul_is_recommended(k1,k2))
    return get_dft(k1+k2-1,m)->mulmod(R,kR,P1,k1,P2,k2);
#endif

  // order the operands so that P1 is the shorter one (k1<=k2)
  if (k1>k2)
   {
     std::swap(k1,k2);
     std::swap(P1,P2);
   }

  // very unbalanced sizes: the classical method is used instead of Karatsuba
  if (3*k1<2*k2)
   {
#if defined(VERBOSE)
     MARK;
     cout << "using classic mul for " << k1 << ", " << k2 << endl;
#endif
     return classic_mul(R,kR,P1,k1,P2,k2,m);
   }

  // scratch space for the recursion; the coefficients are preallocated
  // large enough for unreduced products of residues mod m
  const int estimated_operand_size = 2*mpz_sizeinbase(m,2)+5;
  const int scratch_entries = 4*k2;
  const TTempPolynom temp(scratch_entries,estimated_operand_size);

  const int result_size = mul_rek(R,kR,P1,k1,P2,k2,temp);

  // normalize result polynomial
  for (int idx=0; idx<result_size; ++idx)
   {
     mpz_mod(R[idx],R[idx],m);
   }

  return result_size;
}

int mul(const TPolynom R, const int kR,
        TconstPolynom P1, int k1,
        TconstPolynom P2, int k2)
// R = P1*P2 over the integers (no reduction),
// &R must be different from &P1,&P2
//  kR : number of coefficients available in R (k1+k2-1 are needed; a too
//       small kR is only reported on cerr)
// returns the number of coefficients of the result
// complexity: O(max(k1,k2)^1.59) -> Karatsuba
{
  if (k1+k2-1>kR)
   {
     cerr << "karamul: Not enough memory for result polynomial!" << endl;
     cerr << "-> k1=" << k1 << ", k2=" << k2 << ", kR=" << kR << endl;
   }

  // order the operands so that P1 is the shorter one (k1<=k2)
  if (k1>k2)
   {
     std::swap(k1,k2);
     std::swap(P1,P2);
   }

  // very unbalanced sizes: the classical method is used instead of Karatsuba
  if (3*k1<2*k2)
   {
#if defined(VERBOSE)
     MARK;
     cout << "using classic mul for " << k1 << ", " << k2 << endl;
#endif
     return classic_mul(R,kR,P1,k1,P2,k2);
   }

  const int scratch_entries = 4*k2;
  const TTempPolynom temp(scratch_entries);

  return mul_rek(R,kR,P1,k1,P2,k2,temp);
}


int monic_mul(const TPolynom R, const int kR,
              const TconstPolynom P1, int k1,
              const TconstPolynom P2, int k2, const mpz_t m)
// multiplies two polynomials, whose highest coefficients are P1[k1-1]=P2[k2-1]=1.
// monic polynomials, whose leading coefficients are zero, are also allowed.
// This should be somewhat faster than normal mul...
{
  if (kR<k1+k2-1)
   {
     cerr << "monic mul: Not enough memory for result polynomial!" << endl;
     cerr << "-> k1=" << k1 << ", k2=" << k2 << ", kR=" << kR << endl;
   }
  
  k1--; while (mpz_cmp_ui(P1[k1],0)==0) --k1;
  k2--; while (mpz_cmp_ui(P2[k2],0)==0) --k2;
  
  if (mpz_cmp_ui(P1[k1],1)!=0 || mpz_cmp_ui(P2[k2],1)!=0)
   {
     cerr << __FUNCTION__ << ": polynomial is not monic!" << endl;
     cout << "P1="; print(P1,k1+1); cout << endl;
     cout << "P2="; print(P2,k2+1); cout << endl;
     exit(1);
   }

#if 1
  // special case (easy and fast)
  if (k1==1 && k2==1)
   {
     mpz_mul(R[0],P1[0],P2[0]); mpz_mod(R[0],R[0],m);
     mpz_add(R[1],P1[0],P2[0]); mpz_mod(R[1],R[1],m);
     mpz_set_ui(R[2],1); 
     return 3;
   }
#endif


  /*
    (x^k1 +PR1)*(x^k2 +PR2) = X^(k1+k2) + x^(k1) *PR2 + x^(k2) *PR1 + PR1*PR2
    strategy:
     1. compute PR1*PR2
  */ 

#ifdef USE_DFT
  int ret;
  if (dft_mul_is_recommended(k1,k2))
   {
     if (mpz_cmp_ui(P1[k1-1],1)==0 && mpz_cmp_ui(P2[k2-1],1)==0) { MARK; cout << k1+k2-1 << endl; }
     ret=get_dft(k1+k2-1,m)->mul(R,kR,P1,k1,P2,k2);
   }
  else
   ret=mul(R,kR,P1,k1,P2,k2);
#else
  int ret=mul(R,kR,P1,k1,P2,k2); // multiply one-degree-decremented polynomials
#endif

  /*
     2. complete result by x^(k1+k2) (and insert zeroes, if necessary)
  */
  while (ret<k1+k2) mpz_set_ui(R[ret++],0);
  mpz_set_ui(R[ret++],1);

  /*
     3. add x^(k1) *PR2
     4. add x^(k2) *PR1
  */
  for (int i=0; i<k1; ++i) mpz_add(R[i+k2],R[i+k2],P1[i]);
  for (int i=0; i<k2; ++i) mpz_add(R[i+k1],R[i+k1],P2[i]);
  for (int i=0; i<ret-1; ++i) mpz_mod(R[i],R[i],m);
  return ret;
}

int monic_square(const TPolynom R, const int kR,
                 const TconstPolynom P, int k, const mpz_t m)
// R = P^2 mod m for a monic polynomial P (leading coefficient 1).
// P may be padded with leading zero coefficients above the monic
// coefficient; the padding is stripped first.
//  kR : number of coefficients available in R (2*k-1 are needed; a too
//       small kR is only reported on cerr)
// returns the number of coefficients of the result
// If P is not monic, a message is printed and the program is terminated.
// This should be somewhat faster than the normal square...
{
  if (2*k-1>kR)
   {
     cerr << "monic square: Not enough memory for result polynomial!" << endl;
     cerr << "-> k=" << k << ", kR=" << kR << endl;
   }

  // strip leading zeros; from here on k denotes the degree of P
  --k;
  while (k!=0 && mpz_cmp_ui(P[k],0)==0) --k;

  if (mpz_cmp_ui(P[k],1)!=0)
   {
     cerr << __FUNCTION__ << ": polynomial is not monic!" << endl;
     cout << "P="; print(P,k+1); cout << endl;
     exit(1);
   }

  // very special case: P=x^0, the square is 1 again
  if (k==0)
   {
     mpz_set_ui(R[0],1);
     return 1;
   }

  // special case (easy and fast): (x+a)^2 = x^2 + 2*a*x + a^2
  if (k==1)
   {
     mpz_mul(R[0],P[0],P[0]); mpz_mod(R[0],R[0],m);
     mpz_add(R[1],P[0],P[0]); mpz_mod(R[1],R[1],m);
     mpz_set_ui(R[2],1);
     return 3;
   }

  /*
    (x^k + PR)^2 = x^(2k) + 2*x^k*PR + PR^2
    step 1: compute PR^2, where PR consists of the k low coefficients of P
  */
  int size;
#ifdef USE_DFT
  if (dft_square_is_recommended(k))
    size=get_dft(2*k-1,m)->square(R,kR,P,k);
  else
    size=square(R,kR,P,k);
#else
  size=square(R,kR,P,k); // square the one-degree-decremented polynomial
#endif

  /*
    step 2: complete the result by x^(2k) (and insert zeroes, if necessary)
  */
  for (; size<2*k; ++size) mpz_set_ui(R[size],0);
  mpz_set_ui(R[size],1);
  ++size;

  /*
    step 3: add 2*x^k*PR
  */
  for (int idx=0; idx<k; ++idx)
    mpz_addmul_ui(R[k+idx],P[idx],2);

  // reduce everything except the leading 1
  for (int idx=0; idx<size-1; ++idx)
    mpz_mod(R[idx],R[idx],m);

  return size;
}


// Reciprocal of a polynomial f with np1 coefficients (degree n=np1-1),
// all arithmetic modulo m. reciprocal() calls this function for polynomials
// whose size is a power of two plus one.
//  R   : receives the result; must provide enough memory
//  kR  : is overwritten with the number of result coefficients
//  f   : input polynomial; f[n] must be invertible modulo m
//        (the return value of mpz_invert is not checked here)
//  np1 : number of coefficients of f
// NOTE(review): the iteration looks like the reciprocal algorithm for sizes
// 2^j+1 from the literature listed in the file header -- confirm before
// relying on details not visible in the code.
void reciprocal2p1(TPolynom R, int &kR, const TconstPolynom f, const int np1, const mpz_t m)
{
  const int n=np1-1;
  const TPolynom R1 = &R[1]; // R1 is R shifted by one coefficient; R[0] is filled in at the very end
  TTempPolynom R2(2*np1), H(2*np1); // scratch polynomials for the square and the product
  mpz_t e, fni, x;
  mpz_init(e); mpz_init(fni); mpz_init(x);

  // fni = inverse of the leading coefficient f[n] modulo m
  mpz_invert(fni,f[n],m); mpz_mod(fni,fni,m);
  kR=1; mpz_set(R1[0],fni); // start value: R1 = 1/f[n]
  mpz_neg(e,f[n-1]); mpz_mul(e,e,fni); mpz_mod(e,e,m); // e = -f[n-1]/f[n]

  // each round doubles the number of coefficients in R1 (k=2,4,8,... while k<=n)
  for (int k=2; k<=n; k*=2)
   {
     const int kR2=square(R2,2*np1,R1,kR,m); // R2=R1^2
//     if (kR2<k) { MARK; cerr << kR2 << "," << k << endl; }
     mul(H,2*np1,R2,kR2,&f[n-(k-1)],k,m); // H = R1^2 * (the k leading coefficients of f)
     // R1 = 2*R1*x^(k/2): move the old coefficients up and clear the low half
     for (int i=0; i<k/2; ++i) 
      {
        mpz_mul_2exp(R1[k/2+i],R1[i],1);
        mpz_set_ui(R1[i],0);
      }
     // ... and subtract the matching window of H
     for (int i=0; i<k; ++i) mpz_sub(R1[i],R1[i],H[i+k-2]);

     // update the scalar e, which is carried along separately from R1
     mpz_mul(e,e,e); mpz_mod(e,e,m);
     if (k==2)
      mpz_set_ui(x,0);
     else
      {
        mpz_mul(x,H[k-3],f[n]); mpz_mod(x,x,m);
      }
     mpz_sub(e,e,x);
     mpz_mul(x,f[n-k],fni); mpz_mod(x,x,m);
     mpz_sub(e,e,x); mpz_mod(e,e,m);
     kR=k;
   }
  
  // note: &R1 = &R[1], so R[0] is the coefficient directly below R1[0]
  mpz_mul(x,e,fni); mpz_mod(R[0],x,m);
  kR++; // account for R[0]

  mpz_clear(e); mpz_clear(fni); mpz_clear(x);
}


void reciprocal2(TPolynom R, int &kR, const TconstPolynom P, const int k, const mpz_t m)
{
  // Computes the reciprocal polynomial of P modulo m by recursing on the
  // upper half of P.
  //  - k must be a power of two!
  //  - R must provide enough memory; on entry kR tells how many coefficients
  //    of R are cleared, on return it holds the size of the result
  //  - P must not have any leading zeros!

  if (k==1)
   {
     // base case: invert the single coefficient
     const int invertible = mpz_invert(R[0],P[0],m);
     if (invertible==0)
      {
        cerr << __FUNCTION__ << ": inverse does not exist!" << endl;
        cout << "P="; print(P,k); cout << endl;
        char ch; cin >> ch; // wait for a key press before continuing
      }
     kR=1;
     return;
   }

  const int half = k>>1;

  // Q = reciprocal of the upper half of P
  const int mem_kQ=3*half;
  TTempPolynom Q(mem_kQ);
  int kQ=mem_kQ;
  reciprocal2(Q,kQ,&P[half],half,m);

  // QQ = Q^2
  const int mem_kQQ=2*kQ;
  TTempPolynom QQ(mem_kQQ);
  int kQQ=square(QQ,mem_kQQ,Q,kQ,m);

  // B = QQ(x)*P(x)
  const int mem_kB=kQQ+k;
  TTempPolynom B(mem_kB);
  int kB=mul(B,mem_kB,QQ,kQQ,P,k,m);

  // assemble the result: R = 2*Q*x^half - B/x^(k-2)
  for (int idx=0; idx<kR; ++idx) mpz_set_ui(R[idx],0); // fill with zeroes

  for (int idx=0; idx<kQ; ++idx)
    mpz_mul_ui(R[half+idx],Q[idx],2);

  const int shift = k-2;
  for (int idx=shift; idx<kB; ++idx)
    mpz_sub(R[idx-shift],R[idx-shift],B[idx]);

  // the result size is the larger of both contributions ...
  const int upper_from_Q = half+kQ;
  const int upper_from_B = kB-shift;
  kR = (upper_from_Q>=upper_from_B) ? upper_from_Q : upper_from_B;

  // ... minus leading zeros (at least one coefficient is kept)
  --kR;
  while (kR>0 && mpz_cmp_ui(R[kR],0)==0) --kR;
  ++kR;

  for (int idx=0; idx<kR; ++idx)
    mpz_mod(R[idx],R[idx],m);
}

void reciprocal(TPolynom R, int &kR, const TconstPolynom P, const int k, const mpz_t m, const unsigned int scale /* =0 */)
{
  // computes the reciprocal polynomial of P modulo m
  //  R, kR : result polynomial; R must provide enough memory!
  //          kR is overwritten with the size of the result
  //  P, k  : input polynomial with k coefficients
  //  scale : the given polynomial will be scaled using x^scale as multiplier,
  //          thereby shifting the coefficients by scale places
  //
  // Exactly one of three strategies is used:
  //  - k is a power of two (and scale==0):        reciprocal2
  //  - k is a power of two plus one (scale==0):   reciprocal2p1
  //  - otherwise: pad P with low-order zeros up to a power of two, call
  //    reciprocal2 and drop the surplus low-order coefficients again

  int z=1, h=k+scale;
  while (z<h) z<<=1; // z = smallest power of two that is >= k+scale
  
  int d=z-h; // number of places missing up to the power of two
  if (d==0 && scale==0) 
   {
     // power of two
#if defined(VERBOSE)
     cout << "calling recip with power of 2" << endl;
#endif
     reciprocal2(R,kR,P,k,m);
   }
  else if (k==d+2 && scale==0)
   {
     // power of two plus 1
     // (the "else" matters: without it the power-of-two case above would
     //  continue here and compute the reciprocal a second time)
#if defined(VERBOSE)
     cout << "calling recip with 1+power of 2" << endl;
#endif
     reciprocal2p1(R,kR,P,k,m);
   }
  else
   {
     // not a power of two (or scale!=0)
     d+=scale;
#if defined(VERBOSE)
     if (scale) cout << "scaling polynomial!" << endl;
     else cout << "no power of 2: " << k << ", " << z << ", " << d << endl;
#endif
     // allocate temporary memory & call reciprocal (using a displacement)
     const int kH=z; TTempPolynom H(kH);
     for (int i=0; i<k; ++i) mpz_set(H[i+d],P[i]);
     int kR2=2*kH; TTempPolynom R2(kR2);
#ifdef VERBOSE
     cout << __FUNCTION__ << ": calling recip" << endl;
#endif
     reciprocal2(R2,kR2,H,kH,m);
#ifdef VERBOSE
     cout << __FUNCTION__ << ": ...back (of calling recip)" << endl;
#endif
#ifdef DEBUG
     cout << "->"; print(R2,kR2); cout << endl;
#endif
     // only the padding up to the power of two is removed again,
     // the scaling itself stays in the result
     d-=scale;
     if (kR<kR2-d) cerr << __FUNCTION__ << ": too little memory in target polynomial!" << endl;
     kR=kR2-d;
     for (int i=0; i<kR; ++i) mpz_set_ui(R[i],0);
     for (int i=d; i<kR2; ++i) mpz_set(R[i-d],R2[i]);
   }
}


void classic_div(TPolynom Q, int &kQ, TPolynom R, int &kR,
         const TconstPolynom P1, int k1,
         const TconstPolynom P2, int k2, const mpz_t m)
{
  // classical polynomial division modulo m, O(n^2)
  // returns P1/P2 in Q
  // returns P1 mod P2 in R
  //  Q, kQ : quotient; on entry kQ is the number of coefficients of Q that
  //          get cleared, on return the size of the quotient (0 if the
  //          degree of P1 is lower than the degree of P2)
  //  R, kR : remainder; R is used as working copy of P1 and must therefore
  //          provide at least k1 coefficients; on return kR is the size of
  //          the remainder without leading zeros
  //  P2    : divisor; its leading nonzero coefficient must be invertible
  //          modulo m, otherwise the program is terminated

  for (int i=0; i<kQ; ++i) mpz_set_ui(Q[i],0);

  // R starts as a copy of the dividend. Exactly the k1 coefficients of P1
  // are copied (not kR of them): copying kR coefficients would read beyond
  // P1 for kR>k1 and leave R partly uninitialized for kR<k1.
  if (kR<k1) cerr << "classic_div: not enough memory for remainder polynomial!" << endl;
  for (int i=0; i<k1; ++i) mpz_set(R[i],P1[i]);
  kR=k1; 
  mpz_t inv,x,y;
  mpz_init(inv); mpz_init(x); mpz_init(y);

  if (k2==0)
    {
      cout << endl;
      MARK; cerr << "polynomial division: Division by zero (empty polynomial)!" << endl;
      exit(1);
    }
  // from here on kR, k1 and k2 are used as degrees (highest index)
  kR--; k2--; k1--;
  while (k2>=0  && (mpz_cmp_ui(P2[k2],0))==0) k2--; // skip leading zeros of the divisor
  if (k2<0)
    {
      cout << endl;
      MARK; cerr << "polynomial division: Division by zero (empty polynomial)!" << endl;
      exit(1);
    }
  // inv = inverse of the leading coefficient of the divisor
  if (mpz_invert(inv,P2[k2],m)==0)
   {
     cout << "--->";
     print(P2,k2+1);
     cout << endl;
     MARK; cerr << "polynomial division: Inverse doesn't exist!" << endl;
     exit(1);
   }
  if (k1<0) // dividing the empty polynomial -> 0
    {
      MARK; cout << "dividend polynomial is empty." << endl;
      kQ=kR=0;    
      goto done;
    }

  // eliminate the leading coefficient of the remainder step by step
  while (kR>=k2)
   {
    const int degree_difference=kR-k2;

    // scalar = R[kR]/P2[k2]
    mpz_mul(x,inv,R[kR]); mpz_mod(x,x,m);
    mpz_set(Q[degree_difference],x);
    mpz_set_ui(R[kR],0);
    kR--;
    // subtract scalar * x^degree_difference * (P2 without its leading term)
    for (int i=k2-1; i>=0; --i )
     {
       int j = kR+i-(k2-1);
       mpz_mul(y,P2[i],x);
       mpz_sub(y,R[j],y);
       mpz_mod(R[j],y,m);
     } 
   }

  while (kR>=0 && mpz_cmp_ui(R[kR],0)==0) kR--; // normalize remainder

  // the quotient is empty if the dividend has a lower degree than the
  // divisor (k1-k2+1 would be zero or negative in that case)
  kQ = (k1>=k2) ? k1-k2+1 : 0;
  kR++;
 done:
  mpz_clear(inv); mpz_clear(x); mpz_clear(y);
#if 1
  cout << "polynomdivision Result:" << endl;
  cout << "A(x) = "; print(P1,k1+1); cout << endl;
  cout << "B(x) = "; print(P2,k2+1); cout << endl;
  cout << "Q(x) = "; print(Q,kQ); cout << endl;
  cout << "R(x) = "; print(R,kR); cout << endl;
#endif
}


void classic_mod(TPolynom R, int &kR,
            const TconstPolynom P1, int k1,
            const TconstPolynom P2, int k2, const mpz_t m)
// Remainder of polynomial division modulo m, O(n^2)
// returns P1 mod P2 in R
//  R, kR : remainder; R is used as working copy of P1 and must therefore
//          provide at least k1 coefficients; on return kR is the size of
//          the remainder without leading zeros
//  P2    : divisor; its leading nonzero coefficient must be invertible
//          modulo m, otherwise the program is terminated
{
#ifdef DEBUG
  cout << "POLMOD IN (old)" << endl;
#endif
  // R starts as a copy of the dividend. Exactly the k1 coefficients of P1
  // are copied (not kR of them): copying kR coefficients would read beyond
  // P1 for kR>k1 and leave R partly uninitialized for kR<k1.
  if (kR<k1) cerr << "classic_mod: not enough memory for remainder polynomial!" << endl;
  for (int i=0; i<k1; ++i) mpz_set(R[i],P1[i]);
  kR=k1; 
  /* one could minimize memory consumption in R by
      - precomputing the result size
      - using a ring buffer
        (taking care, that the coefficients are at the right place
         when leaving the loop)
     -> isn't implemented (yet)
  */

  mpz_t inv,x;
  mpz_init(inv); mpz_init(x);

  if (k2==0)
    {
      cout << endl;
      cerr << "polynomial division: division by zero polynomial!" << endl;
      exit(1);
    }
  // from here on kR, k1 and k2 are used as degrees (highest index)
  kR--; k2--; k1--;
  while (k2>=0 && (mpz_cmp_ui(P2[k2],0))==0) k2--; // skip leading zeros of the divisor
  if (k2<0)
    {
      cout << endl;
      cerr << "polynomial division: division by  zero polynomial! (-1)" << endl;
      exit(1);
    }
  // inv = inverse of the leading coefficient of the divisor
  if (mpz_invert(inv,P2[k2],m)==0)
   {
     cout << "--->";
     print(P2,k2+1);
     cout << endl;
     cerr << "polynomial division: inverse does not exist!" << endl;
     exit(1);
   }
  if (k1<0) // dividend is 0 (zero polynomial)
   {
     cout << "Division using zero-polynomial as dividend!" << endl;
     kR=0;    
     goto done;
   }

  // eliminate the leading coefficient of the remainder step by step;
  // only the leading coefficient is reduced inside the loop, all other
  // coefficients are reduced once afterwards
  while (kR>=k2)
   {
    // scalar = R[kR]/P2[k2]
    mpz_mod(R[kR],R[kR],m); mpz_mul(x,inv,R[kR]); mpz_mod(x,x,m);
    mpz_set_ui(R[kR],0);
    for (int i=k2-1; i>=0; --i)
     {
       mpz_submul(R[kR-k2+i],P2[i],x);
     }
    --kR;
   }
  // now normalize the result...
  for (int i=kR; i>=0; --i)
    mpz_mod(R[i],R[i],m);
  while (kR>=0 && mpz_cmp_ui(R[kR],0)==0) kR--;
  kR++;
 done:
  mpz_clear(inv); mpz_clear(x);
#ifdef DEBUG
  cout << "polynomial division (MODULO) Result:" << endl;
  cout << "A(x) = "; print(P1,k1+1); cout << endl;
  cout << "B(x) = "; print(P2,k2+1); cout << endl;
  cout << "R(x) = "; print(R,kR); cout << endl;
  cout << "POLMOD OUT (old)" << endl;
#endif
}


void div(TPolynom Q, int &kQ,
         const TconstPolynom P1, const int k1,
         const TconstPolynom P2, const int k2, const mpz_t m)
{
  // (fast) polynomial division modulo m: Q = P1/P2
  /* method:
     The reciprocal polynomial of P2 is computed and multiplied with P1;
     the (k1-k2+1) leading coefficients of that product are the quotient.
     Asymptotically this should beat any other method, since the cost of
     the reciprocal is (essentially) determined by the cost of the
     multiplication algorithm (Karatsuba, etc.).

     Q, kQ : quotient; on entry kQ is the number of coefficients available
             in Q, on return the size of the quotient
     P1,P2 : dividend and divisor; leading zeros terminate the program
  */

#ifdef DEBUG
  cout << "div-in" << " k1=" << k1 << ", k2=" << k2 << endl;
  cout << "div:P1="; print(P1,k1); cout << endl;
  cout << "div:P2="; print(P2,k2); cout << endl;
#endif

  const int quotient_size = k1-k2+1;

  if (quotient_size>kQ) cerr << "div: Zielpolynom zu klein!" << endl;
  if (k2<1) cerr << "Warnung: Null-Polynome bei Division!" << endl;
  if ( (k1>0 && mpz_cmp_ui(P1[k1-1],0)==0) ||
       (k2>0 && mpz_cmp_ui(P2[k2-1],0)==0) )
    {
      cerr << "div: leading zeroes! (not wanted!)" << endl;
      exit(1);
    }

  if (k1==k2)
   {
     // equal sizes: the quotient is the constant P1[k1-1]/P2[k2-1]
     cout << "div: Grad P1=P2 -> easy-div" << endl;
     if (kQ<1) cerr << "Speicherproblem!" << endl;
     mpz_invert(Q[0],P2[k2-1],m);
     mpz_mul(Q[0],Q[0],P1[k1-1]);
     mpz_mod(Q[0],Q[0],m);
     kQ = (mpz_cmp_ui(Q[0],0)!=0) ? 1 : 0;
     return;
   }

  // the reciprocal has to be scaled if the dividend is at least twice as
  // long as the divisor
  int scale=0;
  if (2*k2<=k1)
   {
#ifdef VERBOSE
      cout << __FUNCTION__ << ": problems with reciprocal polynomial: k1>=2*k2!"
           << " k1,k2=" << k1 << "," << k2 << endl;
#endif
      scale=k1-k2-1;
   }

  // helper polynomial RP2 = reciprocal of P2
  // -> RP2(x) = x^(2n-2) / P2(x)
  const int mem_kRP2=k2+scale;
  TTempPolynom RP2(mem_kRP2);
  int kRP2 = mem_kRP2;
  reciprocal(RP2,kRP2,P2,k2,m,scale);

  // Only the leading coefficients of the product are needed later on, so
  // only the leading quotient_size coefficients of both factors are
  // multiplied.
  const int startP1=k1-quotient_size;
  const int startRP2=kRP2-quotient_size;
#ifdef VERBOSE
  cout << "div-shortcut: " << startP1 << ", " << startRP2 << endl;
#endif
  const int lenP1=k1-startP1;
  const int lenRP2=kRP2-startRP2;
  const int mem_kH=lenP1+lenRP2-1;
  TTempPolynom H(mem_kH);

  // -> H(x) = (leading part of P1)(x) * (leading part of RP2)(x)
  const int kH=mul(H,mem_kH,&P1[startP1],lenP1,&RP2[startRP2],lenRP2,m);

  // the leading k1-k2+1 coefficients of H are the result
  const int first=kH-quotient_size;
  for (int idx=0; idx<quotient_size; ++idx)
    mpz_set(Q[idx],H[first+idx]);

  // normalize: drop leading zeros, then convert degree to size
  kQ=quotient_size-1;
  while (kQ>=0 && mpz_cmp_ui(Q[kQ],0)==0) --kQ;
  ++kQ;

#ifdef DEBUG
  cout << "div-out" << endl;
#endif
}


void mod(TPolynom R, int &kR,
         const TconstPolynom P1, int k1,
         const TconstPolynom P2, int k2, const mpz_t m)
// polynomial remainder modulo m, fast(er) method
// returns P1 modulo P2 in R
//  R, kR : remainder; on entry kR is the number of coefficients available
//          in R, on return the size of the remainder
/* Small operands are handed over to classic_mod. For large operands the
   quotient is computed with the fast division and the remainder is obtained
   as P1-Q*P2, which is asymptotically faster than the "naive modulo"
   because fast multiplication algorithms can be used. */
{

#ifdef DEBUG
  cout << "POLMOD IN (new)" << endl;
  cout << "k1=" << k1 << ", k2=" << k2 << ", kR=" << kR << endl;
#endif

  if (k2<1) cerr << "Warning: Null-Polynomials in mod!" << endl;

#ifdef VERBOSE
  if ( (k1>0 && mpz_cmp_ui(P1[k1-1],0)==0) ||
       (k2>0 && mpz_cmp_ui(P2[k2-1],0)==0) )
   {
     cerr << __FUNCTION__ << ": Unwanted leading zeros!" << "(for " <<  k1 << "x" << k2 << ")" <<endl;
   }
#endif

  // normalize both operands: drop leading zero coefficients
  while (k1>0 && mpz_cmp_ui(P1[k1-1],0)==0) --k1;
  while (k2>0 && mpz_cmp_ui(P2[k2-1],0)==0) --k2;

#ifdef DEBUG
  cout << "k1=" << k1 << ", k2=" << k2 << ", kR=" << kR << endl;
  cout << "mod:P1="; print(P1,k1); cout << endl;
  cout << "mod:P2="; print(P2,k2); cout << endl;
#endif

  if (k1<k2)
   {
     // trivial case: P1 is already reduced, the remainder is P1 itself
     cout << "triviales mod:  P1 mod P2 -> P1  fr P2>P1" << endl;
     if (kR<k1) cerr << "Too little memory in target polynomial!" << endl;
     for (int idx=0; idx<k1; ++idx) mpz_set(R[idx],P1[idx]);
     kR=k1;
     return;
   }

  /* speedup analogous to quicksort: near the leaves a method that is
     asymptotically worse, but faster for small sizes, is used. */
  if (k1<150 || k2<16)
   {
     classic_mod(R,kR,P1,k1,P2,k2,m);
     return;
   }

  // determine the quotient polynomial Q
  const int mem_kQ=k1-k2+1;
  TTempPolynom Q(mem_kQ);
  int kQ=mem_kQ;
  div(Q,kQ,P1,k1,P2,k2,m); // important: using fast division!

  // compute P1-Q*P2
  const int mem_kM=kQ+k2-1;
  TTempPolynom M(mem_kM);
  if (kQ>k2)
   {
#ifdef VERBOSE_INFO
     cout << "shortcut possible! kQ=" << kQ << ", k2=" << k2 << endl;
#endif
     // only the low coefficients of the product are needed below
     kQ=k2;
   }

  mul(M,mem_kM,Q,kQ,P2,k2,m); // important: using fast multiplication!

  // shortcut: polynomial remainder can have at most k2-1 coefficients
  {
    // skip the leading positions in which P1 and the product agree
    int size=k2-1;
    while (size>0 && mpz_cmp(P1[size-1],M[size-1])==0) --size;
    if (size>kR) cerr << "mod: not enough memory in target polynomial!" << endl;
    kR=size;
    for (int idx=size-1; idx>=0; --idx)
     {
       mpz_sub(R[idx],P1[idx],M[idx]);
       mpz_mod(R[idx],R[idx],m);
     }
  }

#ifdef DEBUG
  cout << "polynomial division (fast MODULO) Result:" << endl;
  cout << "A(x) = "; print(P1,k1); cout << endl;
  cout << "B(x) = "; print(P2,k2); cout << endl;
  cout << "R(x) = "; print(R,kR); cout << endl;
  cout << "POLMOD OUT (new)" << endl;
#endif
}


static void multipoint_eval_rek(const TPolynom* R, const TconstPolynom P, const int k, TPolynom* A, const int h,
			 const mpz_t m, mpz_t* &res, int* const pos, const int* const step, const int* const stop)
// this function is called by multipoint_eval
{
#ifdef DEBUG
  cout << "recursive call for h=" << h << endl;
#endif
  if (h==0) // leaf entered -> evaluate!
    {
      // direkte Auswertung des Blattes: (a*x+b) mod (x+c) = b-a*c
      if (k==1)
         mpz_mod(res[0],P[0],m);
      else
       {
         mpz_mul(res[0],P[1],A[h][pos[h]]);
         mpz_sub(res[0],P[0],res[0]);
         mpz_mod(res[0],res[0],m);
       }
      res++;
    }
  else
    {
      int kR=k;
      mod(&R[h][0],kR,P,k,&A[h][pos[h]],step[h],m);
      multipoint_eval_rek(R,R[h],kR, A,h-1, m,res,pos,step,stop);
      if (pos[h-1]<stop[h-1])
	multipoint_eval_rek(R,R[h],kR, A,h-1, m,res,pos,step,stop);
    }
  pos[h]+=step[h];
} 


// Evaluates the polynomial P (k coefficients, P[i] belongs to x^i) at the
// "size" points given in array_of_arguments, modulo m.
// The value for array_of_arguments[i] is written to res[i]; res must therefore
// provide at least "size" entries (they are written with mpz_mod/mpz_sub, so
// they are expected to be initialized already).
//
// Method: a tree of products of the monic linear polynomials (x - argument)
// is built bottom-up; afterwards P is reduced recursively along this tree
// (see multipoint_eval_rek) until the leaves are reached.
//
// Preconditions: k>=3, size>=3 and size<=k-1; otherwise an error message is
// printed and the program is terminated with exit(1).
void multipoint_eval(mpz_t* res,
                     const TconstPolynom P, const int k, 
                     const mpz_t* const array_of_arguments, const int size,
                     const mpz_t m)
{
  if ( k<3 || size<3 || size>k-1 )
   {
     cerr << "multipoint_eval: invalid parameters!" << endl;
     exit(1);
   }

  // number of tree levels for which bookkeeping arrays are provided
  const int MaxHoehe = ld(size)+1;

  TPolynom* const A = new TPolynom[MaxHoehe];
  int* const pos       = new int[MaxHoehe]; // current offset inside A[h] during recursion
  int* const steparray = new int[MaxHoehe]; // coefficients per polynomial at height h
  int* const stop      = new int[MaxHoehe]; // total number of coefficients in A[h]

  for (int i=0; i<MaxHoehe; ++i) pos[i]=steparray[i]=stop[i]=0; // initialize

  // for optimizing mpz-memory-allocation & fragmentation we will use mpz_init2
  // therefore we need to estimate our operand size (wrong estimate cause no harm
  // except for triggering more unnecessary reallocations)
  const unsigned int estimated_operand_size = 2*mpz_sizeinbase(m,2)+5;

  /*
    A[h][s] is the array of the polynomials at height h (h=0 -> leafs=monic
    polynomials of degree 1)
  */

  int h = 0, anz=size, step=2, s=anz*step;
  steparray[h]=step; stop[h]=s;

#ifdef VERBOSE
  cout << "calculating polynomials for depth " << h << endl;
#endif
  A[h] = new mpz_t[s]; // allocate space for polynomials in depth h
  for (int i=0, j=0; i<anz; ++i, j+=step)
    {
      // create monic polynomials of degree 1
      mpz_init_set_ui(A[h][j+1],1); // 1*x
      mpz_init_set(A[h][j],array_of_arguments[i]); mpz_neg(A[h][j],A[h][j]);
    }

  // Build the next level as long as more than two polynomials remain;
  // for k==2*size+1 one additional level is built
  // (handle this special case efficiently).
  while ( anz>2 || (anz==2 && k==2*size+1) )
    {
      ++h;
      if (h>=MaxHoehe)
       {
         // report the location on the same stream as the message itself
         cerr << __FILE__ << ", " << __FUNCTION__ << ": line " <<  __LINE__ << endl;
         cerr << "too many iteration steps..." << endl;
         cerr << "please increase MaxHoehe and recompile!" << endl;
         exit(1);
       }

      const int oldstep = step;
      const int rest = anz % 2; // 1 if one polynomial of the previous level stays unpaired
      steparray[h]=step=step*2-1; anz=(anz/2); stop[h]=s=(anz+rest)*step;
#ifdef VERBOSE
      cout << "calculating polynomials for depth " << h << endl;
#endif
      A[h] = new mpz_t[s]; // allocate space for the polynomials at height h
      for (int j=0; j<s; ++j) mpz_init2(A[h][j],estimated_operand_size); // initialize the mpz_t values

      // multiply neighbouring pairs of the previous level
      for (int i=0, j=0; i<anz; ++i, j+=step)
        {
          monic_mul(&A[h][j],step,
              &A[h-1][2*i*oldstep],oldstep, &A[h-1][(2*i+1)*oldstep],oldstep,m);
        }
      if (rest) 
        {
          // the unpaired last polynomial is copied unchanged into the last slot;
          // the remaining coefficients of that slot keep their initial value
          for (int i=0; i<oldstep; i++)
            mpz_set(A[h][s-step+i],A[h-1][stop[h-1]-oldstep+i]);
        }
      anz+=rest;
    }

  //cout << "READY for recursion..." << endl;

  // now the particular polynomials are ready for polynomial division
  TPolynom* const R = new TPolynom[MaxHoehe]; // polynomial remainder
  for (int i=0; i<MaxHoehe; ++i)
    {
      R[i]=new mpz_t[k];
      for (int j=0; j<k; ++j) mpz_init2(R[i][j],estimated_operand_size);
    }

  // one recursive descent per polynomial of the topmost level
  for (int i=0; i<anz; i++) multipoint_eval_rek(R,P,k,A,h,m,res,pos,steparray,stop);

  // release the remainder buffers
  for (int i=MaxHoehe-1; i>=0; --i)
    {
      for (int j=k-1; j>=0; --j) mpz_clear(R[i][j]);
      delete [] R[i];
    }
  delete [] R;

  // release the levels of the product tree that were actually built
  while (h>=0)
    {
      for (int i=stop[h]-1; i>=0; --i) mpz_clear(A[h][i]);
      delete [] A[h]; // release memory
      --h;
    }
  delete [] pos;
  delete [] steparray;
  delete [] stop;
  delete [] A;
}


int construct_polynomial_from_roots
      (TPolynom &res,
       const mpz_t* const roots_array, const int size,
       const mpz_t m)
{
  // Builds a newly allocated polynomial from the "size" zeros given in
  // "roots_array" (coefficients are combined via monic_mul using modulus m),
  // stores it in "res" and returns its number of coefficients.
  //
  // "res" is a reference parameter and has to be NULL on entry: the function
  // allocates the result itself, and a non-NULL value would indicate that the
  // caller is about to lose a pointer to existing data. Using a reference
  // parameter (instead of returning the pointer) urges the caller to provide
  // a container that is responsible for the data.
  //
  // At least three roots are required; violated preconditions terminate the
  // program with exit(1).

  if (res!=NULL)
   {
     cerr << __FILE__ << ", " << __FUNCTION__ << ": line " <<  __LINE__ << endl;
     cerr << "First parameter is a call by reference," << endl
          << "it should initially point to NULL (to avoid memory-leaks)," << endl
          << "because a new pointer to new data will be generated and" << endl
          << "there is no need for initially data pointed by res!" << endl;
     exit(1);
   }

  if (size<=2)
   {
     cerr << __FILE__ << ", " << __FUNCTION__ << ": line " <<  __LINE__ << endl;
     cerr << "Invalid parameters!" << endl;
     exit(1);
   }

  // mpz_init2 is used to reduce reallocations and fragmentation; the operand
  // size is only an estimate (a wrong estimate merely causes reallocations)
  const unsigned int estimated_operand_size = 2*mpz_sizeinbase(m,2)+5;

  int depth = 0;            // current level (0 = leafs); only reported in VERBOSE mode
  int count = size;         // number of polynomials on the current level
  int width = 2;            // coefficients per polynomial on the current level
  int total = count*width;  // coefficients stored on the current level

#ifdef VERBOSE
  cout << "calculating polynomials for depth " << depth << endl;
#endif

  // level 0: one monic polynomial of degree 1 per root, i.e. x - root
  TPolynom current = new mpz_t[total];
  for (int i=0; i<count; ++i)
    {
      mpz_t* const leaf = &current[i*width];
      mpz_init_set(leaf[0],roots_array[i]); mpz_neg(leaf[0],leaf[0]); // -root
      mpz_init_set_ui(leaf[1],1);                                     // 1*x
    }

  // combine neighbouring polynomials level by level until one is left
  while (count>1)
    {
      ++depth;
      TPolynom previous = current; // polynomials of the level below
      const int prev_width = width;
      const int prev_total = total;
      const int pairs = count/2;
      const bool unpaired = (count%2)!=0;

      width = 2*prev_width-1;
      count = unpaired ? pairs+1 : pairs;
      total = count*width;
#ifdef VERBOSE
      cout << "calculating polynomials for depth " << depth << endl;
#endif

      current = new mpz_t[total]; // storage for the polynomials of this level
      for (int j=0; j<total; ++j) mpz_init2(current[j],estimated_operand_size);

      for (int i=0; i<pairs; ++i)
        {
          monic_mul(&current[i*width],width,
              &previous[2*i*prev_width],prev_width,
              &previous[(2*i+1)*prev_width],prev_width,m);
        }

      if (unpaired)
        {
          // the last polynomial has no partner: carry it over unchanged
          const int dst = total-width;
          const int src = prev_total-prev_width;
          for (int i=0; i<prev_width; ++i) mpz_set(current[dst+i],previous[src+i]);
        }

      // the level below is not needed anymore
      for (int i=0; i<prev_total; ++i) mpz_clear(previous[i]);
      delete [] previous;
    }

  // normalize: drop (and release) leading coefficients that are zero
  int used = total;
  while (used>0 && mpz_cmp_ui(current[used-1],0)==0)
    {
      --used;
      mpz_clear(current[used]);
    }

  res = current; // hand the result polynomial over to the caller
  return used;   // size of the result polynomial
}


} // namespace polynomial

#endif /* POLYNOMIAL_cc_ */
