// Include guard. The macro tested here must be the very macro defined on the
// next line; otherwise a second inclusion is not suppressed and the classes
// below are defined twice.
#ifndef FAKEHEAP_HEADER_
#define FAKEHEAP_HEADER_

/*! @file
 * @brief
 * Contains a fake priority queue implementation designed to speed up DLP-MPQS.
 */


#include <algorithm>
#include <iostream>
#include <vector>

/*
 The standard vector is extended by the typical priority-queue functions.
 It is not really a priority queue, however; the interface merely serves to
 replace slower, real priority queues with this fake in certain places of
 the program, provided that certain constraints are met:

 Access is permitted only blockwise. Between push operations on the one hand
 and top/pop operations on the other hand, "sort" has to be called.

 Without FakeHeap, all priority queues that use the STL interface can be
 employed (e.g. Fibonacci heaps). But now it is also possible to employ
 implementations that insert and output unsorted (and perform the sorting
 only on explicit request). This provides greater scope for efficiency
 comparisons.

 Real priority-queue implementations can of course use this interface as
 well and simply ignore the sort() call.
*/


//#define FAKEHEAP_DEBUG

/*!
 *  This template class provides an interface much like a priority queue,
 *  and it is intended to be used in this manner but with some restrictions
 *  and extensions.
 *
 *  Implementation details are hidden to provide an abstract template class
 *  for various fast implementations.
 *
 *  - Use push() to push elements to the FakeHeap.
 *  - Use top() to (read-only) access the top element of the FakeHeap
 *  - Use pop() to pop elements from the FakeHeap.
 *  - You must use sort() between blockwise usage of push() and pop()
 *    to guarantee that the elements are accessed by their priority.
 *  - Use clear() to clear the FakeHeap.
 *
 *  @remark Of course you could use a simple vector for these operations, too,
 *          but then the interface would be different; and FakeHeap was designed
 *          to replace priority queues in a manner that you can easily switch
 *          between these abstract data types.
 *          (The only reason for the existence of FakeHeap is that it produces
 *          a practical speedup compared to priority queues.)
 */
template <class T> class FakeHeap : private std::vector<T>
{
 // Storage is the privately inherited std::vector<T>. Because that base
 // class depends on T, its members are not found by unqualified lookup;
 // every access below is therefore written as std::vector<T>::member or
 // this->member.
private:

#ifdef FAKEHEAP_DEBUG
 // Debug-only phase flag: false while elements are being pushed, true once
 // the heap has been sorted or read. It is mutable because top() is const
 // but still records that reading has started.
 mutable bool push_locked;
#endif

public:

#ifdef FAKEHEAP_DEBUG
 // Debug build only: announce construction and start in the "push" phase.
 inline FakeHeap() : std::vector<T>(), push_locked(false) { std::cout << "FakeHeap:: constructor..." << std::endl; }
 inline ~FakeHeap() { }
#endif

 // Removes all elements. In debug builds this also re-opens the push phase.
#ifdef FAKEHEAP_DEBUG
 inline void clear()
  {
    std::cout << "FakeHeap:: CLEAR..." << std::endl;
    std::vector<T>::clear();
    push_locked=false;
  }
#else
 inline void clear() { std::vector<T>::clear(); }
#endif


 // Returns true if no elements are stored.
 inline bool empty() const { return std::vector<T>::empty(); }

 // Sorts the stored elements ascending by operator<, so that the greatest
 // element ends up at the back, which is where top() and pop() operate.
 // Must be called after a block of push() calls and before top()/pop().
 inline void sort(void)
  {
    //std::cout << "FakeHeap: sorting " << std::vector<T>::size() << " elements... " << std::flush;
    std::sort(this->begin(),this->end());
    //std::cout << "done." << std::endl;
#ifdef FAKEHEAP_DEBUG
    push_locked=true;
#endif
  }

 // Appends x without restoring any order; call sort() before reading.
 // In debug builds a push during the read phase is reported, and the
 // elements are re-sorted immediately so that later reads stay ordered.
 inline void push(const T &x)
  {
#ifdef FAKEHEAP_DEBUG
    if (push_locked)
     {
       std::cerr << "FakeHeap:: Hey?!!! push is locked!!!" << std::endl;
       std::vector<T>::push_back(x);
       sort();
       return;
     }
#endif
    std::vector<T>::push_back(x);
  }

 // Removes the last element (the greatest one if sort() was called).
 // The heap must not be empty. Debug builds report a pop() that was not
 // preceded by sort().
 inline void pop(void)
  {
#ifdef FAKEHEAP_DEBUG
    if (!push_locked)
     {
       std::cerr << "FakeHeap:: Hey?!!! pop is locked!!!" << std::endl;
       //exit(1);
     }
    push_locked=true;
#endif
    std::vector<T>::pop_back();
  }

 // Returns a read-only reference to the last element (the greatest one if
 // sort() was called). The heap must not be empty. Debug builds report a
 // top() that was not preceded by sort().
 inline const T& top(void) const
  {
#ifdef FAKEHEAP_DEBUG
    if (!push_locked)
     {
       std::cerr << "FakeHeap:: Hey?!!! top is locked!!!" << std::endl;
       //exit(1);
     }
    push_locked=true;
#endif
    return std::vector<T>::back();
  }
};


#if 1 && (defined (ASM_MMX) || defined(ASM_ATHLON) || defined(ASM_SSE))
 #ifdef DEBUG
  #warning "new experimental FAKEHEAP specialization enabled"
 #endif
// specialization of the fakeheap using optimized
// median-of-three quicksort with insertion sort near the leaves.

template<> class FakeHeap<TSieve_Delta>
{
private:
 static const int MORESIZE = 0x10000;
 TSieve_Delta* p;
 int size, capacity;

 inline void swap_p (const int i, const int j)
  {
    asm volatile( \
     "movq (%[p],%[i],8),%%mm0 \n\t" \
     "movq (%[p],%[j],8),%%mm1 \n\t" \
     "movq %%mm0,(%[p],%[j],8) \n\t" \
     "movq %%mm1,(%[p],%[i],8)" \
     :
     : [p] "r" (p), [i] "r" (i), [j] "r" (j)
     : "memory", "mm0", "mm1");
  }

 void quicksort(register int l, register int r)
  {
    // we use iteration instead of recursion, therefore we need a stack:
    struct { int L,R; } stack[256];
    int stackpointer = 0;

    // okay, let's start the iteration:
  iteration_loop:
    if (r>l)
     {
       if (r-l<32)
        {
          // insertion sort (when near the leaves)
#if 0
          for (int i=l+1; i<=r; ++i)
           {
             TSieve_Delta v=p[i];
             int j=i;
             while (j>l && p[j-1].delta<v.delta) { p[j]=p[j-1]; --j; }
             p[j]=v;
           }
#elif defined(ASM_ATHLON)
          asm volatile( \
           "movl %[l],%%edx \n\t" \
           "jmp 2f \n\t" \
           ".balign 16 \n\t" \
           "3: movl %%edx,%%ecx \n\t " \
           "decl %%ecx \n\t" \
           "movq (%[p],%%edx,8),%%mm1 \n\t" \
           "movl (%[p],%%edx,8),%%eax \n\t" \
           "0: cmpl (%[p],%%ecx,8),%%eax \n\t" \
           "jle 1f \n\t" \
           "movq (%[p],%%ecx,8),%%mm0 \n\t" \
           "movq %%mm0,8(%[p],%%ecx,8) \n\t" \
           "decl %%ecx \n\t" \
           "jns 0b \n\t" \
           "1: movq %%mm1,8(%[p],%%ecx,8) \n\t" \
           "2: incl %%edx \n\t" \
           "cmpl %[r],%%edx \n\t" \
           "jle 3b" \
           :
           : [p] "r" (p), [l] "g" (l), [r] "g" (r)
           : "cc", "memory", "eax", "ecx", "edx", "mm0", "mm1");
#else  /* defined(ASM_MMX) || defined(ASM_SSE) and not an Athlon */
          asm volatile( \
           "movl %[l],%%edx \n\t" \
           "jmp 2f \n\t" \
           "3: movl %%edx,%%ecx \n\t " \
           "sub $1,%%ecx \n\t" \
           "movq (%[p],%%edx,8),%%mm1 \n\t" \
           "movl (%[p],%%edx,8),%%eax \n\t" \
           "0: cmpl (%[p],%%ecx,8),%%eax \n\t" \
           "jle 1f \n\t" \
           "movq (%[p],%%ecx,8),%%mm0 \n\t" \
           "movq %%mm0,8(%[p],%%ecx,8) \n\t" \
           "sub $1,%%ecx \n\t" \
           "jns 0b \n\t" \
           "1: movq %%mm1,8(%[p],%%ecx,8) \n\t" \
           "2: add $1,%%edx \n\t" \
           "cmpl %[r],%%edx \n\t" \
           "jle 3b" \
           :
           : [p] "r" (p), [l] "g" (l), [r] "g" (r)
           : "cc", "memory", "eax", "ecx", "edx", "mm0", "mm1");
#endif
        }
       else
        {
          // quicksort with median of three
          //std::cout << l << " " << r << ":" << std::endl;
          register int i = (l+r)>>1;
          if (p[l].delta<p[r].delta) swap_p(l,r);
          if (p[l].delta<p[i].delta) swap_p(l,i);
          if (p[r].delta<p[i].delta) swap_p(r,i);
#if 0
          // quicksort in C++
          const int pivot=p[r].delta;
          i=l;
          int j=r;
          while (true)
           {
             do ++i; while (p[i].delta>pivot);
             do --j; while (p[j].delta<pivot);
             if (i>=j) break;
             swap_p(i,j);
           }
          swap_p(i,r);
#elif defined(ASM_ATHLON)  /* and ATHLON-XP! */
          // pivot element in %%eax (key) + %%mm2 (all data)
          asm( \
           "movq (%[p],%[r],8),%%mm2 \n\t" \
           "movl (%[p],%[r],8),%%eax \n\t" \
           "movl %[l],%[i] \n\t " \
           "movl %[r],%%ecx \n\t" \
           "0: incl %[i] \n\t"
           "cmpl (%[p],%[i],8),%%eax \n\t" \
           "jl 0b \n\t" \
           "prefetchw 192(%[p],%[i],8) \n\t" \
           "1: decl %%ecx \n\t"
           "cmpl (%[p],%%ecx,8),%%eax \n\t" \
           "jg 1b \n\t" \
           "cmpl %%ecx,%[i] \n\t" \
           "movq (%[p],%[i],8),%%mm0 \n\t" \
           "jge 2f \n\t" \
           "movq (%[p],%%ecx,8),%%mm1 \n\t" \
           "movq %%mm0,(%[p],%%ecx,8) \n\t" \
           "movq %%mm1,(%[p],%[i],8) \n\t" \
           "prefetchw -192(%[p],%%ecx,8) \n\t" \
           "jmp 0b \n\t" \
           "2: movq %%mm2,(%[p],%[i],8) \n\t" \
           "movq %%mm0,(%[p],%[r],8)" \
           : [i] "=&r" (i) // important: early clobbered as it may conflict with l!
           : [p] "r" (p), [l] "r" (l), [r] "r" (r)
           : "cc", "memory", "eax", "ecx", "mm0", "mm1", "mm2");
#else  /* defined(ASM_MMX) || defined(ASM_SSE) and not an Athlon */
          // pivot element in %%eax (key) + %%mm2 (all data)
          asm( \
           "lea (%[p],%[r],8),%%ecx \n\t" \
           "lea (%[p],%[l],8),%[i] \n\t " \
           "movl (%%ecx),%%eax \n\t" \
           "movq (%%ecx),%%mm2 \n\t" \
           "0: add $8,%[i] \n\t"
           "cmpl (%[i]),%%eax \n\t" \
           "jl 0b \n\t" \
           "movq (%[i]),%%mm0 \n\t" \
           "1: sub $8,%%ecx \n\t"
           "cmpl (%%ecx),%%eax \n\t" \
           "jg 1b \n\t" \
           "cmpl %%ecx,%[i] \n\t" \
           "jge 2f \n\t" \
           "movq (%%ecx),%%mm1 \n\t" \
           "movq %%mm0,(%%ecx) \n\t" \
           "movq %%mm1,(%[i]) \n\t" \
           "jmp 0b \n\t" \
           "2: movq %%mm2,(%[i]) \n\t" \
           "subl %[p],%[i] \n\t" \
           "shrl $3,%[i] \n\t" \
           "movq %%mm0,(%[p],%[r],8)" \
           : [i] "=&r" (i) // important: early clobbered as it may conflict with l!
           : [p] "r" (p), [l] "r" (l), [r] "r" (r)
           : "cc", "memory", "eax", "ecx", "mm0", "mm1", "mm2");
#endif
          //std::cout << l << " " << i << " " << r << std::endl;

          // to save stack space, we always push the larger part on stack
          if (r-i<i-l)
           {
             stack[stackpointer].L=l; stack[stackpointer].R=i-1; l=i+1;
           }
          else
           {
             stack[stackpointer].L=i+1; stack[stackpointer].R=r; r=i-1;
           }
          ++stackpointer;
          goto iteration_loop; // and iterate the smaller part
        }
     }
    if (stackpointer)
     {
       --stackpointer;
       l=stack[stackpointer].L; r=stack[stackpointer].R;
       goto iteration_loop;
     }
  }

public:

 inline FakeHeap() : p(new TSieve_Delta[MORESIZE]), size(0), capacity(MORESIZE) { }
 inline ~FakeHeap()
  {
    delete [] p;
#ifdef DEBUG
    MARK; cout << "cleared " << capacity*8/1024 << " KB." << endl;
#endif
  }

 inline void clear() { size=0; }
 inline bool empty() const { return size==0; }

 inline void sort(void)
  {
    //std::cout << "FakeHeap<TSieve_Delta>: sorting " << size << " elements... " << std::flush;
    if (size>1)
     {
       quicksort(0,size-1);
       asm volatile ("emms");
     }
    //std::cout << "done." << std::endl;
#if 0 || defined(DEBUG)
    for (int i=1; i<size; ++i) if (p[i-1].delta<p[i].delta) { std::cout << "not sorted " << i << ": " << p[i-1].delta << " " << p[i].delta << std::endl; MARK; exit(1); }
#endif
    //for (int i=0; i<size; ++i) std::cout << p[i].delta << " ";
    //std::cout << std::endl;
  }
 inline void push(const TSieve_Delta &x)
  {
    if (size>=capacity)
     {
       TSieve_Delta* p_old = p;
       capacity+=MORESIZE;
       p = new TSieve_Delta[capacity];
       for (int i=0; i<size; ++i) p[i]=p_old[i];
       delete [] p_old;
     }
    p[size]=x; ++size;
    __builtin_prefetch (&p[size+24], 1, 1);
  }
 inline void pop(void)
  {
    --size;
  }
 inline const TSieve_Delta& top(void) const
  {
    __builtin_prefetch (&p[size-16], 0, 0);
    return p[size-1];
  } 
};

#endif

#endif
