/*! @file
 * @brief
 * floating point implementation for computing the modular inverse
 */


/*--------------------------------------------------------------------
This file is placed in the public domain by its author,
Jason Papadopoulos. You may use it for any purpose, free of charge,
without having to notify anyone. I disclaim any responsibility for any
errors.

Optionally, please be nice and tell me if you find this source to be
useful. Again optionally, if you add to the functionality present here
please consider making those additions public too, so that others may 
benefit from your work.	
       				   --jasonp@boo.net
--------------------------------------------------------------------*/


/*-------------------------------------------------------------------------
This file was modified by Thorsten Reinecke to be compatible with Qsieve.
It can be used as replacement for the other invmod functions in modulo.cc.

For generic support of 23bit invmod, this version is a very fast
alternative. (For Athlon computers, the crafted cmov assembler version of
montgom_invmod is faster than this one.) Many thanks to the author for
providing me this file, so that I was able to do performance tests.

Except for this file (and except otherwise noted), all the other files in
the Qsieve distribution are still copyrighted under the GPL!
                                   thre@thorstenreinecke.de
-------------------------------------------------------------------------*/


/*--------------------------------------------------------------------*/

	/* When the factor base is really big, the MPQS
	   initialization stage takes a big fraction of the
	   runtime; timings show that ~90% of the time in
	   the initialization stage is spent computing modular
	   inverses. To that end, the following implements a
	   custom modular inverse that avoids integer division.
	   It's somehwat obscure, but far less obscure than
	   imeplementing the self-initializing variant of MPQS.

	   This is *much* faster than using mp_expo_1, because
	   it uses floating point and several tricks for fast
	   rounding. */

const double ieee_round = 6755399441055744.0;
const double intel_round = 6755399441055744.0 * 2048.0;
const double round_test = 2.7;
const double round_correct = 3.0;
volatile double round_constant[2];


void init_fast_invmod(void) {
 
 	/* Unfortunately, the fast rounding requires figuring
	   out the size of a floating point register. The choices
	   are the 53-bit IEEE size and the 64-bit Intel size;
	   however, the OS may force the Intel size down to the
	   IEEE size for compatibility. The following performs
	   a runtime check to see which register size is in use,
	   and selects the magic rounding constant accordingly. */

	double round_me;
 
	round_constant[0] = intel_round;
	round_constant[1] = intel_round;
	round_me = round_test;
	round_me += round_constant[0];
	round_me -= round_constant[1];
	
	
	printf("initializing jason's fast_invmod\n");
	if (round_me != round_correct) {
		round_constant[0] = ieee_round;
		round_constant[1] = ieee_round;
		round_me = round_test;
		round_me += round_constant[0];
		round_me -= round_constant[1];
		if( round_me != round_correct ) {
			printf("error: can't initialize fast rounding\n");
			exit(-1);
		}
	}

	/* Check that the compiler did not incorrectly 
	   optimize our custom modular inverse routine */

	if (fast_invmod(6, 17) != 3) {
		printf("error: modular inverse does not work\n");
		exit(-1);
	}
}

unsigned int fast_invmod(unsigned int a, unsigned int p) {

	/* Compute mp_expo_1(a, p-2, p) using floating point.
	   'a' and 'p' must be less than ~25 bits in size if
	   using IEEE fast rounding, but can go up to ~31 bits
	   if Intel fast rounding is in use. Both of these
	   are far larger than MPQS will ever need.

	   The code avoids an integer remainder by multiplying
	   by the floating point reciprocal and (fast-)rounding to
	   the nearest integral floating point value. Within
	   the while() loop, the accumulated result is always
	   bounded by +-p, and is normalized afterwards. 
	   
	   The exponent is scanned right to left; at each step the
	   modular squaring and multiply are both performed, even
	   if the multiply result will be thrown away */
	   
	unsigned int exponent = p - 2;
	int result;
	double dp = (double)(p);
	double dsquare = (double)(a);
	double recip = 1.0 / dp;
	double round0 = round_constant[0];
	double round1 = round_constant[1];
	double dresult = 1.0;

	while (exponent) {
		double dsquare2 = dsquare * dsquare;
		double dresult2 = dresult * dsquare;

		double drsquare = dsquare2 * recip + round0 - round1;
		double drresult = dresult2 * recip + round0 - round1;

		dsquare2 -= dp * drsquare;
		dresult2 -= dp * drresult;

		dsquare = dsquare2;
		if (exponent & 1)
			dresult = dresult2;

		exponent >>= 1;
	}

	result = (int)dresult;

	if (result < 0)
		return (unsigned int)(result + p);
	else
		return (unsigned int)result;
}
