/* Toom-3.5, the "slightly" and the "very" unbalanced flavour.

   Contributed to the GNU project by Marco Bodrato.

   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH A MUTABLE
   INTERFACE. IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED
   INTERFACES. IN FACT, IT IS ALMOST GUARANTEED THAT THEY WILL CHANGE
   OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

   Copyright 2009 Free Software Foundation, Inc.

   This file is part of the GNU MP Library.

   The GNU MP Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 3 of the
   License, or (at your option) any later version.

   The GNU MP Library is distributed in the hope that it will be
   useful, but WITHOUT ANY WARRANTY; without even the implied warranty
   of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with the GNU MP Library.  If not, see
   http://www.gnu.org/licenses/.
 */

/*
  mpn_toom43_mul -- Multiply {ap,an} and {bp,bn} where an is nominally 1.333_
	times as large as bn.  Or more accurately, bn < an < 2bn.
  mpn_toom52_mul -- Multiply {ap,an} and {bp,bn} where an is nominally 2.5
	times as large as bn. Or more accurately, 2bn < an < 5bn.
  Neither the first nor the second version do handle the case an == 2bn !

  Things to work on:

  1. Move allocation of scratch area from inside to outside of mul_toom.

  2. Decide once and forever if Toom-3.5 will be used also for
     near-borderline cases, id est n>=w0n (=s+t), or very small
     operands (n<7). If not, remove code specific to those cases.
*/

#include "gmp.h"
#include "gmp-impl.h"

enum toom3_5_flags {toom3_5_all_pos = 0, toom3_5_vm1_neg = 1, toom3_5_vm2_neg = 2};

static inline int
mpn_zero_p (mp_srcptr ap, mp_size_t n)
{
  mp_size_t i;
  for (i = n - 1; i >= 0; i--)
    {
      if (ap[i] != 0)
	return 0;
    }
  return 1;
}

/* Interpolation for Toom-3.5, using the evaluation points: infinity,
   1, -1, 2, -2. More precisely, we want to compute
   f(2^(GMP_NUMB_BITS * n)) for a polynomial f of degree 5, given the
   six values

     w5 = f(0),
     w4 = f(-1),
     w3 = f(1)
     w2 = f(-2),
     w1 = f(2),
     w0 = limit at infinity of f(x) / x^5,

   The result is stored in {pp, 5*n + w0n}. At entry, w5 is stored at
   {pp, 2n}, w3 is stored at {pp + 2n, 2n+1}, and w0 is stored at
   {pp + 5n, w0n}. The other values are 2n + 1 limbs each (with most
   significant limbs small). f(-1) and f(-2) may be negative, signs
   determined by the flag bits. All intermediate results are positive.
   Inputs are destroyed.

   Interpolation sequence was taken from the paper: "Integer and
   Polynomial Multiplication: Towards Optimal Toom-Cook Matrices".
   Some slight variations were introduced: adaptatation to "gmp
   instruction set", and a final saving of an operation by interlacing
   interpolation and recomposition phases.
*/

static void
mpn_toom_interpolate_6pts (mp_ptr pp, mp_size_t n, enum toom3_5_flags flags,
			   mp_ptr w4, mp_ptr w2, mp_ptr w1,
			   mp_size_t w0n)
{
  mp_limb_t cy;
  /* cy6 can be stored in w1[2*n], cy4 in w4[0], embankment in w2[0] */
  mp_limb_t cy4, cy6, embankment;

#define w5  pp					/* 2n   */
#define w3  (pp + 2 * n)			/* 2n+1 */
#define w0  (pp + 5 * n)			/* w0n  */

  /* Interpolate with sequence:
     W2 =(W1 - W2)>>2
     W4 =(W3 - W4)>>1
     W1 =(W1 - W5)>>1
     W1 =(W1 - W2)>>1
     W2 =(W2 - W4)/3 - W0<<2
     W3 = W3 - W4 - W5
     W1 =(W1 - W3)/3
     // Last steps are mixed with recomposition...
     W4 = W4 - W2
     W3 = W3 - W1
     W2 = W2 - W0
  */

  /* W2 =(W1 - W2)>>2 */
  if (flags & toom3_5_vm2_neg)
    mpn_add_n (w2, w1, w2, 2 * n + 1);
  else
    mpn_sub_n (w2, w1, w2, 2 * n + 1);
  mpn_rshift (w2, w2, 2 * n + 1, 2);

  /* W4 =(W3 - W4)>>1 */
  if (flags & toom3_5_vm1_neg)
    {
#if HAVE_NATIVE_mpn_rsh1add_n
      mpn_rsh1add_n (w4, w3, w4, 2 * n + 1);
#else
      mpn_add_n (w4, w3, w4, 2 * n + 1);
      mpn_rshift (w4, w4, 2 * n + 1, 1);
#endif
    }
  else
    {
#if HAVE_NATIVE_mpn_rsh1sub_n
      mpn_rsh1sub_n (w4, w3, w4, 2 * n + 1);
#else
      mpn_sub_n (w4, w3, w4, 2 * n + 1);
      mpn_rshift (w4, w4, 2 * n + 1, 1);
#endif
    }

  /* W1 =(W1 - W5)>>1 */
  w1[2*n] -= mpn_sub_n (w1, w1, w5, 2*n);
  mpn_rshift (w1, w1, 2 * n + 1, 1);

  /* W1 =(W1 - W2)>>1 */
#if HAVE_NATIVE_mpn_rsh1sub_n
  mpn_rsh1sub_n (w1, w1, w2, 2 * n + 1);
#else
  mpn_sub_n (w1, w1, w2, 2 * n + 1);
  mpn_rshift (w1, w1, 2 * n + 1, 1);
#endif

  /* W2 =(W2 - W4)/3 - W0<<2 */
  mpn_sub_n (w2, w2, w4, 2 * n + 1);
  mpn_divexact_by3 (w2, w2, 2 * n + 1);
  cy = mpn_submul_1 (w2, w0, w0n, 4);
  MPN_DECR_U (w2 + w0n, 2 * n + 1 - w0n, cy);

  /* W3 = W3 - W4 - W5 */
  mpn_sub_n (w3, w3, w4, 2 * n + 1);
  w3[2 * n] -= mpn_sub_n (w3, w3, w5, 2 * n);

  /* W1 =(W1 - W3)/3 */
  mpn_sub_n (w1, w1, w3, 2 * n + 1);
  mpn_divexact_by3 (w1, w1, 2 * n + 1);

  /*
    [1 0 0 0 0 0;
     0 1 0 0 0 0;
     1 0 1 0 0 0;
     0 1 0 1 0 0;
     1 0 1 0 1 0;
     0 0 0 0 0 1]

    pp[] prior to operations:
     |_H w0__|_L w0__|______||_H w3__|_L w3__|_H w5__|_L w5__|

    summation scheme for remaining operations:
     |______________5|n_____4|n_____3|n_____2|n______|n______|pp
     |_H w0__|_L w0__|______||_H w3__|_L w3__|_H w5__|_L w5__|
				    || H w4  | L w4  |
		    || H w2  | L w2  |
	    || H w1  | L w1  |
			    ||-H w1  |-L w1  |
		     |-H w0  |-L w0 ||-H w2  |-L w2  |
  */
  cy = mpn_add_n (pp + n, pp + n, w4, 2 * n + 1);
  MPN_INCR_U (pp + 3 * n + 1, n, cy);

  /* W4L = W4L - W2L */
  cy = mpn_sub_n (pp + n, pp + n, w2, n);
  MPN_DECR_U (w3, 2 * n + 1, cy);

  /* W3H = W3H + W2L */
  cy4 = w3[2 * n] + mpn_add_n (pp + 3 * n, pp + 3 * n, w2, n);
  /* W1L + W2H */
  cy = mpn_add_n (pp + 4 * n, w1, w2 + n, n);
  MPN_INCR_U (w1 + n, n + 1, w2[2 * n] + cy);

  /* W0 = W0 + W1H */
  if (LIKELY (w0n > n))
    cy6 = w1[2 * n] + mpn_add_n (w0, w0, w1 + n, n);
  else
    cy6 = mpn_add_n (w0, w0, w1 + n, w0n);

  /*
    summation scheme for the next operation:
     |...____5|n_____4|n_____3|n_____2|n______|n______|pp
     |...w0___|_w1_2__|_H w3__|_L w3__|_H w5__|_L w5__|
		     ...-w0___|-w1_w2 |
  */
  /* if(LIKELY(w0n>n)) the two operands below DO overlap! */
  cy = mpn_sub_n (pp + 2 * n, pp + 2 * n, pp + 4 * n, n + w0n); 

  /* embankment is a "dirty trick" to avoid carry/borrow propagation
     beyond allocated memory */
  embankment = w0[w0n - 1];
  w0[w0n - 1] = 0x80;
  if (LIKELY (w0n > n)) {
    if ( LIKELY(cy4 > cy6) )
      MPN_INCR_U (pp + 4 * n, w0n + n, cy4 - cy6);
    else
      MPN_DECR_U (pp + 4 * n, w0n + n, cy6 - cy4);
    MPN_DECR_U (pp + 3 * n + w0n, 2 * n, cy);
    MPN_INCR_U (w0 + n, w0n - n, cy6);
  } else {
    MPN_INCR_U (pp + 4 * n, w0n + n, cy4);
    MPN_DECR_U (pp + 3 * n + w0n, 2 * n, cy + cy6);
  }
  w0[w0n - 1] += embankment - 0x80;
    
#undef w5
#undef w3
#undef w0

}

/*
  Toom-3.5, the "slightly" unbalanced flavour.
  Evaluate in: 0, +1, -1, +2, -2, +inf

  <-s-><--n--><--n--><--n-->
   ___ ______ ______ ______
  |a3_|___a2_|___a1_|___a0_|
	|_b2_|___b1_|___b0_|
	<-t--><--n--><--n-->

  v0  =  a0             * b0          #   A(0)*B(0)
  v1  = (a0+ a1+ a2+ a3)*(b0+ b1+ b2) #   A(1)*B(1)      ah  <= 3  bh <= 2
  vm1 = (a0- a1+ a2- a3)*(b0- b1+ b2) #  A(-1)*B(-1)    |ah| <= 1 |bh|<= 1
  v2  = (a0+2a1+4a2+8a3)*(b0+2b1+4b2) #   A(2)*B(2)      ah  <= 14 bh <= 6
  vm2 = (a0-2a1+4a2-8a3)*(b0-2b1+4b2) #  A(-2)*B(-2)    |ah| <= 9 |bh|<= 4
  vinf=              a3 *         b2  # A(inf)*B(inf)
*/

void
mpn_toom43_mul (mp_ptr pp,
		mp_srcptr ap, mp_size_t an,
		mp_srcptr bp, mp_size_t bn, mp_ptr scratch)
{
  mp_size_t n, s, t;
  enum toom3_5_flags flags = toom3_5_all_pos;
  mp_limb_t cy;
  mp_ptr as1, as2;
/*   mp_ptr as1, asm1, as2, asm2; */
/*   mp_ptr bs1, bsm1, bs2, bsm2; */
  TMP_DECL;

#define a0  ap
#define a1  (ap + n)
#define a2  (ap + 2 * n)
#define a3  (ap + 3 * n)
#define b0  bp
#define b1  (bp + n)
#define b2  (bp + 2 * n)

  n = 1 + (3 * an >= 4 * bn ? (an - 1) >> 2 : (bn - 1) / (unsigned long) 3);

  s = an - 3 * n;
  t = bn - 2 * n;

  ASSERT (0 < s && s <= n);
  ASSERT (0 < t && t <= n);

  TMP_MARK;

#define v0    pp				/* 2n */
#define vm1   (scratch)				/* 2n+1 */
#define v1    (pp + 2*n)			/* 2n+1 */
#define vm2   (scratch + 2 * n + 1)		/* 2n+1 */
#define v2    (scratch + 4 * n + 2)		/* 2n+1 */
#define vinf  (pp + 5 * n)			/* s+t */
#define bs1    pp				/* n+1 */
#define bsm1  (scratch + 2 * n + 2)		/* n+1 */
#define asm1  (scratch + 3 * n + 3)		/* n+1 */
#define asm2  (scratch + 4 * n + 4)		/* n+1 */
#define bsm2  (pp + n + 1)			/* n+1 */
#define bs2   (pp + 2 * n + 2)			/* n+1 */

 /* Alloc one more byte, because products will overwrite 2n+2
    limbs. If n=1 asm2 reaches scratch+5n+5=scratch+10 and the last
    byte is used anyway. */
  scratch = TMP_SALLOC_LIMBS (6 * n + 3 + 1);

  /* #define as2   (pp + 3 * n + 3)			/\* n+1 *\/ */
  /* n+(n-2)<=n+(s+t), so the next test is always false if n>2.
     Consider replacing it with the above define if threshold is big
     enough.
  */
  if (n+s+t < 4)
    as2 = TMP_SALLOC_LIMBS (n + 1);
  else
    as2 = (pp + 3 * n + 3);

  /* #define as1   (pp + 4 * n + 4)			/\* n+1 *\/ */
  /* (n-2)<=(s+t), so the next test is always false if n>6.
     Consider replacing it with the above define if threshold is big
     enough.
  */
  if (s+t < 5)
    as1 = TMP_SALLOC_LIMBS (n + 1);
  else
    as1 = (pp + 4 * n + 4);

/*   bsm1 = TMP_SALLOC_LIMBS (n + 1); */
/*   bs2 = TMP_SALLOC_LIMBS (n + 1); */
/*   bsm2 = TMP_SALLOC_LIMBS (n + 1); */

#define a0a2  scratch
#define b0b2  scratch
#define a1a3  asm1
#define b1d   bsm1

  /* Compute as2 and asm2.  */
  cy = mpn_lshift (asm2, a3, s, 3);			/* 8a3                */
  a1a3[n] = mpn_lshift (a1a3, a1, n, 1);		/*           2a1      */
  cy += mpn_add_n (a1a3, a1a3, asm2, s);		/* 8a3      +2a1      */
  MPN_INCR_U(a1a3 + s, n - s + 1, cy);
  cy = mpn_lshift (asm2, a2, n, 2);			/*      4a2           */
  a0a2[n] = cy + mpn_add_n (a0a2, a0, asm2, n);		/*      4a2      + a0 */

#if HAVE_NATIVE_mpn_addsub_n
  if (mpn_cmp (a0a2, a1a3, n+1) < 0)
    {
      mpn_addsub_n (as2, asm2, a1a3, a0a2, n+1);
      flags ^= toom3_5_vm2_neg;
    }
  else
    {
      mpn_addsub_n (as2, asm2, a0a2, a1a3, n+1);
    }
#else
  mpn_add_n (as2, a0a2, a1a3, n+1);
  if (mpn_cmp (a0a2, a1a3, n+1) < 0)
    {
      mpn_sub_n (asm2, a1a3, a0a2, n+1);
      flags ^= toom3_5_vm2_neg;
    }
  else
    {
      mpn_sub_n (asm2, a0a2, a1a3, n+1);
    }
#endif

  /* Compute bs2 and bsm2.  */
  b1d[n] = mpn_lshift (b1d, b1, n, 1);			/*       2b1      */
  cy  = mpn_lshift (b0b2, b2, t, 2);			/*  4b2           */
  cy += mpn_add_n (b0b2, b0b2, b0, t);			/*  4b2      + b0 */
  if (t != n)
    cy = mpn_add_1 (b0b2 + t, b0 + t, n - t, cy);
  b0b2[n] = cy;

#if HAVE_NATIVE_mpn_addsub_n
  if (mpn_cmp (b0b2, b1d, n+1) < 0)
    {
      mpn_addsub_n (bs2, bsm2, b1d, b0b2, n+1);
      flags ^= toom3_5_vm2_neg;
    }
  else
    {
      mpn_addsub_n (bs2, bsm2, b0b2, b1d, n+1);
    }
#else
  mpn_add_n (bs2, b0b2, b1d, n+1);
  if (mpn_cmp (b0b2, b1d, n+1) < 0)
    {
      mpn_sub_n (bsm2, b1d, b0b2, n+1);
      flags ^= toom3_5_vm2_neg;
    }
  else
    {
      mpn_sub_n (bsm2, b0b2, b1d, n+1);
    }
#endif

  /* Compute as1 and asm1.  */
  a0a2[n] = mpn_add_n (a0a2, a0, a2, n);
  asm1[n] = mpn_add (asm1, a1, n, a3, s);
#if HAVE_NATIVE_mpn_addsub_n
  if (mpn_cmp (a0a2, asm1, n+1) < 0)
    {
      cy = mpn_addsub_n (as1, asm1, asm1, a0a2, n+1);
      flags ^= toom3_5_vm1_neg;
    }
  else
    {
      cy = mpn_addsub_n (as1, asm1, a0a2, asm1, n+1);
    }
#else
  mpn_add_n (as1, a0a2, asm1, n+1);
  if (mpn_cmp (a0a2, asm1, n+1) < 0)
    {
      mpn_sub_n (asm1, asm1, a0a2, n+1);
      flags ^= toom3_5_vm1_neg;
    }
  else
    {
      mpn_sub_n (asm1, a0a2, asm1, n+1);
    }
#endif

  /* Compute bs1 and bsm1.  */
  bsm1[n] = mpn_add (bsm1, b0, n, b2, t);
#if HAVE_NATIVE_mpn_addsub_n
  if (bsm1[n] == 0 && mpn_cmp (bsm1, b1, n) < 0)
    {
      cy = mpn_addsub_n (bs1, bsm1, b1, bsm1, n);
      bs1[n] = cy >> 1;
      flags ^= toom3_5_vm1_neg;
    }
  else
    {
      cy = mpn_addsub_n (bs1, bsm1, bsm1, b1, n);
      bs1[n] = bsm1[n] + (cy >> 1);
      bsm1[n]-= cy & 1;
    }
#else
  bs1[n] = bsm1[n] + mpn_add_n (bs1, bsm1, b1, n);
  if (bsm1[n] == 0 && mpn_cmp (bsm1, b1, n) < 0)
    {
      mpn_sub_n (bsm1, b1, bsm1, n);
      flags ^= toom3_5_vm1_neg;
    }
  else
    {
      bsm1[n] -= mpn_sub_n (bsm1, bsm1, b1, n);
    }
#endif

  ASSERT (as1[n] <= 3);
  ASSERT (bs1[n] <= 2);
  ASSERT (asm1[n] <= 1);
  ASSERT (bsm1[n] <= 1);
  ASSERT (as2[n] <=14);
  ASSERT (bs2[n] <= 6);
  ASSERT (asm2[n] <= 9);
  ASSERT (bsm2[n] <= 4);

  /* vm1, 2n+1 limbs */
  mpn_mul_n (vm1, asm1, bsm1, n+1);  /* W4 */

  /* vm2, 2n+1 limbs */
  mpn_mul_n (vm2, asm2, bsm2, n+1);  /* W2 */

  /* v2, 2n+1 limbs */
  mpn_mul_n (v2, as2, bs2, n+1);  /* W1 */

  /* v1, 2n+1 limbs */
  mpn_mul_n (v1, as1, bs1, n+1);  /* W3 */

  /* vinf, s+t limbs */   /* W0 */
  if (s > t)  mpn_mul (vinf, a3, s, b2, t);
  else        mpn_mul (vinf, b2, t, a3, s);

  /* v0, 2n limbs */
  mpn_mul_n (v0, ap, bp, n);  /* W5 */

  mpn_toom_interpolate_6pts (pp, n, flags, vm1, vm2, v2, t + s);

#undef v0
#undef vm1
#undef v1
#undef vm2
#undef v2
#undef vinf
#undef bs1
#undef bs2
#undef bsm1
#undef bsm2
#undef asm1
#undef asm2
/* #undef as1 */
/* #undef as2 */
#undef a0a2
#undef b0b2
#undef a1a3
#undef b1d
#undef a0
#undef a1
#undef a2
#undef a3
#undef b0
#undef b1
#undef b2

  TMP_FREE;
}

/*
  Toom-3.5, the "very" unbalanced flavour.
  Evaluate in: 0, +1, -1, +2, -2, +inf

  <-s-><--n--><--n--><--n--><--n-->
   ___ ______ ______ ______ ______
  |a4_|___a3_|___a2_|___a1_|___a0_|
			|b1|___b0_|
			<t-><--n-->

  v0  =  a0                  * b0      #   A(0)*B(0)
  v1  = (a0+ a1+ a2+ a3+  a4)*(b0+ b1) #   A(1)*B(1)      ah  <= 4   bh <= 1
  vm1 = (a0- a1+ a2- a3+  a4)*(b0- b1) #  A(-1)*B(-1)    |ah| <= 2   bh  = 0
  v2  = (a0+2a1+4a2+8a3+16a4)*(b0+2b1) #   A(2)*B(2)      ah  <= 30  bh <= 2
  vm2 = (a0-2a1+4a2-8a3+16a4)*(b0-2b1) #  A(-2)*B(-2)    |ah| <= 20 |bh|<= 1
  vinf=                   a4 *     b1  # A(inf)*B(inf)

  Some slight optimization in evaluation are taken from the paper:
  "Towards Optimal Toom-Cook Multiplication for Univariate and
  Multivariate Polynomials in Characteristic 2 and 0."
*/

void
mpn_toom52_mul (mp_ptr pp,
		mp_srcptr ap, mp_size_t an,
		mp_srcptr bp, mp_size_t bn, mp_ptr scratch)
{
  mp_size_t n, s, t;
  enum toom3_5_flags flags = toom3_5_all_pos;
  mp_limb_t cy;
  mp_ptr as1, as2;
/*   mp_ptr as1, asm1, as2, asm2; */
/*   mp_ptr bs1, bsm1, bs2, bsm2; */
  TMP_DECL;

#define a0  ap
#define a1  (ap + n)
#define a2  (ap + 2 * n)
#define a3  (ap + 3 * n)
#define a4  (ap + 4 * n)
#define b0  bp
#define b1  (bp + n)

  n = 1 + (2 * an >= 5 * bn ? (an - 1) / (unsigned long) 5 : (bn - 1) >> 1);

  s = an - 4 * n;
  t = bn - n;

  ASSERT (0 < s && s <= n);
  ASSERT (0 < t && t <= n);

  TMP_MARK;

#define v0    pp				/* 2n */
#define vm1   (scratch)				/* 2n+1 */
#define v1    (pp + 2*n)			/* 2n+1 */
#define vm2   (scratch + 2 * n + 1)		/* 2n+1 */
#define v2    (scratch + 4 * n + 2)		/* 2n+1 */
#define vinf  (pp + 5 * n)			/* s+t */
#define bs1    pp				/* n+1 */
#define bsm1  (scratch + 2 * n + 2)		/* n   */
#define asm1  (scratch + 3 * n + 3)		/* n+1 */
#define asm2  (scratch + 4 * n + 4)		/* n+1 */
#define bsm2  (pp + n + 1)			/* n+1 */
#define bs2   (pp + 2 * n + 2)			/* n+1 */

 /* Alloc one more byte, because products will overwrite 2n+2
    limbs. If n=1 asm2 reaches scratch+5n+5=scratch+10 and the last
    byte is used anyway. */
  scratch = TMP_SALLOC_LIMBS (6 * n + 3 + 1);

  /* #define as2   (pp + 3 * n + 3)			/\* n+1 *\/ */
  /* n+(n-2)<=n+(s+t), so the next test is always false if n>2.
     Consider replacing it with the above define if threshold is big
     enough.
  */
  if (n+s+t < 4)
    as2 = TMP_SALLOC_LIMBS (n + 1);
  else
    as2 = (pp + 3 * n + 3);

  /* #define as1   (pp + 4 * n + 4)			/\* n+1 *\/ */
  /* (n-2)<=(s+t), so the next test is always false if n>6.
     Consider replacing it with the above define if threshold is big
     enough.
  */
  if (s+t < 5)
    as1 = TMP_SALLOC_LIMBS (n + 1);
  else
    as1 = (pp + 4 * n + 4);

/*   bsm1 = TMP_SALLOC_LIMBS (n + 1); */
/*   bs2 = TMP_SALLOC_LIMBS (n + 1); */
/*   bsm2 = TMP_SALLOC_LIMBS (n + 1); */

#define a0a2  scratch
#define a1a3  asm1

  /* Compute bs1 and bsm1.  */
  if (t == n)
    {
#if HAVE_NATIVE_mpn_addsub_n
      if (mpn_cmp (b0, b1, n) < 0)
	{
	  cy = mpn_addsub_n (bs1, bsm1, b1, b0, n);
	  flags ^= toom3_5_vm1_neg;
	}
      else
	{
	  cy = mpn_addsub_n (bs1, bsm1, b0, b1, n);
	}
      bs1[n] = cy >> 1;
#else
      bs1[n] = mpn_add_n (bs1, b0, b1, n);
      if (mpn_cmp (b0, b1, n) < 0)
	{
	  mpn_sub_n (bsm1, b1, b0, n);
	  flags ^= toom3_5_vm1_neg;
	}
      else
	{
	  mpn_sub_n (bsm1, b0, b1, n);
	}
#endif
    }
  else
    {
      bs1[n] = mpn_add (bs1, b0, n, b1, t);
      if (mpn_zero_p (b0 + t, n - t) && mpn_cmp (b0, b1, t) < 0)
	{
	  mpn_sub_n (bsm1, b1, b0, t);
	  MPN_ZERO (bsm1 + t, n - t);
	  flags ^= toom3_5_vm1_neg;
	}
      else
	{
	  mpn_sub (bsm1, b0, n, b1, t);
	}
    }

  /* Compute bs2 and bsm2, recycling bs1 and bsm1. bs2=bs1+b1; bsm2=bsm1-b1  */
  mpn_add (bs2, bs1, n+1, b1, t);
  if ( flags & toom3_5_vm1_neg )
    {
      bsm2[n] = mpn_add (bsm2, bsm1, n, b1, t);
      flags ^= toom3_5_vm2_neg;
    }
  else
    {
      bsm2[n] = 0;
      if (t == n)
	{
	  if (mpn_cmp (bsm1, b1, n) < 0)
	    {
	      mpn_sub_n (bsm2, b1, bsm1, n);
	      flags ^= toom3_5_vm2_neg;
	    }
	  else
	    {
	      mpn_sub_n (bsm2, bsm1, b1, n);
	    }
	}
      else
	{
	  if (mpn_zero_p (bsm1 + t, n - t) && mpn_cmp (bsm1, b1, t) < 0)
	    {
	      mpn_sub_n (bsm2, b1, bsm1, t);
	      MPN_ZERO (bsm2 + t, n - t);
	      flags ^= toom3_5_vm2_neg;
	    }
	  else
	    {
	      mpn_sub (bsm2, bsm1, n, b1, t);
	    }
	}
    }

  /* Compute as2 and asm2.  */
  cy = mpn_lshift (asm2, a3, n, 3);			/* 8a3                */
  cy += mpn_lshift (a1a3, a1, n, 1);		        /*           2a1      */
  cy += mpn_add_n (a1a3, a1a3, asm2, n);		/* 8a3      +2a1      */
  a1a3[n] = cy;

  cy = mpn_lshift (a0a2, a2, n, 2);			/*           4a2           */
  a0a2[n] = cy + mpn_add_n (a0a2, a0, a0a2, n);		/*           4a2      + a0 */
  cy  = mpn_lshift (asm2, a4, s, 4);			/* 16a4                    */
  cy += mpn_add_n (a0a2, a0a2, asm2, s);		/* 16a4     +4a2      + a0 */
  MPN_INCR_U(a0a2 + s, n - s + 1, cy);

#if HAVE_NATIVE_mpn_addsub_n
  if (mpn_cmp (a0a2, a1a3, n+1) < 0)
    {
      mpn_addsub_n (as2, asm2, a1a3, a0a2, n+1);
      flags ^= toom3_5_vm1_neg;
    }
  else
    {
      mpn_addsub_n (as2, asm2, a0a2, a1a3, n+1);
    }
#else
  mpn_add_n (as2, a0a2, a1a3, n+1);
  if (mpn_cmp (a0a2, a1a3, n+1) < 0)
    {
      mpn_sub_n (asm2, a1a3, a0a2, n+1);
      flags ^= toom3_5_vm2_neg;
    }
  else
    {
      mpn_sub_n (asm2, a0a2, a1a3, n+1);
    }
#endif

  /* Compute as1 and asm1.  */
  cy = mpn_add (a0a2, a2, n, a4, s);
  a0a2[n] = cy + mpn_add_n (a0a2, a0, a0a2, n);
  asm1[n] = mpn_add_n (asm1, a1, a3, n);
#if HAVE_NATIVE_mpn_addsub_n
  if (mpn_cmp (a0a2, asm1, n+1) < 0)
    {
      cy = mpn_addsub_n (as1, asm1, asm1, a0a2, n+1);
      flags ^= toom3_5_vm1_neg;
    }
  else
    {
      cy = mpn_addsub_n (as1, asm1, a0a2, asm1, n+1);
    }
#else
  mpn_add_n (as1, a0a2, asm1, n+1);
  if (mpn_cmp (a0a2, asm1, n+1) < 0)
    {
      mpn_sub_n (asm1, asm1, a0a2, n+1);
      flags ^= toom3_5_vm1_neg;
    }
  else
    {
      mpn_sub_n (asm1, a0a2, asm1, n+1);
    }
#endif

  ASSERT (as1[n] <= 4);
  ASSERT (bs1[n] <= 1);
  ASSERT (asm1[n] <= 2);
/*   ASSERT (bsm1[n] <= 1); */
  ASSERT (as2[n] <=30);
  ASSERT (bs2[n] <= 2);
  ASSERT (asm2[n] <= 20);
  ASSERT (bsm2[n] <= 1);

  /* vm1, 2n+1 limbs */
  mpn_mul (vm1, asm1, n+1, bsm1, n);  /* W4 */

  /* vm2, 2n+1 limbs */
  mpn_mul_n (vm2, asm2, bsm2, n+1);  /* W2 */

  /* v2, 2n+1 limbs */
  mpn_mul_n (v2, as2, bs2, n+1);  /* W1 */

  /* v1, 2n+1 limbs */
  mpn_mul_n (v1, as1, bs1, n+1);  /* W3 */

  /* vinf, s+t limbs */   /* W0 */
  if (s > t)  mpn_mul (vinf, a4, s, b1, t);
  else        mpn_mul (vinf, b1, t, a4, s);

  /* v0, 2n limbs */
  mpn_mul_n (v0, ap, bp, n);  /* W5 */

  mpn_toom_interpolate_6pts (pp, n, flags, vm1, vm2, v2, t + s);

#undef v0
#undef vm1
#undef v1
#undef vm2
#undef v2
#undef vinf
#undef bs1
#undef bs2
#undef bsm1
#undef bsm2
#undef asm1
#undef asm2
/* #undef as1 */
/* #undef as2 */
#undef a0a2
#undef b0b2
#undef a1a3
#undef b1d
#undef a0
#undef a1
#undef a2
#undef a3
#undef b0
#undef b1
#undef b2

  TMP_FREE;
}


#define CONCAT(prefix,M,N,suffix)  prefix ## M ## N ## suffix

#ifdef UNBALANCED
#define M 5
#define N 2
#define mpn_mul_toomMN CONCAT(mpn_toom,5,2,_mul)
#else
#define M 4
#define N 3
#define mpn_mul_toomMN CONCAT(mpn_toom,4,3,_mul)
#endif

#ifdef CHECK
#include <stdlib.h>
#include <stdio.h>

#ifndef SIZE
#define SIZE 10
#endif

void
dumpy (mp_srcptr p, mp_size_t n)
{
  mp_size_t i;
  for (i = n - 1; i >= 0; i--)
    {
      printf ("%0*lx", (int) (2 * sizeof (mp_limb_t)), p[i]);
      printf (" " + (i == 0));
    }
  puts ("");
}

int
main (int argc, char **argv)
{
  mp_size_t n, s, t, an, bn, clearn;
  mp_ptr ap, bp, refp, pp;
  mp_limb_t keep;
  int test;
  int maxn;
  int norandom;
  int err = 0;
  TMP_DECL;
  TMP_MARK;

  an = M * SIZE;
  bn = N * SIZE;
  norandom = 0;

  if (argc >= 2)
    {
      maxn = strtol (argv[1], 0, 0);
      an = M * maxn;
      bn = N * maxn;
      if (argc == 3)
	{
	  an = maxn;
	  bn = strtol (argv[2], 0, 0);
	  norandom = 1;
	}
    }
  else
    return 1;

  ap = TMP_ALLOC_LIMBS (an);
  bp = TMP_ALLOC_LIMBS (bn);
  refp = TMP_ALLOC_LIMBS (an + bn);
  pp = TMP_ALLOC_LIMBS (an + bn + 1);

  for (test = 0;; test++)
    {
      if (err == 0 && test % 0x10000 == 0)
	{
	  printf ("\r%d", test);  fflush (stdout);
	}
      if (! norandom)
	{
	  n = random () % maxn + 1;
	  s = random () % n + 1;
#if M == N
	  t = random () % s + 1;
#else
	  t = random () % n + 1;
#endif
	  an = (M - 1) * n + s;
	  bn = (N - 1) * n + t;
	}
      mpn_random2 (ap, an);
      clearn = random () % (an + 1);
      MPN_ZERO (ap + clearn, an - clearn);

      mpn_random2 (bp, bn);
      clearn = random () % (bn + 1);
      MPN_ZERO (bp + clearn, bn - clearn);

      mpn_random2 (pp, an + bn + 1);
      keep = pp[an + bn];

      mpn_mul_toomMN (pp, ap, an, bp, bn, NULL);
      mpn_mul (refp, ap, an, bp, bn);
      if (pp[an + bn] != keep || mpn_cmp (refp, pp, an + bn) != 0)
	{
	  printf ("ERROR in test %d\n", test);
	  if (pp[an + bn] != keep)
	    {
	      printf ("pp high:"); dumpy (pp + an + bn, 1);
	      printf ("keep:   "); dumpy (&keep, 1);
	    }
	  dumpy (ap, an);
	  dumpy (bp, bn);
	  dumpy (pp, an + bn);
	  dumpy (refp, an + bn);
	  if (++err > 5)
	    abort();
	}
    }
  TMP_FREE;
}
#endif

#ifdef TIMING
#include <stdlib.h>
#include <stdio.h>

#include "timing.h"

#ifndef SIZE
#define SIZE 10
#endif

int
main (int argc, char **argv)
{
  mp_size_t an, bn;
  mp_ptr ap, bp, refp, pp;
  double t;
  TMP_DECL;
  TMP_MARK;

  if (argc >= 2)
    {
      an = bn = strtol (argv[1], 0, 0);
      if (argc == 3)
	bn = strtol (argv[2], 0, 0);
    }
  else
    return 1;

  if (!(an > bn && an < 3 * bn))
    {
      fprintf (stderr, "Invalid value combination of an,bn\n");
      return 1;
    }

  ap = TMP_ALLOC_LIMBS (an);
  bp = TMP_ALLOC_LIMBS (bn);
  refp = TMP_ALLOC_LIMBS (an + bn);
  pp = TMP_ALLOC_LIMBS (an + bn);

  mpn_random (ap, an);
  mpn_random (bp, bn);
  TIME (t, mpn_mul_toomMN (pp, ap, an, bp, bn, NULL));
  printf ("mpn_mul_toom%d%d:   %f\n", M, N, t);
  TIME (t, mpn_mul (refp, ap, an, bp, bn));
  printf ("mpn_mul:          %f\n", t);
  TIME (t, mpn_mul_basecase (refp, ap, an, bp, bn));
  printf ("mpn_mul_basecase: %f\n", t);
  TMP_FREE;

  return 0;
}
#endif