aboutsummaryrefslogtreecommitdiff
path: root/Objects/longobject.c
diff options
context:
space:
mode:
Diffstat (limited to 'Objects/longobject.c')
-rw-r--r--Objects/longobject.c108
1 files changed, 84 insertions, 24 deletions
diff --git a/Objects/longobject.c b/Objects/longobject.c
index 09ae9455c5..b5648fca7d 100644
--- a/Objects/longobject.c
+++ b/Objects/longobject.c
@@ -74,12 +74,34 @@ maybe_small_long(PyLongObject *v)
#define KARATSUBA_CUTOFF 70
#define KARATSUBA_SQUARE_CUTOFF (2 * KARATSUBA_CUTOFF)
-/* For exponentiation, use the binary left-to-right algorithm
- * unless the exponent contains more than FIVEARY_CUTOFF digits.
- * In that case, do 5 bits at a time. The potential drawback is that
- * a table of 2**5 intermediate results is computed.
+/* For exponentiation, use the binary left-to-right algorithm unless the
+ ^ exponent contains more than HUGE_EXP_CUTOFF bits. In that case, do
+ * (no more than) EXP_WINDOW_SIZE bits at a time. The potential drawback is
+ * that a table of 2**(EXP_WINDOW_SIZE - 1) intermediate results is
+ * precomputed.
*/
-#define FIVEARY_CUTOFF 8
+#define EXP_WINDOW_SIZE 5
+#define EXP_TABLE_LEN (1 << (EXP_WINDOW_SIZE - 1))
+/* Suppose the exponent has bit length e. All ways of doing this
+ * need e squarings. The binary method also needs a multiply for
+ * each bit set. In a k-ary method with window width w, a multiply
+ * for each non-zero window, so at worst (and likely!)
+ * ceiling(e/w). The k-ary sliding window method has the same
+ * worst case, but the window slides so it can sometimes skip
+ * over an all-zero window that the fixed-window method can't
+ * exploit. In addition, the windowing methods need multiplies
+ * to precompute a table of small powers.
+ *
+ * For the sliding window method with width 5, 16 precomputation
+ * multiplies are needed. Assuming about half the exponent bits
+ * are set, then, the binary method needs about e/2 extra mults
+ * and the window method about 16 + e/5.
+ *
+ * The latter is smaller for e > 53 1/3. We don't have direct
+ * access to the bit length, though, so call it 60, which is a
+ * multiple of a long digit's max bit length (15 or 30 so far).
+ */
+#define HUGE_EXP_CUTOFF 60
#define SIGCHECK(PyTryBlock) \
do { \
@@ -4172,14 +4194,15 @@ long_pow(PyObject *v, PyObject *w, PyObject *x)
int negativeOutput = 0; /* if x<0 return negative output */
PyLongObject *z = NULL; /* accumulated result */
- Py_ssize_t i, j, k; /* counters */
+ Py_ssize_t i, j; /* counters */
PyLongObject *temp = NULL;
+ PyLongObject *a2 = NULL; /* may temporarily hold a**2 % c */
- /* 5-ary values. If the exponent is large enough, table is
- * precomputed so that table[i] == a**i % c for i in range(32).
+ /* k-ary values. If the exponent is large enough, table is
+ * precomputed so that table[i] == a**(2*i+1) % c for i in
+ * range(EXP_TABLE_LEN).
*/
- PyLongObject *table[32] = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
+ PyLongObject *table[EXP_TABLE_LEN] = {0};
/* a, b, c = v, w, x */
CHECK_BINOP(v, w);
@@ -4332,7 +4355,7 @@ long_pow(PyObject *v, PyObject *w, PyObject *x)
}
/* else bi is 0, and z==1 is correct */
}
- else if (i <= FIVEARY_CUTOFF) {
+ else if (i <= HUGE_EXP_CUTOFF / PyLong_SHIFT ) {
/* Left-to-right binary exponentiation (HAC Algorithm 14.79) */
/* http://www.cacr.math.uwaterloo.ca/hac/about/chap14.pdf */
@@ -4366,23 +4389,59 @@ long_pow(PyObject *v, PyObject *w, PyObject *x)
}
}
else {
- /* Left-to-right 5-ary exponentiation (HAC Algorithm 14.82) */
- Py_INCREF(z); /* still holds 1L */
- table[0] = z;
- for (i = 1; i < 32; ++i)
- MULT(table[i-1], a, table[i]);
+ /* Left-to-right k-ary sliding window exponentiation
+ * (Handbook of Applied Cryptography (HAC) Algorithm 14.85)
+ */
+ Py_INCREF(a);
+ table[0] = a;
+ MULT(a, a, a2);
+ /* table[i] == a**(2*i + 1) % c */
+ for (i = 1; i < EXP_TABLE_LEN; ++i)
+ MULT(table[i-1], a2, table[i]);
+ Py_CLEAR(a2);
+
+ /* Repeatedly extract the next (no more than) EXP_WINDOW_SIZE bits
+ * into `pending`, starting with the next 1 bit. The current bit
+ * length of `pending` is `blen`.
+ */
+ int pending = 0, blen = 0;
+#define ABSORB_PENDING do { \
+ int ntz = 0; /* number of trailing zeroes in `pending` */ \
+ assert(pending && blen); \
+ assert(pending >> (blen - 1)); \
+ assert(pending >> blen == 0); \
+ while ((pending & 1) == 0) { \
+ ++ntz; \
+ pending >>= 1; \
+ } \
+ assert(ntz < blen); \
+ blen -= ntz; \
+ do { \
+ MULT(z, z, z); \
+ } while (--blen); \
+ MULT(z, table[pending >> 1], z); \
+ while (ntz-- > 0) \
+ MULT(z, z, z); \
+ assert(blen == 0); \
+ pending = 0; \
+ } while(0)
for (i = Py_SIZE(b) - 1; i >= 0; --i) {
const digit bi = b->ob_digit[i];
-
- for (j = PyLong_SHIFT - 5; j >= 0; j -= 5) {
- const int index = (bi >> j) & 0x1f;
- for (k = 0; k < 5; ++k)
+ for (j = PyLong_SHIFT - 1; j >= 0; --j) {
+ const int bit = (bi >> j) & 1;
+ pending = (pending << 1) | bit;
+ if (pending) {
+ ++blen;
+ if (blen == EXP_WINDOW_SIZE)
+ ABSORB_PENDING;
+ }
+ else /* absorb strings of 0 bits */
MULT(z, z, z);
- if (index)
- MULT(z, table[index], z);
}
}
+ if (pending)
+ ABSORB_PENDING;
}
if (negativeOutput && (Py_SIZE(z) != 0)) {
@@ -4399,13 +4458,14 @@ long_pow(PyObject *v, PyObject *w, PyObject *x)
Py_CLEAR(z);
/* fall through */
Done:
- if (Py_SIZE(b) > FIVEARY_CUTOFF) {
- for (i = 0; i < 32; ++i)
+ if (Py_SIZE(b) > HUGE_EXP_CUTOFF / PyLong_SHIFT) {
+ for (i = 0; i < EXP_TABLE_LEN; ++i)
Py_XDECREF(table[i]);
}
Py_DECREF(a);
Py_DECREF(b);
Py_XDECREF(c);
+ Py_XDECREF(a2);
Py_XDECREF(temp);
return (PyObject *)z;
}