aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMiss Islington (bot) <31488909+miss-islington@users.noreply.github.com>2024-05-06 11:10:05 +0200
committerGitHub <noreply@github.com>2024-05-06 12:10:05 +0300
commita4812fd8f70640d8e4b83b8fcbb6224c9f5e0cf2 (patch)
treed9d6857e554fa07b6a3fbf24108210c6b4f4ac33
parenta81fe2a8f82d98c05c18908958efb757d35384b4 (diff)
downloadcpython3-upstream-3.12.tar.gz
[3.12] gh-118164: Break a loop between _pydecimal and _pylong and optimize int to str conversion (GH-118483) (GH-118590)upstream-3.12
For converting large ints to strings, CPython invokes a function in _pylong.py, which uses the decimal module to implement an asymptotically waaaaay sub-quadratic algorithm. But if the C decimal module isn't available, CPython uses _pydecimal.py instead. Which in turn frequently does str(int). If the int is very large, _pylong ends up doing the work, which in turn asks decimal to do "big" arithmetic, which in turn calls str(big_int), which in turn ... it can become infinite mutual recursion. This change introduces a different int->str function that doesn't use decimal. It's asymptotically worse, "Karatsuba time" instead of quadratic time, so still a huge improvement. _pylong switches to that when the C decimal isn't available. It is also used for not too large integers (less than 450_000 bits), where it is faster (up to 2 times for 30_000 bits) than the asymptotically better implementation that uses the C decimal. (cherry picked from commit 711c80bfca5dd17cb7c6ec26f0e44848b33aec04) Co-authored-by: Serhiy Storchaka <storchaka@gmail.com> Co-authored-by: Tim Peters <tim.peters@gmail.com>
-rw-r--r--Lib/_pylong.py46
-rw-r--r--Lib/test/test_int.py31
-rw-r--r--Misc/NEWS.d/next/Core and Builtins/2024-05-02-15-57-07.gh-issue-118164.AF6kwI.rst4
3 files changed, 70 insertions, 11 deletions
diff --git a/Lib/_pylong.py b/Lib/_pylong.py
index 936346e187..30bee6fc9e 100644
--- a/Lib/_pylong.py
+++ b/Lib/_pylong.py
@@ -14,6 +14,10 @@ maximum performance, they should use something like gmpy2."""
import re
import decimal
+try:
+ import _decimal
+except ImportError:
+ _decimal = None
def int_to_decimal(n):
@@ -82,7 +86,47 @@ def int_to_decimal(n):
def int_to_decimal_string(n):
"""Asymptotically fast conversion of an 'int' to a decimal string."""
- return str(int_to_decimal(n))
+ w = n.bit_length()
+ if w > 450_000 and _decimal is not None:
+ # It is only usable with the C decimal implementation.
+ # _pydecimal.py calls str() on very large integers, which in its
+ # turn calls int_to_decimal_string(), causing very deep recursion.
+ return str(int_to_decimal(n))
+
+ # Fallback algorithm for the case when the C decimal module isn't
+ # available. This algorithm is asymptotically worse than the algorithm
+ # using the decimal module, but better than the quadratic time
+ # implementation in longobject.c.
+ def inner(n, w):
+ if w <= 1000:
+ return str(n)
+ w2 = w >> 1
+ d = pow10_cache.get(w2)
+ if d is None:
+ d = pow10_cache[w2] = 5**w2 << w2 # 10**i = (5*2)**i = 5**i * 2**i
+ hi, lo = divmod(n, d)
+ return inner(hi, w - w2) + inner(lo, w2).zfill(w2)
+
+ # The estimation of the number of decimal digits.
+ # There is no harm in small error. If we guess too large, there may
+ # be leading 0's that need to be stripped. If we guess too small, we
+ # may need to call str() recursively for the remaining highest digits,
+ # which can still potentially be a large integer. This is manifested
+ # only if the number has way more than 10**15 digits, that exceeds
+ # the 52-bit physical address limit in both Intel64 and AMD64.
+ w = int(w * 0.3010299956639812 + 1) # log10(2)
+ pow10_cache = {}
+ if n < 0:
+ n = -n
+ sign = '-'
+ else:
+ sign = ''
+ s = inner(n, w)
+ if s[0] == '0' and n:
+ # If our guess of w is too large, there may be leading 0's that
+ # need to be stripped.
+ s = s.lstrip('0')
+ return sign + s
def _str_to_int_inner(s):
diff --git a/Lib/test/test_int.py b/Lib/test/test_int.py
index 5545ee39d8..affebffa53 100644
--- a/Lib/test/test_int.py
+++ b/Lib/test/test_int.py
@@ -834,17 +834,28 @@ class PyLongModuleTests(unittest.TestCase):
sys.set_int_max_str_digits(self._previous_limit)
super().tearDown()
- def test_pylong_int_to_decimal(self):
- n = (1 << 100_000) - 1
- suffix = '9883109375'
+ def _test_pylong_int_to_decimal(self, n, suffix):
s = str(n)
- assert s[-10:] == suffix
- s = str(-n)
- assert s[-10:] == suffix
- s = '%d' % n
- assert s[-10:] == suffix
- s = b'%d' % n
- assert s[-10:] == suffix.encode('ascii')
+ self.assertEqual(s[-10:], suffix)
+ s2 = str(-n)
+ self.assertEqual(s2, '-' + s)
+ s3 = '%d' % n
+ self.assertEqual(s3, s)
+ s4 = b'%d' % n
+ self.assertEqual(s4, s.encode('ascii'))
+
+ def test_pylong_int_to_decimal(self):
+ self._test_pylong_int_to_decimal((1 << 100_000), '9883109376')
+ self._test_pylong_int_to_decimal((1 << 100_000) - 1, '9883109375')
+ self._test_pylong_int_to_decimal(10**30_000, '0000000000')
+ self._test_pylong_int_to_decimal(10**30_000 - 1, '9999999999')
+ self._test_pylong_int_to_decimal(3**60_000, '9313200001')
+
+ @support.requires_resource('cpu')
+ def test_pylong_int_to_decimal_2(self):
+ self._test_pylong_int_to_decimal(2**1_000_000, '2747109376')
+ self._test_pylong_int_to_decimal(10**300_000, '0000000000')
+ self._test_pylong_int_to_decimal(3**600_000, '3132000001')
def test_pylong_int_divmod(self):
n = (1 << 100_000)
diff --git a/Misc/NEWS.d/next/Core and Builtins/2024-05-02-15-57-07.gh-issue-118164.AF6kwI.rst b/Misc/NEWS.d/next/Core and Builtins/2024-05-02-15-57-07.gh-issue-118164.AF6kwI.rst
new file mode 100644
index 0000000000..5eb3b6f500
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2024-05-02-15-57-07.gh-issue-118164.AF6kwI.rst
@@ -0,0 +1,4 @@
+Break a loop between the Python implementation of the :mod:`decimal` module
+and the Python code for integer to string conversion. Also optimize integer
+to string conversion for values in the range from 9_000 to 135_000 decimal
+digits.