lib/builtins/i386/umoddi3.S


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130

// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#include "../assembly.h"

// du_int __umoddi3(du_int a, du_int b);

// result = remainder of a / b.
// both inputs and the output are 64-bit unsigned integers.
// This will do whatever the underlying hardware is set to do on division by zero.
// No other exceptions are generated, as the divide cannot overflow.
//
// This is targeted at 32-bit x86 *only*, as this can be done directly in hardware
// on x86_64.  The performance goal is ~40 cycles per divide, which is faster than
// currently possible via simulation of integer divides on the x87 unit.
//

// Stephen Canon, December 2008

#ifdef __i386__

.text
.balign 4
DEFINE_COMPILERRT_FUNCTION(__umoddi3)

	pushl		%ebx
	movl	 20(%esp),			%ebx	// Find the index i of the leading bit in b.
	bsrl		%ebx,			%ecx	// If the high word of b is zero, jump to
	jz			9f						// the code to handle that special case [9].
	
	/* High word of b is known to be non-zero on this branch */
	
	movl	 16(%esp),			%eax	// Construct bhi, containing bits [1+i:32+i] of b
	
	shrl		%cl,			%eax	// Practically, this means that bhi is given by:
	shrl		%eax					//
	notl		%ecx					//		bhi = (high word of b) << (31 - i) |
	shll		%cl,			%ebx	//			  (low word of b) >> (1 + i)
	orl			%eax,			%ebx	//
	movl	 12(%esp),			%edx	// Load the high and low words of a, and jump
	movl	  8(%esp),			%eax	// to [2] if the high word is larger than bhi
	cmpl		%ebx,			%edx	// to avoid overflowing the upcoming divide.
	jae			2f						
		
	/* High word of a is greater than or equal to (b >> (1 + i)) on this branch */
	
	divl		%ebx					// eax <-- qs, edx <-- r such that ahi:alo = bs*qs + r

	pushl		%edi
	notl		%ecx
	shrl		%eax
	shrl		%cl,			%eax	// q = qs >> (1 + i)
	movl		%eax,			%edi
	mull	 20(%esp)					// q*blo
	movl	 12(%esp),			%ebx
	movl	 16(%esp),			%ecx	// ECX:EBX = a
	subl		%eax,			%ebx
	sbbl		%edx,			%ecx	// ECX:EBX = a - q*blo
	movl	 24(%esp),			%eax
	imull		%edi,			%eax	// q*bhi
	subl		%eax,			%ecx	// ECX:EBX = a - q*b
	
	jnc			1f						// if positive, this is the result.
	addl	 20(%esp),			%ebx	// otherwise
	adcl	 24(%esp),			%ecx	// ECX:EBX = a - (q-1)*b = result
1:	movl		%ebx,			%eax
	movl		%ecx,			%edx
	
	popl		%edi
	popl		%ebx
	retl


2:	/* High word of a is greater than or equal to (b >> (1 + i)) on this branch */
	 
	subl		%ebx,			%edx	// subtract bhi from ahi so that divide will not
	divl		%ebx					// overflow, and find q and r such that
										//
										//		ahi:alo = (1:q)*bhi + r
										//
										// Note that q is a number in (31-i).(1+i)
										// fix point.

	pushl		%edi
	notl		%ecx
	shrl		%eax
	orl			$0x80000000,	%eax
	shrl		%cl,			%eax	// q = (1:qs) >> (1 + i)
	movl		%eax,			%edi
	mull	 20(%esp)					// q*blo
	movl	 12(%esp),			%ebx
	movl	 16(%esp),			%ecx	// ECX:EBX = a
	subl		%eax,			%ebx
	sbbl		%edx,			%ecx	// ECX:EBX = a - q*blo
	movl	 24(%esp),			%eax
	imull		%edi,			%eax	// q*bhi
	subl		%eax,			%ecx	// ECX:EBX = a - q*b

	jnc			3f						// if positive, this is the result.
	addl	 20(%esp),			%ebx	// otherwise
	adcl	 24(%esp),			%ecx	// ECX:EBX = a - (q-1)*b = result
3:	movl		%ebx,			%eax
	movl		%ecx,			%edx
	
	popl		%edi
	popl		%ebx
	retl


9:	/* High word of b is zero on this branch */

	movl	 12(%esp),			%eax	// Find qhi and rhi such that
	movl	 16(%esp),			%ecx	//
	xorl		%edx,			%edx	//		ahi = qhi*b + rhi	with	0 ≤ rhi < b
	divl		%ecx					//
	movl		%eax,			%ebx	//
	movl	  8(%esp),			%eax	// Find rlo such that
	divl		%ecx					//
	movl		%edx,			%eax	//		rhi:alo = qlo*b + rlo  with 0 ≤ rlo < b
	popl		%ebx					//
	xorl		%edx,			%edx	// and return 0:rlo
	retl								// 
END_COMPILERRT_FUNCTION(__umoddi3)

#endif // __i386__

NO_EXEC_STACK_DIRECTIVE