diff options
Diffstat (limited to 'src/x86/win64.S')
-rw-r--r-- | src/x86/win64.S | 737 |
1 files changed, 227 insertions, 510 deletions
diff --git a/src/x86/win64.S b/src/x86/win64.S index 687f97c4..2c334c82 100644 --- a/src/x86/win64.S +++ b/src/x86/win64.S @@ -1,520 +1,237 @@ +#ifdef __x86_64__ #define LIBFFI_ASM #include <fficonfig.h> #include <ffi.h> +#include <ffi_cfi.h> +#include "asmnames.h" -/* Constants for ffi_call_win64 */ -#define STACK 0 -#define PREP_ARGS_FN 32 -#define ECIF 40 -#define CIF_BYTES 48 -#define CIF_FLAGS 56 -#define RVALUE 64 -#define FN 72 - -/* ffi_call_win64 (void (*prep_args_fn)(char *, extended_cif *), - extended_cif *ecif, unsigned bytes, unsigned flags, - unsigned *rvalue, void (*fn)()); - */ - -#ifdef _MSC_VER -PUBLIC ffi_call_win64 - -EXTRN __chkstk:NEAR -EXTRN ffi_closure_win64_inner:NEAR - -_TEXT SEGMENT - -;;; ffi_closure_win64 will be called with these registers set: -;;; rax points to 'closure' -;;; r11 contains a bit mask that specifies which of the -;;; first four parameters are float or double -;;; -;;; It must move the parameters passed in registers to their stack location, -;;; call ffi_closure_win64_inner for the actual work, then return the result. -;;; -ffi_closure_win64 PROC FRAME - ;; copy register arguments onto stack - test r11, 1 - jne first_is_float - mov QWORD PTR [rsp+8], rcx - jmp second -first_is_float: - movlpd QWORD PTR [rsp+8], xmm0 - -second: - test r11, 2 - jne second_is_float - mov QWORD PTR [rsp+16], rdx - jmp third -second_is_float: - movlpd QWORD PTR [rsp+16], xmm1 - -third: - test r11, 4 - jne third_is_float - mov QWORD PTR [rsp+24], r8 - jmp fourth -third_is_float: - movlpd QWORD PTR [rsp+24], xmm2 - -fourth: - test r11, 8 - jne fourth_is_float - mov QWORD PTR [rsp+32], r9 - jmp done -fourth_is_float: - movlpd QWORD PTR [rsp+32], xmm3 - -done: - .ALLOCSTACK 40 - sub rsp, 40 - .ENDPROLOG - mov rcx, rax ; context is first parameter - mov rdx, rsp ; stack is second parameter - add rdx, 48 ; point to start of arguments - mov rax, ffi_closure_win64_inner - call rax ; call the real closure function - add rsp, 40 - movd xmm0, rax ; If the closure returned a float, - ; ffi_closure_win64_inner wrote it to rax - ret 0 -ffi_closure_win64 ENDP - -ffi_call_win64 PROC FRAME - ;; copy registers onto stack - mov QWORD PTR [rsp+32], r9 - mov QWORD PTR [rsp+24], r8 - mov QWORD PTR [rsp+16], rdx - mov QWORD PTR [rsp+8], rcx - .PUSHREG rbp - push rbp - .ALLOCSTACK 48 - sub rsp, 48 ; 00000030H - .SETFRAME rbp, 32 - lea rbp, QWORD PTR [rsp+32] - .ENDPROLOG - - mov eax, DWORD PTR CIF_BYTES[rbp] - add rax, 15 - and rax, -16 - call __chkstk - sub rsp, rax - lea rax, QWORD PTR [rsp+32] - mov QWORD PTR STACK[rbp], rax - - mov rdx, QWORD PTR ECIF[rbp] - mov rcx, QWORD PTR STACK[rbp] - call QWORD PTR PREP_ARGS_FN[rbp] - - mov rsp, QWORD PTR STACK[rbp] - - movlpd xmm3, QWORD PTR [rsp+24] - movd r9, xmm3 - - movlpd xmm2, QWORD PTR [rsp+16] - movd r8, xmm2 - - movlpd xmm1, QWORD PTR [rsp+8] - movd rdx, xmm1 - - movlpd xmm0, QWORD PTR [rsp] - movd rcx, xmm0 - - call QWORD PTR FN[rbp] -ret_struct4b$: - cmp DWORD PTR CIF_FLAGS[rbp], FFI_TYPE_SMALL_STRUCT_4B - jne ret_struct2b$ - - mov rcx, QWORD PTR RVALUE[rbp] - mov DWORD PTR [rcx], eax - jmp ret_void$ - -ret_struct2b$: - cmp DWORD PTR CIF_FLAGS[rbp], FFI_TYPE_SMALL_STRUCT_2B - jne ret_struct1b$ - - mov rcx, QWORD PTR RVALUE[rbp] - mov WORD PTR [rcx], ax - jmp ret_void$ - -ret_struct1b$: - cmp DWORD PTR CIF_FLAGS[rbp], FFI_TYPE_SMALL_STRUCT_1B - jne ret_uint8$ - - mov rcx, QWORD PTR RVALUE[rbp] - mov BYTE PTR [rcx], al - jmp ret_void$ - -ret_uint8$: - cmp DWORD PTR CIF_FLAGS[rbp], FFI_TYPE_UINT8 - jne ret_sint8$ - - mov rcx, QWORD PTR RVALUE[rbp] - movzx rax, al - mov QWORD PTR [rcx], rax - jmp ret_void$ - -ret_sint8$: - cmp DWORD PTR CIF_FLAGS[rbp], FFI_TYPE_SINT8 - jne ret_uint16$ - - mov rcx, QWORD PTR RVALUE[rbp] - movsx rax, al - mov QWORD PTR [rcx], rax - jmp ret_void$ - -ret_uint16$: - cmp DWORD PTR CIF_FLAGS[rbp], FFI_TYPE_UINT16 - jne ret_sint16$ - - mov rcx, QWORD PTR RVALUE[rbp] - movzx rax, ax - mov QWORD PTR [rcx], rax - jmp SHORT ret_void$ - -ret_sint16$: - cmp DWORD PTR CIF_FLAGS[rbp], FFI_TYPE_SINT16 - jne ret_uint32$ - - mov rcx, QWORD PTR RVALUE[rbp] - movsx rax, ax - mov QWORD PTR [rcx], rax - jmp SHORT ret_void$ - -ret_uint32$: - cmp DWORD PTR CIF_FLAGS[rbp], FFI_TYPE_UINT32 - jne ret_sint32$ - - mov rcx, QWORD PTR RVALUE[rbp] - mov eax, eax - mov QWORD PTR [rcx], rax - jmp SHORT ret_void$ - -ret_sint32$: - cmp DWORD PTR CIF_FLAGS[rbp], FFI_TYPE_SINT32 - jne ret_float$ - - mov rcx, QWORD PTR RVALUE[rbp] - cdqe - mov QWORD PTR [rcx], rax - jmp SHORT ret_void$ - -ret_float$: - cmp DWORD PTR CIF_FLAGS[rbp], FFI_TYPE_FLOAT - jne SHORT ret_double$ - - mov rax, QWORD PTR RVALUE[rbp] - movss DWORD PTR [rax], xmm0 - jmp SHORT ret_void$ - -ret_double$: - cmp DWORD PTR CIF_FLAGS[rbp], FFI_TYPE_DOUBLE - jne SHORT ret_uint64$ - - mov rax, QWORD PTR RVALUE[rbp] - movlpd QWORD PTR [rax], xmm0 - jmp SHORT ret_void$ - -ret_uint64$: - cmp DWORD PTR CIF_FLAGS[rbp], FFI_TYPE_UINT64 - jne SHORT ret_sint64$ - - mov rcx, QWORD PTR RVALUE[rbp] - mov QWORD PTR [rcx], rax - jmp SHORT ret_void$ - -ret_sint64$: - cmp DWORD PTR CIF_FLAGS[rbp], FFI_TYPE_SINT64 - jne SHORT ret_pointer$ - - mov rcx, QWORD PTR RVALUE[rbp] - mov QWORD PTR [rcx], rax - jmp SHORT ret_void$ - -ret_pointer$: - cmp DWORD PTR CIF_FLAGS[rbp], FFI_TYPE_POINTER - jne SHORT ret_int$ - - mov rcx, QWORD PTR RVALUE[rbp] - mov QWORD PTR [rcx], rax - jmp SHORT ret_void$ - -ret_int$: - cmp DWORD PTR CIF_FLAGS[rbp], FFI_TYPE_INT - jne SHORT ret_void$ - - mov rcx, QWORD PTR RVALUE[rbp] - cdqe - mov QWORD PTR [rcx], rax - jmp SHORT ret_void$ - -ret_void$: - xor rax, rax - - lea rsp, QWORD PTR [rbp+16] - pop rbp - ret 0 -ffi_call_win64 ENDP -_TEXT ENDS -END +#if defined(HAVE_AS_CFI_PSEUDO_OP) + .cfi_sections .debug_frame +#endif +#ifdef X86_WIN64 +#define SEH(...) __VA_ARGS__ +#define arg0 %rcx +#define arg1 %rdx +#define arg2 %r8 +#define arg3 %r9 #else +#define SEH(...) +#define arg0 %rdi +#define arg1 %rsi +#define arg2 %rdx +#define arg3 %rcx +#endif -#ifdef SYMBOL_UNDERSCORE -#define SYMBOL_NAME(name) _##name +/* This macro allows the safe creation of jump tables without an + actual table. The entry points into the table are all 8 bytes. + The use of ORG asserts that we're at the correct location. */ +/* ??? The clang assembler doesn't handle .org with symbolic expressions. */ +#if defined(__clang__) || defined(__APPLE__) || (defined (__sun__) && defined(__svr4__)) +# define E(BASE, X) .balign 8 #else -#define SYMBOL_NAME(name) name +# define E(BASE, X) .balign 8; .org BASE + X * 8 #endif -.text - -.extern SYMBOL_NAME(ffi_closure_win64_inner) - -# ffi_closure_win64 will be called with these registers set: -# rax points to 'closure' -# r11 contains a bit mask that specifies which of the -# first four parameters are float or double -# -# It must move the parameters passed in registers to their stack location, -# call ffi_closure_win64_inner for the actual work, then return the result. -# - .balign 16 - .globl SYMBOL_NAME(ffi_closure_win64) - .seh_proc SYMBOL_NAME(ffi_closure_win64) -SYMBOL_NAME(ffi_closure_win64): - # copy register arguments onto stack - test $1,%r11 - jne .Lfirst_is_float - mov %rcx, 8(%rsp) - jmp .Lsecond -.Lfirst_is_float: - movlpd %xmm0, 8(%rsp) - -.Lsecond: - test $2, %r11 - jne .Lsecond_is_float - mov %rdx, 16(%rsp) - jmp .Lthird -.Lsecond_is_float: - movlpd %xmm1, 16(%rsp) - -.Lthird: - test $4, %r11 - jne .Lthird_is_float - mov %r8,24(%rsp) - jmp .Lfourth -.Lthird_is_float: - movlpd %xmm2, 24(%rsp) - -.Lfourth: - test $8, %r11 - jne .Lfourth_is_float - mov %r9, 32(%rsp) - jmp .Ldone -.Lfourth_is_float: - movlpd %xmm3, 32(%rsp) - -.Ldone: - .seh_stackalloc 40 - sub $40, %rsp - .seh_endprologue - mov %rax, %rcx # context is first parameter - mov %rsp, %rdx # stack is second parameter - add $48, %rdx # point to start of arguments - leaq SYMBOL_NAME(ffi_closure_win64_inner)(%rip), %rax - callq *%rax # call the real closure function - add $40, %rsp - movq %rax, %xmm0 # If the closure returned a float, - # ffi_closure_win64_inner wrote it to rax - retq - .seh_endproc - - .balign 16 - .globl SYMBOL_NAME(ffi_call_win64) - .seh_proc SYMBOL_NAME(ffi_call_win64) -SYMBOL_NAME(ffi_call_win64): - # copy registers onto stack - mov %r9,32(%rsp) - mov %r8,24(%rsp) - mov %rdx,16(%rsp) - mov %rcx,8(%rsp) - .seh_pushreg rbp - push %rbp - .seh_stackalloc 48 - sub $48,%rsp - .seh_setframe rbp, 32 - lea 32(%rsp),%rbp - .seh_endprologue - - mov CIF_BYTES(%rbp),%eax - add $15, %rax - and $-16, %rax - cmpq $0x1000, %rax - jb Lch_done -Lch_probe: - subq $0x1000,%rsp - orl $0x0, (%rsp) - subq $0x1000,%rax - cmpq $0x1000,%rax - ja Lch_probe -Lch_done: - subq %rax, %rsp - orl $0x0, (%rsp) - lea 32(%rsp), %rax - mov %rax, STACK(%rbp) - - mov ECIF(%rbp), %rdx - mov STACK(%rbp), %rcx - callq *PREP_ARGS_FN(%rbp) - - mov STACK(%rbp), %rsp - - movlpd 24(%rsp), %xmm3 - movd %xmm3, %r9 - - movlpd 16(%rsp), %xmm2 - movd %xmm2, %r8 - - movlpd 8(%rsp), %xmm1 - movd %xmm1, %rdx - - movlpd (%rsp), %xmm0 - movd %xmm0, %rcx - - callq *FN(%rbp) -.Lret_struct4b: - cmpl $FFI_TYPE_SMALL_STRUCT_4B, CIF_FLAGS(%rbp) - jne .Lret_struct2b - - mov RVALUE(%rbp), %rcx - mov %eax, (%rcx) - jmp .Lret_void - -.Lret_struct2b: - cmpl $FFI_TYPE_SMALL_STRUCT_2B, CIF_FLAGS(%rbp) - jne .Lret_struct1b - - mov RVALUE(%rbp), %rcx - mov %ax, (%rcx) - jmp .Lret_void - -.Lret_struct1b: - cmpl $FFI_TYPE_SMALL_STRUCT_1B, CIF_FLAGS(%rbp) - jne .Lret_uint8 - - mov RVALUE(%rbp), %rcx - mov %al, (%rcx) - jmp .Lret_void - -.Lret_uint8: - cmpl $FFI_TYPE_UINT8, CIF_FLAGS(%rbp) - jne .Lret_sint8 - - mov RVALUE(%rbp), %rcx - movzbq %al, %rax - movq %rax, (%rcx) - jmp .Lret_void - -.Lret_sint8: - cmpl $FFI_TYPE_SINT8, CIF_FLAGS(%rbp) - jne .Lret_uint16 - - mov RVALUE(%rbp), %rcx - movsbq %al, %rax - movq %rax, (%rcx) - jmp .Lret_void - -.Lret_uint16: - cmpl $FFI_TYPE_UINT16, CIF_FLAGS(%rbp) - jne .Lret_sint16 - - mov RVALUE(%rbp), %rcx - movzwq %ax, %rax - movq %rax, (%rcx) - jmp .Lret_void - -.Lret_sint16: - cmpl $FFI_TYPE_SINT16, CIF_FLAGS(%rbp) - jne .Lret_uint32 - - mov RVALUE(%rbp), %rcx - movswq %ax, %rax - movq %rax, (%rcx) - jmp .Lret_void - -.Lret_uint32: - cmpl $FFI_TYPE_UINT32, CIF_FLAGS(%rbp) - jne .Lret_sint32 - - mov RVALUE(%rbp), %rcx - movl %eax, %eax - movq %rax, (%rcx) - jmp .Lret_void - -.Lret_sint32: - cmpl $FFI_TYPE_SINT32, CIF_FLAGS(%rbp) - jne .Lret_float - - mov RVALUE(%rbp), %rcx - cltq - movq %rax, (%rcx) - jmp .Lret_void - -.Lret_float: - cmpl $FFI_TYPE_FLOAT, CIF_FLAGS(%rbp) - jne .Lret_double - - mov RVALUE(%rbp), %rax - movss %xmm0, (%rax) - jmp .Lret_void - -.Lret_double: - cmpl $FFI_TYPE_DOUBLE, CIF_FLAGS(%rbp) - jne .Lret_uint64 - - mov RVALUE(%rbp), %rax - movlpd %xmm0, (%rax) - jmp .Lret_void - -.Lret_uint64: - cmpl $FFI_TYPE_UINT64, CIF_FLAGS(%rbp) - jne .Lret_sint64 - - mov RVALUE(%rbp), %rcx - mov %rax, (%rcx) - jmp .Lret_void - -.Lret_sint64: - cmpl $FFI_TYPE_SINT64, CIF_FLAGS(%rbp) - jne .Lret_pointer - - mov RVALUE(%rbp), %rcx - mov %rax, (%rcx) - jmp .Lret_void - -.Lret_pointer: - cmpl $FFI_TYPE_POINTER, CIF_FLAGS(%rbp) - jne .Lret_int - - mov RVALUE(%rbp), %rcx - mov %rax, (%rcx) - jmp .Lret_void - -.Lret_int: - cmpl $FFI_TYPE_INT, CIF_FLAGS(%rbp) - jne .Lret_void - - mov RVALUE(%rbp), %rcx - cltq - movq %rax, (%rcx) - jmp .Lret_void - -.Lret_void: - xor %rax, %rax - - lea 16(%rbp), %rsp - pop %rbp - retq - .seh_endproc -#endif /* !_MSC_VER */ - + .text + +/* ffi_call_win64 (void *stack, struct win64_call_frame *frame, void *r10) + + Bit o trickiness here -- FRAME is the base of the stack frame + for this function. This has been allocated by ffi_call. We also + deallocate some of the stack that has been alloca'd. */ + + .align 8 + .globl C(ffi_call_win64) + FFI_HIDDEN(C(ffi_call_win64)) + + SEH(.seh_proc ffi_call_win64) +C(ffi_call_win64): + cfi_startproc + /* Set up the local stack frame and install it in rbp/rsp. */ + movq (%rsp), %rax + movq %rbp, (arg1) + movq %rax, 8(arg1) + movq arg1, %rbp + cfi_def_cfa(%rbp, 16) + cfi_rel_offset(%rbp, 0) + SEH(.seh_pushreg %rbp) + SEH(.seh_setframe %rbp, 0) + SEH(.seh_endprologue) + movq arg0, %rsp + + movq arg2, %r10 + + /* Load all slots into both general and xmm registers. */ + movq (%rsp), %rcx + movsd (%rsp), %xmm0 + movq 8(%rsp), %rdx + movsd 8(%rsp), %xmm1 + movq 16(%rsp), %r8 + movsd 16(%rsp), %xmm2 + movq 24(%rsp), %r9 + movsd 24(%rsp), %xmm3 + + call *16(%rbp) + + movl 24(%rbp), %ecx + movq 32(%rbp), %r8 + leaq 0f(%rip), %r10 + cmpl $FFI_TYPE_SMALL_STRUCT_4B, %ecx + leaq (%r10, %rcx, 8), %r10 + ja 99f + jmp *%r10 + +/* Below, we're space constrained most of the time. Thus we eschew the + modern "mov, pop, ret" sequence (5 bytes) for "leave, ret" (2 bytes). */ +.macro epilogue + leaveq + cfi_remember_state + cfi_def_cfa(%rsp, 8) + cfi_restore(%rbp) + ret + cfi_restore_state +.endm + + .align 8 +0: +E(0b, FFI_TYPE_VOID) + epilogue +E(0b, FFI_TYPE_INT) + movslq %eax, %rax + movq %rax, (%r8) + epilogue +E(0b, FFI_TYPE_FLOAT) + movss %xmm0, (%r8) + epilogue +E(0b, FFI_TYPE_DOUBLE) + movsd %xmm0, (%r8) + epilogue +E(0b, FFI_TYPE_LONGDOUBLE) + call PLT(C(abort)) +E(0b, FFI_TYPE_UINT8) + movzbl %al, %eax + movq %rax, (%r8) + epilogue +E(0b, FFI_TYPE_SINT8) + movsbq %al, %rax + jmp 98f +E(0b, FFI_TYPE_UINT16) + movzwl %ax, %eax + movq %rax, (%r8) + epilogue +E(0b, FFI_TYPE_SINT16) + movswq %ax, %rax + jmp 98f +E(0b, FFI_TYPE_UINT32) + movl %eax, %eax + movq %rax, (%r8) + epilogue +E(0b, FFI_TYPE_SINT32) + movslq %eax, %rax + movq %rax, (%r8) + epilogue +E(0b, FFI_TYPE_UINT64) +98: movq %rax, (%r8) + epilogue +E(0b, FFI_TYPE_SINT64) + movq %rax, (%r8) + epilogue +E(0b, FFI_TYPE_STRUCT) + epilogue +E(0b, FFI_TYPE_POINTER) + movq %rax, (%r8) + epilogue +E(0b, FFI_TYPE_COMPLEX) + call PLT(C(abort)) +E(0b, FFI_TYPE_SMALL_STRUCT_1B) + movb %al, (%r8) + epilogue +E(0b, FFI_TYPE_SMALL_STRUCT_2B) + movw %ax, (%r8) + epilogue +E(0b, FFI_TYPE_SMALL_STRUCT_4B) + movl %eax, (%r8) + epilogue + + .align 8 +99: call PLT(C(abort)) + + epilogue + + cfi_endproc + SEH(.seh_endproc) + + +/* 32 bytes of outgoing register stack space, 8 bytes of alignment, + 16 bytes of result, 32 bytes of xmm registers. */ +#define ffi_clo_FS (32+8+16+32) +#define ffi_clo_OFF_R (32+8) +#define ffi_clo_OFF_X (32+8+16) + + .align 8 + .globl C(ffi_go_closure_win64) + FFI_HIDDEN(C(ffi_go_closure_win64)) + + SEH(.seh_proc ffi_go_closure_win64) +C(ffi_go_closure_win64): + cfi_startproc + /* Save all integer arguments into the incoming reg stack space. */ + movq %rcx, 8(%rsp) + movq %rdx, 16(%rsp) + movq %r8, 24(%rsp) + movq %r9, 32(%rsp) + + movq 8(%r10), %rcx /* load cif */ + movq 16(%r10), %rdx /* load fun */ + movq %r10, %r8 /* closure is user_data */ + jmp 0f + cfi_endproc + SEH(.seh_endproc) + + .align 8 + .globl C(ffi_closure_win64) + FFI_HIDDEN(C(ffi_closure_win64)) + + SEH(.seh_proc ffi_closure_win64) +C(ffi_closure_win64): + cfi_startproc + /* Save all integer arguments into the incoming reg stack space. */ + movq %rcx, 8(%rsp) + movq %rdx, 16(%rsp) + movq %r8, 24(%rsp) + movq %r9, 32(%rsp) + + movq FFI_TRAMPOLINE_SIZE(%r10), %rcx /* load cif */ + movq FFI_TRAMPOLINE_SIZE+8(%r10), %rdx /* load fun */ + movq FFI_TRAMPOLINE_SIZE+16(%r10), %r8 /* load user_data */ +0: + subq $ffi_clo_FS, %rsp + cfi_adjust_cfa_offset(ffi_clo_FS) + SEH(.seh_stackalloc ffi_clo_FS) + SEH(.seh_endprologue) + + /* Save all sse arguments into the stack frame. */ + movsd %xmm0, ffi_clo_OFF_X(%rsp) + movsd %xmm1, ffi_clo_OFF_X+8(%rsp) + movsd %xmm2, ffi_clo_OFF_X+16(%rsp) + movsd %xmm3, ffi_clo_OFF_X+24(%rsp) + + leaq ffi_clo_OFF_R(%rsp), %r9 + call PLT(C(ffi_closure_win64_inner)) + + /* Load the result into both possible result registers. */ + movq ffi_clo_OFF_R(%rsp), %rax + movsd ffi_clo_OFF_R(%rsp), %xmm0 + + addq $ffi_clo_FS, %rsp + cfi_adjust_cfa_offset(-ffi_clo_FS) + ret + + cfi_endproc + SEH(.seh_endproc) +#endif /* __x86_64__ */ + +#if defined __ELF__ && defined __linux__ + .section .note.GNU-stack,"",@progbits +#endif |