simd/jquantf-sse2-64.asm


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157

;
; jquantf.asm - sample data conversion and quantization (64-bit SSE & SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler); it
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------
; Everything below is assembled as 64-bit code into the section named by
; SEG_TEXT.  (SEG_TEXT is not defined in this file; presumably it comes from
; jsimdext.inc -- confirm.)
        SECTION SEG_TEXT
        BITS    64
;
; jsimd_convsamp_float_sse2
;
; Copy a block of samples into the workspace as floats, converting each
; sample from unsigned to signed on the way (0x80 is subtracted from every
; byte).  DCTSIZE rows are processed, two per loop iteration, and eight
; consecutive samples are taken from each row starting at start_col.
;
; C prototype:
;   GLOBAL(void)
;   jsimd_convsamp_float_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col,
;                              FAST_FLOAT *workspace);
;
; Arguments as seen after collect_args:
;   r10 = JSAMPARRAY sample_data
;   r11 = JDIMENSION start_col
;   r12 = FAST_FLOAT *workspace
;
; Register roles inside the loop:
;   rsi  = cursor into the array of row pointers
;   rax  = start_col (zero-extended from its 32-bit value)
;   rdi  = workspace write cursor
;   rcx  = row pairs still to do
;   rbx, rdx = the two row pointers of the current pair
;   xmm7 = 0x80 replicated into every byte
;
; The workspace is written with movaps, which requires 16-byte-aligned
; addresses.
;

        align   16
        global  EXTN(jsimd_convsamp_float_sse2)

EXTN(jsimd_convsamp_float_sse2):
        push    rbp
        mov     rax, rsp                ; NOTE(review): collect_args is defined
                                        ; outside this file; rax presumably has
                                        ; to hold the entry rsp for it -- confirm
        mov     rbp, rsp
        collect_args
        push    rbx                     ; rbx is overwritten by the loop below

        ; Synthesise the centring constant in registers only:
        ; all-ones words -> 0xFF80 words -> saturating pack -> 0x80 bytes.
        pcmpeqw  xmm7, xmm7
        psllw    xmm7, 7
        packsswb xmm7, xmm7

        mov     rcx, DCTSIZE/2          ; two rows are handled per iteration
        mov     rdi, r12                ; rdi = workspace
        mov     eax, r11d               ; rax = start_col, upper half cleared
        mov     rsi, r10                ; rsi = sample_data

.rowpair:
        ; Fetch the two row pointers and eight samples from each row.
        mov     rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]
        mov     rdx, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]
        movq    xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]
        movq    xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]

        ; --- first row of the pair: samples 0..7 ---
        ; Each byte is re-centred, then duplicated upward until it sits in the
        ; top byte of a dword; the arithmetic right shift then sign-extends it.
        ; The low words that punpcklwd takes from xmm2 are stale, but the
        ; shift discards them.
        psubb     xmm0, xmm7                    ; signed bytes 0..7
        punpcklbw xmm0, xmm0                    ; each byte doubled into a word
        punpcklwd xmm2, xmm0                    ; samples 0..3 in high words
        punpckhwd xmm0, xmm0                    ; samples 4..7 in high words
        psrad     xmm2, (DWORD_BIT-BYTE_BIT)    ; xmm2 = int32 samples 0..3
        psrad     xmm0, (DWORD_BIT-BYTE_BIT)    ; xmm0 = int32 samples 4..7
        cvtdq2ps  xmm2, xmm2                    ; xmm2 = float samples 0..3
        cvtdq2ps  xmm0, xmm0                    ; xmm0 = float samples 4..7

        ; --- second row of the pair: samples 8..F, same recipe ---
        psubb     xmm1, xmm7                    ; signed bytes 8..F
        punpcklbw xmm1, xmm1                    ; each byte doubled into a word
        punpcklwd xmm3, xmm1                    ; samples 8..B in high words
        punpckhwd xmm1, xmm1                    ; samples C..F in high words
        psrad     xmm3, (DWORD_BIT-BYTE_BIT)    ; xmm3 = int32 samples 8..B
        psrad     xmm1, (DWORD_BIT-BYTE_BIT)    ; xmm1 = int32 samples C..F
        cvtdq2ps  xmm3, xmm3                    ; xmm3 = float samples 8..B
        cvtdq2ps  xmm1, xmm1                    ; xmm1 = float samples C..F

        ; Write out both rows (four vectors of four floats).
        movaps  XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
        movaps  XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
        movaps  XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
        movaps  XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1

        ; Step to the next pair of rows.
        add     rdi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
        add     rsi, byte 2*SIZEOF_JSAMPROW
        dec     rcx
        jnz     short .rowpair

        pop     rbx
        uncollect_args
        pop     rbp
        ret


; --------------------------------------------------------------------------
;
; jsimd_quantize_float_sse2
;
; Quantize/descale the coefficients and store them into coef_block.
; Every workspace value is multiplied by the matching entry of the divisors
; table, converted to a 32-bit integer, and narrowed to 16 bits with signed
; saturation.  Sixteen coefficients are produced per loop iteration and the
; loop runs DCTSIZE2/16 times.
;
; C prototype:
;   GLOBAL(void)
;   jsimd_quantize_float_sse2 (JCOEFPTR coef_block, FAST_FLOAT *divisors,
;                              FAST_FLOAT *workspace);
;
; Arguments as seen after collect_args:
;   r10 = JCOEFPTR coef_block
;   r11 = FAST_FLOAT *divisors
;   r12 = FAST_FLOAT *workspace
;
; Register roles inside the loop:
;   rsi = workspace read cursor
;   rdx = divisors read cursor
;   rdi = coef_block write cursor
;   rax = iterations remaining
;
; All three buffers are accessed with aligned instructions (movaps, mulps
; with a memory operand, movdqa), which require 16-byte-aligned addresses.
;

        align   16
        global  EXTN(jsimd_quantize_float_sse2)

EXTN(jsimd_quantize_float_sse2):
        push    rbp
        mov     rax, rsp                ; NOTE(review): collect_args is defined
                                        ; outside this file; rax presumably has
                                        ; to hold the entry rsp for it -- confirm
        mov     rbp, rsp
        collect_args

        mov     rdi, r10                ; rdi = coef_block
        mov     rdx, r11                ; rdx = divisors
        mov     rsi, r12                ; rsi = workspace
        mov     rax, DCTSIZE2/16        ; 16 coefficients per iteration

.next16:
        ; --- first eight coefficients -> xmm0 ---
        movaps   xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
        movaps   xmm1, XMMWORD [XMMBLOCK(0,1,rsi,SIZEOF_FAST_FLOAT)]
        mulps    xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
        mulps    xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)]
        cvtps2dq xmm0, xmm0             ; float -> int32, MXCSR rounding mode
        cvtps2dq xmm1, xmm1
        packssdw xmm0, xmm1             ; 2 x 4 int32 -> 8 int16, saturating

        ; --- second eight coefficients -> xmm2 ---
        movaps   xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
        movaps   xmm3, XMMWORD [XMMBLOCK(1,1,rsi,SIZEOF_FAST_FLOAT)]
        mulps    xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
        mulps    xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)]
        cvtps2dq xmm2, xmm2
        cvtps2dq xmm3, xmm3
        packssdw xmm2, xmm3             ; 2 x 4 int32 -> 8 int16, saturating

        ; Store the sixteen results.
        movdqa  XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_JCOEF)], xmm0
        movdqa  XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_JCOEF)], xmm2

        ; Advance all three cursors by sixteen elements.
        add     rdi, byte 16*SIZEOF_JCOEF
        add     rdx, byte 16*SIZEOF_FAST_FLOAT
        add     rsi, byte 16*SIZEOF_FAST_FLOAT
        dec     rax
        jnz     short .next16

        uncollect_args
        pop     rbp
        ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
; The directive below pads the end of this section to a 16-byte boundary.
        align   16