;************************* memset32.asm *************************************
; Author: Agner Fog
; Date created: 2008-07-19
; Last modified: 2013-09-11
; Description:
; Faster version of the standard memset function:
; void * A_memset(void * dest, int c, size_t count);
; Sets 'count' bytes, starting at 'dest', to the 8-bit value 'c'
;
; Overriding standard function memset:
; The alias ?OVR_memset is changed to _memset in the object file if
; it is desired to override the standard library function memset.
;
; extern "C" size_t GetMemsetCacheLimit(); // Data blocks bigger than this will be stored uncached by memset
; extern "C" void SetMemsetCacheLimit(); // Change limit in GetMemsetCacheLimit
;
; Position-independent code is generated if POSITIONINDEPENDENT is defined.
;
; Optimization:
; Uses XMM or YMM registers to set 16 or 32 bytes at a time, aligned.
;
; The latest version of this file is available at:
; www.agner.org/optimize/asmexamples.zip
; Copyright (c) 2008-2013 GNU General Public License www.gnu.org/licenses
;******************************************************************************
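;
; A minimal C usage sketch (not part of this file), assuming the prototypes
; from the asmlib header; the leading underscores on the globals below are
; added by the 32-bit C compiler:
;
;   #include <stddef.h>
;   #include <string.h>
;
;   void * A_memset(void * dest, int c, size_t count);
;   size_t GetMemsetCacheLimit(void);
;   void SetMemsetCacheLimit(size_t limit);
;
;   int main(void) {
;       char buf[100], ref[100];
;       A_memset(buf, 0x5A, sizeof(buf));      // fill 100 bytes with 0x5A
;       memset(ref, 0x5A, sizeof(ref));        // reference fill
;       return memcmp(buf, ref, sizeof(buf));  // 0 if the fills agree
;   }
;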
global _A_memset: function ; Function memset
global ?OVR_memset: function ; ?OVR removed if standard function memset overridden
global _GetMemsetCacheLimit: function ; Data blocks bigger than this will be stored uncached by memset
global _SetMemsetCacheLimit: function ; Change the limit returned by GetMemsetCacheLimit
; Direct entries to CPU-specific versions
global _memset386: function ; version for old CPUs without SSE
global _memsetSSE2: function ; SSE2 version
global _memsetAVX: function ; version for CPUs with fast 256-bit store
; Imported from cachesize32.asm:
extern _DataCacheSize ; Get size of data cache
; Imported from instrset32.asm
extern _InstructionSet ; Instruction set for CPU dispatcher
; Imported from unalignedisfaster32.asm:
extern _Store256BitIsFaster ; Tells if a 256 bit store is faster than two 128 bit stores
; Define return from this function
%MACRO RETURNM 0
%IFDEF POSITIONINDEPENDENT
pop ebx
%ENDIF
mov eax, [esp+4] ; return dest
ret
%ENDMACRO
SECTION .text align=16
; extern "C" void * memset(void * dest, int c, size_t count);
; Function entry:
_A_memset:
?OVR_memset:
%IFNDEF POSITIONINDEPENDENT
jmp dword [memsetDispatch] ; Go to appropriate version, depending on instruction set
RP equ 0 ; RP = 0 if not position-independent
%ELSE ; Position-independent code
push ebx
call get_thunk_ebx ; get reference point for position-independent code
RP: ; reference point ebx = offset RP
; Make the following instruction with address relative to RP:
jmp dword [ebx+memsetDispatch-RP]
%ENDIF
_memsetAVX: ; AVX version. Use ymm register
%IFDEF POSITIONINDEPENDENT
push ebx
call get_thunk_ebx ; get reference point for position-independent code
add ebx, RP - $
memsetAVX@: ; local label
mov edx, [esp+4+4] ; dest
movzx eax, byte [esp+4+8] ; c
mov ecx, [esp+4+12] ; count
%ELSE
memsetAVX@: ; local label
mov edx, [esp+4] ; dest
movzx eax, byte [esp+8] ; c
mov ecx, [esp+12] ; count
%ENDIF
imul eax, 01010101H ; Broadcast c into all bytes of eax
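; (e.g. c = 5AH: 5AH * 01010101H = 5A5A5A5AH, i.e. c repeated in all four bytes)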
cmp ecx, 16
ja B100
B050: ; count <= 16, used by both the SSE2 and AVX versions
%IFNDEF POSITIONINDEPENDENT
jmp dword [MemsetJTab+ecx*4]
%ELSE
jmp dword [MemsetJTab-RP+ebx+ecx*4]
%ENDIF
; Separate code for each count from 0 to 16:
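; Each entry Mn stores exactly n bytes and falls through to smaller entries;
; e.g. count = 12 enters at M12 and falls through M08 and M04 to M00,
; performing three dword stores at offsets 8, 4 and 0.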
M16: mov [edx+12], eax
M12: mov [edx+8], eax
M08: mov [edx+4], eax
M04: mov [edx], eax
M00: RETURNM
M15: mov [edx+11], eax
M11: mov [edx+7], eax
M07: mov [edx+3], eax
M03: mov [edx+1], ax
M01: mov [edx], al
RETURNM
M14: mov [edx+10], eax
M10: mov [edx+6], eax
M06: mov [edx+2], eax
M02: mov [edx], ax
RETURNM
M13: mov [edx+9], eax
M09: mov [edx+5], eax
M05: mov [edx+1], eax
mov [edx], al
RETURNM
align 16
B100: ; count > 16.
movd xmm0, eax
pshufd xmm0, xmm0, 0 ; Broadcast c into all bytes of xmm0
lea eax, [edx+ecx] ; point to end
cmp ecx, 20H
jbe K600 ; faster to use xmm registers if small
; Store the first possibly unaligned 16 bytes
; It is faster to always write 16 bytes, possibly overlapping
; with the subsequent regular part, than to make possibly mispredicted
; branches depending on the size of the first part.
movups oword [edx], xmm0
; store another 16 bytes, aligned
add edx, 10H
and edx, -10H
movaps oword [edx], xmm0
; round up to the next 32-byte boundary
add edx, 10H
and edx, -20H
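; (worked example: dest = 1013H gives movups at 1013H-1022H, movaps at
; 1020H-102FH, and edx = 1020H, a 32-byte boundary inside the written region)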
; Check if count is very big
%IFNDEF POSITIONINDEPENDENT
cmp ecx, [_MemsetCacheLimit]
%ELSE ; position-independent code
cmp ecx, [ebx+_MemsetCacheLimit-RP]
%ENDIF
ja K300 ; Use non-temporal store if count > MemsetCacheLimit
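; (non-temporal stores bypass the cache, so a fill larger than the cache does
; not evict useful data with data that will not be read again soon)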
; find the last 32-byte boundary before the end
mov ecx, eax
and ecx, -20H
; edx = -(size of the 32-byte block part)
sub edx, ecx
jnb K200 ; Jump if not negative
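; (e.g. aligned start edx = 1020H, ecx = 1440H: edx becomes -420H, the first
; store in the loop below hits [1440H-420H] = 1020H, the last one [1440H-20H])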
; extend value to 256 bits
vinsertf128 ymm0,ymm0,xmm0,1
K100: ; Loop through 32-byte blocks
; ecx = end of the 32-byte block part
; edx = negative index from the end, counting up to zero
vmovaps [ecx+edx], ymm0
add edx, 20H
jnz K100
vzeroupper
K200: ; The last part, from ecx to eax, is < 32 bytes. Write the last 32 bytes with overlap
movups [eax-20H], xmm0
movups [eax-10H], xmm0
RETURNM
K300: ; Use non-temporal moves, same code as above:
; find the last 32-byte boundary before the end
mov ecx, eax
and ecx, -20H
; edx = -(size of the 32-byte block part)
sub edx, ecx
jnb K500 ; Jump if not negative
; extend value to 256 bits
vinsertf128 ymm0,ymm0,xmm0,1
align 16
K400: ; Loop through 32-byte blocks
; ecx = end of the 32-byte block part
; edx = negative index from the end, counting up to zero
vmovntps [ecx+edx], ymm0
add edx, 20H
jnz K400
vzeroupper
K500: ; The last part, from ecx to eax, is < 32 bytes. Write the last 32 bytes with overlap
movups [eax-20H], xmm0
movups [eax-10H], xmm0
RETURNM
K600: ; 16 < count <= 32
movups [edx], xmm0
movups [eax-10H], xmm0
RETURNM
align 16
_memsetSSE2: ; SSE2 version. Use xmm register
%IFDEF POSITIONINDEPENDENT
push ebx
call get_thunk_ebx ; get reference point for position-independent code
add ebx, RP - $
memsetSSE2@: ; local label
mov edx, [esp+4+4] ; dest
movzx eax, byte [esp+4+8] ; c
mov ecx, [esp+4+12] ; count
%ELSE
memsetSSE2@: ; local label
mov edx, [esp+4] ; dest
movzx eax, byte [esp+8] ; c
mov ecx, [esp+12] ; count
%ENDIF
imul eax, 01010101H ; Broadcast c into all bytes of eax
cmp ecx, 16
jna B050 ; small counts: same as AVX version
movd xmm0, eax
pshufd xmm0, xmm0, 0 ; Broadcast c into all bytes of xmm0
; Store the first unaligned part.
; The size of this part is 1 - 16 bytes.
; It is faster to always write 16 bytes, possibly overlapping
; with the subsequent regular part, than to make possibly mispredicted
; branches depending on the size of the first part.
movq qword [edx], xmm0
movq qword [edx+8], xmm0
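; (the 16 bytes are written as two 8-byte movq stores, presumably because an
; unaligned 16-byte movups store is slow on some of the older SSE2 processors
; this version targets)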
; Check if count is very big
%IFNDEF POSITIONINDEPENDENT
cmp ecx, [_MemsetCacheLimit]
%ELSE ; position-independent code
cmp ecx, [ebx+_MemsetCacheLimit-RP]
%ENDIF
ja M500 ; Use non-temporal store if count > MemsetCacheLimit
; Point to the end of the regular part:
; Round down dest+count to the nearest preceding 16-byte boundary
lea ecx, [edx+ecx-1]
and ecx, -10H
; Point to the start of the regular part:
; Round up dest to the next 16-byte boundary
add edx, 10H
and edx, -10H
; edx = -(size of the regular part)
sub edx, ecx
jnb M300 ; Jump if not negative
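; (worked example: dest = 1005H, count = 100H: end = 1105H, ecx = 1100H,
; edx = 1010H, so the regular part is F0H bytes and edx becomes -F0H)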
align 16
M200: ; Loop through regular part
; ecx = end of regular part
; edx = negative index from the end, counting up to zero
movdqa [ecx+edx], xmm0
add edx, 10H
jnz M200
M300: ; Do the last irregular part
; The size of this part is 1 - 16 bytes.
; It is faster to always write 16 bytes, possibly overlapping
; with the preceding regular part, than to make possibly mispredicted
; branches depending on the size of the last part.
%IFDEF POSITIONINDEPENDENT ; (ebx is pushed)
mov eax, [esp+4+4] ; dest
mov ecx, [esp+4+12] ; count
%ELSE
mov eax, [esp+4] ; dest
mov ecx, [esp+12] ; count
%ENDIF
movq qword [eax+ecx-10H], xmm0
movq qword [eax+ecx-8], xmm0
RETURNM
M500: ; Use non-temporal moves, same code as above:
; End of the regular part:
; Round down dest+count to the nearest preceding 16-byte boundary
lea ecx, [edx+ecx-1]
and ecx, -10H
; Start of the regular part:
; Round up dest to the next 16-byte boundary
add edx, 10H
and edx, -10H
; edx = -(size of the regular part)
sub edx, ecx
jnb M700 ; Jump if not negative
align 16
M600: ; Loop through regular part
; ecx = end of regular part
; edx = negative index from the end, counting up to zero
movntdq [ecx+edx], xmm0
add edx, 10H
jnz M600
M700: ; Do the last irregular part (same as M300)
%IFDEF POSITIONINDEPENDENT ; (ebx is pushed)
mov eax, [esp+4+4] ; dest
mov ecx, [esp+4+12] ; count
%ELSE
mov eax, [esp+4] ; dest
mov ecx, [esp+12] ; count
%ENDIF
movq qword [eax+ecx-10H], xmm0
movq qword [eax+ecx-8], xmm0
RETURNM
_memset386: ; 80386 version
%IFDEF POSITIONINDEPENDENT
push ebx
call get_thunk_ebx ; get reference point for position-independent code
add ebx, RP - $
memset386@: ; local label
mov edx, [esp+4+4] ; dest
xor eax, eax
mov al, byte [esp+4+8] ; c
mov ecx, [esp+4+12] ; count
%ELSE
memset386@: ; local label
mov edx, [esp+4] ; dest
xor eax, eax
mov al, byte [esp+8] ; c
mov ecx, [esp+12] ; count
%ENDIF
imul eax, 01010101H ; Broadcast c into all bytes of eax
push edi
mov edi, edx
cmp ecx, 4
jb N400
N200: test edi, 3
jz N300
; unaligned
N210: mov [edi], al ; store 1 byte until edi aligned
inc edi
dec ecx
test edi, 3
jnz N210
N300: ; aligned
mov edx, ecx
shr ecx, 2
cld
rep stosd ; store 4 bytes at a time
mov ecx, edx
and ecx, 3
N400: rep stosb ; store any remaining bytes
pop edi
RETURNM
; CPU dispatching for memset. This is executed only once
memsetCPUDispatch:
%IFNDEF POSITIONINDEPENDENT
pushad
call GetMemsetCacheLimit@ ; calculate cache limit
call _InstructionSet ; get supported instruction set
; Point to generic version of memset
mov dword [memsetDispatch], memset386@
cmp eax, 4 ; check SSE2
jb Q100
; SSE2 supported
; Point to SSE2 version of memset
mov dword [memsetDispatch], memsetSSE2@
call _Store256BitIsFaster ; check if 256-bit stores are available and faster
test eax, eax
jz Q100
mov dword [memsetDispatch], memsetAVX@
Q100: popad
; Continue in appropriate version of memset
jmp dword [memsetDispatch]
%ELSE ; Position-independent version
pushad
call GetMemsetCacheLimit@
call _InstructionSet
; Point to generic version of memset
lea esi, [ebx+memset386@-RP]
cmp eax, 4 ; check SSE2
jb Q100
; SSE2 supported
; Point to SSE2 version of memset
lea esi, [ebx+memsetSSE2@-RP]
call _Store256BitIsFaster ; check if 256-bit stores are available and faster
test eax, eax
jz Q100
lea esi, [ebx+memsetAVX@-RP]
Q100: mov [ebx+memsetDispatch-RP], esi
popad
; Continue in appropriate version of memset
jmp [ebx+memsetDispatch-RP]
get_thunk_ebx: ; load caller address into ebx for position-independent code
mov ebx, [esp]
ret
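; (the call/[esp] trick is the standard way to read the instruction pointer
; on 32-bit x86, which has no EIP-relative addressing)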
%ENDIF
; extern "C" size_t GetMemsetCacheLimit(); // Data blocks bigger than this will be stored uncached by memset
_GetMemsetCacheLimit:
GetMemsetCacheLimit@: ; local label
push ebx
%ifdef POSITIONINDEPENDENT
call get_thunk_ebx
add ebx, _MemsetCacheLimit - $
%else
mov ebx, _MemsetCacheLimit
%endif
mov eax, [ebx]
test eax, eax
jnz U200
; Get half the size of the largest level cache
push 0 ; 0 means largest level cache
call _DataCacheSize ; get cache size
pop ecx
shr eax, 1 ; half the size
jnz U100
mov eax, 400000H ; cannot determine cache size. use 4 Mbytes
U100: mov [ebx], eax
U200: pop ebx
ret
; extern "C" void SetMemsetCacheLimit(); // Change limit in GetMemsetCacheLimit
_SetMemsetCacheLimit:
push ebx
%ifdef POSITIONINDEPENDENT
call get_thunk_ebx
add ebx, _MemsetCacheLimit - $
%else
mov ebx, _MemsetCacheLimit
%endif
mov eax, [esp+8]
test eax, eax
jnz U400
; zero means: use the default value
mov [ebx], eax
call GetMemsetCacheLimit@
U400:
mov [ebx], eax
pop ebx
ret
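;
; Usage sketch (not part of this file), assuming the C prototypes given above:
;
;   SetMemsetCacheLimit(0x100000);          // non-temporal stores above 1 MB
;   SetMemsetCacheLimit(0);                 // restore the computed default
;   size_t limit = GetMemsetCacheLimit();   // read the current limit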
SECTION .data
align 16
; Jump table for count from 0 to 16:
MemsetJTab DD M00, M01, M02, M03, M04, M05, M06, M07
DD M08, M09, M10, M11, M12, M13, M14, M15, M16
; Pointer to appropriate version.
; This initially points to memsetCPUDispatch. memsetCPUDispatch will
; change this to the appropriate version of memset, so that
; memsetCPUDispatch is only executed once:
memsetDispatch DD memsetCPUDispatch
; Bypass cache by using non-temporal moves if count > MemsetCacheLimit
; The optimal value of MemsetCacheLimit is difficult to estimate, but
; a reasonable value is half the size of the largest cache
_MemsetCacheLimit: DD 0
%IFDEF POSITIONINDEPENDENT
; Fix potential problem in Mac linker
DD 0, 0
%ENDIF