|
1 ;uInt longest_match_x64( |
|
2 ; deflate_state *s, |
|
3 ; IPos cur_match); /* current match */ |
|
4 |
|
5 ; gvmat64.asm -- Asm portion of the optimized longest_match for 32 bits x86 |
|
6 ; Copyright (C) 1995-2005 Jean-loup Gailly, Brian Raiter and Gilles Vollant. |
|
7 ; |
|
8 ; File written by Gilles Vollant, by converting to assembly the longest_match |
|
9 ; from Jean-loup Gailly in deflate.c of zLib and infoZip zip. |
|
10 ; |
|
11 ; and by taking inspiration on asm686 with masm, optimised assembly code |
|
12 ; from Brian Raiter, written 1998 |
|
13 ; |
|
14 ; http://www.zlib.net |
|
15 ; http://www.winimage.com/zLibDll |
|
16 ; http://www.muppetlabs.com/~breadbox/software/assembly.html |
|
17 ; |
|
18 ; to compile this file for infozip Zip, I use option: |
|
19 ; ml64.exe /Flgvmat64 /c /Zi /DINFOZIP gvmat64.asm |
|
20 ; |
|
21 ; to compile this file for zLib, I use option: |
|
22 ; ml64.exe /Flgvmat64 /c /Zi gvmat64.asm |
|
23 ; Be carrefull to adapt zlib1222add below to your version of zLib |
|
24 ; (if you use a version of zLib before 1.0.4 or after 1.2.2.2, change |
|
25 ; value of zlib1222add later) |
|
26 ; |
|
27 ; This file compile with Microsoft Macro Assembler (x64) for AMD64 |
|
28 ; |
|
29 ; ml64.exe is given with Visual Studio 2005 and Windows 2003 server DDK |
|
30 ; |
|
31 ; (you can get Windows 2003 server DDK with ml64 and cl for AMD64 from |
|
32 ; http://www.microsoft.com/whdc/devtools/ddk/default.mspx for low price) |
|
33 ; |
|
34 |
|
35 |
|
36 ;uInt longest_match(s, cur_match) |
|
37 ; deflate_state *s; |
|
38 ; IPos cur_match; /* current match */ |
|
39 .code |
|
40 longest_match PROC |
|
41 |
|
42 |
|
43 ;LocalVarsSize equ 88 |
|
44 LocalVarsSize equ 72 |
|
45 |
|
46 ; register used : rax,rbx,rcx,rdx,rsi,rdi,r8,r9,r10,r11,r12 |
|
47 ; free register : r14,r15 |
|
48 ; register can be saved : rsp |
|
49 |
|
50 chainlenwmask equ rsp + 8 - LocalVarsSize ; high word: current chain len |
|
51 ; low word: s->wmask |
|
52 ;window equ rsp + xx - LocalVarsSize ; local copy of s->window ; stored in r10 |
|
53 ;windowbestlen equ rsp + xx - LocalVarsSize ; s->window + bestlen , use r10+r11 |
|
54 ;scanstart equ rsp + xx - LocalVarsSize ; first two bytes of string ; stored in r12w |
|
55 ;scanend equ rsp + xx - LocalVarsSize ; last two bytes of string use ebx |
|
56 ;scanalign equ rsp + xx - LocalVarsSize ; dword-misalignment of string r13 |
|
57 ;bestlen equ rsp + xx - LocalVarsSize ; size of best match so far -> r11d |
|
58 ;scan equ rsp + xx - LocalVarsSize ; ptr to string wanting match -> r9 |
|
59 IFDEF INFOZIP |
|
60 ELSE |
|
61 nicematch equ (rsp + 16 - LocalVarsSize) ; a good enough match size |
|
62 ENDIF |
|
63 |
|
64 save_rdi equ rsp + 24 - LocalVarsSize |
|
65 save_rsi equ rsp + 32 - LocalVarsSize |
|
66 save_rbx equ rsp + 40 - LocalVarsSize |
|
67 save_rbp equ rsp + 48 - LocalVarsSize |
|
68 save_r12 equ rsp + 56 - LocalVarsSize |
|
69 save_r13 equ rsp + 64 - LocalVarsSize |
|
70 ;save_r14 equ rsp + 72 - LocalVarsSize |
|
71 ;save_r15 equ rsp + 80 - LocalVarsSize |
|
72 |
|
73 |
|
74 |
|
75 ; all the +4 offsets are due to the addition of pending_buf_size (in zlib |
|
76 ; in the deflate_state structure since the asm code was first written |
|
77 ; (if you compile with zlib 1.0.4 or older, remove the +4). |
|
78 ; Note : these value are good with a 8 bytes boundary pack structure |
|
79 |
|
80 |
|
81 MAX_MATCH equ 258 |
|
82 MIN_MATCH equ 3 |
|
83 MIN_LOOKAHEAD equ (MAX_MATCH+MIN_MATCH+1) |
|
84 |
|
85 |
|
86 ;;; Offsets for fields in the deflate_state structure. These numbers |
|
87 ;;; are calculated from the definition of deflate_state, with the |
|
88 ;;; assumption that the compiler will dword-align the fields. (Thus, |
|
89 ;;; changing the definition of deflate_state could easily cause this |
|
90 ;;; program to crash horribly, without so much as a warning at |
|
91 ;;; compile time. Sigh.) |
|
92 |
|
93 ; all the +zlib1222add offsets are due to the addition of fields |
|
94 ; in zlib in the deflate_state structure since the asm code was first written |
|
95 ; (if you compile with zlib 1.0.4 or older, use "zlib1222add equ (-4)"). |
|
96 ; (if you compile with zlib between 1.0.5 and 1.2.2.1, use "zlib1222add equ 0"). |
|
97 ; if you compile with zlib 1.2.2.2 or later , use "zlib1222add equ 8"). |
|
98 |
|
99 |
|
100 IFDEF INFOZIP |
|
101 |
|
102 _DATA SEGMENT |
|
103 COMM window_size:DWORD |
|
104 ; WMask ; 7fff |
|
105 COMM window:BYTE:010040H |
|
106 COMM prev:WORD:08000H |
|
107 ; MatchLen : unused |
|
108 ; PrevMatch : unused |
|
109 COMM strstart:DWORD |
|
110 COMM match_start:DWORD |
|
111 ; Lookahead : ignore |
|
112 COMM prev_length:DWORD ; PrevLen |
|
113 COMM max_chain_length:DWORD |
|
114 COMM good_match:DWORD |
|
115 COMM nice_match:DWORD |
|
116 prev_ad equ OFFSET prev |
|
117 window_ad equ OFFSET window |
|
118 nicematch equ nice_match |
|
119 _DATA ENDS |
|
120 WMask equ 07fffh |
|
121 |
|
122 ELSE |
|
123 |
|
124 IFNDEF zlib1222add |
|
125 zlib1222add equ 8 |
|
126 ENDIF |
|
127 dsWSize equ 56+zlib1222add+(zlib1222add/2) |
|
128 dsWMask equ 64+zlib1222add+(zlib1222add/2) |
|
129 dsWindow equ 72+zlib1222add |
|
130 dsPrev equ 88+zlib1222add |
|
131 dsMatchLen equ 128+zlib1222add |
|
132 dsPrevMatch equ 132+zlib1222add |
|
133 dsStrStart equ 140+zlib1222add |
|
134 dsMatchStart equ 144+zlib1222add |
|
135 dsLookahead equ 148+zlib1222add |
|
136 dsPrevLen equ 152+zlib1222add |
|
137 dsMaxChainLen equ 156+zlib1222add |
|
138 dsGoodMatch equ 172+zlib1222add |
|
139 dsNiceMatch equ 176+zlib1222add |
|
140 |
|
141 window_size equ [ rcx + dsWSize] |
|
142 WMask equ [ rcx + dsWMask] |
|
143 window_ad equ [ rcx + dsWindow] |
|
144 prev_ad equ [ rcx + dsPrev] |
|
145 strstart equ [ rcx + dsStrStart] |
|
146 match_start equ [ rcx + dsMatchStart] |
|
147 Lookahead equ [ rcx + dsLookahead] ; 0ffffffffh on infozip |
|
148 prev_length equ [ rcx + dsPrevLen] |
|
149 max_chain_length equ [ rcx + dsMaxChainLen] |
|
150 good_match equ [ rcx + dsGoodMatch] |
|
151 nice_match equ [ rcx + dsNiceMatch] |
|
152 ENDIF |
|
153 |
|
154 ; parameter 1 in r8(deflate state s), param 2 in rdx (cur match) |
|
155 |
|
156 ; see http://weblogs.asp.net/oldnewthing/archive/2004/01/14/58579.aspx and |
|
157 ; http://msdn.microsoft.com/library/en-us/kmarch/hh/kmarch/64bitAMD_8e951dd2-ee77-4728-8702-55ce4b5dd24a.xml.asp |
|
158 ; |
|
159 ; All registers must be preserved across the call, except for |
|
160 ; rax, rcx, rdx, r8, r9, r10, and r11, which are scratch. |
|
161 |
|
162 |
|
163 |
|
164 ;;; Save registers that the compiler may be using, and adjust esp to |
|
165 ;;; make room for our stack frame. |
|
166 |
|
167 |
|
168 ;;; Retrieve the function arguments. r8d will hold cur_match |
|
169 ;;; throughout the entire function. edx will hold the pointer to the |
|
170 ;;; deflate_state structure during the function's setup (before |
|
171 ;;; entering the main loop. |
|
172 |
|
173 ; parameter 1 in rcx (deflate_state* s), param 2 in edx -> r8 (cur match) |
|
174 |
|
175 ; this clear high 32 bits of r8, which can be garbage in both r8 and rdx |
|
176 |
|
177 mov [save_rdi],rdi |
|
178 mov [save_rsi],rsi |
|
179 mov [save_rbx],rbx |
|
180 mov [save_rbp],rbp |
|
181 IFDEF INFOZIP |
|
182 mov r8d,ecx |
|
183 ELSE |
|
184 mov r8d,edx |
|
185 ENDIF |
|
186 mov [save_r12],r12 |
|
187 mov [save_r13],r13 |
|
188 ; mov [save_r14],r14 |
|
189 ; mov [save_r15],r15 |
|
190 |
|
191 |
|
192 ;;; uInt wmask = s->w_mask; |
|
193 ;;; unsigned chain_length = s->max_chain_length; |
|
194 ;;; if (s->prev_length >= s->good_match) { |
|
195 ;;; chain_length >>= 2; |
|
196 ;;; } |
|
197 |
|
198 mov edi, prev_length |
|
199 mov esi, good_match |
|
200 mov eax, WMask |
|
201 mov ebx, max_chain_length |
|
202 cmp edi, esi |
|
203 jl LastMatchGood |
|
204 shr ebx, 2 |
|
205 LastMatchGood: |
|
206 |
|
207 ;;; chainlen is decremented once beforehand so that the function can |
|
208 ;;; use the sign flag instead of the zero flag for the exit test. |
|
209 ;;; It is then shifted into the high word, to make room for the wmask |
|
210 ;;; value, which it will always accompany. |
|
211 |
|
212 dec ebx |
|
213 shl ebx, 16 |
|
214 or ebx, eax |
|
215 |
|
216 ;;; on zlib only |
|
217 ;;; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead; |
|
218 |
|
219 IFDEF INFOZIP |
|
220 mov [chainlenwmask], ebx |
|
221 ; on infozip nice_match = [nice_match] |
|
222 ELSE |
|
223 mov eax, nice_match |
|
224 mov [chainlenwmask], ebx |
|
225 mov r10d, Lookahead |
|
226 cmp r10d, eax |
|
227 cmovnl r10d, eax |
|
228 mov [nicematch],r10d |
|
229 ENDIF |
|
230 |
|
231 ;;; register Bytef *scan = s->window + s->strstart; |
|
232 mov r10, window_ad |
|
233 mov ebp, strstart |
|
234 lea r13, [r10 + rbp] |
|
235 |
|
236 ;;; Determine how many bytes the scan ptr is off from being |
|
237 ;;; dword-aligned. |
|
238 |
|
239 mov r9,r13 |
|
240 neg r13 |
|
241 and r13,3 |
|
242 |
|
243 ;;; IPos limit = s->strstart > (IPos)MAX_DIST(s) ? |
|
244 ;;; s->strstart - (IPos)MAX_DIST(s) : NIL; |
|
245 IFDEF INFOZIP |
|
246 mov eax,07efah ; MAX_DIST = (WSIZE-MIN_LOOKAHEAD) (0x8000-(3+8+1)) |
|
247 ELSE |
|
248 mov eax, window_size |
|
249 sub eax, MIN_LOOKAHEAD |
|
250 ENDIF |
|
251 xor edi,edi |
|
252 sub ebp, eax |
|
253 |
|
254 mov r11d, prev_length |
|
255 |
|
256 cmovng ebp,edi |
|
257 |
|
258 ;;; int best_len = s->prev_length; |
|
259 |
|
260 |
|
261 ;;; Store the sum of s->window + best_len in esi locally, and in esi. |
|
262 |
|
263 lea rsi,[r10+r11] |
|
264 |
|
265 ;;; register ush scan_start = *(ushf*)scan; |
|
266 ;;; register ush scan_end = *(ushf*)(scan+best_len-1); |
|
267 ;;; Posf *prev = s->prev; |
|
268 |
|
269 movzx r12d,word ptr [r9] |
|
270 movzx ebx, word ptr [r9 + r11 - 1] |
|
271 |
|
272 mov rdi, prev_ad |
|
273 |
|
274 ;;; Jump into the main loop. |
|
275 |
|
276 mov edx, [chainlenwmask] |
|
277 |
|
278 cmp bx,word ptr [rsi + r8 - 1] |
|
279 jz LookupLoopIsZero |
|
280 |
|
281 LookupLoop1: |
|
282 and r8d, edx |
|
283 |
|
284 movzx r8d, word ptr [rdi + r8*2] |
|
285 cmp r8d, ebp |
|
286 jbe LeaveNow |
|
287 sub edx, 00010000h |
|
288 js LeaveNow |
|
289 |
|
290 LoopEntry1: |
|
291 cmp bx,word ptr [rsi + r8 - 1] |
|
292 jz LookupLoopIsZero |
|
293 |
|
294 LookupLoop2: |
|
295 and r8d, edx |
|
296 |
|
297 movzx r8d, word ptr [rdi + r8*2] |
|
298 cmp r8d, ebp |
|
299 jbe LeaveNow |
|
300 sub edx, 00010000h |
|
301 js LeaveNow |
|
302 |
|
303 LoopEntry2: |
|
304 cmp bx,word ptr [rsi + r8 - 1] |
|
305 jz LookupLoopIsZero |
|
306 |
|
307 LookupLoop4: |
|
308 and r8d, edx |
|
309 |
|
310 movzx r8d, word ptr [rdi + r8*2] |
|
311 cmp r8d, ebp |
|
312 jbe LeaveNow |
|
313 sub edx, 00010000h |
|
314 js LeaveNow |
|
315 |
|
316 LoopEntry4: |
|
317 |
|
318 cmp bx,word ptr [rsi + r8 - 1] |
|
319 jnz LookupLoop1 |
|
320 jmp LookupLoopIsZero |
|
321 |
|
322 |
|
323 ;;; do { |
|
324 ;;; match = s->window + cur_match; |
|
325 ;;; if (*(ushf*)(match+best_len-1) != scan_end || |
|
326 ;;; *(ushf*)match != scan_start) continue; |
|
327 ;;; [...] |
|
328 ;;; } while ((cur_match = prev[cur_match & wmask]) > limit |
|
329 ;;; && --chain_length != 0); |
|
330 ;;; |
|
331 ;;; Here is the inner loop of the function. The function will spend the |
|
332 ;;; majority of its time in this loop, and majority of that time will |
|
333 ;;; be spent in the first ten instructions. |
|
334 ;;; |
|
335 ;;; Within this loop: |
|
336 ;;; ebx = scanend |
|
337 ;;; r8d = curmatch |
|
338 ;;; edx = chainlenwmask - i.e., ((chainlen << 16) | wmask) |
|
339 ;;; esi = windowbestlen - i.e., (window + bestlen) |
|
340 ;;; edi = prev |
|
341 ;;; ebp = limit |
|
342 |
|
343 LookupLoop: |
|
344 and r8d, edx |
|
345 |
|
346 movzx r8d, word ptr [rdi + r8*2] |
|
347 cmp r8d, ebp |
|
348 jbe LeaveNow |
|
349 sub edx, 00010000h |
|
350 js LeaveNow |
|
351 |
|
352 LoopEntry: |
|
353 |
|
354 cmp bx,word ptr [rsi + r8 - 1] |
|
355 jnz LookupLoop1 |
|
356 LookupLoopIsZero: |
|
357 cmp r12w, word ptr [r10 + r8] |
|
358 jnz LookupLoop1 |
|
359 |
|
360 |
|
361 ;;; Store the current value of chainlen. |
|
362 mov [chainlenwmask], edx |
|
363 |
|
364 ;;; Point edi to the string under scrutiny, and esi to the string we |
|
365 ;;; are hoping to match it up with. In actuality, esi and edi are |
|
366 ;;; both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and edx is |
|
367 ;;; initialized to -(MAX_MATCH_8 - scanalign). |
|
368 |
|
369 lea rsi,[r8+r10] |
|
370 mov rdx, 0fffffffffffffef8h; -(MAX_MATCH_8) |
|
371 lea rsi, [rsi + r13 + 0108h] ;MAX_MATCH_8] |
|
372 lea rdi, [r9 + r13 + 0108h] ;MAX_MATCH_8] |
|
373 |
|
374 prefetcht1 [rsi+rdx] |
|
375 prefetcht1 [rdi+rdx] |
|
376 |
|
377 |
|
378 ;;; Test the strings for equality, 8 bytes at a time. At the end, |
|
379 ;;; adjust rdx so that it is offset to the exact byte that mismatched. |
|
380 ;;; |
|
381 ;;; We already know at this point that the first three bytes of the |
|
382 ;;; strings match each other, and they can be safely passed over before |
|
383 ;;; starting the compare loop. So what this code does is skip over 0-3 |
|
384 ;;; bytes, as much as necessary in order to dword-align the edi |
|
385 ;;; pointer. (rsi will still be misaligned three times out of four.) |
|
386 ;;; |
|
387 ;;; It should be confessed that this loop usually does not represent |
|
388 ;;; much of the total running time. Replacing it with a more |
|
389 ;;; straightforward "rep cmpsb" would not drastically degrade |
|
390 ;;; performance. |
|
391 |
|
392 |
|
393 LoopCmps: |
|
394 mov rax, [rsi + rdx] |
|
395 xor rax, [rdi + rdx] |
|
396 jnz LeaveLoopCmps |
|
397 |
|
398 mov rax, [rsi + rdx + 8] |
|
399 xor rax, [rdi + rdx + 8] |
|
400 jnz LeaveLoopCmps8 |
|
401 |
|
402 |
|
403 mov rax, [rsi + rdx + 8+8] |
|
404 xor rax, [rdi + rdx + 8+8] |
|
405 jnz LeaveLoopCmps16 |
|
406 |
|
407 add rdx,8+8+8 |
|
408 |
|
409 jmp short LoopCmps |
|
410 LeaveLoopCmps16: add rdx,8 |
|
411 LeaveLoopCmps8: add rdx,8 |
|
412 LeaveLoopCmps: |
|
413 |
|
414 test eax, 0000FFFFh |
|
415 jnz LenLower |
|
416 |
|
417 test eax,0ffffffffh |
|
418 |
|
419 jnz LenLower32 |
|
420 |
|
421 add rdx,4 |
|
422 shr rax,32 |
|
423 or ax,ax |
|
424 jnz LenLower |
|
425 |
|
426 LenLower32: |
|
427 shr eax,16 |
|
428 add rdx,2 |
|
429 LenLower: sub al, 1 |
|
430 adc rdx, 0 |
|
431 ;;; Calculate the length of the match. If it is longer than MAX_MATCH, |
|
432 ;;; then automatically accept it as the best possible match and leave. |
|
433 |
|
434 lea rax, [rdi + rdx] |
|
435 sub rax, r9 |
|
436 cmp eax, MAX_MATCH |
|
437 jge LenMaximum |
|
438 |
|
439 ;;; If the length of the match is not longer than the best match we |
|
440 ;;; have so far, then forget it and return to the lookup loop. |
|
441 ;/////////////////////////////////// |
|
442 |
|
443 cmp eax, r11d |
|
444 jg LongerMatch |
|
445 |
|
446 lea rsi,[r10+r11] |
|
447 |
|
448 mov rdi, prev_ad |
|
449 mov edx, [chainlenwmask] |
|
450 jmp LookupLoop |
|
451 |
|
452 ;;; s->match_start = cur_match; |
|
453 ;;; best_len = len; |
|
454 ;;; if (len >= nice_match) break; |
|
455 ;;; scan_end = *(ushf*)(scan+best_len-1); |
|
456 |
|
457 LongerMatch: |
|
458 mov r11d, eax |
|
459 mov match_start, r8d |
|
460 cmp eax, [nicematch] |
|
461 jge LeaveNow |
|
462 |
|
463 lea rsi,[r10+rax] |
|
464 |
|
465 movzx ebx, word ptr [r9 + rax - 1] |
|
466 mov rdi, prev_ad |
|
467 mov edx, [chainlenwmask] |
|
468 jmp LookupLoop |
|
469 |
|
470 ;;; Accept the current string, with the maximum possible length. |
|
471 |
|
472 LenMaximum: |
|
473 mov r11d,MAX_MATCH |
|
474 mov match_start, r8d |
|
475 |
|
476 ;;; if ((uInt)best_len <= s->lookahead) return (uInt)best_len; |
|
477 ;;; return s->lookahead; |
|
478 |
|
479 LeaveNow: |
|
480 IFDEF INFOZIP |
|
481 mov eax,r11d |
|
482 ELSE |
|
483 mov eax, Lookahead |
|
484 cmp r11d, eax |
|
485 cmovng eax, r11d |
|
486 ENDIF |
|
487 |
|
488 ;;; Restore the stack and return from whence we came. |
|
489 |
|
490 |
|
491 mov rsi,[save_rsi] |
|
492 mov rdi,[save_rdi] |
|
493 mov rbx,[save_rbx] |
|
494 mov rbp,[save_rbp] |
|
495 mov r12,[save_r12] |
|
496 mov r13,[save_r13] |
|
497 ; mov r14,[save_r14] |
|
498 ; mov r15,[save_r15] |
|
499 |
|
500 |
|
501 ret 0 |
|
502 ; please don't remove this string ! |
|
503 ; Your can freely use gvmat64 in any free or commercial app |
|
504 ; but it is far better don't remove the string in the binary! |
|
505 db 0dh,0ah,"asm686 with masm, optimised assembly code from Brian Raiter, written 1998, converted to amd 64 by Gilles Vollant 2005",0dh,0ah,0 |
|
506 longest_match ENDP |
|
507 |
|
508 match_init PROC |
|
509 ret 0 |
|
510 match_init ENDP |
|
511 |
|
512 |
|
513 END |