symbian-qemu-0.9.1-12/zlib-1.2.3/contrib/masmx64/gvmat64.asm
changeset 1 2fb8b9db1c86
equal deleted inserted replaced
0:ffa851df0825 1:2fb8b9db1c86
       
     1 ;uInt longest_match_x64(
       
     2 ;    deflate_state *s,
       
     3 ;    IPos cur_match);                             /* current match */
       
     4 
       
     5 ; gvmat64.asm -- Asm portion of the optimized longest_match for 32 bits x86
       
     6 ; Copyright (C) 1995-2005 Jean-loup Gailly, Brian Raiter and Gilles Vollant.
       
     7 ;
       
     8 ; File written by Gilles Vollant, by converting to assembly the longest_match
       
     9 ;  from Jean-loup Gailly in deflate.c of zLib and infoZip zip.
       
    10 ;
       
    11 ;  and by taking inspiration on asm686 with masm, optimised assembly code
       
    12 ;        from Brian Raiter, written 1998
       
    13 ;
       
    14 ;         http://www.zlib.net
       
    15 ;         http://www.winimage.com/zLibDll
       
    16 ;         http://www.muppetlabs.com/~breadbox/software/assembly.html
       
    17 ;
       
    18 ; to compile this file for infozip Zip, I use option:
       
    19 ;   ml64.exe /Flgvmat64 /c /Zi /DINFOZIP gvmat64.asm
       
    20 ;
       
    21 ; to compile this file for zLib, I use option:
       
    22 ;   ml64.exe /Flgvmat64 /c /Zi gvmat64.asm
       
    23 ; Be carrefull to adapt zlib1222add below to your version of zLib
       
    24 ;   (if you use a version of zLib before 1.0.4 or after 1.2.2.2, change
       
    25 ;    value of zlib1222add later)
       
    26 ;
       
    27 ; This file compile with Microsoft Macro Assembler (x64) for AMD64
       
    28 ;
       
    29 ;   ml64.exe is given with Visual Studio 2005 and Windows 2003 server DDK
       
    30 ;
       
    31 ;   (you can get Windows 2003 server DDK with ml64 and cl for AMD64 from
       
    32 ;      http://www.microsoft.com/whdc/devtools/ddk/default.mspx for low price)
       
    33 ;
       
    34 
       
    35 
       
    36 ;uInt longest_match(s, cur_match)
       
    37 ;    deflate_state *s;
       
    38 ;    IPos cur_match;                             /* current match */
       
    39 .code
       
    40 longest_match PROC
       
    41 
       
    42 
       
    43 ;LocalVarsSize   equ 88
       
    44  LocalVarsSize   equ 72
       
    45 
       
    46 ; register used : rax,rbx,rcx,rdx,rsi,rdi,r8,r9,r10,r11,r12
       
    47 ; free register :  r14,r15
       
    48 ; register can be saved : rsp
       
    49 
       
    50  chainlenwmask   equ  rsp + 8 - LocalVarsSize    ; high word: current chain len
       
    51                                                  ; low word: s->wmask
       
    52 ;window          equ  rsp + xx - LocalVarsSize   ; local copy of s->window ; stored in r10
       
    53 ;windowbestlen   equ  rsp + xx - LocalVarsSize   ; s->window + bestlen , use r10+r11
       
    54 ;scanstart       equ  rsp + xx - LocalVarsSize   ; first two bytes of string ; stored in r12w
       
    55 ;scanend         equ  rsp + xx - LocalVarsSize   ; last two bytes of string use ebx
       
    56 ;scanalign       equ  rsp + xx - LocalVarsSize   ; dword-misalignment of string r13
       
    57 ;bestlen         equ  rsp + xx - LocalVarsSize   ; size of best match so far -> r11d
       
    58 ;scan            equ  rsp + xx - LocalVarsSize   ; ptr to string wanting match -> r9
       
    59 IFDEF INFOZIP
       
    60 ELSE
       
    61  nicematch       equ  (rsp + 16 - LocalVarsSize) ; a good enough match size
       
    62 ENDIF
       
    63 
       
    64 save_rdi        equ  rsp + 24 - LocalVarsSize
       
    65 save_rsi        equ  rsp + 32 - LocalVarsSize
       
    66 save_rbx        equ  rsp + 40 - LocalVarsSize
       
    67 save_rbp        equ  rsp + 48 - LocalVarsSize
       
    68 save_r12        equ  rsp + 56 - LocalVarsSize
       
    69 save_r13        equ  rsp + 64 - LocalVarsSize
       
    70 ;save_r14        equ  rsp + 72 - LocalVarsSize
       
    71 ;save_r15        equ  rsp + 80 - LocalVarsSize
       
    72 
       
    73 
       
    74 
       
    75 ;  all the +4 offsets are due to the addition of pending_buf_size (in zlib
       
    76 ;  in the deflate_state structure since the asm code was first written
       
    77 ;  (if you compile with zlib 1.0.4 or older, remove the +4).
       
    78 ;  Note : these value are good with a 8 bytes boundary pack structure
       
    79 
       
    80 
       
    81     MAX_MATCH           equ     258
       
    82     MIN_MATCH           equ     3
       
    83     MIN_LOOKAHEAD       equ     (MAX_MATCH+MIN_MATCH+1)
       
    84 
       
    85 
       
    86 ;;; Offsets for fields in the deflate_state structure. These numbers
       
    87 ;;; are calculated from the definition of deflate_state, with the
       
    88 ;;; assumption that the compiler will dword-align the fields. (Thus,
       
    89 ;;; changing the definition of deflate_state could easily cause this
       
    90 ;;; program to crash horribly, without so much as a warning at
       
    91 ;;; compile time. Sigh.)
       
    92 
       
    93 ;  all the +zlib1222add offsets are due to the addition of fields
       
    94 ;  in zlib in the deflate_state structure since the asm code was first written
       
    95 ;  (if you compile with zlib 1.0.4 or older, use "zlib1222add equ (-4)").
       
    96 ;  (if you compile with zlib between 1.0.5 and 1.2.2.1, use "zlib1222add equ 0").
       
    97 ;  if you compile with zlib 1.2.2.2 or later , use "zlib1222add equ 8").
       
    98 
       
    99 
       
   100 IFDEF INFOZIP
       
   101 
       
   102 _DATA   SEGMENT
       
   103 COMM    window_size:DWORD
       
   104 ; WMask ; 7fff
       
   105 COMM    window:BYTE:010040H
       
   106 COMM    prev:WORD:08000H
       
   107 ; MatchLen : unused
       
   108 ; PrevMatch : unused
       
   109 COMM    strstart:DWORD
       
   110 COMM    match_start:DWORD
       
   111 ; Lookahead : ignore
       
   112 COMM    prev_length:DWORD ; PrevLen
       
   113 COMM    max_chain_length:DWORD
       
   114 COMM    good_match:DWORD
       
   115 COMM    nice_match:DWORD
       
   116 prev_ad equ OFFSET prev
       
   117 window_ad equ OFFSET window
       
   118 nicematch equ nice_match
       
   119 _DATA ENDS
       
   120 WMask equ 07fffh
       
   121 
       
   122 ELSE
       
   123 
       
   124   IFNDEF zlib1222add
       
   125     zlib1222add equ 8
       
   126   ENDIF
       
   127 dsWSize         equ 56+zlib1222add+(zlib1222add/2)
       
   128 dsWMask         equ 64+zlib1222add+(zlib1222add/2)
       
   129 dsWindow        equ 72+zlib1222add
       
   130 dsPrev          equ 88+zlib1222add
       
   131 dsMatchLen      equ 128+zlib1222add
       
   132 dsPrevMatch     equ 132+zlib1222add
       
   133 dsStrStart      equ 140+zlib1222add
       
   134 dsMatchStart    equ 144+zlib1222add
       
   135 dsLookahead     equ 148+zlib1222add
       
   136 dsPrevLen       equ 152+zlib1222add
       
   137 dsMaxChainLen   equ 156+zlib1222add
       
   138 dsGoodMatch     equ 172+zlib1222add
       
   139 dsNiceMatch     equ 176+zlib1222add
       
   140 
       
   141 window_size     equ [ rcx + dsWSize]
       
   142 WMask           equ [ rcx + dsWMask]
       
   143 window_ad       equ [ rcx + dsWindow]
       
   144 prev_ad         equ [ rcx + dsPrev]
       
   145 strstart        equ [ rcx + dsStrStart]
       
   146 match_start     equ [ rcx + dsMatchStart]
       
   147 Lookahead       equ [ rcx + dsLookahead] ; 0ffffffffh on infozip
       
   148 prev_length     equ [ rcx + dsPrevLen]
       
   149 max_chain_length equ [ rcx + dsMaxChainLen]
       
   150 good_match      equ [ rcx + dsGoodMatch]
       
   151 nice_match      equ [ rcx + dsNiceMatch]
       
   152 ENDIF
       
   153 
       
   154 ; parameter 1 in r8(deflate state s), param 2 in rdx (cur match)
       
   155 
       
   156 ; see http://weblogs.asp.net/oldnewthing/archive/2004/01/14/58579.aspx and
       
   157 ; http://msdn.microsoft.com/library/en-us/kmarch/hh/kmarch/64bitAMD_8e951dd2-ee77-4728-8702-55ce4b5dd24a.xml.asp
       
   158 ;
       
   159 ; All registers must be preserved across the call, except for
       
   160 ;   rax, rcx, rdx, r8, r9, r10, and r11, which are scratch.
       
   161 
       
   162 
       
   163 
       
   164 ;;; Save registers that the compiler may be using, and adjust esp to
       
   165 ;;; make room for our stack frame.
       
   166 
       
   167 
       
   168 ;;; Retrieve the function arguments. r8d will hold cur_match
       
   169 ;;; throughout the entire function. edx will hold the pointer to the
       
   170 ;;; deflate_state structure during the function's setup (before
       
   171 ;;; entering the main loop.
       
   172 
       
   173 ; parameter 1 in rcx (deflate_state* s), param 2 in edx -> r8 (cur match)
       
   174 
       
   175 ; this clear high 32 bits of r8, which can be garbage in both r8 and rdx
       
   176 
       
   177         mov [save_rdi],rdi
       
   178         mov [save_rsi],rsi
       
   179         mov [save_rbx],rbx
       
   180         mov [save_rbp],rbp
       
   181 IFDEF INFOZIP
       
   182         mov r8d,ecx
       
   183 ELSE
       
   184         mov r8d,edx
       
   185 ENDIF
       
   186         mov [save_r12],r12
       
   187         mov [save_r13],r13
       
   188 ;        mov [save_r14],r14
       
   189 ;        mov [save_r15],r15
       
   190 
       
   191 
       
   192 ;;; uInt wmask = s->w_mask;
       
   193 ;;; unsigned chain_length = s->max_chain_length;
       
   194 ;;; if (s->prev_length >= s->good_match) {
       
   195 ;;;     chain_length >>= 2;
       
   196 ;;; }
       
   197 
       
   198         mov edi, prev_length
       
   199         mov esi, good_match
       
   200         mov eax, WMask
       
   201         mov ebx, max_chain_length
       
   202         cmp edi, esi
       
   203         jl  LastMatchGood
       
   204         shr ebx, 2
       
   205 LastMatchGood:
       
   206 
       
   207 ;;; chainlen is decremented once beforehand so that the function can
       
   208 ;;; use the sign flag instead of the zero flag for the exit test.
       
   209 ;;; It is then shifted into the high word, to make room for the wmask
       
   210 ;;; value, which it will always accompany.
       
   211 
       
   212         dec ebx
       
   213         shl ebx, 16
       
   214         or  ebx, eax
       
   215 
       
   216 ;;; on zlib only
       
   217 ;;; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead;
       
   218 
       
   219 IFDEF INFOZIP
       
   220         mov [chainlenwmask], ebx
       
   221 ; on infozip nice_match = [nice_match]
       
   222 ELSE
       
   223         mov eax, nice_match
       
   224         mov [chainlenwmask], ebx
       
   225         mov r10d, Lookahead
       
   226         cmp r10d, eax
       
   227         cmovnl r10d, eax
       
   228         mov [nicematch],r10d
       
   229 ENDIF
       
   230 
       
   231 ;;; register Bytef *scan = s->window + s->strstart;
       
   232         mov r10, window_ad
       
   233         mov ebp, strstart
       
   234         lea r13, [r10 + rbp]
       
   235 
       
   236 ;;; Determine how many bytes the scan ptr is off from being
       
   237 ;;; dword-aligned.
       
   238 
       
   239          mov r9,r13
       
   240          neg r13
       
   241          and r13,3
       
   242 
       
   243 ;;; IPos limit = s->strstart > (IPos)MAX_DIST(s) ?
       
   244 ;;;     s->strstart - (IPos)MAX_DIST(s) : NIL;
       
   245 IFDEF INFOZIP
       
   246         mov eax,07efah ; MAX_DIST = (WSIZE-MIN_LOOKAHEAD) (0x8000-(3+8+1))
       
   247 ELSE
       
   248         mov eax, window_size
       
   249         sub eax, MIN_LOOKAHEAD
       
   250 ENDIF
       
   251         xor edi,edi
       
   252         sub ebp, eax
       
   253 
       
   254         mov r11d, prev_length
       
   255 
       
   256         cmovng ebp,edi
       
   257 
       
   258 ;;; int best_len = s->prev_length;
       
   259 
       
   260 
       
   261 ;;; Store the sum of s->window + best_len in esi locally, and in esi.
       
   262 
       
   263        lea  rsi,[r10+r11]
       
   264 
       
   265 ;;; register ush scan_start = *(ushf*)scan;
       
   266 ;;; register ush scan_end   = *(ushf*)(scan+best_len-1);
       
   267 ;;; Posf *prev = s->prev;
       
   268 
       
   269         movzx r12d,word ptr [r9]
       
   270         movzx ebx, word ptr [r9 + r11 - 1]
       
   271 
       
   272         mov rdi, prev_ad
       
   273 
       
   274 ;;; Jump into the main loop.
       
   275 
       
   276         mov edx, [chainlenwmask]
       
   277 
       
   278         cmp bx,word ptr [rsi + r8 - 1]
       
   279         jz  LookupLoopIsZero
       
   280 
       
   281 LookupLoop1:
       
   282         and r8d, edx
       
   283 
       
   284         movzx   r8d, word ptr [rdi + r8*2]
       
   285         cmp r8d, ebp
       
   286         jbe LeaveNow
       
   287         sub edx, 00010000h
       
   288         js  LeaveNow
       
   289 
       
   290 LoopEntry1:
       
   291         cmp bx,word ptr [rsi + r8 - 1]
       
   292         jz  LookupLoopIsZero
       
   293 
       
   294 LookupLoop2:
       
   295         and r8d, edx
       
   296 
       
   297         movzx   r8d, word ptr [rdi + r8*2]
       
   298         cmp r8d, ebp
       
   299         jbe LeaveNow
       
   300         sub edx, 00010000h
       
   301         js  LeaveNow
       
   302 
       
   303 LoopEntry2:
       
   304         cmp bx,word ptr [rsi + r8 - 1]
       
   305         jz  LookupLoopIsZero
       
   306 
       
   307 LookupLoop4:
       
   308         and r8d, edx
       
   309 
       
   310         movzx   r8d, word ptr [rdi + r8*2]
       
   311         cmp r8d, ebp
       
   312         jbe LeaveNow
       
   313         sub edx, 00010000h
       
   314         js  LeaveNow
       
   315 
       
   316 LoopEntry4:
       
   317 
       
   318         cmp bx,word ptr [rsi + r8 - 1]
       
   319         jnz LookupLoop1
       
   320         jmp LookupLoopIsZero
       
   321 
       
   322 
       
   323 ;;; do {
       
   324 ;;;     match = s->window + cur_match;
       
   325 ;;;     if (*(ushf*)(match+best_len-1) != scan_end ||
       
   326 ;;;         *(ushf*)match != scan_start) continue;
       
   327 ;;;     [...]
       
   328 ;;; } while ((cur_match = prev[cur_match & wmask]) > limit
       
   329 ;;;          && --chain_length != 0);
       
   330 ;;;
       
   331 ;;; Here is the inner loop of the function. The function will spend the
       
   332 ;;; majority of its time in this loop, and majority of that time will
       
   333 ;;; be spent in the first ten instructions.
       
   334 ;;;
       
   335 ;;; Within this loop:
       
   336 ;;; ebx = scanend
       
   337 ;;; r8d = curmatch
       
   338 ;;; edx = chainlenwmask - i.e., ((chainlen << 16) | wmask)
       
   339 ;;; esi = windowbestlen - i.e., (window + bestlen)
       
   340 ;;; edi = prev
       
   341 ;;; ebp = limit
       
   342 
       
   343 LookupLoop:
       
   344         and r8d, edx
       
   345 
       
   346         movzx   r8d, word ptr [rdi + r8*2]
       
   347         cmp r8d, ebp
       
   348         jbe LeaveNow
       
   349         sub edx, 00010000h
       
   350         js  LeaveNow
       
   351 
       
   352 LoopEntry:
       
   353 
       
   354         cmp bx,word ptr [rsi + r8 - 1]
       
   355         jnz LookupLoop1
       
   356 LookupLoopIsZero:
       
   357         cmp     r12w, word ptr [r10 + r8]
       
   358         jnz LookupLoop1
       
   359 
       
   360 
       
   361 ;;; Store the current value of chainlen.
       
   362         mov [chainlenwmask], edx
       
   363 
       
   364 ;;; Point edi to the string under scrutiny, and esi to the string we
       
   365 ;;; are hoping to match it up with. In actuality, esi and edi are
       
   366 ;;; both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and edx is
       
   367 ;;; initialized to -(MAX_MATCH_8 - scanalign).
       
   368 
       
   369         lea rsi,[r8+r10]
       
   370         mov rdx, 0fffffffffffffef8h; -(MAX_MATCH_8)
       
   371         lea rsi, [rsi + r13 + 0108h] ;MAX_MATCH_8]
       
   372         lea rdi, [r9 + r13 + 0108h] ;MAX_MATCH_8]
       
   373 
       
   374         prefetcht1 [rsi+rdx]
       
   375         prefetcht1 [rdi+rdx]
       
   376 
       
   377 
       
   378 ;;; Test the strings for equality, 8 bytes at a time. At the end,
       
   379 ;;; adjust rdx so that it is offset to the exact byte that mismatched.
       
   380 ;;;
       
   381 ;;; We already know at this point that the first three bytes of the
       
   382 ;;; strings match each other, and they can be safely passed over before
       
   383 ;;; starting the compare loop. So what this code does is skip over 0-3
       
   384 ;;; bytes, as much as necessary in order to dword-align the edi
       
   385 ;;; pointer. (rsi will still be misaligned three times out of four.)
       
   386 ;;;
       
   387 ;;; It should be confessed that this loop usually does not represent
       
   388 ;;; much of the total running time. Replacing it with a more
       
   389 ;;; straightforward "rep cmpsb" would not drastically degrade
       
   390 ;;; performance.
       
   391 
       
   392 
       
   393 LoopCmps:
       
   394         mov rax, [rsi + rdx]
       
   395         xor rax, [rdi + rdx]
       
   396         jnz LeaveLoopCmps
       
   397 
       
   398         mov rax, [rsi + rdx + 8]
       
   399         xor rax, [rdi + rdx + 8]
       
   400         jnz LeaveLoopCmps8
       
   401 
       
   402 
       
   403         mov rax, [rsi + rdx + 8+8]
       
   404         xor rax, [rdi + rdx + 8+8]
       
   405         jnz LeaveLoopCmps16
       
   406 
       
   407         add rdx,8+8+8
       
   408 
       
   409         jmp short LoopCmps
       
   410 LeaveLoopCmps16: add rdx,8
       
   411 LeaveLoopCmps8: add rdx,8
       
   412 LeaveLoopCmps:
       
   413 
       
   414         test    eax, 0000FFFFh
       
   415         jnz LenLower
       
   416 
       
   417         test eax,0ffffffffh
       
   418 
       
   419         jnz LenLower32
       
   420 
       
   421         add rdx,4
       
   422         shr rax,32
       
   423         or ax,ax
       
   424         jnz LenLower
       
   425 
       
   426 LenLower32:
       
   427         shr eax,16
       
   428         add rdx,2
       
   429 LenLower:   sub al, 1
       
   430         adc rdx, 0
       
   431 ;;; Calculate the length of the match. If it is longer than MAX_MATCH,
       
   432 ;;; then automatically accept it as the best possible match and leave.
       
   433 
       
   434         lea rax, [rdi + rdx]
       
   435         sub rax, r9
       
   436         cmp eax, MAX_MATCH
       
   437         jge LenMaximum
       
   438 
       
   439 ;;; If the length of the match is not longer than the best match we
       
   440 ;;; have so far, then forget it and return to the lookup loop.
       
   441 ;///////////////////////////////////
       
   442 
       
   443         cmp eax, r11d
       
   444         jg  LongerMatch
       
   445 
       
   446         lea rsi,[r10+r11]
       
   447 
       
   448         mov rdi, prev_ad
       
   449         mov edx, [chainlenwmask]
       
   450         jmp LookupLoop
       
   451 
       
   452 ;;;         s->match_start = cur_match;
       
   453 ;;;         best_len = len;
       
   454 ;;;         if (len >= nice_match) break;
       
   455 ;;;         scan_end = *(ushf*)(scan+best_len-1);
       
   456 
       
   457 LongerMatch:
       
   458         mov r11d, eax
       
   459         mov match_start, r8d
       
   460         cmp eax, [nicematch]
       
   461         jge LeaveNow
       
   462 
       
   463         lea rsi,[r10+rax]
       
   464 
       
   465         movzx   ebx, word ptr [r9 + rax - 1]
       
   466         mov rdi, prev_ad
       
   467         mov edx, [chainlenwmask]
       
   468         jmp LookupLoop
       
   469 
       
   470 ;;; Accept the current string, with the maximum possible length.
       
   471 
       
   472 LenMaximum:
       
   473         mov r11d,MAX_MATCH
       
   474         mov match_start, r8d
       
   475 
       
   476 ;;; if ((uInt)best_len <= s->lookahead) return (uInt)best_len;
       
   477 ;;; return s->lookahead;
       
   478 
       
   479 LeaveNow:
       
   480 IFDEF INFOZIP
       
   481         mov eax,r11d
       
   482 ELSE
       
   483         mov eax, Lookahead
       
   484         cmp r11d, eax
       
   485         cmovng eax, r11d
       
   486 ENDIF
       
   487 
       
   488 ;;; Restore the stack and return from whence we came.
       
   489 
       
   490 
       
   491         mov rsi,[save_rsi]
       
   492         mov rdi,[save_rdi]
       
   493         mov rbx,[save_rbx]
       
   494         mov rbp,[save_rbp]
       
   495         mov r12,[save_r12]
       
   496         mov r13,[save_r13]
       
   497 ;        mov r14,[save_r14]
       
   498 ;        mov r15,[save_r15]
       
   499 
       
   500 
       
   501         ret 0
       
   502 ; please don't remove this string !
       
   503 ; Your can freely use gvmat64 in any free or commercial app
       
   504 ; but it is far better don't remove the string in the binary!
       
   505     db     0dh,0ah,"asm686 with masm, optimised assembly code from Brian Raiter, written 1998, converted to amd 64 by Gilles Vollant 2005",0dh,0ah,0
       
   506 longest_match   ENDP
       
   507 
       
   508 match_init PROC
       
   509   ret 0
       
   510 match_init ENDP
       
   511 
       
   512 
       
   513 END