|
1 /* match.s -- Pentium-Pro-optimized version of longest_match() |
|
2 * Written for zlib 1.1.2 |
|
3 * Copyright (C) 1998 Brian Raiter <breadbox@muppetlabs.com> |
|
4 * |
|
5 * This is free software; you can redistribute it and/or modify it |
|
6 * under the terms of the GNU General Public License. |
|
7 */ |
|
8 |
|
9 #ifndef NO_UNDERLINE |
|
10 #define match_init _match_init |
|
11 #define longest_match _longest_match |
|
12 #endif |
|
13 |
|
14 #define MAX_MATCH (258) |
|
15 #define MIN_MATCH (3) |
|
16 #define MIN_LOOKAHEAD (MAX_MATCH + MIN_MATCH + 1) |
|
17 #define MAX_MATCH_8 ((MAX_MATCH + 7) & ~7) |
|
18 |
|
19 /* stack frame offsets */ |
|
20 |
|
21 #define chainlenwmask 0 /* high word: current chain len */ |
|
22 /* low word: s->wmask */ |
|
23 #define window 4 /* local copy of s->window */ |
|
24 #define windowbestlen 8 /* s->window + bestlen */ |
|
25 #define scanstart 16 /* first two bytes of string */ |
|
26 #define scanend 12 /* last two bytes of string */ |
|
27 #define scanalign 20 /* dword-misalignment of string */ |
|
28 #define nicematch 24 /* a good enough match size */ |
|
29 #define bestlen 28 /* size of best match so far */ |
|
30 #define scan 32 /* ptr to string wanting match */ |
|
31 |
|
32 #define LocalVarsSize (36) |
|
33 /* saved ebx 36 */ |
|
34 /* saved edi 40 */ |
|
35 /* saved esi 44 */ |
|
36 /* saved ebp 48 */ |
|
37 /* return address 52 */ |
|
38 #define deflatestate 56 /* the function arguments */ |
|
39 #define curmatch 60 |
|
40 |
|
41 /* All the +zlib1222add offsets are due to the addition of fields |
|
42 * in zlib in the deflate_state structure since the asm code was first written |
|
43 * (if you compile with zlib 1.0.4 or older, use "zlib1222add equ (-4)"). |
|
44 * (if you compile with zlib between 1.0.5 and 1.2.2.1, use "zlib1222add equ 0"). |
|
45 * if you compile with zlib 1.2.2.2 or later , use "zlib1222add equ 8"). |
|
46 */ |
|
47 |
|
48 #define zlib1222add (8) |
|
49 |
|
50 #define dsWSize (36+zlib1222add) |
|
51 #define dsWMask (44+zlib1222add) |
|
52 #define dsWindow (48+zlib1222add) |
|
53 #define dsPrev (56+zlib1222add) |
|
54 #define dsMatchLen (88+zlib1222add) |
|
55 #define dsPrevMatch (92+zlib1222add) |
|
56 #define dsStrStart (100+zlib1222add) |
|
57 #define dsMatchStart (104+zlib1222add) |
|
58 #define dsLookahead (108+zlib1222add) |
|
59 #define dsPrevLen (112+zlib1222add) |
|
60 #define dsMaxChainLen (116+zlib1222add) |
|
61 #define dsGoodMatch (132+zlib1222add) |
|
62 #define dsNiceMatch (136+zlib1222add) |
|
63 |
|
64 |
|
65 .file "match.S" |
|
66 |
|
67 .globl match_init, longest_match |
|
68 |
|
69 .text |
|
70 |
|
71 /* uInt longest_match(deflate_state *deflatestate, IPos curmatch) */ |
|
72 |
|
73 longest_match: |
|
74 |
|
75 /* Save registers that the compiler may be using, and adjust %esp to */ |
|
76 /* make room for our stack frame. */ |
|
77 |
|
78 pushl %ebp |
|
79 pushl %edi |
|
80 pushl %esi |
|
81 pushl %ebx |
|
82 subl $LocalVarsSize, %esp |
|
83 |
|
84 /* Retrieve the function arguments. %ecx will hold cur_match */ |
|
85 /* throughout the entire function. %edx will hold the pointer to the */ |
|
86 /* deflate_state structure during the function's setup (before */ |
|
87 /* entering the main loop). */ |
|
88 |
|
89 movl deflatestate(%esp), %edx |
|
90 movl curmatch(%esp), %ecx |
|
91 |
|
92 /* uInt wmask = s->w_mask; */ |
|
93 /* unsigned chain_length = s->max_chain_length; */ |
|
94 /* if (s->prev_length >= s->good_match) { */ |
|
95 /* chain_length >>= 2; */ |
|
96 /* } */ |
|
97 |
|
98 movl dsPrevLen(%edx), %eax |
|
99 movl dsGoodMatch(%edx), %ebx |
|
100 cmpl %ebx, %eax |
|
101 movl dsWMask(%edx), %eax |
|
102 movl dsMaxChainLen(%edx), %ebx |
|
103 jl LastMatchGood |
|
104 shrl $2, %ebx |
|
105 LastMatchGood: |
|
106 |
|
107 /* chainlen is decremented once beforehand so that the function can */ |
|
108 /* use the sign flag instead of the zero flag for the exit test. */ |
|
109 /* It is then shifted into the high word, to make room for the wmask */ |
|
110 /* value, which it will always accompany. */ |
|
111 |
|
112 decl %ebx |
|
113 shll $16, %ebx |
|
114 orl %eax, %ebx |
|
115 movl %ebx, chainlenwmask(%esp) |
|
116 |
|
117 /* if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead; */ |
|
118 |
|
119 movl dsNiceMatch(%edx), %eax |
|
120 movl dsLookahead(%edx), %ebx |
|
121 cmpl %eax, %ebx |
|
122 jl LookaheadLess |
|
123 movl %eax, %ebx |
|
124 LookaheadLess: movl %ebx, nicematch(%esp) |
|
125 |
|
126 /* register Bytef *scan = s->window + s->strstart; */ |
|
127 |
|
128 movl dsWindow(%edx), %esi |
|
129 movl %esi, window(%esp) |
|
130 movl dsStrStart(%edx), %ebp |
|
131 lea (%esi,%ebp), %edi |
|
132 movl %edi, scan(%esp) |
|
133 |
|
134 /* Determine how many bytes the scan ptr is off from being */ |
|
135 /* dword-aligned. */ |
|
136 |
|
137 movl %edi, %eax |
|
138 negl %eax |
|
139 andl $3, %eax |
|
140 movl %eax, scanalign(%esp) |
|
141 |
|
142 /* IPos limit = s->strstart > (IPos)MAX_DIST(s) ? */ |
|
143 /* s->strstart - (IPos)MAX_DIST(s) : NIL; */ |
|
144 |
|
145 movl dsWSize(%edx), %eax |
|
146 subl $MIN_LOOKAHEAD, %eax |
|
147 subl %eax, %ebp |
|
148 jg LimitPositive |
|
149 xorl %ebp, %ebp |
|
150 LimitPositive: |
|
151 |
|
152 /* int best_len = s->prev_length; */ |
|
153 |
|
154 movl dsPrevLen(%edx), %eax |
|
155 movl %eax, bestlen(%esp) |
|
156 |
|
157 /* Store the sum of s->window + best_len in %esi locally, and in %esi. */ |
|
158 |
|
159 addl %eax, %esi |
|
160 movl %esi, windowbestlen(%esp) |
|
161 |
|
162 /* register ush scan_start = *(ushf*)scan; */ |
|
163 /* register ush scan_end = *(ushf*)(scan+best_len-1); */ |
|
164 /* Posf *prev = s->prev; */ |
|
165 |
|
166 movzwl (%edi), %ebx |
|
167 movl %ebx, scanstart(%esp) |
|
168 movzwl -1(%edi,%eax), %ebx |
|
169 movl %ebx, scanend(%esp) |
|
170 movl dsPrev(%edx), %edi |
|
171 |
|
172 /* Jump into the main loop. */ |
|
173 |
|
174 movl chainlenwmask(%esp), %edx |
|
175 jmp LoopEntry |
|
176 |
|
177 .balign 16 |
|
178 |
|
179 /* do { |
|
180 * match = s->window + cur_match; |
|
181 * if (*(ushf*)(match+best_len-1) != scan_end || |
|
182 * *(ushf*)match != scan_start) continue; |
|
183 * [...] |
|
184 * } while ((cur_match = prev[cur_match & wmask]) > limit |
|
185 * && --chain_length != 0); |
|
186 * |
|
187 * Here is the inner loop of the function. The function will spend the |
|
188 * majority of its time in this loop, and majority of that time will |
|
189 * be spent in the first ten instructions. |
|
190 * |
|
191 * Within this loop: |
|
192 * %ebx = scanend |
|
193 * %ecx = curmatch |
|
194 * %edx = chainlenwmask - i.e., ((chainlen << 16) | wmask) |
|
195 * %esi = windowbestlen - i.e., (window + bestlen) |
|
196 * %edi = prev |
|
197 * %ebp = limit |
|
198 */ |
|
199 LookupLoop: |
|
200 andl %edx, %ecx |
|
201 movzwl (%edi,%ecx,2), %ecx |
|
202 cmpl %ebp, %ecx |
|
203 jbe LeaveNow |
|
204 subl $0x00010000, %edx |
|
205 js LeaveNow |
|
206 LoopEntry: movzwl -1(%esi,%ecx), %eax |
|
207 cmpl %ebx, %eax |
|
208 jnz LookupLoop |
|
209 movl window(%esp), %eax |
|
210 movzwl (%eax,%ecx), %eax |
|
211 cmpl scanstart(%esp), %eax |
|
212 jnz LookupLoop |
|
213 |
|
214 /* Store the current value of chainlen. */ |
|
215 |
|
216 movl %edx, chainlenwmask(%esp) |
|
217 |
|
218 /* Point %edi to the string under scrutiny, and %esi to the string we */ |
|
219 /* are hoping to match it up with. In actuality, %esi and %edi are */ |
|
220 /* both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and %edx is */ |
|
221 /* initialized to -(MAX_MATCH_8 - scanalign). */ |
|
222 |
|
223 movl window(%esp), %esi |
|
224 movl scan(%esp), %edi |
|
225 addl %ecx, %esi |
|
226 movl scanalign(%esp), %eax |
|
227 movl $(-MAX_MATCH_8), %edx |
|
228 lea MAX_MATCH_8(%edi,%eax), %edi |
|
229 lea MAX_MATCH_8(%esi,%eax), %esi |
|
230 |
|
231 /* Test the strings for equality, 8 bytes at a time. At the end, |
|
232 * adjust %edx so that it is offset to the exact byte that mismatched. |
|
233 * |
|
234 * We already know at this point that the first three bytes of the |
|
235 * strings match each other, and they can be safely passed over before |
|
236 * starting the compare loop. So what this code does is skip over 0-3 |
|
237 * bytes, as much as necessary in order to dword-align the %edi |
|
238 * pointer. (%esi will still be misaligned three times out of four.) |
|
239 * |
|
240 * It should be confessed that this loop usually does not represent |
|
241 * much of the total running time. Replacing it with a more |
|
242 * straightforward "rep cmpsb" would not drastically degrade |
|
243 * performance. |
|
244 */ |
|
245 LoopCmps: |
|
246 movl (%esi,%edx), %eax |
|
247 xorl (%edi,%edx), %eax |
|
248 jnz LeaveLoopCmps |
|
249 movl 4(%esi,%edx), %eax |
|
250 xorl 4(%edi,%edx), %eax |
|
251 jnz LeaveLoopCmps4 |
|
252 addl $8, %edx |
|
253 jnz LoopCmps |
|
254 jmp LenMaximum |
|
255 LeaveLoopCmps4: addl $4, %edx |
|
256 LeaveLoopCmps: testl $0x0000FFFF, %eax |
|
257 jnz LenLower |
|
258 addl $2, %edx |
|
259 shrl $16, %eax |
|
260 LenLower: subb $1, %al |
|
261 adcl $0, %edx |
|
262 |
|
263 /* Calculate the length of the match. If it is longer than MAX_MATCH, */ |
|
264 /* then automatically accept it as the best possible match and leave. */ |
|
265 |
|
266 lea (%edi,%edx), %eax |
|
267 movl scan(%esp), %edi |
|
268 subl %edi, %eax |
|
269 cmpl $MAX_MATCH, %eax |
|
270 jge LenMaximum |
|
271 |
|
272 /* If the length of the match is not longer than the best match we */ |
|
273 /* have so far, then forget it and return to the lookup loop. */ |
|
274 |
|
275 movl deflatestate(%esp), %edx |
|
276 movl bestlen(%esp), %ebx |
|
277 cmpl %ebx, %eax |
|
278 jg LongerMatch |
|
279 movl windowbestlen(%esp), %esi |
|
280 movl dsPrev(%edx), %edi |
|
281 movl scanend(%esp), %ebx |
|
282 movl chainlenwmask(%esp), %edx |
|
283 jmp LookupLoop |
|
284 |
|
285 /* s->match_start = cur_match; */ |
|
286 /* best_len = len; */ |
|
287 /* if (len >= nice_match) break; */ |
|
288 /* scan_end = *(ushf*)(scan+best_len-1); */ |
|
289 |
|
290 LongerMatch: movl nicematch(%esp), %ebx |
|
291 movl %eax, bestlen(%esp) |
|
292 movl %ecx, dsMatchStart(%edx) |
|
293 cmpl %ebx, %eax |
|
294 jge LeaveNow |
|
295 movl window(%esp), %esi |
|
296 addl %eax, %esi |
|
297 movl %esi, windowbestlen(%esp) |
|
298 movzwl -1(%edi,%eax), %ebx |
|
299 movl dsPrev(%edx), %edi |
|
300 movl %ebx, scanend(%esp) |
|
301 movl chainlenwmask(%esp), %edx |
|
302 jmp LookupLoop |
|
303 |
|
304 /* Accept the current string, with the maximum possible length. */ |
|
305 |
|
306 LenMaximum: movl deflatestate(%esp), %edx |
|
307 movl $MAX_MATCH, bestlen(%esp) |
|
308 movl %ecx, dsMatchStart(%edx) |
|
309 |
|
310 /* if ((uInt)best_len <= s->lookahead) return (uInt)best_len; */ |
|
311 /* return s->lookahead; */ |
|
312 |
|
313 LeaveNow: |
|
314 movl deflatestate(%esp), %edx |
|
315 movl bestlen(%esp), %ebx |
|
316 movl dsLookahead(%edx), %eax |
|
317 cmpl %eax, %ebx |
|
318 jg LookaheadRet |
|
319 movl %ebx, %eax |
|
320 LookaheadRet: |
|
321 |
|
322 /* Restore the stack and return from whence we came. */ |
|
323 |
|
324 addl $LocalVarsSize, %esp |
|
325 popl %ebx |
|
326 popl %esi |
|
327 popl %edi |
|
328 popl %ebp |
|
329 match_init: ret |