/*
 * Copyright © 2009 Nokia Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 * Author: Siarhei Siamashka (siarhei.siamashka@nokia.com)
 */

/*
 * This file contains a macro ('generate_composite_function') which can
 * construct 2D image processing functions based on a common template.
 * Any combination of source, destination and mask images in 8bpp, 16bpp,
 * 24bpp or 32bpp color formats is supported.
 *
 * This macro takes care of:
 *  - handling of leading and trailing unaligned pixels
 *  - doing most of the work related to L2 cache preload
 *  - encouraging the use of software pipelining for better instruction
 *    scheduling
 *
 * The user of this macro has to provide some configuration parameters
 * (bit depths for the images, prefetch distance, etc.) and a set of
 * macros which implement the basic code chunks responsible for pixel
 * processing. See the 'pixman-arm-neon-asm.S' file for real usage
 * examples; an illustrative invocation is also sketched below.
 *
 * TODO:
 *  - try the overlapped pixel method (from Ian Rickards) when processing
 *    exactly two blocks of pixels
 *  - maybe add an option to do reverse scanline processing
 */
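
/*
 * Purely as an illustration (the function and helper macro names below are
 * assumptions modeled on 'pixman-arm-neon-asm.S', they are not defined in
 * this header), an over_8888_0565 compositing function could be
 * instantiated roughly like this:
 *
 *     generate_composite_function \
 *         pixman_composite_over_8888_0565_asm_neon, 32, 0, 16, \
 *         FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
 *         8, 5, \
 *         default_init, default_cleanup, \
 *         pixman_composite_over_8888_0565_process_pixblock_head, \
 *         pixman_composite_over_8888_0565_process_pixblock_tail, \
 *         pixman_composite_over_8888_0565_process_pixblock_tail_head
 *
 * i.e. 32bpp source, no mask, 16bpp destination, 8 pixels per block and a
 * prefetch distance of 5, with the three *_process_pixblock_* macros
 * supplying the per-format pixel arithmetic.
 */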

/*
 * Bit flags for the 'generate_composite_function' macro, used to tune the
 * behavior of the generated functions.
 */
.set FLAG_DST_WRITEONLY,      0
.set FLAG_DST_READWRITE,      1
.set FLAG_DEINTERLEAVE_32BPP, 2

/*
 * Offset in the stack at which the mask and source pointers/strides can be
 * accessed from the 'init' macro. This is useful for doing special handling
 * of a solid mask.
 */
.set ARGS_STACK_OFFSET, 40
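
/*
 * A minimal sketch of how ARGS_STACK_OFFSET might be used (illustrative
 * only; the macro name, the '+ 8' offset and the choice of d27 are
 * assumptions, the real 'init' macros live in 'pixman-arm-neon-asm.S'):
 * an 'init' macro for a solid mask could read the mask argument straight
 * from the stack and broadcast its alpha channel:
 *
 *     .macro solid_mask_init
 *         add     DUMMY, sp, #(ARGS_STACK_OFFSET + 8)  @ address of mask arg
 *         vld1.32 {d27[0]}, [DUMMY]
 *         vdup.8  d27, d27[3]                          @ replicate alpha
 *     .endm
 */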

/*
 * Constants for selecting the preferred prefetch type.
 */
.set PREFETCH_TYPE_NONE,     0 /* No prefetch at all */
.set PREFETCH_TYPE_SIMPLE,   1 /* A simple, fixed-distance-ahead prefetch */
.set PREFETCH_TYPE_ADVANCED, 2 /* Advanced fine-grained prefetch */

/*
 * Definitions of supplementary pixld/pixst macros (for partial load/store
 * of pixel data).
 */

.macro pixldst1 op, elem_size, reg1, mem_operand, abits
.if abits > 0
    op&.&elem_size {d&reg1}, [&mem_operand&, :&abits&]!
.else
    op&.&elem_size {d&reg1}, [&mem_operand&]!
.endif
.endm

.macro pixldst2 op, elem_size, reg1, reg2, mem_operand, abits
.if abits > 0
    op&.&elem_size {d&reg1, d&reg2}, [&mem_operand&, :&abits&]!
.else
    op&.&elem_size {d&reg1, d&reg2}, [&mem_operand&]!
.endif
.endm

.macro pixldst4 op, elem_size, reg1, reg2, reg3, reg4, mem_operand, abits
.if abits > 0
    op&.&elem_size {d&reg1, d&reg2, d&reg3, d&reg4}, [&mem_operand&, :&abits&]!
.else
    op&.&elem_size {d&reg1, d&reg2, d&reg3, d&reg4}, [&mem_operand&]!
.endif
.endm

.macro pixldst0 op, elem_size, reg1, idx, mem_operand, abits
    op&.&elem_size {d&reg1[idx]}, [&mem_operand&]!
.endm

.macro pixldst3 op, elem_size, reg1, reg2, reg3, mem_operand
    op&.&elem_size {d&reg1, d&reg2, d&reg3}, [&mem_operand&]!
.endm

.macro pixldst30 op, elem_size, reg1, reg2, reg3, idx, mem_operand
    op&.&elem_size {d&reg1[idx], d&reg2[idx], d&reg3[idx]}, [&mem_operand&]!
.endm

.macro pixldst numbytes, op, elem_size, basereg, mem_operand, abits
.if numbytes == 32
    pixldst4 op, elem_size, %(basereg+4), %(basereg+5), \
             %(basereg+6), %(basereg+7), mem_operand, abits
.elseif numbytes == 16
    pixldst2 op, elem_size, %(basereg+2), %(basereg+3), mem_operand, abits
.elseif numbytes == 8
    pixldst1 op, elem_size, %(basereg+1), mem_operand, abits
.elseif numbytes == 4
    .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 32)
        pixldst0 op, 32, %(basereg+0), 1, mem_operand, abits
    .elseif elem_size == 16
        pixldst0 op, 16, %(basereg+0), 2, mem_operand, abits
        pixldst0 op, 16, %(basereg+0), 3, mem_operand, abits
    .else
        pixldst0 op, 8, %(basereg+0), 4, mem_operand, abits
        pixldst0 op, 8, %(basereg+0), 5, mem_operand, abits
        pixldst0 op, 8, %(basereg+0), 6, mem_operand, abits
        pixldst0 op, 8, %(basereg+0), 7, mem_operand, abits
    .endif
.elseif numbytes == 2
    .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 16)
        pixldst0 op, 16, %(basereg+0), 1, mem_operand, abits
    .else
        pixldst0 op, 8, %(basereg+0), 2, mem_operand, abits
        pixldst0 op, 8, %(basereg+0), 3, mem_operand, abits
    .endif
.elseif numbytes == 1
    pixldst0 op, 8, %(basereg+0), 1, mem_operand, abits
.else
    .error "unsupported size: numbytes"
.endif
.endm

.macro pixld numpix, bpp, basereg, mem_operand, abits=0
.if bpp > 0
.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
    pixldst4 vld4, 8, %(basereg+4), %(basereg+5), \
             %(basereg+6), %(basereg+7), mem_operand, abits
.elseif (bpp == 24) && (numpix == 8)
    pixldst3 vld3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
.elseif (bpp == 24) && (numpix == 4)
    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
.elseif (bpp == 24) && (numpix == 2)
    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
.elseif (bpp == 24) && (numpix == 1)
    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
.else
    pixldst %(numpix * bpp / 8), vld1, %(bpp), basereg, mem_operand, abits
.endif
.endif
.endm
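
/*
 * For example (illustrative expansion): 'pixld 8, 16, 4, DST_R' loads eight
 * 16bpp pixels (16 bytes) starting at register d6, post-incrementing DST_R;
 * it reduces to roughly 'vld1.16 {d6, d7}, [DST_R]!'.
 */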

.macro pixst numpix, bpp, basereg, mem_operand, abits=0
.if bpp > 0
.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
    pixldst4 vst4, 8, %(basereg+4), %(basereg+5), \
             %(basereg+6), %(basereg+7), mem_operand, abits
.elseif (bpp == 24) && (numpix == 8)
    pixldst3 vst3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
.elseif (bpp == 24) && (numpix == 4)
    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
.elseif (bpp == 24) && (numpix == 2)
    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
.elseif (bpp == 24) && (numpix == 1)
    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
.else
    pixldst %(numpix * bpp / 8), vst1, %(bpp), basereg, mem_operand, abits
.endif
.endif
.endm

.macro pixld_a numpix, bpp, basereg, mem_operand
.if (bpp * numpix) <= 128
    pixld numpix, bpp, basereg, mem_operand, %(bpp * numpix)
.else
    pixld numpix, bpp, basereg, mem_operand, 128
.endif
.endm

.macro pixst_a numpix, bpp, basereg, mem_operand
.if (bpp * numpix) <= 128
    pixst numpix, bpp, basereg, mem_operand, %(bpp * numpix)
.else
    pixst numpix, bpp, basereg, mem_operand, 128
.endif
.endm
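
/*
 * Illustrative note: 'pixld_a'/'pixst_a' only add an alignment hint to the
 * access. For instance, 'pixld_a 8, 32, 4, DST_R' (with 32bpp deinterleaving
 * disabled) reduces to roughly 'vld1.32 {d8, d9, d10, d11}, [DST_R, :128]!',
 * telling the hardware that DST_R is 128-bit aligned.
 */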

.macro vuzp8 reg1, reg2
    vuzp.8 d&reg1, d&reg2
.endm

.macro vzip8 reg1, reg2
    vzip.8 d&reg1, d&reg2
.endm

/* deinterleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
.macro pixdeinterleave bpp, basereg
.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
    vuzp8 %(basereg+0), %(basereg+1)
    vuzp8 %(basereg+2), %(basereg+3)
    vuzp8 %(basereg+1), %(basereg+3)
    vuzp8 %(basereg+0), %(basereg+2)
.endif
.endm
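
/*
 * Worked example: with basereg = 0, the four registers initially hold the
 * eight pixels interleaved as
 *     d0 = B0 G0 R0 A0 B1 G1 R1 A1    d1 = B2 G2 R2 A2 B3 G3 R3 A3
 *     d2 = B4 G4 R4 A4 B5 G5 R5 A5    d3 = B6 G6 R6 A6 B7 G7 R7 A7
 * and after 'pixdeinterleave 32, 0' each register holds a single channel:
 *     d0 = B0..B7, d1 = G0..G7, d2 = R0..R7, d3 = A0..A7
 */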

/* interleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
.macro pixinterleave bpp, basereg
.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
    vzip8 %(basereg+0), %(basereg+2)
    vzip8 %(basereg+1), %(basereg+3)
    vzip8 %(basereg+2), %(basereg+3)
    vzip8 %(basereg+0), %(basereg+1)
.endif
.endm

/*
 * This is a macro for implementing cache preload. The main idea is that
 * the cache preload logic is mostly independent of the rest of the pixel
 * processing code. It starts at the top left pixel, moves forward across
 * the pixels and can jump across scanlines. The prefetch distance is
 * handled in an 'incremental' way: it starts from 0 and advances to the
 * optimal distance over time. After reaching the optimal prefetch distance,
 * it is kept constant. There are some checks which prevent prefetching
 * unneeded pixel lines below the image (it can still prefetch a bit of
 * extra data on the right side of the image - not a big issue, and possibly
 * even helpful when rendering text glyphs). An additional trick is the use
 * of an LDR instruction for prefetch instead of PLD when moving to the next
 * line: the point is that we have a high chance of getting a TLB miss in
 * this case, and PLD would be useless.
 *
 * This sounds like it may introduce noticeable overhead (when working with
 * fully cached data). But in reality, because the NEON unit in the ARM
 * Cortex-A8 has a separate pipeline and instruction queue, normal ARM code
 * can execute simultaneously with NEON code and be completely shadowed by
 * it. Thus we get no performance overhead at all (*). This is a very nice
 * feature of the Cortex-A8, if used wisely. We don't have a hardware
 * prefetcher, but we can still implement rather advanced prefetch logic in
 * software for almost zero cost!
 *
 * (*) The overhead of the prefetcher becomes visible when running some
 * trivial pixel processing such as a simple copy. In any case, prefetching
 * is a must when working with graphics data.
 */
.macro PF a, x:vararg
.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_ADVANCED)
    a x
.endif
.endm

.macro cache_preload std_increment, boost_increment
.if (src_bpp_shift >= 0) || (dst_r_bpp != 0) || (mask_bpp_shift >= 0)
.if regs_shortage
    PF ldr ORIG_W, [sp] /* If we are short on regs, ORIG_W is kept on stack */
.endif
.if std_increment != 0
    PF add PF_X, PF_X, #std_increment
.endif
    PF tst PF_CTL, #0xF
    PF addne PF_X, PF_X, #boost_increment
    PF subne PF_CTL, PF_CTL, #1
    PF cmp PF_X, ORIG_W
.if src_bpp_shift >= 0
    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
.endif
.if dst_r_bpp != 0
    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
.endif
.if mask_bpp_shift >= 0
    PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
.endif
    PF subge PF_X, PF_X, ORIG_W
    PF subges PF_CTL, PF_CTL, #0x10
.if src_bpp_shift >= 0
    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
.endif
.if dst_r_bpp != 0
    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
.endif
.if mask_bpp_shift >= 0
    PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
.endif
.endif
.endm
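
/*
 * Rough sketch of how the advanced prefetch state evolves (informal, for
 * orientation only): PF_X is the pixel index that prefetching has reached
 * on the current scanline, and the low nibble of PF_CTL counts how many
 * 'boosted' steps are still allowed. With prefetch_distance = 5, the first
 * five calls each add an extra boost_increment to PF_X, ramping the
 * prefetch position ahead of the processing position; after that PF_X just
 * advances by std_increment per call. Once PF_X passes the scanline width
 * (ORIG_W) it wraps, the upper bits of PF_CTL (remaining scanlines) are
 * decremented, and PF_SRC/PF_DST/PF_MASK are advanced to the next scanline
 * using LDR (rather than PLD) so that a possible TLB miss gets resolved.
 */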

.macro cache_preload_simple
.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_SIMPLE)
.if src_bpp > 0
    pld [SRC, #(PREFETCH_DISTANCE_SIMPLE * src_bpp / 8)]
.endif
.if dst_r_bpp > 0
    pld [DST_R, #(PREFETCH_DISTANCE_SIMPLE * dst_r_bpp / 8)]
.endif
.if mask_bpp > 0
    pld [MASK, #(PREFETCH_DISTANCE_SIMPLE * mask_bpp / 8)]
.endif
.endif
.endm

/*
 * Macro which is used to process leading pixels until the destination
 * pointer is properly aligned (at a 16-byte boundary). When the destination
 * buffer uses a 24bpp format, this is unnecessary, or even pointless
 * (which is why the 24bpp case is skipped below).
 */
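
/*
 * Worked example: with a 16bpp destination and pixblock_size = 8, the .irp
 * loop below peels off 1, 2 and 4 leading pixels (testing address bits 2, 4
 * and 8 of DST_R respectively), so up to 7 pixels can be handled before the
 * main 16-byte-aligned loop starts.
 */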
.macro ensure_destination_ptr_alignment process_pixblock_head, \
                                        process_pixblock_tail, \
                                        process_pixblock_tail_head
.if dst_w_bpp != 24
    tst DST_R, #0xF
    beq 2f

.irp lowbit, 1, 2, 4, 8, 16
local skip1
.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
.if lowbit < 16 /* we don't need more than 16-byte alignment */
    tst DST_R, #lowbit
    beq 1f
.endif
    pixld (lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC
    pixld (lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK
.if dst_r_bpp > 0
    pixld_a (lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R
.else
    add DST_R, DST_R, #lowbit
.endif
    PF add PF_X, PF_X, #(lowbit * 8 / dst_w_bpp)
    sub W, W, #(lowbit * 8 / dst_w_bpp)
1:
.endif
.endr
    pixdeinterleave src_bpp, src_basereg
    pixdeinterleave mask_bpp, mask_basereg
    pixdeinterleave dst_r_bpp, dst_r_basereg

    process_pixblock_head
    cache_preload 0, pixblock_size
    cache_preload_simple
    process_pixblock_tail

    pixinterleave dst_w_bpp, dst_w_basereg
.irp lowbit, 1, 2, 4, 8, 16
.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
.if lowbit < 16 /* we don't need more than 16-byte alignment */
    tst DST_W, #lowbit
    beq 1f
.endif
    pixst_a (lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W
1:
.endif
.endr
.endif
2:
.endm

/*
 * Special code for processing up to (pixblock_size - 1) remaining trailing
 * pixels. As SIMD processing performs its operation on pixblock_size pixels
 * at a time, anything smaller than this has to be loaded and stored in a
 * special way. Loading and storing of pixel data is performed in such a way
 * that we fill some 'slots' in the NEON registers (some slots naturally
 * remain unused), then perform the compositing operation as usual. In the
 * end, the data is taken from these 'slots' and saved to memory. A small
 * worked example is sketched in the comment below.
 *
 * cache_preload_flag - prefetch is suppressed if set to 0
 * dst_aligned_flag   - selects whether the destination buffer is aligned
 */
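
/*
 * Worked example (illustrative): with pixblock_size = 8 and W = 21, the
 * main loop leaves 5 trailing pixels; the chunk loop below then tests the
 * bits of W and loads/stores a 4-pixel chunk followed by a 1-pixel chunk,
 * each landing in its own register 'slot', while a single
 * process_pixblock_head/tail pass does the arithmetic for all of them.
 */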
.macro process_trailing_pixels cache_preload_flag, \
                               dst_aligned_flag, \
                               process_pixblock_head, \
                               process_pixblock_tail, \
                               process_pixblock_tail_head
    tst W, #(pixblock_size - 1)
    beq 2f
.irp chunk_size, 16, 8, 4, 2, 1
.if pixblock_size > chunk_size
    tst W, #chunk_size
    beq 1f
    pixld chunk_size, src_bpp, src_basereg, SRC
    pixld chunk_size, mask_bpp, mask_basereg, MASK
.if dst_aligned_flag != 0
    pixld_a chunk_size, dst_r_bpp, dst_r_basereg, DST_R
.else
    pixld chunk_size, dst_r_bpp, dst_r_basereg, DST_R
.endif
.if cache_preload_flag != 0
    PF add PF_X, PF_X, #chunk_size
.endif
1:
.endif
.endr
    pixdeinterleave src_bpp, src_basereg
    pixdeinterleave mask_bpp, mask_basereg
    pixdeinterleave dst_r_bpp, dst_r_basereg

    process_pixblock_head
.if cache_preload_flag != 0
    cache_preload 0, pixblock_size
    cache_preload_simple
.endif
    process_pixblock_tail
    pixinterleave dst_w_bpp, dst_w_basereg
.irp chunk_size, 16, 8, 4, 2, 1
.if pixblock_size > chunk_size
    tst W, #chunk_size
    beq 1f
.if dst_aligned_flag != 0
    pixst_a chunk_size, dst_w_bpp, dst_w_basereg, DST_W
.else
    pixst chunk_size, dst_w_bpp, dst_w_basereg, DST_W
.endif
1:
.endif
.endr
2:
.endm

/*
 * Macro which performs all the operations needed to switch to the next
 * scanline and start the next loop iteration, unless all the scanlines
 * have already been processed.
 */
.macro advance_to_next_scanline start_of_loop_label
.if regs_shortage
    ldrd W, [sp] /* load W and H (width and height) from stack */
.else
    mov W, ORIG_W
.endif
    add DST_W, DST_W, DST_STRIDE, lsl #dst_bpp_shift
.if src_bpp != 0
    add SRC, SRC, SRC_STRIDE, lsl #src_bpp_shift
.endif
.if mask_bpp != 0
    add MASK, MASK, MASK_STRIDE, lsl #mask_bpp_shift
.endif
.if (dst_w_bpp != 24)
    sub DST_W, DST_W, W, lsl #dst_bpp_shift
.endif
.if (src_bpp != 24) && (src_bpp != 0)
    sub SRC, SRC, W, lsl #src_bpp_shift
.endif
.if (mask_bpp != 24) && (mask_bpp != 0)
    sub MASK, MASK, W, lsl #mask_bpp_shift
.endif
    subs H, H, #1
    mov DST_R, DST_W
.if regs_shortage
    str H, [sp, #4] /* save updated height to stack */
.endif
    bge start_of_loop_label
.endm

/*
 * Registers are allocated in the following way by default:
 *     d0, d1, d2, d3     - reserved for loading source pixel data
 *     d4, d5, d6, d7     - reserved for loading destination pixel data
 *     d24, d25, d26, d27 - reserved for loading mask pixel data
 *     d28, d29, d30, d31 - final destination pixel data for writeback to memory
 */
.macro generate_composite_function fname, \
                                   src_bpp_, \
                                   mask_bpp_, \
                                   dst_w_bpp_, \
                                   flags, \
                                   pixblock_size_, \
                                   prefetch_distance, \
                                   init, \
                                   cleanup, \
                                   process_pixblock_head, \
                                   process_pixblock_tail, \
                                   process_pixblock_tail_head, \
                                   dst_w_basereg_ = 28, \
                                   dst_r_basereg_ = 4, \
                                   src_basereg_ = 0, \
                                   mask_basereg_ = 24

    .func fname
    .global fname
/* For ELF format also set function visibility to hidden */
#ifdef __ELF__
    .hidden fname
    .type fname, %function
#endif
fname:
    push {r4-r12, lr} /* save all registers */

/*
 * Select the prefetch type for this function. If the prefetch distance is
 * set to 0, prefetching is disabled entirely; if one of the color formats
 * is 24bpp, SIMPLE prefetch has to be used instead of ADVANCED.
 */
.set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_DEFAULT
.if prefetch_distance == 0
    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
.elseif (PREFETCH_TYPE_CURRENT > PREFETCH_TYPE_SIMPLE) && \
        ((src_bpp_ == 24) || (mask_bpp_ == 24) || (dst_w_bpp_ == 24))
    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_SIMPLE
.endif

/*
 * Make some macro arguments globally visible and accessible
 * from other macros
 */
.set src_bpp, src_bpp_
.set mask_bpp, mask_bpp_
.set dst_w_bpp, dst_w_bpp_
.set pixblock_size, pixblock_size_
.set dst_w_basereg, dst_w_basereg_
.set dst_r_basereg, dst_r_basereg_
.set src_basereg, src_basereg_
.set mask_basereg, mask_basereg_

/*
 * Assign symbolic names to registers
 */
    W           .req r0  /* width (is updated during processing) */
    H           .req r1  /* height (is updated during processing) */
    DST_W       .req r2  /* destination buffer pointer for writes */
    DST_STRIDE  .req r3  /* destination image stride */
    SRC         .req r4  /* source buffer pointer */
    SRC_STRIDE  .req r5  /* source image stride */
    DST_R       .req r6  /* destination buffer pointer for reads */

    MASK        .req r7  /* mask pointer */
    MASK_STRIDE .req r8  /* mask stride */

    PF_CTL      .req r9  /* combined lines counter and prefetch */
                         /* distance increment counter */
    PF_X        .req r10 /* pixel index in a scanline for current */
                         /* prefetch position */
    PF_SRC      .req r11 /* pointer to source scanline start */
                         /* for prefetch purposes */
    PF_DST      .req r12 /* pointer to destination scanline start */
                         /* for prefetch purposes */
    PF_MASK     .req r14 /* pointer to mask scanline start */
                         /* for prefetch purposes */
/*
 * Check whether we have enough registers for all the local variables.
 * If we don't, the original width and height are kept on top of the stack
 * (and the 'regs_shortage' variable is set to indicate this for the rest
 * of the code). Even if there are enough registers, the allocation scheme
 * may be a bit different depending on whether the source or the mask is
 * unused.
 */
.if (PREFETCH_TYPE_CURRENT < PREFETCH_TYPE_ADVANCED)
    ORIG_W      .req r10 /* saved original width */
    DUMMY       .req r12 /* temporary register */
    .set regs_shortage, 0
.elseif mask_bpp == 0
    ORIG_W      .req r7  /* saved original width */
    DUMMY       .req r8  /* temporary register */
    .set regs_shortage, 0
.elseif src_bpp == 0
    ORIG_W      .req r4  /* saved original width */
    DUMMY       .req r5  /* temporary register */
    .set regs_shortage, 0
.else
    ORIG_W      .req r1  /* saved original width */
    DUMMY       .req r1  /* temporary register */
    .set regs_shortage, 1
.endif

.set mask_bpp_shift, -1
.if src_bpp == 32
    .set src_bpp_shift, 2
.elseif src_bpp == 24
    .set src_bpp_shift, 0
.elseif src_bpp == 16
    .set src_bpp_shift, 1
.elseif src_bpp == 8
    .set src_bpp_shift, 0
.elseif src_bpp == 0
    .set src_bpp_shift, -1
.else
    .error "requested src bpp (src_bpp) is not supported"
.endif
.if mask_bpp == 32
    .set mask_bpp_shift, 2
.elseif mask_bpp == 24
    .set mask_bpp_shift, 0
.elseif mask_bpp == 8
    .set mask_bpp_shift, 0
.elseif mask_bpp == 0
    .set mask_bpp_shift, -1
.else
    .error "requested mask bpp (mask_bpp) is not supported"
.endif
.if dst_w_bpp == 32
    .set dst_bpp_shift, 2
.elseif dst_w_bpp == 24
    .set dst_bpp_shift, 0
.elseif dst_w_bpp == 16
    .set dst_bpp_shift, 1
.elseif dst_w_bpp == 8
    .set dst_bpp_shift, 0
.else
    .error "requested dst bpp (dst_w_bpp) is not supported"
.endif

.if (((flags) & FLAG_DST_READWRITE) != 0)
    .set dst_r_bpp, dst_w_bpp
.else
    .set dst_r_bpp, 0
.endif
.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
    .set DEINTERLEAVE_32BPP_ENABLED, 1
.else
    .set DEINTERLEAVE_32BPP_ENABLED, 0
.endif

.if prefetch_distance < 0 || prefetch_distance > 15
    .error "invalid prefetch distance (prefetch_distance)"
.endif

.if src_bpp > 0
    ldr SRC, [sp, #40]
.endif
.if mask_bpp > 0
    ldr MASK, [sp, #48]
.endif
    PF mov PF_X, #0
.if src_bpp > 0
    ldr SRC_STRIDE, [sp, #44]
.endif
.if mask_bpp > 0
    ldr MASK_STRIDE, [sp, #52]
.endif
    mov DST_R, DST_W

.if src_bpp == 24
    sub SRC_STRIDE, SRC_STRIDE, W
    sub SRC_STRIDE, SRC_STRIDE, W, lsl #1
.endif
.if mask_bpp == 24
    sub MASK_STRIDE, MASK_STRIDE, W
    sub MASK_STRIDE, MASK_STRIDE, W, lsl #1
.endif
.if dst_w_bpp == 24
    sub DST_STRIDE, DST_STRIDE, W
    sub DST_STRIDE, DST_STRIDE, W, lsl #1
.endif

/*
 * Set up the advanced prefetcher's initial state
 */
    PF mov PF_SRC, SRC
    PF mov PF_DST, DST_R
    PF mov PF_MASK, MASK
    /* PF_CTL = prefetch_distance | ((h - 1) << 4) */
    PF mov PF_CTL, H, lsl #4
    PF add PF_CTL, #(prefetch_distance - 0x10)

    init
.if regs_shortage
    push {r0, r1}
.endif
    subs H, H, #1
.if regs_shortage
    str H, [sp, #4] /* save updated height to stack */
.else
    mov ORIG_W, W
.endif
    blt 9f
    cmp W, #(pixblock_size * 2)
    blt 8f
/*
 * This is the start of the pipelined loop, which is optimized for
 * long scanlines
 */
0:
    ensure_destination_ptr_alignment process_pixblock_head, \
                                     process_pixblock_tail, \
                                     process_pixblock_tail_head

    /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
    pixld_a pixblock_size, dst_r_bpp, \
            (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
    pixld pixblock_size, src_bpp, \
          (src_basereg - pixblock_size * src_bpp / 64), SRC
    pixld pixblock_size, mask_bpp, \
          (mask_basereg - pixblock_size * mask_bpp / 64), MASK
    PF add PF_X, PF_X, #pixblock_size
    process_pixblock_head
    cache_preload 0, pixblock_size
    cache_preload_simple
    subs W, W, #(pixblock_size * 2)
    blt 2f
1:
    process_pixblock_tail_head
    cache_preload_simple
    subs W, W, #pixblock_size
    bge 1b
2:
    process_pixblock_tail
    pixst_a pixblock_size, dst_w_bpp, \
            (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W

    /* Process the remaining trailing pixels in the scanline */
    process_trailing_pixels 1, 1, \
                            process_pixblock_head, \
                            process_pixblock_tail, \
                            process_pixblock_tail_head
    advance_to_next_scanline 0b

.if regs_shortage
    pop {r0, r1}
.endif
    cleanup
    pop {r4-r12, pc} /* exit */
/*
 * This is the start of the loop, designed to process images with small
 * width (less than pixblock_size * 2 pixels). In this case neither
 * pipelining nor prefetch is used.
 */
8:
    /* Process exactly pixblock_size pixels if needed */
    tst W, #pixblock_size
    beq 1f
    pixld pixblock_size, dst_r_bpp, \
          (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
    pixld pixblock_size, src_bpp, \
          (src_basereg - pixblock_size * src_bpp / 64), SRC
    pixld pixblock_size, mask_bpp, \
          (mask_basereg - pixblock_size * mask_bpp / 64), MASK
    process_pixblock_head
    process_pixblock_tail
    pixst pixblock_size, dst_w_bpp, \
          (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
1:
    /* Process the remaining trailing pixels in the scanline */
    process_trailing_pixels 0, 0, \
                            process_pixblock_head, \
                            process_pixblock_tail, \
                            process_pixblock_tail_head
    advance_to_next_scanline 8b
9:
.if regs_shortage
    pop {r0, r1}
.endif
    cleanup
    pop {r4-r12, pc} /* exit */

    .unreq SRC
    .unreq MASK
    .unreq DST_R
    .unreq DST_W
    .unreq ORIG_W
    .unreq W
    .unreq H
    .unreq SRC_STRIDE
    .unreq DST_STRIDE
    .unreq MASK_STRIDE
    .unreq PF_CTL
    .unreq PF_X
    .unreq PF_SRC
    .unreq PF_DST
    .unreq PF_MASK
    .unreq DUMMY
    .endfunc
.endm

/*
 * A simplified variant of the function generation template, for single
 * scanline processing (used to implement the pixman combine functions).
 * An illustrative invocation is sketched in the comment below.
 */
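
/*
 * Illustrative invocation (the names here are assumptions modeled on
 * 'pixman-arm-neon-asm.S' and are not defined in this header):
 *
 *     generate_composite_function_single_scanline \
 *         pixman_composite_scanline_over_asm_neon, 32, 0, 32, \
 *         FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
 *         8, \
 *         default_init, default_cleanup, \
 *         pixman_composite_over_8888_8888_process_pixblock_head, \
 *         pixman_composite_over_8888_8888_process_pixblock_tail, \
 *         pixman_composite_over_8888_8888_process_pixblock_tail_head
 */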
.macro generate_composite_function_single_scanline fname, \
                                                   src_bpp_, \
                                                   mask_bpp_, \
                                                   dst_w_bpp_, \
                                                   flags, \
                                                   pixblock_size_, \
                                                   init, \
                                                   cleanup, \
                                                   process_pixblock_head, \
                                                   process_pixblock_tail, \
                                                   process_pixblock_tail_head, \
                                                   dst_w_basereg_ = 28, \
                                                   dst_r_basereg_ = 4, \
                                                   src_basereg_ = 0, \
                                                   mask_basereg_ = 24

    .func fname
    .global fname
/* For ELF format also set function visibility to hidden */
#ifdef __ELF__
    .hidden fname
    .type fname, %function
#endif
fname:
.set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
/*
 * Make some macro arguments globally visible and accessible
 * from other macros
 */
.set src_bpp, src_bpp_
.set mask_bpp, mask_bpp_
.set dst_w_bpp, dst_w_bpp_
.set pixblock_size, pixblock_size_
.set dst_w_basereg, dst_w_basereg_
.set dst_r_basereg, dst_r_basereg_
.set src_basereg, src_basereg_
.set mask_basereg, mask_basereg_
/*
 * Assign symbolic names to registers
 */
    W           .req r0  /* width (is updated during processing) */
    DST_W       .req r1  /* destination buffer pointer for writes */
    SRC         .req r2  /* source buffer pointer */
    DST_R       .req ip  /* destination buffer pointer for reads */
    MASK        .req r3  /* mask pointer */

.if (((flags) & FLAG_DST_READWRITE) != 0)
    .set dst_r_bpp, dst_w_bpp
.else
    .set dst_r_bpp, 0
.endif
.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
    .set DEINTERLEAVE_32BPP_ENABLED, 1
.else
    .set DEINTERLEAVE_32BPP_ENABLED, 0
.endif

    init
    mov DST_R, DST_W

    cmp W, #pixblock_size
    blt 8f

    ensure_destination_ptr_alignment process_pixblock_head, \
                                     process_pixblock_tail, \
                                     process_pixblock_tail_head

    subs W, W, #pixblock_size
    blt 7f

    /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
    pixld_a pixblock_size, dst_r_bpp, \
            (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
    pixld pixblock_size, src_bpp, \
          (src_basereg - pixblock_size * src_bpp / 64), SRC
    pixld pixblock_size, mask_bpp, \
          (mask_basereg - pixblock_size * mask_bpp / 64), MASK
    process_pixblock_head
    subs W, W, #pixblock_size
    blt 2f
1:
    process_pixblock_tail_head
    subs W, W, #pixblock_size
    bge 1b
2:
    process_pixblock_tail
    pixst_a pixblock_size, dst_w_bpp, \
            (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
7:
    /* Process the remaining trailing pixels in the scanline (dst aligned) */
    process_trailing_pixels 0, 1, \
                            process_pixblock_head, \
                            process_pixblock_tail, \
                            process_pixblock_tail_head

    cleanup
    bx lr /* exit */
8:
    /* Process the remaining trailing pixels in the scanline (dst unaligned) */
    process_trailing_pixels 0, 0, \
                            process_pixblock_head, \
                            process_pixblock_tail, \
                            process_pixblock_tail_head

    cleanup
    bx lr /* exit */

    .unreq SRC
    .unreq MASK
    .unreq DST_R
    .unreq DST_W
    .unreq W
    .endfunc
.endm
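/*
 * Default no-op 'init' and 'cleanup' macro arguments, for generated
 * functions that need no special setup or teardown.
 */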
.macro default_init
.endm

.macro default_cleanup
.endm