src/3rdparty/pixman/pixman-arm-neon-asm.h
/*
 * Copyright © 2009 Nokia Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 * Author:  Siarhei Siamashka (siarhei.siamashka@nokia.com)
 */

/*
 * This file contains a macro ('generate_composite_function') which can
 * construct 2D image processing functions based on a common template.
 * Any combination of source, destination and mask images with 8bpp,
 * 16bpp, 24bpp or 32bpp color formats is supported.
 *
 * This macro takes care of:
 *  - handling of leading and trailing unaligned pixels
 *  - doing most of the work related to L2 cache preload
 *  - encouraging the use of software pipelining for better instruction
 *    scheduling
 *
 * The user of this macro has to provide some configuration parameters
 * (bit depths for the images, prefetch distance, etc.) and a set of
 * macros, which should implement basic code chunks responsible for
 * pixel processing. See the 'pixman-arm-neon-asm.S' file for usage
 * examples; a schematic invocation is also sketched right below.
 *
 * TODO:
 *  - try the overlapped pixel method (from Ian Rickards) when processing
 *    exactly two blocks of pixels
 *  - maybe add an option to do reverse scanline processing
 */

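/*
 * Illustrative sketch of how this template is typically instantiated (the
 * real call sites live in pixman-arm-neon-asm.S; the
 * pixman_composite_src_8888_8888_* names below are placeholders rather than
 * definitions made in this file). The numeric arguments are source bpp,
 * mask bpp, destination bpp, pixblock size and prefetch distance:
 *
 *    generate_composite_function \
 *        pixman_composite_src_8888_8888_asm_neon, 32, 0, 32, \
 *        FLAG_DST_WRITEONLY, \
 *        8, 10, \
 *        default_init, default_cleanup, \
 *        pixman_composite_src_8888_8888_process_pixblock_head, \
 *        pixman_composite_src_8888_8888_process_pixblock_tail, \
 *        pixman_composite_src_8888_8888_process_pixblock_tail_head
 */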
       
/*
 * Bit flags for the 'generate_composite_function' macro which are used
 * to tune the behavior of the generated functions.
 */
.set FLAG_DST_WRITEONLY,       0
.set FLAG_DST_READWRITE,       1
.set FLAG_DEINTERLEAVE_32BPP,  2

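/*
 * The 'flags' argument of 'generate_composite_function' is a bitwise OR of
 * the values above, e.g. (FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP) for
 * an operation that reads the destination back and works on deinterleaved
 * 32bpp data (an illustrative combination, not taken from a specific call
 * site).
 */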
       
/*
 * Offset in the stack where the mask and source pointers/strides can be
 * accessed from the 'init' macro. This is useful for doing special handling
 * of a solid mask; a sketch follows below.
 */
.set ARGS_STACK_OFFSET,        40

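/*
 * Sketch of how an 'init' macro might use this offset to fetch a solid
 * source/mask value passed on the stack (hypothetical macro name, register
 * choice and operand layout; the real init macros are defined in
 * pixman-arm-neon-asm.S):
 *
 *    .macro example_solid_src_init
 *        add     DUMMY, sp, #ARGS_STACK_OFFSET
 *        vld1.32 {d0[0]}, [DUMMY]
 *        vdup.8  d3, d0[3]
 *    .endm
 */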
       
/*
 * Constants for selecting the preferred prefetch type.
 */
.set PREFETCH_TYPE_NONE,       0 /* No prefetch at all */
.set PREFETCH_TYPE_SIMPLE,     1 /* A simple, fixed-distance-ahead prefetch */
.set PREFETCH_TYPE_ADVANCED,   2 /* Advanced fine-grained prefetch */

/*
 * Definitions of supplementary pixld/pixst macros (for partial load/store of
 * pixel data).
 */

.macro pixldst1 op, elem_size, reg1, mem_operand, abits
.if abits > 0
    op&.&elem_size {d&reg1}, [&mem_operand&, :&abits&]!
.else
    op&.&elem_size {d&reg1}, [&mem_operand&]!
.endif
.endm

.macro pixldst2 op, elem_size, reg1, reg2, mem_operand, abits
.if abits > 0
    op&.&elem_size {d&reg1, d&reg2}, [&mem_operand&, :&abits&]!
.else
    op&.&elem_size {d&reg1, d&reg2}, [&mem_operand&]!
.endif
.endm

.macro pixldst4 op, elem_size, reg1, reg2, reg3, reg4, mem_operand, abits
.if abits > 0
    op&.&elem_size {d&reg1, d&reg2, d&reg3, d&reg4}, [&mem_operand&, :&abits&]!
.else
    op&.&elem_size {d&reg1, d&reg2, d&reg3, d&reg4}, [&mem_operand&]!
.endif
.endm

.macro pixldst0 op, elem_size, reg1, idx, mem_operand, abits
    op&.&elem_size {d&reg1[idx]}, [&mem_operand&]!
.endm

.macro pixldst3 op, elem_size, reg1, reg2, reg3, mem_operand
    op&.&elem_size {d&reg1, d&reg2, d&reg3}, [&mem_operand&]!
.endm

.macro pixldst30 op, elem_size, reg1, reg2, reg3, idx, mem_operand
    op&.&elem_size {d&reg1[idx], d&reg2[idx], d&reg3[idx]}, [&mem_operand&]!
.endm

.macro pixldst numbytes, op, elem_size, basereg, mem_operand, abits
.if numbytes == 32
    pixldst4 op, elem_size, %(basereg+4), %(basereg+5), \
                              %(basereg+6), %(basereg+7), mem_operand, abits
.elseif numbytes == 16
    pixldst2 op, elem_size, %(basereg+2), %(basereg+3), mem_operand, abits
.elseif numbytes == 8
    pixldst1 op, elem_size, %(basereg+1), mem_operand, abits
.elseif numbytes == 4
    .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 32)
        pixldst0 op, 32, %(basereg+0), 1, mem_operand, abits
    .elseif elem_size == 16
        pixldst0 op, 16, %(basereg+0), 2, mem_operand, abits
        pixldst0 op, 16, %(basereg+0), 3, mem_operand, abits
    .else
        pixldst0 op, 8, %(basereg+0), 4, mem_operand, abits
        pixldst0 op, 8, %(basereg+0), 5, mem_operand, abits
        pixldst0 op, 8, %(basereg+0), 6, mem_operand, abits
        pixldst0 op, 8, %(basereg+0), 7, mem_operand, abits
    .endif
.elseif numbytes == 2
    .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 16)
        pixldst0 op, 16, %(basereg+0), 1, mem_operand, abits
    .else
        pixldst0 op, 8, %(basereg+0), 2, mem_operand, abits
        pixldst0 op, 8, %(basereg+0), 3, mem_operand, abits
    .endif
.elseif numbytes == 1
    pixldst0 op, 8, %(basereg+0), 1, mem_operand, abits
.else
    .error "unsupported size: numbytes"
.endif
.endm

.macro pixld numpix, bpp, basereg, mem_operand, abits=0
.if bpp > 0
.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
    pixldst4 vld4, 8, %(basereg+4), %(basereg+5), \
                      %(basereg+6), %(basereg+7), mem_operand, abits
.elseif (bpp == 24) && (numpix == 8)
    pixldst3 vld3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
.elseif (bpp == 24) && (numpix == 4)
    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
.elseif (bpp == 24) && (numpix == 2)
    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
.elseif (bpp == 24) && (numpix == 1)
    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
.else
    pixldst %(numpix * bpp / 8), vld1, %(bpp), basereg, mem_operand, abits
.endif
.endif
.endm

.macro pixst numpix, bpp, basereg, mem_operand, abits=0
.if bpp > 0
.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
    pixldst4 vst4, 8, %(basereg+4), %(basereg+5), \
                      %(basereg+6), %(basereg+7), mem_operand, abits
.elseif (bpp == 24) && (numpix == 8)
    pixldst3 vst3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
.elseif (bpp == 24) && (numpix == 4)
    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
.elseif (bpp == 24) && (numpix == 2)
    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
.elseif (bpp == 24) && (numpix == 1)
    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
.else
    pixldst %(numpix * bpp / 8), vst1, %(bpp), basereg, mem_operand, abits
.endif
.endif
.endm

.macro pixld_a numpix, bpp, basereg, mem_operand
.if (bpp * numpix) <= 128
    pixld numpix, bpp, basereg, mem_operand, %(bpp * numpix)
.else
    pixld numpix, bpp, basereg, mem_operand, 128
.endif
.endm

.macro pixst_a numpix, bpp, basereg, mem_operand
.if (bpp * numpix) <= 128
    pixst numpix, bpp, basereg, mem_operand, %(bpp * numpix)
.else
    pixst numpix, bpp, basereg, mem_operand, 128
.endif
.endm

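/*
 * Usage sketch for the load/store helpers above (illustrative operands
 * only). With 'basereg' 0 and a 16bpp buffer,
 *
 *    pixld_a 8, 16, 0, DST_R
 *
 * picks the 128-bit alignment hint and expands to
 * 'vld1.16 {d2, d3}, [DST_R, :128]!', while the plain 'pixld' form omits
 * the alignment hint.
 */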
       
.macro vuzp8 reg1, reg2
    vuzp.8 d&reg1, d&reg2
.endm

.macro vzip8 reg1, reg2
    vzip.8 d&reg1, d&reg2
.endm

/* deinterleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
.macro pixdeinterleave bpp, basereg
.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
    vuzp8 %(basereg+0), %(basereg+1)
    vuzp8 %(basereg+2), %(basereg+3)
    vuzp8 %(basereg+1), %(basereg+3)
    vuzp8 %(basereg+0), %(basereg+2)
.endif
.endm

/* interleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
.macro pixinterleave bpp, basereg
.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
    vzip8 %(basereg+0), %(basereg+2)
    vzip8 %(basereg+1), %(basereg+3)
    vzip8 %(basereg+2), %(basereg+3)
    vzip8 %(basereg+0), %(basereg+1)
.endif
.endm

/*
 * This is a macro for implementing cache preload. The main idea is that
 * the cache preload logic is mostly independent of the rest of the pixel
 * processing code. It starts at the top left pixel and moves forward
 * across pixels and can jump across scanlines. The prefetch distance is
 * handled in an 'incremental' way: it starts from 0 and advances to the
 * optimal distance over time. After reaching the optimal prefetch distance,
 * it is kept constant. There are some checks which prevent prefetching
 * unneeded pixel lines below the image (but it can still prefetch a bit
 * more data on the right side of the image - not a big issue, and it may
 * actually be helpful when rendering text glyphs). An additional trick is
 * the use of an LDR instruction for prefetch instead of PLD when moving to
 * the next line; the point is that we have a high chance of getting a TLB
 * miss in this case, and PLD would be useless.
 *
 * This sounds like it may introduce a noticeable overhead (when working with
 * fully cached data). But in reality, due to having a separate pipeline and
 * instruction queue for the NEON unit in the ARM Cortex-A8, normal ARM code
 * can execute simultaneously with NEON code and be completely shadowed by it.
 * Thus we get no performance overhead at all (*). This looks like a very nice
 * feature of the Cortex-A8, if used wisely. We don't have a hardware
 * prefetcher, but we can still implement some rather advanced prefetch logic
 * in software for almost zero cost!
 *
 * (*) The overhead of the prefetcher is visible when running some trivial
 * pixel processing like a simple copy. Anyway, having prefetch is a must
 * when working with graphics data.
 */
.macro PF a, x:vararg
.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_ADVANCED)
    a x
.endif
.endm

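/*
 * Example (illustrative operands): 'PF add PF_X, PF_X, #8' emits
 * 'add PF_X, PF_X, #8' only when the function is generated with
 * PREFETCH_TYPE_ADVANCED; with SIMPLE or NONE prefetch the whole statement
 * vanishes, so the advanced prefetch code below costs nothing in those
 * configurations.
 */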
       
.macro cache_preload std_increment, boost_increment
.if (src_bpp_shift >= 0) || (dst_r_bpp != 0) || (mask_bpp_shift >= 0)
.if regs_shortage
    PF ldr ORIG_W, [sp] /* If we are short on regs, ORIG_W is kept on stack */
.endif
.if std_increment != 0
    PF add PF_X, PF_X, #std_increment
.endif
    PF tst PF_CTL, #0xF
    PF addne PF_X, PF_X, #boost_increment
    PF subne PF_CTL, PF_CTL, #1
    PF cmp PF_X, ORIG_W
.if src_bpp_shift >= 0
    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
.endif
.if dst_r_bpp != 0
    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
.endif
.if mask_bpp_shift >= 0
    PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
.endif
    PF subge PF_X, PF_X, ORIG_W
    PF subges PF_CTL, PF_CTL, #0x10
.if src_bpp_shift >= 0
    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
.endif
.if dst_r_bpp != 0
    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
.endif
.if mask_bpp_shift >= 0
    PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
.endif
.endif
.endm

.macro cache_preload_simple
.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_SIMPLE)
.if src_bpp > 0
    pld [SRC, #(PREFETCH_DISTANCE_SIMPLE * src_bpp / 8)]
.endif
.if dst_r_bpp > 0
    pld [DST_R, #(PREFETCH_DISTANCE_SIMPLE * dst_r_bpp / 8)]
.endif
.if mask_bpp > 0
    pld [MASK, #(PREFETCH_DISTANCE_SIMPLE * mask_bpp / 8)]
.endif
.endif
.endm

/*
 * Macro which is used to process leading pixels until the destination
 * pointer is properly aligned (at a 16-byte boundary). When the destination
 * buffer uses a 16bpp format, this is unnecessary, or even pointless.
 */
.macro ensure_destination_ptr_alignment process_pixblock_head, \
                                        process_pixblock_tail, \
                                        process_pixblock_tail_head
.if dst_w_bpp != 24
    tst         DST_R, #0xF
    beq         2f

.irp lowbit, 1, 2, 4, 8, 16
local skip1
.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
.if lowbit < 16 /* we don't need more than 16-byte alignment */
    tst         DST_R, #lowbit
    beq         1f
.endif
    pixld       (lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC
    pixld       (lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK
.if dst_r_bpp > 0
    pixld_a     (lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R
.else
    add         DST_R, DST_R, #lowbit
.endif
    PF add      PF_X, PF_X, #(lowbit * 8 / dst_w_bpp)
    sub         W, W, #(lowbit * 8 / dst_w_bpp)
1:
.endif
.endr
    pixdeinterleave src_bpp, src_basereg
    pixdeinterleave mask_bpp, mask_basereg
    pixdeinterleave dst_r_bpp, dst_r_basereg

    process_pixblock_head
    cache_preload 0, pixblock_size
    cache_preload_simple
    process_pixblock_tail

    pixinterleave dst_w_bpp, dst_w_basereg
.irp lowbit, 1, 2, 4, 8, 16
.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
.if lowbit < 16 /* we don't need more than 16-byte alignment */
    tst         DST_W, #lowbit
    beq         1f
.endif
    pixst_a     (lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W
1:
.endif
.endr
.endif
2:
.endm

/*
 * Special code for processing up to (pixblock_size - 1) remaining
 * trailing pixels. As SIMD processing performs operations on
 * pixblock_size pixels, anything smaller than this has to be loaded
 * and stored in a special way. Loading and storing of pixel data is
 * performed in such a way that we fill some 'slots' in the NEON
 * registers (some slots are naturally unused), then perform the
 * compositing operation as usual. In the end, the data is taken from
 * these 'slots' and saved to memory.
 *
 * cache_preload_flag - allows prefetch to be suppressed if
 *                      set to 0
 * dst_aligned_flag   - selects whether the destination buffer
 *                      is aligned
 */
.macro process_trailing_pixels cache_preload_flag, \
                               dst_aligned_flag, \
                               process_pixblock_head, \
                               process_pixblock_tail, \
                               process_pixblock_tail_head
    tst         W, #(pixblock_size - 1)
    beq         2f
.irp chunk_size, 16, 8, 4, 2, 1
.if pixblock_size > chunk_size
    tst         W, #chunk_size
    beq         1f
    pixld       chunk_size, src_bpp, src_basereg, SRC
    pixld       chunk_size, mask_bpp, mask_basereg, MASK
.if dst_aligned_flag != 0
    pixld_a     chunk_size, dst_r_bpp, dst_r_basereg, DST_R
.else
    pixld       chunk_size, dst_r_bpp, dst_r_basereg, DST_R
.endif
.if cache_preload_flag != 0
    PF add      PF_X, PF_X, #chunk_size
.endif
1:
.endif
.endr
    pixdeinterleave src_bpp, src_basereg
    pixdeinterleave mask_bpp, mask_basereg
    pixdeinterleave dst_r_bpp, dst_r_basereg

    process_pixblock_head
.if cache_preload_flag != 0
    cache_preload 0, pixblock_size
    cache_preload_simple
.endif
    process_pixblock_tail
    pixinterleave dst_w_bpp, dst_w_basereg
.irp chunk_size, 16, 8, 4, 2, 1
.if pixblock_size > chunk_size
    tst         W, #chunk_size
    beq         1f
.if dst_aligned_flag != 0
    pixst_a     chunk_size, dst_w_bpp, dst_w_basereg, DST_W
.else
    pixst       chunk_size, dst_w_bpp, dst_w_basereg, DST_W
.endif
1:
.endif
.endr
2:
.endm

/*
 * Macro which performs all the operations needed to switch to the next
 * scanline and start the next loop iteration, unless all the scanlines
 * are already processed.
 */
.macro advance_to_next_scanline start_of_loop_label
.if regs_shortage
    ldrd        W, [sp] /* load W and H (width and height) from stack */
.else
    mov         W, ORIG_W
.endif
    add         DST_W, DST_W, DST_STRIDE, lsl #dst_bpp_shift
.if src_bpp != 0
    add         SRC, SRC, SRC_STRIDE, lsl #src_bpp_shift
.endif
.if mask_bpp != 0
    add         MASK, MASK, MASK_STRIDE, lsl #mask_bpp_shift
.endif
.if (dst_w_bpp != 24)
    sub         DST_W, DST_W, W, lsl #dst_bpp_shift
.endif
.if (src_bpp != 24) && (src_bpp != 0)
    sub         SRC, SRC, W, lsl #src_bpp_shift
.endif
.if (mask_bpp != 24) && (mask_bpp != 0)
    sub         MASK, MASK, W, lsl #mask_bpp_shift
.endif
    subs        H, H, #1
    mov         DST_R, DST_W
.if regs_shortage
    str         H, [sp, #4] /* save updated height to stack */
.endif
    bge         start_of_loop_label
.endm

/*
 * Registers are allocated in the following way by default:
 * d0, d1, d2, d3     - reserved for loading source pixel data
 * d4, d5, d6, d7     - reserved for loading destination pixel data
 * d24, d25, d26, d27 - reserved for loading mask pixel data
 * d28, d29, d30, d31 - final destination pixel data for writeback to memory
 */
.macro generate_composite_function fname, \
                                   src_bpp_, \
                                   mask_bpp_, \
                                   dst_w_bpp_, \
                                   flags, \
                                   pixblock_size_, \
                                   prefetch_distance, \
                                   init, \
                                   cleanup, \
                                   process_pixblock_head, \
                                   process_pixblock_tail, \
                                   process_pixblock_tail_head, \
                                   dst_w_basereg_ = 28, \
                                   dst_r_basereg_ = 4, \
                                   src_basereg_   = 0, \
                                   mask_basereg_  = 24

    .func fname
    .global fname
    /* For ELF format also set function visibility to hidden */
#ifdef __ELF__
    .hidden fname
    .type fname, %function
#endif
fname:
    push        {r4-r12, lr}        /* save all registers */

/*
 * Select the prefetch type for this function. If the prefetch distance is
 * set to 0 or one of the color formats is 24bpp, SIMPLE prefetch
 * has to be used instead of ADVANCED.
 */
    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_DEFAULT
.if prefetch_distance == 0
    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
.elseif (PREFETCH_TYPE_CURRENT > PREFETCH_TYPE_SIMPLE) && \
        ((src_bpp_ == 24) || (mask_bpp_ == 24) || (dst_w_bpp_ == 24))
    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_SIMPLE
.endif

/*
 * Make some macro arguments globally visible and accessible
 * from other macros
 */
    .set src_bpp, src_bpp_
    .set mask_bpp, mask_bpp_
    .set dst_w_bpp, dst_w_bpp_
    .set pixblock_size, pixblock_size_
    .set dst_w_basereg, dst_w_basereg_
    .set dst_r_basereg, dst_r_basereg_
    .set src_basereg, src_basereg_
    .set mask_basereg, mask_basereg_

/*
 * Assign symbolic names to registers
 */
    W           .req        r0      /* width (is updated during processing) */
    H           .req        r1      /* height (is updated during processing) */
    DST_W       .req        r2      /* destination buffer pointer for writes */
    DST_STRIDE  .req        r3      /* destination image stride */
    SRC         .req        r4      /* source buffer pointer */
    SRC_STRIDE  .req        r5      /* source image stride */
    DST_R       .req        r6      /* destination buffer pointer for reads */

    MASK        .req        r7      /* mask pointer */
    MASK_STRIDE .req        r8      /* mask stride */

    PF_CTL      .req        r9      /* combined lines counter and prefetch */
                                    /* distance increment counter */
    PF_X        .req        r10     /* pixel index in a scanline for current */
                                    /* prefetch position */
    PF_SRC      .req        r11     /* pointer to source scanline start */
                                    /* for prefetch purposes */
    PF_DST      .req        r12     /* pointer to destination scanline start */
                                    /* for prefetch purposes */
    PF_MASK     .req        r14     /* pointer to mask scanline start */
                                    /* for prefetch purposes */
/*
 * Check whether we have enough registers for all the local variables.
 * If we don't have enough registers, the original width and height are
 * kept on top of the stack (and the 'regs_shortage' variable is set to
 * indicate this for the rest of the code). Even if there are enough
 * registers, the allocation scheme may be a bit different depending on
 * whether the source or mask is unused.
 */
.if (PREFETCH_TYPE_CURRENT < PREFETCH_TYPE_ADVANCED)
    ORIG_W      .req        r10     /* saved original width */
    DUMMY       .req        r12     /* temporary register */
    .set        regs_shortage, 0
.elseif mask_bpp == 0
    ORIG_W      .req        r7      /* saved original width */
    DUMMY       .req        r8      /* temporary register */
    .set        regs_shortage, 0
.elseif src_bpp == 0
    ORIG_W      .req        r4      /* saved original width */
    DUMMY       .req        r5      /* temporary register */
    .set        regs_shortage, 0
.else
    ORIG_W      .req        r1      /* saved original width */
    DUMMY       .req        r1      /* temporary register */
    .set        regs_shortage, 1
.endif

    .set mask_bpp_shift, -1
.if src_bpp == 32
    .set src_bpp_shift, 2
.elseif src_bpp == 24
    .set src_bpp_shift, 0
.elseif src_bpp == 16
    .set src_bpp_shift, 1
.elseif src_bpp == 8
    .set src_bpp_shift, 0
.elseif src_bpp == 0
    .set src_bpp_shift, -1
.else
    .error "requested src bpp (src_bpp) is not supported"
.endif
.if mask_bpp == 32
    .set mask_bpp_shift, 2
.elseif mask_bpp == 24
    .set mask_bpp_shift, 0
.elseif mask_bpp == 8
    .set mask_bpp_shift, 0
.elseif mask_bpp == 0
    .set mask_bpp_shift, -1
.else
    .error "requested mask bpp (mask_bpp) is not supported"
.endif
.if dst_w_bpp == 32
    .set dst_bpp_shift, 2
.elseif dst_w_bpp == 24
    .set dst_bpp_shift, 0
.elseif dst_w_bpp == 16
    .set dst_bpp_shift, 1
.elseif dst_w_bpp == 8
    .set dst_bpp_shift, 0
.else
    .error "requested dst bpp (dst_w_bpp) is not supported"
.endif

.if (((flags) & FLAG_DST_READWRITE) != 0)
    .set dst_r_bpp, dst_w_bpp
.else
    .set dst_r_bpp, 0
.endif
.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
    .set DEINTERLEAVE_32BPP_ENABLED, 1
.else
    .set DEINTERLEAVE_32BPP_ENABLED, 0
.endif

.if prefetch_distance < 0 || prefetch_distance > 15
    .error "invalid prefetch distance (prefetch_distance)"
.endif

.if src_bpp > 0
    ldr         SRC, [sp, #40]
.endif
.if mask_bpp > 0
    ldr         MASK, [sp, #48]
.endif
    PF mov      PF_X, #0
.if src_bpp > 0
    ldr         SRC_STRIDE, [sp, #44]
.endif
.if mask_bpp > 0
    ldr         MASK_STRIDE, [sp, #52]
.endif
    mov         DST_R, DST_W

.if src_bpp == 24
    sub         SRC_STRIDE, SRC_STRIDE, W
    sub         SRC_STRIDE, SRC_STRIDE, W, lsl #1
.endif
.if mask_bpp == 24
    sub         MASK_STRIDE, MASK_STRIDE, W
    sub         MASK_STRIDE, MASK_STRIDE, W, lsl #1
.endif
.if dst_w_bpp == 24
    sub         DST_STRIDE, DST_STRIDE, W
    sub         DST_STRIDE, DST_STRIDE, W, lsl #1
.endif

/*
 * Setup advanced prefetcher initial state
 */
    PF mov      PF_SRC, SRC
    PF mov      PF_DST, DST_R
    PF mov      PF_MASK, MASK
    /* PF_CTL = prefetch_distance | ((h - 1) << 4) */
    PF mov      PF_CTL, H, lsl #4
    PF add      PF_CTL, #(prefetch_distance - 0x10)

    init
.if regs_shortage
    push        {r0, r1}
.endif
    subs        H, H, #1
.if regs_shortage
    str         H, [sp, #4] /* save updated height to stack */
.else
    mov         ORIG_W, W
.endif
    blt         9f
    cmp         W, #(pixblock_size * 2)
    blt         8f
/*
 * This is the start of the pipelined loop, which is optimized for
 * long scanlines
 */
0:
    ensure_destination_ptr_alignment process_pixblock_head, \
                                     process_pixblock_tail, \
                                     process_pixblock_tail_head

    /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
    pixld_a     pixblock_size, dst_r_bpp, \
                (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
    pixld       pixblock_size, src_bpp, \
                (src_basereg - pixblock_size * src_bpp / 64), SRC
    pixld       pixblock_size, mask_bpp, \
                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
    PF add      PF_X, PF_X, #pixblock_size
    process_pixblock_head
    cache_preload 0, pixblock_size
    cache_preload_simple
    subs        W, W, #(pixblock_size * 2)
    blt         2f
1:
    process_pixblock_tail_head
    cache_preload_simple
    subs        W, W, #pixblock_size
    bge         1b
2:
    process_pixblock_tail
    pixst_a     pixblock_size, dst_w_bpp, \
                (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W

    /* Process the remaining trailing pixels in the scanline */
    process_trailing_pixels 1, 1, \
                            process_pixblock_head, \
                            process_pixblock_tail, \
                            process_pixblock_tail_head
    advance_to_next_scanline 0b

.if regs_shortage
    pop         {r0, r1}
.endif
    cleanup
    pop         {r4-r12, pc}  /* exit */
/*
 * This is the start of the loop, designed to process images with small width
 * (less than pixblock_size * 2 pixels). In this case neither pipelining
 * nor prefetch is used.
 */
8:
    /* Process exactly pixblock_size pixels if needed */
    tst         W, #pixblock_size
    beq         1f
    pixld       pixblock_size, dst_r_bpp, \
                (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
    pixld       pixblock_size, src_bpp, \
                (src_basereg - pixblock_size * src_bpp / 64), SRC
    pixld       pixblock_size, mask_bpp, \
                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
    process_pixblock_head
    process_pixblock_tail
    pixst       pixblock_size, dst_w_bpp, \
                (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
1:
    /* Process the remaining trailing pixels in the scanline */
    process_trailing_pixels 0, 0, \
                            process_pixblock_head, \
                            process_pixblock_tail, \
                            process_pixblock_tail_head
    advance_to_next_scanline 8b
9:
.if regs_shortage
    pop         {r0, r1}
.endif
    cleanup
    pop         {r4-r12, pc}  /* exit */

    .unreq      SRC
    .unreq      MASK
    .unreq      DST_R
    .unreq      DST_W
    .unreq      ORIG_W
    .unreq      W
    .unreq      H
    .unreq      SRC_STRIDE
    .unreq      DST_STRIDE
    .unreq      MASK_STRIDE
    .unreq      PF_CTL
    .unreq      PF_X
    .unreq      PF_SRC
    .unreq      PF_DST
    .unreq      PF_MASK
    .unreq      DUMMY
    .endfunc
.endm

/*
 * A simplified variant of the function generation template for single
 * scanline processing (for implementing pixman combine functions)
 */
.macro generate_composite_function_single_scanline fname, \
                                                   src_bpp_, \
                                                   mask_bpp_, \
                                                   dst_w_bpp_, \
                                                   flags, \
                                                   pixblock_size_, \
                                                   init, \
                                                   cleanup, \
                                                   process_pixblock_head, \
                                                   process_pixblock_tail, \
                                                   process_pixblock_tail_head, \
                                                   dst_w_basereg_ = 28, \
                                                   dst_r_basereg_ = 4, \
                                                   src_basereg_   = 0, \
                                                   mask_basereg_  = 24

    .func fname
    .global fname
    /* For ELF format also set function visibility to hidden */
#ifdef __ELF__
    .hidden fname
    .type fname, %function
#endif
fname:
    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
/*
 * Make some macro arguments globally visible and accessible
 * from other macros
 */
    .set src_bpp, src_bpp_
    .set mask_bpp, mask_bpp_
    .set dst_w_bpp, dst_w_bpp_
    .set pixblock_size, pixblock_size_
    .set dst_w_basereg, dst_w_basereg_
    .set dst_r_basereg, dst_r_basereg_
    .set src_basereg, src_basereg_
    .set mask_basereg, mask_basereg_
/*
 * Assign symbolic names to registers
 */
    W           .req        r0      /* width (is updated during processing) */
    DST_W       .req        r1      /* destination buffer pointer for writes */
    SRC         .req        r2      /* source buffer pointer */
    DST_R       .req        ip      /* destination buffer pointer for reads */
    MASK        .req        r3      /* mask pointer */

.if (((flags) & FLAG_DST_READWRITE) != 0)
    .set dst_r_bpp, dst_w_bpp
.else
    .set dst_r_bpp, 0
.endif
.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
    .set DEINTERLEAVE_32BPP_ENABLED, 1
.else
    .set DEINTERLEAVE_32BPP_ENABLED, 0
.endif

    init
    mov         DST_R, DST_W

    cmp         W, #pixblock_size
    blt         8f

    ensure_destination_ptr_alignment process_pixblock_head, \
                                     process_pixblock_tail, \
                                     process_pixblock_tail_head

    subs        W, W, #pixblock_size
    blt         7f

    /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
    pixld_a     pixblock_size, dst_r_bpp, \
                (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
    pixld       pixblock_size, src_bpp, \
                (src_basereg - pixblock_size * src_bpp / 64), SRC
    pixld       pixblock_size, mask_bpp, \
                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
    process_pixblock_head
    subs        W, W, #pixblock_size
    blt         2f
1:
    process_pixblock_tail_head
    subs        W, W, #pixblock_size
    bge         1b
2:
    process_pixblock_tail
    pixst_a     pixblock_size, dst_w_bpp, \
                (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
7:
    /* Process the remaining trailing pixels in the scanline (dst aligned) */
    process_trailing_pixels 0, 1, \
                            process_pixblock_head, \
                            process_pixblock_tail, \
                            process_pixblock_tail_head

    cleanup
    bx          lr  /* exit */
8:
    /* Process the remaining trailing pixels in the scanline (dst unaligned) */
    process_trailing_pixels 0, 0, \
                            process_pixblock_head, \
                            process_pixblock_tail, \
                            process_pixblock_tail_head

    cleanup
    bx          lr  /* exit */

    .unreq      SRC
    .unreq      MASK
    .unreq      DST_R
    .unreq      DST_W
    .unreq      W
    .endfunc
.endm

.macro default_init
.endm

.macro default_cleanup
.endm
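
/*
 * Minimal usage sketch: a function generated with the default (empty)
 * init/cleanup hooks only has to provide the three pixblock macros. The
 * example_* names below are hypothetical and only illustrate the expected
 * shape of a call site:
 *
 *    generate_composite_function_single_scanline \
 *        example_combine_asm_neon, 32, 0, 32, \
 *        FLAG_DST_READWRITE, \
 *        8, \
 *        default_init, default_cleanup, \
 *        example_process_pixblock_head, \
 *        example_process_pixblock_tail, \
 *        example_process_pixblock_tail_head
 */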