src/3rdparty/pixman/pixman-arm-neon-asm.h
/*
 * Copyright © 2009 Nokia Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 * Author:  Siarhei Siamashka (siarhei.siamashka@nokia.com)
 */

/*
 * This file contains a macro ('generate_composite_function') which can
 * construct 2D image processing functions based on a common template.
 * Any combination of source, destination and mask images with 8bpp,
 * 16bpp, 24bpp or 32bpp color formats is supported.
 *
 * This macro takes care of:
 *  - handling of leading and trailing unaligned pixels
 *  - doing most of the work related to L2 cache preload
 *  - encouraging the use of software pipelining for better instruction
 *    scheduling
 *
 * The user of this macro has to provide some configuration parameters
 * (bit depths for the images, prefetch distance, etc.) and a set of
 * macros, which should implement basic code chunks responsible for
 * pixel processing. See the 'pixman-arm-neon-asm.S' file for usage
 * examples; a schematic invocation is also sketched right below.
 *
 * TODO:
 *  - try the overlapped pixel method (from Ian Rickards) when processing
 *    exactly two blocks of pixels
 *  - maybe add an option to do reverse scanline processing
 */

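/*
 * Illustrative sketch of how this template is typically instantiated (the
 * real call sites live in pixman-arm-neon-asm.S; the
 * pixman_composite_src_8888_8888_* names below are placeholders rather than
 * definitions made in this file). The numeric arguments are source bpp,
 * mask bpp, destination bpp, pixblock size and prefetch distance:
 *
 *    generate_composite_function \
 *        pixman_composite_src_8888_8888_asm_neon, 32, 0, 32, \
 *        FLAG_DST_WRITEONLY, \
 *        8, 10, \
 *        default_init, default_cleanup, \
 *        pixman_composite_src_8888_8888_process_pixblock_head, \
 *        pixman_composite_src_8888_8888_process_pixblock_tail, \
 *        pixman_composite_src_8888_8888_process_pixblock_tail_head
 */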
       
/*
 * Bit flags for the 'generate_composite_function' macro which are used
 * to tune the behavior of the generated functions.
 */
.set FLAG_DST_WRITEONLY,       0
.set FLAG_DST_READWRITE,       1
.set FLAG_DEINTERLEAVE_32BPP,  2

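/*
 * The 'flags' argument of 'generate_composite_function' is a bitwise OR of
 * the values above, e.g. (FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP) for
 * an operation that reads the destination back and works on deinterleaved
 * 32bpp data (an illustrative combination, not taken from a specific call
 * site).
 */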
       
/*
 * Offset in the stack where the mask and source pointers/strides can be
 * accessed from the 'init' macro. This is useful for doing special handling
 * of a solid mask; a sketch follows below.
 */
.set ARGS_STACK_OFFSET,        40

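/*
 * Sketch of how an 'init' macro might use this offset to fetch a solid
 * source/mask value passed on the stack (hypothetical macro name, register
 * choice and operand layout; the real init macros are defined in
 * pixman-arm-neon-asm.S):
 *
 *    .macro example_solid_src_init
 *        add     DUMMY, sp, #ARGS_STACK_OFFSET
 *        vld1.32 {d0[0]}, [DUMMY]
 *        vdup.8  d3, d0[3]
 *    .endm
 */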
       
/*
 * Constants for selecting the preferred prefetch type.
 */
.set PREFETCH_TYPE_NONE,       0 /* No prefetch at all */
.set PREFETCH_TYPE_SIMPLE,     1 /* A simple, fixed-distance-ahead prefetch */
.set PREFETCH_TYPE_ADVANCED,   2 /* Advanced fine-grained prefetch */

/*
 * Definitions of supplementary pixld/pixst macros (for partial load/store of
 * pixel data).
 */

.macro pixldst1 op, elem_size, reg1, mem_operand, abits
.if abits > 0
    op&.&elem_size {d&reg1}, [&mem_operand&, :&abits&]!
.else
    op&.&elem_size {d&reg1}, [&mem_operand&]!
.endif
.endm

.macro pixldst2 op, elem_size, reg1, reg2, mem_operand, abits
.if abits > 0
    op&.&elem_size {d&reg1, d&reg2}, [&mem_operand&, :&abits&]!
.else
    op&.&elem_size {d&reg1, d&reg2}, [&mem_operand&]!
.endif
.endm

.macro pixldst4 op, elem_size, reg1, reg2, reg3, reg4, mem_operand, abits
.if abits > 0
    op&.&elem_size {d&reg1, d&reg2, d&reg3, d&reg4}, [&mem_operand&, :&abits&]!
.else
    op&.&elem_size {d&reg1, d&reg2, d&reg3, d&reg4}, [&mem_operand&]!
.endif
.endm

.macro pixldst0 op, elem_size, reg1, idx, mem_operand, abits
    op&.&elem_size {d&reg1[idx]}, [&mem_operand&]!
.endm

.macro pixldst3 op, elem_size, reg1, reg2, reg3, mem_operand
    op&.&elem_size {d&reg1, d&reg2, d&reg3}, [&mem_operand&]!
.endm

.macro pixldst30 op, elem_size, reg1, reg2, reg3, idx, mem_operand
    op&.&elem_size {d&reg1[idx], d&reg2[idx], d&reg3[idx]}, [&mem_operand&]!
.endm

.macro pixldst numbytes, op, elem_size, basereg, mem_operand, abits
.if numbytes == 32
    pixldst4 op, elem_size, %(basereg+4), %(basereg+5), \
                              %(basereg+6), %(basereg+7), mem_operand, abits
.elseif numbytes == 16
    pixldst2 op, elem_size, %(basereg+2), %(basereg+3), mem_operand, abits
.elseif numbytes == 8
    pixldst1 op, elem_size, %(basereg+1), mem_operand, abits
.elseif numbytes == 4
    .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 32)
        pixldst0 op, 32, %(basereg+0), 1, mem_operand, abits
    .elseif elem_size == 16
        pixldst0 op, 16, %(basereg+0), 2, mem_operand, abits
        pixldst0 op, 16, %(basereg+0), 3, mem_operand, abits
    .else
        pixldst0 op, 8, %(basereg+0), 4, mem_operand, abits
        pixldst0 op, 8, %(basereg+0), 5, mem_operand, abits
        pixldst0 op, 8, %(basereg+0), 6, mem_operand, abits
        pixldst0 op, 8, %(basereg+0), 7, mem_operand, abits
    .endif
.elseif numbytes == 2
    .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 16)
        pixldst0 op, 16, %(basereg+0), 1, mem_operand, abits
    .else
        pixldst0 op, 8, %(basereg+0), 2, mem_operand, abits
        pixldst0 op, 8, %(basereg+0), 3, mem_operand, abits
    .endif
.elseif numbytes == 1
    pixldst0 op, 8, %(basereg+0), 1, mem_operand, abits
.else
    .error "unsupported size: numbytes"
.endif
.endm

.macro pixld numpix, bpp, basereg, mem_operand, abits=0
.if bpp > 0
.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
    pixldst4 vld4, 8, %(basereg+4), %(basereg+5), \
                      %(basereg+6), %(basereg+7), mem_operand, abits
.elseif (bpp == 24) && (numpix == 8)
    pixldst3 vld3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
.elseif (bpp == 24) && (numpix == 4)
    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
.elseif (bpp == 24) && (numpix == 2)
    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
.elseif (bpp == 24) && (numpix == 1)
    pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
.else
    pixldst %(numpix * bpp / 8), vld1, %(bpp), basereg, mem_operand, abits
.endif
.endif
.endm

.macro pixst numpix, bpp, basereg, mem_operand, abits=0
.if bpp > 0
.if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
    pixldst4 vst4, 8, %(basereg+4), %(basereg+5), \
                      %(basereg+6), %(basereg+7), mem_operand, abits
.elseif (bpp == 24) && (numpix == 8)
    pixldst3 vst3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
.elseif (bpp == 24) && (numpix == 4)
    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
.elseif (bpp == 24) && (numpix == 2)
    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
.elseif (bpp == 24) && (numpix == 1)
    pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
.else
    pixldst %(numpix * bpp / 8), vst1, %(bpp), basereg, mem_operand, abits
.endif
.endif
.endm

.macro pixld_a numpix, bpp, basereg, mem_operand
.if (bpp * numpix) <= 128
    pixld numpix, bpp, basereg, mem_operand, %(bpp * numpix)
.else
    pixld numpix, bpp, basereg, mem_operand, 128
.endif
.endm

.macro pixst_a numpix, bpp, basereg, mem_operand
.if (bpp * numpix) <= 128
    pixst numpix, bpp, basereg, mem_operand, %(bpp * numpix)
.else
    pixst numpix, bpp, basereg, mem_operand, 128
.endif
.endm

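/*
 * Usage sketch for the load/store helpers above (illustrative operands
 * only). With 'basereg' 0 and a 16bpp buffer,
 *
 *    pixld_a 8, 16, 0, DST_R
 *
 * picks the 128-bit alignment hint and expands to
 * 'vld1.16 {d2, d3}, [DST_R, :128]!', while the plain 'pixld' form omits
 * the alignment hint.
 */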
       
.macro vuzp8 reg1, reg2
    vuzp.8 d&reg1, d&reg2
.endm

.macro vzip8 reg1, reg2
    vzip.8 d&reg1, d&reg2
.endm

/* deinterleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
.macro pixdeinterleave bpp, basereg
.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
    vuzp8 %(basereg+0), %(basereg+1)
    vuzp8 %(basereg+2), %(basereg+3)
    vuzp8 %(basereg+1), %(basereg+3)
    vuzp8 %(basereg+0), %(basereg+2)
.endif
.endm

/* interleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
.macro pixinterleave bpp, basereg
.if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
    vzip8 %(basereg+0), %(basereg+2)
    vzip8 %(basereg+1), %(basereg+3)
    vzip8 %(basereg+2), %(basereg+3)
    vzip8 %(basereg+0), %(basereg+1)
.endif
.endm

/*
 * This is a macro for implementing cache preload. The main idea is that
 * the cache preload logic is mostly independent of the rest of the pixel
 * processing code. It starts at the top left pixel and moves forward
 * across pixels and can jump across scanlines. The prefetch distance is
 * handled in an 'incremental' way: it starts from 0 and advances to the
 * optimal distance over time. After reaching the optimal prefetch distance,
 * it is kept constant. There are some checks which prevent prefetching
 * unneeded pixel lines below the image (but it can still prefetch a bit
 * more data on the right side of the image - not a big issue, and it may
 * actually be helpful when rendering text glyphs). An additional trick is
 * the use of an LDR instruction for prefetch instead of PLD when moving to
 * the next line; the point is that we have a high chance of getting a TLB
 * miss in this case, and PLD would be useless.
 *
 * This sounds like it may introduce a noticeable overhead (when working with
 * fully cached data). But in reality, due to having a separate pipeline and
 * instruction queue for the NEON unit in the ARM Cortex-A8, normal ARM code
 * can execute simultaneously with NEON code and be completely shadowed by it.
 * Thus we get no performance overhead at all (*). This looks like a very nice
 * feature of the Cortex-A8, if used wisely. We don't have a hardware
 * prefetcher, but we can still implement some rather advanced prefetch logic
 * in software for almost zero cost!
 *
 * (*) The overhead of the prefetcher is visible when running some trivial
 * pixel processing like a simple copy. Anyway, having prefetch is a must
 * when working with graphics data.
 */
.macro PF a, x:vararg
.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_ADVANCED)
    a x
.endif
.endm

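/*
 * Example (illustrative operands): 'PF add PF_X, PF_X, #8' emits
 * 'add PF_X, PF_X, #8' only when the function is generated with
 * PREFETCH_TYPE_ADVANCED; with SIMPLE or NONE prefetch the whole statement
 * vanishes, so the advanced prefetch code below costs nothing in those
 * configurations.
 */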
       
.macro cache_preload std_increment, boost_increment
.if (src_bpp_shift >= 0) || (dst_r_bpp != 0) || (mask_bpp_shift >= 0)
.if regs_shortage
    PF ldr ORIG_W, [sp] /* If we are short on regs, ORIG_W is kept on stack */
.endif
.if std_increment != 0
    PF add PF_X, PF_X, #std_increment
.endif
    PF tst PF_CTL, #0xF
    PF addne PF_X, PF_X, #boost_increment
    PF subne PF_CTL, PF_CTL, #1
    PF cmp PF_X, ORIG_W
.if src_bpp_shift >= 0
    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
.endif
.if dst_r_bpp != 0
    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
.endif
.if mask_bpp_shift >= 0
    PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
.endif
    PF subge PF_X, PF_X, ORIG_W
    PF subges PF_CTL, PF_CTL, #0x10
.if src_bpp_shift >= 0
    PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
.endif
.if dst_r_bpp != 0
    PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
.endif
.if mask_bpp_shift >= 0
    PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
.endif
.endif
.endm

.macro cache_preload_simple
.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_SIMPLE)
.if src_bpp > 0
    pld [SRC, #(PREFETCH_DISTANCE_SIMPLE * src_bpp / 8)]
.endif
.if dst_r_bpp > 0
    pld [DST_R, #(PREFETCH_DISTANCE_SIMPLE * dst_r_bpp / 8)]
.endif
.if mask_bpp > 0
    pld [MASK, #(PREFETCH_DISTANCE_SIMPLE * mask_bpp / 8)]
.endif
.endif
.endm

/*
 * Macro which is used to process leading pixels until the destination
 * pointer is properly aligned (at a 16-byte boundary). When the destination
 * buffer uses a 16bpp format, this is unnecessary, or even pointless.
 */
.macro ensure_destination_ptr_alignment process_pixblock_head, \
                                        process_pixblock_tail, \
                                        process_pixblock_tail_head
.if dst_w_bpp != 24
    tst         DST_R, #0xF
    beq         2f

.irp lowbit, 1, 2, 4, 8, 16
local skip1
.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
.if lowbit < 16 /* we don't need more than 16-byte alignment */
    tst         DST_R, #lowbit
    beq         1f
.endif
    pixld       (lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC
    pixld       (lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK
.if dst_r_bpp > 0
    pixld_a     (lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R
.else
    add         DST_R, DST_R, #lowbit
.endif
    PF add      PF_X, PF_X, #(lowbit * 8 / dst_w_bpp)
    sub         W, W, #(lowbit * 8 / dst_w_bpp)
1:
.endif
.endr
    pixdeinterleave src_bpp, src_basereg
    pixdeinterleave mask_bpp, mask_basereg
    pixdeinterleave dst_r_bpp, dst_r_basereg

    process_pixblock_head
    cache_preload 0, pixblock_size
    cache_preload_simple
    process_pixblock_tail

    pixinterleave dst_w_bpp, dst_w_basereg
.irp lowbit, 1, 2, 4, 8, 16
.if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
.if lowbit < 16 /* we don't need more than 16-byte alignment */
    tst         DST_W, #lowbit
    beq         1f
.endif
    pixst_a     (lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W
1:
.endif
.endr
.endif
2:
.endm

/*
 * Special code for processing up to (pixblock_size - 1) remaining
 * trailing pixels. As SIMD processing performs operations on
 * pixblock_size pixels, anything smaller than this has to be loaded
 * and stored in a special way. Loading and storing of pixel data is
 * performed in such a way that we fill some 'slots' in the NEON
 * registers (some slots are naturally unused), then perform the
 * compositing operation as usual. In the end, the data is taken from
 * these 'slots' and saved to memory.
 *
 * cache_preload_flag - allows prefetch to be suppressed if
 *                      set to 0
 * dst_aligned_flag   - selects whether the destination buffer
 *                      is aligned
 */
.macro process_trailing_pixels cache_preload_flag, \
                               dst_aligned_flag, \
                               process_pixblock_head, \
                               process_pixblock_tail, \
                               process_pixblock_tail_head
    tst         W, #(pixblock_size - 1)
    beq         2f
.irp chunk_size, 16, 8, 4, 2, 1
.if pixblock_size > chunk_size
    tst         W, #chunk_size
    beq         1f
    pixld       chunk_size, src_bpp, src_basereg, SRC
    pixld       chunk_size, mask_bpp, mask_basereg, MASK
.if dst_aligned_flag != 0
    pixld_a     chunk_size, dst_r_bpp, dst_r_basereg, DST_R
.else
    pixld       chunk_size, dst_r_bpp, dst_r_basereg, DST_R
.endif
.if cache_preload_flag != 0
    PF add      PF_X, PF_X, #chunk_size
.endif
1:
.endif
.endr
    pixdeinterleave src_bpp, src_basereg
    pixdeinterleave mask_bpp, mask_basereg
    pixdeinterleave dst_r_bpp, dst_r_basereg

    process_pixblock_head
.if cache_preload_flag != 0
    cache_preload 0, pixblock_size
    cache_preload_simple
.endif
    process_pixblock_tail
    pixinterleave dst_w_bpp, dst_w_basereg
.irp chunk_size, 16, 8, 4, 2, 1
.if pixblock_size > chunk_size
    tst         W, #chunk_size
    beq         1f
.if dst_aligned_flag != 0
    pixst_a     chunk_size, dst_w_bpp, dst_w_basereg, DST_W
.else
    pixst       chunk_size, dst_w_bpp, dst_w_basereg, DST_W
.endif
1:
.endif
.endr
2:
.endm

/*
 * Macro which performs all the operations needed to switch to the next
 * scanline and start the next loop iteration, unless all the scanlines
 * are already processed.
 */
.macro advance_to_next_scanline start_of_loop_label
.if regs_shortage
    ldrd        W, [sp] /* load W and H (width and height) from stack */
.else
    mov         W, ORIG_W
.endif
    add         DST_W, DST_W, DST_STRIDE, lsl #dst_bpp_shift
.if src_bpp != 0
    add         SRC, SRC, SRC_STRIDE, lsl #src_bpp_shift
.endif
.if mask_bpp != 0
    add         MASK, MASK, MASK_STRIDE, lsl #mask_bpp_shift
.endif
.if (dst_w_bpp != 24)
    sub         DST_W, DST_W, W, lsl #dst_bpp_shift
.endif
.if (src_bpp != 24) && (src_bpp != 0)
    sub         SRC, SRC, W, lsl #src_bpp_shift
.endif
.if (mask_bpp != 24) && (mask_bpp != 0)
    sub         MASK, MASK, W, lsl #mask_bpp_shift
.endif
    subs        H, H, #1
    mov         DST_R, DST_W
.if regs_shortage
    str         H, [sp, #4] /* save updated height to stack */
.endif
    bge         start_of_loop_label
.endm

/*
 * Registers are allocated in the following way by default:
 * d0, d1, d2, d3     - reserved for loading source pixel data
 * d4, d5, d6, d7     - reserved for loading destination pixel data
 * d24, d25, d26, d27 - reserved for loading mask pixel data
 * d28, d29, d30, d31 - final destination pixel data for writeback to memory
 */
.macro generate_composite_function fname, \
                                   src_bpp_, \
                                   mask_bpp_, \
                                   dst_w_bpp_, \
                                   flags, \
                                   pixblock_size_, \
                                   prefetch_distance, \
                                   init, \
                                   cleanup, \
                                   process_pixblock_head, \
                                   process_pixblock_tail, \
                                   process_pixblock_tail_head, \
                                   dst_w_basereg_ = 28, \
                                   dst_r_basereg_ = 4, \
                                   src_basereg_   = 0, \
                                   mask_basereg_  = 24

    .func fname
    .global fname
    /* For ELF format also set function visibility to hidden */
#ifdef __ELF__
    .hidden fname
    .type fname, %function
#endif
fname:
    push        {r4-r12, lr}        /* save all registers */

/*
 * Select the prefetch type for this function. If the prefetch distance is
 * set to 0 or one of the color formats is 24bpp, SIMPLE prefetch
 * has to be used instead of ADVANCED.
 */
    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_DEFAULT
.if prefetch_distance == 0
    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
.elseif (PREFETCH_TYPE_CURRENT > PREFETCH_TYPE_SIMPLE) && \
        ((src_bpp_ == 24) || (mask_bpp_ == 24) || (dst_w_bpp_ == 24))
    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_SIMPLE
.endif

/*
 * Make some macro arguments globally visible and accessible
 * from other macros
 */
    .set src_bpp, src_bpp_
    .set mask_bpp, mask_bpp_
    .set dst_w_bpp, dst_w_bpp_
    .set pixblock_size, pixblock_size_
    .set dst_w_basereg, dst_w_basereg_
    .set dst_r_basereg, dst_r_basereg_
    .set src_basereg, src_basereg_
    .set mask_basereg, mask_basereg_

/*
 * Assign symbolic names to registers
 */
    W           .req        r0      /* width (is updated during processing) */
    H           .req        r1      /* height (is updated during processing) */
    DST_W       .req        r2      /* destination buffer pointer for writes */
    DST_STRIDE  .req        r3      /* destination image stride */
    SRC         .req        r4      /* source buffer pointer */
    SRC_STRIDE  .req        r5      /* source image stride */
    DST_R       .req        r6      /* destination buffer pointer for reads */

    MASK        .req        r7      /* mask pointer */
    MASK_STRIDE .req        r8      /* mask stride */

    PF_CTL      .req        r9      /* combined lines counter and prefetch */
                                    /* distance increment counter */
    PF_X        .req        r10     /* pixel index in a scanline for current */
                                    /* prefetch position */
    PF_SRC      .req        r11     /* pointer to source scanline start */
                                    /* for prefetch purposes */
    PF_DST      .req        r12     /* pointer to destination scanline start */
                                    /* for prefetch purposes */
    PF_MASK     .req        r14     /* pointer to mask scanline start */
                                    /* for prefetch purposes */
/*
 * Check whether we have enough registers for all the local variables.
 * If we don't have enough registers, the original width and height are
 * kept on top of the stack (and the 'regs_shortage' variable is set to
 * indicate this for the rest of the code). Even if there are enough
 * registers, the allocation scheme may be a bit different depending on
 * whether the source or mask is unused.
 */
.if (PREFETCH_TYPE_CURRENT < PREFETCH_TYPE_ADVANCED)
    ORIG_W      .req        r10     /* saved original width */
    DUMMY       .req        r12     /* temporary register */
    .set        regs_shortage, 0
.elseif mask_bpp == 0
    ORIG_W      .req        r7      /* saved original width */
    DUMMY       .req        r8      /* temporary register */
    .set        regs_shortage, 0
.elseif src_bpp == 0
    ORIG_W      .req        r4      /* saved original width */
    DUMMY       .req        r5      /* temporary register */
    .set        regs_shortage, 0
.else
    ORIG_W      .req        r1      /* saved original width */
    DUMMY       .req        r1      /* temporary register */
    .set        regs_shortage, 1
.endif

    .set mask_bpp_shift, -1
.if src_bpp == 32
    .set src_bpp_shift, 2
.elseif src_bpp == 24
    .set src_bpp_shift, 0
.elseif src_bpp == 16
    .set src_bpp_shift, 1
.elseif src_bpp == 8
    .set src_bpp_shift, 0
.elseif src_bpp == 0
    .set src_bpp_shift, -1
.else
    .error "requested src bpp (src_bpp) is not supported"
.endif
.if mask_bpp == 32
    .set mask_bpp_shift, 2
.elseif mask_bpp == 24
    .set mask_bpp_shift, 0
.elseif mask_bpp == 8
    .set mask_bpp_shift, 0
.elseif mask_bpp == 0
    .set mask_bpp_shift, -1
.else
    .error "requested mask bpp (mask_bpp) is not supported"
.endif
.if dst_w_bpp == 32
    .set dst_bpp_shift, 2
.elseif dst_w_bpp == 24
    .set dst_bpp_shift, 0
.elseif dst_w_bpp == 16
    .set dst_bpp_shift, 1
.elseif dst_w_bpp == 8
    .set dst_bpp_shift, 0
.else
    .error "requested dst bpp (dst_w_bpp) is not supported"
.endif

.if (((flags) & FLAG_DST_READWRITE) != 0)
    .set dst_r_bpp, dst_w_bpp
.else
    .set dst_r_bpp, 0
.endif
.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
    .set DEINTERLEAVE_32BPP_ENABLED, 1
.else
    .set DEINTERLEAVE_32BPP_ENABLED, 0
.endif

.if prefetch_distance < 0 || prefetch_distance > 15
    .error "invalid prefetch distance (prefetch_distance)"
.endif

.if src_bpp > 0
    ldr         SRC, [sp, #40]
.endif
.if mask_bpp > 0
    ldr         MASK, [sp, #48]
.endif
    PF mov      PF_X, #0
.if src_bpp > 0
    ldr         SRC_STRIDE, [sp, #44]
.endif
.if mask_bpp > 0
    ldr         MASK_STRIDE, [sp, #52]
.endif
    mov         DST_R, DST_W

.if src_bpp == 24
    sub         SRC_STRIDE, SRC_STRIDE, W
    sub         SRC_STRIDE, SRC_STRIDE, W, lsl #1
.endif
.if mask_bpp == 24
    sub         MASK_STRIDE, MASK_STRIDE, W
    sub         MASK_STRIDE, MASK_STRIDE, W, lsl #1
.endif
.if dst_w_bpp == 24
    sub         DST_STRIDE, DST_STRIDE, W
    sub         DST_STRIDE, DST_STRIDE, W, lsl #1
.endif

/*
 * Setup advanced prefetcher initial state
 */
    PF mov      PF_SRC, SRC
    PF mov      PF_DST, DST_R
    PF mov      PF_MASK, MASK
    /* PF_CTL = prefetch_distance | ((h - 1) << 4) */
    PF mov      PF_CTL, H, lsl #4
    PF add      PF_CTL, #(prefetch_distance - 0x10)

    init
.if regs_shortage
    push        {r0, r1}
.endif
    subs        H, H, #1
.if regs_shortage
    str         H, [sp, #4] /* save updated height to stack */
.else
    mov         ORIG_W, W
.endif
    blt         9f
    cmp         W, #(pixblock_size * 2)
    blt         8f
/*
 * This is the start of the pipelined loop, which is optimized for
 * long scanlines
 */
0:
    ensure_destination_ptr_alignment process_pixblock_head, \
                                     process_pixblock_tail, \
                                     process_pixblock_tail_head

    /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
    pixld_a     pixblock_size, dst_r_bpp, \
                (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
    pixld       pixblock_size, src_bpp, \
                (src_basereg - pixblock_size * src_bpp / 64), SRC
    pixld       pixblock_size, mask_bpp, \
                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
    PF add      PF_X, PF_X, #pixblock_size
    process_pixblock_head
    cache_preload 0, pixblock_size
    cache_preload_simple
    subs        W, W, #(pixblock_size * 2)
    blt         2f
1:
    process_pixblock_tail_head
    cache_preload_simple
    subs        W, W, #pixblock_size
    bge         1b
2:
    process_pixblock_tail
    pixst_a     pixblock_size, dst_w_bpp, \
                (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W

    /* Process the remaining trailing pixels in the scanline */
    process_trailing_pixels 1, 1, \
                            process_pixblock_head, \
                            process_pixblock_tail, \
                            process_pixblock_tail_head
    advance_to_next_scanline 0b

.if regs_shortage
    pop         {r0, r1}
.endif
    cleanup
    pop         {r4-r12, pc}  /* exit */
/*
 * This is the start of the loop, designed to process images with small width
 * (less than pixblock_size * 2 pixels). In this case neither pipelining
 * nor prefetch is used.
 */
8:
    /* Process exactly pixblock_size pixels if needed */
    tst         W, #pixblock_size
    beq         1f
    pixld       pixblock_size, dst_r_bpp, \
                (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
    pixld       pixblock_size, src_bpp, \
                (src_basereg - pixblock_size * src_bpp / 64), SRC
    pixld       pixblock_size, mask_bpp, \
                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
    process_pixblock_head
    process_pixblock_tail
    pixst       pixblock_size, dst_w_bpp, \
                (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
1:
    /* Process the remaining trailing pixels in the scanline */
    process_trailing_pixels 0, 0, \
                            process_pixblock_head, \
                            process_pixblock_tail, \
                            process_pixblock_tail_head
    advance_to_next_scanline 8b
9:
.if regs_shortage
    pop         {r0, r1}
.endif
    cleanup
    pop         {r4-r12, pc}  /* exit */

    .unreq      SRC
    .unreq      MASK
    .unreq      DST_R
    .unreq      DST_W
    .unreq      ORIG_W
    .unreq      W
    .unreq      H
    .unreq      SRC_STRIDE
    .unreq      DST_STRIDE
    .unreq      MASK_STRIDE
    .unreq      PF_CTL
    .unreq      PF_X
    .unreq      PF_SRC
    .unreq      PF_DST
    .unreq      PF_MASK
    .unreq      DUMMY
    .endfunc
.endm

/*
 * A simplified variant of the function generation template for single
 * scanline processing (for implementing pixman combine functions)
 */
.macro generate_composite_function_single_scanline fname, \
                                                   src_bpp_, \
                                                   mask_bpp_, \
                                                   dst_w_bpp_, \
                                                   flags, \
                                                   pixblock_size_, \
                                                   init, \
                                                   cleanup, \
                                                   process_pixblock_head, \
                                                   process_pixblock_tail, \
                                                   process_pixblock_tail_head, \
                                                   dst_w_basereg_ = 28, \
                                                   dst_r_basereg_ = 4, \
                                                   src_basereg_   = 0, \
                                                   mask_basereg_  = 24

    .func fname
    .global fname
    /* For ELF format also set function visibility to hidden */
#ifdef __ELF__
    .hidden fname
    .type fname, %function
#endif
fname:
    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
/*
 * Make some macro arguments globally visible and accessible
 * from other macros
 */
    .set src_bpp, src_bpp_
    .set mask_bpp, mask_bpp_
    .set dst_w_bpp, dst_w_bpp_
    .set pixblock_size, pixblock_size_
    .set dst_w_basereg, dst_w_basereg_
    .set dst_r_basereg, dst_r_basereg_
    .set src_basereg, src_basereg_
    .set mask_basereg, mask_basereg_
/*
 * Assign symbolic names to registers
 */
    W           .req        r0      /* width (is updated during processing) */
    DST_W       .req        r1      /* destination buffer pointer for writes */
    SRC         .req        r2      /* source buffer pointer */
    DST_R       .req        ip      /* destination buffer pointer for reads */
    MASK        .req        r3      /* mask pointer */

.if (((flags) & FLAG_DST_READWRITE) != 0)
    .set dst_r_bpp, dst_w_bpp
.else
    .set dst_r_bpp, 0
.endif
.if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
    .set DEINTERLEAVE_32BPP_ENABLED, 1
.else
    .set DEINTERLEAVE_32BPP_ENABLED, 0
.endif

    init
    mov         DST_R, DST_W

    cmp         W, #pixblock_size
    blt         8f

    ensure_destination_ptr_alignment process_pixblock_head, \
                                     process_pixblock_tail, \
                                     process_pixblock_tail_head

    subs        W, W, #pixblock_size
    blt         7f

    /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
    pixld_a     pixblock_size, dst_r_bpp, \
                (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
    pixld       pixblock_size, src_bpp, \
                (src_basereg - pixblock_size * src_bpp / 64), SRC
    pixld       pixblock_size, mask_bpp, \
                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
    process_pixblock_head
    subs        W, W, #pixblock_size
    blt         2f
1:
    process_pixblock_tail_head
    subs        W, W, #pixblock_size
    bge         1b
2:
    process_pixblock_tail
    pixst_a     pixblock_size, dst_w_bpp, \
                (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
7:
    /* Process the remaining trailing pixels in the scanline (dst aligned) */
    process_trailing_pixels 0, 1, \
                            process_pixblock_head, \
                            process_pixblock_tail, \
                            process_pixblock_tail_head

    cleanup
    bx          lr  /* exit */
8:
    /* Process the remaining trailing pixels in the scanline (dst unaligned) */
    process_trailing_pixels 0, 0, \
                            process_pixblock_head, \
                            process_pixblock_tail, \
                            process_pixblock_tail_head

    cleanup
    bx          lr  /* exit */

    .unreq      SRC
    .unreq      MASK
    .unreq      DST_R
    .unreq      DST_W
    .unreq      W
    .endfunc
.endm

.macro default_init
.endm

.macro default_cleanup
.endm
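
/*
 * Minimal usage sketch: a function generated with the default (empty)
 * init/cleanup hooks only has to provide the three pixblock macros. The
 * example_* names below are hypothetical and only illustrate the expected
 * shape of a call site:
 *
 *    generate_composite_function_single_scanline \
 *        example_combine_asm_neon, 32, 0, 32, \
 *        FLAG_DST_READWRITE, \
 *        8, \
 *        default_init, default_cleanup, \
 *        example_process_pixblock_head, \
 *        example_process_pixblock_tail, \
 *        example_process_pixblock_tail_head
 */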