|
1 #!/usr/bin/perl |
|
2 # |
|
3 |
|
4 |
|
5 |
|
# Emit the fixed preamble of the generated C file: the "autogenerated"
# banner, the license text and the #include lines.
#
# The heredoc is single-quoted on purpose.  The text contains no Perl
# variables, and an interpolating heredoc would expand "@schleef" inside the
# author's e-mail address as an (empty) array, printing "<ds.org>" instead
# of the real address.
print <<'EOF';
/* This file is autogenerated. Do not edit. */
/*
 * LIBOIL - Library of Optimized Inner Loops
 * Copyright (c) 2005 David A. Schleef <ds@schleef.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <math.h>

#include <liboil/liboil.h>
#include <liboil/liboilclasses.h>

EOF
|
46 |
|
47 |
|
# binary_pointer(KERNEL, PRECISION, OPERATOR)
#
# Print the C source of the pointer-walking implementation of an
# element-wise binary kernel, followed by the OIL_DEFINE_IMPL line that
# pairs it with KERNEL_PRECISION.
#
#   KERNEL     kernel base name (e.g. "add"); used to build the C identifiers
#   PRECISION  type suffix (e.g. "f32"); the C element type is
#              "oil_type_PRECISION"
#   OPERATOR   text of the C binary operator placed between the two operands
#
# The text is written with a plain print, i.e. to the currently selected
# output handle.
sub binary_pointer
{
  my $kernel = shift;
  my $precision = shift;
  my $type = "oil_type_$precision";
  my $operator = shift;

  # The generated loop tests "n > 0" rather than "n": with a bare "n" a
  # negative count would keep decrementing and walk far outside the arrays,
  # whereas the unroll4b/unroll4c variants simply do nothing in that case.
  print <<EOF;
static void
${kernel}_${precision}_pointer (${type} *dest, ${type} *src1, ${type} *src2, int n)
{
  while (n > 0) {
    *dest = *src1 ${operator} *src2;
    dest++;
    src1++;
    src2++;
    n--;
  }
}
OIL_DEFINE_IMPL (${kernel}_${precision}_pointer, ${kernel}_${precision});

EOF
}
|
72 |
|
# binary_unroll2(KERNEL, PRECISION, OPERATOR)
#
# Print the C source of the 2x-unrolled implementation of an element-wise
# binary kernel, followed by the OIL_DEFINE_IMPL line that pairs it with
# KERNEL_PRECISION.
#
#   KERNEL     kernel base name (e.g. "add"); used to build the C identifiers
#   PRECISION  type suffix (e.g. "f32"); the C element type is
#              "oil_type_PRECISION"
#   OPERATOR   text of the C binary operator placed between the two operands
#
# The generated function first peels one element when n is odd, then
# processes the remaining (even) count two elements per iteration.
# The text is written with a plain print, i.e. to the currently selected
# output handle.
sub binary_unroll2
{
  my $kernel = shift;
  my $precision = shift;
  my $type = "oil_type_$precision";
  my $operator = shift;

  # The peel is guarded with "n > 0": "n & 1" is also non-zero for negative
  # odd counts, which would store one element even though the main loop
  # (and the unroll4b/unroll4c variants) do nothing for n <= 0.
  print <<EOF;
static void
${kernel}_${precision}_unroll2 (${type} *dest, ${type} *src1, ${type} *src2, int n)
{
  int i;

  if (n > 0 && (n & 1)) {
    dest[0] = src1[0] ${operator} src2[0];
    dest++;
    src1++;
    src2++;
    n--;
  }
  for(i=0;i<n;i+=2){
    dest[i] = src1[i] ${operator} src2[i];
    dest[i+1] = src1[i+1] ${operator} src2[i+1];
  }
}
OIL_DEFINE_IMPL (${kernel}_${precision}_unroll2, ${kernel}_${precision});

EOF
}
|
103 |
|
# binary_unroll4a(KERNEL, PRECISION, OPERATOR)
#
# Print the C source of a 4x-unrolled implementation of an element-wise
# binary kernel, followed by the OIL_DEFINE_IMPL line that pairs it with
# KERNEL_PRECISION.
#
#   KERNEL     kernel base name (e.g. "add"); used to build the C identifiers
#   PRECISION  type suffix (e.g. "f32"); the C element type is
#              "oil_type_PRECISION"
#   OPERATOR   text of the C binary operator placed between the two operands
#
# The generated function peels single elements until the remaining count is
# a multiple of four, then processes four elements per iteration.
# The text is written with a plain print, i.e. to the currently selected
# output handle.
sub binary_unroll4a
{
  my $kernel = shift;
  my $precision = shift;
  my $type = "oil_type_$precision";
  my $operator = shift;

  # The peel loop is guarded with "n > 0": "n & 3" is also non-zero for
  # most negative counts, which would store up to three elements even
  # though the main loop (and the unroll4b/unroll4c variants) do nothing
  # for n <= 0.
  print <<EOF;
static void
${kernel}_${precision}_unroll4a (${type} *dest, ${type} *src1, ${type} *src2, int n)
{
  int i;

  while (n > 0 && (n & 3)) {
    dest[0] = src1[0] ${operator} src2[0];
    dest++;
    src1++;
    src2++;
    n--;
  }
  for(i=0;i<n;i+=4){
    dest[i] = src1[i] ${operator} src2[i];
    dest[i+1] = src1[i+1] ${operator} src2[i+1];
    dest[i+2] = src1[i+2] ${operator} src2[i+2];
    dest[i+3] = src1[i+3] ${operator} src2[i+3];
  }
}
OIL_DEFINE_IMPL (${kernel}_${precision}_unroll4a, ${kernel}_${precision});

EOF
}
|
136 |
|
# binary_unroll4b(KERNEL, PRECISION, OPERATOR)
#
# Print the C source of the indexed 4x-unrolled implementation of an
# element-wise binary kernel, followed by the OIL_DEFINE_IMPL line that
# pairs it with KERNEL_PRECISION.
#
#   KERNEL     kernel base name (e.g. "add"); used to build the C identifiers
#   PRECISION  type suffix (e.g. "f32"); the C element type is
#              "oil_type_PRECISION"
#   OPERATOR   text of the C binary operator placed between the two operands
#
# The generated function handles the largest multiple of four first, four
# elements per iteration, and finishes the remainder one element at a time.
# The text is written with a plain print, i.e. to the currently selected
# output handle.
sub binary_unroll4b
{
  my ($kernel, $precision, $operator) = @_;

  # Identifiers shared by the function header and the OIL_DEFINE_IMPL line.
  my $type = 'oil_type_' . $precision;
  my $class = $kernel . '_' . $precision;
  my $impl = $class . '_unroll4b';

  print <<"EOF";
static void
${impl} (${type} *dest, ${type} *src1, ${type} *src2, int n)
{
  int i;

  for(i=0;i<(n&(~0x3));i+=4){
    dest[i+0] = src1[i+0] ${operator} src2[i+0];
    dest[i+1] = src1[i+1] ${operator} src2[i+1];
    dest[i+2] = src1[i+2] ${operator} src2[i+2];
    dest[i+3] = src1[i+3] ${operator} src2[i+3];
  }
  for(;i<n;i++){
    dest[i] = src1[i] ${operator} src2[i];
  }
}
OIL_DEFINE_IMPL (${impl}, ${class});

EOF
}
|
165 |
|
# binary_unroll4c(KERNEL, PRECISION, OPERATOR)
#
# Print the C source of the post-increment 4x-unrolled implementation of an
# element-wise binary kernel, followed by the OIL_DEFINE_IMPL line that
# pairs it with KERNEL_PRECISION.
#
#   KERNEL     kernel base name (e.g. "add"); used to build the C identifiers
#   PRECISION  type suffix (e.g. "f32"); the C element type is
#              "oil_type_PRECISION"
#   OPERATOR   text of the C binary operator placed between the two operands
#
# The generated function advances the three pointers with post-increment:
# four statements per iteration over the largest multiple of four, then one
# statement per iteration for the remainder.
# The text is written with a single print of one string, i.e. to the
# currently selected output handle.
sub binary_unroll4c
{
  my ($kernel, $precision, $operator) = @_;

  my $type = 'oil_type_' . $precision;
  my $class = $kernel . '_' . $precision;
  my $impl = $class . '_unroll4c';

  # The same C statement is used four times in the unrolled loop and once
  # in the tail loop.
  my $step = "    *dest++ = *src1++ $operator *src2++;\n";

  my $text = join '',
    "static void\n",
    "$impl ($type *dest, $type *src1, $type *src2, int n)\n",
    "{\n",
    "  int i;\n",
    "\n",
    "  for(i=0;i<(n&(~0x3));i+=4){\n",
    $step x 4,
    "  }\n",
    "  for(;i<n;i++){\n",
    $step,
    "  }\n",
    "}\n",
    "OIL_DEFINE_IMPL ($impl, $class);\n",
    "\n";

  print $text;
}
|
194 |
|
# Kernels to generate: kernel base name => C binary operator.
my %binary_operators = (
  "add" => "+",
  "subtract" => "-",
  "multiply" => "*",
  "divide" => "/",
);

# Precision suffixes; each generator turns one into "oil_type_<suffix>".
my @types = ( "f32", "f64" );

# Emit every implementation variant for every kernel/precision pair.
# The kernel names are visited in sorted order so that the generated file is
# identical from run to run; the iteration order of "each"/"keys" on a hash
# is unspecified and differs between runs of perl.
foreach my $name (sort keys %binary_operators) {
  my $op = $binary_operators{$name};
  foreach my $prec (@types) {
    binary_pointer($name, $prec, $op);
    binary_unroll2($name, $prec, $op);
    binary_unroll4a($name, $prec, $op);
    binary_unroll4b($name, $prec, $op);
    binary_unroll4c($name, $prec, $op);
  }
}

# Generation is complete; nothing below this line is executed.
exit 0;
|
215 |
|
# NOTE: unreachable.  The script calls "exit 0" before this point, so none
# of the calls below run.  They spell out, one by one, the same
# kernel/precision/variant combinations (subtract, add, multiply, divide for
# f32 and f64) that the loop over %binary_operators and @types produces.
binary_pointer("subtract", "f32", "-");
binary_unroll2("subtract", "f32", "-");
binary_unroll4a("subtract", "f32", "-");
binary_unroll4b("subtract", "f32", "-");
binary_unroll4c("subtract", "f32", "-");

binary_pointer("add", "f32", "+");
binary_unroll2("add", "f32", "+");
binary_unroll4a("add", "f32", "+");
binary_unroll4b("add", "f32", "+");
binary_unroll4c("add", "f32", "+");

binary_pointer("multiply", "f32", "*");
binary_unroll2("multiply", "f32", "*");
binary_unroll4a("multiply", "f32", "*");
binary_unroll4b("multiply", "f32", "*");
binary_unroll4c("multiply", "f32", "*");

binary_pointer("divide", "f32", "/");
binary_unroll2("divide", "f32", "/");
binary_unroll4a("divide", "f32", "/");
binary_unroll4b("divide", "f32", "/");
binary_unroll4c("divide", "f32", "/");

binary_pointer("subtract", "f64", "-");
binary_unroll2("subtract", "f64", "-");
binary_unroll4a("subtract", "f64", "-");
binary_unroll4b("subtract", "f64", "-");
binary_unroll4c("subtract", "f64", "-");

binary_pointer("add", "f64", "+");
binary_unroll2("add", "f64", "+");
binary_unroll4a("add", "f64", "+");
binary_unroll4b("add", "f64", "+");
binary_unroll4c("add", "f64", "+");

binary_pointer("multiply", "f64", "*");
binary_unroll2("multiply", "f64", "*");
binary_unroll4a("multiply", "f64", "*");
binary_unroll4b("multiply", "f64", "*");
binary_unroll4c("multiply", "f64", "*");

binary_pointer("divide", "f64", "/");
binary_unroll2("divide", "f64", "/");
binary_unroll4a("divide", "f64", "/");
binary_unroll4b("divide", "f64", "/");
binary_unroll4c("divide", "f64", "/");
|
263 |
|
# NOTE: unreachable.  The script calls "exit 0" before this assignment, and
# $blah is not read anywhere in the visible source, so this text is never
# emitted.
#
# The string holds hand-written C "_ref" functions for f32 kernels:
# subtract, multiply, divide, minimum, maximum, negative, inverse, sign,
# floor, scalaradd_ns and scalarmultiply_ns.
#
# NOTE(review): within this text, subtract_f32_ref and multiply_f32_ref are
# declared with OIL_DEFINE_IMPL while the others use OIL_DEFINE_IMPL_REF,
# and sign_f32_ref stores -x for negative x and x otherwise (an absolute
# value) -- confirm the intent before reviving any of it.
$blah = "
static void
subtract_f32_ref (float *dest, float *src1, float *src2, int n)
{
  int i;

  for(i=0;i<n;i++){
    dest[i] = src1[i] - src2[i];
  }
}
OIL_DEFINE_IMPL (subtract_f32_ref, subtract_f32);

static void
multiply_f32_ref (float *dest, float *src1, float *src2, int n)
{
  int i;

  for(i=0;i<n;i++){
    dest[i] = src1[i] * src2[i];
  }
}
OIL_DEFINE_IMPL (multiply_f32_ref, multiply_f32);

static void
divide_f32_ref (float *dest, float *src1, float *src2, int n)
{
  int i;

  for(i=0;i<n;i++){
    dest[i] = src1[i] / src2[i];
  }
}
OIL_DEFINE_IMPL_REF (divide_f32_ref, divide_f32);

static void
minimum_f32_ref (float *dest, float *src1, float *src2, int n)
{
  int i;

  for(i=0;i<n;i++){
    dest[i] = (src1[i] < src2[i]) ? src1[i] : src2[i];
  }
}
OIL_DEFINE_IMPL_REF (minimum_f32_ref, minimum_f32);

static void
maximum_f32_ref (float *dest, float *src1, float *src2, int n)
{
  int i;

  for(i=0;i<n;i++){
    dest[i] = (src1[i] > src2[i]) ? src1[i] : src2[i];
  }
}
OIL_DEFINE_IMPL_REF (maximum_f32_ref, maximum_f32);

static void
negative_f32_ref (float *dest, float *src1, int n)
{
  int i;

  for(i=0;i<n;i++){
    dest[i] = -src1[i];
  }
}
OIL_DEFINE_IMPL_REF (negative_f32_ref, negative_f32);

static void
inverse_f32_ref (float *dest, float *src1, int n)
{
  int i;

  for(i=0;i<n;i++){
    dest[i] = 1.0/src1[i];
  }
}
OIL_DEFINE_IMPL_REF (inverse_f32_ref, inverse_f32);

static void
sign_f32_ref (float *dest, float *src1, int n)
{
  int i;

  for(i=0;i<n;i++){
    dest[i] = (src1[i] < 0) ? -src1[i] : src1[i];
  }
}
OIL_DEFINE_IMPL_REF (sign_f32_ref, sign_f32);

static void
floor_f32_ref (float *dest, float *src1, int n)
{
  int i;

  for(i=0;i<n;i++){
    dest[i] = floor(src1[i]);
  }
}
OIL_DEFINE_IMPL_REF (floor_f32_ref, floor_f32);



static void
scalaradd_f32_ns_ref (float *dest, float *src1, float *src2, int n)
{
  int i;

  for(i=0;i<n;i++){
    dest[i] = src1[i] + src2[0];
  }
}
OIL_DEFINE_IMPL_REF (scalaradd_f32_ns_ref, scalaradd_f32_ns);

static void
scalarmultiply_f32_ns_ref (float *dest, float *src1, float *src2, int n)
{
  int i;

  for(i=0;i<n;i++){
    dest[i] = src1[i] * src2[0];
  }
}
OIL_DEFINE_IMPL_REF (scalarmultiply_f32_ns_ref, scalarmultiply_f32_ns);


";