diff -r e4d67989cc36 -r 47c74d1534e1 genericopenlibs/liboil/src/multsum_sse.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/genericopenlibs/liboil/src/multsum_sse.c Fri Apr 16 16:46:38 2010 +0300 @@ -0,0 +1,96 @@ +/* +* Copyright (c) 2009 Nokia Corporation and/or its subsidiary(-ies). +* All rights reserved. +* This component and the accompanying materials are made available +* under the terms of "Eclipse Public License v1.0" +* which accompanies this distribution, and is available +* at the URL "http://www.eclipse.org/legal/epl-v10.html". +* +* Initial Contributors: +* Nokia Corporation - initial contribution. +* +* Contributors: +* +* Description: +* +*/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif +#include +#include +#include + +#define SSE_FUNCTION __attribute__((force_align_arg_pointer)) + +#define MULTSUM_SSE2_NSTRIDED(i) { \ + t1 = _mm_load_pd(&OIL_GET(src1, i, double)); \ + t2 = _mm_load_pd(&OIL_GET(src2, i, double)); \ + t1 = _mm_mul_pd(t1,t2); \ + sum.reg = _mm_add_pd(sum.reg,t1); \ +} +#define MULTSUM_SSE2_NSTRIDEDP(i) { \ + t1 = _mm_load_pd(&OIL_GET(src1, i*sstr1, double)); \ + t2 = _mm_loadl_pd(t2, &OIL_GET(src2, i*sstr2, double)); \ + t2 = _mm_loadh_pd(t2, &OIL_GET(src2, (i+1)*sstr2, double)); \ + t1 = _mm_mul_pd(t1,t2); \ + sum.reg = _mm_add_pd(sum.reg,t1); \ +} +#define MULTSUM_SSE2_STRIDED(i) { \ + t1 = _mm_loadl_pd(t1, &OIL_GET(src1, i*sstr1, double)); \ + t1 = _mm_loadh_pd(t1, &OIL_GET(src1, (i+1)*sstr1, double)); \ + t2 = _mm_loadl_pd(t2, &OIL_GET(src2, i*sstr2, double)); \ + t2 = _mm_loadh_pd(t2, &OIL_GET(src2, (i+1)*sstr2, double)); \ + t1 = _mm_mul_pd(t1,t2); \ + sum.reg = _mm_add_pd(sum.reg,t1); \ +} + + +#ifdef ENABLE_BROKEN_IMPLS +SSE_FUNCTION static void +multsum_f64_sse2_unroll4(double *dest, + const double *src1, int sstr1, + const double *src2, int sstr2, + int n) +{ + __m128d t1, t2; + union { + __m128d reg; + double vals[2]; + } sum; + int i = 0; + + sum.reg = _mm_setzero_pd(); + while (i < n-3) { + MULTSUM_SSE2_STRIDED(0); + MULTSUM_SSE2_STRIDED(2); + + OIL_INCREMENT(src1, 4*sstr1); + OIL_INCREMENT(src2, 4*sstr2); + i += 4; + } + while (i < n-1) { + MULTSUM_SSE2_STRIDED(0); + + OIL_INCREMENT(src1, 2*sstr1); + OIL_INCREMENT(src2, 2*sstr2); + i+=2; + } + *dest = sum.vals[0] + sum.vals[1]; + if (i < n) { + *dest += (OIL_GET(src1,0,double)*OIL_GET(src2,0,double)); + } +} +OIL_DEFINE_IMPL_FULL (multsum_f64_sse2_unroll4, multsum_f64, OIL_IMPL_FLAG_SSE2); +#endif + + + +#ifdef __SYMBIAN32__ + +OilFunctionImpl* __oil_function_impl_multsum_f64_sse2_unroll4, multsum_f64() { + return &_oil_function_impl_multsum_f64_sse2_unroll4, multsum_f64; +} +#endif +