diff -Nru boinc-app-seti-7.28~svn2781/client/vector/analyzeFuncs_neon.S boinc-app-seti-7.28~svn2858/client/vector/analyzeFuncs_neon.S --- boinc-app-seti-7.28~svn2781/client/vector/analyzeFuncs_neon.S 2015-02-23 15:18:24.000000000 +0000 +++ boinc-app-seti-7.28~svn2858/client/vector/analyzeFuncs_neon.S 2015-04-08 17:04:24.000000000 +0000 @@ -40,17 +40,17 @@ * vfp_ChirpData.S * Author: Mateusz Szpakowski */ - + #if defined(__VFP_FP__) && !defined(__SOFTFP__) - .syntax unified + .syntax unified #endif .arch armv7-a .fpu neon #if defined(__VFP_FP__) && !defined(__SOFTFP__) - .eabi_attribute 27, 3 - .eabi_attribute 28, 1 + .eabi_attribute 27, 3 + .eabi_attribute 28, 1 #endif - + .eabi_attribute 20, 1 .eabi_attribute 21, 1 .eabi_attribute 23, 3 @@ -590,17 +590,17 @@ * neon_FoldSubs.S * Author: Mateusz Szpakowski */ - + #if defined(__VFP_FP__) && !defined(__SOFTFP__) - .syntax unified -#endif + .syntax unified +#endif .arch armv7-a .fpu neon #if defined(__VFP_FP__) && !defined(__SOFTFP__) - .eabi_attribute 27, 3 - .eabi_attribute 28, 1 + .eabi_attribute 27, 3 + .eabi_attribute 28, 1 #endif - + .eabi_attribute 20, 1 .eabi_attribute 21, 1 .eabi_attribute 23, 3 @@ -2497,16 +2497,17 @@ * _Z21neon_GetPowerSpectrumPA2_fPfi.S * Author: Mateusz Szpakowski */ + #if defined(__VFP_FP__) && !defined(__SOFTFP__) - .syntax unified -#endif + .syntax unified +#endif .arch armv7-a .fpu neon #if defined(__VFP_FP__) && !defined(__SOFTFP__) - .eabi_attribute 27, 3 - .eabi_attribute 28, 1 + .eabi_attribute 27, 3 + .eabi_attribute 28, 1 #endif - + .eabi_attribute 20, 1 .eabi_attribute 21, 1 .eabi_attribute 23, 3 diff -Nru boinc-app-seti-7.28~svn2781/client/vector/analyzeFuncs_vfp.S boinc-app-seti-7.28~svn2858/client/vector/analyzeFuncs_vfp.S --- boinc-app-seti-7.28~svn2781/client/vector/analyzeFuncs_vfp.S 2015-02-23 15:18:24.000000000 +0000 +++ boinc-app-seti-7.28~svn2858/client/vector/analyzeFuncs_vfp.S 2015-04-08 17:04:24.000000000 +0000 @@ -40,17 +40,17 @@ * vfp_ChirpData.S * Author: Mateusz Szpakowski */ - + #if defined(__VFP_FP__) && !defined(__SOFTFP__) - .syntax unified -#endif + .syntax unified +#endif .arch armv6 .fpu vfp #if defined(__VFP_FP__) && !defined(__SOFTFP__) - .eabi_attribute 27, 3 - .eabi_attribute 28, 1 + .eabi_attribute 27, 3 + .eabi_attribute 28, 1 #endif - + .eabi_attribute 20, 1 .eabi_attribute 21, 1 .eabi_attribute 23, 3 @@ -623,17 +623,17 @@ * vfp_FoldSubs.S * Author: Mateusz Szpakowski */ - + #if defined(__VFP_FP__) && !defined(__SOFTFP__) - .syntax unified -#endif + .syntax unified +#endif .arch armv6 .fpu vfp #if defined(__VFP_FP__) && !defined(__SOFTFP__) - .eabi_attribute 27, 3 - .eabi_attribute 28, 1 + .eabi_attribute 27, 3 + .eabi_attribute 28, 1 #endif - + .eabi_attribute 20, 1 .eabi_attribute 21, 1 .eabi_attribute 23, 3 @@ -2133,17 +2133,17 @@ * vfp_GetPowerSpectrum.S * Author: Mateusz Szpakowski */ - + #if defined(__VFP_FP__) && !defined(__SOFTFP__) - .syntax unified -#endif + .syntax unified +#endif .arch armv6 .fpu vfp #if defined(__VFP_FP__) && !defined(__SOFTFP__) - .eabi_attribute 27, 3 - .eabi_attribute 28, 1 + .eabi_attribute 27, 3 + .eabi_attribute 28, 1 #endif - + .eabi_attribute 20, 1 .eabi_attribute 21, 1 .eabi_attribute 23, 3 diff -Nru boinc-app-seti-7.28~svn2781/debian/bzr-builder.manifest boinc-app-seti-7.28~svn2858/debian/bzr-builder.manifest --- boinc-app-seti-7.28~svn2781/debian/bzr-builder.manifest 2015-02-23 15:18:24.000000000 +0000 +++ boinc-app-seti-7.28~svn2858/debian/bzr-builder.manifest 2015-04-08 17:04:25.000000000 +0000 @@ -1,2 +1,2 @@ -# bzr-builder format 0.4 deb-version 7.28~svn2781-0~259 -lp:~costamagnagianfranco/boinc-app-seti/trunk revid:git-v1:f845986547e50773ae0be545a56a0b5478cbec85 +# bzr-builder format 0.4 deb-version 7.28~svn2858-0~261 +lp:~costamagnagianfranco/boinc-app-seti/trunk revid:git-v1:a6d24076110fedd66f59c66bdc7f0c1a4bbd3eb2 diff -Nru boinc-app-seti-7.28~svn2781/debian/changelog boinc-app-seti-7.28~svn2858/debian/changelog --- boinc-app-seti-7.28~svn2781/debian/changelog 2015-02-23 15:18:24.000000000 +0000 +++ boinc-app-seti-7.28~svn2858/debian/changelog 2015-04-08 17:04:25.000000000 +0000 @@ -1,14 +1,12 @@ -boinc-app-seti (7.28~svn2781-0~259~ubuntu14.10.1) utopic; urgency=low +boinc-app-seti (7.28~svn2858-0~261~ubuntu14.10.1) utopic; urgency=low * Auto build. - -- LocutusOfBorg Mon, 23 Feb 2015 15:18:24 +0000 + -- LocutusOfBorg Wed, 08 Apr 2015 17:04:25 +0000 -boinc-app-seti (7.28~svn2781-1) UNRELEASED; urgency=medium +boinc-app-seti (7.28~svn2858-1) UNRELEASED; urgency=medium * New upstream release. - * Add d/p/215-remove-vfp-Chirp.patch and tweak 214_fix_armhf.patch, - thanks Stephen Maclagan! * d/p/216-fix-build.patch fix build error -- Gianfranco Costamagna Mon, 23 Feb 2015 14:49:35 +0100 diff -Nru boinc-app-seti-7.28~svn2781/.pc/214_fix_armhf.patch/client/vector/analyzeFuncs_neon.S boinc-app-seti-7.28~svn2858/.pc/214_fix_armhf.patch/client/vector/analyzeFuncs_neon.S --- boinc-app-seti-7.28~svn2781/.pc/214_fix_armhf.patch/client/vector/analyzeFuncs_neon.S 2015-02-23 15:18:24.000000000 +0000 +++ boinc-app-seti-7.28~svn2858/.pc/214_fix_armhf.patch/client/vector/analyzeFuncs_neon.S 1970-01-01 00:00:00.000000000 +0000 @@ -1,2690 +0,0 @@ -// Copyright (c) 1999-2011 Regents of the University of California -// -// FFTW: Copyright (c) 2003,2006 Matteo Frigo -// Copyright (c) 2003,2006 Massachusets Institute of Technology -// -// fft8g.[cpp,h]: Copyright (c) 1995-2001 Takya Ooura -// -// ASMLIB: Copyright (c) 2004 Agner Fog - -// This program is free software; you can redistribute it and/or modify it -// under the terms of the GNU General Public License as published by the -// Free Software Foundation; either version 2, or (at your option) any later -// version. - -// This program is distributed in the hope that it will be useful, but WITHOUT -// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for -// more details. - -// You should have received a copy of the GNU General Public License along -// with this program; see the file COPYING. If not, write to the Free Software -// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - -// In addition, as a special exception, the Regents of the University of -// California give permission to link the code of this program with libraries -// that provide specific optimized fast Fourier transform (FFT) functions -// as an alternative to FFTW and distribute a linked executable and -// source code. You must obey the GNU General Public License in all -// respects for all of the code used other than the FFT library itself. -// Any modification required to support these libraries must be distributed -// under the terms of this license. If you modify this program, you may extend -// this exception to your version of the program, but you are not obligated to -// do so. If you do not wish to do so, delete this exception statement from -// your version. Please be aware that FFTW and ASMLIB are not covered by -// this exception, therefore you may not use FFTW and ASMLIB in any derivative -// work so modified without permission of the authors of those packages. - -#ifdef __arm__ -/* - * vfp_ChirpData.S - * Author: Mateusz Szpakowski - */ - - .arch armv7-a - .fpu neon - .eabi_attribute 20, 1 - .eabi_attribute 21, 1 - .eabi_attribute 23, 3 - .eabi_attribute 24, 1 - .eabi_attribute 25, 1 - .eabi_attribute 26, 2 - .eabi_attribute 30, 2 - .eabi_attribute 18, 4 - .text - .align 2 -.Lhalfd: - .double 0.5 -.LroundVal: - .double 4503599627370496.0 - .align 4 -.Linc8d: - .double 0,1,2,3,4,5,6,7 -.Lsinapprox: - .float 1.5707963268,1.5707963268,1.5707963268,1.5707963268 - .float -0.6466386396,-0.6466386396,-0.6466386396,-0.6466386396 - .float 0.0679105987,0.0679105987,0.0679105987,0.0679105987 - .float -0.0011573807,-0.0011573807,-0.0011573807,-0.0011573807 -.Lcosapprox: - .float 1.0,1.0,1.0,1.0 - .float -1.2341299769,-1.2341299769,-1.2341299769,-1.2341299769 - .float 0.2465220241,0.2465220241,0.2465220241,0.2465220241 - .float -0.0123926179,-0.0123926179,-0.0123926179,-0.0123926179 -.Ltwosq: - .float 2.0,2.0,2.0,2.0 - .float 2.0,2.0,2.0,2.0 - .float 2.0,2.0,2.0,2.0 - .float 2.0,2.0,2.0,2.0 - - .align 2 - .global _Z14neon_ChirpDataPA2_fS0_idid - .type _Z14neon_ChirpDataPA2_fS0_idid, %function -_Z14neon_ChirpDataPA2_fS0_idid: - push {r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} - vpush {d8,d9,d10,d11,d12,d13,d14,d15} -#define stargidx (40+64) - /* r0 - input data - * r1 - output data - * r2 - chirprateind - * sp[0-1] - chirp_rate - * sp[2] - numDataPoints - * sp[4-5] - sample_rate - */ - tst r2,r2 - bne .Lrealfunc - mov r3,r0 // swap addresses - mov r0,r1 - mov r1,r3 - ldr r2,[sp,#stargidx+8] // numDataPoints - lsl r2,r2,#3 - bl memcpy(PLT) - b .Lendoffunc - /* - * real functions - */ -.Lrealfunc: - ldr r4,.LGOT1 -.LPIC1: - add r4,pc,r4 - ldr r5,.LGOT1+4 - ldr r5,[r4,r5] - - ldr r3,[sp,#stargidx+8] // numDataPoints - - add r6,r3,r3,lsl #1 - lsl r6,r6,#2 - fldd d0,[r5,#32] - fmsr s4,r6 - fuitod d1,s4 - faddd d0,d0,d1 - fstd d0,[r5,#32] - - add r3,r0,r3,lsl #3 - sub r3,r3,#8*15 - - fldd d11,.Lhalfd - fldd d9,[sp,#stargidx] // chirp_rate - fldd d10,[sp,#stargidx+16] // sample_rate - fmuld d10,d10,d10 - fmuld d9,d9,d11 - fdivd d9,d9,d10 - - fldd d10,.LroundVal - fsubd d11,d11,d11 // zero - fcmpd d9,d11 - fmstat - fnegdmi d10,d10 // negate is negative srate - - sub sp,sp,#16+64 - add r7,sp,#16 - add r11,sp,#16+32 - fstmiad sp,{d9,d10} - - mov r4,#0 // i - - adr r5,.Linc8d - adr r6,.Lsinapprox - adr r10,.Lcosapprox - adr r9,.Ltwosq - - cmp r0,r3 - bhs .Lendmainloop -.Lmainloop: - add r8,r4,#8 - fmsr s8,r4 - fmsr s9,r8 - fuitod d0,s8 - fuitod d1,s9 - fldmiad r5,{d8,d9,d10,d11,d12,d13,d14,d15} - fldmiad sp,{d2,d3} - faddd d16,d8,d0 - faddd d17,d9,d0 - faddd d18,d10,d0 - faddd d19,d11,d0 - faddd d20,d12,d0 - faddd d21,d13,d0 - faddd d22,d14,d0 - faddd d23,d15,d0 - faddd d24,d8,d1 - faddd d25,d9,d1 - faddd d26,d10,d1 - faddd d27,d11,d1 - faddd d28,d12,d1 - faddd d29,d13,d1 - faddd d30,d14,d1 - faddd d31,d15,d1 - // square of i - fmuld d16,d16,d16 - fmuld d17,d17,d17 - fmuld d18,d18,d18 - fmuld d19,d19,d19 - fmuld d20,d20,d20 - fmuld d21,d21,d21 - fmuld d22,d22,d22 - fmuld d23,d23,d23 - fmuld d24,d24,d24 - fmuld d25,d25,d25 - fmuld d26,d26,d26 - fmuld d27,d27,d27 - fmuld d28,d28,d28 - fmuld d29,d29,d29 - fmuld d30,d30,d30 - fmuld d31,d31,d31 - // multiply by srate - fmuld d16,d16,d2 - fmuld d17,d17,d2 - fmuld d18,d18,d2 - fmuld d19,d19,d2 - fmuld d20,d20,d2 - fmuld d21,d21,d2 - fmuld d22,d22,d2 - fmuld d23,d23,d2 - fmuld d24,d24,d2 - fmuld d25,d25,d2 - fmuld d26,d26,d2 - fmuld d27,d27,d2 - fmuld d28,d28,d2 - fmuld d29,d29,d2 - fmuld d30,d30,d2 - fmuld d31,d31,d2 - - // rounding to -0.5/+0.5 - faddd d8,d16,d3 - faddd d9,d17,d3 - faddd d10,d18,d3 - faddd d11,d19,d3 - faddd d12,d20,d3 - faddd d13,d21,d3 - faddd d14,d22,d3 - faddd d15,d23,d3 - fsubd d8,d8,d3 - fsubd d9,d9,d3 - fsubd d10,d10,d3 - fsubd d11,d11,d3 - fsubd d12,d12,d3 - fsubd d13,d13,d3 - fsubd d14,d14,d3 - fsubd d15,d15,d3 - fsubd d16,d16,d8 - fsubd d17,d17,d9 - fsubd d18,d18,d10 - fsubd d19,d19,d11 - fsubd d20,d20,d12 - fsubd d21,d21,d13 - fsubd d22,d22,d14 - fsubd d23,d23,d15 - - faddd d8,d24,d3 - faddd d9,d25,d3 - faddd d10,d26,d3 - faddd d11,d27,d3 - faddd d12,d28,d3 - faddd d13,d29,d3 - faddd d14,d30,d3 - faddd d15,d31,d3 - fsubd d8,d8,d3 - fsubd d9,d9,d3 - fsubd d10,d10,d3 - fsubd d11,d11,d3 - fsubd d12,d12,d3 - fsubd d13,d13,d3 - fsubd d14,d14,d3 - fsubd d15,d15,d3 - fsubd d24,d24,d8 - fsubd d25,d25,d9 - fsubd d26,d26,d10 - fsubd d27,d27,d11 - fsubd d28,d28,d12 - fsubd d29,d29,d13 - fsubd d30,d30,d14 - fsubd d31,d31,d15 - // to single precision - fcvtsd s0,d16 - fcvtsd s1,d17 - fcvtsd s2,d18 - fcvtsd s3,d19 - fcvtsd s4,d20 - fcvtsd s5,d21 - fcvtsd s6,d22 - fcvtsd s7,d23 - fcvtsd s8,d24 - fcvtsd s9,d25 - fcvtsd s10,d26 - fcvtsd s11,d27 - fcvtsd s12,d28 - fcvtsd s13,d29 - fcvtsd s14,d30 - fcvtsd s15,d31 - - //fldmias r6,{s8,s9,s10,s11,s12,s13,s14,s15} - vldmia r6,{q8,q9,q10,q11} - // square of y - vmul.f32 q4,q0,q0 - vmul.f32 q5,q1,q1 - vmul.f32 q6,q2,q2 - vmul.f32 q7,q3,q3 - // sine - vmul.f32 q12,q4,q11 - vmul.f32 q13,q5,q11 - vmul.f32 q14,q6,q11 - vmul.f32 q15,q7,q11 - vadd.f32 q12,q12,q10 - vadd.f32 q13,q13,q10 - vadd.f32 q14,q14,q10 - vadd.f32 q15,q15,q10 - vmul.f32 q12,q12,q4 - vmul.f32 q13,q13,q5 - vmul.f32 q14,q14,q6 - vmul.f32 q15,q15,q7 - vadd.f32 q12,q12,q9 - vadd.f32 q13,q13,q9 - vadd.f32 q14,q14,q9 - vadd.f32 q15,q15,q9 - vmul.f32 q12,q12,q4 - vmul.f32 q13,q13,q5 - vmul.f32 q14,q14,q6 - vmul.f32 q15,q15,q7 - vadd.f32 q12,q12,q8 - vadd.f32 q13,q13,q8 - vadd.f32 q14,q14,q8 - vadd.f32 q15,q15,q8 - vmul.f32 q12,q12,q0 - vmul.f32 q13,q13,q1 - vmul.f32 q14,q14,q2 - vmul.f32 q15,q15,q3 - vldmia r10,{q0,q1,q2,q3} - // cosine - vmul.f32 q8,q4,q3 - vmul.f32 q9,q5,q3 - vmul.f32 q10,q6,q3 - vmul.f32 q11,q7,q3 - vadd.f32 q8,q8,q2 - vadd.f32 q9,q9,q2 - vadd.f32 q10,q10,q2 - vadd.f32 q11,q11,q2 - vmul.f32 q8,q8,q4 - vmul.f32 q9,q9,q5 - vmul.f32 q10,q10,q6 - vmul.f32 q11,q11,q7 - vadd.f32 q8,q8,q1 - vadd.f32 q9,q9,q1 - vadd.f32 q10,q10,q1 - vadd.f32 q11,q11,q1 - vmul.f32 q8,q8,q4 - vmul.f32 q9,q9,q5 - vmul.f32 q10,q10,q6 - vmul.f32 q11,q11,q7 - vadd.f32 q8,q8,q0 - vadd.f32 q9,q9,q0 - vadd.f32 q10,q10,q0 - vadd.f32 q11,q11,q0 - // q8-q11 - cosine, q12-q15 - sine - // doubling cosine/sine - vmul.f32 q4,q8,q12 // c*s - vmul.f32 q5,q9,q13 - vmul.f32 q6,q10,q14 - vmul.f32 q7,q11,q15 - vmul.f32 q0,q8,q8 // c*c - vmul.f32 q1,q9,q9 - vmul.f32 q2,q10,q10 - vmul.f32 q3,q11,q11 - vmls.f32 q0,q12,q12 // c*c-s*s - vmls.f32 q1,q13,q13 - vmls.f32 q2,q14,q14 - vmls.f32 q3,q15,q15 - vadd.f32 q4,q4,q4 // 2*c*s - vadd.f32 q5,q5,q5 - vadd.f32 q6,q6,q6 - vadd.f32 q7,q7,q7 - // next doubling cosine/sine - vmul.f32 q8,q0,q4 // cd1 = x*y - vmul.f32 q9,q1,q5 - vmul.f32 q10,q2,q6 - vmul.f32 q11,q3,q7 - vmul.f32 q4,q4,q4 // cd3 = y*y - vmul.f32 q5,q5,q5 - vmul.f32 q6,q6,q6 - vmul.f32 q7,q7,q7 - vmul.f32 q0,q0,q0 // cd2 = x*x - vmul.f32 q1,q1,q1 - vmul.f32 q2,q2,q2 - vmul.f32 q3,q3,q3 - vadd.f32 q12,q0,q4 // norm = cd2+cd3 - vadd.f32 q13,q1,q5 - vadd.f32 q14,q2,q6 - vadd.f32 q15,q3,q7 - vadd.f32 q8,q8,q8 // s = cd1*2 - vadd.f32 q9,q9,q9 - vadd.f32 q10,q10,q10 - vadd.f32 q11,q11,q11 - vsub.f32 q0,q0,q4 // c = cd2-cd3 - vsub.f32 q1,q1,q5 - vsub.f32 q2,q2,q6 - vsub.f32 q3,q3,q7 - // c - q0-q3, s - q8-q11, norm - q12-q15 - vstmia r7,{q8,q9,q10,q11} - vldmia r9,{q4,q5,q6,q7} - // reciprocal of magnitude - // iter 1: invmag = 2.0-mag - vsub.f32 q4,q4,q12 - vsub.f32 q5,q5,q13 - vsub.f32 q6,q6,q14 - vsub.f32 q7,q7,q15 - // iter 2: invmag = invmag*(2.0-mag*invmag) - vrecps.f32 q8,q4,q12 - vrecps.f32 q9,q5,q13 - vrecps.f32 q10,q6,q14 - vrecps.f32 q11,q7,q15 - vmul.f32 q4,q4,q8 - vmul.f32 q5,q5,q9 - vmul.f32 q6,q6,q10 - vmul.f32 q7,q7,q11 - vldmia r7,{q8,q9,q10,q11} - // invnorm - q4-q7 - // correct cosine/sine - vmul.f32 q0,q0,q4 - vmul.f32 q1,q1,q5 - vmul.f32 q2,q2,q6 - vmul.f32 q3,q3,q7 - vmul.f32 q8,q8,q4 - vmul.f32 q9,q9,q5 - vmul.f32 q10,q10,q6 - vmul.f32 q11,q11,q7 - vzip.32 q0,q8 - vzip.32 q1,q9 - vzip.32 q2,q10 - vzip.32 q3,q11 - vstmia r7,{q2,q3} - vstmia r11,{q10,q11} - // multiply by data - pld [r0,#256] - vldmia r0!,{q12,q13,q14,q15} - vmul.f32 q4,q12,q0 - vmul.f32 q5,q13,q8 - vmul.f32 q6,q14,q1 - vmul.f32 q7,q15,q9 - vrev64.32 q2,q0 - vrev64.32 q10,q8 - vrev64.32 q3,q1 - vrev64.32 q11,q9 - vmul.f32 q2,q2,q12 - vmul.f32 q10,q10,q13 - vmul.f32 q3,q3,q14 - vmul.f32 q11,q11,q15 - fnegs s17,s17 - fnegs s19,s19 - fnegs s21,s21 - fnegs s23,s23 - fnegs s25,s25 - fnegs s27,s27 - fnegs s29,s29 - fnegs s31,s31 - vtrn.32 q4,q2 - vtrn.32 q5,q10 - vtrn.32 q6,q3 - vtrn.32 q7,q11 - vadd.f32 q4,q4,q2 - vadd.f32 q5,q5,q10 - vadd.f32 q6,q6,q3 - vadd.f32 q7,q7,q11 - vstmia r1!,{q4,q5,q6,q7} - vldmia r7,{q0,q1} - vldmia r11,{q8,q9} - pld [r0,#256] - vldmia r0!,{q12,q13,q14,q15} - vmul.f32 q4,q12,q0 - vmul.f32 q5,q13,q8 - vmul.f32 q6,q14,q1 - vmul.f32 q7,q15,q9 - vrev64.32 q2,q0 - vrev64.32 q10,q8 - vrev64.32 q3,q1 - vrev64.32 q11,q9 - vmul.f32 q2,q2,q12 - vmul.f32 q10,q10,q13 - vmul.f32 q3,q3,q14 - vmul.f32 q11,q11,q15 - fnegs s17,s17 - fnegs s19,s19 - fnegs s21,s21 - fnegs s23,s23 - fnegs s25,s25 - fnegs s27,s27 - fnegs s29,s29 - fnegs s31,s31 - vtrn.32 q4,q2 - vtrn.32 q5,q10 - vtrn.32 q6,q3 - vtrn.32 q7,q11 - vadd.f32 q4,q4,q2 - vadd.f32 q5,q5,q10 - vadd.f32 q6,q6,q3 - vadd.f32 q7,q7,q11 - vstmia r1!,{q4,q5,q6,q7} - - add r4,r4,#16 - cmp r0,r3 - blo .Lmainloop -.Lendmainloop: - add r3,r3,#8*15 - cmp r0,r3 - bhs .Lendsmallloop - adr r6,.Lsincosapprox -.Lsmallloop: - fmsr s24,r4 - fldmiad sp,{d9,d10} - fuitod d0,s24 - // square of i - fmuld d0,d0,d0 - // multiply by srate - fmuld d0,d0,d9 - // rounding to -0.5/+0.5 - faddd d12,d0,d10 - fsubd d12,d12,d10 - fsubd d0,d0,d12 - fcvtsd s24,d0 - - fldmias r6,{s8,s9,s10,s11,s12,s13,s14,s15} - // square of y - fmuls s0,s24,s24 - // sine/cosine - fmuls s16,s0,s11 - fmuls s17,s0,s15 - fadds s16,s16,s10 - fadds s17,s17,s14 - fmuls s16,s16,s0 - fmuls s17,s17,s0 - fadds s16,s16,s9 - fadds s17,s17,s13 - fmuls s16,s16,s0 - fmuls s17,s17,s0 - fadds s16,s16,s8 - fadds s17,s17,s12 // s16 - sine - fmuls s16,s16,s24 // s17 - cosine - // doubling cosine/sine - fmuls s18,s16,s17 - fmuls s19,s16,s16 - fmuls s20,s17,s17 - fadds s18,s18,s18 // y=2*s*c - fsubs s19,s20,s19 // x=c*c-s*s - fmuls s21,s18,s19 // cd1 - fmuls s22,s19,s19 // cd2 - fmuls s23,s18,s18 // cd3 - fsubs s8,s22,s23 // c - fadds s9,s21,s21 // s - // compute 1.0/norm - fadds s10,s22,s23 // mag - // reciprocal - flds s11,.Ltwos - // iter1: invmag = 2.0-mag - fsubs s12,s11,s10 - // iter2: invmag = invmag*(2.0-invmag*mag) - fmuls s13,s12,s10 - fsubs s13,s11,s13 - fmuls s12,s12,s13 - // iter3: invmag = invmag*(2.0-invmag*mag) - fmuls s13,s12,s10 - fsubs s13,s11,s13 - fmuls s12,s12,s13 - // correct cosine/sine - fmuls s8,s8,s12 - fmuls s9,s9,s12 - // multiply data - fldmias r0!,{s14,s15} - fmuls s6,s14,s8 - fmuls s7,s14,s9 - fnmacs s6,s15,s9 - fmacs s7,s15,s8 - fstmias r1!,{s6,s7} - - add r4,r4,#1 - cmp r0,r3 - blo .Lsmallloop -.Lendsmallloop: - add sp,sp,#16+64 -.Lendoffunc: - mov r0,#0 - vpop {d8,d9,d10,d11,d12,d13,d14,d15} - pop {r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} - bx lr - - .align 2 -.LGOT1: - .word _GLOBAL_OFFSET_TABLE_-(.LPIC1+8) - .word analysis_state(GOT) - .align 2 -.Ltwos: - .float 2.0 -.Lsincosapprox: - .float 1.5707963268,-0.6466386396,0.0679105987,-0.0011573807 // sine - .float 1.0,-1.2341299769,0.2465220241,-0.0123926179 // cosine -/* - * neon_FoldSubs.S - * Author: Mateusz Szpakowski - */ - - .arch armv7-a - .fpu neon - .eabi_attribute 20, 1 - .eabi_attribute 21, 1 - .eabi_attribute 23, 3 - .eabi_attribute 24, 1 - .eabi_attribute 25, 1 - .eabi_attribute 26, 2 - .eabi_attribute 30, 2 - .eabi_attribute 18, 4 - .text - .align 2 - /***** - * fold array by 3 - ******/ - .global neon_foldArrayBy3_ll31 - .type neon_foldArrayBy3_ll31, %function -neon_foldArrayBy3_ll31: - push {r4,r5,r6,lr} - vpush {d8,d9,d10,d11,d12,d13,d14,d15} - - ldr r0,[r0] // ss0 - ldrd r2,[r1,#8] // tmp0 - add r2,r0,r2,lsl #2 - add r3,r0,r3,lsl #2 - ldrd r4,[r1] // di,dest - add r6,r0,r4,lsl #2 // end - - veor.i32 q12,q12,q12 - sub r6,r6,#4*15 - cmp r0,r6 - bhs .Lendf3loop1 -.Lf3loop1: -.macro FOLDBY3_CORE - vldmia r0!,{q0,q1,q2,q3} - vldmia r2!,{q4,q5,q6,q7} - vldmia r3!,{q8,q9,q10,q11} - - vadd.f32 q0,q0,q4 - vadd.f32 q1,q1,q5 - vadd.f32 q2,q2,q6 - vadd.f32 q3,q3,q7 - vadd.f32 q0,q0,q8 - vadd.f32 q1,q1,q9 - vadd.f32 q2,q2,q10 - vadd.f32 q3,q3,q11 - vstmia r5!,{q0,q1,q2,q3} - - vmax.f32 q13,q0,q1 - vmax.f32 q14,q2,q3 - vmax.f32 q12,q12,q13 - vmax.f32 q12,q12,q14 -.endm - FOLDBY3_CORE - - cmp r0,r6 - blo .Lf3loop1 -.Lendf3loop1: - and r4,r4,#15 - cmp r4,#8 - blo .Lf3lt8 - beq .Lf3eq8 - - cmp r4,#12 - blo .Lf3lt12 - beq .Lf3eq12 - - cmp r4,#14 - blo .Lf3lt14 - beq .Lf3eq14 - // 15 elems - vldmia r0!,{q0,q1,q2} - vldmia r0!,{s12,s13,s14} - vldmia r2!,{q4,q5,q6} - vldmia r2!,{s28,s29,s30} - vldmia r3!,{q8,q9,q10} - vldmia r3!,{d22} - vldmia r3!, {s15} - - vadd.f32 q0,q0,q4 - vadd.f32 q1,q1,q5 - vadd.f32 q2,q2,q6 - vadd.f32 d6,d6,d14 - vadd.f32 s14,s14,s30 - vadd.f32 q0,q0,q8 - vadd.f32 q1,q1,q9 - vadd.f32 q2,q2,q10 - vadd.f32 d6,d6,d22 - vadd.f32 s14,s14,s15 - - vstmia r5!,{q0,q1,q2} - vstmia r5!,{s12,s13,s14} - - vmax.f32 q12,q12,q0 - vmax.f32 q12,q12,q1 - vmax.f32 q12,q12,q2 - vmax.f32 q12,q12,q3 - - b .Lf3end -.Lf3eq14: - // 14 elems - vldmia r0!,{q0,q1,q2} - vldmia r0!,{d6} - vldmia r2!,{q4,q5,q6} - vldmia r2!,{d14} - vldmia r3!,{q8,q9,q10} - vldmia r3!,{d22} - - vadd.f32 q0,q0,q4 - vadd.f32 q1,q1,q5 - vadd.f32 q2,q2,q6 - vadd.f32 d6,d6,d14 - vadd.f32 q0,q0,q8 - vadd.f32 q1,q1,q9 - vadd.f32 q2,q2,q10 - vadd.f32 d6,d6,d22 - vstmia r5!,{q0,q1,q2} - vstmia r5!,{d6} - - vmax.f32 q12,q12,q0 - vmax.f32 q12,q12,q1 - vmax.f32 q12,q12,q2 - vmax.f32 d24,d24,d6 - - b .Lf3end -.Lf3lt14: - // 13 elems - vldmia r0!,{q0,q1,q2} - vldmia r0!,{s12} - vldmia r2!,{q4,q5,q6} - vldmia r2!,{s13} - vldmia r3!,{q8,q9,q10} - vldmia r3!,{s14} - - vadd.f32 q0,q0,q4 - vadd.f32 q1,q1,q5 - vadd.f32 q2,q2,q6 - fadds s12,s12,s13 - vadd.f32 q0,q0,q8 - vadd.f32 q1,q1,q9 - vadd.f32 q2,q2,q10 - fadds s12,s12,s14 - vstmia r5!,{q0,q1,q2} - vstmia r5!,{s12} - - vmax.f32 q12,q12,q0 - vmax.f32 q12,q12,q1 - vmax.f32 q12,q12,q2 - vmax.f32 d24,d24,d6 - - b .Lf3end -.Lf3eq12: - // 12 elems - vldmia r0!,{q0,q1,q2} - vldmia r2!,{q4,q5,q6} - vldmia r3!,{q8,q9,q10} - - vadd.f32 q0,q0,q4 - vadd.f32 q1,q1,q5 - vadd.f32 q2,q2,q6 - vadd.f32 q0,q0,q8 - vadd.f32 q1,q1,q9 - vadd.f32 q2,q2,q10 - vstmia r5!,{q0,q1,q2} - - vmax.f32 q12,q12,q0 - vmax.f32 q12,q12,q1 - vmax.f32 q12,q12,q2 - - b .Lf3end -.Lf3lt12: - cmp r4,#10 - blo .Lf3lt10 - beq .Lf3eq10 - // 11 elems - vldmia r0!,{q0,q1} - vldmia r0!,{s8,s9,s10} - vldmia r2!,{q4,q5} - vldmia r2!,{s24,s25,s26} - vldmia r3!,{q8,q9} - vldmia r3!,{d20} - vldmia r3!,{s11} - - vadd.f32 q0,q0,q4 - vadd.f32 q1,q1,q5 - vadd.f32 d4,d4,d12 - vadd.f32 s10,s10,s26 - vadd.f32 q0,q0,q8 - vadd.f32 q1,q1,q9 - vadd.f32 d4,d4,d20 - vadd.f32 s10,s10,s11 - vstmia r5!,{q0,q1} - vstmia r5!,{s8,s9,s10} - - vmax.f32 q12,q12,q0 - vmax.f32 q12,q12,q1 - vmax.f32 q12,q12,q2 - - b .Lf3end -.Lf3eq10: - // 10 elems - vldmia r0!,{q0,q1} - vldmia r0!,{d4} - vldmia r2!,{q4,q5} - vldmia r2!,{d12} - vldmia r3!,{q8,q9} - vldmia r3!,{d20} - - vadd.f32 q0,q0,q4 - vadd.f32 q1,q1,q5 - vadd.f32 d4,d4,d12 - vadd.f32 q0,q0,q8 - vadd.f32 q1,q1,q9 - vadd.f32 d4,d4,d20 - vstmia r5!,{q0,q1} - vstmia r5!,{d4} - - vmax.f32 q12,q12,q0 - vmax.f32 q12,q12,q1 - vmax.f32 d24,d24,d4 - - b .Lf3end -.Lf3lt10: - // 9 elems - vldmia r0!,{q0,q1} - vldmia r0!,{s8} - vldmia r2!,{q4,q5} - vldmia r2!,{s9} - vldmia r3!,{q8,q9} - vldmia r3!,{s10} - - vadd.f32 q0,q0,q4 - vadd.f32 q1,q1,q5 - fadds s8,s8,s9 - vadd.f32 q0,q0,q8 - vadd.f32 q1,q1,q9 - fadds s8,s8,s10 - vstmia r5!,{q0,q1} - vstmia r5!,{s8} - - vmax.f32 q12,q12,q0 - vmax.f32 q12,q12,q1 - vmax.f32 d24,d24,d4 - - b .Lf3end -.Lf3eq8: - // 8 elems - vldmia r0!,{q0,q1} - vldmia r2!,{q4,q5} - vldmia r3!,{q8,q9} - - vadd.f32 q0,q0,q4 - vadd.f32 q1,q1,q5 - vadd.f32 q0,q0,q8 - vadd.f32 q1,q1,q9 - vstmia r5!,{q0,q1} - - vmax.f32 q12,q12,q0 - vmax.f32 q12,q12,q1 - - b .Lf3end -.Lf3lt8: - cmp r4,#4 - blo .Lf3lt4 - beq .Lf3eq4 - - cmp r4,#6 - blo .Lf3lt6 - beq .Lf3eq6 - // 7 elems - vldmia r0!,{q0} - vldmia r0!,{s4,s5,s6} - vldmia r2!,{q4} - vldmia r2!,{s20,s21,s22} - vldmia r3!,{q8} - vldmia r3!,{d18} - vldmia r3!,{s7} - - vadd.f32 q0,q0,q4 - vadd.f32 d2,d2,d10 - vadd.f32 s6,s6,s22 - vadd.f32 q0,q0,q8 - vadd.f32 d2,d2,d18 - vadd.f32 s6,s6,s7 - vstmia r5!,{q0} - vstmia r5!,{s4,s5,s6} - - vmax.f32 q12,q12,q0 - vmax.f32 q12,q12,q1 - - b .Lf3end -.Lf3eq6: - // 6 elems - vldmia r0!,{q0} - vldmia r0!,{d2} - vldmia r2!,{q4} - vldmia r2!,{d10} - vldmia r3!,{q8} - vldmia r3!,{d18} - - vadd.f32 q0,q0,q4 - vadd.f32 d2,d2,d10 - vadd.f32 q0,q0,q8 - vadd.f32 d2,d2,d18 - vstmia r5!,{q0} - vstmia r5!,{d2} - - vmax.f32 q12,q12,q0 - vmax.f32 d24,d24,d2 - - b .Lf3end -.Lf3lt6: - // 5 elems - vldmia r0!,{q0} - vldmia r0!,{s4} - vldmia r2!,{q4} - vldmia r2!,{s20} - vldmia r3!,{q8} - vldmia r3!,{s5} - - vadd.f32 q0,q0,q4 - fadds s4,s4,s20 - vadd.f32 q0,q0,q8 - fadds s4,s4,s5 - vstmia r5!,{q0} - vstmia r5!,{s4} - - vmax.f32 q12,q12,q0 - vmax.f32 d24,d24,d2 - - b .Lf3end -.Lf3eq4: - // 4 elems - vldmia r0!,{q0} - vldmia r2!,{q4} - vldmia r3!,{q8} - - vadd.f32 q0,q0,q4 - vadd.f32 q0,q0,q8 - vstmia r5!,{q0} - - vmax.f32 q12,q12,q0 - - b .Lf3end -.Lf3lt4: - cmp r4,#2 - blo .Lf3lt2 - beq .Lf3eq2 - // 3 elems - vldmia r0!,{s0,s1,s2} - vldmia r2!,{s16,s17,s18} - vldmia r3!,{d16} - vldmia r3!,{s3} - - vadd.f32 d0,d0,d8 - vadd.f32 s2,s2,s18 - vadd.f32 d0,d0,d16 - vadd.f32 s2,s2,s3 - vstmia r5!,{s0,s1,s2} - - vmax.f32 q12,q12,q0 - - b .Lf3end -.Lf3eq2: - // 2 elems - vldmia r0!,{d0} - vldmia r2!,{d4} - vldmia r3!,{d8} - - vadd.f32 d0,d0,d4 - vadd.f32 d0,d0,d8 - vstmia r5!,{d0} - - vmax.f32 d24,d24,d0 - - b .Lf3end -.Lf3lt2: - cmp r4,#0 - beq .Lf3eq0 - - vldmia r0!,{s0} - vldmia r2!,{s1} - vldmia r3!,{s2} - - vadd.f32 s0,s0,s1 - vadd.f32 s0,s0,s2 - vstmia r5!,{s0} - - vmax.f32 d24,d24,d0 -.Lf3eq0: -.Lf3end: - vpmax.f32 d24,d24,d25 - vpmax.f32 d0,d24,d24 - - fmrs r0,s0 - vpop {d8,d9,d10,d11,d12,d13,d14,d15} - pop {r4,r5,r6,lr} - bx lr - - - .global neon_foldArrayBy3_lge31 - .type neon_foldArrayBy3_lge31, %function -neon_foldArrayBy3_lge31: - push {r4,r5,r6,lr} - vpush {d8,d9,d10,d11,d12,d13,d14,d15} - - ldr r0,[r0] // ss0 - ldrd r2,[r1,#8] // tmp0 - add r2,r0,r2,lsl #2 - add r3,r0,r3,lsl #2 - ldrd r4,[r1] // di,dest - add r6,r0,r4,lsl #2 // end - - veor.i32 q12,q12,q12 - sub r6,r6,#4*31 - cmp r0,r6 - bhs .Lendf3loop2 -.Lf3loop2: - FOLDBY3_CORE - FOLDBY3_CORE - - cmp r0,r6 - blo .Lf3loop2 -.Lendf3loop2: - add r6,r6,#4*16 - cmp r0,r6 - bhs .Lendf3loop1 -.Lf3loop3: - FOLDBY3_CORE - - cmp r0,r6 - blo .Lf3loop3 -.Lendf3loop3: - b .Lendf3loop1 - -.Lfoldby3sel: -.rept 31 - .word neon_foldArrayBy3_ll31 -.endr - .word neon_foldArrayBy3_lge31 - - /****** - * fold array by 4 - ******/ - .global neon_foldArrayBy4_ll31 - .type neon_foldArrayBy4_ll31, %function -neon_foldArrayBy4_ll31: - push {r4,r5,r6,r7,r8,lr} - vpush {d8,d9,d10,d11,d12,d13,d14,d15} - - ldr r0,[r0] // ss0 - ldrd r2,[r1,#8] // tmp0 - ldr r6,[r1,#16] // tmp2 - add r2,r0,r2,lsl #2 - add r3,r0,r3,lsl #2 - add r6,r0,r6,lsl #2 - ldrd r4,[r1] // di,dest - add r7,r0,r4,lsl #2 // end - - veor.i32 q12,q12,q12 - sub r7,r7,#4*15 - cmp r0,r7 - bhs .Lendf4loop1 -.Lf4loop1: -.macro FOLDBY4_CORE - vldmia r0!,{q0,q1,q2,q3} - vldmia r2!,{q4,q5,q6,q7} - vldmia r3!,{q8,q9,q10,q11} - - vadd.f32 q0,q0,q4 - vadd.f32 q1,q1,q5 - vadd.f32 q2,q2,q6 - vadd.f32 q3,q3,q7 - vadd.f32 q0,q0,q8 - vadd.f32 q1,q1,q9 - vadd.f32 q2,q2,q10 - vadd.f32 q3,q3,q11 - vldmia r6!,{q8,q9,q10,q11} - vadd.f32 q0,q0,q8 - vadd.f32 q1,q1,q9 - vadd.f32 q2,q2,q10 - vadd.f32 q3,q3,q11 - vstmia r5!,{q0,q1,q2,q3} - - vmax.f32 q13,q0,q1 - vmax.f32 q14,q2,q3 - vmax.f32 q12,q12,q13 - vmax.f32 q12,q12,q14 -.endm - FOLDBY4_CORE - - cmp r0,r7 - blo .Lf4loop1 -.Lendf4loop1: - and r4,r4,#15 - cmp r4,#8 - blo .Lf4lt8 - beq .Lf4eq8 - - cmp r4,#12 - blo .Lf4lt12 - beq .Lf4eq12 - - cmp r4,#14 - blo .Lf4lt14 - beq .Lf4eq14 - // 15 elems - vldmia r0!,{q0,q1,q2} - vldmia r0!,{s12,s13,s14} - vldmia r2!,{q4,q5,q6} - vldmia r2!,{s28,s29,s30} - vldmia r3!,{q8,q9,q10} - vldmia r3!,{d22} - vldmia r3, {s15} - vldmia r6!,{q13,q14,q15} - - vadd.f32 q0,q0,q4 - vadd.f32 q1,q1,q5 - vadd.f32 q2,q2,q6 - vadd.f32 d6,d6,d14 - vadd.f32 s14,s14,s30 - vadd.f32 q0,q0,q8 - vadd.f32 q1,q1,q9 - vadd.f32 q2,q2,q10 - vadd.f32 d6,d6,d22 - vadd.f32 s14,s14,s15 - vldmia r6!,{s16,s17,s18} - vadd.f32 q0,q0,q13 - vadd.f32 q1,q1,q14 - vadd.f32 q2,q2,q15 - vadd.f32 d6,d6,d8 - vadd.f32 s14,s14,s18 - - vstmia r5!,{q0,q1,q2} - vstmia r5!,{s12,s13,s14} - - vmax.f32 q12,q12,q0 - vmax.f32 q12,q12,q1 - vmax.f32 q12,q12,q2 - vmax.f32 q12,q12,q3 - - b .Lf4end -.Lf4eq14: - // 14 elems - vldmia r0!,{q0,q1,q2} - vldmia r0!,{d6} - vldmia r2!,{q4,q5,q6} - vldmia r2!,{d14} - vldmia r3!,{q8,q9,q10} - vldmia r3!,{d22} - vldmia r6!,{q13,q14,q15} - vldmia r6!,{d7} - - vadd.f32 q0,q0,q4 - vadd.f32 q1,q1,q5 - vadd.f32 q2,q2,q6 - vadd.f32 d6,d6,d14 - vadd.f32 q0,q0,q8 - vadd.f32 q1,q1,q9 - vadd.f32 q2,q2,q10 - vadd.f32 d6,d6,d22 - vadd.f32 q0,q0,q13 - vadd.f32 q1,q1,q14 - vadd.f32 q2,q2,q15 - vadd.f32 d6,d6,d7 - vstmia r5!,{q0,q1,q2} - vstmia r5!,{d6} - - vmax.f32 q12,q12,q0 - vmax.f32 q12,q12,q1 - vmax.f32 q12,q12,q2 - vmax.f32 d24,d24,d6 - - b .Lf4end -.Lf4lt14: - // 13 elems - vldmia r0!,{q0,q1,q2} - vldmia r0!,{s12} - vldmia r2!,{q4,q5,q6} - vldmia r2!,{s13} - vldmia r3!,{q8,q9,q10} - vldmia r3!,{s14} - vldmia r6!,{q13,q14,q15} - vldmia r6!,{s15} - - vadd.f32 q0,q0,q4 - vadd.f32 q1,q1,q5 - vadd.f32 q2,q2,q6 - fadds s12,s12,s13 - vadd.f32 q0,q0,q8 - vadd.f32 q1,q1,q9 - vadd.f32 q2,q2,q10 - fadds s12,s12,s14 - vadd.f32 q0,q0,q13 - vadd.f32 q1,q1,q14 - vadd.f32 q2,q2,q15 - fadds s12,s12,s15 - vstmia r5!,{q0,q1,q2} - vstmia r5!,{s12} - - vmax.f32 q12,q12,q0 - vmax.f32 q12,q12,q1 - vmax.f32 q12,q12,q2 - vmax.f32 d24,d24,d6 - - b .Lf4end -.Lf4eq12: - // 12 elems - vldmia r0!,{q0,q1,q2} - vldmia r2!,{q4,q5,q6} - vldmia r3!,{q8,q9,q10} - vldmia r6!,{q13,q14,q15} - - vadd.f32 q0,q0,q4 - vadd.f32 q1,q1,q5 - vadd.f32 q2,q2,q6 - vadd.f32 q0,q0,q8 - vadd.f32 q1,q1,q9 - vadd.f32 q2,q2,q10 - vadd.f32 q0,q0,q13 - vadd.f32 q1,q1,q14 - vadd.f32 q2,q2,q15 - vstmia r5!,{q0,q1,q2} - - vmax.f32 q12,q12,q0 - vmax.f32 q12,q12,q1 - vmax.f32 q12,q12,q2 - - b .Lf4end -.Lf4lt12: - cmp r4,#10 - blo .Lf4lt10 - beq .Lf4eq10 - // 11 elems - vldmia r0!,{q0,q1} - vldmia r0!,{s8,s9,s10} - vldmia r2!,{q4,q5} - vldmia r2!,{s24,s25,s26} - vldmia r3!,{q8,q9} - vldmia r3!,{d20} - vldmia r3!,{s11} - vldmia r6!,{q13,q14} - vldmia r6!,{s12,s13,s14} - - vadd.f32 q0,q0,q4 - vadd.f32 q1,q1,q5 - vadd.f32 d4,d4,d12 - vadd.f32 s10,s10,s26 - vadd.f32 q0,q0,q8 - vadd.f32 q1,q1,q9 - vadd.f32 d4,d4,d20 - vadd.f32 s10,s10,s11 - vadd.f32 q0,q0,q13 - vadd.f32 q1,q1,q14 - vadd.f32 d4,d4,d6 - vadd.f32 s10,s10,s14 - vstmia r5!,{q0,q1} - vstmia r5!,{s8,s9,s10} - - vmax.f32 q12,q12,q0 - vmax.f32 q12,q12,q1 - vmax.f32 q12,q12,q2 - - b .Lf4end -.Lf4eq10: - // 10 elems - vldmia r0!,{q0,q1} - vldmia r0!,{d4} - vldmia r2!,{q4,q5} - vldmia r2!,{d12} - vldmia r3!,{q8,q9} - vldmia r3!,{d20} - vldmia r6!,{q13,q14} - vldmia r6!,{d5} - - vadd.f32 q0,q0,q4 - vadd.f32 q1,q1,q5 - vadd.f32 d4,d4,d12 - vadd.f32 q0,q0,q8 - vadd.f32 q1,q1,q9 - vadd.f32 d4,d4,d20 - vadd.f32 q0,q0,q13 - vadd.f32 q1,q1,q14 - vadd.f32 d4,d4,d5 - vstmia r5!,{q0,q1} - vstmia r5!,{d4} - - vmax.f32 q12,q12,q0 - vmax.f32 q12,q12,q1 - vmax.f32 d24,d24,d4 - - b .Lf4end -.Lf4lt10: - // 9 elems - vldmia r0!,{q0,q1} - vldmia r0!,{s8} - vldmia r2!,{q4,q5} - vldmia r2!,{s9} - vldmia r3!,{q8,q9} - vldmia r3!,{s10} - vldmia r6!,{q13,q14} - vldmia r6!,{s11} - - vadd.f32 q0,q0,q4 - vadd.f32 q1,q1,q5 - fadds s8,s8,s9 - vadd.f32 q0,q0,q8 - vadd.f32 q1,q1,q9 - fadds s8,s8,s10 - vadd.f32 q0,q0,q13 - vadd.f32 q1,q1,q14 - fadds s8,s8,s11 - vstmia r5!,{q0,q1} - vstmia r5!,{s8} - - vmax.f32 q12,q12,q0 - vmax.f32 q12,q12,q1 - vmax.f32 d24,d24,d4 - - b .Lf4end -.Lf4eq8: - // 8 elems - vldmia r0!,{q0,q1} - vldmia r2!,{q4,q5} - vldmia r3!,{q8,q9} - vldmia r6!,{q13,q14} - - vadd.f32 q0,q0,q4 - vadd.f32 q1,q1,q5 - vadd.f32 q0,q0,q8 - vadd.f32 q1,q1,q9 - vadd.f32 q0,q0,q13 - vadd.f32 q1,q1,q14 - vstmia r5!,{q0,q1} - - vmax.f32 q12,q12,q0 - vmax.f32 q12,q12,q1 - - b .Lf4end -.Lf4lt8: - cmp r4,#4 - blo .Lf4lt4 - beq .Lf4eq4 - - cmp r4,#6 - blo .Lf4lt6 - beq .Lf4eq6 - // 7 elems - vldmia r0!,{q0} - vldmia r0!,{s4,s5,s6} - vldmia r2!,{q4} - vldmia r2!,{s20,s21,s22} - vldmia r3!,{q8} - vldmia r3!,{d18} - vldmia r3!,{s7} - vldmia r6!,{q13} - vldmia r6!,{s8,s9,s10} - - vadd.f32 q0,q0,q4 - vadd.f32 d2,d2,d10 - vadd.f32 s6,s6,s22 - vadd.f32 q0,q0,q8 - vadd.f32 d2,d2,d18 - vadd.f32 s6,s6,s7 - vadd.f32 q0,q0,q13 - vadd.f32 d2,d2,d4 - vadd.f32 s6,s6,s10 - vstmia r5!,{q0} - vstmia r5!,{s4,s5,s6} - - vmax.f32 q12,q12,q0 - vmax.f32 q12,q12,q1 - - b .Lf4end -.Lf4eq6: - // 6 elems - vldmia r0!,{q0} - vldmia r0!,{d2} - vldmia r2!,{q4} - vldmia r2!,{d10} - vldmia r3!,{q8} - vldmia r3!,{d18} - vldmia r6!,{q13} - vldmia r6!,{d3} - - vadd.f32 q0,q0,q4 - vadd.f32 d2,d2,d10 - vadd.f32 q0,q0,q8 - vadd.f32 d2,d2,d18 - vadd.f32 q0,q0,q13 - vadd.f32 d2,d2,d3 - vstmia r5!,{q0} - vstmia r5!,{d2} - - vmax.f32 q12,q12,q0 - vmax.f32 d24,d24,d2 - - b .Lf4end -.Lf4lt6: - // 5 elems - vldmia r0!,{q0} - vldmia r0!,{s4} - vldmia r2!,{q4} - vldmia r2!,{s20} - vldmia r3!,{q8} - vldmia r3!,{s5} - vldmia r6!,{q13} - vldmia r6!,{s6} - - vadd.f32 q0,q0,q4 - fadds s4,s4,s20 - vadd.f32 q0,q0,q8 - fadds s4,s4,s5 - vadd.f32 q0,q0,q13 - fadds s4,s4,s6 - vstmia r5!,{q0} - vstmia r5!,{s4} - - vmax.f32 q12,q12,q0 - vmax.f32 d24,d24,d2 - - b .Lf4end -.Lf4eq4: - // 4 elems - vldmia r0!,{q0} - vldmia r2!,{q4} - vldmia r3!,{q8} - vldmia r6!,{q13} - - vadd.f32 q0,q0,q4 - vadd.f32 q0,q0,q8 - vadd.f32 q0,q0,q13 - vstmia r5!,{q0} - - vmax.f32 q12,q12,q0 - - b .Lf4end -.Lf4lt4: - cmp r4,#2 - blo .Lf4lt2 - beq .Lf4eq2 - // 3 elems - vldmia r0!,{s0,s1,s2} - vldmia r2!,{s16,s17,s18} - vldmia r3!,{d16} - vldmia r3!,{s3} - vldmia r6!,{s4,s5,s6} - - vadd.f32 d0,d0,d8 - vadd.f32 s2,s2,s18 - vadd.f32 d0,d0,d16 - vadd.f32 s2,s2,s3 - vadd.f32 d0,d0,d2 - vadd.f32 s2,s2,s6 - vstmia r5!,{s0,s1,s2} - - vmax.f32 q12,q12,q0 - - b .Lf4end -.Lf4eq2: - // 2 elems - vldmia r0!,{d0} - vldmia r2!,{d4} - vldmia r3!,{d8} - vldmia r6!,{d12} - - vadd.f32 d0,d0,d4 - vadd.f32 d0,d0,d8 - vadd.f32 d0,d0,d12 - vstmia r5!,{d0} - - vmax.f32 d24,d24,d0 - - b .Lf4end -.Lf4lt2: - cmp r4,#0 - beq .Lf4eq0 - - vldmia r0!,{s0} - vldmia r2!,{s1} - vldmia r3!,{s2} - vldmia r6!,{s3} - - vadd.f32 s0,s0,s1 - vadd.f32 s0,s0,s2 - vadd.f32 s0,s0,s3 - vstmia r5!,{s0} - - vmax.f32 d24,d24,d0 -.Lf4eq0: -.Lf4end: - vpmax.f32 d24,d24,d25 - vpmax.f32 d0,d24,d24 - - fmrs r0,s0 - vpop {d8,d9,d10,d11,d12,d13,d14,d15} - pop {r4,r5,r6,r7,r8,lr} - bx lr - - - .global neon_foldArrayBy4_lge31 - .type neon_foldArrayBy4_lge31, %function -neon_foldArrayBy4_lge31: - push {r4,r5,r6,r7,r8,lr} - vpush {d8,d9,d10,d11,d12,d13,d14,d15} - - ldr r0,[r0] // ss0 - ldrd r2,[r1,#8] // tmp0 - ldr r6,[r1,#16] // tmp2 - add r2,r0,r2,lsl #2 - add r3,r0,r3,lsl #2 - add r6,r0,r6,lsl #2 - ldrd r4,[r1] // di,dest - add r7,r0,r4,lsl #2 // end - - veor.i32 q12,q12,q12 - sub r7,r7,#4*31 - cmp r0,r7 - bhs .Lendf4loop2 -.Lf4loop2: - FOLDBY4_CORE - FOLDBY4_CORE - - cmp r0,r7 - blo .Lf4loop2 -.Lendf4loop2: - add r7,r7,#4*16 - cmp r0,r7 - bhs .Lendf4loop1 -.Lf4loop3: - FOLDBY4_CORE - - cmp r0,r7 - blo .Lf4loop3 -.Lendf4loop3: - b .Lendf4loop1 - -.Lfoldby4sel: -.rept 31 - .word neon_foldArrayBy4_ll31 -.endr - .word neon_foldArrayBy4_lge31 - - /****** - * fold array by 5 - ******/ - .global neon_foldArrayBy5_ll31 - .type neon_foldArrayBy5_ll31, %function -neon_foldArrayBy5_ll31: - push {r4,r5,r6,r7,r8,lr} - vpush {d8,d9,d10,d11,d12,d13,d14,d15} - - ldr r0,[r0] // ss0 - ldrd r2,[r1,#8] // tmp0 - ldrd r6,[r1,#16] // tmp2 - add r2,r0,r2,lsl #2 - add r3,r0,r3,lsl #2 - add r6,r0,r6,lsl #2 - add r7,r0,r7,lsl #2 - ldrd r4,[r1] // di,dest - add r8,r0,r4,lsl #2 // end - - veor.i32 q12,q12,q12 - sub r8,r8,#4*15 - cmp r0,r8 - bhs .Lendf5loop1 -.Lf5loop1: -.macro FOLDBY5_CORE - vldmia r0!,{q0,q1,q2,q3} - vldmia r2!,{q4,q5,q6,q7} - vldmia r3!,{q8,q9,q10,q11} - - vadd.f32 q0,q0,q4 - vadd.f32 q1,q1,q5 - vadd.f32 q2,q2,q6 - vadd.f32 q3,q3,q7 - vadd.f32 q0,q0,q8 - vadd.f32 q1,q1,q9 - vadd.f32 q2,q2,q10 - vadd.f32 q3,q3,q11 - vldmia r6!,{q8,q9,q10,q11} - vadd.f32 q0,q0,q8 - vadd.f32 q1,q1,q9 - vadd.f32 q2,q2,q10 - vadd.f32 q3,q3,q11 - vldmia r7!,{q8,q9,q10,q11} - vadd.f32 q0,q0,q8 - vadd.f32 q1,q1,q9 - vadd.f32 q2,q2,q10 - vadd.f32 q3,q3,q11 - vstmia r5!,{q0,q1,q2,q3} - - vmax.f32 q13,q0,q1 - vmax.f32 q14,q2,q3 - vmax.f32 q12,q12,q13 - vmax.f32 q12,q12,q14 -.endm - FOLDBY5_CORE - - cmp r0,r8 - blo .Lf5loop1 -.Lendf5loop1: - and r4,r4,#15 - cmp r4,#8 - blo .Lf5lt8 - beq .Lf5eq8 - - cmp r4,#12 - blo .Lf5lt12 - beq .Lf5eq12 - - cmp r4,#14 - blo .Lf5lt14 - beq .Lf5eq14 - // 15 elems - vldmia r0!,{q0,q1,q2} - vldmia r0!,{s12,s13,s14} - vldmia r2!,{q4,q5,q6} - vldmia r2!,{s28,s29,s30} - vldmia r3!,{q8,q9,q10} - vldmia r3!,{d22} - vldmia r3, {s15} - vldmia r6!,{q13,q14,q15} - - vadd.f32 q0,q0,q4 - vadd.f32 q1,q1,q5 - vadd.f32 q2,q2,q6 - vadd.f32 d6,d6,d14 - vadd.f32 s14,s14,s30 - vadd.f32 q0,q0,q8 - vadd.f32 q1,q1,q9 - vadd.f32 q2,q2,q10 - vadd.f32 d6,d6,d22 - vadd.f32 s14,s14,s15 - vldmia r6!,{s16,s17,s18} - vadd.f32 q0,q0,q13 - vadd.f32 q1,q1,q14 - vadd.f32 q2,q2,q15 - vadd.f32 d6,d6,d8 - vadd.f32 s14,s14,s18 - vldmia r7!,{q13,q14,q15} - vldmia r7!,{s16,s17,s18} - vadd.f32 q0,q0,q13 - vadd.f32 q1,q1,q14 - vadd.f32 q2,q2,q15 - vadd.f32 d6,d6,d8 - vadd.f32 s14,s14,s18 - - vstmia r5!,{q0,q1,q2} - vstmia r5!,{s12,s13,s14} - - vmax.f32 q12,q12,q0 - vmax.f32 q12,q12,q1 - vmax.f32 q12,q12,q2 - vmax.f32 q12,q12,q3 - - b .Lf5end -.Lf5eq14: - // 14 elems - vldmia r0!,{q0,q1,q2} - vldmia r0!,{d6} - vldmia r2!,{q4,q5,q6} - vldmia r2!,{d14} - vldmia r3!,{q8,q9,q10} - vldmia r3!,{d22} - vldmia r6!,{q13,q14,q15} - vldmia r6!,{d7} - - vadd.f32 q0,q0,q4 - vadd.f32 q1,q1,q5 - vadd.f32 q2,q2,q6 - vadd.f32 d6,d6,d14 - vadd.f32 q0,q0,q8 - vadd.f32 q1,q1,q9 - vadd.f32 q2,q2,q10 - vadd.f32 d6,d6,d22 - vadd.f32 q0,q0,q13 - vadd.f32 q1,q1,q14 - vadd.f32 q2,q2,q15 - vadd.f32 d6,d6,d7 - vldmia r7!,{q13,q14,q15} - vldmia r7!,{d7} - vadd.f32 q0,q0,q13 - vadd.f32 q1,q1,q14 - vadd.f32 q2,q2,q15 - vadd.f32 d6,d6,d7 - vstmia r5!,{q0,q1,q2} - vstmia r5!,{d6} - - vmax.f32 q12,q12,q0 - vmax.f32 q12,q12,q1 - vmax.f32 q12,q12,q2 - vmax.f32 d24,d24,d6 - - b .Lf5end -.Lf5lt14: - // 13 elems - vldmia r0!,{q0,q1,q2} - vldmia r0!,{s12} - vldmia r2!,{q4,q5,q6} - vldmia r2!,{s13} - vldmia r3!,{q8,q9,q10} - vldmia r3!,{s14} - vldmia r6!,{q13,q14,q15} - vldmia r6!,{s15} - - vadd.f32 q0,q0,q4 - vadd.f32 q1,q1,q5 - vadd.f32 q2,q2,q6 - fadds s12,s12,s13 - vadd.f32 q0,q0,q8 - vadd.f32 q1,q1,q9 - vadd.f32 q2,q2,q10 - fadds s12,s12,s14 - vadd.f32 q0,q0,q13 - vadd.f32 q1,q1,q14 - vadd.f32 q2,q2,q15 - fadds s12,s12,s15 - vldmia r7!,{q13,q14,q15} - vldmia r7!,{s15} - vadd.f32 q0,q0,q13 - vadd.f32 q1,q1,q14 - vadd.f32 q2,q2,q15 - fadds s12,s12,s15 - vstmia r5!,{q0,q1,q2} - vstmia r5!,{s12} - - vmax.f32 q12,q12,q0 - vmax.f32 q12,q12,q1 - vmax.f32 q12,q12,q2 - vmax.f32 d24,d24,d6 - - b .Lf5end -.Lf5eq12: - // 12 elems - vldmia r0!,{q0,q1,q2} - vldmia r2!,{q3,q4,q5} - vldmia r3!,{q6,q7,q8} - vldmia r6!,{q9,q10,q11} - vldmia r7!,{q13,q14,q15} - - vadd.f32 q0,q0,q3 - vadd.f32 q1,q1,q4 - vadd.f32 q2,q2,q5 - vadd.f32 q0,q0,q6 - vadd.f32 q1,q1,q7 - vadd.f32 q2,q2,q8 - vadd.f32 q0,q0,q9 - vadd.f32 q1,q1,q10 - vadd.f32 q2,q2,q11 - vadd.f32 q0,q0,q13 - vadd.f32 q1,q1,q14 - vadd.f32 q2,q2,q15 - vstmia r5!,{q0,q1,q2} - - vmax.f32 q12,q12,q0 - vmax.f32 q12,q12,q1 - vmax.f32 q12,q12,q2 - - b .Lf5end -.Lf5lt12: - cmp r4,#10 - blo .Lf5lt10 - beq .Lf5eq10 - // 11 elems - vldmia r0!,{q0,q1} - vldmia r0!,{s8,s9,s10} - vldmia r2!,{q7,q8} - vldmia r2!,{d6} - vldmia r2!,{s11} - vldmia r3!,{q9,q10} - vldmia r3!,{s14,s15,s16} - vldmia r6!,{q11} - vldmia r6!,{q13} - vldmia r6!,{s18,s19,s20} - vldmia r7!,{q14} - vldmia r7!,{q15} - vldmia r7!,{s22,s23,s24} - - vadd.f32 q0,q0,q7 - vadd.f32 q1,q1,q8 - vadd.f32 d4,d4,d6 - vadd.f32 s10,s10,s11 - vadd.f32 q0,q0,q9 - vadd.f32 q1,q1,q10 - vadd.f32 d4,d4,d7 - vadd.f32 s10,s10,s16 - vadd.f32 q0,q0,q11 - vadd.f32 q1,q1,q13 - vadd.f32 d4,d4,d9 - vadd.f32 s10,s10,s20 - vadd.f32 q0,q0,q14 - vadd.f32 q1,q1,q15 - vadd.f32 d4,d4,d11 - vadd.f32 s10,s10,s24 - vstmia r5!,{q0,q1} - vstmia r5!,{s8,s9,s10} - - vmax.f32 q12,q12,q0 - vmax.f32 q12,q12,q1 - vmax.f32 q12,q12,q2 - - b .Lf5end -.Lf5eq10: - // 10 elems - vldmia r0!,{q0,q1} - vldmia r0!,{d4} - vldmia r2!,{q3,q4} - vldmia r2!,{d5} - vldmia r3!,{q5,q6} - vldmia r3!,{d14} - vldmia r6!,{q8,q9} - vldmia r6!,{d15} - vldmia r7!,{q10,q11} - vldmia r7!,{d26} - - vadd.f32 q0,q0,q3 - vadd.f32 q1,q1,q4 - vadd.f32 d4,d4,d5 - vadd.f32 q0,q0,q5 - vadd.f32 q1,q1,q6 - vadd.f32 d4,d4,d14 - vadd.f32 q0,q0,q8 - vadd.f32 q1,q1,q9 - vadd.f32 d4,d4,d15 - vadd.f32 q0,q0,q10 - vadd.f32 q1,q1,q11 - vadd.f32 d4,d4,d26 - vstmia r5!,{q0,q1} - vstmia r5!,{d4} - - vmax.f32 q12,q12,q0 - vmax.f32 q12,q12,q1 - vmax.f32 d24,d24,d4 - - b .Lf5end -.Lf5lt10: - // 9 elems - vldmia r0!,{q0,q1} - vldmia r0!,{s8} - vldmia r2!,{q4,q5} - vldmia r2!,{s9} - vldmia r3!,{q6,q7} - vldmia r3!,{s10} - vldmia r6!,{q8,q9} - vldmia r6!,{s11} - vldmia r7!,{q10,q11} - vldmia r7!,{s12} - - vadd.f32 q0,q0,q4 - vadd.f32 q1,q1,q5 - fadds s8,s8,s9 - vadd.f32 q0,q0,q6 - vadd.f32 q1,q1,q7 - fadds s8,s8,s10 - vadd.f32 q0,q0,q8 - vadd.f32 q1,q1,q9 - fadds s8,s8,s11 - vadd.f32 q0,q0,q10 - vadd.f32 q1,q1,q11 - fadds s8,s8,s12 - vstmia r5!,{q0,q1} - vstmia r5!,{s8} - - vmax.f32 q12,q12,q0 - vmax.f32 q12,q12,q1 - vmax.f32 d24,d24,d4 - - b .Lf5end -.Lf5eq8: - // 8 elems - vldmia r0!,{q0,q1} - vldmia r2!,{q2,q3} - vldmia r3!,{q4,q5} - vldmia r6!,{q8,q9} - vldmia r7!,{q10,q11} - - vadd.f32 q0,q0,q2 - vadd.f32 q1,q1,q3 - vadd.f32 q0,q0,q4 - vadd.f32 q1,q1,q5 - vadd.f32 q0,q0,q8 - vadd.f32 q1,q1,q9 - vadd.f32 q0,q0,q10 - vadd.f32 q1,q1,q11 - vstmia r5!,{q0,q1} - - vmax.f32 q12,q12,q0 - vmax.f32 q12,q12,q1 - - b .Lf5end -.Lf5lt8: - cmp r4,#4 - blo .Lf5lt4 - beq .Lf5eq4 - - cmp r4,#6 - blo .Lf5lt6 - beq .Lf5eq6 - // 7 elems - vldmia r0!,{q0} - vldmia r0!,{s4,s5,s6} - vldmia r2!,{q2} - vldmia r2!,{d6} - vldmia r2!,{s7} - vldmia r3!,{q4} - vldmia r3!,{s20,s21,s22} - vldmia r6!,{q6} - vldmia r6!,{s28,s29,s30} - vldmia r7!,{q8} - vldmia r7!,{d7} - vldmia r7!,{s31} - - vadd.f32 q0,q0,q2 - vadd.f32 d2,d2,d6 - vadd.f32 s6,s6,s7 - vadd.f32 q0,q0,q4 - vadd.f32 d2,d2,d10 - vadd.f32 s6,s6,s22 - vadd.f32 q0,q0,q6 - vadd.f32 d2,d2,d14 - vadd.f32 s6,s6,s30 - vadd.f32 q0,q0,q8 - vadd.f32 d2,d2,d7 - vadd.f32 s6,s6,s31 - vstmia r5!,{q0} - vstmia r5!,{s4,s5,s6} - - vmax.f32 q12,q12,q0 - vmax.f32 q12,q12,q1 - - b .Lf5end -.Lf5eq6: - // 6 elems - vldmia r0!,{q0} - vldmia r0!,{d2} - vldmia r2!,{q2} - vldmia r2!,{d6} - vldmia r3!,{q4} - vldmia r3!,{d10} - vldmia r6!,{q6} - vldmia r6!,{d14} - vldmia r7!,{q8} - vldmia r7!,{d15} - - vadd.f32 q0,q0,q2 - vadd.f32 d2,d2,d6 - vadd.f32 q0,q0,q4 - vadd.f32 d2,d2,d10 - vadd.f32 q0,q0,q6 - vadd.f32 d2,d2,d14 - vadd.f32 q0,q0,q8 - vadd.f32 d2,d2,d15 - vstmia r5!,{q0} - vstmia r5!,{d2} - - vmax.f32 q12,q12,q0 - vmax.f32 d24,d24,d2 - - b .Lf5end -.Lf5lt6: - // 5 elems - vldmia r0!,{q0} - vldmia r0!,{s4} - vldmia r2!,{q2} - vldmia r2!,{s5} - vldmia r3!,{q4} - vldmia r3!,{s6} - vldmia r6!,{q6} - vldmia r6!,{s7} - vldmia r7!,{q8} - vldmia r7!,{s12} - - vadd.f32 q0,q0,q2 - fadds s4,s4,s5 - vadd.f32 q0,q0,q4 - fadds s4,s4,s6 - vadd.f32 q0,q0,q6 - fadds s4,s4,s7 - vadd.f32 q0,q0,q8 - fadds s4,s4,s12 - vstmia r5!,{q0} - vstmia r5!,{s4} - - vmax.f32 q12,q12,q0 - vmax.f32 d24,d24,d2 - - b .Lf5end -.Lf5eq4: - // 4 elems - vldmia r0!,{q0} - vldmia r2!,{q4} - vldmia r3!,{q8} - vldmia r6!,{q13} - vldmia r7!,{q14} - - vadd.f32 q0,q0,q4 - vadd.f32 q0,q0,q8 - vadd.f32 q0,q0,q13 - vadd.f32 q0,q0,q14 - vstmia r5!,{q0} - - vmax.f32 q12,q12,q0 - - b .Lf5end -.Lf5lt4: - cmp r4,#2 - blo .Lf5lt2 - beq .Lf5eq2 - // 3 elems - vldmia r0!,{s0,s1,s2} - vldmia r2!,{d2} - vldmia r2!,{s3} - vldmia r3!,{s6,s7,s8} - vldmia r6!,{s10,s11,s12} - vldmia r7!,{s14,s15,s16} - - vadd.f32 d0,d0,d2 - vadd.f32 s2,s2,s3 - vadd.f32 d0,d0,d3 - vadd.f32 s2,s2,s8 - vadd.f32 d0,d0,d5 - vadd.f32 s2,s2,s12 - vadd.f32 d0,d0,d7 - vadd.f32 s2,s2,s16 - vstmia r5!,{s0,s1,s2} - - vmax.f32 q12,q12,q0 - - b .Lf5end -.Lf5eq2: - // 2 elems - vldmia r0!,{d0} - vldmia r2!,{d4} - vldmia r3!,{d8} - vldmia r6!,{d12} - vldmia r7!,{d13} - - vadd.f32 d0,d0,d4 - vadd.f32 d0,d0,d8 - vadd.f32 d0,d0,d12 - vadd.f32 d0,d0,d13 - vstmia r5!,{d0} - - vmax.f32 d24,d24,d0 - - b .Lf5end -.Lf5lt2: - cmp r4,#0 - beq .Lf5eq0 - - vldmia r0!,{s0} - vldmia r2!,{s1} - vldmia r3!,{s2} - vldmia r6!,{s3} - vldmia r7!,{s4} - - vadd.f32 s0,s0,s1 - vadd.f32 s0,s0,s2 - vadd.f32 s0,s0,s3 - vadd.f32 s0,s0,s4 - vstmia r5!,{s0} - - vmax.f32 d24,d24,d0 -.Lf5eq0: -.Lf5end: - vpmax.f32 d24,d24,d25 - vpmax.f32 d0,d24,d24 - - fmrs r0,s0 - vpop {d8,d9,d10,d11,d12,d13,d14,d15} - pop {r4,r5,r6,r7,r8,lr} - bx lr - -.Lfoldby5sel: -.rept 31 - .word neon_foldArrayBy5_ll31 -.endr - .word neon_foldArrayBy5_lge31 - - - .global neon_foldArrayBy5_lge31 - .type neon_foldArrayBy5_lge31, %function -neon_foldArrayBy5_lge31: - push {r4,r5,r6,r7,r8,lr} - vpush {d8,d9,d10,d11,d12,d13,d14,d15} - - ldr r0,[r0] // ss0 - ldrd r2,[r1,#8] // tmp0 - ldrd r6,[r1,#16] // tmp2 - add r2,r0,r2,lsl #2 - add r3,r0,r3,lsl #2 - add r6,r0,r6,lsl #2 - add r7,r0,r7,lsl #2 - ldrd r4,[r1] // di,dest - add r8,r0,r4,lsl #2 // end - - veor.i32 q12,q12,q12 - sub r8,r8,#4*31 - cmp r0,r8 - bhs .Lendf5loop2 -.Lf5loop2: - FOLDBY5_CORE - FOLDBY5_CORE - - cmp r0,r8 - blo .Lf5loop2 -.Lendf5loop2: - add r8,r8,#4*16 - cmp r0,r8 - bhs .Lendf5loop1 -.Lf5loop3: - FOLDBY5_CORE - - cmp r0,r8 - blo .Lf5loop3 -.Lendf5loop3: - b .Lendf5loop1 - - - /***** - * fold array by 2 - ******/ - .global neon_foldArrayBy2_ll31 - .type neon_foldArrayBy2_ll31, %function -neon_foldArrayBy2_ll31: - push {r4,r5,r6,lr} - vpush {d8,d9,d10,d11,d12,d13,d14,d15} - - ldr r0,[r0,#4] // ss1 - ldrd r2,[r1,#8] // tmp0 - add r2,r0,r2,lsl #2 - add r3,r0,r3,lsl #2 - ldrd r4,[r1] // di,dest - add r6,r2,r4,lsl #2 // end - - veor.i32 q12,q12,q12 - sub r6,r6,#4*15 - cmp r2,r6 - bhs .Lendf2loop1 -.Lf2loop1: -.macro FOLDBY2_CORE - vldmia r2!,{q0,q1,q2,q3} - vldmia r3!,{q4,q5,q6,q7} - - vadd.f32 q0,q0,q4 - vadd.f32 q1,q1,q5 - vadd.f32 q2,q2,q6 - vadd.f32 q3,q3,q7 - vstmia r5!,{q0,q1,q2,q3} - - vmax.f32 q13,q0,q1 - vmax.f32 q14,q2,q3 - vmax.f32 q12,q12,q13 - vmax.f32 q12,q12,q14 -.endm - FOLDBY2_CORE - - cmp r2,r6 - blo .Lf2loop1 -.Lendf2loop1: - and r4,r4,#15 - cmp r4,#8 - blo .Lf2lt8 - beq .Lf2eq8 - - cmp r4,#12 - blo .Lf2lt12 - beq .Lf2eq12 - - cmp r4,#14 - blo .Lf2lt14 - beq .Lf2eq14 - // 15 elems - vldmia r2!,{q0,q1,q2} - vldmia r2!,{s12,s13,s14} - vldmia r3!,{q4,q5,q6} - vldmia r3!,{s28,s29} - vldmia r3!,{s15} - - vadd.f32 q0,q0,q4 - vadd.f32 q1,q1,q5 - vadd.f32 q2,q2,q6 - vadd.f32 d6,d6,d14 - vadd.f32 s14,s14,s15 - vstmia r5!,{q0,q1,q2} - vstmia r5!,{s12,s13,s14} - - vmax.f32 q12,q12,q0 - vmax.f32 q12,q12,q1 - vmax.f32 q12,q12,q2 - vmax.f32 q12,q12,q3 - - b .Lf2end -.Lf2eq14: - // 14 elems - vldmia r2!,{q0,q1,q2} - vldmia r2!,{d6} - vldmia r3!,{q4,q5,q6} - vldmia r3!,{d14} - - vadd.f32 q0,q0,q4 - vadd.f32 q1,q1,q5 - vadd.f32 q2,q2,q6 - vadd.f32 d6,d6,d14 - vstmia r5!,{q0,q1,q2} - vstmia r5!,{d6} - - vmax.f32 q12,q12,q0 - vmax.f32 q12,q12,q1 - vmax.f32 q12,q12,q2 - vmax.f32 d24,d24,d6 - - b .Lf2end -.Lf2lt14: - // 13 elems - vldmia r2!,{q0,q1,q2} - vldmia r2!,{s12} - vldmia r3!,{q4,q5,q6} - vldmia r3!,{s13} - - vadd.f32 q0,q0,q4 - vadd.f32 q1,q1,q5 - vadd.f32 q2,q2,q6 - fadds s12,s12,s13 - vstmia r5!,{q0,q1,q2} - vstmia r5!,{s12} - - vmax.f32 q12,q12,q0 - vmax.f32 q12,q12,q1 - vmax.f32 q12,q12,q2 - vmax.f32 d24,d24,d6 - - b .Lf2end -.Lf2eq12: - // 12 elems - vldmia r2!,{q0,q1,q2} - vldmia r3!,{q4,q5,q6} - - vadd.f32 q0,q0,q4 - vadd.f32 q1,q1,q5 - vadd.f32 q2,q2,q6 - vstmia r5!,{q0,q1,q2} - - vmax.f32 q12,q12,q0 - vmax.f32 q12,q12,q1 - vmax.f32 q12,q12,q2 - - b .Lf2end -.Lf2lt12: - cmp r4,#10 - blo .Lf2lt10 - beq .Lf2eq10 - // 11 elems - vldmia r2!,{q0,q1} - vldmia r2!,{s8,s9,s10} - vldmia r3!,{q4,q5} - vldmia r3!,{s24,s25} - vldmia r3!,{s11} - - vadd.f32 q0,q0,q4 - vadd.f32 q1,q1,q5 - vadd.f32 d4,d4,d12 - vadd.f32 s10,s10,s11 - vstmia r5!,{q0,q1} - vstmia r5!,{s8,s9,s10} - - vmax.f32 q12,q12,q0 - vmax.f32 q12,q12,q1 - vmax.f32 q12,q12,q2 - - b .Lf2end -.Lf2eq10: - // 10 elems - vldmia r2!,{q0,q1} - vldmia r2!,{d4} - vldmia r3!,{q4,q5} - vldmia r3!,{d12} - - vadd.f32 q0,q0,q4 - vadd.f32 q1,q1,q5 - vadd.f32 d4,d4,d12 - vstmia r5!,{q0,q1} - vstmia r5!,{d4} - - vmax.f32 q12,q12,q0 - vmax.f32 q12,q12,q1 - vmax.f32 d24,d24,d4 - - b .Lf2end -.Lf2lt10: - // 9 elems - vldmia r2!,{q0,q1} - vldmia r2!,{s8} - vldmia r3!,{q4,q5} - vldmia r3!,{s9} - - vadd.f32 q0,q0,q4 - vadd.f32 q1,q1,q5 - fadds s8,s8,s9 - vstmia r5!,{q0,q1} - vstmia r5!,{s8} - - vmax.f32 q12,q12,q0 - vmax.f32 q12,q12,q1 - vmax.f32 d24,d24,d4 - - b .Lf2end -.Lf2eq8: - // 8 elems - vldmia r2!,{q0,q1} - vldmia r3!,{q4,q5} - - vadd.f32 q0,q0,q4 - vadd.f32 q1,q1,q5 - vstmia r5!,{q0,q1} - - vmax.f32 q12,q12,q0 - vmax.f32 q12,q12,q1 - - b .Lf2end -.Lf2lt8: - cmp r4,#4 - blo .Lf2lt4 - beq .Lf2eq4 - - cmp r4,#6 - blo .Lf2lt6 - beq .Lf2eq6 - // 7 elems - vldmia r2!,{q0} - vldmia r2!,{s4,s5,s6} - vldmia r3!,{q4} - vldmia r3!,{s20,s21} - vldmia r3!,{s7} - - vadd.f32 q0,q0,q4 - vadd.f32 d2,d2,d10 - vadd.f32 s6,s6,s7 - vstmia r5!,{q0} - vstmia r5!,{s4,s5,s6} - - vmax.f32 q12,q12,q0 - vmax.f32 q12,q12,q1 - - b .Lf2end -.Lf2eq6: - // 6 elems - vldmia r2!,{q0} - vldmia r2!,{d2} - vldmia r3!,{q4} - vldmia r3!,{d10} - - vadd.f32 q0,q0,q4 - vadd.f32 d2,d2,d10 - vstmia r5!,{q0} - vstmia r5!,{d2} - - vmax.f32 q12,q12,q0 - vmax.f32 d24,d24,d2 - - b .Lf2end -.Lf2lt6: - // 5 elems - vldmia r2!,{q0} - vldmia r2!,{s4} - vldmia r3!,{q4} - vldmia r3!,{s5} - - vadd.f32 q0,q0,q4 - fadds s4,s4,s5 - vstmia r5!,{q0} - vstmia r5!,{s4} - - vmax.f32 q12,q12,q0 - vmax.f32 d24,d24,d2 - - b .Lf2end -.Lf2eq4: - // 4 elems - vldmia r2!,{q0} - vldmia r3!,{q4} - - vadd.f32 q0,q0,q4 - vstmia r5!,{q0} - - vmax.f32 q12,q12,q0 - - b .Lf2end -.Lf2lt4: - cmp r4,#2 - blo .Lf2lt2 - beq .Lf2eq2 - // 3 elems - vldmia r2!,{s0,s1,s2} - vldmia r3!,{s16,s17} - vldmia r3!,{s3} - - vadd.f32 d0,d0,d8 - vadd.f32 s2,s2,s3 - vstmia r5!,{s0,s1,s2} - - vmax.f32 q12,q12,q0 - - b .Lf2end -.Lf2eq2: - // 2 elems - vldmia r2!,{d0} - vldmia r3!,{d4} - - vadd.f32 d0,d0,d4 - vstmia r5!,{d0} - - vmax.f32 d24,d24,d0 - - b .Lf2end -.Lf2lt2: - cmp r4,#0 - beq .Lf2eq0 - - vldmia r2!,{s0} - vldmia r3!,{s1} - - vadd.f32 s0,s0,s1 - vstmia r5!,{s0} - - vmax.f32 d24,d24,d0 -.Lf2eq0: -.Lf2end: - vpmax.f32 d24,d24,d25 - vpmax.f32 d0,d24,d24 - - fmrs r0,s0 - vpop {d8,d9,d10,d11,d12,d13,d14,d15} - pop {r4,r5,r6,lr} - bx lr - - - .global neon_foldArrayBy2_lge31 - .type neon_foldArrayBy2_lge31, %function -neon_foldArrayBy2_lge31: - push {r4,r5,r6,lr} - vpush {d8,d9,d10,d11,d12,d13,d14,d15} - - ldr r0,[r0,#4] // ss1 - ldrd r2,[r1,#8] // tmp0 - add r2,r0,r2,lsl #2 - add r3,r0,r3,lsl #2 - ldrd r4,[r1] // di,dest - add r6,r2,r4,lsl #2 // end - - veor.i32 q12,q12,q12 - sub r6,r6,#4*31 - cmp r2,r6 - bhs .Lendf2loop2 -.Lf2loop2: - FOLDBY2_CORE - FOLDBY2_CORE - - cmp r2,r6 - blo .Lf2loop2 -.Lendf2loop2: - add r6,r6,#4*16 - cmp r2,r6 - bhs .Lendf2loop1 -.Lf2loop3: - FOLDBY2_CORE - - cmp r2,r6 - blo .Lf2loop3 -.Lendf2loop3: - b .Lendf2loop1 - -.Lfoldby2sel: -.rept 31 - .word neon_foldArrayBy2_ll31 -.endr - .word neon_foldArrayBy2_lge31 - - .align 2 -.Lname: - .string "opt NEON" - - .align 2 - .global neonFoldMain -neonFoldMain: - .word .Lfoldby3sel - .word .Lfoldby4sel - .word .Lfoldby5sel - .word .Lfoldby2sel - .word .Lfoldby2sel - .word .Lname - /* - - - * _Z21neon_GetPowerSpectrumPA2_fPfi.S - * Author: Mateusz Szpakowski - */ - - .arch armv7-a - .fpu neon - .eabi_attribute 20, 1 - .eabi_attribute 21, 1 - .eabi_attribute 23, 3 - .eabi_attribute 24, 1 - .eabi_attribute 25, 1 - .eabi_attribute 26, 2 - .eabi_attribute 30, 2 - .eabi_attribute 18, 4 - .text - .align 2 - .global _Z21neon_GetPowerSpectrumPA2_fPfi - .type _Z21neon_GetPowerSpectrumPA2_fPfi, %function -_Z21neon_GetPowerSpectrumPA2_fPfi: - push {r4,r5} - vpush {d8,d9,d10,d11,d12,d13,d14,d15} - ldr r3,.LGOTa -.LPICa: - add r3,pc,r3 - ldr r4,.LGOTa+4 - ldr r4,[r3,r4] - add r5,r2,r2,lsl #1 - fldd d0,[r4,#32] - fmsr s4,r5 - fuitod d1,s4 - faddd d0,d0,d1 - fstd d0,[r4,#32] - - cmp r2,#4096 - blo .Lsecondversion - - add r2,r0,r2, lsl #3 - sub r2,r2,#31*8 - /* r0 - freqData - * r1 - PowerSpectrum - * r2 - end of freqData - */ - cmp r0,r2 - bhs .Lendmainloopa -.Lmainloopa: - pld [r0,#128] - vldmia r0!,{q0,q1,q2,q3,q4,q5,q6,q7} - vmul.f32 q0,q0,q0 - vmul.f32 q1,q1,q1 - vmul.f32 q2,q2,q2 - vmul.f32 q3,q3,q3 - vmul.f32 q4,q4,q4 - vmul.f32 q5,q5,q5 - vmul.f32 q6,q6,q6 - vmul.f32 q7,q7,q7 - vpadd.f32 d16,d0,d1 - vpadd.f32 d17,d2,d3 - vpadd.f32 d18,d4,d5 - vpadd.f32 d19,d6,d7 - vpadd.f32 d20,d8,d9 - vpadd.f32 d21,d10,d11 - vpadd.f32 d22,d12,d13 - vpadd.f32 d23,d14,d15 - - pld [r0,#128] - vldmia r0!,{q0,q1,q2,q3,q4,q5,q6,q7} - vstmia r1!,{q8,q9,q10,q11} - vmul.f32 q0,q0,q0 - vmul.f32 q1,q1,q1 - vmul.f32 q2,q2,q2 - vmul.f32 q3,q3,q3 - vmul.f32 q4,q4,q4 - vmul.f32 q5,q5,q5 - vmul.f32 q6,q6,q6 - vmul.f32 q7,q7,q7 - vpadd.f32 d16,d0,d1 - vpadd.f32 d17,d2,d3 - vpadd.f32 d18,d4,d5 - vpadd.f32 d19,d6,d7 - vpadd.f32 d20,d8,d9 - vpadd.f32 d21,d10,d11 - vpadd.f32 d22,d12,d13 - vpadd.f32 d23,d14,d15 - vstmia r1!,{q8,q9,q10,q11} - - cmp r0,r2 - blo .Lmainloopa - -.Lendmainloopa: - add r2,r2,#8*24 - bhs .Lendsmallloopa -.Lsmallloopa: - vldmia r0!,{q0,q1,q2,q3} - vmul.f32 q0,q0,q0 - vmul.f32 q1,q1,q1 - vmul.f32 q2,q2,q2 - vmul.f32 q3,q3,q3 - vpadd.f32 d16,d0,d1 - vpadd.f32 d17,d2,d3 - vpadd.f32 d18,d4,d5 - vpadd.f32 d19,d6,d7 - vstmia r1!,{q8,q9,q10,q11} - cmp r0,r2 - blo .Lsmallloopa -.Lendsmallloopa: - add r2,r2,#8*7 - cmp r0,r2 - beq .Lendmicroloop -.Lmicroloop: - fldmias r0!,{s0,s1} - fmuls s2,s0,s0 - fmacs s2,s1,s1 - fstmias r1!,{s2} - - cmp r0,r2 - blo .Lmicroloop -.Lendmicroloop: - mov r0,#0 - vpop {d8,d9,d10,d11,d12,d13,d14,d15} - pop {r4,r5} - bx lr - - - /* - * second version - */ -.Lsecondversion: - add r2,r0,r2, lsl #3 - sub r2,r2,#31*8 - /* r0 - freqData - * r1 - PowerSpectrum - * r2 - end of freqData - */ - cmp r0,r2 - bhs .Lendmainloop2 -.Lmainloop2: - vldmia r0!,{q0,q1,q2,q3,q4,q5,q6,q7} - vldmia r0!,{q8,q9,q10,q11,q12,q13,q14,q15} - vuzp.32 q0,q1 - vuzp.32 q2,q3 - vuzp.32 q4,q5 - vuzp.32 q6,q7 - vuzp.32 q8,q9 - vuzp.32 q10,q11 - vuzp.32 q12,q13 - vuzp.32 q14,q15 - vmul.f32 q0,q0,q0 - vmul.f32 q2,q2,q2 - vmul.f32 q4,q4,q4 - vmul.f32 q6,q6,q6 - vmul.f32 q8,q8,q8 - vmul.f32 q10,q10,q10 - vmul.f32 q12,q12,q12 - vmul.f32 q14,q14,q14 - vmla.f32 q0,q1,q1 - vmla.f32 q2,q3,q3 - vmla.f32 q4,q5,q5 - vmla.f32 q6,q7,q7 - vmla.f32 q8,q9,q9 - vmla.f32 q10,q11,q11 - vmla.f32 q12,q13,q13 - vmla.f32 q14,q15,q15 - vstmia r1!,{q0} - vstmia r1!,{q2} - vstmia r1!,{q4} - vstmia r1!,{q6} - vstmia r1!,{q8} - vstmia r1!,{q10} - vstmia r1!,{q12} - vstmia r1!,{q14} - - cmp r0,r2 - blo .Lmainloop2 - -.Lendmainloop2: - add r2,r2,#8*24 - bhs .Lendsmallloop2 -.Lsmallloop2: - vldmia r0!,{q0,q1,q2,q3} - vmul.f32 q0,q0,q0 - vmul.f32 q1,q1,q1 - vmul.f32 q2,q2,q2 - vmul.f32 q3,q3,q3 - vpadd.f32 d16,d0,d1 - vpadd.f32 d17,d2,d3 - vpadd.f32 d18,d4,d5 - vpadd.f32 d19,d6,d7 - vstmia r1!,{q8,q9,q10,q11} - cmp r0,r2 - blo .Lsmallloop2 -.Lendsmallloop2: - add r2,r2,#8*7 - cmp r0,r2 - beq .Lendmicroloop -.Lmicroloop2: - fldmias r0!,{s0,s1} - fmuls s2,s0,s0 - fmacs s2,s1,s1 - fstmias r1!,{s2} - - cmp r0,r2 - blo .Lmicroloop2 - b .Lendmicroloop - - .align 2 -.LGOTa: - .word _GLOBAL_OFFSET_TABLE_-(.LPICa+8) - .word analysis_state(GOT) - -#endif // __arm__ diff -Nru boinc-app-seti-7.28~svn2781/.pc/214_fix_armhf.patch/client/vector/analyzeFuncs_vfp.S boinc-app-seti-7.28~svn2858/.pc/214_fix_armhf.patch/client/vector/analyzeFuncs_vfp.S --- boinc-app-seti-7.28~svn2781/.pc/214_fix_armhf.patch/client/vector/analyzeFuncs_vfp.S 2015-02-23 15:18:24.000000000 +0000 +++ boinc-app-seti-7.28~svn2858/.pc/214_fix_armhf.patch/client/vector/analyzeFuncs_vfp.S 1970-01-01 00:00:00.000000000 +0000 @@ -1,2243 +0,0 @@ -// Copyright (c) 1999-2011 Regents of the University of California -// -// FFTW: Copyright (c) 2003,2006 Matteo Frigo -// Copyright (c) 2003,2006 Massachusets Institute of Technology -// -// fft8g.[cpp,h]: Copyright (c) 1995-2001 Takya Ooura -// -// ASMLIB: Copyright (c) 2004 Agner Fog - -// This program is free software; you can redistribute it and/or modify it -// under the terms of the GNU General Public License as published by the -// Free Software Foundation; either version 2, or (at your option) any later -// version. - -// This program is distributed in the hope that it will be useful, but WITHOUT -// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for -// more details. - -// You should have received a copy of the GNU General Public License along -// with this program; see the file COPYING. If not, write to the Free Software -// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - -// In addition, as a special exception, the Regents of the University of -// California give permission to link the code of this program with libraries -// that provide specific optimized fast Fourier transform (FFT) functions -// as an alternative to FFTW and distribute a linked executable and -// source code. You must obey the GNU General Public License in all -// respects for all of the code used other than the FFT library itself. -// Any modification required to support these libraries must be distributed -// under the terms of this license. If you modify this program, you may extend -// this exception to your version of the program, but you are not obligated to -// do so. If you do not wish to do so, delete this exception statement from -// your version. Please be aware that FFTW and ASMLIB are not covered by -// this exception, therefore you may not use FFTW and ASMLIB in any derivative -// work so modified without permission of the authors of those packages. - -#ifdef __arm__ -/* - * vfp_ChirpData.S - * Author: Mateusz Szpakowski - */ - - .arch armv6 - .fpu vfp - .eabi_attribute 20, 1 - .eabi_attribute 21, 1 - .eabi_attribute 23, 3 - .eabi_attribute 24, 1 - .eabi_attribute 25, 1 - .eabi_attribute 26, 2 - .eabi_attribute 30, 2 - .eabi_attribute 18, 4 - .text - .align 2 -.Lhalfd: - .double 0.5 -.LroundVal: - .double 4503599627370496.0 -.Linc8d: - .double 0,1,2,3,4,5,6,7 -.Lsincosapprox: - .float 1.5707963268,-0.6466386396,0.0679105987,-0.0011573807 // sine - .float 1.0,-1.2341299769,0.2465220241,-0.0123926179 // cosine - - .align 2 - .global _Z13vfp_ChirpDataPA2_fS0_idid - .type _Z13vfp_ChirpDataPA2_fS0_idid, %function -_Z13vfp_ChirpDataPA2_fS0_idid: - push {r4,r5,r6,r7,r8,lr} - vpush {d8,d9,d10,d11,d12,d13,d14,d15} -#define stargidx (24+64) - /* r0 - input data - * r1 - output data - * r2 - chirprateind - * sp[0-1] - chirp_rate - * sp[2] - numDataPoints - * sp[4-5] - sample_rate - */ - tst r2,r2 - bne .Lrealfunc - mov r3,r0 // swap addresses - mov r0,r1 - mov r1,r3 - ldr r2,[sp,#stargidx+8] // numDataPoints - lsl r2,r2,#3 - bl memcpy(PLT) - b .Lendoffunc - /* - * real functions - */ -.Lrealfunc: - ldr r4,.LGOT1 -.LPIC1: - add r4,pc,r4 - ldr r5,.LGOT1+4 - ldr r5,[r4,r5] - - ldr r3,[sp,#stargidx+8] // numDataPoints - - add r6,r3,r3,lsl #1 - lsl r6,r6,#2 - fldd d0,[r5,#32] - fmsr s4,r6 - fuitod d1,s4 - faddd d0,d0,d1 - fstd d0,[r5,#32] - - add r3,r0,r3,lsl #3 - sub r3,r3,#8*7 - - fldd d11,.Lhalfd - fldd d9,[sp,#stargidx] // chirp_rate - fldd d10,[sp,#stargidx+16] // sample_rate - fmuld d10,d10,d10 - fmuld d9,d9,d11 - fdivd d9,d9,d10 - - fldd d10,.LroundVal - fsubd d11,d11,d11 // zero - fcmpd d9,d11 - fmstat - fnegdmi d10,d10 // negate is negative srate - - sub sp,sp,#24+32+8 - add r7,sp,#24 - fstmiad sp,{d9,d10} - - mov r4,#0 // i - - adr r5,.Linc8d - adr r6,.Lsincosapprox - adr r8,.Lsincosapprox+4*4 - - cmp r0,r3 - bhs .Lendmainloop -.Lmainloop: - fmsr s24,r4 - fuitod d8,s24 - fldmiad r5,{d0,d1,d2,d3,d4,d5,d6,d7} - fldmiad sp,{d9,d10} - faddd d0,d0,d8 - faddd d1,d1,d8 - faddd d2,d2,d8 - faddd d3,d3,d8 - faddd d4,d4,d8 - faddd d5,d5,d8 - faddd d6,d6,d8 - faddd d7,d7,d8 - // square of i - fmuld d0,d0,d0 - fmuld d1,d1,d1 - fmuld d2,d2,d2 - fmuld d3,d3,d3 - fmuld d4,d4,d4 - fmuld d5,d5,d5 - fmuld d6,d6,d6 - fmuld d7,d7,d7 - // multiply by srate - fmuld d0,d0,d9 - fmuld d1,d1,d9 - fmuld d2,d2,d9 - fmuld d3,d3,d9 - fmuld d4,d4,d9 - fmuld d5,d5,d9 - fmuld d6,d6,d9 - fmuld d7,d7,d9 - - // rounding to -0.5/+0.5 - faddd d12,d0,d10 - faddd d13,d1,d10 - faddd d14,d2,d10 - faddd d15,d3,d10 - fsubd d12,d12,d10 - fsubd d13,d13,d10 - fsubd d14,d14,d10 - fsubd d15,d15,d10 - fsubd d0,d0,d12 - fsubd d1,d1,d13 - fsubd d2,d2,d14 - fsubd d3,d3,d15 - // second half of xxxx - faddd d12,d4,d10 - faddd d13,d5,d10 - faddd d14,d6,d10 - faddd d15,d7,d10 - fsubd d12,d12,d10 - fsubd d13,d13,d10 - fsubd d14,d14,d10 - fsubd d15,d15,d10 - fsubd d4,d4,d12 - fsubd d5,d5,d13 - fsubd d6,d6,d14 - fsubd d7,d7,d15 - // to single precision - fcvtsd s24,d0 - fcvtsd s25,d1 - fcvtsd s26,d2 - fcvtsd s27,d3 - fcvtsd s28,d4 - fcvtsd s29,d5 - fcvtsd s30,d6 - fcvtsd s31,d7 - - fldmias r6,{s16,s17,s18,s19} - // square of y - fmuls s0,s24,s24 - fmuls s1,s25,s25 - fmuls s2,s26,s26 - fmuls s3,s27,s27 - fmuls s4,s28,s28 - fmuls s5,s29,s29 - fmuls s6,s30,s30 - fmuls s7,s31,s31 - // sine - fmuls s8,s0,s19 - fmuls s9,s1,s19 - fmuls s10,s2,s19 - fmuls s11,s3,s19 - fmuls s12,s4,s19 - fmuls s13,s5,s19 - fmuls s14,s6,s19 - fmuls s15,s7,s19 - fadds s8,s8,s18 - fadds s9,s9,s18 - fadds s10,s10,s18 - fadds s11,s11,s18 - fadds s12,s12,s18 - fadds s13,s13,s18 - fadds s14,s14,s18 - fadds s15,s15,s18 - fmuls s8,s8,s0 - fmuls s9,s9,s1 - fmuls s10,s10,s2 - fmuls s11,s11,s3 - fmuls s12,s12,s4 - fmuls s13,s13,s5 - fmuls s14,s14,s6 - fmuls s15,s15,s7 - fadds s8,s8,s17 - fadds s9,s9,s17 - fadds s10,s10,s17 - fadds s11,s11,s17 - fadds s12,s12,s17 - fadds s13,s13,s17 - fadds s14,s14,s17 - fadds s15,s15,s17 - fmuls s8,s8,s0 - fmuls s9,s9,s1 - fmuls s10,s10,s2 - fmuls s11,s11,s3 - fmuls s12,s12,s4 - fmuls s13,s13,s5 - fmuls s14,s14,s6 - fmuls s15,s15,s7 - fadds s8,s8,s16 - fadds s9,s9,s16 - fadds s10,s10,s16 - fadds s11,s11,s16 - fadds s12,s12,s16 - fadds s13,s13,s16 - fadds s14,s14,s16 - fadds s15,s15,s16 - fmuls s8,s8,s24 - fmuls s9,s9,s25 - fmuls s10,s10,s26 - fmuls s11,s11,s27 - fmuls s12,s12,s28 - fmuls s13,s13,s29 - fmuls s14,s14,s30 - fmuls s15,s15,s31 - fldmias r8,{s24,s25,s26,s27} - // cosine - fmuls s16,s0,s27 - fmuls s17,s1,s27 - fmuls s18,s2,s27 - fmuls s19,s3,s27 - fmuls s20,s4,s27 - fmuls s21,s5,s27 - fmuls s22,s6,s27 - fmuls s23,s7,s27 - fadds s16,s16,s26 - fadds s17,s17,s26 - fadds s18,s18,s26 - fadds s19,s19,s26 - fadds s20,s20,s26 - fadds s21,s21,s26 - fadds s22,s22,s26 - fadds s23,s23,s26 - fmuls s16,s16,s0 - fmuls s17,s17,s1 - fmuls s18,s18,s2 - fmuls s19,s19,s3 - fmuls s20,s20,s4 - fmuls s21,s21,s5 - fmuls s22,s22,s6 - fmuls s23,s23,s7 - fadds s16,s16,s25 - fadds s17,s17,s25 - fadds s18,s18,s25 - fadds s19,s19,s25 - fadds s20,s20,s25 - fadds s21,s21,s25 - fadds s22,s22,s25 - fadds s23,s23,s25 - fmuls s16,s16,s0 - fmuls s17,s17,s1 - fmuls s18,s18,s2 - fmuls s19,s19,s3 - fmuls s20,s20,s4 - fmuls s21,s21,s5 - fmuls s22,s22,s6 - fmuls s23,s23,s7 - fadds s16,s16,s24 - fadds s17,s17,s24 - fadds s18,s18,s24 - fadds s19,s19,s24 - fadds s20,s20,s24 - fadds s21,s21,s24 - fadds s22,s22,s24 - fadds s23,s23,s24 - // load sine - // doubling cosine/sine - fmuls s0,s8,s16 // c*s - fmuls s1,s9,s17 - fmuls s2,s10,s18 - fmuls s3,s11,s19 - fmuls s4,s12,s20 - fmuls s5,s13,s21 - fmuls s6,s14,s22 - fmuls s7,s15,s23 - fmuls s16,s16,s16 // c*c - fmuls s17,s17,s17 - fmuls s18,s18,s18 - fmuls s19,s19,s19 - fmuls s20,s20,s20 - fmuls s21,s21,s21 - fmuls s22,s22,s22 - fmuls s23,s23,s23 - fnmacs s16,s8,s8 // c*c-s*s = x - fnmacs s17,s9,s9 - fnmacs s18,s10,s10 - fnmacs s19,s11,s11 - fnmacs s20,s12,s12 - fnmacs s21,s13,s13 - fnmacs s22,s14,s14 - fnmacs s23,s15,s15 - fadds s0,s0,s0 // 2*s*c = y - fadds s1,s1,s1 - fadds s2,s2,s2 - fadds s3,s3,s3 - fadds s4,s4,s4 - fadds s5,s5,s5 - fadds s6,s6,s6 - fadds s7,s7,s7 - fmuls s8,s0,s16 // cd1 = x*y - fmuls s9,s1,s17 - fmuls s10,s2,s18 - fmuls s11,s3,s19 - fmuls s12,s4,s20 - fmuls s13,s5,s21 - fmuls s14,s6,s22 - fmuls s15,s7,s23 - fmuls s0,s0,s0 // cd3 = y*y - fmuls s1,s1,s1 - fmuls s2,s2,s2 - fmuls s3,s3,s3 - fmuls s4,s4,s4 - fmuls s5,s5,s5 - fmuls s6,s6,s6 - fmuls s7,s7,s7 - fmuls s24,s16,s16 // cd2 = x*x - fmuls s25,s17,s17 - fmuls s26,s18,s18 - fmuls s27,s19,s19 - fmuls s28,s20,s20 - fmuls s29,s21,s21 - fmuls s30,s22,s22 - fmuls s31,s23,s23 - fadds s24,s24,s0 // norm = x*x+y*y - fadds s25,s25,s1 - fadds s26,s26,s2 - fadds s27,s27,s3 - fadds s28,s28,s4 - fadds s29,s29,s5 - fadds s30,s30,s6 - fadds s31,s31,s7 - fmscs s0,s16,s16 // c = x*x-y*y - fmscs s1,s17,s17 - fmscs s2,s18,s18 - fmscs s3,s19,s19 - fmscs s4,s20,s20 - fmscs s5,s21,s21 - fmscs s6,s22,s22 - fmscs s7,s23,s23 - fadds s16,s8,s8 // s = 2*x*y - fadds s17,s9,s9 - fadds s18,s10,s10 - fadds s19,s11,s11 - fadds s20,s12,s12 - fadds s21,s13,s13 - fadds s22,s14,s14 - fadds s23,s15,s15 - fsts s7,[sp,#24+64] - flds s7,.Ltwos - fstmias r7,{s16,s17,s18,s19,s20,s21,s22,s23} - // reciprocal of magnitude - // iter 1: invmag = 2.0-mag - fsubs s8,s7,s24 - fsubs s9,s7,s25 - fsubs s10,s7,s26 - fsubs s11,s7,s27 - fsubs s12,s7,s28 - fsubs s13,s7,s29 - fsubs s14,s7,s30 - fsubs s15,s7,s31 - // iter 2: invmag = invmag*(2.0-mag*invmag) - fmuls s16,s8,s24 - fmuls s17,s9,s25 - fmuls s18,s10,s26 - fmuls s19,s11,s27 - fmuls s20,s12,s28 - fmuls s21,s13,s29 - fmuls s22,s14,s30 - fmuls s23,s15,s31 - fsubs s16,s7,s16 - fsubs s17,s7,s17 - fsubs s18,s7,s18 - fsubs s19,s7,s19 - fsubs s20,s7,s20 - fsubs s21,s7,s21 - fsubs s22,s7,s22 - fsubs s23,s7,s23 - fmuls s8,s16,s8 - fmuls s9,s17,s9 - fmuls s10,s18,s10 - fmuls s11,s19,s11 - fmuls s12,s20,s12 - fmuls s13,s21,s13 - fmuls s14,s22,s14 - fmuls s15,s23,s15 - // iter 3: invmag = invmag*(2.0-mag*invmag) - fmuls s16,s8,s24 - fmuls s17,s9,s25 - fmuls s18,s10,s26 - fmuls s19,s11,s27 - fmuls s20,s12,s28 - fmuls s21,s13,s29 - fmuls s22,s14,s30 - fmuls s23,s15,s31 - fsubs s16,s7,s16 - fsubs s17,s7,s17 - fsubs s18,s7,s18 - fsubs s19,s7,s19 - fsubs s20,s7,s20 - fsubs s21,s7,s21 - fsubs s22,s7,s22 - fsubs s23,s7,s23 - fmuls s8,s16,s8 - fmuls s9,s17,s9 - fmuls s10,s18,s10 - fmuls s11,s19,s11 - fmuls s12,s20,s12 - fmuls s13,s21,s13 - fmuls s14,s22,s14 - fmuls s15,s23,s15 - // restore sine values - fldmias r7,{s16,s17,s18,s19,s20,s21,s22,s23} - // correct cosine/sine - flds s7,[sp,#24+64] - fmuls s0,s0,s8 - fmuls s1,s1,s9 - fmuls s2,s2,s10 - fmuls s3,s3,s11 - fmuls s4,s4,s12 - fmuls s5,s5,s13 - fmuls s6,s6,s14 - fmuls s7,s7,s15 - fmuls s16,s16,s8 - fmuls s17,s17,s9 - fmuls s18,s18,s10 - fmuls s19,s19,s11 - fmuls s20,s20,s12 - fmuls s21,s21,s13 - fmuls s22,s22,s14 - fmuls s23,s23,s15 - pld [r0,#128] - // multiply by data - fldmias r0!,{s8,s9,s10,s11,s12,s13,s14,s15} - fmuls s24,s8,s0 - fmuls s25,s9,s0 - fmuls s26,s10,s1 - fmuls s27,s11,s1 - fmuls s28,s12,s2 - fmuls s29,s13,s2 - fmuls s30,s14,s3 - fmuls s31,s15,s3 - fnmacs s24,s9,s16 - fmacs s25,s8,s16 - fnmacs s26,s11,s17 - fmacs s27,s10,s17 - fnmacs s28,s13,s18 - fmacs s29,s12,s18 - fnmacs s30,s15,s19 - fmacs s31,s14,s19 - fstmias r1!,{s24,s25,s26,s27,s28,s29,s30,s31} - pld [r0,#128] - fldmias r0!,{s8,s9,s10,s11,s12,s13,s14,s15} - fmuls s24,s8,s4 - fmuls s25,s9,s4 - fmuls s26,s10,s5 - fmuls s27,s11,s5 - fmuls s28,s12,s6 - fmuls s29,s13,s6 - fmuls s30,s14,s7 - fmuls s31,s15,s7 - fnmacs s24,s9,s20 - fmacs s25,s8,s20 - fnmacs s26,s11,s21 - fmacs s27,s10,s21 - fnmacs s28,s13,s22 - fmacs s29,s12,s22 - fnmacs s30,s15,s23 - fmacs s31,s14,s23 - fstmias r1!,{s24,s25,s26,s27,s28,s29,s30,s31} - - add r4,r4,#8 - cmp r0,r3 - blo .Lmainloop -.Lendmainloop: - add r3,r3,#8*7 - cmp r0,r3 - bhs .Lendsmallloop -.Lsmallloop: - fmsr s24,r4 - fldmiad sp,{d9,d10} - fuitod d0,s24 - // square of i - fmuld d0,d0,d0 - // multiply by srate - fmuld d0,d0,d9 - // rounding to -0.5/+0.5 - faddd d12,d0,d10 - fsubd d12,d12,d10 - fsubd d0,d0,d12 - fcvtsd s24,d0 - - fldmias r6,{s8,s9,s10,s11,s12,s13,s14,s15} - // square of y - fmuls s0,s24,s24 - // sine/cosine - fmuls s16,s0,s11 - fmuls s17,s0,s15 - fadds s16,s16,s10 - fadds s17,s17,s14 - fmuls s16,s16,s0 - fmuls s17,s17,s0 - fadds s16,s16,s9 - fadds s17,s17,s13 - fmuls s16,s16,s0 - fmuls s17,s17,s0 - fadds s16,s16,s8 - fadds s17,s17,s12 // s16 - sine - fmuls s16,s16,s24 // s17 - cosine - // doubling cosine/sine - fmuls s18,s16,s17 - fmuls s19,s16,s16 - fmuls s20,s17,s17 - fadds s18,s18,s18 // y=2*s*c - fsubs s19,s20,s19 // x=c*c-s*s - fmuls s21,s18,s19 // cd1 - fmuls s22,s19,s19 // cd2 - fmuls s23,s18,s18 // cd3 - fsubs s8,s22,s23 // c - fadds s9,s21,s21 // s - // compute 1.0/norm - fadds s10,s22,s23 // mag - // reciprocal - flds s11,.Ltwos - // iter1: invmag = 2.0-mag - fsubs s12,s11,s10 - // iter2: invmag = invmag*(2.0-invmag*mag) - fmuls s13,s12,s10 - fsubs s13,s11,s13 - fmuls s12,s12,s13 - // correct cosine/sine - fmuls s8,s8,s12 - fmuls s9,s9,s12 - // multiply data - fldmias r0!,{s14,s15} - fmuls s6,s14,s8 - fmuls s7,s14,s9 - fnmacs s6,s15,s9 - fmacs s7,s15,s8 - fstmias r1!,{s6,s7} - - add r4,r4,#1 - cmp r0,r3 - blo .Lsmallloop -.Lendsmallloop: - add sp,sp,#24+32+8 -.Lendoffunc: - mov r0,#0 - vpop {d8,d9,d10,d11,d12,d13,d14,d15} - pop {r4,r5,r6,r7,r8,lr} - bx lr - - .align 2 -.LGOT1: - .word _GLOBAL_OFFSET_TABLE_-(.LPIC1+8) - .word analysis_state(GOT) -.Ltwos: - .float 2.0 -/* - * vfp_FoldSubs.S - * Author: Mateusz Szpakowski - */ - - .arch armv6 - .fpu vfp - .eabi_attribute 20, 1 - .eabi_attribute 21, 1 - .eabi_attribute 23, 3 - .eabi_attribute 24, 1 - .eabi_attribute 25, 1 - .eabi_attribute 26, 2 - .eabi_attribute 30, 2 - .eabi_attribute 18, 4 - .text - .align 2 - /***** - * fold array by 3 - ******/ -.Lzeros: - .float 0.0,0.0 - - .global vfp_foldArrayBy3_ll31 - .type vfp_foldArrayBy3_ll31, %function -vfp_foldArrayBy3_ll31: - push {r4,r5,r6,lr} - vpush {d8,d9,d10,d11,d12} - - ldr r0,[r0] // ss0 - ldrd r2,[r1,#8] // tmp0 - add r2,r0,r2,lsl #2 - add r3,r0,r3,lsl #2 - ldrd r4,[r1] // di,dest - add r6,r0,r4,lsl #2 // end - - fldd d12,.Lzeros - sub r6,r6,#4*7 - cmp r0,r6 - bhs .Lendf3loop2 -.Lf3loop2: -.macro FOLDBY3_CORE - fldmias r0!,{s0,s1,s2,s3,s4,s5,s6,s7} - fldmias r2!,{s8,s9,s10,s11,s12,s13,s14,s15} - fldmias r3!,{s16,s17,s18,s19,s20,s21,s22,s23} - - fadds s0,s0,s8 - fadds s1,s1,s9 - fadds s2,s2,s10 - fadds s3,s3,s11 - fadds s4,s4,s12 - fadds s5,s5,s13 - fadds s6,s6,s14 - fadds s7,s7,s15 - fadds s0,s0,s16 - fadds s1,s1,s17 - fadds s2,s2,s18 - fadds s3,s3,s19 - fadds s4,s4,s20 - fadds s5,s5,s21 - fadds s6,s6,s22 - fadds s7,s7,s23 - fstmias r5!,{s0,s1,s2,s3,s4,s5,s6,s7} - - fcmpes s24,s0 - fmstat - fcmpes s25,s1 - fcpysmi s24,s0 - fmstat - fcmpes s24,s2 - fcpysmi s25,s1 - fmstat - fcmpes s25,s3 - fcpysmi s24,s2 - fmstat - fcmpes s24,s4 - fcpysmi s25,s3 - fmstat - fcmpes s25,s5 - fcpysmi s24,s4 - fmstat - fcmpes s24,s6 - fcpysmi s25,s5 - fmstat - fcmpes s25,s7 - fcpysmi s24,s6 - fmstat - fcpysmi s25,s7 -.endm - FOLDBY3_CORE - - cmp r0,r6 - blo .Lf3loop2 -.Lendf3loop2: - and r4,r4,#7 - cmp r4,#4 - blo .Lf3lt4 - beq .Lf3eq4 - - cmp r4,#6 - blo .Lf3lt6 - beq .Lf3eq6 - - fldmias r0!,{s0,s1,s2,s3,s4,s5,s6} - fldmias r2!,{s8,s9,s10,s11,s12,s13,s14} - fldmias r3!,{s16,s17,s18,s19,s20,s21,s22} - - fadds s0,s0,s8 - fadds s1,s1,s9 - fadds s2,s2,s10 - fadds s3,s3,s11 - fadds s4,s4,s12 - fadds s5,s5,s13 - fadds s6,s6,s14 - fadds s0,s0,s16 - fadds s1,s1,s17 - fadds s2,s2,s18 - fadds s3,s3,s19 - fadds s4,s4,s20 - fadds s5,s5,s21 - fadds s6,s6,s22 - fstmias r5!,{s0,s1,s2,s3,s4,s5,s6} - - fcmpes s24,s0 - fmstat - fcmpes s25,s1 - fcpysmi s24,s0 - fmstat - fcmpes s24,s2 - fcpysmi s25,s1 - fmstat - fcmpes s25,s3 - fcpysmi s24,s2 - fmstat - fcmpes s24,s4 - fcpysmi s25,s3 - fmstat - fcmpes s25,s5 - fcpysmi s24,s4 - fmstat - fcmpes s24,s6 - fcpysmi s25,s5 - fmstat - fcpysmi s24,s6 - b .Lf3end -.Lf3eq6: - fldmias r0!,{s0,s1,s2,s3,s4,s5} - fldmias r2!,{s8,s9,s10,s11,s12,s13} - fldmias r3!,{s16,s17,s18,s19,s20,s21} - - fadds s0,s0,s8 - fadds s1,s1,s9 - fadds s2,s2,s10 - fadds s3,s3,s11 - fadds s4,s4,s12 - fadds s5,s5,s13 - fadds s0,s0,s16 - fadds s1,s1,s17 - fadds s2,s2,s18 - fadds s3,s3,s19 - fadds s4,s4,s20 - fadds s5,s5,s21 - fstmias r5!,{s0,s1,s2,s3,s4,s5} - - fcmpes s24,s0 - fmstat - fcmpes s25,s1 - fcpysmi s24,s0 - fmstat - fcmpes s24,s2 - fcpysmi s25,s1 - fmstat - fcmpes s25,s3 - fcpysmi s24,s2 - fmstat - fcmpes s24,s4 - fcpysmi s25,s3 - fmstat - fcmpes s25,s5 - fcpysmi s24,s4 - fmstat - fcpysmi s25,s5 - b .Lf3end -.Lf3lt6: - fldmias r0!,{s0,s1,s2,s3,s4} - fldmias r2!,{s8,s9,s10,s11,s12} - fldmias r3!,{s16,s17,s18,s19,s20} - - fadds s0,s0,s8 - fadds s1,s1,s9 - fadds s2,s2,s10 - fadds s3,s3,s11 - fadds s4,s4,s12 - fadds s0,s0,s16 - fadds s1,s1,s17 - fadds s2,s2,s18 - fadds s3,s3,s19 - fadds s4,s4,s20 - fstmias r5!,{s0,s1,s2,s3,s4} - - fcmpes s24,s0 - fmstat - fcmpes s25,s1 - fcpysmi s24,s0 - fmstat - fcmpes s24,s2 - fcpysmi s25,s1 - fmstat - fcmpes s25,s3 - fcpysmi s24,s2 - fmstat - fcmpes s24,s4 - fcpysmi s25,s3 - fmstat - fcpysmi s24,s4 - b .Lf3end -.Lf3eq4: - fldmias r0!,{s0,s1,s2,s3} - fldmias r2!,{s8,s9,s10,s11} - fldmias r3!,{s16,s17,s18,s19} - - fadds s0,s0,s8 - fadds s1,s1,s9 - fadds s2,s2,s10 - fadds s3,s3,s11 - fadds s0,s0,s16 - fadds s1,s1,s17 - fadds s2,s2,s18 - fadds s3,s3,s19 - fstmias r5!,{s0,s1,s2,s3} - - fcmpes s24,s0 - fmstat - fcmpes s25,s1 - fcpysmi s24,s0 - fmstat - fcmpes s24,s2 - fcpysmi s25,s1 - fmstat - fcmpes s25,s3 - fcpysmi s24,s2 - fmstat - fcpysmi s25,s3 - b .Lf3end -.Lf3lt4: - cmp r4,#2 - blo .Lf3lt2 - beq .Lf3eq2 - - fldmias r0!,{s0,s1,s2} - fldmias r2!,{s8,s9,s10} - fldmias r3!,{s16,s17,s18} - - fadds s0,s0,s8 - fadds s1,s1,s9 - fadds s2,s2,s10 - fadds s0,s0,s16 - fadds s1,s1,s17 - fadds s2,s2,s18 - fstmias r5!,{s0,s1,s2} - - fcmpes s24,s0 - fmstat - fcmpes s25,s1 - fcpysmi s24,s0 - fmstat - fcmpes s24,s2 - fcpysmi s25,s1 - fmstat - fcpysmi s24,s2 - b .Lf3end -.Lf3eq2: - fldmias r0!,{s0,s1} - fldmias r2!,{s8,s9} - fldmias r3!,{s16,s17} - - fadds s0,s0,s8 - fadds s1,s1,s9 - fadds s0,s0,s16 - fadds s1,s1,s17 - fstmias r5!,{s0,s1} - - fcmpes s24,s0 - fmstat - fcmpes s25,s1 - fcpysmi s24,s0 - fmstat - fcpysmi s25,s1 - b .Lf3end -.Lf3lt2: - cmp r4,#0 - beq .Lf3eq0 - fldmias r0!,{s0} - fldmias r2!,{s1} - fldmias r3!,{s2} - - fadds s0,s0,s1 - fadds s0,s0,s2 - fstmias r5!,{s0} - - fcmpes s24,s0 - fmstat - fcpysmi s24,s0 -.Lf3eq0: - -.Lf3end: - fcmpes s24,s25 - fmstat - fcpysmi s24,s25 - - fmrs r0,s24 - vpop {d8,d9,d10,d11,d12} - pop {r4,r5,r6,lr} - bx lr - -.Lzeros1: - .float 0.0,0.0 - - .global vfp_foldArrayBy3_lge31 - .type vfp_foldArrayBy3_lge31, %function -vfp_foldArrayBy3_lge31: - push {r4,r5,r6,lr} - vpush {d8,d9,d10,d11,d12} - - ldr r0,[r0] // ss0 - ldrd r2,[r1,#8] // tmp0 - add r2,r0,r2,lsl #2 - add r3,r0,r3,lsl #2 - ldrd r4,[r1] // di,dest - add r6,r0,r4,lsl #2 // end - - fldd d12,.Lzeros1 - sub r6,r6,#4*31 - cmp r0,r6 - bhs .Lendf3loop4 -.Lf3loop4: - FOLDBY3_CORE - FOLDBY3_CORE - FOLDBY3_CORE - FOLDBY3_CORE - - cmp r0,r6 - blo .Lf3loop4 -.Lendf3loop4: - add r6,r6,#4*24 - cmp r0,r6 - bhs .Lendf3loop2 -.Lf3loop5: - FOLDBY3_CORE - - cmp r0,r6 - blo .Lf3loop5 -.Lendf3loop5: - b .Lendf3loop2 - -.Lfoldby3sel: -.rept 31 - .word vfp_foldArrayBy3_ll31 -.endr - .word vfp_foldArrayBy3_lge31 - -.Lzeros2: - .float 0.0,0.0 - /***** - * fold array by 4 - ******/ - .global vfp_foldArrayBy4_ll31 - .type vfp_foldArrayBy4_ll31, %function -vfp_foldArrayBy4_ll31: - push {r4,r5,r6,r7,r8,lr} - vpush {d8,d9,d10,d11,d12} - - ldr r0,[r0] // ss0 - ldrd r2,[r1,#8] // tmp0 - ldr r6,[r1,#16] // tmp2 - add r2,r0,r2,lsl #2 - add r3,r0,r3,lsl #2 - add r6,r0,r6,lsl #2 - ldrd r4,[r1] // di,dest - add r7,r0,r4,lsl #2 // end - - fldd d12,.Lzeros2 - sub r7,r7,#4*7 - cmp r0,r7 - bhs .Lendf4loop2 -.Lf4loop2: -.macro FOLDBY4_CORE - fldmias r0!,{s0,s1,s2,s3,s4,s5,s6,s7} - fldmias r2!,{s8,s9,s10,s11,s12,s13,s14,s15} - fldmias r3!,{s16,s17,s18,s19,s20,s21,s22,s23} - - fadds s0,s0,s8 - fadds s1,s1,s9 - fadds s2,s2,s10 - fadds s3,s3,s11 - fadds s4,s4,s12 - fadds s5,s5,s13 - fadds s6,s6,s14 - fadds s7,s7,s15 - fadds s0,s0,s16 - fadds s1,s1,s17 - fadds s2,s2,s18 - fadds s3,s3,s19 - fldmias r6!,{s8,s9,s10,s11,s12,s13,s14,s15} - fadds s4,s4,s20 - fadds s5,s5,s21 - fadds s6,s6,s22 - fadds s7,s7,s23 - fadds s0,s0,s8 - fadds s1,s1,s9 - fadds s2,s2,s10 - fadds s3,s3,s11 - fadds s4,s4,s12 - fadds s5,s5,s13 - fadds s6,s6,s14 - fadds s7,s7,s15 - - fstmias r5!,{s0,s1,s2,s3,s4,s5,s6,s7} - - fcmpes s24,s0 - fmstat - fcmpes s25,s1 - fcpysmi s24,s0 - fmstat - fcmpes s24,s2 - fcpysmi s25,s1 - fmstat - fcmpes s25,s3 - fcpysmi s24,s2 - fmstat - fcmpes s24,s4 - fcpysmi s25,s3 - fmstat - fcmpes s25,s5 - fcpysmi s24,s4 - fmstat - fcmpes s24,s6 - fcpysmi s25,s5 - fmstat - fcmpes s25,s7 - fcpysmi s24,s6 - fmstat - fcpysmi s25,s7 -.endm - FOLDBY4_CORE - - cmp r0,r7 - blo .Lf4loop2 -.Lendf4loop2: - and r4,r4,#7 - cmp r4,#4 - blo .Lf4lt4 - beq .Lf4eq4 - - cmp r4,#6 - blo .Lf4lt6 - beq .Lf4eq6 - - fldmias r0!,{s0,s1,s2,s3,s4,s5,s6} - fldmias r2!,{s8,s9,s10,s11,s12,s13,s14} - fldmias r3!,{s16,s17,s18,s19,s20,s21,s22} - - fadds s0,s0,s8 - fadds s1,s1,s9 - fadds s2,s2,s10 - fadds s3,s3,s11 - fadds s4,s4,s12 - fadds s5,s5,s13 - fadds s6,s6,s14 - fadds s0,s0,s16 - fadds s1,s1,s17 - fadds s2,s2,s18 - fadds s3,s3,s19 - fadds s4,s4,s20 - fadds s5,s5,s21 - fadds s6,s6,s22 - fldmias r6!,{s16,s17,s18,s19,s20,s21,s22} - fadds s0,s0,s16 - fadds s1,s1,s17 - fadds s2,s2,s18 - fadds s3,s3,s19 - fadds s4,s4,s20 - fadds s5,s5,s21 - fadds s6,s6,s22 - fstmias r5!,{s0,s1,s2,s3,s4,s5,s6} - - fcmpes s24,s0 - fmstat - fcmpes s25,s1 - fcpysmi s24,s0 - fmstat - fcmpes s24,s2 - fcpysmi s25,s1 - fmstat - fcmpes s25,s3 - fcpysmi s24,s2 - fmstat - fcmpes s24,s4 - fcpysmi s25,s3 - fmstat - fcmpes s25,s5 - fcpysmi s24,s4 - fmstat - fcmpes s24,s6 - fcpysmi s25,s5 - fmstat - fcpysmi s24,s6 - b .Lf4end -.Lf4eq6: - fldmias r0!,{s0,s1,s2,s3,s4,s5} - fldmias r2!,{s6,s7,s8,s9,s10,s11} - fldmias r3!,{s12,s13,s14,s15,s16,s17} - fldmias r6!,{s18,s19,s20,s21,s22,s23} - - fadds s0,s0,s6 - fadds s1,s1,s7 - fadds s2,s2,s8 - fadds s3,s3,s9 - fadds s4,s4,s10 - fadds s5,s5,s11 - fadds s0,s0,s12 - fadds s1,s1,s13 - fadds s2,s2,s14 - fadds s3,s3,s15 - fadds s4,s4,s16 - fadds s5,s5,s17 - fadds s0,s0,s18 - fadds s1,s1,s19 - fadds s2,s2,s20 - fadds s3,s3,s21 - fadds s4,s4,s22 - fadds s5,s5,s23 - fstmias r5!,{s0,s1,s2,s3,s4,s5} - - fcmpes s24,s0 - fmstat - fcmpes s25,s1 - fcpysmi s24,s0 - fmstat - fcmpes s24,s2 - fcpysmi s25,s1 - fmstat - fcmpes s25,s3 - fcpysmi s24,s2 - fmstat - fcmpes s24,s4 - fcpysmi s25,s3 - fmstat - fcmpes s25,s5 - fcpysmi s24,s4 - fmstat - fcpysmi s25,s5 - b .Lf4end -.Lf4lt6: - fldmias r0!,{s0,s1,s2,s3,s4} - fldmias r2!,{s6,s7,s8,s9,s10} - fldmias r3!,{s12,s13,s14,s15,s16} - fldmias r6!,{s18,s19,s20,s21,s22} - - fadds s0,s0,s6 - fadds s1,s1,s7 - fadds s2,s2,s8 - fadds s3,s3,s9 - fadds s4,s4,s10 - fadds s0,s0,s12 - fadds s1,s1,s13 - fadds s2,s2,s14 - fadds s3,s3,s15 - fadds s4,s4,s16 - fadds s0,s0,s18 - fadds s1,s1,s19 - fadds s2,s2,s20 - fadds s3,s3,s21 - fadds s4,s4,s22 - fstmias r5!,{s0,s1,s2,s3,s4} - - fcmpes s24,s0 - fmstat - fcmpes s25,s1 - fcpysmi s24,s0 - fmstat - fcmpes s24,s2 - fcpysmi s25,s1 - fmstat - fcmpes s25,s3 - fcpysmi s24,s2 - fmstat - fcmpes s24,s4 - fcpysmi s25,s3 - fmstat - fcpysmi s24,s4 - b .Lf4end -.Lf4eq4: - fldmias r0!,{s0,s1,s2,s3} - fldmias r2!,{s6,s7,s8,s9} - fldmias r3!,{s12,s13,s14,s15} - fldmias r6!,{s18,s19,s20,s21} - - fadds s0,s0,s6 - fadds s1,s1,s7 - fadds s2,s2,s8 - fadds s3,s3,s9 - fadds s0,s0,s12 - fadds s1,s1,s13 - fadds s2,s2,s14 - fadds s3,s3,s15 - fadds s0,s0,s18 - fadds s1,s1,s19 - fadds s2,s2,s20 - fadds s3,s3,s21 - fstmias r5!,{s0,s1,s2,s3} - - fcmpes s24,s0 - fmstat - fcmpes s25,s1 - fcpysmi s24,s0 - fmstat - fcmpes s24,s2 - fcpysmi s25,s1 - fmstat - fcmpes s25,s3 - fcpysmi s24,s2 - fmstat - fcpysmi s25,s3 - b .Lf4end -.Lf4lt4: - cmp r4,#2 - blo .Lf4lt2 - beq .Lf4eq2 - - fldmias r0!,{s0,s1,s2} - fldmias r2!,{s6,s7,s8} - fldmias r3!,{s12,s13,s14} - fldmias r6!,{s18,s19,s20} - - fadds s0,s0,s6 - fadds s1,s1,s7 - fadds s2,s2,s8 - fadds s0,s0,s12 - fadds s1,s1,s13 - fadds s2,s2,s14 - fadds s0,s0,s18 - fadds s1,s1,s19 - fadds s2,s2,s20 - fstmias r5!,{s0,s1,s2} - - fcmpes s24,s0 - fmstat - fcmpes s25,s1 - fcpysmi s24,s0 - fmstat - fcmpes s24,s2 - fcpysmi s25,s1 - fmstat - fcpysmi s24,s2 - b .Lf4end -.Lf4eq2: - fldmias r0!,{s0,s1} - fldmias r2!,{s6,s7} - fldmias r3!,{s12,s13} - fldmias r6!,{s18,s19} - - fadds s0,s0,s6 - fadds s1,s1,s7 - fadds s0,s0,s12 - fadds s1,s1,s13 - fadds s0,s0,s18 - fadds s1,s1,s19 - fstmias r5!,{s0,s1} - - fcmpes s24,s0 - fmstat - fcmpes s25,s1 - fcpysmi s24,s0 - fmstat - fcpysmi s25,s1 - b .Lf4end -.Lf4lt2: - cmp r4,#0 - beq .Lf4eq0 - fldmias r0!,{s0} - fldmias r2!,{s1} - fldmias r3!,{s2} - fldmias r6!,{s3} - - fadds s0,s0,s1 - fadds s0,s0,s2 - fadds s0,s0,s3 - fstmias r5!,{s0} - - fcmpes s24,s0 - fmstat - fcpysmi s24,s0 -.Lf4eq0: - -.Lf4end: - fcmpes s24,s25 - fmstat - fcpysmi s24,s25 - - fmrs r0,s24 - vpop {d8,d9,d10,d11,d12} - pop {r4,r5,r6,r7,r8,lr} - bx lr - -.Lzeros2_1: - .float 0.0,0.0 - .global vfp_foldArrayBy4_lge31 - .type vfp_foldArrayBy4_lge31, %function -vfp_foldArrayBy4_lge31: - push {r4,r5,r6,r7,r8,lr} - vpush {d8,d9,d10,d11,d12} - - ldr r0,[r0] // ss0 - ldrd r2,[r1,#8] // tmp0 - ldr r6,[r1,#16] // tmp2 - add r2,r0,r2,lsl #2 - add r3,r0,r3,lsl #2 - add r6,r0,r6,lsl #2 - ldrd r4,[r1] // di,dest - add r7,r0,r4,lsl #2 // end - - fldd d12,.Lzeros2_1 - sub r7,r7,#4*31 - cmp r0,r7 - bhs .Lendf4loop4 -.Lf4loop4: - FOLDBY4_CORE - FOLDBY4_CORE - FOLDBY4_CORE - FOLDBY4_CORE - - cmp r0,r7 - blo .Lf4loop4 -.Lendf4loop4: - add r7,r7,#4*24 - cmp r0,r7 - bhs .Lendf4loop2 -.Lf4loop5: - FOLDBY4_CORE - - cmp r0,r7 - blo .Lf4loop5 -.Lendf4loop5: - b .Lendf4loop2 - -.Lfoldby4sel: -.rept 31 - .word vfp_foldArrayBy4_ll31 -.endr - .word vfp_foldArrayBy4_lge31 - -.Lzeros3: - .float 0.0,0.0 - /***** - * fold array by 5 - ******/ - .global vfp_foldArrayBy5_ll31 - .type vfp_foldArrayBy5_ll31, %function -vfp_foldArrayBy5_ll31: - push {r4,r5,r6,r7,r8,lr} - vpush {d8,d9,d10,d11,d12} - - ldr r0,[r0] // ss0 - ldrd r2,[r1,#8] // tmp0 - ldrd r6,[r1,#16] // tmp2,tmp3 - add r2,r0,r2,lsl #2 - add r3,r0,r3,lsl #2 - add r6,r0,r6,lsl #2 - add r7,r0,r7,lsl #2 - ldrd r4,[r1] // di,dest - add r8,r0,r4,lsl #2 // end - - fldd d12,.Lzeros3 - sub r8,r8,#4*7 - cmp r0,r8 - bhs .Lendf5loop2 -.Lf5loop2: -.macro FOLDBY5_CORE - fldmias r0!,{s0,s1,s2,s3,s4,s5,s6,s7} - fldmias r2!,{s8,s9,s10,s11,s12,s13,s14,s15} - fldmias r3!,{s16,s17,s18,s19,s20,s21,s22,s23} - - fadds s0,s0,s8 - fadds s1,s1,s9 - fadds s2,s2,s10 - fadds s3,s3,s11 - fadds s4,s4,s12 - fadds s5,s5,s13 - fadds s6,s6,s14 - fadds s7,s7,s15 - fadds s0,s0,s16 - fadds s1,s1,s17 - fadds s2,s2,s18 - fadds s3,s3,s19 - fldmias r6!,{s8,s9,s10,s11,s12,s13,s14,s15} - fadds s4,s4,s20 - fadds s5,s5,s21 - fadds s6,s6,s22 - fadds s7,s7,s23 - fadds s0,s0,s8 - fadds s1,s1,s9 - fadds s2,s2,s10 - fadds s3,s3,s11 - fadds s4,s4,s12 - fldmias r7!,{s16,s17,s18,s19,s20,s21,s22,s23} - fadds s5,s5,s13 - fadds s6,s6,s14 - fadds s7,s7,s15 - fadds s0,s0,s16 - fadds s1,s1,s17 - fadds s2,s2,s18 - fadds s3,s3,s19 - fadds s4,s4,s20 - fadds s5,s5,s21 - fadds s6,s6,s22 - fadds s7,s7,s23 - - fstmias r5!,{s0,s1,s2,s3,s4,s5,s6,s7} - - fcmpes s24,s0 - fmstat - fcmpes s25,s1 - fcpysmi s24,s0 - fmstat - fcmpes s24,s2 - fcpysmi s25,s1 - fmstat - fcmpes s25,s3 - fcpysmi s24,s2 - fmstat - fcmpes s24,s4 - fcpysmi s25,s3 - fmstat - fcmpes s25,s5 - fcpysmi s24,s4 - fmstat - fcmpes s24,s6 - fcpysmi s25,s5 - fmstat - fcmpes s25,s7 - fcpysmi s24,s6 - fmstat - fcpysmi s25,s7 -.endm - FOLDBY5_CORE - - cmp r0,r8 - blo .Lf5loop2 -.Lendf5loop2: - and r4,r4,#7 - cmp r4,#4 - blo .Lf5lt4 - beq .Lf5eq4 - - cmp r4,#6 - blo .Lf5lt6 - beq .Lf5eq6 - - fldmias r0!,{s0,s1,s2,s3,s4,s5,s6} - fldmias r2!,{s8,s9,s10,s11,s12,s13,s14} - fldmias r3!,{s16,s17,s18,s19,s20,s21,s22} - - fadds s0,s0,s8 - fadds s1,s1,s9 - fadds s2,s2,s10 - fadds s3,s3,s11 - fadds s4,s4,s12 - fadds s5,s5,s13 - fadds s6,s6,s14 - fadds s0,s0,s16 - fadds s1,s1,s17 - fadds s2,s2,s18 - fadds s3,s3,s19 - fadds s4,s4,s20 - fadds s5,s5,s21 - fadds s6,s6,s22 - fldmias r6!,{s16,s17,s18,s19,s20,s21,s22} - fadds s0,s0,s16 - fadds s1,s1,s17 - fadds s2,s2,s18 - fadds s3,s3,s19 - fadds s4,s4,s20 - fadds s5,s5,s21 - fadds s6,s6,s22 - fldmias r7!,{s16,s17,s18,s19,s20,s21,s22} - fadds s0,s0,s16 - fadds s1,s1,s17 - fadds s2,s2,s18 - fadds s3,s3,s19 - fadds s4,s4,s20 - fadds s5,s5,s21 - fadds s6,s6,s22 - fstmias r5!,{s0,s1,s2,s3,s4,s5,s6} - - fcmpes s24,s0 - fmstat - fcmpes s25,s1 - fcpysmi s24,s0 - fmstat - fcmpes s24,s2 - fcpysmi s25,s1 - fmstat - fcmpes s25,s3 - fcpysmi s24,s2 - fmstat - fcmpes s24,s4 - fcpysmi s25,s3 - fmstat - fcmpes s25,s5 - fcpysmi s24,s4 - fmstat - fcmpes s24,s6 - fcpysmi s25,s5 - fmstat - fcpysmi s24,s6 - b .Lf5end -.Lf5eq6: - fldmias r0!,{s0,s1,s2,s3,s4,s5} - fldmias r2!,{s6,s7,s8,s9,s10,s11} - fldmias r3!,{s12,s13,s14,s15,s16,s17} - fldmias r6!,{s18,s19,s20,s21,s22,s23} - - fadds s0,s0,s6 - fadds s1,s1,s7 - fadds s2,s2,s8 - fadds s3,s3,s9 - fadds s4,s4,s10 - fadds s5,s5,s11 - fadds s0,s0,s12 - fadds s1,s1,s13 - fadds s2,s2,s14 - fadds s3,s3,s15 - fadds s4,s4,s16 - fadds s5,s5,s17 - fadds s0,s0,s18 - fadds s1,s1,s19 - fadds s2,s2,s20 - fadds s3,s3,s21 - fadds s4,s4,s22 - fadds s5,s5,s23 - fldmias r7!,{s18,s19,s20,s21,s22,s23} - fadds s0,s0,s18 - fadds s1,s1,s19 - fadds s2,s2,s20 - fadds s3,s3,s21 - fadds s4,s4,s22 - fadds s5,s5,s23 - fstmias r5!,{s0,s1,s2,s3,s4,s5} - - fcmpes s24,s0 - fmstat - fcmpes s25,s1 - fcpysmi s24,s0 - fmstat - fcmpes s24,s2 - fcpysmi s25,s1 - fmstat - fcmpes s25,s3 - fcpysmi s24,s2 - fmstat - fcmpes s24,s4 - fcpysmi s25,s3 - fmstat - fcmpes s25,s5 - fcpysmi s24,s4 - fmstat - fcpysmi s25,s5 - b .Lf5end -.Lf5lt6: - fldmias r0!,{s0,s1,s2,s3,s4} - fldmias r2!,{s6,s7,s8,s9,s10} - fldmias r3!,{s12,s13,s14,s15,s16} - fldmias r6!,{s18,s19,s20,s21,s22} - - fadds s0,s0,s6 - fadds s1,s1,s7 - fadds s2,s2,s8 - fadds s3,s3,s9 - fadds s4,s4,s10 - fadds s0,s0,s12 - fadds s1,s1,s13 - fadds s2,s2,s14 - fadds s3,s3,s15 - fadds s4,s4,s16 - fadds s0,s0,s18 - fadds s1,s1,s19 - fadds s2,s2,s20 - fadds s3,s3,s21 - fadds s4,s4,s22 - fldmias r7!,{s18,s19,s20,s21,s22} - fadds s0,s0,s18 - fadds s1,s1,s19 - fadds s2,s2,s20 - fadds s3,s3,s21 - fadds s4,s4,s22 - fstmias r5!,{s0,s1,s2,s3,s4} - - fcmpes s24,s0 - fmstat - fcmpes s25,s1 - fcpysmi s24,s0 - fmstat - fcmpes s24,s2 - fcpysmi s25,s1 - fmstat - fcmpes s25,s3 - fcpysmi s24,s2 - fmstat - fcmpes s24,s4 - fcpysmi s25,s3 - fmstat - fcpysmi s24,s4 - b .Lf5end -.Lf5eq4: - fldmias r0!,{s0,s1,s2,s3} - fldmias r2!,{s4,s5,s6,s7} - fldmias r3!,{s8,s9,s10,s11} - fldmias r6!,{s12,s13,s14,s15} - fldmias r7!,{s16,s17,s18,s19} - - fadds s0,s0,s4 - fadds s1,s1,s5 - fadds s2,s2,s6 - fadds s3,s3,s7 - fadds s0,s0,s8 - fadds s1,s1,s9 - fadds s2,s2,s10 - fadds s3,s3,s11 - fadds s0,s0,s12 - fadds s1,s1,s13 - fadds s2,s2,s14 - fadds s3,s3,s15 - fadds s0,s0,s16 - fadds s1,s1,s17 - fadds s2,s2,s18 - fadds s3,s3,s19 - fstmias r5!,{s0,s1,s2,s3} - - fcmpes s24,s0 - fmstat - fcmpes s25,s1 - fcpysmi s24,s0 - fmstat - fcmpes s24,s2 - fcpysmi s25,s1 - fmstat - fcmpes s25,s3 - fcpysmi s24,s2 - fmstat - fcpysmi s25,s3 - b .Lf5end -.Lf5lt4: - cmp r4,#2 - blo .Lf5lt2 - beq .Lf5eq2 - - fldmias r0!,{s0,s1,s2} - fldmias r2!,{s4,s5,s6} - fldmias r3!,{s8,s9,s10} - fldmias r6!,{s12,s13,s14} - fldmias r7!,{s16,s17,s18} - - fadds s0,s0,s4 - fadds s1,s1,s5 - fadds s2,s2,s6 - fadds s0,s0,s8 - fadds s1,s1,s9 - fadds s2,s2,s10 - fadds s0,s0,s12 - fadds s1,s1,s13 - fadds s2,s2,s14 - fadds s0,s0,s16 - fadds s1,s1,s17 - fadds s2,s2,s18 - fstmias r5!,{s0,s1,s2} - - fcmpes s24,s0 - fmstat - fcmpes s25,s1 - fcpysmi s24,s0 - fmstat - fcmpes s24,s2 - fcpysmi s25,s1 - fmstat - fcpysmi s24,s2 - b .Lf5end -.Lf5eq2: - fldmias r0!,{s0,s1} - fldmias r2!,{s4,s5} - fldmias r3!,{s8,s9} - fldmias r6!,{s12,s13} - fldmias r7!,{s16,s17} - - fadds s0,s0,s4 - fadds s1,s1,s5 - fadds s0,s0,s8 - fadds s1,s1,s9 - fadds s0,s0,s12 - fadds s1,s1,s13 - fadds s0,s0,s16 - fadds s1,s1,s17 - fstmias r5!,{s0,s1} - - fcmpes s24,s0 - fmstat - fcmpes s25,s1 - fcpysmi s24,s0 - fmstat - fcpysmi s25,s1 - b .Lf5end -.Lf5lt2: - cmp r4,#0 - beq .Lf5eq0 - fldmias r0!,{s0} - fldmias r2!,{s1} - fldmias r3!,{s2} - fldmias r6!,{s3} - fldmias r7!,{s4} - - fadds s0,s0,s1 - fadds s0,s0,s2 - fadds s0,s0,s3 - fadds s0,s0,s4 - fstmias r5!,{s0} - - fcmpes s24,s0 - fmstat - fcpysmi s24,s0 -.Lf5eq0: - -.Lf5end: - fcmpes s24,s25 - fmstat - fcpysmi s24,s25 - - fmrs r0,s24 - vpop {d8,d9,d10,d11,d12} - pop {r4,r5,r6,r7,r8,lr} - bx lr - -.Lzeros3_1: - .float 0.0,0.0 - - .global vfp_foldArrayBy5_lge31 - .type vfp_foldArrayBy5_lge31, %function -vfp_foldArrayBy5_lge31: - push {r4,r5,r6,r7,r8,lr} - vpush {d8,d9,d10,d11,d12} - - ldr r0,[r0] // ss0 - ldrd r2,[r1,#8] // tmp0 - ldrd r6,[r1,#16] // tmp2,tmp3 - add r2,r0,r2,lsl #2 - add r3,r0,r3,lsl #2 - add r6,r0,r6,lsl #2 - add r7,r0,r7,lsl #2 - ldrd r4,[r1] // di,dest - add r8,r0,r4,lsl #2 // end - - fldd d12,.Lzeros3_1 - sub r8,r8,#4*31 - cmp r0,r8 - bhs .Lendf5loop4 -.Lf5loop4: - FOLDBY5_CORE - FOLDBY5_CORE - FOLDBY5_CORE - FOLDBY5_CORE - - cmp r0,r8 - blo .Lf5loop4 -.Lendf5loop4: - add r8,r8,#4*24 - cmp r0,r8 - bhs .Lendf5loop2 -.Lf5loop5: - FOLDBY5_CORE - - cmp r0,r8 - blo .Lf5loop5 -.Lendf5loop5: - b .Lendf5loop2 - -.Lfoldby5sel: -.rept 31 - .word vfp_foldArrayBy5_ll31 -.endr - .word vfp_foldArrayBy5_lge31 - -.Lzeros4: - .float 0.0,0.0 - /***** - * fold array by 2 - ******/ - .global vfp_foldArrayBy2_ll31 - .type vfp_foldArrayBy2_ll31, %function -vfp_foldArrayBy2_ll31: - push {r4,r5,r6,lr} - vpush {d8,d9,d10,d11,d12} - - ldr r0,[r0,#4] // ss1 - ldrd r2,[r1,#8] // tmp0 - add r2,r0,r2,lsl #2 - add r3,r0,r3,lsl #2 - ldrd r4,[r1] // di,dest - add r6,r2,r4,lsl #2 // end - - fldd d12,.Lzeros4 - sub r6,r6,#4*7 - cmp r2,r6 - bhs .Lendf2loop2 -.Lf2loop2: -.macro FOLDBY2_CORE - fldmias r2!,{s0,s1,s2,s3,s4,s5,s6,s7} - fldmias r3!,{s8,s9,s10,s11,s12,s13,s14,s15} - - fadds s0,s0,s8 - fadds s1,s1,s9 - fadds s2,s2,s10 - fadds s3,s3,s11 - fadds s4,s4,s12 - fadds s5,s5,s13 - fadds s6,s6,s14 - fadds s7,s7,s15 - fstmias r5!,{s0,s1,s2,s3,s4,s5,s6,s7} - - fcmpes s24,s0 - fmstat - fcmpes s25,s1 - fcpysmi s24,s0 - fmstat - fcmpes s24,s2 - fcpysmi s25,s1 - fmstat - fcmpes s25,s3 - fcpysmi s24,s2 - fmstat - fcmpes s24,s4 - fcpysmi s25,s3 - fmstat - fcmpes s25,s5 - fcpysmi s24,s4 - fmstat - fcmpes s24,s6 - fcpysmi s25,s5 - fmstat - fcmpes s25,s7 - fcpysmi s24,s6 - fmstat - fcpysmi s25,s7 -.endm - FOLDBY2_CORE - - cmp r2,r6 - blo .Lf2loop2 -.Lendf2loop2: - and r4,r4,#7 - cmp r4,#4 - blo .Lf2lt4 - beq .Lf2eq4 - - cmp r4,#6 - blo .Lf2lt6 - beq .Lf2eq6 - - fldmias r2!,{s0,s1,s2,s3,s4,s5,s6} - fldmias r3!,{s8,s9,s10,s11,s12,s13,s14} - - fadds s0,s0,s8 - fadds s1,s1,s9 - fadds s2,s2,s10 - fadds s3,s3,s11 - fadds s4,s4,s12 - fadds s5,s5,s13 - fadds s6,s6,s14 - fstmias r5!,{s0,s1,s2,s3,s4,s5,s6} - - fcmpes s24,s0 - fmstat - fcmpes s25,s1 - fcpysmi s24,s0 - fmstat - fcmpes s24,s2 - fcpysmi s25,s1 - fmstat - fcmpes s25,s3 - fcpysmi s24,s2 - fmstat - fcmpes s24,s4 - fcpysmi s25,s3 - fmstat - fcmpes s25,s5 - fcpysmi s24,s4 - fmstat - fcmpes s24,s6 - fcpysmi s25,s5 - fmstat - fcpysmi s24,s6 - b .Lf2end -.Lf2eq6: - fldmias r2!,{s0,s1,s2,s3,s4,s5} - fldmias r3!,{s8,s9,s10,s11,s12,s13} - - fadds s0,s0,s8 - fadds s1,s1,s9 - fadds s2,s2,s10 - fadds s3,s3,s11 - fadds s4,s4,s12 - fadds s5,s5,s13 - fstmias r5!,{s0,s1,s2,s3,s4,s5} - - fcmpes s24,s0 - fmstat - fcmpes s25,s1 - fcpysmi s24,s0 - fmstat - fcmpes s24,s2 - fcpysmi s25,s1 - fmstat - fcmpes s25,s3 - fcpysmi s24,s2 - fmstat - fcmpes s24,s4 - fcpysmi s25,s3 - fmstat - fcmpes s25,s5 - fcpysmi s24,s4 - fmstat - fcpysmi s25,s5 - b .Lf2end -.Lf2lt6: - fldmias r2!,{s0,s1,s2,s3,s4} - fldmias r3!,{s8,s9,s10,s11,s12} - - fadds s0,s0,s8 - fadds s1,s1,s9 - fadds s2,s2,s10 - fadds s3,s3,s11 - fadds s4,s4,s12 - - fstmias r5!,{s0,s1,s2,s3,s4} - - fcmpes s24,s0 - fmstat - fcmpes s25,s1 - fcpysmi s24,s0 - fmstat - fcmpes s24,s2 - fcpysmi s25,s1 - fmstat - fcmpes s25,s3 - fcpysmi s24,s2 - fmstat - fcmpes s24,s4 - fcpysmi s25,s3 - fmstat - fcpysmi s24,s4 - b .Lf2end -.Lf2eq4: - fldmias r2!,{s0,s1,s2,s3} - fldmias r3!,{s8,s9,s10,s11} - - fadds s0,s0,s8 - fadds s1,s1,s9 - fadds s2,s2,s10 - fadds s3,s3,s11 - - fstmias r5!,{s0,s1,s2,s3} - - fcmpes s24,s0 - fmstat - fcmpes s25,s1 - fcpysmi s24,s0 - fmstat - fcmpes s24,s2 - fcpysmi s25,s1 - fmstat - fcmpes s25,s3 - fcpysmi s24,s2 - fmstat - fcpysmi s25,s3 - b .Lf2end -.Lf2lt4: - cmp r4,#2 - blo .Lf2lt2 - beq .Lf2eq2 - - fldmias r2!,{s0,s1,s2} - fldmias r3!,{s8,s9,s10} - - fadds s0,s0,s8 - fadds s1,s1,s9 - fadds s2,s2,s10 - - fstmias r5!,{s0,s1,s2} - - fcmpes s24,s0 - fmstat - fcmpes s25,s1 - fcpysmi s24,s0 - fmstat - fcmpes s24,s2 - fcpysmi s25,s1 - fmstat - fcpysmi s24,s2 - b .Lf2end -.Lf2eq2: - fldmias r2!,{s0,s1} - fldmias r3!,{s8,s9} - - fadds s0,s0,s8 - fadds s1,s1,s9 - - fstmias r5!,{s0,s1} - - fcmpes s24,s0 - fmstat - fcmpes s25,s1 - fcpysmi s24,s0 - fmstat - fcpysmi s25,s1 - b .Lf2end -.Lf2lt2: - cmp r4,#0 - beq .Lf2eq0 - fldmias r2!,{s0} - fldmias r3!,{s1} - - fadds s0,s0,s1 - fstmias r5!,{s0} - - fcmpes s24,s0 - fmstat - fcpysmi s24,s0 -.Lf2eq0: - -.Lf2end: - fcmpes s24,s25 - fmstat - fcpysmi s24,s25 - - fmrs r0,s24 - vpop {d8,d9,d10,d11,d12} - pop {r4,r5,r6,lr} - bx lr - - .global vfp_foldArrayBy2_lge31 - .type vfp_foldArrayBy2_lge31, %function -vfp_foldArrayBy2_lge31: - push {r4,r5,r6,lr} - vpush {d8,d9,d10,d11,d12} - - ldr r0,[r0,#4] // ss1 - ldrd r2,[r1,#8] // tmp0 - add r2,r0,r2,lsl #2 - add r3,r0,r3,lsl #2 - ldrd r4,[r1] // di,dest - add r6,r2,r4,lsl #2 // end - - fldd d12,.Lzeros4 - sub r6,r6,#4*31 - cmp r2,r6 - bhs .Lendf2loop4 -.Lf2loop4: - FOLDBY2_CORE - FOLDBY2_CORE - FOLDBY2_CORE - FOLDBY2_CORE - - cmp r2,r6 - blo .Lf2loop4 -.Lendf2loop4: - add r6,r6,#4*24 - cmp r2,r6 - bhs .Lendf2loop2 -.Lf2loop5: - FOLDBY2_CORE - - cmp r2,r6 - blo .Lf2loop5 - b .Lendf2loop2 - -.Lfoldby2sel: -.rept 31 - .word vfp_foldArrayBy2_ll31 -.endr - .word vfp_foldArrayBy2_lge31 - - .align 2 -.Lname: - .string "opt VFP" - - .align 2 - .global vfpFoldMain -vfpFoldMain: - .word .Lfoldby3sel - .word .Lfoldby4sel - .word .Lfoldby5sel - .word .Lfoldby2sel - .word .Lfoldby2sel - .word .Lname - /* - * vfp_GetPowerSpectrum.S - * Author: Mateusz Szpakowski - */ - - .arch armv6 - .fpu vfp - .eabi_attribute 20, 1 - .eabi_attribute 21, 1 - .eabi_attribute 23, 3 - .eabi_attribute 24, 1 - .eabi_attribute 25, 1 - .eabi_attribute 26, 2 - .eabi_attribute 30, 2 - .eabi_attribute 18, 4 - .text - .align 2 - .global _Z20vfp_GetPowerSpectrumPA2_fPfi - .type _Z20vfp_GetPowerSpectrumPA2_fPfi, %function -_Z20vfp_GetPowerSpectrumPA2_fPfi: - push {r4,r5} - vpush {d8,d9,d10,d11,d12,d13,d14,d15} - - ldr r3,.LGOTa -.LPICa: - add r3,pc,r3 - ldr r4,.LGOTa+4 - ldr r4,[r3,r4] - add r5,r2,r2,lsl #1 - fldd d0,[r4,#32] - fmsr s4,r5 - fuitod d1,s4 - faddd d0,d0,d1 - fstd d0,[r4,#32] - - add r2,r0,r2,lsl #3 - sub r2,r2,#15*8 - /* r0 - freqData - * r1 - PowerSpectrum - * r2 - end of freqData - */ - cmp r0,r2 - bhs .Lendmainloopa -.Lmainloopa: - pld [r0,#64] - fldmias r0!,{s0,s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,s13,s14,s15} - fmuls s16,s0,s0 - fmuls s17,s2,s2 - fmuls s18,s4,s4 - fmuls s19,s6,s6 - fmuls s20,s8,s8 - fmuls s21,s10,s10 - fmuls s22,s12,s12 - fmuls s23,s14,s14 - fmacs s16,s1,s1 - fmacs s17,s3,s3 - fmacs s18,s5,s5 - fmacs s19,s7,s7 - fmacs s20,s9,s9 - fmacs s21,s11,s11 - fmacs s22,s13,s13 - fmacs s23,s15,s15 - - pld [r0,#64] - fldmias r0!,{s0,s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,s13,s14,s15} - fstmias r1!,{s16,s17,s18,s19,s20,s21,s22,s23} - - fmuls s24,s0,s0 - fmuls s25,s2,s2 - fmuls s26,s4,s4 - fmuls s27,s6,s6 - fmuls s28,s8,s8 - fmuls s29,s10,s10 - fmuls s30,s12,s12 - fmuls s31,s14,s14 - fmacs s24,s1,s1 - fmacs s25,s3,s3 - fmacs s26,s5,s5 - fmacs s27,s7,s7 - fmacs s28,s9,s9 - fmacs s29,s11,s11 - fmacs s30,s13,s13 - fmacs s31,s15,s15 - fstmias r1!,{s24,s25,s26,s27,s28,s29,s30,s31} - - cmp r0,r2 - blo .Lmainloopa -.Lendmainloopa: - add r2,r2,#8*12 - bhs .Lendsmallloopa -.Lsmallloopa: - fldmias r0!,{s0,s1,s2,s3,s4,s5,s6,s7} - fmuls s16,s0,s0 - fmuls s17,s2,s2 - fmuls s18,s4,s4 - fmuls s19,s6,s6 - fmacs s16,s1,s1 - fmacs s17,s3,s3 - fmacs s18,s5,s5 - fmacs s19,s7,s7 - fstmias r1!,{s16,s17,s18,s19} - cmp r0,r2 - blo .Lsmallloopa -.Lendsmallloopa: - add r2,r2,#8*3 - cmp r0,r2 - beq .Lendmicroloop -.Lmicroloop: - fldmias r0!,{s0,s1} - fmuls s2,s0,s0 - fmacs s2,s1,s1 - fstmias r1!,{s2} - - cmp r0,r2 - blo .Lmicroloop - -.Lendmicroloop: - mov r0,#0 - vpop {d8,d9,d10,d11,d12,d13,d14,d15} - pop {r4,r5} - bx lr - - .align 2 -.LGOTa: - .word _GLOBAL_OFFSET_TABLE_-(.LPICa+8) - .word analysis_state(GOT) - -#endif // __arm__ diff -Nru boinc-app-seti-7.28~svn2781/.pc/215-remove-vfp-Chirp.patch/client/vector/analyzeFuncs_vector.cpp boinc-app-seti-7.28~svn2858/.pc/215-remove-vfp-Chirp.patch/client/vector/analyzeFuncs_vector.cpp --- boinc-app-seti-7.28~svn2781/.pc/215-remove-vfp-Chirp.patch/client/vector/analyzeFuncs_vector.cpp 2015-02-23 15:18:24.000000000 +0000 +++ boinc-app-seti-7.28~svn2858/.pc/215-remove-vfp-Chirp.patch/client/vector/analyzeFuncs_vector.cpp 1970-01-01 00:00:00.000000000 +0000 @@ -1,1613 +0,0 @@ - -// Copyright (c) 1999-2006 Regents of the University of California -// -// FFTW: Copyright (c) 2003,2006 Matteo Frigo -// Copyright (c) 2003,2006 Massachusets Institute of Technology -// -// fft8g.[cpp,h]: Copyright (c) 1995-2001 Takya Ooura - -// This program is free software; you can redistribute it and/or modify it -// under the terms of the GNU General Public License as published by the -// Free Software Foundation; either version 2, or (at your option) any later -// version. - -// This program is distributed in the hope that it will be useful, but WITHOUT -// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for -// more details. - -// You should have received a copy of the GNU General Public License along -// with this program; see the file COPYING. If not, write to the Free Software -// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - -// In addition, as a special exception, the Regents of the University of -// California give permission to link the code of this program with libraries -// that provide specific optimized fast Fourier transform (FFT) functions -// as an alternative to FFTW and distribute a linked executable and -// source code. You must obey the GNU General Public License in all -// respects for all of the code used other than the FFT library itself. -// Any modification required to support these libraries must be distributed -// under the terms of this license. If you modify this program, you may extend -// this exception to your version of the program, but you are not obligated to -// do so. If you do not wish to do so, delete this exception statement from -// your version. Please be aware that FFTW is not covered by this exception, -// therefore you may not use FFTW in any derivative work so modified without -// permission of the authors of FFTW. -// -// $Id: analyzeFuncs_vector.cpp,v 1.1.2.29 2007/08/16 10:13:56 charlief Exp $ - -#include "sah_config.h" - -#ifdef __APPLE_CC__ -#define _CPP_CMATH // Block inclusion of which undefines isnan() (for using GCC 3 on OS X) -#define _GLIBCXX_CMATH // Block inclusion of which undefines isnan() (for using GCC 4 on OS X) -#endif -#ifdef _WIN32 -#define uint32_t unsigned long -#endif - -#include -#include -#include -#include -#include -#ifdef HAVE_FLOAT_H -#include -#endif -#ifdef HAVE_FLOATINGPOINT_H -#include -#endif -#ifdef HAVE_IEEEFP_H -#include -#endif - -#include "util.h" -#include "s_util.h" -#include "boinc_api.h" -#ifdef BOINC_APP_GRAPHICS -#include "sah_gfx_main.h" -#endif -#include "diagnostics.h" -#include "filesys.h" -#include "str_replace.h" -#include "sighandler.h" -#include "analyzeFuncs.h" -#include "analyzeFuncs_vector.h" - -#include "hires_timer.h" - -#include "chirpfft.h" -#include "analyzePoT.h" -#include "pulsefind.h" -#include "sincos.h" -#ifdef USE_ASMLIB -#include "asmlib.h" -#endif - -#ifndef M_PI -#define M_PI 3.14159265358979323846 -#endif - -#ifdef __APPLE_CC__ -#include -#ifdef HAVE___ISNAN -#define isnotnan(x) (!isnan(x)) -#else -#define isnotnan(x) ((x) == (x)) -#endif -#endif - - -#ifndef __APPLE_CC__ -#if defined(HAVE__ISNAN) -#define isnotnan(x) (!_isnan(x)) -#elif defined(HAVE_ISNAN) -#define isnotnan(x) (!isnan(x)) -#elif defined(HAVE___ISNAN) -#define isnotnan(x) (!__isnan(x)) -#else -#define isnotnan(x) ((x) == (x)) -#endif -#endif - -static bool do_print; - -// Bit patterns to compare host capabilities and what SIMD capability a routine needs -#define BA_ANY 0x00000001 // any CPU OK -#define BA_MMX 0x00000002 -#define BA_SSE 0x00000004 -#define BA_SSE2 0x00000008 -#define BA_SSE3 0x00000010 -#define BA_3Dnow 0x00000020 -#define BA_3DnowP 0x00000040 -#define BA_MMX_P 0x00000080 -#define BA_SSSE3 0x00000100 -#define BA_SSE41 0x00000200 -#define BA_SSE4a 0x00000400 -#define BA_SSE42 0x00000800 -#define BA_XOP 0x00001001 -#define BA_AVX 0x00002000 -#define BA_FMA 0x00004000 -#define BA_FMA4 0x00008000 -#define BA_ALTVC 0x00100000 -#define BA_NEON 0x00200000 -#define BA_VFP 0x00400000 -#define BA_VFPV3 0x00800000 -#define BA_VFPV3D16 0x01000000 -#define BA_VFPV4 0x02000000 -#define BA_VFPV4D16 0x04000000 - -uint32_t CPUCaps = BA_ANY; - -/*********************************** - *JWS: Temporary hack for AVX support, based on section 2.2 of Intel 319433-010.pdf - * (Intel® Advanced Vector Extensions Programming Reference) - * - * Note this does not check for whether the CPU supports cpuid, etc. so must - * not be used unless such details have already been confirmed. - */ -#if defined(USE_AVX) - -int avxSupported(void) { -#ifdef USE_MANUAL_CALLSTACK - call_stack.enter("avxSupported()"); -#endif - int retval = 1; -#if defined(__GNUC__) && (((__GNUC__ == 4) && (__GNUC_MINOR__ > 3)) || (__GNUC__ > 4)) -#if defined(__i386__) && (defined(__PIC__) || defined(__pic__)) -// EBX can't be clobbered on linux PIC. - __asm__ ( "pushl %ebx \n\t" ); -#endif - __asm__ ( "\n\t" - "movl $1, %%eax \n\t" - "cpuid \n\t" - "andl $0x18000000, %%ecx \n\t" - "cmpl $0x18000000, %%ecx \n\t" - "jne 1f \n\t" - "xorl %%ecx, %%ecx \n\t" - "xgetbv \n\t" - "andl $6, %%eax \n\t" - "cmpl $6, %%eax \n\t" - "jne 1f \n\t" - "movl $1, %0 \n\t" - "jmp 2f \n\t" - "1: \n\t" - "movl $0, %0 \n\t" - "2: \n\t" -# if defined(_WIN64) || defined(__LP64__) || defined (__X86_64__) - : "=g" (retval) :: "%rax", "%rbx", "%rcx", "%rdx" -# elif defined(__i386__) && (defined(__PIC__) || defined(__pic__)) - "popl %%ebx \n\t" - : "=g" (retval) :: "%eax", "%ecx", "%edx" -# else - : "=g" (retval) :: "%eax", "%ebx", "%ecx", "%edx" -# endif - ); -#elif (_MSC_VER >= 1600) && !defined(_WIN64) - _asm { - mov eax, 1 - cpuid - and ecx, 018000000H - cmp ecx, 018000000H - jne NOSUPPORT - xor ecx, ecx - xgetbv - and eax, 06H - cmp eax, 06H - jne NOSUPPORT - mov retval, 1 - jmp DONE - NOSUPPORT: - mov retval, 0 - DONE: - }; -#endif // compiler -#ifdef USE_MANUAL_CALLSTACK - call_stack.exit(); -#endif - return retval; -} -#endif // USE_AVX - -/********************** - */ -void SetCapabilities(void) { -#ifdef USE_MANUAL_CALLSTACK - call_stack.enter("SetCapabilities()"); -#endif - -#if defined(__APPLE_CC__) - // OS X assumes MMX, SSE and SSE2 are present on Intel processors - // SSE3 and Altivec are optional -#if defined(__i386__) || defined(__x86_64__) - CPUCaps |= (BA_MMX|BA_SSE|BA_SSE2); - int sse3flag=0; - size_t length=sizeof(sse3flag); - int error=sysctlbyname("hw.optional.sse3",&sse3flag,&length,NULL,0); - if (sse3flag && !error) CPUCaps |= BA_SSE3; -#else // PowerPC - int altivecflag=0; - size_t length=sizeof(altivecflag); - int error=sysctlbyname("hw.optional.altivec",&altivecflag,&length,NULL,0); - if (altivecflag && !error) CPUCaps |= BA_ALTVC; -#endif - -#elif defined( USE_BHCPUID ) - CPU_INFO theCPU; - - if (theCPU.mmx()) CPUCaps |= BA_MMX; - if (theCPU.sse()) CPUCaps |= BA_SSE; - if (theCPU.sse2()) CPUCaps |= BA_SSE2; - if (theCPU.sse3()) CPUCaps |= BA_SSE3; - if (theCPU._3Dnow()) CPUCaps |= BA_3Dnow; - if (theCPU._3DnowPlus()) CPUCaps |= BA_3DnowP; - if (theCPU.mmxPlus()) CPUCaps |= BA_MMX_P; - -#elif defined( USE_ASMLIB ) - int dp = DetectProcessor(); - if (dp & 0x00800000) CPUCaps |= BA_MMX; - if ((dp & 0x02000800) == 0x02000800) CPUCaps |= BA_SSE; - if ((dp & 0x04000800) == 0x04000800) CPUCaps |= BA_SSE2; - if ((dp & 0x08000800) == 0x08000800) CPUCaps |= BA_SSE3; - if (dp & 0x80000000) CPUCaps |= BA_3Dnow; - if (dp & 0x40000000) CPUCaps |= BA_3DnowP; - if (dp & 0x20000000) CPUCaps |= BA_MMX_P; -#if defined(USE_AVX) - if ((dp & 0x00000800) && avxSupported()) CPUCaps |= BA_AVX; -#endif - -#elif defined(__i386__) || defined (__x86_64__) - /* we're hoping to rely on signal handling to keep us out of trouble */ - CPUCaps |= (BA_MMX | BA_SSE | BA_SSE2 | BA_SSE3 | BA_3Dnow | BA_3DnowP | BA_MMX_P ); -#if defined(USE_AVX) - CPUCaps |= BA_AVX; -#endif - -#elif defined(USE_ALTIVEC) - CPUCaps |= BA_ALTVC; - -#elif defined(__arm__) && defined(__VFP_FP__) && !defined(__SOFTFP__) - -#if defined(ANDROID) || defined(__linux__) - // if strlen(p_features) is 0 or p_features doesn't contain "vfp" assume - // either BOINC is screwed up or we're running stand along. - size_t len=strlen(app_init_data.host_info.p_features); - if ((len==0) || (strstr(app_init_data.host_info.p_features,"vfp") == 0)) { - if (do_print) fprintf(stderr,"Getting CPU Capabilities from /proc/cpuinfo\n"); - app_init_data.host_info.p_features[0]=0; - char buf[256]; - FILE *cpuinfo=boinc_fopen("/proc/cpuinfo","r"); - if (cpuinfo) { - while (fgets(buf,sizeof(buf),cpuinfo)) { - if (strstr(buf, "Features")==buf) { - strlcpy(app_init_data.host_info.p_features,strstr(buf,":")+1,sizeof(app_init_data.host_info.p_features)); - break; - } - } - fclose(cpuinfo); - } else { - fprintf(stderr,"Could not open /proc/cpuinfo\n"); - } - len=strlen(app_init_data.host_info.p_features); - if (len == 0) fprintf(stderr,"Didn't find 'Features' line\n"); - } - - // ensure the features list ends with a space. - if ((app_init_data.host_info.p_features[len-1] != ' ') && - (len(RAND_MAX/2))?-1.0f:1.0f; - indata[i][1]=((rand()&RAND_MAX)>(RAND_MAX/2))?-1.0f:1.0f; - } -#if !defined(USE_ASMLIB) && !defined(__APPLE_CC__) - install_sighandler(); - for (i=0;i= MIN_TRIGARRAY_MEMORY)); - if (default_functions_flag) { - if (do_print) - fprintf(stderr,"%32s (default)\n",ChirpDataFuncs[0].nom); -#ifdef USE_MANUAL_CALLSTACK - call_stack.exit(); -#endif - return ChirpDataFuncs[0].func; - } // else - hires_timer timer; - ChirpData_func chirp_data; - int i,j,rv,k = sizeof(ChirpDataFuncs)/sizeof(CDtb); - double speed=1e+6,timing,accuracy; - int NumDataPoints=1024*1024; - int best; - double best_timing, best_accuracy; - FORCE_FRAME_POINTER; - - if (k == 1) { - if (do_print) fprintf(stderr,"%32s (no other)\n",ChirpDataFuncs[0].nom); -#ifdef USE_MANUAL_CALLSTACK - call_stack.exit(); -#endif - return v_ChirpData; - } - sah_complex *indata=(sah_complex *)malloc_a(NumDataPoints*sizeof(sah_complex),MEM_ALIGN); - sah_complex *outdata=(sah_complex *)malloc_a(NumDataPoints*sizeof(sah_complex),MEM_ALIGN); - sah_complex *test=(sah_complex *)malloc_a(NumDataPoints*sizeof(sah_complex),MEM_ALIGN); - - if (!indata || !outdata || !test) { - if (indata) - free_a(indata); - if (outdata) - free_a(outdata); - if (test) - free_a(test); - fprintf(stderr,"Memory allocation failed in ChooseChirp\n"); -#ifdef USE_MANUAL_CALLSTACK - call_stack.exit(); -#endif - return v_ChirpData; - } - //JWS: Generate indata as the chirp of flat line (constant) data - for (i=0;i(i)*recip_sample_rate; - ang=0.5*chirp_rate*time*time; - ang -= floor(ang); - ang *= M_PI*2; - sincos(ang,&dd,&cc); - // Notionally: - // c=cc; - // d=dd; - // indata[i][0] = save[i][0] * c - save[i][1] * d; - // indata[i][1] = save[i][0] * d + save[i][1] * c; - indata[i][0] = static_cast(cc); - indata[i][1] = static_cast(dd); - } - -#if !defined(USE_ASMLIB) && !defined(__APPLE_CC__) - install_sighandler(); - for (i=0;i0) { - ind*=-1; - } else { - ind=-1*(ind-2); - } - j++; - } - if (rv) continue; - timing/=j; - accuracy=0; - //JWS: indata is positive chirp of constant at TESTCHIRPIND, test was copied - // at -TESTCHIRPIND so we check for deviation from flat - for (j=0;j>= 1); - FFTtbl[iL][0] = ChirpFftPairs[i].FftLen; - FFTtbl[iL][2] += 1; - } - } - - // For testing, the chirp/fft table may just be a short list. If so, add the FFT - // length to the counts to simulate distribution for a full table. (This also affects - // very high angle range WUs, but Pulse folding is a small factor there so looser - // testing will have very little effect.) - if (num_cfft < 14) { - for (iL = 0; iL < 32; iL++) FFTtbl[iL][2] += FFTtbl[iL][0]; - } - - // Scale the counts so the minimum is 1. Use the scaled value directly for fold - // sequences starting with a fold by 4. Adjust the values for fold by 3 and - // fold by 5 to get close to the 10, 9, 8 ratios. Add the PoT length values to - // the table, and tot up the needed number of PoTPlans. - for (iL = 0; iL < 32; iL++) { - if ((FFTtbl[iL][0]) && ((double)FFTtbl[iL][2] < dTmp)) dTmp = (double)FFTtbl[iL][2]; - } - for (iL = 0; iL < 32; iL++) { - if (FFTtbl[iL][0]) { - FFTtbl[iL][2] = (int)((double)FFTtbl[iL][2] / dTmp + 0.5); - FFTtbl[iL][1] = (int)((double)FFTtbl[iL][2] * 10.0 / 9.0 + 0.5); - FFTtbl[iL][3] = FFTtbl[iL][2] + FFTtbl[iL][2] - FFTtbl[iL][1]; - PoTLen = (int)(NumSamples / FFTtbl[iL][0] + 0.5); - GetPulsePoTLen(PoTLen, &PulsePoTLen, &Overlap); - FFTtbl[iL][4] = PulsePoTLen; - for (i = 32, ndivs = 1; i <= PulsePoTLen; ndivs++, i *= 2); - NumPlans += 3 * FFTtbl[iL][2] * ndivs; - } - } - if (NumPlans == 0) { - // No pulse finding in this WU, abort test. -#ifdef USE_MANUAL_CALLSTACK - call_stack.exit(); -#endif - return 0; - } - - // At some angle ranges the total number of test folds may be too few to get good - // timing measurements. Scale up so there are at least 16K test folds. - while (NumPlans < 16384) { - for (iL = 0; iL < 32; iL++) { - FFTtbl[iL][1] *= 2; - FFTtbl[iL][2] *= 2; - FFTtbl[iL][3] *= 2; - } - NumPlans *= 2; - } - -// fprintf(stderr, "Calculated Preplans = %d\n", NumPlans); -// NumPlans *= 2; // temporary safety measure - - - // Now we know what we're doing - allocate memory in which to do it. - PoTPlan *PlanBuf = (PoTPlan *)malloc_a((NumPlans + 1) * sizeof(PoTPlan), MEM_ALIGN); - float *indata = (float *)malloc_a(MaxPulsePoT * sizeof(float), MEM_ALIGN); - float *outdata = (float *)malloc_a(MaxPulsePoT * sizeof(float), MEM_ALIGN); - float *maxdata = (float *)malloc_a(NumPlans * sizeof(float), MEM_ALIGN); - float *save = (float *)malloc_a(NumPlans * sizeof(float), MEM_ALIGN); - - FORCE_FRAME_POINTER; - if (!PlanBuf || !indata || !outdata || !maxdata || !save) { - if (PlanBuf) - free_a(PlanBuf); - if (indata) - free_a(indata); - if (outdata) - free_a(outdata); - if (maxdata) - free_a(maxdata); - if (save) - free_a(save); -#ifdef USE_MANUAL_CALLSTACK - call_stack.exit(); -#endif - return 0; // Can't test, make no change - } - - SrcSel[0] = indata; - SrcSel[1] = outdata; - - // Generate PowerSpectrum random data - srand(11); - for (i = 0; i < MaxPulsePoT; i++) { - float fr1 = (float)(rand()) / RAND_MAX; - float fr2 = (float)(rand()) / RAND_MAX; - indata[i] = fr1 * fr1 + fr2 * fr2; - } - -#if !defined(USE_ASMLIB) && !defined(__APPLE_CC__) - install_sighandler(); - for (i=0;(i*sizeof(FolSub))name); - continue; - } - j = 0; - timing = 0; - CopyFoldSet(&TestFoldSet, FoldSubs[i].fsp); - int n = planFoldTest(PlanBuf, outdata, FFTtbl); -// if (!i) fprintf(stderr, "Actual Preplans = %d\n", n); - while ((j < 100) && ((j < 10) || ((j * timing) < (3 * timer.resolution())))) { - memset(outdata, 0, MaxPulsePoT * sizeof(float)); - memset(maxdata, 0, NumPlans * sizeof(float)); - maxdata[0] = -1.234f; - timer.start(); - for ( k = 0; PlanBuf[k].di; k++) { - maxdata[k] = PlanBuf[k].fun_ptr(SrcSel, &PlanBuf[k]); - } - onetime = timer.stop(); - if (j) timing = std::min(onetime, timing); - else timing = onetime; -#if !defined(USE_ASMLIB) && !defined(__APPLE_CC__) - if (maxdata[0] < 0) siglongjmp(jb,1); -#else - if (maxdata[0] < 0) break; -#endif - j++; - } - accuracy = 0; - errmax = 0; - if (i == 0) { - memcpy(save, maxdata, NumPlans * sizeof(float)); - } else { - for (j = 0; j < NumPlans; j++) { - // a zero max should never happen, but be safe... - if (save[j]) { - double relerr = fabs((save[j] - maxdata[j]) / save[j]); - accuracy += relerr; - if (relerr > errmax) errmax = relerr; - } - } - } - accuracy /= NumPlans; - if (verbose) { - fprintf(stderr, "%24s folding %8.6f %7.5f test\n", FoldSubs[i].fsp->name, timing, accuracy); - fflush(stderr); - } - if ((timing < speed) && isnotnan(accuracy) && (accuracy < 1e-6) && (errmax < 1e-4)) { - speed = timing; - best = i; - best_timing = timing; - best_accuracy = accuracy; - } -#if !defined(USE_ASMLIB) && !defined(__APPLE_CC__) - } else { - // reinstall_sighandler(); - if (verbose) { - fprintf(stderr, "%24s folding faulted\n", FoldSubs[i].fsp->name); - fflush(stderr); - } - } - } - uninstall_sighandler(); -#else - } -#endif - free_a(PlanBuf); - free_a(indata); - free_a(outdata); - free_a(maxdata); - free_a(save); - if (do_print) - fprintf(stderr, "%24s folding %8.6f %7.5f %s\n", - FoldSubs[best].fsp->name, - best_timing, - best_accuracy, - verbose ? " choice\n": ""); - CopyFoldSet(&Foldmain, FoldSubs[best].fsp); -#ifdef USE_MANUAL_CALLSTACK - call_stack.exit(); -#endif - return 0; -} - - - -void ChooseFunctions(BaseLineSmooth_func *baseline_smooth, - GetPowerSpectrum_func *get_power_spectrum, - ChirpData_func *chirp_data, - Transpose_func *transpose, - ChirpFftPair_t * ChirpFftPairs, - int num_cfft, - int nsamples, - bool print_choices) { -#ifdef USE_MANUAL_CALLSTACK - call_stack.enter("ChooseFunctions()"); -#endif - do_print=print_choices; - if (verbose) do_print = true; - if (TestBoincSignalHandling()) { - SetCapabilities(); - hires_timer durtimer; - double TestDur=0; - if (do_print) { - fprintf(stderr,"Optimal function choices:\n"); - fprintf(stderr,"--------------------------------------------------------\n"); - fprintf(stderr,"%32s %8s %7s\n","name","timing","error"); - fprintf(stderr,"--------------------------------------------------------\n"); - fflush(stderr); - } - durtimer.start(); -#ifdef ANDROID - // low memory android is a special case because android will kill - // without warning to free memory. Testing the functions could trigger - // a loop. So we'll choose what fits the processor and return - if (app_init_data.host_info.m_nbytes && - (app_init_data.host_info.m_nbytes < MIN_TRIGARRAY_MEMORY)) { - *baseline_smooth=v_BaseLineSmooth; - found_baseline_smooth=true; - if (do_print) fprintf(stderr,"%32s (no other)\n", - "v_BaseLineSmooth"); -#if defined(__VFP_FP__) && !defined(__SOFTFP__) -#ifdef USE_NEON - if (CPUCaps & BA_NEON) { - *get_power_spectrum=vfp_GetPowerSpectrum; - found_get_power_spectrum=true; - if (do_print) fprintf(stderr,"%32s (CPU Caps)\n", - "vfp_GetPowerSpectrum"); - *chirp_data=neon_ChirpData; - found_chirp_data=true; - if (do_print) fprintf(stderr,"%32s (CPU Caps)\n", - "neon_ChirpData"); - } else -#endif - if (CPUCaps & BA_VFP) { - *get_power_spectrum=vfp_GetPowerSpectrum; - found_get_power_spectrum=true; - if (do_print) fprintf(stderr,"%32s (CPU Caps)\n", - "vfp_GetPowerSpectrum"); - *chirp_data=vfp_ChirpData; - found_chirp_data=true; - if (do_print) fprintf(stderr,"%32s (CPU Caps)\n", - "vfp_ChirpData"); - } else { -#endif - *get_power_spectrum=v_GetPowerSpectrum; - found_get_power_spectrum=true; - if (do_print) fprintf(stderr,"%32s (default)\n", - "v_GetPowerSpectrum"); - *chirp_data=v_ChirpData; - found_chirp_data=true; - if (do_print) fprintf(stderr,"%32s (default)\n", - "v_ChirpData"); -#if defined(__VFP_FP__) && !defined(__SOFTFP__) - } -#endif - *transpose=v_Transpose4; - found_transpose=true; - if (do_print) fprintf(stderr,"%32s (default)\n", - "v_Transpose4"); -#if defined(__VFP_FP__) && !defined(__SOFTFP__) -#ifdef USE_NEON - if (CPUCaps & BA_NEON) { - CopyFoldSet(&Foldmain, &neonFoldMain); - found_folding=true; - if (do_print) fprintf(stderr,"%32s (CPU Caps)\n", - "opt NEON folding"); - } else -#endif - if (CPUCaps & BA_VFP) { - CopyFoldSet(&Foldmain, &vfpFoldMain); - found_folding=true; - if (do_print) fprintf(stderr,"%32s (CPU Caps)\n", - "oft VFP folding"); - } else { -#endif - CopyFoldSet(&Foldmain, &swifold); - found_folding=true; - if (do_print) fprintf(stderr,"%32s (default)\n", - "swifold"); -#if defined(__VFP_FP__) && !defined(__SOFTFP__) - } -#endif - TestDur+=durtimer.stop(); - if (verbose) - fprintf(stderr,"%32s %8.2f seconds\n\n","Test duration",TestDur); - if (do_print) { - fflush(stderr); - } - call_stack.exit(); - return; - } -#endif - if (!found_baseline_smooth) { - *baseline_smooth=ChooseBaseLineSmooth(); - } else { - if (do_print) fprintf(stderr,"BaseLineSmooth retrieved from state file\n"); - *baseline_smooth=*BaseLineSmooth; - } - fflush(stderr); - if (!found_get_power_spectrum) { - *get_power_spectrum=ChooseGetPowerSpectrum(); - } else { - if (do_print) fprintf(stderr,"GetPowerSpectrum retrieved from state file\n"); - *get_power_spectrum=*GetPowerSpectrum; - } - fflush(stderr); - if (!found_chirp_data) { - *chirp_data=ChooseChirpData(); - } else { - if (do_print) fprintf(stderr,"ChirpData retrieved from state file\n"); - *chirp_data=*ChirpData; - } - fflush(stderr); - if (!found_transpose) { - *transpose=ChooseTranspose(); - } else { - if (do_print) fprintf(stderr,"Transpose retrieved from state file\n"); - *transpose=*Transpose; - } - fflush(stderr); - // ChooseFoldSubs is inconsistent in that it directly sets a global - // variable. Maybe we should make it consistent with the others - // at some point. --EK - if (!found_folding) { - ChooseFoldSubs(ChirpFftPairs, num_cfft, nsamples); - } else { - if (do_print) fprintf(stderr,"Folding Subs retrieved from state file\n"); - } - fflush(stderr); - TestDur+=durtimer.stop(); - if (verbose) - fprintf(stderr,"%32s %8.2f seconds\n\n","Test duration",TestDur); - } - - if (do_print) { - fflush(stderr); - } -#ifdef USE_MANUAL_CALLSTACK - call_stack.exit(); -#endif -} - - diff -Nru boinc-app-seti-7.28~svn2781/.pc/applied-patches boinc-app-seti-7.28~svn2858/.pc/applied-patches --- boinc-app-seti-7.28~svn2781/.pc/applied-patches 2015-02-23 15:18:24.000000000 +0000 +++ boinc-app-seti-7.28~svn2858/.pc/applied-patches 2015-04-08 17:04:26.000000000 +0000 @@ -2,11 +2,9 @@ 211_give_stderr_some_output.patch 212_increase_buffers.patch 213_const_warning_reduction.patch -214_fix_armhf.patch 001_disable_avx_in_configure.patch 003_dont_use_own_jpeglib_and_glut.patch 004_disable_altivec_on_ppc_linux.patch 006_omitting_archs.patch 007_worker_comments.patch -215-remove-vfp-Chirp.patch 216-fix-build.patch diff -Nru boinc-app-seti-7.28~svn2781/.pc/.quilt_patches boinc-app-seti-7.28~svn2858/.pc/.quilt_patches --- boinc-app-seti-7.28~svn2781/.pc/.quilt_patches 2015-02-23 15:18:24.000000000 +0000 +++ boinc-app-seti-7.28~svn2858/.pc/.quilt_patches 2015-04-08 17:04:25.000000000 +0000 @@ -1 +1 @@ -/home/buildd/build-RECIPEBRANCHBUILD-875423/chroot-autobuild/home/buildd/work/tree/boinc-app-seti-7.28~svn2781/debian/patches +/home/buildd/build-RECIPEBRANCHBUILD-901594/chroot-autobuild/home/buildd/work/tree/boinc-app-seti-7.28~svn2858/debian/patches diff -Nru boinc-app-seti-7.28~svn2781/.pc/.quilt_series boinc-app-seti-7.28~svn2858/.pc/.quilt_series --- boinc-app-seti-7.28~svn2781/.pc/.quilt_series 2015-02-23 15:18:24.000000000 +0000 +++ boinc-app-seti-7.28~svn2858/.pc/.quilt_series 2015-04-08 17:04:25.000000000 +0000 @@ -1 +1 @@ -/home/buildd/build-RECIPEBRANCHBUILD-875423/chroot-autobuild/home/buildd/work/tree/boinc-app-seti-7.28~svn2781/debian/patches/series +/home/buildd/build-RECIPEBRANCHBUILD-901594/chroot-autobuild/home/buildd/work/tree/boinc-app-seti-7.28~svn2858/debian/patches/series