ViennaCL - The Vienna Computing Library  1.5.1
viennacl/linalg/host_based/sse_blas.hpp
#ifndef VIENNACL_LINALG_HOST_BASED_SSE_BLAS_HPP_
#define VIENNACL_LINALG_HOST_BASED_SSE_BLAS_HPP_

/* =========================================================================
   Copyright (c) 2010-2014, Institute for Microelectronics,
                            Institute for Analysis and Scientific Computing,
                            TU Wien.
   Portions of this software are copyright by UChicago Argonne, LLC.

                            -----------------
                  ViennaCL - The Vienna Computing Library
                            -----------------

   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at

   (A list of authors and contributors can be found in the PDF manual)

   License:         MIT (X11), see file LICENSE in the base directory
============================================================================= */
//complex BLAS functions are included but unused in this version of ViennaCL
#if defined VIENNACL_WITH_COMPLEX
#include <complex>
#endif

//defining VIENNACL_WITH_SSE3 adds a slight optimization for complex multiplication using SSE3
#if defined VIENNACL_WITH_SSE3
#include <pmmintrin.h>
#elif defined VIENNACL_WITH_SSE2
#include <emmintrin.h>
#endif

#include "viennacl/forwards.h" //for vcl_size_t

#include <cstddef>
#include <cmath>
namespace viennacl
{
  namespace linalg
  {

    namespace host_based
    {
      //saxpy, daxpy, caxpy, zaxpy
      template <class T> inline void _axpy(const T*, T*, vcl_size_t, T);

      //sdot, ddot, cdotu, zdotu
      template <class T> inline T    _dot (vcl_size_t, const T*, const T*);

      //sdot, ddot, cdotc, zdotc
      template <class T> inline T    _dotc(vcl_size_t, const T*, const T*);

      //sswap, dswap, cswap, zswap
      template <class T> inline void _swap(vcl_size_t, T*, T*);
      //scopy, dcopy, ccopy, zcopy (copies the second pointer's array into the first)
      template <class T> inline void _copy(vcl_size_t, T*, const T*);

      //snrm2, dnrm2, Euclidean norm of complex vectors
      template <class T> inline T    _nrm2(const T*, vcl_size_t);

      namespace detail
      {
        template <class T> inline T conjIfComplex(T x){return x;}
      }
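
      //Illustrative sketch (not part of the original file) of how these
      //kernels compose: compute the residual y - a*x in place and return its
      //Euclidean norm. The function name is hypothetical, for exposition only.
      template <class T>
      inline T example_residual_norm(const T* x, T* y, vcl_size_t n, T a)
      {
        _axpy(x, y, n, -a); //y <- y - a*x
        return _nrm2(y, n); //||y - a*x||_2
      }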

      template <class T>
      inline void _axpy(const T* x, T* y, vcl_size_t n, T a)
      {
        for(vcl_size_t i=0;i<n;i++)
          y[i]+=a*x[i];
      }

      template <class T>
      inline T _dot(vcl_size_t n, const T* x, const T* y)
      {
        T sum(0);
        for(vcl_size_t i=0;i<n;i++)
          sum+=x[i]*y[i];
        return sum;
      }

      template <class T>
      inline T _dotc(vcl_size_t n, const T* x, const T* y)
      {
        T sum(0);
        for(vcl_size_t i=0;i<n;i++)
          sum+=detail::conjIfComplex(x[i])*y[i];
        return sum;
      }

      template <class T>
      inline void _swap(vcl_size_t n, T* sx, T* sy)
      {
        T t;
        for(vcl_size_t i=0;i<n;i++)
        {
          t=sx[i];
          sx[i]=sy[i];
          sy[i]=t;
        }
      }

      template <class T>
      inline void _copy(vcl_size_t n, T* cx, const T* cy)
      {
        //note: copies cy into cx
        for(vcl_size_t i=0;i<n;i++)
          cx[i]=cy[i];
      }

      template <class T>
      inline T _nrm2(const T* x, vcl_size_t n)
      {
        //based on http://www.netlib.org/blas/snrm2.f, but works with std::complex

        if(n<1)
          return T(0);
        if(n==1)
          return std::abs(x[0]);
        T scale(0);
        T scaledSquareSum(1);
        for(vcl_size_t i=0;i<n;i++){
          if(x[i]!=T(0)){
            T absXi=std::abs(x[i]);
            if(std::abs(x[i])>std::abs(scale)){
              T temp=scale/absXi;
              scaledSquareSum=T(1)+scaledSquareSum*temp*temp;
              scale=absXi;
            }
            else{
              T temp=absXi/scale;
              scaledSquareSum+=temp*temp;
            }
          }
        }
        return scale*std::sqrt(scaledSquareSum);
      }
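
      //Note on the scaling above: after element i the loop maintains
      //  |x[0]|^2 + ... + |x[i]|^2 == scale^2 * scaledSquareSum
      //with scale == max_j |x[j]|, so scale*sqrt(scaledSquareSum) equals the
      //2-norm without ever squaring a large |x[j]| directly. For example, for
      //x=(3e200,4e200) in double precision, squaring 3e200 would overflow, yet
      //the loop yields scale=4e200 and scaledSquareSum=1.5625, returning
      //4e200*1.25=5e200.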

  #if defined VIENNACL_WITH_COMPLEX

      namespace detail
      {
        template <> inline std::complex<double> conjIfComplex(std::complex<double> x){return std::conj(x);}
        template <> inline std::complex<float > conjIfComplex(std::complex<float > x){return std::conj(x);}
      }

      template <>
      inline std::complex<double> _nrm2(const std::complex<double>* x, vcl_size_t n)
      {
        //based on http://www.netlib.org/blas/snrm2.f, adapted to complex data (cf. dznrm2)

        if(n<1)
          return std::complex<double>(0);
        if(n==1)
          return std::abs(x[0]);
        double scale=0.0;
        double scaledSquareSum=1.0;
        for(vcl_size_t i=0;i<n;i++){
          if(x[i].real()!=0.0){
            double absXi=std::abs(x[i].real());
            if(absXi>scale){
              double temp=scale/absXi;
              scaledSquareSum=1.0+scaledSquareSum*temp*temp;
              scale=absXi;
            }
            else{
              double temp=absXi/scale;
              scaledSquareSum+=temp*temp;
            }
          }
          if(x[i].imag()!=0.0){
            double absXi=std::abs(x[i].imag());
            if(absXi>scale){
              double temp=scale/absXi;
              scaledSquareSum=1.0+scaledSquareSum*temp*temp;
              scale=absXi;
            }
            else{
              double temp=absXi/scale;
              scaledSquareSum+=temp*temp;
            }
          }
        }
        return std::complex<double>(scale*std::sqrt(scaledSquareSum));
      }

      template <>
      inline std::complex<float> _nrm2(const std::complex<float>* x, vcl_size_t n)
      {
        //based on http://www.netlib.org/blas/snrm2.f, adapted to complex data (cf. scnrm2)

        if(n<1)
          return std::complex<float>(0);
        if(n==1)
          return std::abs(x[0]);
        float scale=0.0f;
        float scaledSquareSum=1.0f;
        for(vcl_size_t i=0;i<n;i++){
          if(x[i].real()!=0.0f){
            float absXi=std::abs(x[i].real());
            if(absXi>scale){
              float temp=scale/absXi;
              scaledSquareSum=1.0f+scaledSquareSum*temp*temp;
              scale=absXi;
            }
            else{
              float temp=absXi/scale;
              scaledSquareSum+=temp*temp;
            }
          }
          if(x[i].imag()!=0.0f){
            float absXi=std::abs(x[i].imag());
            if(absXi>scale){
              float temp=scale/absXi;
              scaledSquareSum=1.0f+scaledSquareSum*temp*temp;
              scale=absXi;
            }
            else{
              float temp=absXi/scale;
              scaledSquareSum+=temp*temp;
            }
          }
        }
        return std::complex<float>(scale*std::sqrt(scaledSquareSum));
      }

  #endif //defined VIENNACL_WITH_COMPLEX

  #if defined VIENNACL_WITH_SSE2

      //saxpy
      template <>
      inline void _axpy<float>(const float* x, float* y, vcl_size_t n, float a)
      {
        //use plain scalar code if the arrays are short, if x and y have
        //different offsets from a 16-byte boundary (they can then never be
        //16-byte aligned at the same time), or if x is not float-aligned
        if(n<16||((vcl_size_t)x)%16!=((vcl_size_t)y)%16||((vcl_size_t)x)%sizeof(float)!=0)
          for(vcl_size_t i=0;i<n;i++)
            y[i]+=a*x[i];
        else
        {
          //process the unaligned leading section of the arrays
          while(((vcl_size_t)x)%16)
          {
            if(n<=0)
              return;
            y[0]+=a*x[0];
            x++;
            y++;
            n--;
          }

          __m128 sum0;
          __m128 sum1;
          __m128 reg0,reg1,reg2,reg3;
          __m128 areg=_mm_set1_ps(a);
          __m128 prod;

          //process floats 8 at a time
          while(n>=8){

            //read floats into XMM registers (8 from each array)
            reg0=_mm_load_ps(x+0);
            reg1=_mm_load_ps(x+4);
            reg2=_mm_load_ps(y+0);
            reg3=_mm_load_ps(y+4);

            //multiply x by a and add y
            prod=_mm_mul_ps(reg0,areg);
            sum0=_mm_add_ps(prod,reg2);
            prod=_mm_mul_ps(reg1,areg);
            sum1=_mm_add_ps(prod,reg3);

            //store the sums back into y
            _mm_store_ps(y+0,sum0);
            _mm_store_ps(y+4,sum1);

            x+=8;
            y+=8;
            n-=8;
          }

          //handle the tail beyond the last multiple of 8
          for(vcl_size_t i=0;i<n;i++)
            y[i]+=a*x[i];
        }
      }
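
      //The float kernels below share this structure: a scalar prologue
      //advances both arrays together until x reaches a 16-byte boundary
      //(possible because the fallback above rejected pointers whose offsets
      //modulo 16 differ), an aligned vector loop does the bulk of the work,
      //and a scalar epilogue finishes the tail. The std::complex<double>
      //kernels instead require both pointers to be 16-byte aligned up front.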

      //daxpy
      template <>
      inline void _axpy<double>(const double* x, double* y, vcl_size_t n, double a)
      {
        //use plain scalar code if the arrays are short, if x and y have
        //different offsets from a 16-byte boundary, or if x is not double-aligned
        if(n<16||((vcl_size_t)x)%16!=((vcl_size_t)y)%16||((vcl_size_t)x)%sizeof(double)!=0)
          for(vcl_size_t i=0;i<n;i++)
            y[i]+=a*x[i];

        else
        {
          //process the unaligned leading section of the arrays
          while(((vcl_size_t)x)%16)
          {
            if(n<=0)
              return;
            y[0]+=a*x[0];
            x++;
            y++;
            n--;
          }

          __m128d sum0;
          __m128d sum1;
          __m128d reg0,reg1,reg2,reg3;
          __m128d areg=_mm_set1_pd(a);
          __m128d prod;

          //process doubles 4 at a time
          while(n>=4){

            //read doubles into XMM registers (4 from each array)
            reg0=_mm_load_pd(x+0);
            reg1=_mm_load_pd(x+2);
            reg2=_mm_load_pd(y+0);
            reg3=_mm_load_pd(y+2);

            //multiply x by a and add y
            prod=_mm_mul_pd(reg0,areg);
            sum0=_mm_add_pd(prod,reg2);
            prod=_mm_mul_pd(reg1,areg);
            sum1=_mm_add_pd(prod,reg3);

            //store the sums back into y
            _mm_store_pd(y+0,sum0);
            _mm_store_pd(y+2,sum1);

            x+=4;
            y+=4;
            n-=4;
          }

          //handle the tail beyond the last multiple of 4
          for(vcl_size_t i=0;i<n;i++)
            y[i]+=a*x[i];
        }
      }

      //sdot
      template <>
      inline float _dot<float>(vcl_size_t n, const float* x, const float* y)
      {
        //use plain scalar code if the arrays are short, if x and y have
        //different offsets from a 16-byte boundary, or if x is not float-aligned
        if(n<16||((vcl_size_t)x)%16!=((vcl_size_t)y)%16||((vcl_size_t)x)%sizeof(float)!=0)
        {
          float sum=0;
          for(vcl_size_t i=0;i<n;i++)
            sum+=x[i]*y[i];
          return sum;
        }
        else
        {
          //process the unaligned leading section of the arrays
          float sum=0;
          while(((vcl_size_t)x)%16)
          {
            if(n<=0)
              return sum;
            sum+=x[0]*y[0];
            y++;
            x++;
            n--;
          }

          __m128 sumReg=_mm_setzero_ps();
          __m128 reg0,reg1,reg2,reg3;

          //process floats 8 at a time
          while(n>=8)
          {
            //read floats into XMM registers (8 from each array)
            reg0=_mm_load_ps(x+0);
            reg1=_mm_load_ps(x+4);
            reg2=_mm_load_ps(y+0);
            reg3=_mm_load_ps(y+4);

            //multiply the floats together
            reg0=_mm_mul_ps(reg0,reg2);
            reg1=_mm_mul_ps(reg1,reg3);

            //add to the running sums
            sumReg=_mm_add_ps(sumReg,reg0);
            sumReg=_mm_add_ps(sumReg,reg1);

            x+=8;
            y+=8;
            n-=8;
          }

          //add beyond where the vector loop stopped
          for(vcl_size_t i=0;i<n;i++)
            sum+=x[i]*y[i];

          //move the sums from the XMM register to aligned memory on the stack
          float sums[8];
          float* pSums=(float*)((((vcl_size_t)sums)&(~15))+16);
          _mm_store_ps(pSums,sumReg);

          return sum+pSums[0]+pSums[1]+pSums[2]+pSums[3];
        }
      }
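
      //Note on the reduction above: _mm_store_ps requires a 16-byte aligned
      //address, but a local array carries no such guarantee here, so the code
      //over-allocates (8 floats = 32 bytes) and rounds up: (addr&~15)+16 is
      //the next 16-byte boundary strictly above addr, leaving at least 16
      //valid bytes inside the buffer. The dot products below reuse this idiom.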

      //ddot
      template <>
      inline double _dot(vcl_size_t n, const double* x, const double* y)
      {
        //use plain scalar code if the arrays are short, if x and y have
        //different offsets from a 16-byte boundary, or if x is not double-aligned
        if(n<16||((vcl_size_t)x)%16!=((vcl_size_t)y)%16||((vcl_size_t)x)%sizeof(double)!=0)
        {
          double sum=0;
          for(vcl_size_t i=0;i<n;i++)
            sum+=x[i]*y[i];
          return sum;
        }
        else
        {
          //process the unaligned leading section of the arrays
          double sum=0;
          while(((vcl_size_t)x)%16)
          {
            if(n<=0)
              return sum;
            sum+=x[0]*y[0];
            y++;
            x++;
            n--;
          }

          __m128d sum0=_mm_setzero_pd();
          __m128d sum1=_mm_setzero_pd();
          __m128d reg0,reg1,reg2,reg3;

          //process doubles 4 at a time
          while(n>=4)
          {
            //read doubles into XMM registers (4 from each array)
            reg0=_mm_load_pd(x+0);
            reg1=_mm_load_pd(x+2);
            reg2=_mm_load_pd(y+0);
            reg3=_mm_load_pd(y+2);

            //multiply the doubles together
            reg0=_mm_mul_pd(reg0,reg2);
            reg1=_mm_mul_pd(reg1,reg3);

            //add to the running sums
            sum0=_mm_add_pd(sum0,reg0);
            sum1=_mm_add_pd(sum1,reg1);

            x+=4;
            y+=4;
            n-=4;
          }

          //add beyond where the vector loop stopped
          for(vcl_size_t i=0;i<n;i++)
            sum+=x[i]*y[i];

          //move the sums from the XMM registers to aligned memory on the stack
          double sums[4];
          double* pSums=(double*)((((vcl_size_t)sums)&(~15))+16);
          sum0=_mm_add_pd(sum0,sum1);
          _mm_store_pd(pSums,sum0);

          return sum+pSums[0]+pSums[1];
        }
      }

      //conjugated dot products coincide with plain dot products for real data
      template <> inline float  _dotc<float >(vcl_size_t n, const float  *x, const float  *y){return _dot(n,x,y);}
      template <> inline double _dotc<double>(vcl_size_t n, const double *x, const double *y){return _dot(n,x,y);}

  #if defined VIENNACL_WITH_COMPLEX

      //caxpy
      template <>
      inline void _axpy<std::complex<float> >(const std::complex<float>* x, std::complex<float>* y, vcl_size_t n, std::complex<float> a)
      {
        //use plain scalar code if the arrays are short, if x and y have different
        //offsets from a 16-byte boundary, or if x is not element-aligned
        if(n<16||((vcl_size_t)x)%16!=((vcl_size_t)y)%16||((vcl_size_t)x)%sizeof(std::complex<float>)!=0)
          for(vcl_size_t i=0;i<n;i++)
            y[i]+=a*x[i];

        else
        {
          //process the unaligned leading section of the arrays
          while(((vcl_size_t)x)%16)
          {
            if(n<=0)
              return;
            y[0]+=a*x[0];
            x++;
            y++;
            n--;
          }

          __m128 reg0,reg1,reg2,reg3,reg4;
          __m128 areg0=_mm_set_ps(a.imag(),a.real(),a.imag(),a.real());
          __m128 areg1=_mm_set_ps(a.real(),a.imag(),a.real(),a.imag());
  #ifndef VIENNACL_WITH_SSE3
          __m128 nreg=_mm_set_ps(1.0f,-1.0f,1.0f,-1.0f);
  #endif
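
          //Layout note (lowest lane first): areg0=(ar,ai,ar,ai) and
          //areg1=(ai,ar,ai,ar) for a=ar+ai*i. Each product
          //  a*x = (xr*ar - xi*ai) + (xr*ai + xi*ar)*i
          //is formed by duplicating xr across a lane pair and multiplying by
          //areg0, duplicating xi and multiplying by areg1, then combining the
          //two with a sign flip on the even (real) lanes: via nreg without
          //SSE3, or in one step with _mm_addsub_ps on SSE3.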

          //process complex floats 4 at a time
          while(n>=4)
          {
            //read floats into XMM registers (8 from each array)
            reg0=_mm_load_ps((float*)(x+0));
            reg1=_mm_load_ps((float*)(x+2));
            reg2=_mm_load_ps((float*)(y+0));
            reg3=_mm_load_ps((float*)(y+2));

            //do the complex multiplication and addition
  #ifndef VIENNACL_WITH_SSE3
            reg4=_mm_shuffle_ps(reg0,reg0,0xA0);
            reg0=_mm_shuffle_ps(reg0,reg0,0xF5);
            reg4=_mm_mul_ps(reg4,areg0);
            reg0=_mm_mul_ps(reg0,areg1);
            reg0=_mm_mul_ps(reg0,nreg);
            reg0=_mm_add_ps(reg4,reg0);
            reg0=_mm_add_ps(reg0,reg2);
            reg4=_mm_shuffle_ps(reg1,reg1,0xA0);
            reg1=_mm_shuffle_ps(reg1,reg1,0xF5);
            reg4=_mm_mul_ps(reg4,areg0);
            reg1=_mm_mul_ps(reg1,areg1);
            reg1=_mm_mul_ps(reg1,nreg);
            reg1=_mm_add_ps(reg4,reg1);
            reg1=_mm_add_ps(reg1,reg3);
  #else
            reg4=_mm_moveldup_ps(reg0);
            reg0=_mm_movehdup_ps(reg0);
            reg4=_mm_mul_ps(reg4,areg0);
            reg0=_mm_mul_ps(reg0,areg1);
            reg0=_mm_addsub_ps(reg4,reg0);
            reg0=_mm_add_ps(reg0,reg2);
            reg4=_mm_moveldup_ps(reg1);
            reg1=_mm_movehdup_ps(reg1);
            reg4=_mm_mul_ps(reg4,areg0);
            reg1=_mm_mul_ps(reg1,areg1);
            reg1=_mm_addsub_ps(reg4,reg1);
            reg1=_mm_add_ps(reg1,reg3);
  #endif
            //store the results back into y
            _mm_store_ps((float*)(y+0),reg0);
            _mm_store_ps((float*)(y+2),reg1);

            x+=4;
            y+=4;
            n-=4;
          }

          //handle the tail beyond the last multiple of 4
          for(vcl_size_t i=0;i<n;i++)
            y[i]+=a*x[i];
        }
      }

      //zaxpy
      template <>
      inline void _axpy<std::complex<double> >(const std::complex<double>* x, std::complex<double>* y, vcl_size_t n, std::complex<double> a)
      {
        //use plain scalar code if the arrays are short or if either is not 16-byte aligned
        if(n<16||((vcl_size_t)x)%16||((vcl_size_t)y)%16)
          for(vcl_size_t i=0;i<n;i++)
            y[i]+=a*x[i];

        else
        {
          __m128d reg0,reg1,reg2,reg3,reg4;
          __m128d areg0=_mm_set_pd(a.imag(),a.real());
          __m128d areg1=_mm_set_pd(a.real(),a.imag());
  #ifndef VIENNACL_WITH_SSE3
          __m128d nreg=_mm_set_pd(1.0,-1.0);
  #endif
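
          //As in caxpy: areg0=(ar,ai) and areg1=(ai,ar), lowest lane first.
          //Without SSE3 the sign flip on the real lane is done by multiplying
          //with nreg=(-1,+1); with SSE3, _mm_addsub_pd fuses the subtract and
          //add, which is the only difference between the two branches below.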

          //process complex doubles 2 at a time
          while(n>=2)
          {
            //read doubles into XMM registers (4 from each array)
            reg0=_mm_load_pd((double*)(x+0));
            reg1=_mm_load_pd((double*)(x+1));
            reg2=_mm_load_pd((double*)(y+0));
            reg3=_mm_load_pd((double*)(y+1));

            //do the complex multiplication and addition
  #ifndef VIENNACL_WITH_SSE3
            reg4=_mm_shuffle_pd(reg0,reg0,0x0);
            reg0=_mm_shuffle_pd(reg0,reg0,0x3);
            reg4=_mm_mul_pd(reg4,areg0);
            reg0=_mm_mul_pd(reg0,areg1);
            reg0=_mm_mul_pd(reg0,nreg);
            reg0=_mm_add_pd(reg4,reg0);
            reg0=_mm_add_pd(reg0,reg2);
            reg4=_mm_shuffle_pd(reg1,reg1,0x0);
            reg1=_mm_shuffle_pd(reg1,reg1,0x3);
            reg4=_mm_mul_pd(reg4,areg0);
            reg1=_mm_mul_pd(reg1,areg1);
            reg1=_mm_mul_pd(reg1,nreg);
            reg1=_mm_add_pd(reg4,reg1);
            reg1=_mm_add_pd(reg1,reg3);
  #else
            reg4=_mm_shuffle_pd(reg0,reg0,0x0);
            reg0=_mm_shuffle_pd(reg0,reg0,0x3);
            reg4=_mm_mul_pd(reg4,areg0);
            reg0=_mm_mul_pd(reg0,areg1);
            reg0=_mm_addsub_pd(reg4,reg0);
            reg0=_mm_add_pd(reg0,reg2);
            reg4=_mm_shuffle_pd(reg1,reg1,0x0);
            reg1=_mm_shuffle_pd(reg1,reg1,0x3);
            reg4=_mm_mul_pd(reg4,areg0);
            reg1=_mm_mul_pd(reg1,areg1);
            reg1=_mm_addsub_pd(reg4,reg1);
            reg1=_mm_add_pd(reg1,reg3);
  #endif
            //store the results back into y
            _mm_store_pd((double*)(y+0),reg0);
            _mm_store_pd((double*)(y+1),reg1);

            x+=2;
            y+=2;
            n-=2;
          }

          //handle the remaining element, if any
          if(n)
            y[0]+=a*x[0];
        }
      }

      //cdotu
      template <>
      inline std::complex<float> _dot<std::complex<float> >(vcl_size_t n, const std::complex<float>* x, const std::complex<float>* y)
      {
        //use plain scalar code if the arrays are short, if x and y have different
        //offsets from a 16-byte boundary, or if x is not element-aligned
        if(n<16||((vcl_size_t)x)%16!=((vcl_size_t)y)%16||((vcl_size_t)x)%sizeof(std::complex<float>)!=0)
        {
          std::complex<float> sum(0);
          for(vcl_size_t i=0;i<n;i++)
            sum+=x[i]*y[i];
          return sum;
        }
        else
        {
          //process the unaligned leading section of the arrays
          std::complex<float> sum(0);
          while(((vcl_size_t)x)%16)
          {
            if(n<=0)
              return sum;
            sum+=x[0]*y[0];
            y++;
            x++;
            n--;
          }

          __m128 sumReg=_mm_setzero_ps();
          __m128 reg0,reg1,reg2,reg3,reg4;
  #ifndef VIENNACL_WITH_SSE3
          __m128 nreg=_mm_set_ps(1.0f,-1.0f,1.0f,-1.0f);
  #endif

          //process complex floats 4 at a time
          while(n>=4)
          {
            //read floats into XMM registers (8 from each array)
            reg0=_mm_load_ps((float*)(x+0));
            reg1=_mm_load_ps((float*)(x+2));
            reg2=_mm_load_ps((float*)(y+0));
            reg3=_mm_load_ps((float*)(y+2));

            //multiply the complex floats together
  #ifndef VIENNACL_WITH_SSE3
            reg4=_mm_shuffle_ps(reg2,reg2,0xA0);
            reg2=_mm_shuffle_ps(reg2,reg2,0xF5);
            reg4=_mm_mul_ps(reg4,reg0);
            reg2=_mm_mul_ps(reg2,reg0);
            reg2=_mm_shuffle_ps(reg2,reg2,0xB1);
            reg2=_mm_mul_ps(reg2,nreg);
            reg0=_mm_add_ps(reg4,reg2);
            reg4=_mm_shuffle_ps(reg3,reg3,0xA0);
            reg3=_mm_shuffle_ps(reg3,reg3,0xF5);
            reg4=_mm_mul_ps(reg4,reg1);
            reg3=_mm_mul_ps(reg3,reg1);
            reg3=_mm_shuffle_ps(reg3,reg3,0xB1);
            reg3=_mm_mul_ps(reg3,nreg);
            reg1=_mm_add_ps(reg4,reg3);
  #else
            reg4=_mm_moveldup_ps(reg2);
            reg2=_mm_movehdup_ps(reg2);
            reg4=_mm_mul_ps(reg4,reg0);
            reg2=_mm_mul_ps(reg2,reg0);
            reg2=_mm_shuffle_ps(reg2,reg2,0xB1);
            reg0=_mm_addsub_ps(reg4,reg2);
            reg4=_mm_moveldup_ps(reg3);
            reg3=_mm_movehdup_ps(reg3);
            reg4=_mm_mul_ps(reg4,reg1);
            reg3=_mm_mul_ps(reg3,reg1);
            reg3=_mm_shuffle_ps(reg3,reg3,0xB1);
            reg1=_mm_addsub_ps(reg4,reg3);
  #endif

            //add to the running sum
            sumReg=_mm_add_ps(sumReg,reg0);
            sumReg=_mm_add_ps(sumReg,reg1);

            x+=4;
            y+=4;
            n-=4;
          }

          //add beyond where the vector loop stopped
          for(vcl_size_t i=0;i<n;i++)
            sum+=x[i]*y[i];

          //move the sums from the XMM register to aligned memory on the stack
          std::complex<float> sums[4];
          std::complex<float>* pSums=(std::complex<float>*)((((vcl_size_t)sums)&(~15))+16);
          pSums[0]=std::complex<float>(0);
          pSums[1]=std::complex<float>(0);
          _mm_store_ps((float*)pSums,sumReg);

          return sum+pSums[0]+pSums[1];
        }
      }
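
      //(In the reduction above, the accumulator already holds real parts in
      //the even lanes and imaginary parts in the odd lanes, matching the
      //memory layout of std::complex<float>, so the two partial complex sums
      //can be stored and added directly; contrast with _dotc below.)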

      //zdotu
      template <>
      inline std::complex<double> _dot<std::complex<double> >(vcl_size_t n, const std::complex<double>* x, const std::complex<double>* y)
      {
        //use plain scalar code if the arrays are short or if either is not 16-byte aligned
        if(n<16||((vcl_size_t)x)%16||((vcl_size_t)y)%16)
        {
          std::complex<double> sum(0);
          for(vcl_size_t i=0;i<n;i++)
            sum+=x[i]*y[i];
          return sum;
        }
        else
        {
          __m128d sumReg=_mm_setzero_pd();
          __m128d reg0,reg1,reg2,reg3,reg4;
  #ifndef VIENNACL_WITH_SSE3
          __m128d nreg=_mm_set_pd(1.0,-1.0);
  #endif

          //process complex doubles 2 at a time
          while(n>=2)
          {
            //read doubles into XMM registers (4 from each array)
            reg0=_mm_load_pd((double*)(x+0));
            reg1=_mm_load_pd((double*)(x+1));
            reg2=_mm_load_pd((double*)(y+0));
            reg3=_mm_load_pd((double*)(y+1));

            //multiply the complex doubles together
  #ifndef VIENNACL_WITH_SSE3
            reg4=_mm_shuffle_pd(reg2,reg2,0x0);
            reg2=_mm_shuffle_pd(reg2,reg2,0x3);
            reg4=_mm_mul_pd(reg4,reg0);
            reg2=_mm_mul_pd(reg2,reg0);
            reg2=_mm_shuffle_pd(reg2,reg2,0x1);
            reg2=_mm_mul_pd(reg2,nreg);
            reg0=_mm_add_pd(reg4,reg2);
            reg4=_mm_shuffle_pd(reg3,reg3,0x0);
            reg3=_mm_shuffle_pd(reg3,reg3,0x3);
            reg4=_mm_mul_pd(reg4,reg1);
            reg3=_mm_mul_pd(reg3,reg1);
            reg3=_mm_shuffle_pd(reg3,reg3,0x1);
            reg3=_mm_mul_pd(reg3,nreg);
            reg1=_mm_add_pd(reg4,reg3);
  #else
            reg4=_mm_shuffle_pd(reg2,reg2,0x0);
            reg2=_mm_shuffle_pd(reg2,reg2,0x3);
            reg4=_mm_mul_pd(reg4,reg0);
            reg2=_mm_mul_pd(reg2,reg0);
            reg2=_mm_shuffle_pd(reg2,reg2,0x1);
            reg0=_mm_addsub_pd(reg4,reg2);
            reg4=_mm_shuffle_pd(reg3,reg3,0x0);
            reg3=_mm_shuffle_pd(reg3,reg3,0x3);
            reg4=_mm_mul_pd(reg4,reg1);
            reg3=_mm_mul_pd(reg3,reg1);
            reg3=_mm_shuffle_pd(reg3,reg3,0x1);
            reg1=_mm_addsub_pd(reg4,reg3);
  #endif

            //add to the running sum
            sumReg=_mm_add_pd(sumReg,reg0);
            sumReg=_mm_add_pd(sumReg,reg1);

            x+=2;
            y+=2;
            n-=2;
          }

          //handle the remaining element, if any
          std::complex<double> sum(0);
          if(n)
            sum=x[0]*y[0];

          //move the sum from the XMM register to aligned memory on the stack
          std::complex<double> sums[2];
          std::complex<double>* pSums=(std::complex<double>*)((((vcl_size_t)sums)&(~15))+16);
          pSums[0]=std::complex<double>(0);
          _mm_store_pd((double*)pSums,sumReg);

          return sum+pSums[0];
        }
      }

      //cdotc
      template <>
      inline std::complex<float> _dotc<std::complex<float> >(vcl_size_t n, const std::complex<float>* x, const std::complex<float>* y)
      {
        //use plain scalar code if the arrays are short, if x and y have different
        //offsets from a 16-byte boundary, or if x is not element-aligned
        if(n<16||((vcl_size_t)x)%16!=((vcl_size_t)y)%16||((vcl_size_t)x)%sizeof(std::complex<float>)!=0)
        {
          std::complex<float> sum(0);
          for(vcl_size_t i=0;i<n;i++)
            sum+=std::conj(x[i])*y[i];
          return sum;
        }
        else
        {
          //process the unaligned leading section of the arrays
          std::complex<float> sum(0);
          while(((vcl_size_t)x)%16)
          {
            if(n<=0)
              return sum;
            sum+=std::conj(x[0])*y[0];
            y++;
            x++;
            n--;
          }

          __m128 sumReg=_mm_setzero_ps();
          __m128 reg0,reg1,reg2,reg3,reg4;
  #ifndef VIENNACL_WITH_SSE3
          __m128 nreg=_mm_set_ps(1.0f,-1.0f,1.0f,-1.0f);
  #endif

          //process complex floats 4 at a time
          while(n>=4)
          {
            //read floats into XMM registers (8 from each array)
            reg0=_mm_load_ps((float*)(x+0));
            reg1=_mm_load_ps((float*)(x+2));
            reg2=_mm_load_ps((float*)(y+0));
            reg3=_mm_load_ps((float*)(y+2));

            //multiply the conjugated complex floats together
  #ifndef VIENNACL_WITH_SSE3
            reg4=_mm_shuffle_ps(reg2,reg2,0xA0);
            reg2=_mm_shuffle_ps(reg2,reg2,0xF5);
            reg4=_mm_mul_ps(reg4,reg0);
            reg2=_mm_mul_ps(reg2,reg0);
            reg4=_mm_shuffle_ps(reg4,reg4,0xB1);
            reg4=_mm_mul_ps(reg4,nreg);
            reg0=_mm_add_ps(reg4,reg2);
            reg4=_mm_shuffle_ps(reg3,reg3,0xA0);
            reg3=_mm_shuffle_ps(reg3,reg3,0xF5);
            reg4=_mm_mul_ps(reg4,reg1);
            reg3=_mm_mul_ps(reg3,reg1);
            reg4=_mm_shuffle_ps(reg4,reg4,0xB1);
            reg4=_mm_mul_ps(reg4,nreg);
            reg1=_mm_add_ps(reg4,reg3);
  #else
            reg4=_mm_moveldup_ps(reg2);
            reg2=_mm_movehdup_ps(reg2);
            reg4=_mm_mul_ps(reg4,reg0);
            reg2=_mm_mul_ps(reg2,reg0);
            reg4=_mm_shuffle_ps(reg4,reg4,0xB1);
            reg0=_mm_addsub_ps(reg2,reg4);
            reg4=_mm_moveldup_ps(reg3);
            reg3=_mm_movehdup_ps(reg3);
            reg4=_mm_mul_ps(reg4,reg1);
            reg3=_mm_mul_ps(reg3,reg1);
            reg4=_mm_shuffle_ps(reg4,reg4,0xB1);
            reg1=_mm_addsub_ps(reg3,reg4);
  #endif

            //add to the running sum
            sumReg=_mm_add_ps(sumReg,reg0);
            sumReg=_mm_add_ps(sumReg,reg1);

            x+=4;
            y+=4;
            n-=4;
          }

          //add beyond where the vector loop stopped
          for(vcl_size_t i=0;i<n;i++)
            sum+=std::conj(x[i])*y[i];

          //move the sums from the XMM register to aligned memory on the stack
          std::complex<float> sums[4];
          std::complex<float>* pSums=(std::complex<float>*)((((vcl_size_t)sums)&(~15))+16);
          sumReg=_mm_shuffle_ps(sumReg,sumReg,0xB1);//swap real and imaginary lanes
          _mm_store_ps((float*)pSums,sumReg);

          return sum+pSums[0]+pSums[1];
        }
      }
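
      //(The final 0xB1 shuffle above is needed because the conjugated-product
      //sequence, in both branches, accumulates with its lanes transposed:
      //imaginary parts land in the even lanes and real parts in the odd
      //lanes, the reverse of the std::complex<float> layout, so the lanes are
      //swapped once before the store. zdotc below does the same with a single
      //_mm_shuffle_pd.)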

      //zdotc
      template <>
      inline std::complex<double> _dotc<std::complex<double> >(vcl_size_t n, const std::complex<double>* x, const std::complex<double>* y)
      {
        //use plain scalar code if the arrays are short or if either is not 16-byte aligned
        if(n<16||((vcl_size_t)x)%16||((vcl_size_t)y)%16)
        {
          std::complex<double> sum(0);
          for(vcl_size_t i=0;i<n;i++)
            sum+=std::conj(x[i])*y[i];
          return sum;
        }
        else
        {
          __m128d sumReg=_mm_setzero_pd();
          __m128d reg0,reg1,reg2,reg3,reg4;
  #ifndef VIENNACL_WITH_SSE3
          __m128d nreg=_mm_set_pd(1.0,-1.0);
  #endif

          //process complex doubles 2 at a time
          while(n>=2)
          {
            //read doubles into XMM registers (4 from each array)
            reg0=_mm_load_pd((double*)(x+0));
            reg1=_mm_load_pd((double*)(x+1));
            reg2=_mm_load_pd((double*)(y+0));
            reg3=_mm_load_pd((double*)(y+1));

            //multiply the conjugated complex doubles together
  #ifndef VIENNACL_WITH_SSE3
            reg4=_mm_shuffle_pd(reg2,reg2,0x0);
            reg2=_mm_shuffle_pd(reg2,reg2,0x3);
            reg4=_mm_mul_pd(reg4,reg0);
            reg2=_mm_mul_pd(reg2,reg0);
            reg4=_mm_shuffle_pd(reg4,reg4,0x1);
            reg4=_mm_mul_pd(reg4,nreg);
            reg0=_mm_add_pd(reg4,reg2);
            reg4=_mm_shuffle_pd(reg3,reg3,0x0);
            reg3=_mm_shuffle_pd(reg3,reg3,0x3);
            reg4=_mm_mul_pd(reg4,reg1);
            reg3=_mm_mul_pd(reg3,reg1);
            reg4=_mm_shuffle_pd(reg4,reg4,0x1);
            reg4=_mm_mul_pd(reg4,nreg);
            reg1=_mm_add_pd(reg4,reg3);
  #else
            reg4=_mm_shuffle_pd(reg2,reg2,0x0);
            reg2=_mm_shuffle_pd(reg2,reg2,0x3);
            reg4=_mm_mul_pd(reg4,reg0);
            reg2=_mm_mul_pd(reg2,reg0);
            reg4=_mm_shuffle_pd(reg4,reg4,0x1);
            reg0=_mm_addsub_pd(reg2,reg4);
            reg4=_mm_shuffle_pd(reg3,reg3,0x0);
            reg3=_mm_shuffle_pd(reg3,reg3,0x3);
            reg4=_mm_mul_pd(reg4,reg1);
            reg3=_mm_mul_pd(reg3,reg1);
            reg4=_mm_shuffle_pd(reg4,reg4,0x1);
            reg1=_mm_addsub_pd(reg3,reg4);
  #endif

            //add to the running sum
            sumReg=_mm_add_pd(sumReg,reg0);
            sumReg=_mm_add_pd(sumReg,reg1);

            x+=2;
            y+=2;
            n-=2;
          }

          //handle the remaining element, if any
          std::complex<double> sum(0);
          if(n)
            sum=std::conj(x[0])*y[0];

          //move the sum from the XMM register to aligned memory on the stack
          std::complex<double> sums[2];
          std::complex<double>* pSums=(std::complex<double>*)((((vcl_size_t)sums)&(~15))+16);
          sumReg=_mm_shuffle_pd(sumReg,sumReg,0x1);//swap real and imaginary lanes
          _mm_store_pd((double*)pSums,sumReg);

          return sum+pSums[0];
        }
      }

  #endif //defined VIENNACL_WITH_COMPLEX

  #endif //defined VIENNACL_WITH_SSE2

    } //namespace host_based
  } //namespace linalg
} //namespace viennacl

#endif