ViennaCL - The Vienna Computing Library  1.5.1
viennacl/linalg/host_based/sse_kernels.hpp
Go to the documentation of this file.
00001 #ifndef VIENNACL_LINALG_HOST_BASED_SSE_KERNELS_HPP_
00002 #define VIENNACL_LINALG_HOST_BASED_SSE_KERNELS_HPP_
00003 
00004 /* =========================================================================
00005    Copyright (c) 2010-2014, Institute for Microelectronics,
00006                             Institute for Analysis and Scientific Computing,
00007                             TU Wien.
00008    Portions of this software are copyright by UChicago Argonne, LLC.
00009 
00010                             -----------------
00011                   ViennaCL - The Vienna Computing Library
00012                             -----------------
00013 
00014    Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
00015 
00016    (A list of authors and contributors can be found in the PDF manual)
00017 
00018    License:         MIT (X11), see file LICENSE in the base directory
00019 ============================================================================= */
00020 
00027 #ifdef VIENNACL_WITH_OPENMP
00028 #include <omp.h>
00029 #endif
00030 
00031 #include <iostream>
00032 #include <vector>
00033 
00034 //for std::min
00035 #include <algorithm>
00036 
00037 #include "viennacl/linalg/host_based/sse_blas.hpp"
00038 
00039 namespace viennacl
00040 {
00041   namespace linalg
00042   {
00043     namespace host_based
00044     {
00045       namespace detail
00046       {
00047 
00048         // returns true if the matrix is hermitian (or real symmetric), false otherwise
00049         template <typename ScalarType>
00050         bool isHermitian(ScalarType ** const A, vcl_size_t n)
00051         {
00052           for(vcl_size_t i=0;i<n;i++)
00053             for(vcl_size_t j=i;j<n;j++)
00054               if(A[i][j] != conjIfComplex(A[j][i]))
00055                 return false;
00056           return true;
00057         }
00058 
00059         // returns the bandwidth of a hermitian (or real symmetric) matrix
00060         template <typename ScalarType>
00061         vcl_size_t getHermitianBandwidth(ScalarType ** const A, vcl_size_t n)
00062         {
00063           for(vcl_size_t i=n-1;i>=0;i--)
00064             for(vcl_size_t j=0;j<n-i;j++)
00065               if(A[i+j][j]!=ScalarType(0))
00066                 return 2*i+1;
00067           return 0;
00068         }
00069 
00070         // helper for tridiagonalizeBandedMatrix
00071         // does a householder similarity transform to eliminate a range of nonzeros in a row of a hermitian matrix
00072         template <typename ScalarType>
00073         void eliminateHermitian(ScalarType ** A, vcl_size_t row, vcl_size_t from, vcl_size_t to, vcl_size_t width, ScalarType * ss)
00074         {
00075           if(from>=to)
00076             return;
00077 
00078           ScalarType norm=_nrm2(&A[row][row+from],to-from);
00079 
00080           if(norm != ScalarType(0))
00081           {
00082 
00083             //pick the better of two reflectors, to 1 or -1
00084             //this is wierd syntax that also works with std::complex
00085             if(std::abs(A[row][row+from]-ScalarType(1))>std::abs(A[row][row+from]+ScalarType(1)))
00086               norm=-norm;
00087             for(vcl_size_t i=row+from;i<row+to;i++)
00088               A[row][i]/=norm;
00089             A[row][row+from]+=ScalarType(1);
00090 
00091             //apply the similarity transformation
00092 
00093             //left transformation
00094             for(vcl_size_t j=row+1;j<row+width;j++)
00095             {
00096               ScalarType s=_dotc(to-from,&A[row][row+from],&A[j][row+from]);
00097               s=-s/A[row][row+from];
00098               _axpy(&A[row][row+from],&A[j][row+from],to-from,s);
00099             }
00100 
00101             //conjugate householder reflector for right transformation
00102             for(vcl_size_t i=row+from;i<row+to;i++)
00103               A[row][i]=conjIfComplex(A[row][i]);
00104 
00105             //right transformation (cache aligned)
00106             for(vcl_size_t i=0;i<width;i++)
00107               ss[i]=ScalarType(0);
00108             for(vcl_size_t i=from;i<to;i++)
00109               _axpy(&A[row+i][row],ss,width,conjIfComplex(A[row][row+i]));
00110             for(vcl_size_t i=0;i<width;i++)
00111               ss[i]=-ss[i]/A[row][row+from];
00112             for(vcl_size_t i=from;i<to;i++)
00113               _axpy(ss,&A[row+i][row],width,A[row][row+i]);
00114 
00115             //clean up the householder reflector
00116             for(vcl_size_t col=row+from;col<row+to;col++)
00117               A[row][col]=conjIfComplex(A[col][row]);
00118 
00119           }
00120         }
00121 
00122         // reduces a hermitian (or symmetric real) banded matrix to a hermitian (or symmetric real) tridiagonal matrix,
00123         // using householder similarity transforms, so eigenvalues are preserved.
00124         // bandwidth should be an odd integer, such as 3 for an already tridiagonal matrix
00125         // based on http://www.netlib.org/lapack/lawnspdf/lawn208.pdf
00126         template<typename ScalarType>
00127         void tridiagonalizeHermitianBandedMatrix(ScalarType ** A, vcl_size_t n, vcl_size_t bandwidth)
00128         {
00129           if(bandwidth<=3)
00130             return;
00131 
00132           vcl_size_t belowDiagonal=(bandwidth-1)/2;
00133           ScalarType *ss=new ScalarType[bandwidth+belowDiagonal];
00134 
00135           //eliminate and chase bulges where the elimination makes a bulge
00136           vcl_size_t k=0;
00137           for(;k<n-belowDiagonal;k++)
00138           {
00139 
00140               //eliminate below the diagonal
00141               eliminateHermitian(A,k,1,1+belowDiagonal,std::min(n-k,2*belowDiagonal+1),ss);
00142 
00143               //chase the bulge
00144               for(vcl_size_t bulgeStart=k+1;bulgeStart<n-belowDiagonal;bulgeStart+=belowDiagonal)
00145                   for(vcl_size_t i=0;i<belowDiagonal-1;i++)
00146                       eliminateHermitian(A,bulgeStart+i,belowDiagonal,std::min(n-bulgeStart-i,belowDiagonal*2-i),std::min(n-bulgeStart-i,bandwidth+belowDiagonal),ss);
00147           }
00148 
00149           //eliminate beyond where elimination makes bulges
00150           for(;k<n-2;k++)
00151               eliminateHermitian(A,k,1,n-k,n-k,ss);
00152 
00153           delete [] ss;
00154         }
00155 
00156         // reduces a hermitian (or symmetric real) matrix to a hermitian (or symmetric real) banded matrix with bandwidth 2*block_size+1
00157         // using householder similarity transformations, so eigenvalues are preserved. reduceToBandedMatrix(A,1) reduces the matrix to tridiagonal
00158         template<typename ScalarType>
00159         void reduceHermitianToBandedMatrix(ScalarType ** A, vcl_size_t n, vcl_size_t block_size, vcl_size_t num_threads)
00160         {
00161           ScalarType* norms=new ScalarType[block_size];
00162           ScalarType* ss=new ScalarType[n];
00163 
00164           for (vcl_size_t k=0;k<n-block_size;k+=block_size)
00165           {
00166             for(vcl_size_t bi=0;bi<std::min(block_size,n-k-block_size);bi++)
00167             {
00168 
00169               //this is the same as the norm of the column, since it's hermetian
00170               norms[bi]=_nrm2(&A[k+bi][k+bi+block_size],n-k-bi-block_size);
00171 
00172               if(norms[bi]!=ScalarType(0))
00173               {
00174 
00175                 //pick the better of two reflectors, to 1 or -1
00176                 //this is wierd syntax that also works with std::complex
00177                 if(std::abs(A[k+bi][k+bi+block_size]-ScalarType(1))>std::abs(A[k+bi][k+bi+block_size]+ScalarType(1)))
00178                     norms[bi]=-norms[bi];
00179                 for(vcl_size_t i=k+bi+block_size;i<n;i++)
00180                     A[k+bi][i]/=norms[bi];
00181                 A[k+bi][k+bi+block_size]+=ScalarType(1);
00182 
00183                 // Apply transformation to remaining rows within the block
00184                 for(vcl_size_t j=k+bi+1;j<k+block_size;j++)
00185                 {
00186                     ScalarType s=_dotc(n-k-bi-block_size,&A[k+bi][k+bi+block_size],&A[j][k+bi+block_size]);
00187                     s=-s/A[k+bi][k+bi+block_size];
00188                     _axpy(&A[k+bi][k+bi+block_size],&A[j][k+bi+block_size],n-k-bi-block_size,s);
00189                 }
00190               }
00191             }
00192 
00193             //apply transformations from block to remaining rows and columns the block in parallel
00194 
00195             //left transformations
00196   #ifdef VIENNACL_WITH_OPENMP
00197   #pragma omp parallel for
00198             for(int j=k+block_size;j<(int)n;j++)
00199   #else
00200             for(vcl_size_t j=k+block_size;j<n;j++)
00201   #endif
00202             {
00203               for(vcl_size_t bi=0;bi<std::min(block_size,n-k-block_size);bi++)
00204               {
00205                 if(norms[bi]!=ScalarType(0))
00206                 {
00207                   ScalarType s=_dotc(n-k-bi-block_size,&A[k+bi][k+bi+block_size],&A[j][k+bi+block_size]);
00208                   s=-s/A[k+bi][k+bi+block_size];
00209                   _axpy(&A[k+bi][k+bi+block_size],&A[j][k+bi+block_size],n-k-bi-block_size,s);
00210                 }
00211               }
00212             }
00213 
00214             //conjugate householder reflectors for right transformations
00215             for(vcl_size_t bi=0;bi<block_size;bi++)
00216               for(vcl_size_t i=k+bi+block_size;i<n;i++)
00217                 A[k+bi][i]=conjIfComplex(A[k+bi][i]);
00218 
00219             //right transformations (cache aligned)
00220   #ifdef VIENNACL_WITH_OPENMP
00221   #pragma omp parallel for
00222             for(int section=0;section<(int)num_threads;section++)
00223   #else
00224             for(vcl_size_t section=0;section<num_threads;section++)
00225   #endif
00226             {
00227               vcl_size_t start=((n-k)*(section+0))/num_threads+k;
00228               vcl_size_t end  =((n-k)*(section+1))/num_threads+k;
00229               vcl_size_t length=end-start;
00230               for(vcl_size_t bi=0;bi<std::min(block_size,n-k-block_size);bi++)
00231               {
00232                 if(norms[bi]!=ScalarType(0))
00233                 {
00234                   for(vcl_size_t i=start;i<end;i++)
00235                     ss[i]=ScalarType(0);
00236                   for(vcl_size_t i=k+bi+block_size;i<n;i++)
00237                     _axpy(&A[i][start],ss+start,length,conjIfComplex(A[k+bi][i]));
00238                   for(vcl_size_t i=start;i<end;i++)
00239                     ss[i]=-ss[i]/A[k+bi][k+bi+block_size];
00240                   for(vcl_size_t i=k+bi+block_size;i<n;i++)
00241                     _axpy(ss+start,&A[i][start],length,A[k+bi][i]);
00242                 }
00243               }
00244             }
00245 
00246             //clean up householder reflectors
00247             for(vcl_size_t row=k;row<k+block_size;row++)
00248               for(vcl_size_t col=row+block_size;col<n;col++)
00249                 A[row][col]=conjIfComplex(A[col][row]);
00250           }
00251           delete [] norms;
00252           delete [] ss;
00253         }
00254 
00255       } //namespace detail
00256 
00265       template<typename ScalarType>
00266       void inplace_tred2(ScalarType ** A, vcl_size_t n, vcl_size_t block_size = 1, vcl_size_t num_threads = 1)
00267       {
00268         if(!detail::isHermitian(A,n))
00269           std::cerr << "ViennaCL: Warning in inplace_tred2(): Matrix is not hermitian (or real symmetric)" << std::endl;
00270 
00271         // Don't touch the whole matrix if the bandwidth is already small.
00272         // There's nothing numerically significant about n*4,
00273         // it's just a point I chose to switch to assuming the matrix is full.
00274         vcl_size_t bandwidth=detail::getHermitianBandwidth(A,n);
00275         if(bandwidth*bandwidth*num_threads<n*4 || 2*block_size+1>bandwidth)
00276           detail::tridiagonalizeHermitianBandedMatrix(A,n,bandwidth);
00277         else
00278         {
00279           detail::reduceHermitianToBandedMatrix(A,n,block_size,num_threads);
00280           detail::tridiagonalizeHermitianBandedMatrix(A,n,2*block_size+1);
00281         }
00282       }
00283 
00294       template <typename ScalarType>
00295       bool lu_factorize_row_major(ScalarType ** A, vcl_size_t m, vcl_size_t n, vcl_size_t * piv = NULL, vcl_size_t block_size = 8)
00296       {
00297         // Use a parallel "left-looking", row-operation-based, block Crout/Doolittle algorithm.
00298         if(piv)
00299           for(vcl_size_t i=0; i<m; i++)
00300             piv[i]=i;
00301         bool pivsign=true;
00302 
00303         // Outer loop.
00304             for(vcl_size_t j=0; j<std::min(m,n); j+=block_size)
00305         {
00306                   block_size=std::min(std::min(m-j,n-j),block_size);
00307 
00308           //do Gaussian elimination with partial pivoting in the block
00309           //(in the first few columns of the matrix)
00310           for(vcl_size_t bi=0;bi<block_size;bi++)
00311           {
00312             // Find pivot and exchange if necessary.
00313             vcl_size_t p=j+bi;
00314             if(piv)
00315             {
00316               for(vcl_size_t i=j+bi+1; i<m; i++)
00317                 if(std::abs(A[i][j+bi])>std::abs(A[p][j+bi]))
00318                   p=i;
00319 
00320               if (p!=j+bi)
00321               {
00322                 for(vcl_size_t k=0; k<n; k++)
00323                 {
00324                   ScalarType t=A[p][k];
00325                   A[p][k]=A[j+bi][k];
00326                   A[j+bi][k]=t;
00327                 }
00328 
00329                 //swap pivot vector
00330                 vcl_size_t k = piv[p];
00331                 piv[p] = piv[j+bi];
00332                 piv[j+bi] = k;
00333                 pivsign = !pivsign;
00334               }
00335             }
00336 
00337             //eliminate below the diagonal in the block
00338             ScalarType elimVal=A[j+bi][j+bi];
00339             if(elimVal==ScalarType(0))
00340             {
00341               //apply previous transformations from the block to the top of the submatrix
00342               for(vcl_size_t row=j+1;row<j+bi;row++)
00343                 for(vcl_size_t bi_=0;bi_<row-j;bi_++)
00344                   if(A[row][j+bi_]!=ScalarType(0))
00345                     _axpy(&(A[j+bi_][j+block_size]),&(A[row][j+block_size]),n-j-block_size,-A[row][j+bi_]);
00346               return pivsign;
00347             }
00348             for(vcl_size_t row=j+bi+1;row<m;row++)
00349             {
00350               ScalarType multiplier=A[row][j+bi]/elimVal;
00351                 for(vcl_size_t col=j+bi;col<j+block_size;col++)
00352                   A[row][col]-=multiplier*A[j+bi][col];
00353                     A[row][j+bi]=multiplier;
00354             }
00355           }
00356 
00357           //at this point, the matrix looks something like this (if block size were 4)
00358           //
00359           //U U U U * * * *
00360           //L U U U * * * *
00361           //L L U U * * * *
00362           //L L L U * * * *
00363           //L L L L * * * *
00364           //L L L L * * * *
00365           //L L L L * * * *
00366           //L L L L * * * *
00367 
00368           //apply previous transformations from the block to the top of the submatrix
00369           for(vcl_size_t row=j+1;row<j+block_size;row++)
00370             for(vcl_size_t bi=0;bi<row-j;bi++)
00371               if(A[row][j+bi]!=ScalarType(0))
00372                 _axpy(&(A[j+bi][j+block_size]),&(A[row][j+block_size]),n-j-block_size,-A[row][j+bi]);
00373 
00374           //at this point, the matrix looks something like this (if block size were 4)
00375           //
00376           //U U U U U U U U
00377           //L U U U U U U U
00378           //L L U U U U U U
00379           //L L L U U U U U
00380           //L L L L * * * *
00381           //L L L L * * * *
00382           //L L L L * * * *
00383           //L L L L * * * *
00384 
00385           //apply previous transformations from the block in parallel to the rest of the submatrix
00386   #ifdef VIENNACL_OPENMP
00387   #pragma omp parallel for
00388           for(int row=j+block_size;row<(int)m;row++)
00389   #else
00390           for(vcl_size_t row=j+block_size;row<m;row++)
00391   #endif
00392           for(vcl_size_t bi=0;bi<block_size;bi++)
00393             if(A[row][j+bi]!=ScalarType(0))
00394                 _axpy(&(A[j+bi][j+block_size]),&(A[row][j+block_size]),n-j-block_size,-A[row][j+bi]);
00395         }
00396         return pivsign;
00397       }
00398 
00406       template <typename ScalarType>
00407           std::vector<ScalarType> inplace_qr_col_major(ScalarType ** A, vcl_size_t m, vcl_size_t n, vcl_size_t block_size = 8)
00408       {
00409         std::vector<ScalarType> betas(std::min(m,n));
00410         ScalarType* norms=new ScalarType[block_size];
00411 
00412         for(vcl_size_t k=0; k<std::min(m,n); k+=block_size)
00413         {
00414           block_size=std::min(std::min(m-k,n-k),block_size);
00415 
00416           for(vcl_size_t bi=0;bi<block_size;bi++)
00417           {
00418 
00419             // Compute 2-norm of k+bi-th column below the diagonal
00420             norms[bi]=_nrm2(&A[k+bi][k+bi],m-k-bi);
00421 
00422             if(norms[bi]!=ScalarType(0))
00423             {
00424               //pick the better of two reflectors, to 1 or -1,
00425               //this is wierd syntax that also works with std::complex
00426               if(std::abs(A[k+bi][k+bi]-ScalarType(1))>std::abs(A[k+bi][k+bi]+ScalarType(1)))
00427                 norms[bi]*=-1;
00428               for(vcl_size_t i=k+bi;i<m;i++)
00429                 A[k+bi][i]/=norms[bi];
00430               A[k+bi][k+bi]+=ScalarType(1);
00431 
00432               // Apply transformation to columns within the block
00433               for(vcl_size_t j=k+bi+1; j<k+block_size; j++)
00434               {
00435                 ScalarType s=_dotc(m-k-bi,&A[k+bi][k+bi],&A[j][k+bi]);
00436                 s = -s/A[k+bi][k+bi];
00437                 _axpy(&A[k+bi][k+bi],&A[j][k+bi],m-k-bi,s);
00438               }
00439             }
00440             //temporarily store the diagonal value of R in betas
00441             betas[k+bi]=-norms[bi];
00442           }
00443 
00444           //apply transformations from block to remaining columns to the right of the block in parallel
00445   #ifdef VIENNACL_OPENMP
00446   #pragma omp parallel for
00447           for(int j=k+block_size; j<(int)n; j++)
00448   #else
00449           for(vcl_size_t j=k+block_size; j<n; j++)
00450   #endif
00451           {
00452             for(vcl_size_t bi=0;bi<block_size;bi++)
00453             {
00454               if(norms[bi]!=ScalarType(0))
00455                           {
00456                 ScalarType s=_dotc(m-k-bi,&A[k+bi][k+bi],&A[j][k+bi]);
00457                 s = -s/A[k+bi][k+bi];
00458                 _axpy(&A[k+bi][k+bi],A[j]+k+bi,m-k-bi,s);
00459               }
00460             }
00461           }
00462         }
00463 
00464         //normalize the householder reflectors and store the betas
00465         for(vcl_size_t j=0;j<std::min(m,n);j++)
00466         {
00467           ScalarType beta=A[j][j];
00468           for(vcl_size_t i=j+1;i<m;i++)
00469             A[j][i]/=beta;
00470           A[j][j]=betas[j];//R diagonal values were stored temporarily in betas
00471           betas[j]=beta;
00472         }
00473 
00474         delete [] norms;
00475         return betas;
00476       }
00477 
00486       template <typename ScalarType>
00487       std::vector<ScalarType> inplace_qr_row_major(ScalarType ** A, vcl_size_t m, vcl_size_t n, vcl_size_t block_size = 8, vcl_size_t num_threads = 1)
00488       {
00489         std::vector<ScalarType> betas(std::min(m,n));
00490         ScalarType* norms=new ScalarType[block_size];
00491         ScalarType* ss=new ScalarType[n];
00492 
00493         //allocate O(m) memory for temporary column-major storage of the block for blas functions
00494         ScalarType** block_cols=new ScalarType*[block_size];
00495         for(vcl_size_t i=0;i<block_size;i++)
00496           block_cols[i]=new ScalarType[m];
00497 
00498         for(vcl_size_t k=0; k<std::min(m,n); k+=block_size)
00499         {
00500           block_size=std::min(std::min(m-k,n-k),block_size);
00501 
00502           //copy the block to column-major storage for cache alignment (necessary for _nrm2)
00503           for(vcl_size_t i=0;i<m-k;i++)
00504             for(vcl_size_t bi=0;bi<block_size;bi++)
00505               block_cols[bi][i]=A[k+i][k+bi];
00506 
00507           for(vcl_size_t bi=0;bi<block_size;bi++)
00508           {
00509 
00510             // Compute 2-norm of k+bi-th column below the diagonal
00511             norms[bi]=_nrm2(&block_cols[bi][bi],m-k-bi);
00512 
00513             if(norms[bi]!=ScalarType(0))
00514             {
00515               //pick the better of two reflectors, to 1 or -1,
00516               //this is wierd syntax that also works with std::complex
00517               if(std::abs(block_cols[bi][bi]-ScalarType(1))>std::abs(block_cols[bi][bi]+ScalarType(1)))
00518                 norms[bi]*=-1;
00519               for(vcl_size_t i=bi;i<m-k;i++)
00520                 block_cols[bi][i]/=norms[bi];
00521               block_cols[bi][bi]+=ScalarType(1);
00522 
00523               // Apply transformation to columns within the block
00524               for(vcl_size_t j=bi+1; j<block_size; j++)
00525               {
00526                 ScalarType s=_dotc(m-k-bi,&block_cols[bi][bi],&block_cols[j][bi]);
00527                 s = -s/block_cols[bi][bi];
00528                 _axpy(&block_cols[bi][bi],&block_cols[j][bi],m-k-bi,s);
00529               }
00530             }
00531             //temporarily store the diagonal value of R in betas
00532             betas[k+bi]=-norms[bi];
00533           }
00534 
00535           //copy the block back to row-major storage
00536           for(vcl_size_t i=0;i<m-k;i++)
00537             for(vcl_size_t bi=0;bi<block_size;bi++)
00538               A[k+i][k+bi]=block_cols[bi][i];
00539 
00540           //apply transformations from block to remaining rows to the right of the block in parallel
00541   #ifdef VIENNACL_OPENMP
00542   #pragma omp parallel for
00543           for(int section=0;section<(int)num_threads;section++)
00544   #else
00545           for(vcl_size_t section=0;section<num_threads;section++)
00546   #endif
00547           {
00548             vcl_size_t start=((n-k-block_size)*(section+0))/num_threads+k+block_size;
00549             vcl_size_t end  =((n-k-block_size)*(section+1))/num_threads+k+block_size;
00550             vcl_size_t length=end-start;
00551             for(vcl_size_t bi=0;bi<block_size;bi++)
00552             {
00553               if(norms[bi]!=ScalarType(0))
00554               {
00555                 for(vcl_size_t i=start;i<end;i++)
00556                   ss[i]=ScalarType(0);
00557                 for(vcl_size_t i=k+bi;i<m;i++)
00558                   _axpy(&A[i][start],ss+start,length,A[i][k+bi]);
00559                 for(vcl_size_t i=start;i<end;i++)
00560                   ss[i]=-ss[i]/A[k+bi][k+bi];
00561                 for(vcl_size_t i=k+bi;i<m;i++)
00562                   _axpy(ss+start,&A[i][start],length,A[i][k+bi]);
00563               }
00564             }
00565           }
00566         }
00567 
00568         //normalize the householder reflectors and store the betas
00569         for(vcl_size_t j=0;j<std::min(m,n);j++)
00570         {
00571           ScalarType beta=A[j][j];
00572           for(vcl_size_t i=j+1;i<m;i++)
00573             A[i][j]/=beta;
00574           A[j][j]=betas[j];//R diagonal values were stored temporarily in betas
00575           betas[j]=beta;
00576         }
00577 
00578         delete [] norms;
00579         for(vcl_size_t i=0;i<block_size;i++)
00580           delete [] block_cols[i];
00581         delete [] block_cols;
00582         delete [] ss;
00583 
00584         return betas;
00585       }
00586 
00587     } //namespace host_based
00588   } //namespace linalg
00589 } //namespace viennacl
00590 #endif