ViennaCL - The Vienna Computing Library
1.5.1
|
00001 #ifndef VIENNACL_LINALG_HOST_BASED_SSE_KERNELS_HPP_ 00002 #define VIENNACL_LINALG_HOST_BASED_SSE_KERNELS_HPP_ 00003 00004 /* ========================================================================= 00005 Copyright (c) 2010-2014, Institute for Microelectronics, 00006 Institute for Analysis and Scientific Computing, 00007 TU Wien. 00008 Portions of this software are copyright by UChicago Argonne, LLC. 00009 00010 ----------------- 00011 ViennaCL - The Vienna Computing Library 00012 ----------------- 00013 00014 Project Head: Karl Rupp rupp@iue.tuwien.ac.at 00015 00016 (A list of authors and contributors can be found in the PDF manual) 00017 00018 License: MIT (X11), see file LICENSE in the base directory 00019 ============================================================================= */ 00020 00027 #ifdef VIENNACL_WITH_OPENMP 00028 #include <omp.h> 00029 #endif 00030 00031 #include <iostream> 00032 #include <vector> 00033 00034 //for std::min 00035 #include <algorithm> 00036 00037 #include "viennacl/linalg/host_based/sse_blas.hpp" 00038 00039 namespace viennacl 00040 { 00041 namespace linalg 00042 { 00043 namespace host_based 00044 { 00045 namespace detail 00046 { 00047 00048 // returns true if the matrix is hermitian (or real symmetric), false otherwise 00049 template <typename ScalarType> 00050 bool isHermitian(ScalarType ** const A, vcl_size_t n) 00051 { 00052 for(vcl_size_t i=0;i<n;i++) 00053 for(vcl_size_t j=i;j<n;j++) 00054 if(A[i][j] != conjIfComplex(A[j][i])) 00055 return false; 00056 return true; 00057 } 00058 00059 // returns the bandwidth of a hermitian (or real symmetric) matrix 00060 template <typename ScalarType> 00061 vcl_size_t getHermitianBandwidth(ScalarType ** const A, vcl_size_t n) 00062 { 00063 for(vcl_size_t i=n-1;i>=0;i--) 00064 for(vcl_size_t j=0;j<n-i;j++) 00065 if(A[i+j][j]!=ScalarType(0)) 00066 return 2*i+1; 00067 return 0; 00068 } 00069 00070 // helper for tridiagonalizeBandedMatrix 00071 // does a householder similarity transform to eliminate a range of nonzeros in a row of a hermitian matrix 00072 template <typename ScalarType> 00073 void eliminateHermitian(ScalarType ** A, vcl_size_t row, vcl_size_t from, vcl_size_t to, vcl_size_t width, ScalarType * ss) 00074 { 00075 if(from>=to) 00076 return; 00077 00078 ScalarType norm=_nrm2(&A[row][row+from],to-from); 00079 00080 if(norm != ScalarType(0)) 00081 { 00082 00083 //pick the better of two reflectors, to 1 or -1 00084 //this is wierd syntax that also works with std::complex 00085 if(std::abs(A[row][row+from]-ScalarType(1))>std::abs(A[row][row+from]+ScalarType(1))) 00086 norm=-norm; 00087 for(vcl_size_t i=row+from;i<row+to;i++) 00088 A[row][i]/=norm; 00089 A[row][row+from]+=ScalarType(1); 00090 00091 //apply the similarity transformation 00092 00093 //left transformation 00094 for(vcl_size_t j=row+1;j<row+width;j++) 00095 { 00096 ScalarType s=_dotc(to-from,&A[row][row+from],&A[j][row+from]); 00097 s=-s/A[row][row+from]; 00098 _axpy(&A[row][row+from],&A[j][row+from],to-from,s); 00099 } 00100 00101 //conjugate householder reflector for right transformation 00102 for(vcl_size_t i=row+from;i<row+to;i++) 00103 A[row][i]=conjIfComplex(A[row][i]); 00104 00105 //right transformation (cache aligned) 00106 for(vcl_size_t i=0;i<width;i++) 00107 ss[i]=ScalarType(0); 00108 for(vcl_size_t i=from;i<to;i++) 00109 _axpy(&A[row+i][row],ss,width,conjIfComplex(A[row][row+i])); 00110 for(vcl_size_t i=0;i<width;i++) 00111 ss[i]=-ss[i]/A[row][row+from]; 00112 for(vcl_size_t i=from;i<to;i++) 00113 _axpy(ss,&A[row+i][row],width,A[row][row+i]); 00114 00115 //clean up the householder reflector 00116 for(vcl_size_t col=row+from;col<row+to;col++) 00117 A[row][col]=conjIfComplex(A[col][row]); 00118 00119 } 00120 } 00121 00122 // reduces a hermitian (or symmetric real) banded matrix to a hermitian (or symmetric real) tridiagonal matrix, 00123 // using householder similarity transforms, so eigenvalues are preserved. 00124 // bandwidth should be an odd integer, such as 3 for an already tridiagonal matrix 00125 // based on http://www.netlib.org/lapack/lawnspdf/lawn208.pdf 00126 template<typename ScalarType> 00127 void tridiagonalizeHermitianBandedMatrix(ScalarType ** A, vcl_size_t n, vcl_size_t bandwidth) 00128 { 00129 if(bandwidth<=3) 00130 return; 00131 00132 vcl_size_t belowDiagonal=(bandwidth-1)/2; 00133 ScalarType *ss=new ScalarType[bandwidth+belowDiagonal]; 00134 00135 //eliminate and chase bulges where the elimination makes a bulge 00136 vcl_size_t k=0; 00137 for(;k<n-belowDiagonal;k++) 00138 { 00139 00140 //eliminate below the diagonal 00141 eliminateHermitian(A,k,1,1+belowDiagonal,std::min(n-k,2*belowDiagonal+1),ss); 00142 00143 //chase the bulge 00144 for(vcl_size_t bulgeStart=k+1;bulgeStart<n-belowDiagonal;bulgeStart+=belowDiagonal) 00145 for(vcl_size_t i=0;i<belowDiagonal-1;i++) 00146 eliminateHermitian(A,bulgeStart+i,belowDiagonal,std::min(n-bulgeStart-i,belowDiagonal*2-i),std::min(n-bulgeStart-i,bandwidth+belowDiagonal),ss); 00147 } 00148 00149 //eliminate beyond where elimination makes bulges 00150 for(;k<n-2;k++) 00151 eliminateHermitian(A,k,1,n-k,n-k,ss); 00152 00153 delete [] ss; 00154 } 00155 00156 // reduces a hermitian (or symmetric real) matrix to a hermitian (or symmetric real) banded matrix with bandwidth 2*block_size+1 00157 // using householder similarity transformations, so eigenvalues are preserved. reduceToBandedMatrix(A,1) reduces the matrix to tridiagonal 00158 template<typename ScalarType> 00159 void reduceHermitianToBandedMatrix(ScalarType ** A, vcl_size_t n, vcl_size_t block_size, vcl_size_t num_threads) 00160 { 00161 ScalarType* norms=new ScalarType[block_size]; 00162 ScalarType* ss=new ScalarType[n]; 00163 00164 for (vcl_size_t k=0;k<n-block_size;k+=block_size) 00165 { 00166 for(vcl_size_t bi=0;bi<std::min(block_size,n-k-block_size);bi++) 00167 { 00168 00169 //this is the same as the norm of the column, since it's hermetian 00170 norms[bi]=_nrm2(&A[k+bi][k+bi+block_size],n-k-bi-block_size); 00171 00172 if(norms[bi]!=ScalarType(0)) 00173 { 00174 00175 //pick the better of two reflectors, to 1 or -1 00176 //this is wierd syntax that also works with std::complex 00177 if(std::abs(A[k+bi][k+bi+block_size]-ScalarType(1))>std::abs(A[k+bi][k+bi+block_size]+ScalarType(1))) 00178 norms[bi]=-norms[bi]; 00179 for(vcl_size_t i=k+bi+block_size;i<n;i++) 00180 A[k+bi][i]/=norms[bi]; 00181 A[k+bi][k+bi+block_size]+=ScalarType(1); 00182 00183 // Apply transformation to remaining rows within the block 00184 for(vcl_size_t j=k+bi+1;j<k+block_size;j++) 00185 { 00186 ScalarType s=_dotc(n-k-bi-block_size,&A[k+bi][k+bi+block_size],&A[j][k+bi+block_size]); 00187 s=-s/A[k+bi][k+bi+block_size]; 00188 _axpy(&A[k+bi][k+bi+block_size],&A[j][k+bi+block_size],n-k-bi-block_size,s); 00189 } 00190 } 00191 } 00192 00193 //apply transformations from block to remaining rows and columns the block in parallel 00194 00195 //left transformations 00196 #ifdef VIENNACL_WITH_OPENMP 00197 #pragma omp parallel for 00198 for(int j=k+block_size;j<(int)n;j++) 00199 #else 00200 for(vcl_size_t j=k+block_size;j<n;j++) 00201 #endif 00202 { 00203 for(vcl_size_t bi=0;bi<std::min(block_size,n-k-block_size);bi++) 00204 { 00205 if(norms[bi]!=ScalarType(0)) 00206 { 00207 ScalarType s=_dotc(n-k-bi-block_size,&A[k+bi][k+bi+block_size],&A[j][k+bi+block_size]); 00208 s=-s/A[k+bi][k+bi+block_size]; 00209 _axpy(&A[k+bi][k+bi+block_size],&A[j][k+bi+block_size],n-k-bi-block_size,s); 00210 } 00211 } 00212 } 00213 00214 //conjugate householder reflectors for right transformations 00215 for(vcl_size_t bi=0;bi<block_size;bi++) 00216 for(vcl_size_t i=k+bi+block_size;i<n;i++) 00217 A[k+bi][i]=conjIfComplex(A[k+bi][i]); 00218 00219 //right transformations (cache aligned) 00220 #ifdef VIENNACL_WITH_OPENMP 00221 #pragma omp parallel for 00222 for(int section=0;section<(int)num_threads;section++) 00223 #else 00224 for(vcl_size_t section=0;section<num_threads;section++) 00225 #endif 00226 { 00227 vcl_size_t start=((n-k)*(section+0))/num_threads+k; 00228 vcl_size_t end =((n-k)*(section+1))/num_threads+k; 00229 vcl_size_t length=end-start; 00230 for(vcl_size_t bi=0;bi<std::min(block_size,n-k-block_size);bi++) 00231 { 00232 if(norms[bi]!=ScalarType(0)) 00233 { 00234 for(vcl_size_t i=start;i<end;i++) 00235 ss[i]=ScalarType(0); 00236 for(vcl_size_t i=k+bi+block_size;i<n;i++) 00237 _axpy(&A[i][start],ss+start,length,conjIfComplex(A[k+bi][i])); 00238 for(vcl_size_t i=start;i<end;i++) 00239 ss[i]=-ss[i]/A[k+bi][k+bi+block_size]; 00240 for(vcl_size_t i=k+bi+block_size;i<n;i++) 00241 _axpy(ss+start,&A[i][start],length,A[k+bi][i]); 00242 } 00243 } 00244 } 00245 00246 //clean up householder reflectors 00247 for(vcl_size_t row=k;row<k+block_size;row++) 00248 for(vcl_size_t col=row+block_size;col<n;col++) 00249 A[row][col]=conjIfComplex(A[col][row]); 00250 } 00251 delete [] norms; 00252 delete [] ss; 00253 } 00254 00255 } //namespace detail 00256 00265 template<typename ScalarType> 00266 void inplace_tred2(ScalarType ** A, vcl_size_t n, vcl_size_t block_size = 1, vcl_size_t num_threads = 1) 00267 { 00268 if(!detail::isHermitian(A,n)) 00269 std::cerr << "ViennaCL: Warning in inplace_tred2(): Matrix is not hermitian (or real symmetric)" << std::endl; 00270 00271 // Don't touch the whole matrix if the bandwidth is already small. 00272 // There's nothing numerically significant about n*4, 00273 // it's just a point I chose to switch to assuming the matrix is full. 00274 vcl_size_t bandwidth=detail::getHermitianBandwidth(A,n); 00275 if(bandwidth*bandwidth*num_threads<n*4 || 2*block_size+1>bandwidth) 00276 detail::tridiagonalizeHermitianBandedMatrix(A,n,bandwidth); 00277 else 00278 { 00279 detail::reduceHermitianToBandedMatrix(A,n,block_size,num_threads); 00280 detail::tridiagonalizeHermitianBandedMatrix(A,n,2*block_size+1); 00281 } 00282 } 00283 00294 template <typename ScalarType> 00295 bool lu_factorize_row_major(ScalarType ** A, vcl_size_t m, vcl_size_t n, vcl_size_t * piv = NULL, vcl_size_t block_size = 8) 00296 { 00297 // Use a parallel "left-looking", row-operation-based, block Crout/Doolittle algorithm. 00298 if(piv) 00299 for(vcl_size_t i=0; i<m; i++) 00300 piv[i]=i; 00301 bool pivsign=true; 00302 00303 // Outer loop. 00304 for(vcl_size_t j=0; j<std::min(m,n); j+=block_size) 00305 { 00306 block_size=std::min(std::min(m-j,n-j),block_size); 00307 00308 //do Gaussian elimination with partial pivoting in the block 00309 //(in the first few columns of the matrix) 00310 for(vcl_size_t bi=0;bi<block_size;bi++) 00311 { 00312 // Find pivot and exchange if necessary. 00313 vcl_size_t p=j+bi; 00314 if(piv) 00315 { 00316 for(vcl_size_t i=j+bi+1; i<m; i++) 00317 if(std::abs(A[i][j+bi])>std::abs(A[p][j+bi])) 00318 p=i; 00319 00320 if (p!=j+bi) 00321 { 00322 for(vcl_size_t k=0; k<n; k++) 00323 { 00324 ScalarType t=A[p][k]; 00325 A[p][k]=A[j+bi][k]; 00326 A[j+bi][k]=t; 00327 } 00328 00329 //swap pivot vector 00330 vcl_size_t k = piv[p]; 00331 piv[p] = piv[j+bi]; 00332 piv[j+bi] = k; 00333 pivsign = !pivsign; 00334 } 00335 } 00336 00337 //eliminate below the diagonal in the block 00338 ScalarType elimVal=A[j+bi][j+bi]; 00339 if(elimVal==ScalarType(0)) 00340 { 00341 //apply previous transformations from the block to the top of the submatrix 00342 for(vcl_size_t row=j+1;row<j+bi;row++) 00343 for(vcl_size_t bi_=0;bi_<row-j;bi_++) 00344 if(A[row][j+bi_]!=ScalarType(0)) 00345 _axpy(&(A[j+bi_][j+block_size]),&(A[row][j+block_size]),n-j-block_size,-A[row][j+bi_]); 00346 return pivsign; 00347 } 00348 for(vcl_size_t row=j+bi+1;row<m;row++) 00349 { 00350 ScalarType multiplier=A[row][j+bi]/elimVal; 00351 for(vcl_size_t col=j+bi;col<j+block_size;col++) 00352 A[row][col]-=multiplier*A[j+bi][col]; 00353 A[row][j+bi]=multiplier; 00354 } 00355 } 00356 00357 //at this point, the matrix looks something like this (if block size were 4) 00358 // 00359 //U U U U * * * * 00360 //L U U U * * * * 00361 //L L U U * * * * 00362 //L L L U * * * * 00363 //L L L L * * * * 00364 //L L L L * * * * 00365 //L L L L * * * * 00366 //L L L L * * * * 00367 00368 //apply previous transformations from the block to the top of the submatrix 00369 for(vcl_size_t row=j+1;row<j+block_size;row++) 00370 for(vcl_size_t bi=0;bi<row-j;bi++) 00371 if(A[row][j+bi]!=ScalarType(0)) 00372 _axpy(&(A[j+bi][j+block_size]),&(A[row][j+block_size]),n-j-block_size,-A[row][j+bi]); 00373 00374 //at this point, the matrix looks something like this (if block size were 4) 00375 // 00376 //U U U U U U U U 00377 //L U U U U U U U 00378 //L L U U U U U U 00379 //L L L U U U U U 00380 //L L L L * * * * 00381 //L L L L * * * * 00382 //L L L L * * * * 00383 //L L L L * * * * 00384 00385 //apply previous transformations from the block in parallel to the rest of the submatrix 00386 #ifdef VIENNACL_OPENMP 00387 #pragma omp parallel for 00388 for(int row=j+block_size;row<(int)m;row++) 00389 #else 00390 for(vcl_size_t row=j+block_size;row<m;row++) 00391 #endif 00392 for(vcl_size_t bi=0;bi<block_size;bi++) 00393 if(A[row][j+bi]!=ScalarType(0)) 00394 _axpy(&(A[j+bi][j+block_size]),&(A[row][j+block_size]),n-j-block_size,-A[row][j+bi]); 00395 } 00396 return pivsign; 00397 } 00398 00406 template <typename ScalarType> 00407 std::vector<ScalarType> inplace_qr_col_major(ScalarType ** A, vcl_size_t m, vcl_size_t n, vcl_size_t block_size = 8) 00408 { 00409 std::vector<ScalarType> betas(std::min(m,n)); 00410 ScalarType* norms=new ScalarType[block_size]; 00411 00412 for(vcl_size_t k=0; k<std::min(m,n); k+=block_size) 00413 { 00414 block_size=std::min(std::min(m-k,n-k),block_size); 00415 00416 for(vcl_size_t bi=0;bi<block_size;bi++) 00417 { 00418 00419 // Compute 2-norm of k+bi-th column below the diagonal 00420 norms[bi]=_nrm2(&A[k+bi][k+bi],m-k-bi); 00421 00422 if(norms[bi]!=ScalarType(0)) 00423 { 00424 //pick the better of two reflectors, to 1 or -1, 00425 //this is wierd syntax that also works with std::complex 00426 if(std::abs(A[k+bi][k+bi]-ScalarType(1))>std::abs(A[k+bi][k+bi]+ScalarType(1))) 00427 norms[bi]*=-1; 00428 for(vcl_size_t i=k+bi;i<m;i++) 00429 A[k+bi][i]/=norms[bi]; 00430 A[k+bi][k+bi]+=ScalarType(1); 00431 00432 // Apply transformation to columns within the block 00433 for(vcl_size_t j=k+bi+1; j<k+block_size; j++) 00434 { 00435 ScalarType s=_dotc(m-k-bi,&A[k+bi][k+bi],&A[j][k+bi]); 00436 s = -s/A[k+bi][k+bi]; 00437 _axpy(&A[k+bi][k+bi],&A[j][k+bi],m-k-bi,s); 00438 } 00439 } 00440 //temporarily store the diagonal value of R in betas 00441 betas[k+bi]=-norms[bi]; 00442 } 00443 00444 //apply transformations from block to remaining columns to the right of the block in parallel 00445 #ifdef VIENNACL_OPENMP 00446 #pragma omp parallel for 00447 for(int j=k+block_size; j<(int)n; j++) 00448 #else 00449 for(vcl_size_t j=k+block_size; j<n; j++) 00450 #endif 00451 { 00452 for(vcl_size_t bi=0;bi<block_size;bi++) 00453 { 00454 if(norms[bi]!=ScalarType(0)) 00455 { 00456 ScalarType s=_dotc(m-k-bi,&A[k+bi][k+bi],&A[j][k+bi]); 00457 s = -s/A[k+bi][k+bi]; 00458 _axpy(&A[k+bi][k+bi],A[j]+k+bi,m-k-bi,s); 00459 } 00460 } 00461 } 00462 } 00463 00464 //normalize the householder reflectors and store the betas 00465 for(vcl_size_t j=0;j<std::min(m,n);j++) 00466 { 00467 ScalarType beta=A[j][j]; 00468 for(vcl_size_t i=j+1;i<m;i++) 00469 A[j][i]/=beta; 00470 A[j][j]=betas[j];//R diagonal values were stored temporarily in betas 00471 betas[j]=beta; 00472 } 00473 00474 delete [] norms; 00475 return betas; 00476 } 00477 00486 template <typename ScalarType> 00487 std::vector<ScalarType> inplace_qr_row_major(ScalarType ** A, vcl_size_t m, vcl_size_t n, vcl_size_t block_size = 8, vcl_size_t num_threads = 1) 00488 { 00489 std::vector<ScalarType> betas(std::min(m,n)); 00490 ScalarType* norms=new ScalarType[block_size]; 00491 ScalarType* ss=new ScalarType[n]; 00492 00493 //allocate O(m) memory for temporary column-major storage of the block for blas functions 00494 ScalarType** block_cols=new ScalarType*[block_size]; 00495 for(vcl_size_t i=0;i<block_size;i++) 00496 block_cols[i]=new ScalarType[m]; 00497 00498 for(vcl_size_t k=0; k<std::min(m,n); k+=block_size) 00499 { 00500 block_size=std::min(std::min(m-k,n-k),block_size); 00501 00502 //copy the block to column-major storage for cache alignment (necessary for _nrm2) 00503 for(vcl_size_t i=0;i<m-k;i++) 00504 for(vcl_size_t bi=0;bi<block_size;bi++) 00505 block_cols[bi][i]=A[k+i][k+bi]; 00506 00507 for(vcl_size_t bi=0;bi<block_size;bi++) 00508 { 00509 00510 // Compute 2-norm of k+bi-th column below the diagonal 00511 norms[bi]=_nrm2(&block_cols[bi][bi],m-k-bi); 00512 00513 if(norms[bi]!=ScalarType(0)) 00514 { 00515 //pick the better of two reflectors, to 1 or -1, 00516 //this is wierd syntax that also works with std::complex 00517 if(std::abs(block_cols[bi][bi]-ScalarType(1))>std::abs(block_cols[bi][bi]+ScalarType(1))) 00518 norms[bi]*=-1; 00519 for(vcl_size_t i=bi;i<m-k;i++) 00520 block_cols[bi][i]/=norms[bi]; 00521 block_cols[bi][bi]+=ScalarType(1); 00522 00523 // Apply transformation to columns within the block 00524 for(vcl_size_t j=bi+1; j<block_size; j++) 00525 { 00526 ScalarType s=_dotc(m-k-bi,&block_cols[bi][bi],&block_cols[j][bi]); 00527 s = -s/block_cols[bi][bi]; 00528 _axpy(&block_cols[bi][bi],&block_cols[j][bi],m-k-bi,s); 00529 } 00530 } 00531 //temporarily store the diagonal value of R in betas 00532 betas[k+bi]=-norms[bi]; 00533 } 00534 00535 //copy the block back to row-major storage 00536 for(vcl_size_t i=0;i<m-k;i++) 00537 for(vcl_size_t bi=0;bi<block_size;bi++) 00538 A[k+i][k+bi]=block_cols[bi][i]; 00539 00540 //apply transformations from block to remaining rows to the right of the block in parallel 00541 #ifdef VIENNACL_OPENMP 00542 #pragma omp parallel for 00543 for(int section=0;section<(int)num_threads;section++) 00544 #else 00545 for(vcl_size_t section=0;section<num_threads;section++) 00546 #endif 00547 { 00548 vcl_size_t start=((n-k-block_size)*(section+0))/num_threads+k+block_size; 00549 vcl_size_t end =((n-k-block_size)*(section+1))/num_threads+k+block_size; 00550 vcl_size_t length=end-start; 00551 for(vcl_size_t bi=0;bi<block_size;bi++) 00552 { 00553 if(norms[bi]!=ScalarType(0)) 00554 { 00555 for(vcl_size_t i=start;i<end;i++) 00556 ss[i]=ScalarType(0); 00557 for(vcl_size_t i=k+bi;i<m;i++) 00558 _axpy(&A[i][start],ss+start,length,A[i][k+bi]); 00559 for(vcl_size_t i=start;i<end;i++) 00560 ss[i]=-ss[i]/A[k+bi][k+bi]; 00561 for(vcl_size_t i=k+bi;i<m;i++) 00562 _axpy(ss+start,&A[i][start],length,A[i][k+bi]); 00563 } 00564 } 00565 } 00566 } 00567 00568 //normalize the householder reflectors and store the betas 00569 for(vcl_size_t j=0;j<std::min(m,n);j++) 00570 { 00571 ScalarType beta=A[j][j]; 00572 for(vcl_size_t i=j+1;i<m;i++) 00573 A[i][j]/=beta; 00574 A[j][j]=betas[j];//R diagonal values were stored temporarily in betas 00575 betas[j]=beta; 00576 } 00577 00578 delete [] norms; 00579 for(vcl_size_t i=0;i<block_size;i++) 00580 delete [] block_cols[i]; 00581 delete [] block_cols; 00582 delete [] ss; 00583 00584 return betas; 00585 } 00586 00587 } //namespace host_based 00588 } //namespace linalg 00589 } //namespace viennacl 00590 #endif