ViennaCL - The Vienna Computing Library
1.5.1
|
00001 #ifndef VIENNACL_LINALG_DETAIL_SPAI_QR_HPP 00002 #define VIENNACL_LINALG_DETAIL_SPAI_QR_HPP 00003 00004 /* ========================================================================= 00005 Copyright (c) 2010-2014, Institute for Microelectronics, 00006 Institute for Analysis and Scientific Computing, 00007 TU Wien. 00008 Portions of this software are copyright by UChicago Argonne, LLC. 00009 00010 ----------------- 00011 ViennaCL - The Vienna Computing Library 00012 ----------------- 00013 00014 Project Head: Karl Rupp rupp@iue.tuwien.ac.at 00015 00016 (A list of authors and contributors can be found in the PDF manual) 00017 00018 License: MIT (X11), see file LICENSE in the base directory 00019 ============================================================================= */ 00020 00027 #include <utility> 00028 #include <iostream> 00029 #include <fstream> 00030 #include <string> 00031 #include <algorithm> 00032 #include <vector> 00033 #include <math.h> 00034 #include <cmath> 00035 #include <sstream> 00036 #include "viennacl/ocl/backend.hpp" 00037 #include "boost/numeric/ublas/vector.hpp" 00038 #include "boost/numeric/ublas/matrix.hpp" 00039 #include "boost/numeric/ublas/matrix_proxy.hpp" 00040 #include "boost/numeric/ublas/storage.hpp" 00041 #include "boost/numeric/ublas/io.hpp" 00042 #include "boost/numeric/ublas/matrix_expression.hpp" 00043 #include "boost/numeric/ublas/detail/matrix_assign.hpp" 00044 00045 #include "viennacl/vector.hpp" 00046 #include "viennacl/matrix.hpp" 00047 00048 #include "viennacl/linalg/detail/spai/block_matrix.hpp" 00049 #include "viennacl/linalg/detail/spai/block_vector.hpp" 00050 #include "viennacl/linalg/opencl/kernels/spai.hpp" 00051 00052 namespace viennacl 00053 { 00054 namespace linalg 00055 { 00056 namespace detail 00057 { 00058 namespace spai 00059 { 00060 00061 //********** DEBUG FUNCTIONS *****************// 00062 template< typename T, typename InputIterator> 00063 void Print(std::ostream& ostr, InputIterator it_begin, InputIterator it_end){ 00064 //std::ostream_iterator<int> it_os(ostr, delimiter); 00065 std::string delimiters = " "; 00066 std::copy(it_begin, it_end, std::ostream_iterator<T>(ostr, delimiters.c_str())); 00067 ostr<<std::endl; 00068 } 00069 00070 template<typename VectorType, typename MatrixType> 00071 void write_to_block(VectorType& con_A_I_J, unsigned int start_ind, const std::vector<unsigned int>& I, const std::vector<unsigned int>& J, MatrixType& m){ 00072 m.resize(I.size(), J.size(), false); 00073 for(vcl_size_t i = 0; i < J.size(); ++i){ 00074 for(vcl_size_t j = 0; j < I.size(); ++j){ 00075 m(j,i) = con_A_I_J[start_ind + i*I.size() + j]; 00076 } 00077 } 00078 } 00079 00080 template<typename VectorType> 00081 void print_continious_matrix(VectorType& con_A_I_J, std::vector<cl_uint>& blocks_ind, 00082 const std::vector<std::vector<unsigned int> >& g_I, const std::vector<std::vector<unsigned int> >& g_J){ 00083 typedef typename VectorType::value_type ScalarType; 00084 std::vector<boost::numeric::ublas::matrix<ScalarType> > com_A_I_J(g_I.size()); 00085 for(vcl_size_t i = 0; i < g_I.size(); ++i){ 00086 write_to_block( con_A_I_J, blocks_ind[i], g_I[i], g_J[i], com_A_I_J[i]); 00087 std::cout<<com_A_I_J[i]<<std::endl; 00088 } 00089 } 00090 template<typename VectorType> 00091 void print_continious_vector(VectorType& con_v, std::vector<cl_uint>& block_ind, const std::vector<std::vector<unsigned int> >& g_J){ 00092 typedef typename VectorType::value_type ScalarType; 00093 std::vector<boost::numeric::ublas::vector<ScalarType> > com_v(g_J.size()); 00094 //Print<ScalarType>(std::cout, con_v.begin(), con_v.end()); 00095 for(vcl_size_t i = 0; i < g_J.size(); ++i){ 00096 com_v[i].resize(g_J[i].size()); 00097 for(vcl_size_t j = 0; j < g_J[i].size(); ++j){ 00098 com_v[i](j) = con_v[block_ind[i] + j]; 00099 } 00100 std::cout<<com_v[i]<<std::endl; 00101 } 00102 } 00103 00105 00112 inline void compute_blocks_size(const std::vector<std::vector<unsigned int> >& g_I, const std::vector<std::vector<unsigned int> >& g_J, 00113 unsigned int& sz, std::vector<cl_uint>& blocks_ind, std::vector<cl_uint>& matrix_dims) 00114 { 00115 sz = 0; 00116 for(vcl_size_t i = 0; i < g_I.size(); ++i){ 00117 sz += static_cast<unsigned int>(g_I[i].size()*g_J[i].size()); 00118 matrix_dims[2*i] = static_cast<cl_uint>(g_I[i].size()); 00119 matrix_dims[2*i + 1] = static_cast<cl_uint>(g_J[i].size()); 00120 blocks_ind[i+1] = blocks_ind[i] + static_cast<cl_uint>(g_I[i].size()*g_J[i].size()); 00121 00122 } 00123 } 00128 template <typename SizeType> 00129 void get_size(const std::vector<std::vector<SizeType> >& inds, SizeType & size){ 00130 size = 0; 00131 for (vcl_size_t i = 0; i < inds.size(); ++i) { 00132 size += static_cast<unsigned int>(inds[i].size()); 00133 } 00134 } 00135 00140 template <typename SizeType> 00141 void init_start_inds(const std::vector<std::vector<SizeType> >& inds, std::vector<cl_uint>& start_inds){ 00142 for(vcl_size_t i = 0; i < inds.size(); ++i){ 00143 start_inds[i+1] = start_inds[i] + static_cast<cl_uint>(inds[i].size()); 00144 } 00145 } 00146 00147 //************************************* QR FUNCTIONS ***************************************// 00153 template<typename MatrixType, typename ScalarType> 00154 void dot_prod(const MatrixType& A, unsigned int beg_ind, ScalarType& res){ 00155 res = static_cast<ScalarType>(0); 00156 for(vcl_size_t i = beg_ind; i < A.size1(); ++i){ 00157 res += A(i, beg_ind-1)*A(i, beg_ind-1); 00158 } 00159 } 00167 template<typename MatrixType, typename VectorType, typename ScalarType> 00168 void custom_inner_prod(const MatrixType& A, const VectorType& v, unsigned int col_ind, unsigned int start_ind, ScalarType& res){ 00169 res = static_cast<ScalarType>(0); 00170 for(unsigned int i = start_ind; i < static_cast<unsigned int>(A.size1()); ++i){ 00171 res += A(i, col_ind)*v(i); 00172 } 00173 } 00174 00180 template<typename MatrixType, typename VectorType> 00181 void copy_vector(const MatrixType & A, VectorType & v, const unsigned int beg_ind){ 00182 for(unsigned int i = beg_ind; i < static_cast<unsigned int>(A.size1()); ++i){ 00183 v(i) = A( i, beg_ind-1); 00184 } 00185 } 00186 00187 //householder reflection c.f. Gene H. Golub, Charles F. Van Loan "Matrix Computations" 3rd edition p.210 00194 template<typename MatrixType, typename VectorType, typename ScalarType> 00195 void householder_vector(const MatrixType& A, unsigned int j, VectorType& v, ScalarType& b) 00196 { 00197 ScalarType sg; 00198 // 00199 dot_prod(A, j+1, sg); 00200 copy_vector(A, v, j+1); 00201 ScalarType mu; 00202 v(j) = static_cast<ScalarType>(1.0); 00203 if(sg == 0){ 00204 b = 0; 00205 } 00206 else{ 00207 mu = std::sqrt(A(j,j)*A(j, j) + sg); 00208 if(A(j, j) <= 0){ 00209 v(j) = A(j, j) - mu; 00210 }else{ 00211 v(j) = -sg/(A(j, j) + mu); 00212 } 00213 b = 2*(v(j)*v(j))/(sg + v(j)*v(j)); 00214 v = v/v(j); 00215 } 00216 } 00223 template<typename MatrixType, typename VectorType, typename ScalarType> 00224 void apply_householder_reflection(MatrixType& A, unsigned int iter_cnt, VectorType& v, ScalarType b) 00225 { 00226 //update every column of matrix A 00227 ScalarType in_prod_res; 00228 for(unsigned int i = iter_cnt; i < static_cast<unsigned int>(A.size2()); ++i){ 00229 //update each column in a fashion: ai = ai - b*v*(v'*ai) 00230 custom_inner_prod(A, v, i, iter_cnt, in_prod_res); 00231 for(unsigned int j = iter_cnt; j < static_cast<unsigned int>(A.size1()); ++j){ 00232 A(j, i) -= b*in_prod_res*v(j); 00233 } 00234 } 00235 } 00236 00242 template<typename MatrixType, typename VectorType> 00243 void store_householder_vector(MatrixType& A, unsigned int ind, VectorType& v) 00244 { 00245 for(unsigned int i = ind; i < static_cast<unsigned int>(A.size1()); ++i){ 00246 A(i, ind-1) = v(i); 00247 } 00248 } 00249 00250 00251 //QR algorithm 00256 template<typename MatrixType, typename VectorType> 00257 void single_qr(MatrixType& R, VectorType& b_v) 00258 { 00259 typedef typename MatrixType::value_type ScalarType; 00260 if((R.size1() > 0) && (R.size2() > 0)){ 00261 VectorType v = (VectorType)boost::numeric::ublas::zero_vector<ScalarType>(R.size1()); 00262 b_v = (VectorType)boost::numeric::ublas::zero_vector<ScalarType>(R.size2()); 00263 for(unsigned int i = 0; i < static_cast<unsigned int>(R.size2()); ++i){ 00264 householder_vector(R, i, v, b_v[i]); 00265 apply_householder_reflection(R, i, v, b_v[i]); 00266 if(i < R.size1()) store_householder_vector(R, i+1, v); 00267 } 00268 } 00269 } 00270 00271 //********************** HELP FUNCTIONS FOR GPU-based QR factorization *************************// 00272 /* * @brief Reading from text file into string 00273 * @param file_name file name 00274 * @param kernel_source string that contains file 00275 00276 void read_kernel_from_file(std::string& file_name, std::string& kernel_source){ 00277 std::ifstream ifs(file_name.c_str(), std::ifstream::in); 00278 00279 if (!ifs) 00280 std::cerr << "WARNING: Cannot open file " << file_name << std::endl; 00281 00282 std::string line; 00283 std::ostringstream ost; 00284 while (std::getline(ifs, line)) { 00285 ost<<line<<std::endl; 00286 } 00287 kernel_source = ost.str(); 00288 }*/ 00289 00294 template <typename SizeType> 00295 void get_max_block_size(const std::vector<std::vector<SizeType> >& inds, SizeType & max_size) 00296 { 00297 max_size = 0; 00298 for(vcl_size_t i = 0; i < inds.size(); ++i){ 00299 if(inds[i].size() > max_size){ 00300 max_size = static_cast<SizeType>(inds[i].size()); 00301 } 00302 } 00303 } 00304 00311 template<typename MatrixType, typename VectorType, typename ScalarType> 00312 void custom_dot_prod(const MatrixType& A, const VectorType& v, unsigned int ind, ScalarType& res) 00313 { 00314 res = static_cast<ScalarType>(0); 00315 for(unsigned int j = ind; j < A.size1(); ++j){ 00316 if(j == ind){ 00317 res += v(j); 00318 }else{ 00319 res += A(j, ind)*v(j); 00320 } 00321 } 00322 } 00323 00329 template<typename MatrixType, typename VectorType> 00330 void apply_q_trans_vec(const MatrixType& R, const VectorType& b_v, VectorType& y) 00331 { 00332 typedef typename MatrixType::value_type ScalarType; 00333 ScalarType inn_prod = static_cast<ScalarType>(0); 00334 for(vcl_size_t i = 0; i < R.size2(); ++i){ 00335 custom_dot_prod(R, y, static_cast<unsigned int>(i), inn_prod); 00336 for(vcl_size_t j = i; j < R.size1(); ++j){ 00337 if(i == j){ 00338 y(j) -= b_v(i)*inn_prod; 00339 } 00340 else{ 00341 y(j) -= b_v(i)*inn_prod*R(j,i); 00342 } 00343 } 00344 } 00345 } 00346 00352 template<typename MatrixType, typename VectorType> 00353 void apply_q_trans_mat(const MatrixType& R, const VectorType& b_v, MatrixType& A) 00354 { 00355 VectorType tmp_v; 00356 for(vcl_size_t i = 0; i < A.size2(); ++i){ 00357 tmp_v = (VectorType)column(A,i); 00358 apply_q_trans_vec(R, b_v, tmp_v); 00359 column(A,i) = tmp_v; 00360 } 00361 } 00362 00363 //parallel QR for GPU 00373 template<typename ScalarType> 00374 void block_qr(std::vector<std::vector<unsigned int> >& g_I, 00375 std::vector<std::vector<unsigned int> >& g_J, 00376 block_matrix& g_A_I_J_vcl, 00377 block_vector& g_bv_vcl, 00378 std::vector<cl_uint>& g_is_update, 00379 viennacl::context ctx) 00380 { 00381 viennacl::ocl::context & opencl_ctx = const_cast<viennacl::ocl::context &>(ctx.opencl_context()); 00382 00383 //typedef typename MatrixType::value_type ScalarType; 00384 unsigned int bv_size = 0; 00385 unsigned int v_size = 0; 00386 //set up arguments for GPU 00387 //find maximum size of rows/columns 00388 unsigned int local_r_n = 0; 00389 unsigned int local_c_n = 0; 00390 //find max size for blocks 00391 get_max_block_size(g_I, local_r_n); 00392 get_max_block_size(g_J, local_c_n); 00393 //get size 00394 get_size(g_J, bv_size); 00395 get_size(g_I, v_size); 00396 //get start indices 00397 std::vector<cl_uint> start_bv_inds(g_I.size() + 1, 0); 00398 std::vector<cl_uint> start_v_inds(g_I.size() + 1, 0); 00399 init_start_inds(g_J, start_bv_inds); 00400 init_start_inds(g_I, start_v_inds); 00401 //init arrays 00402 std::vector<ScalarType> b_v(bv_size, static_cast<ScalarType>(0)); 00403 std::vector<ScalarType> v(v_size, static_cast<ScalarType>(0)); 00404 //call qr program 00405 block_vector v_vcl; 00406 00407 g_bv_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE, 00408 static_cast<unsigned int>(sizeof(ScalarType)*bv_size), 00409 &(b_v[0])); 00410 00411 v_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE, 00412 static_cast<unsigned int>(sizeof(ScalarType)*v_size), 00413 &(v[0])); 00414 //the same as j_start_inds 00415 g_bv_vcl.handle1() = opencl_ctx.create_memory(CL_MEM_READ_WRITE, 00416 static_cast<unsigned int>(sizeof(cl_uint)*g_I.size()), 00417 &(start_bv_inds[0])); 00418 00419 v_vcl.handle1() = opencl_ctx.create_memory(CL_MEM_READ_WRITE, 00420 static_cast<unsigned int>(sizeof(cl_uint)*g_I.size()), 00421 &(start_v_inds[0])); 00422 viennacl::ocl::handle<cl_mem> g_is_update_vcl = opencl_ctx.create_memory(CL_MEM_READ_WRITE, 00423 static_cast<unsigned int>(sizeof(cl_uint)*g_is_update.size()), 00424 &(g_is_update[0])); 00425 //local memory 00426 //viennacl::ocl::enqueue(k(vcl_vec, size, viennacl::ocl::local_mem(sizeof(SCALARTYPE) * k.local_work_size()), temp)); 00427 viennacl::linalg::opencl::kernels::spai<ScalarType>::init(opencl_ctx); 00428 viennacl::ocl::kernel& qr_kernel = opencl_ctx.get_kernel(viennacl::linalg::opencl::kernels::spai<ScalarType>::program_name(), "block_qr"); 00429 qr_kernel.local_work_size(0, local_c_n); 00430 qr_kernel.global_work_size(0, local_c_n*256); 00431 viennacl::ocl::enqueue(qr_kernel(g_A_I_J_vcl.handle(), g_A_I_J_vcl.handle1(), g_bv_vcl.handle(), 00432 v_vcl.handle(), g_A_I_J_vcl.handle2(), 00433 g_bv_vcl.handle1(), v_vcl.handle1(), g_is_update_vcl, 00434 viennacl::ocl::local_mem(static_cast<unsigned int>(sizeof(ScalarType)*(local_r_n*local_c_n))), 00435 static_cast<cl_uint>(g_I.size()))); 00436 00437 } 00438 } 00439 } 00440 } 00441 } 00442 #endif