ViennaCL - The Vienna Computing Library  1.5.1
viennacl/linalg/detail/spai/spai.hpp
00001 #ifndef VIENNACL_LINALG_DETAIL_SPAI_SPAI_HPP
00002 #define VIENNACL_LINALG_DETAIL_SPAI_SPAI_HPP
00003 
00004 /* =========================================================================
00005    Copyright (c) 2010-2014, Institute for Microelectronics,
00006                             Institute for Analysis and Scientific Computing,
00007                             TU Wien.
00008    Portions of this software are copyright by UChicago Argonne, LLC.
00009 
00010                             -----------------
00011                   ViennaCL - The Vienna Computing Library
00012                             -----------------
00013 
00014    Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
00015 
00016    (A list of authors and contributors can be found in the PDF manual)
00017 
00018    License:         MIT (X11), see file LICENSE in the base directory
00019 ============================================================================= */
00020 
00025 #include <utility>
00026 #include <iostream>
00027 #include <fstream>
00028 #include <string>
00029 #include <algorithm>
00030 #include <vector>
00031 #include <math.h>
00032 #include <map>
00033 
00034 //local includes
00035 #include "viennacl/linalg/detail/spai/spai_tag.hpp"
00036 #include "viennacl/linalg/qr.hpp"
00037 #include "viennacl/linalg/detail/spai/spai-dynamic.hpp"
00038 #include "viennacl/linalg/detail/spai/spai-static.hpp"
00039 #include "viennacl/linalg/detail/spai/sparse_vector.hpp"
00040 #include "viennacl/linalg/detail/spai/block_matrix.hpp"
00041 #include "viennacl/linalg/detail/spai/block_vector.hpp"
00042 
00043 //boost includes
00044 #include "boost/numeric/ublas/vector.hpp"
00045 #include "boost/numeric/ublas/matrix.hpp"
00046 #include "boost/numeric/ublas/matrix_proxy.hpp"
00047 #include "boost/numeric/ublas/vector_proxy.hpp"
00048 #include "boost/numeric/ublas/storage.hpp"
00049 #include "boost/numeric/ublas/io.hpp"
00050 #include "boost/numeric/ublas/lu.hpp"
00051 #include "boost/numeric/ublas/triangular.hpp"
00052 #include "boost/numeric/ublas/matrix_expression.hpp"
00053 
00054 // ViennaCL includes
00055 #include "viennacl/linalg/prod.hpp"
00056 #include "viennacl/matrix.hpp"
00057 #include "viennacl/compressed_matrix.hpp"
00058 #include "viennacl/linalg/sparse_matrix_operations.hpp"
00059 #include "viennacl/linalg/matrix_operations.hpp"
00060 #include "viennacl/scalar.hpp"
00061 #include "viennacl/linalg/inner_prod.hpp"
00062 #include "viennacl/linalg/ilu.hpp"
00063 #include "viennacl/ocl/backend.hpp"
00064 #include "viennacl/linalg/opencl/kernels/spai.hpp"
00065 
00066 
00067 
00068 #define VIENNACL_SPAI_K_b 20
00069 
00070 namespace viennacl
00071 {
00072   namespace linalg
00073   {
00074     namespace detail
00075     {
00076       namespace spai
00077       {
00078 
00079         //debug helpers for printing
00080         template<typename SparseVectorType>
00081         void print_sparse_vector(const SparseVectorType& v){
00082             for(typename SparseVectorType::const_iterator vec_it = v.begin(); vec_it!= v.end(); ++vec_it){
00083                 std::cout<<"[ "<<vec_it->first<<" ]:"<<vec_it->second<<std::endl;
00084             }
00085         }
00086         template<typename DenseMatrixType>
00087         void print_matrix(DenseMatrixType & m){
00088             for(vcl_size_t i = 0; i < m.size2(); ++i){
00089                 for(vcl_size_t j = 0; j < m.size1(); ++j){
00090                     std::cout<<m(j, i)<<" ";
00091                 }
00092                 std::cout<<std::endl;
00093             }
00094         }
00095 
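00096         /** @brief Add a scaled sparse vector to a sparse vector: res_v += b*v
00097         * @param v input sparse vector
00098         * @param b scaling factor
00099         * @param res_v sparse vector accumulating the result
00100         */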
00101         template<typename SparseVectorType, typename ScalarType>
00102         void add_sparse_vectors(const SparseVectorType& v, const ScalarType b,  SparseVectorType& res_v){
00103             for(typename SparseVectorType::const_iterator v_it = v.begin(); v_it != v.end(); ++v_it){
00104                 res_v[v_it->first] += b*v_it->second;
00105             }
00106         }
00107         //sparse-matrix - vector product
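00108         /** @brief Compute the residual res = A*v - e_ind for one column of the approximate inverse
00109         * @param A_v_c columns of A stored as sparse vectors
00110         * @param v current sparse column of the preconditioner
00111         * @param ind index of the unit-vector entry that is subtracted
00112         * @param res resulting residual as a sparse vector
00113         */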
00114         template<typename SparseVectorType, typename ScalarType>
00115         void compute_spai_residual(const std::vector<SparseVectorType>& A_v_c, const SparseVectorType& v,
00116                                    const unsigned int ind, SparseVectorType& res){
00117             for(typename SparseVectorType::const_iterator v_it = v.begin(); v_it != v.end(); ++v_it){
00118                 add_sparse_vectors(A_v_c[v_it->first], v_it->second, res);
00119             }
00120             res[ind] -= static_cast<ScalarType>(1);
00121         }
00122 
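00123         /** @brief Build the column index set J and the row index set I for a column of the preconditioner
00124         * @param A_v_c columns of A stored as sparse vectors
00125         * @param v current sparse column of the preconditioner
00126         * @param J resulting column index set
00127         * @param I resulting row index set
00128         */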
00129         template<typename SparseVectorType>
00130         void build_index_set(const std::vector<SparseVectorType>& A_v_c, const SparseVectorType& v, std::vector<unsigned int>& J,
00131                              std::vector<unsigned int>& I){
00132             buildColumnIndexSet(v, J);
00133             projectRows(A_v_c, J, I);
00134         }
00135 
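00136         /** @brief Extract the dense submatrix A(I, J) from a sparse matrix
00137         * @param A_in input sparse matrix
00138         * @param J column index set
00139         * @param I row index set
00140         * @param A_out resulting dense submatrix
00141         */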
00142         template<typename SparseMatrixType, typename DenseMatrixType>
00143         void initProjectSubMatrix(const SparseMatrixType& A_in, const std::vector<unsigned int>& J, std::vector<unsigned int>& I,
00144                                   DenseMatrixType& A_out)
00145         {
00146           A_out.resize(I.size(), J.size(), false);
00147           for(vcl_size_t j = 0; j < J.size(); ++j)
00148           {
00149             for(vcl_size_t i = 0; i < I.size(); ++i)
00150               A_out(i,j) = A_in(I[i],J[j]);
00151           }
00152         }
00153 
00154 
00155         /************************************************** CPU BLOCK SET UP ***************************************/
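00156         /** @brief CPU block set-up: build the index sets, extract the submatrices and QR-factorize them
00157         * @param A initial sparse matrix
00158         * @param A_v_c columns of A stored as sparse vectors
00159         * @param M_v columns of the preconditioner stored as sparse vectors
00160         * @param g_I row index sets for all blocks
00161         * @param g_J column index sets for all blocks
00162         * @param g_A_I_J dense submatrices A(I, J), overwritten by their QR factorization
00163         * @param g_b_v betas of the Householder reflections for each block
00164         */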
00165         template<typename SparseMatrixType, typename DenseMatrixType, typename SparseVectorType, typename VectorType>
00166         void block_set_up(const SparseMatrixType& A,
00167                           const std::vector<SparseVectorType>& A_v_c,
00168                           const std::vector<SparseVectorType>& M_v,
00169                           std::vector<std::vector<unsigned int> >& g_I,
00170                           std::vector<std::vector<unsigned int> >& g_J,
00171                           std::vector<DenseMatrixType>& g_A_I_J,
00172                           std::vector<VectorType>& g_b_v){
00173 #ifdef VIENNACL_WITH_OPENMP
00174             #pragma omp parallel for
00175 #endif
00176             for (long i = 0; i < static_cast<long>(M_v.size()); ++i){
00177                 build_index_set(A_v_c, M_v[i], g_J[i], g_I[i]);
00178                 initProjectSubMatrix(A, g_J[i], g_I[i], g_A_I_J[i]);
00179                 //print_matrix(g_A_I_J[i]);
00180                 single_qr(g_A_I_J[i], g_b_v[i]);
00181                 //print_matrix(g_A_I_J[i]);
00182             }
00183         }
00184 
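00185         /** @brief Build the index sets I and J for all columns of the preconditioner
00186         * @param A_v_c columns of A stored as sparse vectors
00187         * @param M_v columns of the preconditioner stored as sparse vectors
00188         * @param g_J column index sets for all blocks
00189         * @param g_I row index sets for all blocks
00190         */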
00191         template<typename SparseVectorType>
00192         void index_set_up(const std::vector<SparseVectorType>& A_v_c,
00193                           const std::vector<SparseVectorType>& M_v,
00194                           std::vector<std::vector<unsigned int> >& g_J,
00195                           std::vector<std::vector<unsigned int> >& g_I)
00196         {
00197 #ifdef VIENNACL_WITH_OPENMP
00198             #pragma omp parallel for
00199 #endif
00200             for (long i = 0; i < static_cast<long>(M_v.size()); ++i){
00201                 build_index_set(A_v_c, M_v[i], g_J[i], g_I[i]);
00202             }
00203         }
00204 
00205         /************************************************** GPU BLOCK SET UP ***************************************/
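00206         /** @brief GPU block set-up: build the index sets on the CPU, then assemble and QR-factorize the blocks on the GPU
00207         * @param A initial sparse matrix (device)
00208         * @param A_v_c columns of A stored as sparse vectors
00209         * @param M_v columns of the preconditioner stored as sparse vectors
00210         * @param g_is_update flags indicating which blocks still have to be processed
00211         * @param g_I row index sets for all blocks
00212         * @param g_J column index sets for all blocks
00213         * @param g_A_I_J block matrix holding all submatrices A(I, J) contiguously
00214         * @param g_bv block vector of betas from the Householder reflections
00215         */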
00216         template<typename ScalarType, unsigned int MAT_ALIGNMENT, typename SparseVectorType>
00217         void block_set_up(const viennacl::compressed_matrix<ScalarType, MAT_ALIGNMENT>& A,
00218                           const std::vector<SparseVectorType>& A_v_c,
00219                           const std::vector<SparseVectorType>& M_v,
00220                           std::vector<cl_uint> g_is_update,
00221                           std::vector<std::vector<unsigned int> >& g_I,
00222                           std::vector<std::vector<unsigned int> >& g_J,
00223                           block_matrix & g_A_I_J,
00224                           block_vector & g_bv)
00225         {
00226           viennacl::context ctx = viennacl::traits::context(A);
00227           bool is_empty_block;
00228           //build index set
00229           index_set_up(A_v_c, M_v, g_J, g_I);
00230           block_assembly(A, g_J, g_I, g_A_I_J, g_is_update, is_empty_block);
00231           block_qr<ScalarType>(g_I, g_J, g_A_I_J, g_bv, g_is_update, ctx);
00232 
00233         }
00234 
00235 
00236         /***************************************************************************************************/
00237         /******************************** SOLVING LS PROBLEMS ON GPU ***************************************/
00238         /***************************************************************************************************/
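00239         /** @brief Scatter a contiguous chunk of the solution vector back into a sparse vector
00240         * @param m_in contiguous solution values of all blocks
00241         * @param start_m_ind start index of the current block inside m_in
00242         * @param J column index set of the current block
00243         * @param m resulting sparse vector (column of the preconditioner)
00244         */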
00245         template<typename ScalarType, typename SparseVectorType>
00246         void custom_fan_out(const std::vector<ScalarType> & m_in,
00247                             unsigned int start_m_ind,
00248                             const std::vector<unsigned int> & J,
00249                             SparseVectorType & m)
00250         {
00251             unsigned int  cnt = 0;
00252             for (vcl_size_t i = 0; i < J.size(); ++i) {
00253                 m[J[i]] = m_in[start_m_ind + cnt++];
00254             }
00255         }
00256 
00257 
00258 
00259         //GPU based least squares problems
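00260         /** @brief Solve the least squares problems for all blocks on the GPU and update the residuals
00261         * @param A_v_c columns of A stored as sparse vectors
00262         * @param M_v columns of the preconditioner stored as sparse vectors
00263         * @param g_I row index sets for all blocks
00264         * @param g_J column index sets for all blocks
00265         * @param g_A_I_J_vcl block matrix with the QR-factorized submatrices
00266         * @param g_bv_vcl block vector of betas from the Householder reflections
00267         * @param g_res residuals for all columns
00268         * @param g_is_update flags indicating which columns still have to be processed
00269         * @param tag SPAI configuration tag
00270         * @param ctx ViennaCL context the computation is carried out on
00271         */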
00272         template<typename SparseVectorType, typename ScalarType>
00273         void least_square_solve(std::vector<SparseVectorType> & A_v_c,
00274                                 std::vector<SparseVectorType> & M_v,
00275                                 std::vector<std::vector<unsigned int> >& g_I,
00276                                 std::vector<std::vector<unsigned int> > & g_J,
00277                                 block_matrix & g_A_I_J_vcl,
00278                                 block_vector & g_bv_vcl,
00279                                 std::vector<SparseVectorType> & g_res,
00280                                 std::vector<cl_uint> & g_is_update,
00281                                 const spai_tag & tag,
00282                                 viennacl::context ctx)
00283         {
00284           viennacl::ocl::context & opencl_ctx = const_cast<viennacl::ocl::context &>(ctx.opencl_context());
00285           unsigned int y_sz, m_sz;
00286           std::vector<cl_uint> y_inds(M_v.size() + 1, static_cast<cl_uint>(0));
00287           std::vector<cl_uint> m_inds(M_v.size() + 1, static_cast<cl_uint>(0));
00288           get_size(g_I, y_sz);
00289           init_start_inds(g_I, y_inds);
00290           init_start_inds(g_J, m_inds);
00291           //create y_v
00292           std::vector<ScalarType> y_v(y_sz, static_cast<ScalarType>(0));
00293           for(vcl_size_t i = 0; i < M_v.size(); ++i)
00294           {
00295             for(vcl_size_t j = 0; j < g_I[i].size(); ++j)
00296             {
00297               if(g_I[i][j] == i)
00298                 y_v[y_inds[i] + j] = static_cast<ScalarType>(1.0);
00299             }
00300           }
00301           //compute m_v
00302           get_size(g_J, m_sz);
00303           std::vector<ScalarType> m_v(m_sz, static_cast<ScalarType>(0));
00304 
00305           block_vector y_v_vcl;
00306           block_vector m_v_vcl;
00307           //preparing memory for the least squares problem on the GPU
00308           y_v_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
00309                                                       static_cast<unsigned int>(sizeof(ScalarType)*y_v.size()),
00310                                                       &(y_v[0]));
00311           m_v_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
00312                                                       static_cast<unsigned int>(sizeof(ScalarType)*m_v.size()),
00313                                                       &(m_v[0]));
00314           y_v_vcl.handle1() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
00315                                                        static_cast<unsigned int>(sizeof(cl_uint)*(g_I.size() + 1)),
00316                                                        &(y_inds[0]));
00317           viennacl::ocl::handle<cl_mem> g_is_update_vcl = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
00318                                                                                    static_cast<unsigned int>(sizeof(cl_uint)*(g_is_update.size())),
00319                                                                                    &(g_is_update[0]));
00320           viennacl::linalg::opencl::kernels::spai<ScalarType>::init(opencl_ctx);
00321           viennacl::ocl::kernel& ls_kernel = opencl_ctx.get_kernel(viennacl::linalg::opencl::kernels::spai<ScalarType>::program_name(), "block_least_squares");
00322           ls_kernel.local_work_size(0, 1);
00323           ls_kernel.global_work_size(0, 256);
00324           viennacl::ocl::enqueue(ls_kernel(g_A_I_J_vcl.handle(), g_A_I_J_vcl.handle2(), g_bv_vcl.handle(), g_bv_vcl.handle1(), m_v_vcl.handle(),
00325                                            y_v_vcl.handle(), y_v_vcl.handle1(),
00326                                            g_A_I_J_vcl.handle1(), g_is_update_vcl,
00327                                            //viennacl::ocl::local_mem(static_cast<unsigned int>(sizeof(ScalarType)*(local_r_n*local_c_n))),
00328                                            static_cast<unsigned int>(M_v.size())));
00329           //copy vector m_v back from GPU to CPU
00330           cl_int vcl_err = clEnqueueReadBuffer(opencl_ctx.get_queue().handle().get(),
00331                                                m_v_vcl.handle().get(), CL_TRUE, 0,
00332                                                sizeof(ScalarType)*(m_v.size()),
00333                                                &(m_v[0]), 0, NULL, NULL);
00334           VIENNACL_ERR_CHECK(vcl_err);
00335           //fan out vector in parallel
00336           //#pragma omp parallel for
00337           for(long i = 0; i < static_cast<long>(M_v.size()); ++i)
00338           {
00339             if(g_is_update[i])
00340             {
00341               //fan the solution out onto the sparse vector
00342               custom_fan_out(m_v, m_inds[i], g_J[i], M_v[i]);
00343               g_res[i].clear();
00344               compute_spai_residual<SparseVectorType, ScalarType>(A_v_c,  M_v[i], static_cast<unsigned int>(i), g_res[i]);
00345               ScalarType res_norm = 0;
00346               //compute norm of res - just to make sure that this implementation works correctly
00347               sparse_norm_2(g_res[i], res_norm);
00348               //std::cout<<"Residual norm of column #: "<<i<<std::endl;
00349               //std::cout<<res_norm<<std::endl;
00350               //std::cout<<"************************"<<std::endl;
00351               g_is_update[i] = ((res_norm > tag.getResidualNormThreshold()) && !tag.getIsStatic()) ? 1 : 0;
00352             }
00353           }
00354         }
00355 
00356         //CPU based least squares problems
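00357         /** @brief Solve the least squares problems for all blocks on the CPU and update the residuals
00358         * @param A_v_c columns of A stored as sparse vectors
00359         * @param g_R QR-factorized submatrices (R in the upper triangle, Householder vectors below)
00360         * @param g_b_v betas of the Householder reflections for each block
00361         * @param g_I row index sets for all blocks
00362         * @param g_J column index sets for all blocks
00363         * @param g_res residuals for all columns
00364         * @param g_is_update flags indicating which columns still have to be processed
00365         * @param M_v columns of the preconditioner stored as sparse vectors
00366         * @param tag SPAI configuration tag
00367         */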
00368         template<typename SparseVectorType, typename DenseMatrixType, typename VectorType>
00369         void least_square_solve(const std::vector<SparseVectorType>& A_v_c,
00370                                 std::vector<DenseMatrixType>& g_R,
00371                                 std::vector<VectorType>& g_b_v,
00372                                 std::vector<std::vector<unsigned int> >& g_I,
00373                                 std::vector<std::vector<unsigned int> >& g_J,
00374                                 std::vector<SparseVectorType>& g_res,
00375                                 std::vector<bool>& g_is_update,
00376                                 std::vector<SparseVectorType>& M_v,
00377                                 const spai_tag& tag){
00378             typedef typename DenseMatrixType::value_type ScalarType;
00379             //VectorType m_new, y;
00380 #ifdef VIENNACL_WITH_OPENMP
00381             #pragma omp parallel for
00382 #endif
00383             for (long i = 0; i < static_cast<long>(M_v.size()); ++i){
00384                 if(g_is_update[i]){
00385                     VectorType y = boost::numeric::ublas::zero_vector<ScalarType>(g_I[i].size());
00386                     //std::cout<<y<<std::endl;
00387                     projectI<VectorType, ScalarType>(g_I[i], y, static_cast<unsigned int>(tag.getBegInd() + i));
00388                     apply_q_trans_vec(g_R[i], g_b_v[i], y);
00389                     VectorType m_new =  boost::numeric::ublas::zero_vector<ScalarType>(g_R[i].size2());
00390                     backwardSolve(g_R[i], y, m_new);
00391                     fanOutVector(m_new, g_J[i], M_v[i]);
00392                     g_res[i].clear();
00393                     compute_spai_residual<SparseVectorType, ScalarType>(A_v_c,  M_v[i], static_cast<unsigned int>(tag.getBegInd() + i), g_res[i]);
00394                     ScalarType res_norm = 0;
00395                     sparse_norm_2(g_res[i], res_norm);
00396 //                    std::cout<<"Residual norm of column #: "<<i<<std::endl;
00397 //                    std::cout<<res_norm<<std::endl;
00398 //                    std::cout<<"************************"<<std::endl;
00399                     g_is_update[i] = (res_norm > tag.getResidualNormThreshold()) && !tag.getIsStatic();
00400                 }
00401             }
00402         }
00403 
00404         //************************************ UPDATE CHECK ***************************************************//
00405         template<typename VectorType>
00406         bool is_all_update(VectorType& parallel_is_update){
00407 
00408             for(unsigned int i = 0; i < parallel_is_update.size(); ++i){
00409                 if(parallel_is_update[i])
00410                     return true;
00411             }
00412             return false;
00413         }
00414 
00415         //********************************** MATRIX VECTORIZATION ***********************************************//
00416         //Matrix vectorization, column based approach
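00417         /** @brief Represent a sparse matrix as a vector of its columns, each stored as a sparse vector
00418         * @param M_in input sparse matrix
00419         * @param M_v resulting vector of sparse columns
00420         */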
00421         template<typename SparseMatrixType, typename SparseVectorType>
00422         void vectorize_column_matrix(const SparseMatrixType& M_in, std::vector<SparseVectorType>& M_v){
00423             for(typename SparseMatrixType::const_iterator1 row_it = M_in.begin1(); row_it!= M_in.end1(); ++row_it){
00424                 //
00425                 for(typename SparseMatrixType::const_iterator2 col_it = row_it.begin(); col_it != row_it.end(); ++col_it){
00426                     M_v[static_cast<unsigned int>(col_it.index2())][static_cast<unsigned int>(col_it.index1())] = *col_it;
00427                 }
00428                 //std::cout<<std::endl;
00429             }
00430         }
00431 
00432         //Matrix vectorization, row based approach
00433         template<typename SparseMatrixType, typename SparseVectorType>
00434         void vectorize_row_matrix(const SparseMatrixType& M_in, std::vector<SparseVectorType>& M_v){
00435             for(typename SparseMatrixType::const_iterator1 row_it = M_in.begin1(); row_it!= M_in.end1(); ++row_it){
00436                 for(typename SparseMatrixType::const_iterator2 col_it = row_it.begin(); col_it != row_it.end(); ++col_it){
00437                     M_v[static_cast<unsigned int>(col_it.index1())][static_cast<unsigned int>(col_it.index2())] = *col_it;
00438                 }
00439             }
00440         }
00441 
00442         //************************************* BLOCK ASSEMBLY CODE *********************************************//
00443 
00444 
00445         template <typename SizeType>
00446         void write_set_to_array(const std::vector<std::vector<SizeType> >& ind_set, std::vector<cl_uint>& a){
00447             vcl_size_t cnt = 0;
00448             //unsigned int tmp;
00449             for(vcl_size_t i = 0; i < ind_set.size(); ++i){
00450                 for(vcl_size_t j = 0; j < ind_set[i].size(); ++j){
00451                     a[cnt++] = static_cast<cl_uint>(ind_set[i][j]);
00452                 }
00453             }
00454         }
00455 
00456 
00457 
00458         //assembling blocks on GPU
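00459         /** @brief Assemble all submatrices A(I, J) contiguously on the GPU
00460         * @param A initial sparse matrix (device)
00461         * @param g_J column index sets for all blocks
00462         * @param g_I row index sets for all blocks
00463         * @param g_A_I_J_vcl block matrix receiving the assembled submatrices
00464         * @param g_is_update flags indicating which blocks still have to be processed
00465         * @param is_empty_block set to true if there is no block to assemble
00466         */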
00467         template<typename ScalarType, unsigned int MAT_ALIGNMENT>
00468         void block_assembly(const viennacl::compressed_matrix<ScalarType, MAT_ALIGNMENT>& A, const std::vector<std::vector<unsigned int> >& g_J,
00469                             const std::vector<std::vector<unsigned int> >& g_I,
00470                             block_matrix& g_A_I_J_vcl,
00471                             std::vector<cl_uint>& g_is_update,
00472                             bool& is_empty_block)
00473         {
00474             //computing start indices for index sets and start indices for block matrices
00475             unsigned int sz_I, sz_J, sz_blocks;
00476             std::vector<cl_uint> matrix_dims(g_I.size()*2, static_cast<cl_uint>(0));
00477             std::vector<cl_uint> i_ind(g_I.size() + 1, static_cast<cl_uint>(0));
00478             std::vector<cl_uint> j_ind(g_I.size() + 1, static_cast<cl_uint>(0));
00479             std::vector<cl_uint> blocks_ind(g_I.size() + 1, static_cast<cl_uint>(0));
00480             //
00481             init_start_inds(g_J, j_ind);
00482             init_start_inds(g_I, i_ind);
00483             //
00484             get_size(g_J, sz_J);
00485             get_size(g_I, sz_I);
00486             std::vector<cl_uint> I_set(sz_I, static_cast<cl_uint>(0));
00487             //
00488             std::vector<cl_uint> J_set(sz_J, static_cast<cl_uint>(0));
00489             // computing size for blocks
00490             // writing set to arrays
00491             write_set_to_array(g_I, I_set);
00492             write_set_to_array(g_J, J_set);
00493             // if there is at least one block to assemble
00494             if (I_set.size() > 0 && J_set.size() > 0)
00495             {
00496               viennacl::context ctx = viennacl::traits::context(A);
00497               viennacl::ocl::context & opencl_ctx = const_cast<viennacl::ocl::context &>(ctx.opencl_context());
00498               compute_blocks_size(g_I, g_J, sz_blocks, blocks_ind, matrix_dims);
00499               std::vector<ScalarType> con_A_I_J(sz_blocks, static_cast<ScalarType>(0));
00500 
00501               block_vector set_I_vcl, set_J_vcl;
00502               //init memory on GPU
00503               //contiguous g_A_I_J
00504               g_A_I_J_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
00505                                                               static_cast<unsigned int>(sizeof(ScalarType)*(sz_blocks)),
00506                                                               &(con_A_I_J[0]));
00507               g_A_I_J_vcl.handle().context(opencl_ctx);
00508 
00509               //matrix_dimensions
00510               g_A_I_J_vcl.handle1() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
00511                                                                static_cast<unsigned int>(sizeof(cl_uint)*2*static_cast<cl_uint>(g_I.size())),
00512                                                                &(matrix_dims[0]));
00513               g_A_I_J_vcl.handle1().context(opencl_ctx);
00514 
00515               //start_block inds
00516               g_A_I_J_vcl.handle2() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
00517                                                                static_cast<unsigned int>(sizeof(cl_uint)*(g_I.size() + 1)),
00518                                                                &(blocks_ind[0]));
00519               g_A_I_J_vcl.handle2().context(opencl_ctx);
00520 
00521               //set_I
00522               set_I_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
00523                                                             static_cast<unsigned int>(sizeof(cl_uint)*sz_I),
00524                                                             &(I_set[0]));
00525               set_I_vcl.handle().context(opencl_ctx);
00526 
00527               //set_J
00528               set_J_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
00529                                                             static_cast<unsigned int>(sizeof(cl_uint)*sz_J),
00530                                                             &(J_set[0]));
00531               set_J_vcl.handle().context(opencl_ctx);
00532 
00533               //i_ind
00534               set_I_vcl.handle1() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
00535                                                              static_cast<unsigned int>(sizeof(cl_uint)*(g_I.size() + 1)),
00536                                                              &(i_ind[0]));
00537               set_I_vcl.handle1().context(opencl_ctx);
00538 
00539               //j_ind
00540               set_J_vcl.handle1() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
00541                                                              static_cast<unsigned int>(sizeof(cl_uint)*(g_I.size() + 1)),
00542                                                              &(j_ind[0]));
00543               set_J_vcl.handle1().context(opencl_ctx);
00544 
00545               viennacl::ocl::handle<cl_mem> g_is_update_vcl = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
00546                                                                                        static_cast<unsigned int>(sizeof(cl_uint)*g_is_update.size()),
00547                                                                                        &(g_is_update[0]));
00548 
00549               viennacl::linalg::opencl::kernels::spai<ScalarType>::init(opencl_ctx);
00550               viennacl::ocl::kernel& assembly_kernel = opencl_ctx.get_kernel(viennacl::linalg::opencl::kernels::spai<ScalarType>::program_name(), "assemble_blocks");
00551               assembly_kernel.local_work_size(0, 1);
00552               assembly_kernel.global_work_size(0, 256);
00553               viennacl::ocl::enqueue(assembly_kernel(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle().opencl_handle(),
00554                                                      set_I_vcl.handle(), set_J_vcl.handle(), set_I_vcl.handle1(),
00555                                                      set_J_vcl.handle1(),
00556                                                      g_A_I_J_vcl.handle2(), g_A_I_J_vcl.handle1(), g_A_I_J_vcl.handle(),
00557                                                      g_is_update_vcl,
00558                                                      static_cast<unsigned int>(g_I.size())));
00559               is_empty_block = false;
00560             }
00561             else
00562               is_empty_block = true;
00563         }
00564 
00565         /************************************************************************************************************************/
00566 
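00567         /** @brief Insert the computed sparse columns into the preconditioner matrix
00568         * @param M_v columns of the preconditioner stored as sparse vectors
00569         * @param M resulting sparse matrix
00570         * @param is_right if true the columns are inserted directly, otherwise the transposed pattern is filled
00571         */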
00572         template<typename SparseMatrixType, typename SparseVectorType>
00573         void insert_sparse_columns(const std::vector<SparseVectorType>& M_v,
00574                                    SparseMatrixType& M,
00575                                    bool is_right){
00576             if (is_right)
00577             {
00578               for(unsigned int i = 0; i < M_v.size(); ++i){
00579                   for(typename SparseVectorType::const_iterator vec_it = M_v[i].begin(); vec_it!=M_v[i].end(); ++vec_it){
00580                       M(vec_it->first, i) = vec_it->second;
00581                   }
00582               }
00583             }
00584             else  //transposed fill of M
00585             {
00586               for(unsigned int i = 0; i < M_v.size(); ++i){
00587                   for(typename SparseVectorType::const_iterator vec_it = M_v[i].begin(); vec_it!=M_v[i].end(); ++vec_it){
00588                       M(i, vec_it->first) = vec_it->second;
00589                   }
00590               }
00591             }
00592         }
00593 
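00594         /** @brief Transpose a sparse matrix
00595         * @param A_in input sparse matrix
00596         * @param A resulting transposed matrix
00597         */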
00598         template<typename MatrixType>
00599         void sparse_transpose(const MatrixType& A_in, MatrixType& A){
00600             typedef typename MatrixType::value_type ScalarType;
00601             std::vector<std::map<vcl_size_t, ScalarType> >   temp_A(A_in.size2());
00602             A.resize(A_in.size2(), A_in.size1(), false);
00603 
00604             for (typename MatrixType::const_iterator1 row_it = A_in.begin1();
00605                  row_it != A_in.end1();
00606                  ++row_it)
00607             {
00608                 for (typename MatrixType::const_iterator2 col_it = row_it.begin();
00609                      col_it != row_it.end();
00610                      ++col_it)
00611                 {
00612                     temp_A[col_it.index2()][col_it.index1()] = *col_it;
00613                 }
00614             }
00615 
00616             for (vcl_size_t i=0; i<temp_A.size(); ++i)
00617             {
00618                 for (typename std::map<vcl_size_t, ScalarType>::const_iterator it = temp_A[i].begin();
00619                      it != temp_A[i].end();
00620                      ++it)
00621                     A(i, it->first) = it->second;
00622             }
00623         }
00624 
00625 
00626 
00627 
00628 //        template<typename SparseVectorType>
00629 //        void custom_copy(std::vector<SparseVectorType> & M_v, std::vector<SparseVectorType> & l_M_v, const unsigned int beg_ind){
00630 //            for(int i = 0; i < l_M_v.size(); ++i){
00631 //                l_M_v[i] = M_v[i + beg_ind];
00632 //            }
00633 //        }
00634 
00635         //CPU version
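00636         /** @brief Compute the SPAI preconditioner on the CPU
00637         * @param A initial sparse matrix
00638         * @param M output sparse approximate inverse; its initial sparsity pattern serves as the starting guess
00639         * @param tag SPAI configuration tag
00640         */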
00641         template <typename MatrixType>
00642         void computeSPAI(const MatrixType & A, MatrixType & M, spai_tag & tag){
00643             typedef typename MatrixType::value_type ScalarType;
00644             typedef typename boost::numeric::ublas::vector<ScalarType> VectorType;
00645             typedef typename viennacl::linalg::detail::spai::sparse_vector<ScalarType> SparseVectorType;
00646             typedef typename boost::numeric::ublas::matrix<ScalarType> DenseMatrixType;
00647             //sparse matrix transpose...
00648             unsigned int cur_iter = 0;
00649             tag.setBegInd(0); tag.setEndInd(VIENNACL_SPAI_K_b);
00650             bool go_on = true;
00651             std::vector<SparseVectorType> A_v_c(M.size2());
00652             std::vector<SparseVectorType> M_v(M.size2());
00653             vectorize_column_matrix(A, A_v_c);
00654             vectorize_column_matrix(M, M_v);
00655 
00656 
00657             while(go_on){
00658                 go_on = (tag.getEndInd() < static_cast<long>(M.size2()));
00659                 cur_iter = 0;
00660                 unsigned int l_sz = tag.getEndInd() - tag.getBegInd();
00661                 //std::vector<bool> g_is_update(M.size2(), true);
00662                 std::vector<bool> g_is_update(l_sz, true);
00663                 //init is update
00664                 //init_parallel_is_update(g_is_update);
00665                 //std::vector<SparseVectorType> A_v_c(K);
00666                 //std::vector<SparseVectorType> M_v(K);
00667                 //vectorization of matrices
00668                 //print_matrix(M_v);
00669                 std::vector<SparseVectorType> l_M_v(l_sz);
00670                 //custom_copy(M_v, l_M_v, beg_ind);
00671                 std::copy(M_v.begin() + tag.getBegInd(), M_v.begin() + tag.getEndInd(), l_M_v.begin());
00672                 //print_matrix(l_M_v);
00673                 //std::vector<SparseVectorType> l_A_v_c(K);
00674                 //custom_copy(A_v_c, l_A_v_c, beg_ind);
00675                 //std::copy(A_v_c.begin() + beg_ind, A_v_c.begin() + end_ind, l_A_v_c.begin());
00676                 //print_matrix(l_A_v_c);
00677                 //vectorize_row_matrix(A, A_v_r);
00678                 //working blocks
00679                 //std::vector<DenseMatrixType> g_A_I_J(M.size2())
00680                 std::vector<DenseMatrixType> g_A_I_J(l_sz);
00681                 //std::vector<VectorType> g_b_v(M.size2());
00682                 std::vector<VectorType> g_b_v(l_sz);
00683                 //std::vector<SparseVectorType> g_res(M.size2());
00684                 std::vector<SparseVectorType> g_res(l_sz);
00685                 //std::vector<std::vector<unsigned int> > g_I(M.size2());
00686                 std::vector<std::vector<unsigned int> > g_I(l_sz);
00687                 //std::vector<std::vector<unsigned int> > g_J(M.size2());
00688                 std::vector<std::vector<unsigned int> > g_J(l_sz);
00689                 while((cur_iter < tag.getIterationLimit())&&is_all_update(g_is_update)){
00690                     // SET UP THE BLOCKS..
00691                     // PHASE ONE
00692                     if(cur_iter == 0) block_set_up(A, A_v_c, l_M_v,  g_I, g_J, g_A_I_J, g_b_v);
00693                     else block_update(A, A_v_c, g_res, g_is_update, g_I, g_J, g_b_v, g_A_I_J, tag);
00694 
00695                     //PHASE TWO, LEAST SQUARE SOLUTION
00696                     least_square_solve(A_v_c, g_A_I_J, g_b_v, g_I, g_J, g_res, g_is_update, l_M_v, tag);
00697 
00698                     if(tag.getIsStatic()) break;
00699                     cur_iter++;
00700 
00701 
00702                 }
00703                 std::copy(l_M_v.begin(), l_M_v.end(), M_v.begin() + tag.getBegInd());
00704                 tag.setBegInd(tag.getEndInd());//beg_ind = end_ind;
00705                 tag.setEndInd(std::min(static_cast<long>(tag.getBegInd() + VIENNACL_SPAI_K_b), static_cast<long>(M.size2())));
00706                 //std::copy(l_M_v.begin(), l_M_v.end(), M_v.begin() + tag.getBegInd());
00707 
00708             }
00709             M.resize(M.size1(), M.size2(), false);
00710             insert_sparse_columns(M_v, M, tag.getIsRight());
00711         }
00712 
00713 
00714         //GPU-based version
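00715         /** @brief Compute the SPAI preconditioner with OpenCL: index sets on the CPU, block assembly and least squares solves on the GPU
00716         * @param A initial sparse matrix (device)
00717         * @param cpu_A host copy of A
00718         * @param cpu_M output sparse approximate inverse on the host
00719         * @param M output sparse approximate inverse on the device
00720         * @param tag SPAI configuration tag
00721         */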
00722         template <typename ScalarType, unsigned int MAT_ALIGNMENT>
00723         void computeSPAI(const viennacl::compressed_matrix<ScalarType, MAT_ALIGNMENT>& A, //input
00724                          const boost::numeric::ublas::compressed_matrix<ScalarType>& cpu_A,
00725                          boost::numeric::ublas::compressed_matrix<ScalarType>& cpu_M, //output
00726                          viennacl::compressed_matrix<ScalarType, MAT_ALIGNMENT>& M,
00727                          const spai_tag& tag){
00728             typedef typename viennacl::linalg::detail::spai::sparse_vector<ScalarType> SparseVectorType;
00729             //typedef typename viennacl::compressed_matrix<ScalarType> GPUSparseMatrixType;
00730             //sparse matrix transpose...
00731             unsigned int cur_iter = 0;
00732             std::vector<cl_uint> g_is_update(cpu_M.size2(), static_cast<cl_uint>(1));
00733             //init is update
00734             //init_parallel_is_update(g_is_update);
00735             std::vector<SparseVectorType> A_v_c(cpu_M.size2());
00736             std::vector<SparseVectorType> M_v(cpu_M.size2());
00737             vectorize_column_matrix(cpu_A, A_v_c);
00738             vectorize_column_matrix(cpu_M, M_v);
00739             std::vector<SparseVectorType> g_res(cpu_M.size2());
00740             std::vector<std::vector<unsigned int> > g_I(cpu_M.size2());
00741             std::vector<std::vector<unsigned int> > g_J(cpu_M.size2());
00742 
00743             //OpenCL variables
00744             block_matrix g_A_I_J_vcl;
00745             block_vector g_bv_vcl;
00746             while((cur_iter < tag.getIterationLimit())&&is_all_update(g_is_update)){
00747                 // SET UP THE BLOCKS..
00748                 // PHASE ONE..
00749                 //timer.start();
00750                 //index set up on CPU
00751                 if(cur_iter == 0)
00752                   block_set_up(A, A_v_c, M_v, g_is_update, g_I, g_J, g_A_I_J_vcl, g_bv_vcl);
00753                 else
00754                   block_update(A, A_v_c, g_is_update, g_res, g_J, g_I, g_A_I_J_vcl, g_bv_vcl, tag);
00755                 //std::cout<<"Phase 2 timing: "<<timer.get()<<std::endl;
00756                 //PERFORM LEAST SQUARE problems solution
00757                 //PHASE TWO
00758                 //timer.start();
00759                 least_square_solve<SparseVectorType, ScalarType>(A_v_c, M_v, g_I, g_J, g_A_I_J_vcl, g_bv_vcl, g_res, g_is_update, tag, viennacl::traits::context(A));
00760                 //std::cout<<"Phase 3 timing: "<<timer.get()<<std::endl;
00761                 if(tag.getIsStatic()) break;
00762                 cur_iter++;
00763             }
00764             cpu_M.resize(cpu_M.size1(), cpu_M.size2(), false);
00765             insert_sparse_columns(M_v, cpu_M, tag.getIsRight());
00766             //copy back to GPU
00767             M.resize(static_cast<unsigned int>(cpu_M.size1()), static_cast<unsigned int>(cpu_M.size2()));
00768             viennacl::copy(cpu_M, M);
00769         }
00770 
00771       }
00772     }
00773   }
00774 }
00775 #endif