ViennaCL - The Vienna Computing Library  1.5.1
viennacl/generator/matrix_product.hpp
Go to the documentation of this file.
00001 #ifndef VIENNACL_GENERATOR_GENERATE_MATRIX_PRODUCT_HPP
00002 #define VIENNACL_GENERATOR_GENERATE_MATRIX_PRODUCT_HPP
00003 
00004 /* =========================================================================
00005    Copyright (c) 2010-2014, Institute for Microelectronics,
00006                             Institute for Analysis and Scientific Computing,
00007                             TU Wien.
00008    Portions of this software are copyright by UChicago Argonne, LLC.
00009 
00010                             -----------------
00011                   ViennaCL - The Vienna Computing Library
00012                             -----------------
00013 
00014    Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
00015 
00016    (A list of authors and contributors can be found in the PDF manual)
00017 
00018    License:         MIT (X11), see file LICENSE in the base directory
00019 ============================================================================= */
00020 
00021 
00027 #include <vector>
00028 
00029 #include "viennacl/scheduler/forwards.h"
00030 
00031 #include "viennacl/generator/profile_base.hpp"
00032 #include "viennacl/generator/mapped_objects.hpp"
00033 #include "viennacl/generator/utils.hpp"
00034 
00035 #include "viennacl/forwards.h"
00036 
00037 #include "viennacl/tools/tools.hpp"
00038 
00039 namespace viennacl{
00040 
00041   namespace generator{
00042 
00044     class matrix_product : public profile_base{
00045 
00046         enum access_flow{
00047           REGULAR,
00048           STRIDED
00049         };
00050 
00051         bool is_slow_impl(viennacl::ocl::device const &) const { return false; }
00052 
00053         vcl_size_t lmem_used(vcl_size_t scalartype_size) const {
00054           vcl_size_t lmem_used = 0;
00055           if(use_lhs_shared_)
00056             lmem_used += (ml_ + 1) * (cache_width_ + 1) * scalartype_size;
00057           if(use_rhs_shared_)
00058             lmem_used += (cache_width_ + 1) * (nl_ + 1) * scalartype_size;
00059           return lmem_used;
00060         }
00061 
00062         virtual void print(std::ostream & s) const{
00063           s << "{vector_type, local_size1, cache_width, local_size2, ms, ks, ns, use_lhs_shared, use_rhs_shared} = {"
00064             << vector_size_ << ","
00065             << local_size1_ << ", "
00066             << cache_width_ << ", "
00067             << local_size2_ << ", "
00068             << ms_ << ", "
00069             << ks_ << ", "
00070             << ns_ << ", "
00071             << use_lhs_shared_ << ", " << use_rhs_shared_ << "}" ;
00072         }
00073 
00074 
00075         bool invalid_impl(viennacl::ocl::device const & /*dev*/, vcl_size_t /*scalartype_size*/) const{
00076           static const unsigned int alignment = 128;
00077           return ml_ > alignment
00078               || cache_width_ > alignment
00079               || nl_ > alignment
00080               || ml_ < ms_
00081               || cache_width_ < ks_
00082               || nl_ < ns_
00083               || (ms_ % vector_size_) > 0
00084               || (ks_ % vector_size_) > 0
00085               || (ns_ % vector_size_) > 0;
00086         }
00087 
00088       public:
00090         matrix_product(unsigned int vectorization
00091                 , vcl_size_t local_size1, vcl_size_t cache_width, vcl_size_t local_size2
00092                 , unsigned int ms, unsigned int ks, unsigned int ns
00093                 , bool use_lhs_shared, bool use_rhs_shared) : profile_base(vectorization,local_size1, local_size2,1){
00094           local_size1_ = local_size1;
00095           local_size2_ = local_size2;
00096           cache_width_=cache_width;
00097           ml_= ms*local_size1;
00098           nl_=ns*local_size2;
00099           ms_ = ms;
00100           ks_=ks;
00101           ns_=ns;
00102           use_lhs_shared_ = use_lhs_shared;
00103           use_rhs_shared_ = use_rhs_shared;
00104         }
00105 
00106         static std::string csv_format() {
00107           return "Vec,LSize1,CacheWidth,LSize2,mS,kS,nS,NumGroups";
00108         }
00109 
00110         std::string csv_representation() const{
00111           std::ostringstream oss;
00112           oss << vector_size_
00113               << "," << local_size1_
00114               << "," << cache_width_
00115               << "," << local_size2_
00116               << "," << ms_
00117               << "," << ks_
00118               << "," << ns_
00119               << "," << use_lhs_shared_
00120               << "," << use_rhs_shared_;
00121           return oss.str();
00122         }
00123 
00124         void configure_range_enqueue_arguments(vcl_size_t kernel_id, statements_type  const & statements, viennacl::ocl::kernel & k, unsigned int & n_arg)  const {
00125           //set M, N
00126           scheduler::statement_node const & first_node = statements.front().second;
00127           vcl_size_t M = utils::call_on_matrix(first_node.lhs, utils::internal_size1_fun());
00128           vcl_size_t N = utils::call_on_matrix(first_node.lhs, utils::internal_size2_fun());
00129 
00130           //set ND range
00131           configure_local_sizes(k, kernel_id);
00132           k.global_work_size(0, M/ms_);
00133           k.global_work_size(1, N/ns_);
00134 
00135           //set arguments
00136           //M,N
00137           k.arg(n_arg++, cl_uint(M));
00138           k.arg(n_arg++, cl_uint(N));
00139 
00140           //K
00141           for(statements_type::const_iterator it = statements.begin() ; it != statements.end() ; ++it){
00142             scheduler::statement::container_type exprs = it->first.array();
00143             for(scheduler::statement::container_type::iterator iit = exprs.begin() ; iit != exprs.end() ; ++iit){
00144               if(iit->op.type==scheduler::OPERATION_BINARY_MAT_MAT_PROD_TYPE){
00145                 scheduler::statement_node const * current_node = &(*iit);
00146                 //The LHS of the prod is a matrix
00147                 if(current_node->lhs.type_family==scheduler::MATRIX_TYPE_FAMILY)
00148                 {
00149                   k.arg(n_arg++, cl_uint(utils::call_on_matrix(current_node->lhs, utils::internal_size2_fun())));
00150                 }
00151                 else{
00152                   //The LHS of the prod is a matrix expression
00153                   current_node = &exprs[current_node->lhs.node_index];
00154                   if(current_node->lhs.type_family==scheduler::MATRIX_TYPE_FAMILY)
00155                   {
00156                     if(current_node->op.type==scheduler::OPERATION_UNARY_TRANS_TYPE)
00157                       k.arg(n_arg++, cl_uint(utils::call_on_matrix(current_node->lhs, utils::internal_size1_fun())));
00158                     else
00159                       k.arg(n_arg++, cl_uint(utils::call_on_matrix(current_node->lhs, utils::internal_size2_fun())));
00160                   }
00161                   else{
00162                     assert(false && bool("unexpected expression tree"));
00163                   }
00164                 }
00165                 return;
00166               }
00167             }
00168           }
00169 
00170         }
00171 
00172         static std::string size1() { return "M";  }
00173         static std::string size2() { return "K"; }
00174         static std::string size3() { return "N"; }
00175 
00176         void kernel_arguments(statements_type  const & /*statements*/, std::string & arguments_string) const{
00177           arguments_string += detail::generate_value_kernel_argument("unsigned int", "M");
00178           arguments_string += detail::generate_value_kernel_argument("unsigned int", "N");
00179           arguments_string += detail::generate_value_kernel_argument("unsigned int", "K");
00180         }
00181 
00182       private:
00183 
00184         void transform_block(detail::mapped_matrix const & /*mat_infos*/, bool store_shared
00185                              , unsigned int & large_block_1, unsigned int & large_block_2
00186                              , unsigned int & small_block_1, unsigned int & small_block_2
00187                              , access_flow flow) const {
00188           if(flow==REGULAR){
00189             large_block_2/=vector_size_;
00190             if(!store_shared)
00191               small_block_2/=vector_size_;
00192           }
00193           else{
00194             large_block_1/=vector_size_;
00195             if(!store_shared)
00196               small_block_1/=vector_size_;
00197           }
00198         }
00199 
00200 
00201         std::string helper_variable(utils::kernel_generation_stream & stream
00202                                     , bool store_in_register
00203                                     , std::string const & type
00204                                     , std::string const & name
00205                                     , std::string const & expr) const {
00206           if(!store_in_register)
00207             return expr;
00208           stream << type << " " << name << " = " << expr << ";" << std::endl;
00209           return name;
00210         }
00211 
00212         void fetch_element_to_local_mem(utils::kernel_generation_stream & stream,
00213                                 std::string const & lmem_name,
00214                                 vcl_size_t lmem_size2,
00215                                 std::string const & global_ptr,
00216                                 detail::mapped_matrix const & mat,
00217                                 access_flow flow,
00218                                 std::string const & i,
00219                                 std::string const & j) const {
00220 
00221             if(flow==REGULAR){
00222                 stream << "val = *(" << global_ptr << " + " << j << " + " << mat.size2()  << "*" << i << ");" << std::endl;
00223               for(unsigned int a = 0 ; a < vector_size_ ; ++a)
00224                   if(vector_size_>1)
00225                       stream << lmem_name << "[" << i << "*" << lmem_size2 << " + " << j << "*" << vector_size_<<" + " << a << "] = val.s" << a << ";" <<std::endl;
00226                   else
00227                       stream << lmem_name << "[" << i << "*" << lmem_size2 << " + " << j << "*" << vector_size_ << "] = val" << ";" <<std::endl;
00228             }
00229             else{
00230               stream << "val = *(" << global_ptr << "+ " << j << "*" << mat.size1() << " + " << i << ");" << std::endl;
00231               for(unsigned int a = 0 ; a < vector_size_ ; ++a)
00232                   if(vector_size_>1)
00233                       stream << lmem_name << "[" << i << "*" << vector_size_*lmem_size2 << " + " << j << " + " << a*lmem_size2 << "] = val.s" << a << ";" <<std::endl;
00234                   else
00235                       stream << lmem_name << "[" << i << "*" << vector_size_*lmem_size2 << " + " << j << "] = val" << ";" <<std::endl;
00236             }
00237         }
00238         void fetch_to_local_mem(utils::kernel_generation_stream & stream,
00239                                 std::string const & lmem_name,
00240                                 vcl_size_t lmem_size2,
00241                                 std::string const & global_ptr,
00242                                 unsigned int bound1,
00243                                 unsigned int bound2,
00244                                 detail::mapped_matrix const & mat,
00245                                 access_flow flow) const {
00246           std::string aligned_scalartype = mat.scalartype();
00247           if(vector_size_ > 1)
00248             aligned_scalartype+=utils::to_string(vector_size_);
00249           stream << "barrier(CLK_LOCAL_MEM_FENCE);" << std::endl;
00250           stream << "{" << std::endl;
00251           stream << aligned_scalartype << " val;" << std::endl;
00252           //Can unroll
00253           if(bound2%local_size2_==0 && bound1%local_size1_==0){
00254               for(unsigned int j = 0 ; j < bound2 ; j+=static_cast<unsigned int>(local_size2_)){
00255                   for(unsigned int i = 0 ; i < bound1 ; i+=static_cast<unsigned int>(local_size1_)){
00256                       std::string indi = "(get_local_id(0) + " + utils::to_string(i)+")";
00257                       std::string indj = "(get_local_id(1) + " + utils::to_string(j)+")";
00258                       fetch_element_to_local_mem(stream,lmem_name,lmem_size2,global_ptr,mat,flow,indi,indj);
00259                   }
00260               }
00261           }
00262           else{
00263               stream << "for(unsigned int j = get_local_id(1)" << " ; j < " << bound2 << "; j+= " << local_size2_ << "){" << std::endl;
00264               stream.inc_tab();
00265               stream << "for(unsigned int i = get_local_id(0)" << " ; i < " << bound1 << "; i+= " << local_size1_ << "){" << std::endl;
00266               stream.inc_tab();
00267               fetch_element_to_local_mem(stream,lmem_name,lmem_size2,global_ptr,mat,flow,"i","j");
00268               stream.dec_tab();
00269               stream << "}" << std::endl;
00270               stream.dec_tab();
00271               stream << "}" << std::endl;
00272 
00273           }
00274           stream << "}" << std::endl;
00275           stream << "barrier(CLK_LOCAL_MEM_FENCE);" << std::endl;
00276 
00277         }
00278 
00279         void core(vcl_size_t /*kernel_id*/, utils::kernel_generation_stream& stream, statements_type const & statements, std::vector<detail::mapping_type> const & mapping) const {
              /* Generates the body of the matrix-product kernel and writes it to 'stream'.
               *
               * Outline of what is emitted, in order:
               *   1. one accumulator variable per element of the per-work-item result block,
               *   2. the __local buffers for the operands staged through local memory,
               *   3. pointers into the global operands,
               *   4. an outer loop over blocks of the inner dimension (K/cache_width_ iterations)
               *      containing the local-memory fetches and an inner loop with the unrolled
               *      multiply-accumulate statements,
               *   5. the write-back, generated by traversing the first statement with the
               *      product node replaced by the respective accumulator.
               *
               * statements: statements the kernel is generated for
               * mapping:    for each statement, the mapping from expression nodes to mapped objects
               *
               * NOTE(review): prod, lhs and rhs stay NULL if no matrix-matrix product node is
               * found; they are dereferenced below without a check.
               */
00280 
00284 
              // Mapped object of the left-hand side of the node stored with the first statement,
              // i.e. the matrix the result is written to.
00285           detail::mapped_matrix const * assigned = static_cast<detail::mapped_matrix const *>(at(mapping.at(0), std::make_pair(&statements.front().second,detail::LHS_NODE_TYPE)).get());
00286           detail::mapped_matrix_product* prod = NULL;
00287           detail::mapped_matrix const * lhs = NULL;
00288           detail::mapped_matrix const * rhs = NULL;
00289 
00290           bool is_lhs_transposed = false;
00291           bool is_rhs_transposed = false;
00292 
              // Look for the matrix-matrix product node and its two operands. An operand that is
              // itself an expression node is treated as a transposition of the matrix below it
              // (no other kind of composite operand is distinguished here). If several product
              // nodes exist, the values of the last one found are kept.
00293           for(statements_type::const_iterator it = statements.begin() ; it != statements.end() ; ++it){
00294             scheduler::statement::container_type const & exprs = it->first.array();
00295             vcl_size_t i = std::distance(statements.begin(), it);
00296             for(scheduler::statement::container_type::const_iterator iit = exprs.begin() ; iit != exprs.end() ; ++iit){
00297               if(iit->op.type==scheduler::OPERATION_BINARY_MAT_MAT_PROD_TYPE){
00298                 prod = (detail::mapped_matrix_product *)at(mapping.at(i), std::make_pair(&(*iit), detail::PARENT_NODE_TYPE)).get();
00299                 if(iit->lhs.type_family == scheduler::COMPOSITE_OPERATION_FAMILY){
00300                   is_lhs_transposed = true;
00301                   lhs = (detail::mapped_matrix const *)at(mapping.at(i), std::make_pair(&exprs[iit->lhs.node_index],detail::LHS_NODE_TYPE)).get();
00302                 }
00303                 else{
00304                   is_lhs_transposed = false;
00305                   lhs = (detail::mapped_matrix const *)at(mapping.at(i), std::make_pair(&(*iit), detail::LHS_NODE_TYPE)).get();
00306                 }
00307 
00308                 if(iit->rhs.type_family == scheduler::COMPOSITE_OPERATION_FAMILY){
00309                   is_rhs_transposed = true;
00310                   rhs = (detail::mapped_matrix const *)at(mapping.at(i), std::make_pair(&exprs[iit->rhs.node_index], detail::LHS_NODE_TYPE)).get();
00311                 }
00312                 else{
00313                   is_rhs_transposed = false;
00314                   rhs = (detail::mapped_matrix const *)at(mapping.at(i), std::make_pair(&(*iit),detail::RHS_NODE_TYPE)).get();
00315                 }
00316 
00317               }
00318             }
00319           }
00320 
              // Bind the names of the size arguments to the mapped matrices. With vectorization,
              // one of the two sizes gets the suffix "/<vector_size_>": the second size for
              // row-major matrices, the first size for column-major ones. For a transposed
              // operand the suffix moves to the other size. All matrices of the first mapping
              // are bound to (M, N) first; lhs and rhs are then re-bound to (M, K) and (K, N).
00321           if(vector_size_>1){
00322             std::string StrV = "/"+utils::to_string(vector_size_) ;
00323 
00324             for(detail::mapping_type::const_iterator it = mapping.front().begin() ; it != mapping.front().end() ; ++it){
00325               if(detail::mapped_matrix const * p = dynamic_cast<detail::mapped_matrix const *>(it->second.get())){
00326                 if(p->is_row_major())
00327                   p->bind_sizes("M", "N"+StrV);
00328                 else
00329                   p->bind_sizes("M"+StrV, "N");
00330               }
00331             }
00332 
00333             if(lhs->is_row_major())
00334               if(is_lhs_transposed)
00335                 lhs->bind_sizes("M"+StrV, "K");
00336               else
00337                 lhs->bind_sizes("M", "K"+StrV);
00338             else
00339               if(is_lhs_transposed)
00340                 lhs->bind_sizes("M", "K"+StrV);
00341               else
00342                 lhs->bind_sizes("M"+StrV, "K");
00343 
00344 
00345             if(rhs->is_row_major())
00346               if(is_rhs_transposed)
00347                 rhs->bind_sizes("K"+StrV, "N");
00348               else
00349                 rhs->bind_sizes("K", "N"+StrV);
00350             else
00351               if(is_rhs_transposed)
00352                 rhs->bind_sizes("K", "N"+StrV);
00353               else
00354                 rhs->bind_sizes("K"+StrV, "N");
00355 
00356 
00357           }
00358           else{
00359             for(detail::mapping_type::const_iterator it = mapping.front().begin() ; it != mapping.front().end() ; ++it){
00360               if(detail::mapped_matrix const * p = dynamic_cast<detail::mapped_matrix const *>(it->second.get())){
00361                 p->bind_sizes("M", "N");
00362               }
00363             }
00364 
00365             lhs->bind_sizes("M", "K");
00366             rhs->bind_sizes("K", "N");
00367           }
00368 
00369 
00370 
              // Type used for the accumulators and for global pointers: the scalar type of the
              // result, with the vector width appended when vectorization is enabled.
00371           std::string aligned_scalartype = assigned->scalartype();
00372           if(vector_size_ > 1)
00373             aligned_scalartype+=utils::to_string(vector_size_);
00374 
00375 
              // Access pattern of the result and of the two operands. A transposition flips
              // the pattern implied by the storage layout.
00376           access_flow result_access_flow;
00377           if(assigned->is_row_major())
00378             result_access_flow = REGULAR;
00379           else
00380             result_access_flow = STRIDED;
00381 
00382           access_flow lhs_access_flow;
00383           if((lhs->is_row_major() && !is_lhs_transposed)
00384              ||(!lhs->is_row_major() && is_lhs_transposed))
00385             lhs_access_flow = REGULAR;
00386           else
00387             lhs_access_flow = STRIDED;
00388 
00389           access_flow rhs_access_flow;
00390           if((rhs->is_row_major() && !is_rhs_transposed)
00391              ||(!rhs->is_row_major() && is_rhs_transposed))
00392             rhs_access_flow = REGULAR;
00393           else
00394             rhs_access_flow = STRIDED;
00395 
00396 
              // Type of the operand values loaded in the inner loop: plain scalars when read
              // from the local buffer, the (possibly vectorized) type otherwise.
00397           std::string lhs_value_scalartype;
00398           if(use_lhs_shared_)
00399             lhs_value_scalartype = lhs->scalartype();
00400           else
00401             lhs_value_scalartype = aligned_scalartype;
00402 
00403           std::string rhs_value_scalartype;
00404           if(use_rhs_shared_)
00405             rhs_value_scalartype = rhs->scalartype();
00406           else
00407             rhs_value_scalartype = aligned_scalartype;
00408 
00409 
              // Per-matrix working copies of the block sizes; transform_block() rescales them
              // from scalar units to units of the vector width according to the access pattern.
00410           unsigned int ml_res = static_cast<unsigned int>(ml_), nl_res = static_cast<unsigned int>(nl_), ms_res = static_cast<unsigned int>(ms_), ns_res = static_cast<unsigned int>(ns_);
00411           unsigned int ml_lhs = static_cast<unsigned int>(ml_), cache_width_lhs = static_cast<unsigned int>(cache_width_), ms_lhs = static_cast<unsigned int>(ms_), ks_lhs = static_cast<unsigned int>(ks_);
00412           unsigned int cache_width_rhs = static_cast<unsigned int>(cache_width_), nl_rhs = static_cast<unsigned int>(nl_), ks_rhs = static_cast<unsigned int>(ks_), ns_rhs = static_cast<unsigned int>(ns_);
00413 
00414           transform_block(*assigned,false,ml_res,nl_res,ms_res,ns_res,result_access_flow);
00415           transform_block(*lhs,use_lhs_shared_,ml_lhs,cache_width_lhs,ms_lhs,ks_lhs,lhs_access_flow);
00416           transform_block(*rhs,use_rhs_shared_,cache_width_rhs,nl_rhs,ks_rhs,ns_rhs,rhs_access_flow);
00417 
00421 
00422 
              // Dimensions of the local buffers. The second dimension carries one extra element
              // (presumably padding against local-memory bank conflicts - not stated in the code).
00423           vcl_size_t local_lhs_size1 = ml_ ;
00424           vcl_size_t local_lhs_size2 = cache_width_ + 1;
00425 
00426           vcl_size_t local_rhs_size1 = cache_width_;
00427           vcl_size_t local_rhs_size2 = nl_ + 1;
00428 
              // Declare the accumulators res<m>_<n>, initialized to zero.
00430           for(unsigned int m=0; m< ms_res; ++m)
00431             for(unsigned int n=0; n < ns_res ; ++n)
00432               stream << aligned_scalartype << " " << "res" << m << "_" << n << " = (" << aligned_scalartype << ")(0) ;" << std::endl;
00433 
              // Declare the __local buffers of the operands staged through local memory.
00435           if(use_lhs_shared_)
00436             stream << "__local " << lhs->scalartype() << " lhs_buf[" << local_lhs_size1*local_lhs_size2 << "]" << ";" << std::endl;
00437           if(use_rhs_shared_)
00438             stream << "__local " << rhs->scalartype() << " rhs_buf[" << local_rhs_size1*local_rhs_size2 << "]" << ";" << std::endl;
00439 
00441           //stream << "__global " << aligned_scalartype << "* res_ptr = " <<  assigned->name() << " + " << assigned->offset(std::make_pair("get_global_id(0)*" + utils::to_string(ms_res), "get_global_id(1)*" + utils::to_string(ns_res))) << ";" << std::endl;
00442 
00443 
              // lhs in local memory: one global pointer to the block of lhs handled by this work-group.
00445           if(use_lhs_shared_){
00446             std::string i = "get_group_id(0)*" + utils::to_string(ml_lhs);
00447             stream << "__global " << aligned_scalartype << "* global_lhs_ptr = " << lhs->name() << " + ";
00448             if(lhs_access_flow==REGULAR)
00449               stream << "(" << i << ")" << "*" << lhs->size2();
00450             else
00451               stream << i;
00452             stream << ";" << std::endl;
00453           }
00454 
              // lhs read directly from global memory: one pointer per row (REGULAR) or per
              // k-slice (STRIDED) of the per-work-item block.
00456           else{
00457             if(lhs_access_flow==REGULAR)
00458               for(unsigned int m=0; m<ms_lhs; ++m)
00459                 stream << "__global " << aligned_scalartype << "* " << "lhs_ptr_" << m << " = " << lhs->name() << " + "
00460                        << lhs->size2() << "* ("
00461                        << "get_group_id(0)*" << ml_lhs << "+" << "get_local_id(0)*" << ms_lhs << "+" << m
00462                        << " );" << std::endl;
00463             else
00464               for(unsigned int k=0; k<ks_lhs; ++k)
00465                 stream << "__global " << aligned_scalartype<< "* " << "lhs_ptr_" << k << " = " << lhs->name() << " + "
00466                        << "(" << lhs->size1() << ")*" << k
00467                        << "+ " << "get_group_id(0)*" << ml_lhs << "+" << "get_local_id(0)*" << ms_lhs << ";" << std::endl;
00468           }
00469 
              // rhs in local memory: one global pointer to the block of rhs handled by this work-group.
00471           if(use_rhs_shared_){
00472             std::string j = "get_group_id(1)*" + utils::to_string(nl_rhs);
00473             stream << "__global " << aligned_scalartype << "* global_rhs_ptr = " << rhs->name() << " + ";
00474             if(rhs_access_flow==REGULAR)
00475               stream << j;
00476             else
00477               stream << "(" << j << ")" << "*" << rhs->size1();
00478             stream << ";" << std::endl;
00479           }
00480 
              // rhs read directly from global memory: one pointer per k-slice (REGULAR) or per
              // column (STRIDED) of the per-work-item block.
00482           else{
00483             if(rhs_access_flow==REGULAR)
00484               for(unsigned int k = 0 ; k < ks_rhs ; ++k)
00485                 stream << "__global " << aligned_scalartype << "* " << "rhs_ptr_" << k << " = " << rhs->name() << " + "
00486                        << "(" << k << ")" << "*" << rhs->size2()
00487                        << " + " << "get_local_id(1)*" << ns_rhs << " + get_group_id(1)*" << nl_rhs
00488                        << ";" << std::endl;
00489             else
00490               for(unsigned int n = 0 ; n < ns_rhs ; ++n)
00491                 stream << "__global " << aligned_scalartype << "* " << "rhs_ptr_" << n << " = " << rhs->name() << " +  "
00492                        << "(" << "get_local_id(1)*" << ns_rhs << " + get_group_id(1)*" << nl_rhs << " + " << n << ")" << "*" << rhs->size1()
00493                        << ";" << std::endl;
00494           }
00495 
00496 
              // Outer loop over the inner dimension in chunks of cache_width_. helper_variable()
              // is called with store_in_register == false, so the bound is emitted inline as
              // "K/<cache_width_>" and no variable named block_num is declared in the kernel.
00498           std::string block_num = helper_variable(stream,false,"unsigned int", "block_num", "K/" + utils::to_string(cache_width_));
00499           stream << "for(unsigned int bl=0 ; bl<" << block_num << " ; ++bl){" << std::endl;
00500           stream.inc_tab();
00501 
              // Fetch the current lhs block into local memory and set up per-row pointers into it.
00503           if(use_lhs_shared_){
00504             fetch_to_local_mem(stream,"lhs_buf",local_lhs_size2,"global_lhs_ptr",ml_lhs,cache_width_lhs,*lhs,lhs_access_flow);
00505             for(unsigned int m=0; m<ms_lhs; ++m)
00506               stream << "__local " << lhs_value_scalartype << "* lhs_ptr_" << m << " = lhs_buf + "
00507                      << "(" << "get_local_id(0)*" << ms_lhs << "+" << m << ")" << "*" << local_lhs_size2
00508                      << ";" << std::endl;
00509           }
00510 
              // Fetch the current rhs block into local memory and set up per-k pointers into it.
00512           if(use_rhs_shared_){
00513             fetch_to_local_mem(stream,"rhs_buf", local_rhs_size2, "global_rhs_ptr",cache_width_rhs,nl_rhs,*rhs,rhs_access_flow);
00514             for(unsigned int k=0; k<ks_rhs; ++k)
00515               stream << "__local " << rhs_value_scalartype << "* rhs_ptr_" << k << " = rhs_buf + "
00516                      << k*local_rhs_size2 << " + " << "get_local_id(1)*" << ns_rhs
00517                      << ";" << std::endl;
00518           }
00519 
00520 
              // Inner loop: cache_width_/ks_ steps, each consuming ks_ slices of the inner dimension.
00521           stream << " for(unsigned int bs=0 ; bs < " << cache_width_/ks_  << " ; ++bs){" << std::endl;
00522           stream.inc_tab();
00523 
00524 
              // Load the rhs values val_rhs_<k>_<n>, post-incrementing the pointer that is read.
00525           for(unsigned int k = 0 ; k < ks_rhs ; ++k){
00526             for(unsigned int n=0 ; n < ns_rhs ; ++n){
00527               stream << rhs_value_scalartype << " val_rhs_" << k << "_" << n << " = " ;
00528               if(use_rhs_shared_ )
00529                 stream << "* rhs_ptr_" << k << "++";
00530               else{
00531                 if(rhs_access_flow==REGULAR)
00532                   stream << "* rhs_ptr_" << k << "++";
00533                 else
00534                   stream  << "* rhs_ptr_" << n << "++";
00535               }
00536               stream << ";";
00537               stream << std::endl;
00538             }
00539           }
00540 
00541 
              // Load the lhs values val_lhs_<m>_<k>, post-incrementing the pointer that is read.
00542           for(unsigned int k = 0 ; k < ks_lhs ; ++k){
00543             for(unsigned int m=0 ; m < ms_lhs ; ++m){
00544               stream << lhs_value_scalartype << " " << "val_lhs_" << m << "_" << k << " = ";
00545               if(use_lhs_shared_)
00546                 stream <<  "* lhs_ptr_" << m << "++" ;
00547               else if(lhs_access_flow==REGULAR)
00548                 stream << "* lhs_ptr_" << m << "++";
00549               else
00550                 stream << "* lhs_ptr_" << k << "++";
00551               stream << ";";
00552               stream << std::endl;
00553             }
00554           }
00555 
00556 
              // Emit the fully unrolled multiply-accumulate statements, one per vector component
              // of each accumulator. The index computations below select which val_lhs_/val_rhs_
              // variable, and which of its vector components (".s<index>"), holds the scalar
              // needed; this depends on the access patterns and on whether the operand was
              // loaded as scalars from local memory or as vectors from global memory.
              // NOTE(review): component indices are printed in decimal; OpenCL C names the
              // components above 9 with hexadecimal digits - confirm vector widths above 8 are
              // never used here.
00557             for(unsigned int n=0 ; n < ns_res ; ++n){
00558              for(unsigned int k = 0 ; k < ks_ ; ++k){
00559                for(unsigned int m=0 ; m < ms_res ; ++m){
00560                 for(unsigned int a = 0; a<vector_size_; ++a){
00561 
00562                   int ind_lhs_1 = m;
00563                   int ind_lhs_2 = k;
00564                   int ind_s_lhs = a;
00565 
00566                   int ind_rhs_1=k;
00567                   int ind_rhs_2=n;
00568                   int ind_s_rhs=a;
00569 
00570                   if(result_access_flow==REGULAR){
00571                     if(!use_lhs_shared_){
00572                       if(lhs_access_flow==REGULAR){
00573                         ind_s_lhs = ind_lhs_2%vector_size_;
00574                         ind_lhs_2 /= vector_size_;
00575                       }
00576                       else{
00577                         ind_s_lhs = ind_lhs_1%vector_size_;
00578                         ind_lhs_1 /= vector_size_;
00579                       }
00580                     }
00581                   }
00582                   else{
00583                     if(use_lhs_shared_){
00584                       ind_lhs_1 = ind_lhs_1*vector_size_+a;
00585                     }
00586                     else{
00587                       if(lhs_access_flow==REGULAR){
00588                         ind_lhs_1 = ind_lhs_1*vector_size_+a;
00589                         ind_s_lhs = ind_lhs_2%vector_size_;
00590                         ind_lhs_2 /= vector_size_;
00591                       }
00592                     }
00593                   }
00594 
00595                   if(result_access_flow==REGULAR){
00596                     if(use_rhs_shared_){
00597                       ind_rhs_2 = ind_rhs_2*vector_size_+a;
00598                     }
00599                     else{
00600                       if(rhs_access_flow==STRIDED){
00601                         ind_rhs_2 = ind_rhs_2*vector_size_+a;
00602                         ind_s_rhs = ind_rhs_1%vector_size_;
00603                         ind_rhs_1 = ind_rhs_1/vector_size_;
00604                       }
00605                       else{
00606                       }
00607                     }
00608                   }
00609                   else{
00610                     if(!use_rhs_shared_){
00611                       if(rhs_access_flow==REGULAR){
00612                         ind_s_rhs = ind_rhs_2%vector_size_;
00613                         ind_rhs_2/=vector_size_;
00614                       }
00615                       else{
00616                         ind_s_rhs = ind_rhs_1%vector_size_;
00617                         ind_rhs_1/=vector_size_;
00618                       }
00619                     }
00620                   }
00621 
00622                   std::ostringstream res_oss;
00623                   std::ostringstream lhs_oss;
00624                   std::ostringstream rhs_oss;
00625 
00626                   res_oss << "res" << m << "_" << n ;
00627                   if(vector_size_>1) res_oss << ".s" << a;
00628 
00629                   lhs_oss << "val_lhs_" << ind_lhs_1 << "_" << ind_lhs_2;
00630                   if(!use_lhs_shared_ && vector_size_>1) lhs_oss << ".s" << ind_s_lhs;
00631 
00632 
00633                   rhs_oss << "val_rhs_" << ind_rhs_1 << "_" << ind_rhs_2;
00634                   if(!use_rhs_shared_ && vector_size_>1) rhs_oss << ".s" << ind_s_rhs;
00635 
00636 
00637                   stream << res_oss.str() << "+=" << lhs_oss.str() << "*" << rhs_oss.str() << ";" << std::endl;
00638                 }
00639               }
00640             }
00641           }
00642 
00643 
              // Move the operand pointers on to the next ks_ slices. The subtracted term
              // compensates for the post-increments performed by the loads above.
00644           if(use_rhs_shared_){
00645             for(unsigned int k=0 ; k<ks_ ; ++k)
00646               stream << "rhs_ptr_" << k << " += " << ks_rhs*local_rhs_size2 - ns_rhs << ";" << std::endl;
00647           }
00648           else{
00649             if(rhs_access_flow==REGULAR)
00650               for(unsigned int k=0 ; k<ks_ ; ++k)
00651                 stream << "rhs_ptr_" << k << " += " << ks_rhs << "*" << rhs->size2() << " - " << ns_rhs << ";" << std::endl;
00652           }
00653 
00654           if(!use_lhs_shared_){
00655             if(lhs_access_flow==STRIDED)
00656               for(unsigned int k=0 ; k<ks_lhs ; ++k)
00657                 stream << "lhs_ptr_" << k << " += " << ks_lhs << "*" << lhs->size1() << " - " << ms_lhs << ";" << std::endl;
00658           }
00659 
00660 
00661 
              // Close the inner loop.
00662           stream.dec_tab();
00663           stream << "}" << std::endl;
00664 
              // Advance the global block pointers to the next chunk of the inner dimension.
00665           if(use_lhs_shared_){
00666             if(lhs_access_flow==REGULAR)
00667               stream << "global_lhs_ptr += " << cache_width_lhs << ";" << std::endl;
00668             else
00669               stream << "global_lhs_ptr += " << cache_width_lhs << "*" << lhs->size1() << ";" << std::endl;
00670           }
00671 
00672           if(use_rhs_shared_){
00673             if(rhs_access_flow==REGULAR)
00674               stream << "global_rhs_ptr += " << cache_width_rhs << "*" << rhs->size2() << ";" << std::endl;
00675             else
00676               stream << "global_rhs_ptr += " << cache_width_rhs << ";" << std::endl;
00677           }
00678 
              // Close the outer loop.
00679           stream.dec_tab();
00680           stream << "}" << std::endl;
00681 
              // Write-back: for every accumulator, make the mapped product evaluate to that
              // accumulator and let the expression traversal generate the complete statement
              // for result element (i, j).
00682           for(unsigned int m=0 ; m < ms_res ; ++m){
00683             for(unsigned int n=0 ; n < ns_res ; ++n){
00684               std::string i = "get_global_id(0)*" + utils::to_string(ms_res) + "+" + utils::to_string(m);
00685               std::string j = "get_global_id(1)*" + utils::to_string(ns_res) + "+" + utils::to_string(n);
00686               prod->access_name("res"+utils::to_string(m)+"_"+utils::to_string(n));
00687               std::string str;
00688               detail::traverse(statements.front().first, statements.front().second, detail::expression_generation_traversal(std::make_pair(i, j), -1, str, mapping[0]), false);
00689               stream << str << ";" << std::endl;
00690             }
00691           }
00692 
00693 
00694         }
00695 
00696       private:
              // Work-group sizes as passed to the constructor (local_size1, local_size2).
00697         vcl_size_t local_size1_;
00698         vcl_size_t local_size2_;
              // Width of the chunk of the inner dimension processed per outer-loop iteration;
              // core() emits K/cache_width_ iterations.
00699         vcl_size_t cache_width_;
00700 
              // Block sizes per work-group, set by the constructor:
              // ml_ = ms*local_size1, nl_ = ns*local_size2.
00701         vcl_size_t ml_;
00702         vcl_size_t nl_;
00703 
              // Block sizes per work-item along the first result dimension, the inner
              // dimension and the second result dimension. The global work sizes are set
              // to M/ms_ and N/ns_.
00704         vcl_size_t ms_;
00705         vcl_size_t ks_;
00706         vcl_size_t ns_;
00707 
              // Whether core() stages the left / right operand through a __local buffer.
00708         bool use_lhs_shared_;
00709         bool use_rhs_shared_;
00710     };
00711 
00712   }
00713 
00714 }
00715 
00716 #endif