ViennaCL - The Vienna Computing Library
1.5.1
|
00001 #ifndef VIENNACL_GENERATOR_GENERATE_MATRIX_PRODUCT_HPP 00002 #define VIENNACL_GENERATOR_GENERATE_MATRIX_PRODUCT_HPP 00003 00004 /* ========================================================================= 00005 Copyright (c) 2010-2014, Institute for Microelectronics, 00006 Institute for Analysis and Scientific Computing, 00007 TU Wien. 00008 Portions of this software are copyright by UChicago Argonne, LLC. 00009 00010 ----------------- 00011 ViennaCL - The Vienna Computing Library 00012 ----------------- 00013 00014 Project Head: Karl Rupp rupp@iue.tuwien.ac.at 00015 00016 (A list of authors and contributors can be found in the PDF manual) 00017 00018 License: MIT (X11), see file LICENSE in the base directory 00019 ============================================================================= */ 00020 00021 00027 #include <vector> 00028 00029 #include "viennacl/scheduler/forwards.h" 00030 00031 #include "viennacl/generator/profile_base.hpp" 00032 #include "viennacl/generator/mapped_objects.hpp" 00033 #include "viennacl/generator/utils.hpp" 00034 00035 #include "viennacl/forwards.h" 00036 00037 #include "viennacl/tools/tools.hpp" 00038 00039 namespace viennacl{ 00040 00041 namespace generator{ 00042 00044 class matrix_product : public profile_base{ 00045 00046 enum access_flow{ 00047 REGULAR, 00048 STRIDED 00049 }; 00050 00051 bool is_slow_impl(viennacl::ocl::device const &) const { return false; } 00052 00053 vcl_size_t lmem_used(vcl_size_t scalartype_size) const { 00054 vcl_size_t lmem_used = 0; 00055 if(use_lhs_shared_) 00056 lmem_used += (ml_ + 1) * (cache_width_ + 1) * scalartype_size; 00057 if(use_rhs_shared_) 00058 lmem_used += (cache_width_ + 1) * (nl_ + 1) * scalartype_size; 00059 return lmem_used; 00060 } 00061 00062 virtual void print(std::ostream & s) const{ 00063 s << "{vector_type, local_size1, cache_width, local_size2, ms, ks, ns, use_lhs_shared, use_rhs_shared} = {" 00064 << vector_size_ << "," 00065 << local_size1_ << ", " 00066 << cache_width_ << ", " 00067 << local_size2_ << ", " 00068 << ms_ << ", " 00069 << ks_ << ", " 00070 << ns_ << ", " 00071 << use_lhs_shared_ << ", " << use_rhs_shared_ << "}" ; 00072 } 00073 00074 00075 bool invalid_impl(viennacl::ocl::device const & /*dev*/, vcl_size_t /*scalartype_size*/) const{ 00076 static const unsigned int alignment = 128; 00077 return ml_ > alignment 00078 || cache_width_ > alignment 00079 || nl_ > alignment 00080 || ml_ < ms_ 00081 || cache_width_ < ks_ 00082 || nl_ < ns_ 00083 || (ms_ % vector_size_) > 0 00084 || (ks_ % vector_size_) > 0 00085 || (ns_ % vector_size_) > 0; 00086 } 00087 00088 public: 00090 matrix_product(unsigned int vectorization 00091 , vcl_size_t local_size1, vcl_size_t cache_width, vcl_size_t local_size2 00092 , unsigned int ms, unsigned int ks, unsigned int ns 00093 , bool use_lhs_shared, bool use_rhs_shared) : profile_base(vectorization,local_size1, local_size2,1){ 00094 local_size1_ = local_size1; 00095 local_size2_ = local_size2; 00096 cache_width_=cache_width; 00097 ml_= ms*local_size1; 00098 nl_=ns*local_size2; 00099 ms_ = ms; 00100 ks_=ks; 00101 ns_=ns; 00102 use_lhs_shared_ = use_lhs_shared; 00103 use_rhs_shared_ = use_rhs_shared; 00104 } 00105 00106 static std::string csv_format() { 00107 return "Vec,LSize1,CacheWidth,LSize2,mS,kS,nS,NumGroups"; 00108 } 00109 00110 std::string csv_representation() const{ 00111 std::ostringstream oss; 00112 oss << vector_size_ 00113 << "," << local_size1_ 00114 << "," << cache_width_ 00115 << "," << local_size2_ 00116 << "," << ms_ 00117 << "," << ks_ 00118 << "," << ns_ 00119 << "," << use_lhs_shared_ 00120 << "," << use_rhs_shared_; 00121 return oss.str(); 00122 } 00123 00124 void configure_range_enqueue_arguments(vcl_size_t kernel_id, statements_type const & statements, viennacl::ocl::kernel & k, unsigned int & n_arg) const { 00125 //set M, N 00126 scheduler::statement_node const & first_node = statements.front().second; 00127 vcl_size_t M = utils::call_on_matrix(first_node.lhs, utils::internal_size1_fun()); 00128 vcl_size_t N = utils::call_on_matrix(first_node.lhs, utils::internal_size2_fun()); 00129 00130 //set ND range 00131 configure_local_sizes(k, kernel_id); 00132 k.global_work_size(0, M/ms_); 00133 k.global_work_size(1, N/ns_); 00134 00135 //set arguments 00136 //M,N 00137 k.arg(n_arg++, cl_uint(M)); 00138 k.arg(n_arg++, cl_uint(N)); 00139 00140 //K 00141 for(statements_type::const_iterator it = statements.begin() ; it != statements.end() ; ++it){ 00142 scheduler::statement::container_type exprs = it->first.array(); 00143 for(scheduler::statement::container_type::iterator iit = exprs.begin() ; iit != exprs.end() ; ++iit){ 00144 if(iit->op.type==scheduler::OPERATION_BINARY_MAT_MAT_PROD_TYPE){ 00145 scheduler::statement_node const * current_node = &(*iit); 00146 //The LHS of the prod is a matrix 00147 if(current_node->lhs.type_family==scheduler::MATRIX_TYPE_FAMILY) 00148 { 00149 k.arg(n_arg++, cl_uint(utils::call_on_matrix(current_node->lhs, utils::internal_size2_fun()))); 00150 } 00151 else{ 00152 //The LHS of the prod is a matrix expression 00153 current_node = &exprs[current_node->lhs.node_index]; 00154 if(current_node->lhs.type_family==scheduler::MATRIX_TYPE_FAMILY) 00155 { 00156 if(current_node->op.type==scheduler::OPERATION_UNARY_TRANS_TYPE) 00157 k.arg(n_arg++, cl_uint(utils::call_on_matrix(current_node->lhs, utils::internal_size1_fun()))); 00158 else 00159 k.arg(n_arg++, cl_uint(utils::call_on_matrix(current_node->lhs, utils::internal_size2_fun()))); 00160 } 00161 else{ 00162 assert(false && bool("unexpected expression tree")); 00163 } 00164 } 00165 return; 00166 } 00167 } 00168 } 00169 00170 } 00171 00172 static std::string size1() { return "M"; } 00173 static std::string size2() { return "K"; } 00174 static std::string size3() { return "N"; } 00175 00176 void kernel_arguments(statements_type const & /*statements*/, std::string & arguments_string) const{ 00177 arguments_string += detail::generate_value_kernel_argument("unsigned int", "M"); 00178 arguments_string += detail::generate_value_kernel_argument("unsigned int", "N"); 00179 arguments_string += detail::generate_value_kernel_argument("unsigned int", "K"); 00180 } 00181 00182 private: 00183 00184 void transform_block(detail::mapped_matrix const & /*mat_infos*/, bool store_shared 00185 , unsigned int & large_block_1, unsigned int & large_block_2 00186 , unsigned int & small_block_1, unsigned int & small_block_2 00187 , access_flow flow) const { 00188 if(flow==REGULAR){ 00189 large_block_2/=vector_size_; 00190 if(!store_shared) 00191 small_block_2/=vector_size_; 00192 } 00193 else{ 00194 large_block_1/=vector_size_; 00195 if(!store_shared) 00196 small_block_1/=vector_size_; 00197 } 00198 } 00199 00200 00201 std::string helper_variable(utils::kernel_generation_stream & stream 00202 , bool store_in_register 00203 , std::string const & type 00204 , std::string const & name 00205 , std::string const & expr) const { 00206 if(!store_in_register) 00207 return expr; 00208 stream << type << " " << name << " = " << expr << ";" << std::endl; 00209 return name; 00210 } 00211 00212 void fetch_element_to_local_mem(utils::kernel_generation_stream & stream, 00213 std::string const & lmem_name, 00214 vcl_size_t lmem_size2, 00215 std::string const & global_ptr, 00216 detail::mapped_matrix const & mat, 00217 access_flow flow, 00218 std::string const & i, 00219 std::string const & j) const { 00220 00221 if(flow==REGULAR){ 00222 stream << "val = *(" << global_ptr << " + " << j << " + " << mat.size2() << "*" << i << ");" << std::endl; 00223 for(unsigned int a = 0 ; a < vector_size_ ; ++a) 00224 if(vector_size_>1) 00225 stream << lmem_name << "[" << i << "*" << lmem_size2 << " + " << j << "*" << vector_size_<<" + " << a << "] = val.s" << a << ";" <<std::endl; 00226 else 00227 stream << lmem_name << "[" << i << "*" << lmem_size2 << " + " << j << "*" << vector_size_ << "] = val" << ";" <<std::endl; 00228 } 00229 else{ 00230 stream << "val = *(" << global_ptr << "+ " << j << "*" << mat.size1() << " + " << i << ");" << std::endl; 00231 for(unsigned int a = 0 ; a < vector_size_ ; ++a) 00232 if(vector_size_>1) 00233 stream << lmem_name << "[" << i << "*" << vector_size_*lmem_size2 << " + " << j << " + " << a*lmem_size2 << "] = val.s" << a << ";" <<std::endl; 00234 else 00235 stream << lmem_name << "[" << i << "*" << vector_size_*lmem_size2 << " + " << j << "] = val" << ";" <<std::endl; 00236 } 00237 } 00238 void fetch_to_local_mem(utils::kernel_generation_stream & stream, 00239 std::string const & lmem_name, 00240 vcl_size_t lmem_size2, 00241 std::string const & global_ptr, 00242 unsigned int bound1, 00243 unsigned int bound2, 00244 detail::mapped_matrix const & mat, 00245 access_flow flow) const { 00246 std::string aligned_scalartype = mat.scalartype(); 00247 if(vector_size_ > 1) 00248 aligned_scalartype+=utils::to_string(vector_size_); 00249 stream << "barrier(CLK_LOCAL_MEM_FENCE);" << std::endl; 00250 stream << "{" << std::endl; 00251 stream << aligned_scalartype << " val;" << std::endl; 00252 //Can unroll 00253 if(bound2%local_size2_==0 && bound1%local_size1_==0){ 00254 for(unsigned int j = 0 ; j < bound2 ; j+=static_cast<unsigned int>(local_size2_)){ 00255 for(unsigned int i = 0 ; i < bound1 ; i+=static_cast<unsigned int>(local_size1_)){ 00256 std::string indi = "(get_local_id(0) + " + utils::to_string(i)+")"; 00257 std::string indj = "(get_local_id(1) + " + utils::to_string(j)+")"; 00258 fetch_element_to_local_mem(stream,lmem_name,lmem_size2,global_ptr,mat,flow,indi,indj); 00259 } 00260 } 00261 } 00262 else{ 00263 stream << "for(unsigned int j = get_local_id(1)" << " ; j < " << bound2 << "; j+= " << local_size2_ << "){" << std::endl; 00264 stream.inc_tab(); 00265 stream << "for(unsigned int i = get_local_id(0)" << " ; i < " << bound1 << "; i+= " << local_size1_ << "){" << std::endl; 00266 stream.inc_tab(); 00267 fetch_element_to_local_mem(stream,lmem_name,lmem_size2,global_ptr,mat,flow,"i","j"); 00268 stream.dec_tab(); 00269 stream << "}" << std::endl; 00270 stream.dec_tab(); 00271 stream << "}" << std::endl; 00272 00273 } 00274 stream << "}" << std::endl; 00275 stream << "barrier(CLK_LOCAL_MEM_FENCE);" << std::endl; 00276 00277 } 00278 00279 void core(vcl_size_t /*kernel_id*/, utils::kernel_generation_stream& stream, statements_type const & statements, std::vector<detail::mapping_type> const & mapping) const { 00280 00284 00285 detail::mapped_matrix const * assigned = static_cast<detail::mapped_matrix const *>(at(mapping.at(0), std::make_pair(&statements.front().second,detail::LHS_NODE_TYPE)).get()); 00286 detail::mapped_matrix_product* prod = NULL; 00287 detail::mapped_matrix const * lhs = NULL; 00288 detail::mapped_matrix const * rhs = NULL; 00289 00290 bool is_lhs_transposed = false; 00291 bool is_rhs_transposed = false; 00292 00293 for(statements_type::const_iterator it = statements.begin() ; it != statements.end() ; ++it){ 00294 scheduler::statement::container_type const & exprs = it->first.array(); 00295 vcl_size_t i = std::distance(statements.begin(), it); 00296 for(scheduler::statement::container_type::const_iterator iit = exprs.begin() ; iit != exprs.end() ; ++iit){ 00297 if(iit->op.type==scheduler::OPERATION_BINARY_MAT_MAT_PROD_TYPE){ 00298 prod = (detail::mapped_matrix_product *)at(mapping.at(i), std::make_pair(&(*iit), detail::PARENT_NODE_TYPE)).get(); 00299 if(iit->lhs.type_family == scheduler::COMPOSITE_OPERATION_FAMILY){ 00300 is_lhs_transposed = true; 00301 lhs = (detail::mapped_matrix const *)at(mapping.at(i), std::make_pair(&exprs[iit->lhs.node_index],detail::LHS_NODE_TYPE)).get(); 00302 } 00303 else{ 00304 is_lhs_transposed = false; 00305 lhs = (detail::mapped_matrix const *)at(mapping.at(i), std::make_pair(&(*iit), detail::LHS_NODE_TYPE)).get(); 00306 } 00307 00308 if(iit->rhs.type_family == scheduler::COMPOSITE_OPERATION_FAMILY){ 00309 is_rhs_transposed = true; 00310 rhs = (detail::mapped_matrix const *)at(mapping.at(i), std::make_pair(&exprs[iit->rhs.node_index], detail::LHS_NODE_TYPE)).get(); 00311 } 00312 else{ 00313 is_rhs_transposed = false; 00314 rhs = (detail::mapped_matrix const *)at(mapping.at(i), std::make_pair(&(*iit),detail::RHS_NODE_TYPE)).get(); 00315 } 00316 00317 } 00318 } 00319 } 00320 00321 if(vector_size_>1){ 00322 std::string StrV = "/"+utils::to_string(vector_size_) ; 00323 00324 for(detail::mapping_type::const_iterator it = mapping.front().begin() ; it != mapping.front().end() ; ++it){ 00325 if(detail::mapped_matrix const * p = dynamic_cast<detail::mapped_matrix const *>(it->second.get())){ 00326 if(p->is_row_major()) 00327 p->bind_sizes("M", "N"+StrV); 00328 else 00329 p->bind_sizes("M"+StrV, "N"); 00330 } 00331 } 00332 00333 if(lhs->is_row_major()) 00334 if(is_lhs_transposed) 00335 lhs->bind_sizes("M"+StrV, "K"); 00336 else 00337 lhs->bind_sizes("M", "K"+StrV); 00338 else 00339 if(is_lhs_transposed) 00340 lhs->bind_sizes("M", "K"+StrV); 00341 else 00342 lhs->bind_sizes("M"+StrV, "K"); 00343 00344 00345 if(rhs->is_row_major()) 00346 if(is_rhs_transposed) 00347 rhs->bind_sizes("K"+StrV, "N"); 00348 else 00349 rhs->bind_sizes("K", "N"+StrV); 00350 else 00351 if(is_rhs_transposed) 00352 rhs->bind_sizes("K", "N"+StrV); 00353 else 00354 rhs->bind_sizes("K"+StrV, "N"); 00355 00356 00357 } 00358 else{ 00359 for(detail::mapping_type::const_iterator it = mapping.front().begin() ; it != mapping.front().end() ; ++it){ 00360 if(detail::mapped_matrix const * p = dynamic_cast<detail::mapped_matrix const *>(it->second.get())){ 00361 p->bind_sizes("M", "N"); 00362 } 00363 } 00364 00365 lhs->bind_sizes("M", "K"); 00366 rhs->bind_sizes("K", "N"); 00367 } 00368 00369 00370 00371 std::string aligned_scalartype = assigned->scalartype(); 00372 if(vector_size_ > 1) 00373 aligned_scalartype+=utils::to_string(vector_size_); 00374 00375 00376 access_flow result_access_flow; 00377 if(assigned->is_row_major()) 00378 result_access_flow = REGULAR; 00379 else 00380 result_access_flow = STRIDED; 00381 00382 access_flow lhs_access_flow; 00383 if((lhs->is_row_major() && !is_lhs_transposed) 00384 ||(!lhs->is_row_major() && is_lhs_transposed)) 00385 lhs_access_flow = REGULAR; 00386 else 00387 lhs_access_flow = STRIDED; 00388 00389 access_flow rhs_access_flow; 00390 if((rhs->is_row_major() && !is_rhs_transposed) 00391 ||(!rhs->is_row_major() && is_rhs_transposed)) 00392 rhs_access_flow = REGULAR; 00393 else 00394 rhs_access_flow = STRIDED; 00395 00396 00397 std::string lhs_value_scalartype; 00398 if(use_lhs_shared_) 00399 lhs_value_scalartype = lhs->scalartype(); 00400 else 00401 lhs_value_scalartype = aligned_scalartype; 00402 00403 std::string rhs_value_scalartype; 00404 if(use_rhs_shared_) 00405 rhs_value_scalartype = rhs->scalartype(); 00406 else 00407 rhs_value_scalartype = aligned_scalartype; 00408 00409 00410 unsigned int ml_res = static_cast<unsigned int>(ml_), nl_res = static_cast<unsigned int>(nl_), ms_res = static_cast<unsigned int>(ms_), ns_res = static_cast<unsigned int>(ns_); 00411 unsigned int ml_lhs = static_cast<unsigned int>(ml_), cache_width_lhs = static_cast<unsigned int>(cache_width_), ms_lhs = static_cast<unsigned int>(ms_), ks_lhs = static_cast<unsigned int>(ks_); 00412 unsigned int cache_width_rhs = static_cast<unsigned int>(cache_width_), nl_rhs = static_cast<unsigned int>(nl_), ks_rhs = static_cast<unsigned int>(ks_), ns_rhs = static_cast<unsigned int>(ns_); 00413 00414 transform_block(*assigned,false,ml_res,nl_res,ms_res,ns_res,result_access_flow); 00415 transform_block(*lhs,use_lhs_shared_,ml_lhs,cache_width_lhs,ms_lhs,ks_lhs,lhs_access_flow); 00416 transform_block(*rhs,use_rhs_shared_,cache_width_rhs,nl_rhs,ks_rhs,ns_rhs,rhs_access_flow); 00417 00421 00422 00423 vcl_size_t local_lhs_size1 = ml_ ; 00424 vcl_size_t local_lhs_size2 = cache_width_ + 1; 00425 00426 vcl_size_t local_rhs_size1 = cache_width_; 00427 vcl_size_t local_rhs_size2 = nl_ + 1; 00428 00430 for(unsigned int m=0; m< ms_res; ++m) 00431 for(unsigned int n=0; n < ns_res ; ++n) 00432 stream << aligned_scalartype << " " << "res" << m << "_" << n << " = (" << aligned_scalartype << ")(0) ;" << std::endl; 00433 00435 if(use_lhs_shared_) 00436 stream << "__local " << lhs->scalartype() << " lhs_buf[" << local_lhs_size1*local_lhs_size2 << "]" << ";" << std::endl; 00437 if(use_rhs_shared_) 00438 stream << "__local " << rhs->scalartype() << " rhs_buf[" << local_rhs_size1*local_rhs_size2 << "]" << ";" << std::endl; 00439 00441 //stream << "__global " << aligned_scalartype << "* res_ptr = " << assigned->name() << " + " << assigned->offset(std::make_pair("get_global_id(0)*" + utils::to_string(ms_res), "get_global_id(1)*" + utils::to_string(ns_res))) << ";" << std::endl; 00442 00443 00445 if(use_lhs_shared_){ 00446 std::string i = "get_group_id(0)*" + utils::to_string(ml_lhs); 00447 stream << "__global " << aligned_scalartype << "* global_lhs_ptr = " << lhs->name() << " + "; 00448 if(lhs_access_flow==REGULAR) 00449 stream << "(" << i << ")" << "*" << lhs->size2(); 00450 else 00451 stream << i; 00452 stream << ";" << std::endl; 00453 } 00454 00456 else{ 00457 if(lhs_access_flow==REGULAR) 00458 for(unsigned int m=0; m<ms_lhs; ++m) 00459 stream << "__global " << aligned_scalartype << "* " << "lhs_ptr_" << m << " = " << lhs->name() << " + " 00460 << lhs->size2() << "* (" 00461 << "get_group_id(0)*" << ml_lhs << "+" << "get_local_id(0)*" << ms_lhs << "+" << m 00462 << " );" << std::endl; 00463 else 00464 for(unsigned int k=0; k<ks_lhs; ++k) 00465 stream << "__global " << aligned_scalartype<< "* " << "lhs_ptr_" << k << " = " << lhs->name() << " + " 00466 << "(" << lhs->size1() << ")*" << k 00467 << "+ " << "get_group_id(0)*" << ml_lhs << "+" << "get_local_id(0)*" << ms_lhs << ";" << std::endl; 00468 } 00469 00471 if(use_rhs_shared_){ 00472 std::string j = "get_group_id(1)*" + utils::to_string(nl_rhs); 00473 stream << "__global " << aligned_scalartype << "* global_rhs_ptr = " << rhs->name() << " + "; 00474 if(rhs_access_flow==REGULAR) 00475 stream << j; 00476 else 00477 stream << "(" << j << ")" << "*" << rhs->size1(); 00478 stream << ";" << std::endl; 00479 } 00480 00482 else{ 00483 if(rhs_access_flow==REGULAR) 00484 for(unsigned int k = 0 ; k < ks_rhs ; ++k) 00485 stream << "__global " << aligned_scalartype << "* " << "rhs_ptr_" << k << " = " << rhs->name() << " + " 00486 << "(" << k << ")" << "*" << rhs->size2() 00487 << " + " << "get_local_id(1)*" << ns_rhs << " + get_group_id(1)*" << nl_rhs 00488 << ";" << std::endl; 00489 else 00490 for(unsigned int n = 0 ; n < ns_rhs ; ++n) 00491 stream << "__global " << aligned_scalartype << "* " << "rhs_ptr_" << n << " = " << rhs->name() << " + " 00492 << "(" << "get_local_id(1)*" << ns_rhs << " + get_group_id(1)*" << nl_rhs << " + " << n << ")" << "*" << rhs->size1() 00493 << ";" << std::endl; 00494 } 00495 00496 00498 std::string block_num = helper_variable(stream,false,"unsigned int", "block_num", "K/" + utils::to_string(cache_width_)); 00499 stream << "for(unsigned int bl=0 ; bl<" << block_num << " ; ++bl){" << std::endl; 00500 stream.inc_tab(); 00501 00503 if(use_lhs_shared_){ 00504 fetch_to_local_mem(stream,"lhs_buf",local_lhs_size2,"global_lhs_ptr",ml_lhs,cache_width_lhs,*lhs,lhs_access_flow); 00505 for(unsigned int m=0; m<ms_lhs; ++m) 00506 stream << "__local " << lhs_value_scalartype << "* lhs_ptr_" << m << " = lhs_buf + " 00507 << "(" << "get_local_id(0)*" << ms_lhs << "+" << m << ")" << "*" << local_lhs_size2 00508 << ";" << std::endl; 00509 } 00510 00512 if(use_rhs_shared_){ 00513 fetch_to_local_mem(stream,"rhs_buf", local_rhs_size2, "global_rhs_ptr",cache_width_rhs,nl_rhs,*rhs,rhs_access_flow); 00514 for(unsigned int k=0; k<ks_rhs; ++k) 00515 stream << "__local " << rhs_value_scalartype << "* rhs_ptr_" << k << " = rhs_buf + " 00516 << k*local_rhs_size2 << " + " << "get_local_id(1)*" << ns_rhs 00517 << ";" << std::endl; 00518 } 00519 00520 00521 stream << " for(unsigned int bs=0 ; bs < " << cache_width_/ks_ << " ; ++bs){" << std::endl; 00522 stream.inc_tab(); 00523 00524 00525 for(unsigned int k = 0 ; k < ks_rhs ; ++k){ 00526 for(unsigned int n=0 ; n < ns_rhs ; ++n){ 00527 stream << rhs_value_scalartype << " val_rhs_" << k << "_" << n << " = " ; 00528 if(use_rhs_shared_ ) 00529 stream << "* rhs_ptr_" << k << "++"; 00530 else{ 00531 if(rhs_access_flow==REGULAR) 00532 stream << "* rhs_ptr_" << k << "++"; 00533 else 00534 stream << "* rhs_ptr_" << n << "++"; 00535 } 00536 stream << ";"; 00537 stream << std::endl; 00538 } 00539 } 00540 00541 00542 for(unsigned int k = 0 ; k < ks_lhs ; ++k){ 00543 for(unsigned int m=0 ; m < ms_lhs ; ++m){ 00544 stream << lhs_value_scalartype << " " << "val_lhs_" << m << "_" << k << " = "; 00545 if(use_lhs_shared_) 00546 stream << "* lhs_ptr_" << m << "++" ; 00547 else if(lhs_access_flow==REGULAR) 00548 stream << "* lhs_ptr_" << m << "++"; 00549 else 00550 stream << "* lhs_ptr_" << k << "++"; 00551 stream << ";"; 00552 stream << std::endl; 00553 } 00554 } 00555 00556 00557 for(unsigned int n=0 ; n < ns_res ; ++n){ 00558 for(unsigned int k = 0 ; k < ks_ ; ++k){ 00559 for(unsigned int m=0 ; m < ms_res ; ++m){ 00560 for(unsigned int a = 0; a<vector_size_; ++a){ 00561 00562 int ind_lhs_1 = m; 00563 int ind_lhs_2 = k; 00564 int ind_s_lhs = a; 00565 00566 int ind_rhs_1=k; 00567 int ind_rhs_2=n; 00568 int ind_s_rhs=a; 00569 00570 if(result_access_flow==REGULAR){ 00571 if(!use_lhs_shared_){ 00572 if(lhs_access_flow==REGULAR){ 00573 ind_s_lhs = ind_lhs_2%vector_size_; 00574 ind_lhs_2 /= vector_size_; 00575 } 00576 else{ 00577 ind_s_lhs = ind_lhs_1%vector_size_; 00578 ind_lhs_1 /= vector_size_; 00579 } 00580 } 00581 } 00582 else{ 00583 if(use_lhs_shared_){ 00584 ind_lhs_1 = ind_lhs_1*vector_size_+a; 00585 } 00586 else{ 00587 if(lhs_access_flow==REGULAR){ 00588 ind_lhs_1 = ind_lhs_1*vector_size_+a; 00589 ind_s_lhs = ind_lhs_2%vector_size_; 00590 ind_lhs_2 /= vector_size_; 00591 } 00592 } 00593 } 00594 00595 if(result_access_flow==REGULAR){ 00596 if(use_rhs_shared_){ 00597 ind_rhs_2 = ind_rhs_2*vector_size_+a; 00598 } 00599 else{ 00600 if(rhs_access_flow==STRIDED){ 00601 ind_rhs_2 = ind_rhs_2*vector_size_+a; 00602 ind_s_rhs = ind_rhs_1%vector_size_; 00603 ind_rhs_1 = ind_rhs_1/vector_size_; 00604 } 00605 else{ 00606 } 00607 } 00608 } 00609 else{ 00610 if(!use_rhs_shared_){ 00611 if(rhs_access_flow==REGULAR){ 00612 ind_s_rhs = ind_rhs_2%vector_size_; 00613 ind_rhs_2/=vector_size_; 00614 } 00615 else{ 00616 ind_s_rhs = ind_rhs_1%vector_size_; 00617 ind_rhs_1/=vector_size_; 00618 } 00619 } 00620 } 00621 00622 std::ostringstream res_oss; 00623 std::ostringstream lhs_oss; 00624 std::ostringstream rhs_oss; 00625 00626 res_oss << "res" << m << "_" << n ; 00627 if(vector_size_>1) res_oss << ".s" << a; 00628 00629 lhs_oss << "val_lhs_" << ind_lhs_1 << "_" << ind_lhs_2; 00630 if(!use_lhs_shared_ && vector_size_>1) lhs_oss << ".s" << ind_s_lhs; 00631 00632 00633 rhs_oss << "val_rhs_" << ind_rhs_1 << "_" << ind_rhs_2; 00634 if(!use_rhs_shared_ && vector_size_>1) rhs_oss << ".s" << ind_s_rhs; 00635 00636 00637 stream << res_oss.str() << "+=" << lhs_oss.str() << "*" << rhs_oss.str() << ";" << std::endl; 00638 } 00639 } 00640 } 00641 } 00642 00643 00644 if(use_rhs_shared_){ 00645 for(unsigned int k=0 ; k<ks_ ; ++k) 00646 stream << "rhs_ptr_" << k << " += " << ks_rhs*local_rhs_size2 - ns_rhs << ";" << std::endl; 00647 } 00648 else{ 00649 if(rhs_access_flow==REGULAR) 00650 for(unsigned int k=0 ; k<ks_ ; ++k) 00651 stream << "rhs_ptr_" << k << " += " << ks_rhs << "*" << rhs->size2() << " - " << ns_rhs << ";" << std::endl; 00652 } 00653 00654 if(!use_lhs_shared_){ 00655 if(lhs_access_flow==STRIDED) 00656 for(unsigned int k=0 ; k<ks_lhs ; ++k) 00657 stream << "lhs_ptr_" << k << " += " << ks_lhs << "*" << lhs->size1() << " - " << ms_lhs << ";" << std::endl; 00658 } 00659 00660 00661 00662 stream.dec_tab(); 00663 stream << "}" << std::endl; 00664 00665 if(use_lhs_shared_){ 00666 if(lhs_access_flow==REGULAR) 00667 stream << "global_lhs_ptr += " << cache_width_lhs << ";" << std::endl; 00668 else 00669 stream << "global_lhs_ptr += " << cache_width_lhs << "*" << lhs->size1() << ";" << std::endl; 00670 } 00671 00672 if(use_rhs_shared_){ 00673 if(rhs_access_flow==REGULAR) 00674 stream << "global_rhs_ptr += " << cache_width_rhs << "*" << rhs->size2() << ";" << std::endl; 00675 else 00676 stream << "global_rhs_ptr += " << cache_width_rhs << ";" << std::endl; 00677 } 00678 00679 stream.dec_tab(); 00680 stream << "}" << std::endl; 00681 00682 for(unsigned int m=0 ; m < ms_res ; ++m){ 00683 for(unsigned int n=0 ; n < ns_res ; ++n){ 00684 std::string i = "get_global_id(0)*" + utils::to_string(ms_res) + "+" + utils::to_string(m); 00685 std::string j = "get_global_id(1)*" + utils::to_string(ns_res) + "+" + utils::to_string(n); 00686 prod->access_name("res"+utils::to_string(m)+"_"+utils::to_string(n)); 00687 std::string str; 00688 detail::traverse(statements.front().first, statements.front().second, detail::expression_generation_traversal(std::make_pair(i, j), -1, str, mapping[0]), false); 00689 stream << str << ";" << std::endl; 00690 } 00691 } 00692 00693 00694 } 00695 00696 private: 00697 vcl_size_t local_size1_; 00698 vcl_size_t local_size2_; 00699 vcl_size_t cache_width_; 00700 00701 vcl_size_t ml_; 00702 vcl_size_t nl_; 00703 00704 vcl_size_t ms_; 00705 vcl_size_t ks_; 00706 vcl_size_t ns_; 00707 00708 bool use_lhs_shared_; 00709 bool use_rhs_shared_; 00710 }; 00711 00712 } 00713 00714 } 00715 00716 #endif