ViennaCL - The Vienna Computing Library  1.5.1
viennacl/generator/scalar_reduction.hpp
Go to the documentation of this file.
00001 #ifndef VIENNACL_GENERATOR_GENERATE_SCALAR_REDUCTION_HPP
00002 #define VIENNACL_GENERATOR_GENERATE_SCALAR_REDUCTION_HPP
00003 
00004 /* =========================================================================
00005    Copyright (c) 2010-2014, Institute for Microelectronics,
00006                             Institute for Analysis and Scientific Computing,
00007                             TU Wien.
00008    Portions of this software are copyright by UChicago Argonne, LLC.
00009 
00010                             -----------------
00011                   ViennaCL - The Vienna Computing Library
00012                             -----------------
00013 
00014    Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
00015 
00016    (A list of authors and contributors can be found in the PDF manual)
00017 
00018    License:         MIT (X11), see file LICENSE in the base directory
00019 ============================================================================= */
00020 
00021 
00027 #include <vector>
00028 
00029 #include "viennacl/backend/opencl.hpp"
00030 
00031 #include "viennacl/scheduler/forwards.h"
00032 
00033 #include "viennacl/generator/helpers.hpp"
00034 #include "viennacl/generator/utils.hpp"
00035 
00036 #include "viennacl/generator/profile_base.hpp"
00037 
00038 #include "viennacl/tools/tools.hpp"
00039 
00040 namespace viennacl{
00041 
00042   namespace generator{
00043 
00045     class scalar_reduction : public profile_base{
00046       private:
00047         typedef std::vector<std::pair<const char *, viennacl::ocl::handle<cl_mem> > > temporaries_type;
00048 
00049         static void fill_scalartypes(statements_type statements, std::vector<const char *> & res){
00050           res.reserve(statements.size());
00051           for(statements_type::const_iterator it = statements.begin() ; it != statements.end() ; ++it){
00052             if (it->second.lhs.type_family == scheduler::SCALAR_TYPE_FAMILY)
00053             {
00054               switch(it->second.lhs.numeric_type){
00055                 case scheduler::FLOAT_TYPE:
00056                   res.push_back("float");
00057                   break;
00058                 case scheduler::DOUBLE_TYPE:
00059                   res.push_back("double");
00060                   break;
00061                 default:
00062                   res.push_back("");
00063                   break;
00064               }
00065             }
00066             else
00067             {
00068               res.push_back("");
00069             }
00070           }
00071         }
00072 
00073       public:
00074 
00075         vcl_size_t lmem_used(vcl_size_t scalartype_size) const {
00076           return local_size_1_*scalartype_size;
00077         }
00078 
00079         void init_temporaries(statements_type const & statements) const {
00080           if(temporaries_.empty()){
00081             //set temporary buffer argument
00082             for(statements_type::const_iterator it = statements.begin() ; it != statements.end() ; ++it){
00083               scheduler::statement::container_type const & array = it->first.array();
00084               vcl_size_t size_of_scalartype;
00085               const char * scalartype_name;
00086               if (array[0].lhs.type_family != scheduler::SCALAR_TYPE_FAMILY) throw "not implemented";
00087               switch(array[0].lhs.numeric_type){
00088                 case scheduler::FLOAT_TYPE: scalartype_name = "float"; size_of_scalartype = sizeof(float); break;
00089                 case scheduler::DOUBLE_TYPE: scalartype_name = "double"; size_of_scalartype = sizeof(double); break;
00090                 default: throw "not implemented";
00091               }
00092               for(scheduler::statement::container_type::const_iterator iit = array.begin() ; iit != array.end() ; ++iit){
00093                 if(iit->op.type==scheduler::OPERATION_BINARY_INNER_PROD_TYPE){
00094                   temporaries_.push_back(std::make_pair(scalartype_name, viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, static_cast<unsigned int>(num_groups_*size_of_scalartype))));
00095                 }
00096               }
00097             }
00098           }
00099         }
00100 
00101         void set_size_argument(viennacl::scheduler::statement const & s, viennacl::scheduler::statement_node const & /*root_node*/, unsigned int & n_arg, viennacl::ocl::kernel & k) const {
00102           scheduler::statement::container_type exprs = s.array();
00103           for(scheduler::statement::container_type::iterator it = exprs.begin() ; it != exprs.end() ; ++it){
00104             if(it->op.type==scheduler::OPERATION_BINARY_INNER_PROD_TYPE){
00105               //set size argument
00106               scheduler::statement_node const * current_node = &(*it);
00107 
00108               vcl_size_t vector_size = 0;
00109               //The LHS of the prod is a vector
00110               if(current_node->lhs.type_family==scheduler::VECTOR_TYPE_FAMILY)
00111               {
00112                 vector_size = utils::call_on_vector(current_node->lhs, utils::internal_size_fun());
00113               }
00114               else{
00115                 //The LHS of the prod is a vector expression
00116                 current_node = &exprs[current_node->lhs.node_index];
00117                 if(current_node->lhs.type_family==scheduler::VECTOR_TYPE_FAMILY)
00118                 {
00119                   vector_size = cl_uint(utils::call_on_vector(current_node->lhs, utils::internal_size_fun()));
00120                 }
00121                 else if(current_node->rhs.type_family==scheduler::VECTOR_TYPE_FAMILY)
00122                 {
00123                   vector_size = cl_uint(utils::call_on_vector(current_node->lhs, utils::internal_size_fun()));
00124                 }
00125                 else{
00126                   assert(false && bool("unexpected expression tree"));
00127                 }
00128               }
00129               k.arg(n_arg++, cl_uint(vector_size/vector_size_));
00130             }
00131           }
00132         }
00133 
00134       public:
00136         scalar_reduction(unsigned int vectorization, unsigned int local_size, unsigned int num_groups, unsigned int decomposition) : profile_base(vectorization, local_size, 1, 2), num_groups_(num_groups), decomposition_(decomposition){ }
00137 
00138 
00139         static std::string csv_format() {
00140           return "Vec,LSize,NumGroups,GlobalDecomposition";
00141         }
00142 
00143         std::string csv_representation() const{
00144           std::ostringstream oss;
00145           oss << vector_size_
00146                  << "," << local_size_1_
00147                  << "," << num_groups_
00148                  << "," << decomposition_;
00149           return oss.str();
00150         }
00151 
00152         unsigned int num_groups() const { return num_groups_; }
00153 
00154 
00155         unsigned int decomposition() const { return decomposition_; }
00156 
00157 
00158         void configure_range_enqueue_arguments(vcl_size_t kernel_id, statements_type  const & statements, viennacl::ocl::kernel & k, unsigned int & n_arg)  const{
00159 
00160           //create temporaries
00161           init_temporaries(statements);
00162 
00163           //configure ND range
00164           if(kernel_id==0){
00165             configure_local_sizes(k, 0);
00166 
00167             vcl_size_t gsize = local_size_1_*num_groups_;
00168             k.global_work_size(0,gsize);
00169             k.global_work_size(1,1);
00170           }
00171           else{
00172             configure_local_sizes(k, 1);
00173 
00174             k.global_work_size(0,local_size_1_);
00175             k.global_work_size(1,1);
00176           }
00177 
00178           //set arguments
00179           set_size_argument(statements.front().first, statements.front().second, n_arg, k);
00180           for(temporaries_type::iterator it = temporaries_.begin() ; it != temporaries_.end() ; ++it){
00181             k.arg(n_arg++, it->second);
00182           }
00183         }
00184 
00185         void kernel_arguments(statements_type  const & statements, std::string & arguments_string) const{
00186           init_temporaries(statements);
00187           arguments_string += detail::generate_value_kernel_argument("unsigned int", "N");
00188           for(temporaries_type::iterator it = temporaries_.begin() ; it != temporaries_.end() ; ++it){
00189             arguments_string += detail::generate_pointer_kernel_argument("__global", it->first, "temp" + utils::to_string(std::distance(temporaries_.begin(), it)));
00190           }
00191         }
00192 
00193       private:
00194 
00195         void core_0(utils::kernel_generation_stream& stream, std::vector<detail::mapped_scalar_reduction*> exprs, std::vector<const char *> const & scalartypes, statements_type const & /*statements*/, std::vector<detail::mapping_type> const & /*mapping*/) const {
00196 
00197           stream << "unsigned int lid = get_local_id(0);" << std::endl;
00198 
00199           for(vcl_size_t k = 0 ; k < exprs.size() ; ++k)
00200             stream << scalartypes[k] << " sum" << k << " = 0;" << std::endl;
00201 
00202           if(decomposition_){
00203             stream << "for(unsigned int i = get_global_id(0) ; i < N ; i += get_global_size(0)){" << std::endl;
00204           }
00205           else{
00206             stream << "unsigned int chunk_size = (N + get_num_groups(0)-1)/get_num_groups(0);" << std::endl;
00207             stream << "unsigned int chunk_start = get_group_id(0)*chunk_size;" << std::endl;
00208             stream << "unsigned int chunk_end = min(chunk_start+chunk_size, N);" << std::endl;
00209             stream << "for(unsigned int i = chunk_start + get_local_id(0) ; i < chunk_end ; i += get_local_size(0)){" << std::endl;
00210           }
00211           stream.inc_tab();
00212 
00213           //Fetch vector entry
00214           std::set<std::string>  fetched;
00215 
00216           for(std::vector<detail::mapped_scalar_reduction*>::iterator it = exprs.begin() ; it != exprs.end() ; ++it){
00217             viennacl::scheduler::statement const & statement = (*it)->statement();
00218             viennacl::scheduler::statement_node const & root_node = (*it)->root_node();
00219             detail::fetch_all_lhs(fetched,statement,root_node, std::make_pair("i", "0"),vector_size_,stream,(*it)->mapping());
00220             detail::fetch_all_rhs(fetched,statement,root_node, std::make_pair("i", "0"),vector_size_,stream,(*it)->mapping());
00221           }
00222 
00223 
00224           //Update sums;
00225           for(std::vector<detail::mapped_scalar_reduction*>::iterator it = exprs.begin() ; it != exprs.end() ; ++it){
00226             viennacl::scheduler::statement const & statement = (*it)->statement();
00227             viennacl::scheduler::statement_node const & root_node = (*it)->root_node();
00228             if(vector_size_ > 1){
00229               for(unsigned int a = 0 ; a < vector_size_ ; ++a){
00230                 std::string str;
00231                 detail::generate_all_lhs(statement,root_node,std::make_pair("i","0"),a,str,(*it)->mapping());
00232                 str += "*";
00233                 detail::generate_all_rhs(statement,root_node,std::make_pair("i","0"),a,str,(*it)->mapping());
00234                 stream << " sum" << std::distance(exprs.begin(),it) << " += "  << str << ";" << std::endl;
00235               }
00236             }
00237             else{
00238               std::string str;
00239               detail::generate_all_lhs(statement,root_node,std::make_pair("i","0"),-1,str,(*it)->mapping());
00240               str += "*";
00241               detail::generate_all_rhs(statement,root_node,std::make_pair("i","0"),-1,str,(*it)->mapping());
00242               stream << " sum" << std::distance(exprs.begin(),it) << " += "  << str << ";" << std::endl;
00243             }
00244           }
00245 
00246 
00247           stream.dec_tab();
00248           stream << "}" << std::endl;
00249           //Declare and fill local memory
00250           for(vcl_size_t k = 0 ; k < exprs.size() ; ++k)
00251             stream << "__local " << scalartypes[k] << " buf" << k << "[" << local_size_1_ << "];" << std::endl;
00252 
00253           for(vcl_size_t k = 0 ; k < exprs.size() ; ++k)
00254             stream << "buf" << k << "[lid] = sum" << k << ";" << std::endl;
00255 
00256           //Reduce local memory
00257           for(vcl_size_t stride = local_size_1_/2 ; stride>1 ; stride /=2){
00258             stream << "barrier(CLK_LOCAL_MEM_FENCE); " << std::endl;
00259             stream << "if(lid < " << stride << "){" << std::endl;
00260             stream.inc_tab();
00261             for(vcl_size_t k = 0 ; k < exprs.size() ; ++k){
00262               stream << "buf" << k << "[lid] += buf" << k << "[lid + " << stride << "];" << std::endl;
00263             }
00264             stream.dec_tab();
00265             stream << "}" << std::endl;
00266           }
00267 
00268           //Last reduction and write back to temporary buffer
00269           stream << "barrier(CLK_LOCAL_MEM_FENCE); " << std::endl;
00270           stream << "if(lid==0){" << std::endl;
00271           stream.inc_tab();
00272           for(vcl_size_t k = 0 ; k < exprs.size() ; ++k)
00273             stream << "buf" << k << "[0] += buf" << k << "[1];" << std::endl;
00274 
00275           for(vcl_size_t k = 0 ; k < exprs.size() ; ++k)
00276             stream << "temp"<< k << "[get_group_id(0)] = buf" << k << "[0];" << std::endl;
00277 
00278           stream.dec_tab();
00279           stream << "}" << std::endl;
00280         }
00281 
00282 
00283         void core_1(utils::kernel_generation_stream& stream, std::vector<detail::mapped_scalar_reduction*> exprs, std::vector<const char *> scalartypes, statements_type const & statements, std::vector<detail::mapping_type> const & mapping) const {
00284           stream << "unsigned int lid = get_local_id(0);" << std::endl;
00285 
00286           for(vcl_size_t k = 0 ; k < exprs.size() ; ++k)
00287             stream << "__local " << scalartypes[k] << " buf" << k << "[" << local_size_1_ << "];" << std::endl;
00288 
00289           for(vcl_size_t k = 0 ; k < exprs.size() ; ++k)
00290             stream << scalartypes[0] << " sum" << k << " = 0;" << std::endl;
00291 
00292           stream << "for(unsigned int i = lid ; i < " << num_groups_ << " ; i += get_local_size(0)){" << std::endl;
00293           stream.inc_tab();
00294           for(vcl_size_t k = 0 ; k < exprs.size() ; ++k)
00295             stream << "sum" << k << " += temp" << k << "[i];" << std::endl;
00296           stream.dec_tab();
00297           stream << "}" << std::endl;
00298 
00299           for(vcl_size_t k = 0 ; k < exprs.size() ; ++k)
00300             stream << "buf" << k << "[lid] = sum" << k << ";" << std::endl;
00301 
00302           //Reduce local memory
00303           for(vcl_size_t stride = local_size_1_/2 ; stride>1 ; stride /=2){
00304             stream << "barrier(CLK_LOCAL_MEM_FENCE); " << std::endl;
00305             stream << "if(lid < " << stride << "){" << std::endl;
00306             stream.inc_tab();
00307             for(vcl_size_t k = 0 ; k < exprs.size() ; ++k){
00308               stream << "buf" << k << "[lid] += buf" << k << "[lid + " << stride << "];" << std::endl;
00309             }
00310             stream.dec_tab();
00311             stream << "}" << std::endl;
00312           }
00313 
00314           stream << "barrier(CLK_LOCAL_MEM_FENCE); " << std::endl;
00315           stream << "if(lid==0){" << std::endl;
00316           stream.inc_tab();
00317           for(vcl_size_t k = 0 ; k < exprs.size() ; ++k){
00318             stream << "buf" << k << "[0] += buf" << k << "[1];" << std::endl;
00319             exprs[k]->access_name("buf"+utils::to_string(k)+"[0]");
00320           }
00321 
00322           vcl_size_t i = 0;
00323           for(statements_type::const_iterator it = statements.begin() ; it != statements.end() ; ++it){
00324             std::string str;
00325             detail::traverse(it->first, it->second, detail::expression_generation_traversal(std::make_pair("0", "0"), -1, str, mapping[i++]), false);
00326             stream << str << ";" << std::endl;
00327           }
00328 
00329           stream.dec_tab();
00330           stream << "}" << std::endl;
00331         }
00332 
00333         void core(vcl_size_t kernel_id, utils::kernel_generation_stream& stream, statements_type const & statements, std::vector<detail::mapping_type> const & mapping) const {
00334           std::vector<detail::mapped_scalar_reduction*> exprs;
00335           for(std::vector<detail::mapping_type>::const_iterator it = mapping.begin() ; it != mapping.end() ; ++it)
00336             for(detail::mapping_type::const_iterator iit = it->begin() ; iit != it->end() ; ++iit)
00337               if(detail::mapped_scalar_reduction * p = dynamic_cast<detail::mapped_scalar_reduction*>(iit->second.get()))
00338                 exprs.push_back(p);
00339 
00340           std::vector<const char *> scalartypes;
00341           fill_scalartypes(statements, scalartypes);
00342 
00343           if(kernel_id==0){
00344             core_0(stream,exprs,scalartypes,statements,mapping);
00345           }
00346           else{
00347             core_1(stream,exprs,scalartypes,statements,mapping);
00348           }
00349         }
00350 
00351       private:
00352         unsigned int num_groups_;
00353         unsigned int decomposition_;
00354         mutable temporaries_type temporaries_;
00355     };
00356 
00357 
00358   }
00359 
00360 }
00361 
00362 #endif