ViennaCL - The Vienna Computing Library  1.5.1
viennacl/linalg/opencl/kernels/spai.hpp
Go to the documentation of this file.
00001 #ifndef VIENNACL_LINALG_OPENCL_KERNELS_SPAI_HPP
00002 #define VIENNACL_LINALG_OPENCL_KERNELS_SPAI_HPP
00003 
00004 #include "viennacl/tools/tools.hpp"
00005 #include "viennacl/ocl/kernel.hpp"
00006 #include "viennacl/ocl/platform.hpp"
00007 #include "viennacl/ocl/utils.hpp"
00008 
00011 namespace viennacl
00012 {
00013   namespace linalg
00014   {
00015     namespace opencl
00016     {
00017       namespace kernels
00018       {
00019 
00021 
00022         template <typename StringType>
00023         void generate_spai_assemble_blocks(StringType & source, std::string const & numeric_string)
00024         {
00025           source.append("float get_element(__global const unsigned int * row_indices, \n");
00026           source.append("           __global const unsigned int * column_indices, \n");
00027           source.append("           __global const "); source.append(numeric_string); source.append(" * elements, \n");
00028           source.append("           unsigned int row, \n");
00029           source.append("           unsigned int col) \n");
00030           source.append("{ \n");
00031           source.append("  unsigned int row_end = row_indices[row+1]; \n");
00032           source.append("  for(unsigned int i = row_indices[row]; i < row_end; ++i){ \n");
00033           source.append("    if(column_indices[i] == col) \n");
00034           source.append("      return elements[i]; \n");
00035           source.append("    if(column_indices[i] > col) \n");
00036           source.append("      return 0; \n");
00037           source.append("  } \n");
00038           source.append("  return 0; \n");
00039           source.append("} \n");
00040 
00041           source.append("void block_assembly(__global const unsigned int * row_indices, \n");
00042           source.append("          __global const unsigned int * column_indices, \n");
00043           source.append("          __global const "); source.append(numeric_string); source.append(" * elements, \n");
00044           source.append("          __global const unsigned int * matrix_dimensions, \n");
00045           source.append("          __global const unsigned int * set_I, \n");
00046           source.append("          __global const unsigned int * set_J, \n");
00047           source.append("          unsigned int matrix_ind, \n");
00048           source.append("          __global "); source.append(numeric_string); source.append(" * com_A_I_J) \n");
00049           source.append("{ \n");
00050           source.append("  unsigned int row_n = matrix_dimensions[2*matrix_ind]; \n");
00051           source.append("  unsigned int col_n = matrix_dimensions[2*matrix_ind + 1]; \n");
00052 
00053           source.append("  for(unsigned int i = 0; i < col_n; ++i){ \n");
00054                   //start row index
00055           source.append("        for(unsigned int j = 0; j < row_n; j++){ \n");
00056           source.append("          com_A_I_J[ i*row_n + j] = get_element(row_indices, column_indices, elements, set_I[j], set_J[i]); \n");
00057           source.append("        } \n");
00058           source.append("      } \n");
00059           source.append("} \n");
00060 
00061           source.append("__kernel void assemble_blocks( \n");
00062           source.append("          __global const unsigned int * row_indices, \n");
00063           source.append("          __global const unsigned int * column_indices, \n");
00064           source.append("          __global const "); source.append(numeric_string); source.append(" * elements, \n");
00065           source.append("          __global const unsigned int * set_I, \n");
00066           source.append("        __global const unsigned int * set_J, \n");
00067           source.append("      __global const unsigned int * i_ind, \n");
00068           source.append("      __global const unsigned int * j_ind, \n");
00069           source.append("        __global const unsigned int * block_ind, \n");
00070           source.append("        __global const unsigned int * matrix_dimensions, \n");
00071           source.append("      __global "); source.append(numeric_string); source.append(" * com_A_I_J, \n");
00072           source.append("      __global unsigned int * g_is_update, \n");
00073           source.append("                   unsigned int  block_elems_num) \n");
00074           source.append("{ \n");
00075           source.append("    for(unsigned int i  = get_global_id(0); i < block_elems_num; i += get_global_size(0)){ \n");
00076           source.append("        if((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){ \n");
00077           source.append("            block_assembly(row_indices, column_indices, elements, matrix_dimensions, set_I + i_ind[i], set_J + j_ind[i], i, com_A_I_J + block_ind[i]); \n");
00078           source.append("        } \n");
00079           source.append("    } \n");
00080           source.append("  } \n");
00081         }
00082 
00083         template <typename StringType>
00084         void generate_spai_block_bv_assembly(StringType & source, std::string const & numeric_string)
00085         {
00086           source.append("  void assemble_bv(__global "); source.append(numeric_string); source.append(" * g_bv_r, __global "); source.append(numeric_string); source.append(" * g_bv, unsigned int col_n){ \n");
00087           source.append("    for(unsigned int i = 0; i < col_n; ++i){ \n");
00088           source.append("      g_bv_r[i] = g_bv[ i]; \n");
00089           source.append("    } \n");
00090           source.append("  } \n");
00091 
00092           source.append("  void assemble_bv_block(__global "); source.append(numeric_string); source.append(" * g_bv_r, __global "); source.append(numeric_string); source.append(" * g_bv, unsigned int col_n, \n");
00093           source.append("               __global "); source.append(numeric_string); source.append(" * g_bv_u, unsigned int col_n_u) \n");
00094           source.append("  { \n");
00095           source.append("    assemble_bv(g_bv_r, g_bv, col_n); \n");
00096           source.append("    assemble_bv(g_bv_r + col_n, g_bv_u, col_n_u); \n");
00097           source.append("  } \n");
00098 
00099           source.append("  __kernel void block_bv_assembly(__global "); source.append(numeric_string); source.append(" * g_bv, \n");
00100           source.append("              __global unsigned int * start_bv_ind, \n");
00101           source.append("              __global unsigned int * matrix_dimensions, \n");
00102           source.append("              __global "); source.append(numeric_string); source.append(" * g_bv_u, \n");
00103           source.append("              __global unsigned int * start_bv_u_ind, \n");
00104           source.append("              __global unsigned int * matrix_dimensions_u, \n");
00105           source.append("              __global "); source.append(numeric_string); source.append(" * g_bv_r, \n");
00106           source.append("              __global unsigned int * start_bv_r_ind, \n");
00107           source.append("              __global unsigned int * matrix_dimensions_r, \n");
00108           source.append("              __global unsigned int * g_is_update, \n");
00109           source.append("              //__local  "); source.append(numeric_string); source.append(" * local_gb, \n");
00110           source.append("              unsigned int  block_elems_num) \n");
00111           source.append("  { \n");
00112           source.append("    for(unsigned int i  = get_global_id(0); i < block_elems_num; i += get_global_size(0)){ \n");
00113           source.append("      if((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){ \n");
00114           source.append("        assemble_bv_block(g_bv_r + start_bv_r_ind[i], g_bv + start_bv_ind[i], matrix_dimensions[2*i + 1], g_bv_u + start_bv_u_ind[i], matrix_dimensions_u[2*i + 1]); \n");
00115           source.append("      } \n");
00116           source.append("    } \n");
00117           source.append("  } \n");
00118         }
00119 
00120         template <typename StringType>
00121         void generate_spai_block_least_squares(StringType & source, std::string const & numeric_string)
00122         {
00123           source.append("void custom_dot_prod_ls(__global "); source.append(numeric_string); source.append(" * A, unsigned int row_n, __global "); source.append(numeric_string); source.append(" * v, unsigned int ind, "); source.append(numeric_string); source.append(" *res){ \n");
00124           source.append("  *res = 0.0; \n");
00125           source.append("  for(unsigned int j = ind; j < row_n; ++j){ \n");
00126           source.append("    if(j == ind){ \n");
00127           source.append("      *res += v[ j]; \n");
00128           source.append("    }else{ \n");
00129           source.append("      *res += A[ j + ind*row_n]*v[ j]; \n");
00130           source.append("    } \n");
00131           source.append("  } \n");
00132           source.append("} \n");
00133 
00134           source.append("void backwardSolve(__global "); source.append(numeric_string); source.append(" * R,  unsigned int row_n, unsigned int col_n, __global "); source.append(numeric_string); source.append(" * y, __global "); source.append(numeric_string); source.append(" * x){ \n");
00135           source.append("  for (int i = col_n-1; i >= 0 ; i--) { \n");
00136           source.append("    x[ i] = y[ i]; \n");
00137           source.append("    for (int j = i+1; j < col_n; ++j) { \n");
00138           source.append("      x[ i] -= R[ i + j*row_n]*x[ j]; \n");
00139           source.append("    } \n");
00140           source.append("    x[i] /= R[ i + i*row_n]; \n");
00141           source.append("  } \n");
00142           source.append("} \n");
00143 
00144 
00145           source.append("void apply_q_trans_vec_ls(__global "); source.append(numeric_string); source.append(" * R, unsigned int row_n, unsigned int col_n, __global const "); source.append(numeric_string); source.append(" * b_v,  __global "); source.append(numeric_string); source.append(" * y){ \n");
00146           source.append("            "); source.append(numeric_string); source.append(" inn_prod = 0; \n");
00147           source.append("            for(unsigned int i = 0; i < col_n; ++i){ \n");
00148           source.append("                custom_dot_prod_ls(R, row_n, y, i, &inn_prod); \n");
00149           source.append("                for(unsigned int j = i; j < row_n; ++j){ \n");
00150           source.append("                    if(i == j){ \n");
00151           source.append("                        y[ j] -= b_v[ i]*inn_prod; \n");
00152           source.append("                    } \n");
00153           source.append("                    else{ \n");
00154           source.append("                        y[j] -= b_v[ i]*inn_prod*R[ j +i*row_n]; \n");
00155           source.append("                    } \n");
00156           source.append("                } \n");
00157           source.append("            } \n");
00158           source.append("        } \n");
00159 
00160           source.append("void ls(__global "); source.append(numeric_string); source.append(" * R, unsigned int row_n, unsigned int col_n, __global "); source.append(numeric_string); source.append(" * b_v, __global "); source.append(numeric_string); source.append(" * m_v, __global "); source.append(numeric_string); source.append(" * y_v){ \n");
00161           source.append("  apply_q_trans_vec_ls(R, row_n, col_n, b_v, y_v); \n");
00162           source.append("  //m_new - is m_v now \n");
00163           source.append("  backwardSolve(R, row_n, col_n, y_v, m_v); \n");
00164           source.append("} \n");
00165 
00166           source.append("__kernel void block_least_squares( \n");
00167           source.append("      __global "); source.append(numeric_string); source.append(" * global_R, \n");
00168           source.append("      __global unsigned int * block_ind, \n");
00169           source.append("      __global "); source.append(numeric_string); source.append(" * b_v, \n");
00170           source.append("      __global unsigned int * start_bv_inds, \n");
00171           source.append("      __global "); source.append(numeric_string); source.append(" * m_v, \n");
00172           source.append("      __global "); source.append(numeric_string); source.append(" * y_v, \n");
00173           source.append("      __global unsigned int * start_y_inds, \n");
00174           source.append("      __global unsigned int * matrix_dimensions, \n");
00175           source.append("      __global unsigned int * g_is_update, \n");
00176           source.append("      unsigned int  block_elems_num) \n");
00177           source.append("{ \n");
00178           source.append("    for(unsigned int i  = get_global_id(0); i < block_elems_num; i += get_global_size(0)){ \n");
00179           source.append("        if((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){ \n");
00180           source.append("            ls(global_R + block_ind[i], matrix_dimensions[2*i], matrix_dimensions[2*i + 1], b_v +start_bv_inds[i], m_v + start_bv_inds[i], y_v + start_y_inds[i] ); \n");
00181           source.append("        } \n");
00182           source.append("    } \n");
00183           source.append("} \n");
00184         }
00185 
00186         template <typename StringType>
00187         void generate_spai_block_q_mult(StringType & source, std::string const & numeric_string)
00188         {
00189           source.append("void custom_dot_prod(__global "); source.append(numeric_string); source.append(" * A, unsigned int row_n, __local "); source.append(numeric_string); source.append(" * v, unsigned int ind, "); source.append(numeric_string); source.append(" *res){ \n");
00190           source.append("  *res = 0.0; \n");
00191           source.append("  for(unsigned int j = ind; j < row_n; ++j){ \n");
00192           source.append("    if(j == ind){ \n");
00193           source.append("      *res += v[j]; \n");
00194           source.append("    }else{ \n");
00195           source.append("      *res += A[j + ind*row_n]*v[j]; \n");
00196           source.append("    } \n");
00197           source.append("  } \n");
00198           source.append("} \n");
00199 
00200           source.append("void apply_q_trans_vec(__global "); source.append(numeric_string); source.append(" * R, unsigned int row_n, unsigned int col_n, __global "); source.append(numeric_string); source.append(" * b_v, __local "); source.append(numeric_string); source.append(" * y){ \n");
00201           source.append("  "); source.append(numeric_string); source.append(" inn_prod = 0; \n");
00202           source.append("  for(unsigned int i = 0; i < col_n; ++i){ \n");
00203           source.append("    custom_dot_prod(R, row_n, y, i, &inn_prod); \n");
00204           source.append("    for(unsigned int j = i; j < row_n; ++j){ \n");
00205           source.append("      if(i == j){ \n");
00206           source.append("        y[j] -= b_v[ i]*inn_prod; \n");
00207           source.append("      } \n");
00208           source.append("      else{ \n");
00209           source.append("        y[j] -= b_v[ i]*inn_prod*R[ j + i*row_n]; \n");
00210           source.append("      } \n");
00211           source.append("    } \n");
00212           source.append("  } \n");
00213           source.append("} \n");
00214 
00215           source.append("void q_mult(__global "); source.append(numeric_string); source.append(" * R, unsigned int row_n, unsigned int col_n, __global "); source.append(numeric_string); source.append(" * b_v, __local "); source.append(numeric_string); source.append(" * R_u, unsigned int col_n_u){ \n");
00216           source.append("        for(unsigned int i = get_local_id(0); i < col_n_u; i+= get_local_size(0)){ \n");
00217           source.append("          apply_q_trans_vec(R, row_n, col_n, b_v, R_u + row_n*i); \n");
00218           source.append("        } \n");
00219           source.append("} \n");
00220 
00221           source.append("void matrix_from_global_to_local(__global "); source.append(numeric_string); source.append("* g_M, __local "); source.append(numeric_string); source.append("* l_M, unsigned int row_n, unsigned int col_n, unsigned int mat_start_ind){ \n");
00222           source.append("  for(unsigned int i = get_local_id(0); i < col_n; i+= get_local_size(0)){ \n");
00223           source.append("    for(unsigned int j = 0; j < row_n; ++j){ \n");
00224           source.append("      l_M[i*row_n + j] = g_M[mat_start_ind + i*row_n + j]; \n");
00225           source.append("    } \n");
00226           source.append("  } \n");
00227           source.append("} \n");
00228 
00229           source.append("void matrix_from_local_to_global(__global "); source.append(numeric_string); source.append("* g_M, __local "); source.append(numeric_string); source.append("* l_M, unsigned int row_n, unsigned int col_n, unsigned int mat_start_ind){ \n");
00230           source.append("  for(unsigned int i = get_local_id(0); i < col_n; i+= get_local_size(0)){ \n");
00231           source.append("    for(unsigned int j = 0; j < row_n; ++j){ \n");
00232           source.append("      g_M[mat_start_ind + i*row_n + j] = l_M[i*row_n + j]; \n");
00233           source.append("    } \n");
00234           source.append("  } \n");
00235           source.append("} \n");
00236 
00237           source.append("__kernel void block_q_mult(__global "); source.append(numeric_string); source.append(" * global_R, \n");
00238           source.append("  __global unsigned int * block_ind, \n");
00239           source.append("  __global "); source.append(numeric_string); source.append(" * global_R_u, \n");
00240           source.append("  __global unsigned int *block_ind_u, \n");
00241           source.append("  __global "); source.append(numeric_string); source.append(" * b_v, \n");
00242           source.append("  __global unsigned int * start_bv_inds, \n");
00243           source.append("  __global unsigned int * matrix_dimensions, \n");
00244           source.append("  __global unsigned int * matrix_dimensions_u, \n");
00245           source.append("  __global unsigned int * g_is_update, \n");
00246           source.append("  __local  "); source.append(numeric_string); source.append(" * local_R_u, \n");
00247           source.append("    unsigned int  block_elems_num){ \n");
00248           source.append("    for(unsigned int i  = get_group_id(0); i < block_elems_num; i += get_num_groups(0)){ \n");
00249           source.append("          if((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && (g_is_update[i] > 0)){ \n");
00250                   //matrix_from_global_to_local(R, local_buff_R, matrix_dimensions[2*i], matrix_dimensions[2*i + 1], start_matrix_inds[i]); \n");
00251           source.append("        matrix_from_global_to_local(global_R_u, local_R_u, matrix_dimensions_u[2*i], matrix_dimensions_u[2*i+ 1], block_ind_u[i]); \n");
00252           source.append("        barrier(CLK_LOCAL_MEM_FENCE); \n");
00253           source.append("              q_mult(global_R + block_ind[i], matrix_dimensions[2*i], matrix_dimensions[2*i + 1], b_v + start_bv_inds[i], local_R_u, \n");
00254           source.append("             matrix_dimensions_u[2*i + 1]); \n");
00255           source.append("        barrier(CLK_LOCAL_MEM_FENCE); \n");
00256           source.append("              matrix_from_local_to_global(global_R_u, local_R_u, matrix_dimensions_u[2*i], matrix_dimensions_u[2*i + 1], block_ind_u[i]); \n");
00257           source.append("          } \n");
00258           source.append("      } \n");
00259           source.append("} \n");
00260         }
00261 
00262         template <typename StringType>
00263         void generate_spai_block_qr(StringType & source, std::string const & numeric_string)
00264         {
00265           source.append("void dot_prod(__local const "); source.append(numeric_string); source.append("* A, unsigned int n, unsigned int beg_ind, "); source.append(numeric_string); source.append("* res){ \n");
00266           source.append("    *res = 0; \n");
00267           source.append("    for(unsigned int i = beg_ind; i < n; ++i){ \n");
00268           source.append("        *res += A[(beg_ind-1)*n + i]*A[(beg_ind-1)*n + i]; \n");
00269           source.append("    } \n");
00270           source.append("} \n");
00271 
00272           source.append("void vector_div(__global "); source.append(numeric_string); source.append("* v, unsigned int beg_ind, "); source.append(numeric_string); source.append(" b, unsigned int n){ \n");
00273           source.append("    for(unsigned int i = beg_ind; i < n; ++i){ \n");
00274           source.append("        v[i] /= b; \n");
00275           source.append("    } \n");
00276           source.append("} \n");
00277 
00278           source.append("void copy_vector(__local const "); source.append(numeric_string); source.append("* A, __global "); source.append(numeric_string); source.append("* v, const unsigned int beg_ind, const unsigned int n){ \n");
00279           source.append("    for(unsigned int i = beg_ind; i < n; ++i){ \n");
00280           source.append("        v[i] = A[(beg_ind-1)*n + i]; \n");
00281           source.append("    } \n");
00282           source.append("} \n");
00283 
00284 
00285           source.append("void householder_vector(__local const "); source.append(numeric_string); source.append("* A, unsigned int j, unsigned int n, __global "); source.append(numeric_string); source.append("* v, __global "); source.append(numeric_string); source.append("* b){ \n");
00286           source.append("    "); source.append(numeric_string); source.append(" sg; \n");
00287           source.append("    dot_prod(A, n, j+1, &sg); \n");
00288           source.append("    copy_vector(A, v, j+1, n); \n");
00289           source.append("    "); source.append(numeric_string); source.append(" mu; \n");
00290           source.append("    v[j] = 1.0; \n");
00291               //print_contigious_vector(v, v_start_ind, n);
00292           source.append("    if(sg == 0){ \n");
00293           source.append("        *b = 0; \n");
00294           source.append("    } \n");
00295           source.append("    else{ \n");
00296           source.append("        mu = sqrt(A[j*n + j]*A[ j*n + j] + sg); \n");
00297           source.append("        if(A[ j*n + j] <= 0){ \n");
00298           source.append("            v[j] = A[ j*n + j] - mu; \n");
00299           source.append("        }else{ \n");
00300           source.append("            v[j] = -sg/(A[ j*n + j] + mu); \n");
00301           source.append("        } \n");
00302           source.append("    *b = 2*(v[j]*v[j])/(sg + v[j]*v[j]); \n");
00303                   //*b = (2*v[j]*v[j])/(sg + (v[j])*(v[j]));
00304           source.append("        vector_div(v, j, v[j], n); \n");
00305                   //print_contigious_vector(v, v_start_ind, n);
00306           source.append("    } \n");
00307           source.append("} \n");
00308 
00309           source.append("void custom_inner_prod(__local const "); source.append(numeric_string); source.append("* A, __global "); source.append(numeric_string); source.append("* v, unsigned int col_ind, unsigned int row_num, unsigned int start_ind, "); source.append(numeric_string); source.append("* res){ \n");
00310           source.append("    for(unsigned int i = start_ind; i < row_num; ++i){ \n");
00311           source.append("        *res += A[col_ind*row_num + i]*v[i]; \n");
00312           source.append("    } \n");
00313           source.append("} \n");
00314           //
00315           source.append("void apply_householder_reflection(__local "); source.append(numeric_string); source.append("* A,  unsigned int row_n, unsigned int col_n, unsigned int iter_cnt, __global "); source.append(numeric_string); source.append("* v, "); source.append(numeric_string); source.append(" b){ \n");
00316           source.append("    "); source.append(numeric_string); source.append(" in_prod_res; \n");
00317           source.append("    for(unsigned int i= iter_cnt + get_local_id(0); i < col_n; i+=get_local_size(0)){ \n");
00318           source.append("        in_prod_res = 0.0; \n");
00319           source.append("        custom_inner_prod(A, v, i, row_n, iter_cnt, &in_prod_res); \n");
00320           source.append("        for(unsigned int j = iter_cnt; j < row_n; ++j){ \n");
00321           source.append("            A[ i*row_n + j] -= b*in_prod_res* v[j]; \n");
00322           source.append("        } \n");
00323           source.append("    } \n");
00324           source.append("} \n");
00325 
00326           source.append("void store_householder_vector(__local "); source.append(numeric_string); source.append("* A,  unsigned int ind, unsigned int n, __global "); source.append(numeric_string); source.append("* v){ \n");
00327           source.append("    for(unsigned int i = ind; i < n; ++i){ \n");
00328           source.append("        A[ (ind-1)*n + i] = v[i]; \n");
00329           source.append("    } \n");
00330           source.append("} \n");
00331 
00332           source.append("void single_qr( __local "); source.append(numeric_string); source.append("* R, __global unsigned int* matrix_dimensions, __global "); source.append(numeric_string); source.append("* b_v, __global "); source.append(numeric_string); source.append("* v, unsigned int matrix_ind){ \n");
00333                       //matrix_dimensions[0] - number of rows
00334                         //matrix_dimensions[1] - number of columns
00335           source.append("  unsigned int col_n = matrix_dimensions[2*matrix_ind + 1]; \n");
00336           source.append("  unsigned int row_n = matrix_dimensions[2*matrix_ind]; \n");
00337 
00338           source.append("  if((col_n == row_n)&&(row_n == 1)){ \n");
00339           source.append("    b_v[0] = 0.0; \n");
00340           source.append("      return; \n");
00341           source.append("  } \n");
00342           source.append("  for(unsigned int i = 0; i < col_n; ++i){ \n");
00343           source.append("    if(get_local_id(0) == 0){ \n");
00344           source.append("      householder_vector(R, i, row_n, v, b_v + i); \n");
00345           source.append("    } \n");
00346           source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
00347           source.append("    apply_householder_reflection(R, row_n, col_n, i, v, b_v[i]); \n");
00348           source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
00349           source.append("    if(get_local_id(0) == 0){ \n");
00350           source.append("      if(i < matrix_dimensions[2*matrix_ind]){ \n");
00351           source.append("        store_householder_vector(R, i+1, row_n, v); \n");
00352           source.append("      } \n");
00353           source.append("    } \n");
00354           source.append("  } \n");
00355           source.append("} \n");
00356 
00357           source.append("void matrix_from_global_to_local_qr(__global "); source.append(numeric_string); source.append("* g_M, __local "); source.append(numeric_string); source.append("* l_M, unsigned int row_n, unsigned int col_n, unsigned int mat_start_ind){ \n");
00358           source.append("  for(unsigned int i = get_local_id(0); i < col_n; i+= get_local_size(0)){ \n");
00359           source.append("    for(unsigned int j = 0; j < row_n; ++j){ \n");
00360           source.append("      l_M[i*row_n + j] = g_M[mat_start_ind + i*row_n + j]; \n");
00361           source.append("    } \n");
00362           source.append("  } \n");
00363           source.append("} \n");
00364           source.append("void matrix_from_local_to_global_qr(__global "); source.append(numeric_string); source.append("* g_M, __local "); source.append(numeric_string); source.append("* l_M, unsigned int row_n, unsigned int col_n, unsigned int mat_start_ind){ \n");
00365           source.append("  for(unsigned int i = get_local_id(0); i < col_n; i+= get_local_size(0)){ \n");
00366           source.append("    for(unsigned int j = 0; j < row_n; ++j){ \n");
00367           source.append("      g_M[mat_start_ind + i*row_n + j] = l_M[i*row_n + j]; \n");
00368           source.append("    } \n");
00369           source.append("  } \n");
00370           source.append("} \n");
00371 
00372 
00373           source.append("__kernel void block_qr( \n");
00374           source.append("      __global "); source.append(numeric_string); source.append("* R, \n");
00375           source.append("      __global unsigned int* matrix_dimensions, \n");
00376           source.append("      __global "); source.append(numeric_string); source.append("* b_v, \n");
00377           source.append("      __global "); source.append(numeric_string); source.append("* v, \n");
00378           source.append("      __global unsigned int* start_matrix_inds, \n");
00379           source.append("      __global unsigned int* start_bv_inds, \n");
00380           source.append("      __global unsigned int* start_v_inds, \n");
00381           source.append("      __global unsigned int * g_is_update, \n");
00382           source.append("      __local "); source.append(numeric_string); source.append("* local_buff_R, \n");
00383           source.append("      unsigned int block_elems_num){ \n");
00384           source.append("    for(unsigned int i  = get_group_id(0); i < block_elems_num; i += get_num_groups(0)){ \n");
00385           source.append("        if((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){ \n");
00386           source.append("      matrix_from_global_to_local_qr(R, local_buff_R, matrix_dimensions[2*i], matrix_dimensions[2*i + 1], start_matrix_inds[i]); \n");
00387           source.append("      barrier(CLK_LOCAL_MEM_FENCE); \n");
00388           source.append("            single_qr(local_buff_R, matrix_dimensions, b_v + start_bv_inds[i], v + start_v_inds[i], i); \n");
00389           source.append("      barrier(CLK_LOCAL_MEM_FENCE); \n");
00390           source.append("            matrix_from_local_to_global_qr(R, local_buff_R, matrix_dimensions[2*i], matrix_dimensions[2*i + 1], start_matrix_inds[i]); \n");
00391           source.append("        } \n");
00392           source.append("    } \n");
00393           source.append("} \n");
00394         }
00395 
/** @brief Appends the OpenCL source of the kernel 'block_qr_assembly' and its device helpers to 'source'.
 *
 * Generated functions, in emission order:
 *  - assemble_upper_part(): for i < col_n_q, j < diff: R_q[i*row_n_q + j] = R_u[i*row_n_u + j + col_n]
 *  - assemble_lower_part(): for i < col_n_u_u, j < row_n_u_u: R_q[i*row_n_q + j + diff] = R_u_u[i*row_n_u_u + j]
 *  - assemble_qr_block():   diff = row_n_u - col_n; always runs the upper part, runs the lower part only if diff > 0
 *  - block_qr_assembly:     kernel; each work item visits i = get_global_id(0), i + get_global_size(0), ... below
 *                           block_elems_num and assembles block i only if both entries of matrix_dimensions for i
 *                           are positive and g_is_update[i] > 0
 *
 * @param source          String object the generated code is appended to (existing content is kept)
 * @param numeric_string  Scalar type name, inserted verbatim wherever a value buffer is declared
 */
template <typename StringType>
void generate_spai_block_qr_assembly(StringType & source, std::string const & numeric_string)
{
  // every value buffer shows up in the generated signatures as "__global <type> * <name>"
  std::string const value_ptr = "__global " + numeric_string + " * ";

  // the whole program fragment is collected locally and handed over in one piece at the end
  std::string code;

  // device helper: assemble_upper_part()
  code += "void assemble_upper_part(" + value_ptr + "R_q, \n";
  code += "            unsigned int row_n_q, unsigned int col_n_q, " + value_ptr + "R_u, \n";
  code +=
    "            unsigned int row_n_u, unsigned int col_n_u, \n"
    "            unsigned int col_n, unsigned int diff){ \n"
    "            for(unsigned int i = 0; i < col_n_q; ++i){ \n"
    "                for(unsigned int j = 0; j < diff; ++j){ \n"
    "          R_q[ i*row_n_q + j] = R_u[ i*row_n_u + j + col_n ]; \n"
    "                } \n"
    "            } \n"
    "        } \n";

  // device helper: assemble_lower_part()
  code += "void assemble_lower_part(" + value_ptr + "R_q, unsigned int row_n_q, unsigned int col_n_q, " + value_ptr + "R_u_u, \n";
  code +=
    "             unsigned int row_n_u_u, unsigned int col_n_u_u, \n"
    "             unsigned int diff){ \n"
    "  for(unsigned int i = 0; i < col_n_u_u; ++i){ \n"
    "    for(unsigned int j = 0; j < row_n_u_u; ++j){ \n"
    "      R_q[i*row_n_q + j + diff] = R_u_u[i*row_n_u_u + j]; \n"
    "    } \n"
    "  } \n"
    "} \n";

  // device helper: assemble_qr_block(), combines the two helpers above
  code += "void assemble_qr_block(" + value_ptr + "R_q, unsigned int row_n_q, unsigned int col_n_q, " + value_ptr + "R_u, unsigned int row_n_u, \n";
  code += "            unsigned int col_n_u, " + value_ptr + "R_u_u, unsigned int row_n_u_u, unsigned int col_n_u_u, unsigned int col_n){ \n";
  code +=
    "            unsigned int diff = row_n_u - col_n; \n"
    "            assemble_upper_part(R_q, row_n_q, col_n_q, R_u, row_n_u, col_n_u, col_n, diff); \n"
    "            if(diff > 0){ \n"
    "              assemble_lower_part(R_q, row_n_q, col_n_q, R_u_u, row_n_u_u, col_n_u_u, diff); \n"
    "            } \n"
    "} \n";

  // kernel entry point: parameter list
  code +=
    "__kernel void block_qr_assembly( \n"
    "      __global unsigned int * matrix_dimensions, \n";
  code += "      " + value_ptr + "R_u, \n";
  code +=
    "      __global unsigned int * block_ind_u, \n"
    "      __global unsigned int * matrix_dimensions_u, \n";
  code += "      " + value_ptr + "R_u_u, \n";
  code +=
    "      __global unsigned int * block_ind_u_u, \n"
    "      __global unsigned int * matrix_dimensions_u_u, \n";
  code += "      " + value_ptr + "R_q, \n";
  code +=
    "      __global unsigned int * block_ind_q, \n"
    "      __global unsigned int * matrix_dimensions_q, \n"
    "      __global unsigned int * g_is_update, \n";
  // this parameter is emitted as an OpenCL line comment, i.e. it is not part of the kernel signature
  code += "          //__local  " + numeric_string + " * local_R_q, \n";

  // kernel entry point: remaining parameter and body
  code +=
    "      unsigned int  block_elems_num) \n"
    "{ \n"
    "    for(unsigned int i  = get_global_id(0); i < block_elems_num; i += get_global_size(0)){ \n"
    "        if((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){ \n"
    "           assemble_qr_block(R_q + block_ind_q[i], matrix_dimensions_q[2*i], matrix_dimensions_q[2*i + 1], R_u + block_ind_u[i], matrix_dimensions_u[2*i], \n"
    "             matrix_dimensions_u[2*i + 1], R_u_u + block_ind_u_u[i], matrix_dimensions_u_u[2*i], matrix_dimensions_u_u[2*i + 1], matrix_dimensions[2*i + 1]); \n"
    "       } \n"
    "   } \n"
    "} \n";

  source.append(code);
}
00452 
/** @brief Appends the OpenCL source of the kernel 'block_qr_assembly_1' and its device helpers to 'source'.
 *
 * Generated functions, in emission order:
 *  - assemble_upper_part_1(): for i < col_n_q, j < diff: R_q[i*row_n_q + j] = R_u[i*row_n_u + j + col_n]
 *  - assemble_qr_block_1():   diff = row_n_u - col_n, then forwards to assemble_upper_part_1()
 *  - block_qr_assembly_1:     kernel; each work item visits i = get_global_id(0), i + get_global_size(0), ... below
 *                             block_elems_num and assembles block i only if both entries of matrix_dimensions for i
 *                             are positive and g_is_update[i] > 0
 *
 * In contrast to 'block_qr_assembly', the generated code takes no R_u_u buffers and has no lower-part step.
 *
 * @param source          String object the generated code is appended to (existing content is kept)
 * @param numeric_string  Scalar type name, inserted verbatim wherever a value buffer is declared
 */
template <typename StringType>
void generate_spai_block_qr_assembly_1(StringType & source, std::string const & numeric_string)
{
  // every value buffer shows up in the generated signatures as "__global <type> * <name>"
  std::string const value_ptr = "__global " + numeric_string + " * ";

  // the whole program fragment is collected locally and handed over in one piece at the end
  std::string code;

  // device helper: assemble_upper_part_1()
  code += "void assemble_upper_part_1(" + value_ptr + "R_q, unsigned int row_n_q, unsigned int col_n_q, " + value_ptr + "R_u, \n";
  code +=
    "             unsigned int row_n_u, unsigned int col_n_u, \n"
    "             unsigned int col_n, unsigned int diff){ \n"
    "            for(unsigned int i = 0; i < col_n_q; ++i){ \n"
    "                for(unsigned int j = 0; j < diff; ++j){ \n"
    "          R_q[ i*row_n_q + j] = R_u[i*row_n_u + j + col_n ]; \n"
    "                } \n"
    "            } \n"
    "        } \n";

  // device helper: assemble_qr_block_1()
  code += "void assemble_qr_block_1(" + value_ptr + "R_q,  unsigned int row_n_q, unsigned int col_n_q, " + value_ptr + "R_u, unsigned int row_n_u, \n";
  code +=
    "            unsigned int col_n_u, unsigned int col_n){ \n"
    "            unsigned int diff = row_n_u - col_n; \n"
    "            assemble_upper_part_1(R_q, row_n_q, col_n_q, R_u, row_n_u, col_n_u, col_n, diff); \n"
    "} \n";

  // kernel entry point: parameter list
  code +=
    "__kernel void block_qr_assembly_1( \n"
    "  __global unsigned int * matrix_dimensions, \n";
  code += "  " + value_ptr + "R_u, \n";
  code +=
    "  __global unsigned int * block_ind_u, \n"
    "  __global unsigned int * matrix_dimensions_u, \n";
  code += "  " + value_ptr + "R_q, \n";

  // kernel entry point: remaining parameters and body
  code +=
    "  __global unsigned int * block_ind_q, \n"
    "  __global unsigned int * matrix_dimensions_q, \n"
    "  __global unsigned int * g_is_update, \n"
    "  unsigned int  block_elems_num) \n"
    "{ \n"
    "    for(unsigned int i  = get_global_id(0); i < block_elems_num; i += get_global_size(0)){ \n"
    "        if((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){ \n"
    "            assemble_qr_block_1(R_q + block_ind_q[i], matrix_dimensions_q[2*i], matrix_dimensions_q[2*i + 1], R_u + block_ind_u[i], matrix_dimensions_u[2*i], \n"
    "              matrix_dimensions_u[2*i + 1], matrix_dimensions[2*i + 1]); \n"
    "        } \n"
    "    } \n"
    "} \n";

  source.append(code);
}
00492 
/** @brief Appends the OpenCL source of the kernel 'block_r_assembly' and its device helpers to 'source'.
 *
 * Generated functions, in emission order:
 *  - assemble_r():       for i < col_n, j < row_n:         gR[i*row_n_r + j] = R[i*row_n + j]
 *  - assemble_r_u():     for i < col_n_u, j < col_n:       gR[(i+col_n)*row_n_r + j] = R_u[i*row_n_u + j]
 *  - assemble_r_u_u():   for i < col_n_u_u, j < row_n_u_u: gR[(col_n+i)*row_n_r + j + col_n] = R_u_u[i*row_n_u_u + j]
 *  - assemble_r_block(): calls the three helpers above, in that order, on the same gR
 *  - block_r_assembly:   kernel; each work item visits i = get_global_id(0), i + get_global_size(0), ... below
 *                        block_elems_num and assembles block i only if both entries of matrix_dimensions for i
 *                        are positive and g_is_update[i] > 0
 *
 * @param source          String object the generated code is appended to (existing content is kept)
 * @param numeric_string  Scalar type name, inserted verbatim wherever a value buffer is declared
 */
template <typename StringType>
void generate_spai_block_r_assembly(StringType & source, std::string const & numeric_string)
{
  // every value buffer shows up in the generated signatures as "__global <type> * <name>"
  std::string const value_ptr = "__global " + numeric_string + " * ";

  // the whole program fragment is collected locally and handed over in one piece at the end
  std::string code;

  // device helper: assemble_r()
  code += "void assemble_r(" + value_ptr + "gR, unsigned int row_n_r, unsigned int col_n_r, " + value_ptr + "R, \n";
  code +=
    "        unsigned int row_n, unsigned int col_n) \n"
    "{ \n"
    "  for(unsigned int i = 0; i < col_n; ++i){ \n"
    "     for(unsigned int j = 0; j < row_n; ++j){ \n"
    "    gR[i*row_n_r + j] = R[i*row_n + j ]; \n"
    "     } \n"
    "  } \n"
    "} \n";

  // device helper: assemble_r_u()
  code += "void assemble_r_u(" + value_ptr + "gR, \n";
  code += "          unsigned int row_n_r, unsigned int col_n_r, " + value_ptr + "R_u, unsigned int row_n_u, unsigned int col_n_u, \n";
  code +=
    "          unsigned int col_n) \n"
    "{ \n"
    "  for(unsigned int i = 0; i < col_n_u; ++i){ \n"
    "    for(unsigned int j = 0; j < col_n; ++j){ \n"
    "      gR[ (i+col_n)*row_n_r + j] = R_u[ i*row_n_u + j]; \n"
    "    } \n"
    "  } \n"
    "} \n";

  // device helper: assemble_r_u_u()
  code += "void assemble_r_u_u(" + value_ptr + "gR,  unsigned int row_n_r, unsigned int col_n_r, " + value_ptr + "R_u_u, unsigned int row_n_u_u, \n";
  code +=
    "          unsigned int col_n_u_u, unsigned int col_n) \n"
    "{ \n"
    "  for(unsigned int i = 0; i < col_n_u_u; ++i){ \n"
    "    for(unsigned int j = 0; j < row_n_u_u; ++j){ \n"
    "      gR[(col_n+i)*row_n_r + j + col_n] = R_u_u[i*row_n_u_u + j]; \n"
    "    } \n"
    "  } \n"
    "} \n";

  // device helper: assemble_r_block(), runs the three copies above one after another
  code += "void assemble_r_block(" + value_ptr + "gR, unsigned int row_n_r, unsigned int col_n_r, " + value_ptr + "R, unsigned int row_n, \n";
  code += "        unsigned int col_n, " + value_ptr + "R_u, unsigned int row_n_u, unsigned int col_n_u, " + value_ptr + "R_u_u, \n";
  code +=
    "        unsigned int row_n_u_u, unsigned int col_n_u_u){ \n"
    "        assemble_r(gR, row_n_r, col_n_r, R, row_n, col_n); \n"
    "        assemble_r_u(gR, row_n_r, col_n_r, R_u, row_n_u, col_n_u, col_n); \n"
    "        assemble_r_u_u(gR, row_n_r, col_n_r, R_u_u, row_n_u_u, col_n_u_u, col_n); \n"
    "} \n";

  // kernel entry point: parameter list
  code += "__kernel void block_r_assembly( \n";
  code += "  " + value_ptr + "R, \n";
  code +=
    "  __global unsigned int * block_ind, \n"
    "  __global unsigned int * matrix_dimensions, \n";
  code += "  " + value_ptr + "R_u, \n";
  code +=
    "  __global unsigned int * block_ind_u, \n"
    "  __global unsigned int * matrix_dimensions_u, \n";
  code += "  " + value_ptr + "R_u_u, \n";
  code +=
    "  __global unsigned int * block_ind_u_u, \n"
    "  __global unsigned int * matrix_dimensions_u_u, \n";
  code += "  " + value_ptr + "g_R, \n";

  // kernel entry point: remaining parameters and body
  code +=
    "  __global unsigned int * block_ind_r, \n"
    "  __global unsigned int * matrix_dimensions_r, \n"
    "  __global unsigned int * g_is_update, \n"
    "  unsigned int  block_elems_num) \n"
    "{ \n"
    "    for(unsigned int i  = get_global_id(0); i < block_elems_num; i += get_global_size(0)){ \n"
    "        if((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){ \n"
    "            assemble_r_block(g_R + block_ind_r[i], matrix_dimensions_r[2*i], matrix_dimensions_r[2*i + 1], R + block_ind[i], matrix_dimensions[2*i], \n"
    "              matrix_dimensions[2*i + 1], R_u + block_ind_u[i], matrix_dimensions_u[2*i], matrix_dimensions_u[2*i + 1], \n"
    "              R_u_u + block_ind_u_u[i], matrix_dimensions_u_u[2*i], matrix_dimensions_u_u[2*i + 1]); \n"
    "        } \n"
    "    } \n"
    "} \n";

  source.append(code);
}
00564 
00566 
00567         // main kernel class
00569         template <typename NumericT>
00570         struct spai
00571         {
00572           static std::string program_name()
00573           {
00574             return viennacl::ocl::type_to_string<NumericT>::apply() + "_spai";
00575           }
00576 
00577           static void init(viennacl::ocl::context & ctx)
00578           {
00579             viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx);
00580             std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
00581 
00582             static std::map<cl_context, bool> init_done;
00583             if (!init_done[ctx.handle().get()])
00584             {
00585               std::string source;
00586               source.reserve(1024);
00587 
00588               viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
00589 
00590               generate_spai_assemble_blocks(source, numeric_string);
00591               generate_spai_block_bv_assembly(source, numeric_string);
00592               generate_spai_block_least_squares(source, numeric_string);
00593               generate_spai_block_q_mult(source, numeric_string);
00594               generate_spai_block_qr(source, numeric_string);
00595               generate_spai_block_qr_assembly(source, numeric_string);
00596               generate_spai_block_qr_assembly_1(source, numeric_string);
00597               generate_spai_block_r_assembly(source, numeric_string);
00598 
00599               std::string prog_name = program_name();
00600               #ifdef VIENNACL_BUILD_INFO
00601               std::cout << "Creating program " << prog_name << std::endl;
00602               #endif
00603               ctx.add_program(source, prog_name);
00604               init_done[ctx.handle().get()] = true;
00605             } //if
00606           } //init
00607         };
00608 
00609       }  // namespace kernels
00610     }  // namespace opencl
00611   }  // namespace linalg
00612 }  // namespace viennacl
00613 #endif
00614