ViennaCL - The Vienna Computing Library
1.5.1
|
00001 #ifndef VIENNACL_LINALG_CUDA_DIRECT_SOLVE_HPP 00002 #define VIENNACL_LINALG_CUDA_DIRECT_SOLVE_HPP 00003 00004 /* ========================================================================= 00005 Copyright (c) 2010-2014, Institute for Microelectronics, 00006 Institute for Analysis and Scientific Computing, 00007 TU Wien. 00008 Portions of this software are copyright by UChicago Argonne, LLC. 00009 00010 ----------------- 00011 ViennaCL - The Vienna Computing Library 00012 ----------------- 00013 00014 Project Head: Karl Rupp rupp@iue.tuwien.ac.at 00015 00016 (A list of authors and contributors can be found in the PDF manual) 00017 00018 License: MIT (X11), see file LICENSE in the base directory 00019 ============================================================================= */ 00020 00025 #include "viennacl/forwards.h" 00026 #include "viennacl/vector.hpp" 00027 #include "viennacl/matrix.hpp" 00028 00029 00030 #include "viennacl/linalg/cuda/common.hpp" 00031 00032 00033 namespace viennacl 00034 { 00035 namespace linalg 00036 { 00037 namespace cuda 00038 { 00039 00040 template <typename T> 00041 __global__ void matrix_matrix_upper_solve_kernel( 00042 const T * A, 00043 unsigned int A_start1, unsigned int A_start2, 00044 unsigned int A_inc1, unsigned int A_inc2, 00045 unsigned int A_size1, unsigned int A_size2, 00046 unsigned int A_internal_size1, unsigned int A_internal_size2, 00047 bool row_major_A, 00048 bool transpose_A, 00049 00050 T * B, 00051 unsigned int B_start1, unsigned int B_start2, 00052 unsigned int B_inc1, unsigned int B_inc2, 00053 unsigned int B_size1, unsigned int B_size2, 00054 unsigned int B_internal_size1, unsigned int B_internal_size2, 00055 bool row_major_B, 00056 bool transpose_B, 00057 00058 bool unit_diagonal) 00059 { 00060 T temp; 00061 T entry_A; 00062 00063 for (unsigned int row_cnt = 0; row_cnt < A_size1; ++row_cnt) 00064 { 00065 unsigned int row = A_size1 - 1 - row_cnt; 00066 00067 if (!unit_diagonal) 00068 { 00069 __syncthreads(); 00070 00071 if (threadIdx.x == 0) 00072 { 00073 if (row_major_B && transpose_B) 00074 B[(blockIdx.x * B_inc1 + B_start1) * B_internal_size2 + (row * B_inc2 + B_start2)] /= (row_major_A) ? A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)] 00075 : A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2)*A_internal_size1]; 00076 else if (row_major_B && !transpose_B) 00077 B[(row * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 + B_start2)] /= (row_major_A) ? A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)] 00078 : A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2)*A_internal_size1]; 00079 else if (!row_major_B && transpose_B) 00080 B[(blockIdx.x * B_inc1 + B_start1) + (row * B_inc2 + B_start2) * B_internal_size1] /= (row_major_A) ? A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)] 00081 : A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2)*A_internal_size1]; 00082 else //if (!row_major_B && !transpose_B) 00083 B[(row * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * B_internal_size1] /= (row_major_A) ? A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)] 00084 : A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2)*A_internal_size1]; 00085 } 00086 } 00087 00088 __syncthreads(); 00089 00090 if (row_major_B && transpose_B) 00091 temp = B[(blockIdx.x * B_inc1 + B_start1) * B_internal_size2 + (row * B_inc2 + B_start2)]; 00092 else if (row_major_B && !transpose_B) 00093 temp = B[(row * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 + B_start2)]; 00094 else if (!row_major_B && transpose_B) 00095 temp = B[(blockIdx.x * B_inc1 + B_start1) + (row * B_inc2 + B_start2) * B_internal_size1]; 00096 else //if (!row_major_B && !transpose_B) 00097 temp = B[(row * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * B_internal_size1]; 00098 00099 //eliminate column of op(A) with index 'row' in parallel: " << std::endl; 00100 for (unsigned int elim = threadIdx.x; elim < row; elim += blockDim.x) 00101 { 00102 if (row_major_A && transpose_A) 00103 entry_A = A[(row * A_inc1 + A_start1) * A_internal_size2 + (elim * A_inc2 + A_start2)]; 00104 else if (row_major_A && !transpose_A) 00105 entry_A = A[(elim * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)]; 00106 else if (!row_major_A && transpose_A) 00107 entry_A = A[(row * A_inc1 + A_start1) + (elim * A_inc2 + A_start2) * A_internal_size1]; 00108 else //if (!row_major_A && !transpose_A) 00109 entry_A = A[(elim * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1]; 00110 00111 if (row_major_B && transpose_B) 00112 B[(blockIdx.x * B_inc1 + B_start1) * B_internal_size2 + (elim * B_inc2 + B_start2)] -= temp * entry_A; 00113 else if (row_major_B && !transpose_B) 00114 B[(elim * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 + B_start2)] -= temp * entry_A; 00115 else if (!row_major_B && transpose_B) 00116 B[(blockIdx.x * B_inc1 + B_start1) + (elim * B_inc2 + B_start2) * B_internal_size1] -= temp * entry_A; 00117 else //if (!row_major_B && !transpose_B) 00118 B[(elim * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * B_internal_size1] -= temp * entry_A; 00119 00120 } 00121 } 00122 } 00123 00124 00125 00126 template <typename T> 00127 __global__ void matrix_matrix_lower_solve_kernel( 00128 const T * A, 00129 unsigned int A_start1, unsigned int A_start2, 00130 unsigned int A_inc1, unsigned int A_inc2, 00131 unsigned int A_size1, unsigned int A_size2, 00132 unsigned int A_internal_size1, unsigned int A_internal_size2, 00133 bool row_major_A, 00134 bool transpose_A, 00135 00136 T * B, 00137 unsigned int B_start1, unsigned int B_start2, 00138 unsigned int B_inc1, unsigned int B_inc2, 00139 unsigned int B_size1, unsigned int B_size2, 00140 unsigned int B_internal_size1, unsigned int B_internal_size2, 00141 bool row_major_B, 00142 bool transpose_B, 00143 00144 bool unit_diagonal) 00145 { 00146 T temp; 00147 T entry_A; 00148 00149 for (unsigned int row = 0; row < A_size1; ++row) 00150 { 00151 00152 if (!unit_diagonal) 00153 { 00154 __syncthreads(); 00155 00156 if (threadIdx.x == 0) 00157 { 00158 if (row_major_B && transpose_B) 00159 B[(blockIdx.x * B_inc1 + B_start1) * B_internal_size2 + (row * B_inc2 + B_start2)] /= (row_major_A) ? A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)] 00160 : A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2)*A_internal_size1]; 00161 else if (row_major_B && !transpose_B) 00162 B[(row * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 + B_start2)] /= (row_major_A) ? A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)] 00163 : A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2)*A_internal_size1]; 00164 else if (!row_major_B && transpose_B) 00165 B[(blockIdx.x * B_inc1 + B_start1) + (row * B_inc2 + B_start2) * B_internal_size1] /= (row_major_A) ? A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)] 00166 : A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2)*A_internal_size1]; 00167 else //if (!row_major_B && !transpose_B) 00168 B[(row * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * B_internal_size1] /= (row_major_A) ? A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)] 00169 : A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2)*A_internal_size1]; 00170 } 00171 } 00172 00173 __syncthreads(); 00174 00175 if (row_major_B && transpose_B) 00176 temp = B[(blockIdx.x * B_inc1 + B_start1) * B_internal_size2 + (row * B_inc2 + B_start2)]; 00177 else if (row_major_B && !transpose_B) 00178 temp = B[(row * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 + B_start2)]; 00179 else if (!row_major_B && transpose_B) 00180 temp = B[(blockIdx.x * B_inc1 + B_start1) + (row * B_inc2 + B_start2) * B_internal_size1]; 00181 else //if (!row_major_B && !transpose_B) 00182 temp = B[(row * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * B_internal_size1]; 00183 00184 //eliminate column of op(A) with index 'row' in parallel: " << std::endl; 00185 for (unsigned int elim = row + threadIdx.x + 1; elim < A_size1; elim += blockDim.x) 00186 { 00187 if (row_major_A && transpose_A) 00188 entry_A = A[(row * A_inc1 + A_start1) * A_internal_size2 + (elim * A_inc2 + A_start2)]; 00189 else if (row_major_A && !transpose_A) 00190 entry_A = A[(elim * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)]; 00191 else if (!row_major_A && transpose_A) 00192 entry_A = A[(row * A_inc1 + A_start1) + (elim * A_inc2 + A_start2) * A_internal_size1]; 00193 else //if (!row_major_A && !transpose_A) 00194 entry_A = A[(elim * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1]; 00195 00196 if (row_major_B && transpose_B) 00197 B[(blockIdx.x * B_inc1 + B_start1) * B_internal_size2 + (elim * B_inc2 + B_start2)] -= temp * entry_A; 00198 else if (row_major_B && !transpose_B) 00199 B[(elim * B_inc1 + B_start1) * B_internal_size2 + (blockIdx.x * B_inc2 + B_start2)] -= temp * entry_A; 00200 else if (!row_major_B && transpose_B) 00201 B[(blockIdx.x * B_inc1 + B_start1) + (elim * B_inc2 + B_start2) * B_internal_size1] -= temp * entry_A; 00202 else //if (!row_major_B && !transpose_B) 00203 B[(elim * B_inc1 + B_start1) + (blockIdx.x * B_inc2 + B_start2) * B_internal_size1] -= temp * entry_A; 00204 00205 } 00206 } 00207 } 00208 00209 00210 00211 00212 00213 00214 namespace detail 00215 { 00216 template <typename T> 00217 bool is_unit_solve(T const & tag) { return false; } 00218 00219 inline bool is_unit_solve(viennacl::linalg::unit_lower_tag) { return true; } 00220 inline bool is_unit_solve(viennacl::linalg::unit_upper_tag) { return true; } 00221 00222 template <typename T> 00223 bool is_upper_solve(T const & tag) { return false; } 00224 00225 inline bool is_upper_solve(viennacl::linalg::upper_tag) { return true; } 00226 inline bool is_upper_solve(viennacl::linalg::unit_upper_tag) { return true; } 00227 00228 template <typename M1, typename M2, typename SolverTag> 00229 void inplace_solve_impl(M1 const & A, bool transpose_A, 00230 M2 & B, bool transpose_B, 00231 SolverTag const & tag) 00232 { 00233 typedef typename viennacl::result_of::cpu_value_type<M1>::type value_type; 00234 00235 dim3 threads(128); 00236 dim3 grid( transpose_B ? B.size1() : B.size2() ); 00237 00238 if (is_upper_solve(tag)) 00239 { 00240 matrix_matrix_upper_solve_kernel<<<grid,threads>>>(detail::cuda_arg<value_type>(A), 00241 static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), 00242 static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), 00243 static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), 00244 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), 00245 bool(viennacl::is_row_major<M1>::value), 00246 transpose_A, 00247 00248 detail::cuda_arg<value_type>(B), 00249 static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)), 00250 static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)), 00251 static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)), 00252 static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)), 00253 bool(viennacl::is_row_major<M2>::value), 00254 transpose_B, 00255 00256 is_unit_solve(tag) 00257 ); 00258 } 00259 else 00260 { 00261 matrix_matrix_lower_solve_kernel<<<grid,threads>>>(detail::cuda_arg<value_type>(A), 00262 static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), 00263 static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), 00264 static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), 00265 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), 00266 bool(viennacl::is_row_major<M1>::value), 00267 transpose_A, 00268 00269 detail::cuda_arg<value_type>(B), 00270 static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)), 00271 static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)), 00272 static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)), 00273 static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)), 00274 bool(viennacl::is_row_major<M2>::value), 00275 transpose_B, 00276 00277 is_unit_solve(tag) 00278 ); 00279 } 00280 00281 } 00282 } 00283 00284 00285 // 00286 // Note: By convention, all size checks are performed in the calling frontend. No need to double-check here. 00287 // 00288 00290 00296 template <typename NumericT, typename F1, typename F2, typename SOLVERTAG> 00297 void inplace_solve(const matrix_base<NumericT, F1> & A, matrix_base<NumericT, F2> & B, SOLVERTAG tag) 00298 { 00299 detail::inplace_solve_impl(A, false, 00300 B, false, tag); 00301 } 00302 00309 template <typename NumericT, typename F1, typename F2, typename SOLVERTAG> 00310 void inplace_solve(const matrix_base<NumericT, F1> & A, 00311 matrix_expression< const matrix_base<NumericT, F2>, const matrix_base<NumericT, F2>, op_trans> proxy_B, 00312 SOLVERTAG tag) 00313 { 00314 detail::inplace_solve_impl(A, false, 00315 const_cast<matrix_base<NumericT, F2> &>(proxy_B.lhs()), true, tag); 00316 } 00317 00318 //upper triangular solver for transposed lower triangular matrices 00325 template <typename NumericT, typename F1, typename F2, typename SOLVERTAG> 00326 void inplace_solve(const matrix_expression< const matrix_base<NumericT, F1>, const matrix_base<NumericT, F1>, op_trans> & proxy_A, 00327 matrix_base<NumericT, F2> & B, 00328 SOLVERTAG tag) 00329 { 00330 detail::inplace_solve_impl(const_cast<matrix_base<NumericT, F1> &>(proxy_A.lhs()), true, 00331 B, false, tag); 00332 } 00333 00340 template <typename NumericT, typename F1, typename F2, typename SOLVERTAG> 00341 void inplace_solve(const matrix_expression< const matrix_base<NumericT, F1>, const matrix_base<NumericT, F1>, op_trans> & proxy_A, 00342 matrix_expression< const matrix_base<NumericT, F2>, const matrix_base<NumericT, F2>, op_trans> proxy_B, 00343 SOLVERTAG tag) 00344 { 00345 detail::inplace_solve_impl(const_cast<matrix_base<NumericT, F1> &>(proxy_A.lhs()), true, 00346 const_cast<matrix_base<NumericT, F2> &>(proxy_B.lhs()), true, tag); 00347 } 00348 00349 00350 00351 // 00352 // Solve on vector 00353 // 00354 00355 template <typename T> 00356 __global__ void triangular_substitute_inplace_row_kernel( 00357 T const * A, 00358 unsigned int A_start1, unsigned int A_start2, 00359 unsigned int A_inc1, unsigned int A_inc2, 00360 unsigned int A_size1, unsigned int A_size2, 00361 unsigned int A_internal_size1, unsigned int A_internal_size2, 00362 T * v, 00363 unsigned int v_start, 00364 unsigned int v_inc, 00365 unsigned int v_size, 00366 00367 unsigned int options) 00368 { 00369 T temp; 00370 unsigned int unit_diagonal_flag = (options & (1 << 0)); 00371 unsigned int transposed_access_A = (options & (1 << 1)); 00372 unsigned int is_lower_solve = (options & (1 << 2)); 00373 unsigned int row; 00374 for (unsigned int rows_processed = 0; rows_processed < A_size1; ++rows_processed) //Note: A required to be square 00375 { 00376 row = is_lower_solve ? rows_processed : ((A_size1 - rows_processed) - 1); 00377 if (!unit_diagonal_flag) 00378 { 00379 __syncthreads(); 00380 if (threadIdx.x == 0) 00381 v[row * v_inc + v_start] /= A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)]; 00382 } 00383 00384 __syncthreads(); 00385 00386 temp = v[row * v_inc + v_start]; 00387 00388 for (int elim = (is_lower_solve ? (row + threadIdx.x + 1) : threadIdx.x); 00389 elim < (is_lower_solve ? A_size1 : row); 00390 elim += blockDim.x) 00391 v[elim * v_inc + v_start] -= temp * A[transposed_access_A ? ((row * A_inc1 + A_start1) * A_internal_size2 + (elim * A_inc2 + A_start2)) 00392 : ((elim * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2))]; 00393 } 00394 } 00395 00396 00397 template <typename T> 00398 __global__ void triangular_substitute_inplace_col_kernel( 00399 T const * A, 00400 unsigned int A_start1, unsigned int A_start2, 00401 unsigned int A_inc1, unsigned int A_inc2, 00402 unsigned int A_size1, unsigned int A_size2, 00403 unsigned int A_internal_size1, unsigned int A_internal_size2, 00404 T * v, 00405 unsigned int v_start, 00406 unsigned int v_inc, 00407 unsigned int v_size, 00408 unsigned int options) 00409 { 00410 T temp; 00411 unsigned int unit_diagonal_flag = (options & (1 << 0)); 00412 unsigned int transposed_access_A = (options & (1 << 1)); 00413 unsigned int is_lower_solve = (options & (1 << 2)); 00414 unsigned int row; 00415 for (unsigned int rows_processed = 0; rows_processed < A_size1; ++rows_processed) //Note: A required to be square 00416 { 00417 row = is_lower_solve ? rows_processed : ((A_size1 - rows_processed) - 1); 00418 if (!unit_diagonal_flag) 00419 { 00420 __syncthreads(); 00421 if (threadIdx.x == 0) 00422 v[row * v_inc + v_start] /= A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1]; 00423 } 00424 00425 __syncthreads(); 00426 00427 temp = v[row * v_inc + v_start]; 00428 00429 for (int elim = (is_lower_solve ? (row + threadIdx.x + 1) : threadIdx.x); 00430 elim < (is_lower_solve ? A_size1 : row); 00431 elim += blockDim.x) 00432 v[elim * v_inc + v_start] -= temp * A[transposed_access_A ? ((row * A_inc1 + A_start1) + (elim * A_inc2 + A_start2) * A_internal_size1) 00433 : ((elim * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1)]; 00434 } 00435 } 00436 00437 00438 namespace detail 00439 { 00440 inline unsigned int get_option_for_solver_tag(viennacl::linalg::upper_tag) { return 0; } 00441 inline unsigned int get_option_for_solver_tag(viennacl::linalg::unit_upper_tag) { return (1 << 0); } 00442 inline unsigned int get_option_for_solver_tag(viennacl::linalg::lower_tag) { return (1 << 2); } 00443 inline unsigned int get_option_for_solver_tag(viennacl::linalg::unit_lower_tag) { return (1 << 2) | (1 << 0); } 00444 00445 template <typename MatrixType, typename VectorType> 00446 void inplace_solve_vector_impl(MatrixType const & mat, 00447 VectorType & vec, 00448 unsigned int options) 00449 { 00450 typedef typename viennacl::result_of::cpu_value_type<MatrixType>::type value_type; 00451 00452 if (viennacl::is_row_major<MatrixType>::value) 00453 { 00454 triangular_substitute_inplace_row_kernel<<<1, 128>>>(detail::cuda_arg<value_type>(mat), 00455 static_cast<unsigned int>(viennacl::traits::start1(mat)), static_cast<unsigned int>(viennacl::traits::start2(mat)), 00456 static_cast<unsigned int>(viennacl::traits::stride1(mat)), static_cast<unsigned int>(viennacl::traits::stride2(mat)), 00457 static_cast<unsigned int>(viennacl::traits::size1(mat)), static_cast<unsigned int>(viennacl::traits::size2(mat)), 00458 static_cast<unsigned int>(viennacl::traits::internal_size1(mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat)), 00459 detail::cuda_arg<value_type>(vec), 00460 static_cast<unsigned int>(viennacl::traits::start(vec)), 00461 static_cast<unsigned int>(viennacl::traits::stride(vec)), 00462 static_cast<unsigned int>(viennacl::traits::size(vec)), 00463 options 00464 ); 00465 } 00466 else 00467 { 00468 triangular_substitute_inplace_col_kernel<<<1, 128>>>(detail::cuda_arg<value_type>(mat), 00469 static_cast<unsigned int>(viennacl::traits::start1(mat)), static_cast<unsigned int>(viennacl::traits::start2(mat)), 00470 static_cast<unsigned int>(viennacl::traits::stride1(mat)), static_cast<unsigned int>(viennacl::traits::stride2(mat)), 00471 static_cast<unsigned int>(viennacl::traits::size1(mat)), static_cast<unsigned int>(viennacl::traits::size2(mat)), 00472 static_cast<unsigned int>(viennacl::traits::internal_size1(mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat)), 00473 detail::cuda_arg<value_type>(vec), 00474 static_cast<unsigned int>(viennacl::traits::start(vec)), 00475 static_cast<unsigned int>(viennacl::traits::stride(vec)), 00476 static_cast<unsigned int>(viennacl::traits::size(vec)), 00477 options 00478 ); 00479 } 00480 } 00481 00482 } 00483 00489 template <typename NumericT, typename F, typename SOLVERTAG> 00490 void inplace_solve(const matrix_base<NumericT, F> & mat, 00491 vector_base<NumericT> & vec, 00492 SOLVERTAG) 00493 { 00494 unsigned int options = detail::get_option_for_solver_tag(SOLVERTAG()); 00495 00496 detail::inplace_solve_vector_impl(mat, vec, options); 00497 } 00498 00499 00500 00501 00507 template <typename NumericT, typename F, typename SOLVERTAG> 00508 void inplace_solve(const matrix_expression< const matrix_base<NumericT, F>, const matrix_base<NumericT, F>, op_trans> & proxy, 00509 vector_base<NumericT> & vec, 00510 SOLVERTAG) 00511 { 00512 unsigned int options = detail::get_option_for_solver_tag(SOLVERTAG()) | 0x02; //add transpose-flag 00513 00514 detail::inplace_solve_vector_impl(proxy.lhs(), vec, options); 00515 } 00516 00517 00518 00519 } 00520 } 00521 } 00522 00523 #endif