ViennaCL - The Vienna Computing Library
1.5.1
#ifndef VIENNACL_LINALG_CUDA_SPARSE_MATRIX_OPERATIONS_SOLVE_HPP_
#define VIENNACL_LINALG_CUDA_SPARSE_MATRIX_OPERATIONS_SOLVE_HPP_

/* =========================================================================
   Copyright (c) 2010-2014, Institute for Microelectronics,
                            Institute for Analysis and Scientific Computing,
                            TU Wien.
   Portions of this software are copyright by UChicago Argonne, LLC.

                            -----------------
                  ViennaCL - The Vienna Computing Library
                            -----------------

   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at

   (A list of authors and contributors can be found in the PDF manual)

   License:         MIT (X11), see file LICENSE in the base directory
============================================================================= */

/** @file viennacl/linalg/cuda/sparse_matrix_operations_solve.hpp
    @brief Implementations of direct triangular solvers for sparse matrices using CUDA
*/

#include "viennacl/forwards.h"

namespace viennacl
{
namespace linalg
{
namespace cuda
{

//
// Compressed matrix
//

//
// non-transposed
//

// In-place forward substitution for a unit lower triangular CSR matrix:
// a single thread block streams the nonzeros through shared memory in
// windows of blockDim.x entries, and thread 0 performs the substitution.
template <typename T>
__global__ void csr_unit_lu_forward_kernel(
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          const T * elements,
          T * vector,
          unsigned int size)
{
  __shared__ unsigned int col_index_buffer[128];
  __shared__ T element_buffer[128];
  __shared__ T vector_buffer[128];

  unsigned int nnz = row_indices[size];
  unsigned int current_row = 0;
  unsigned int row_at_window_start = 0;
  T current_vector_entry = vector[0];
  unsigned int loop_end = (nnz / blockDim.x + 1) * blockDim.x;
  unsigned int next_row = row_indices[1];

  for (unsigned int i = threadIdx.x; i < loop_end; i += blockDim.x)
  {
    //load into shared memory (coalesced access):
    if (i < nnz)
    {
      element_buffer[threadIdx.x] = elements[i];
      unsigned int tmp = column_indices[i];
      col_index_buffer[threadIdx.x] = tmp;
      vector_buffer[threadIdx.x] = vector[tmp];
    }

    __syncthreads();

    //now a single thread does the remaining work in shared memory:
    if (threadIdx.x == 0)
    {
      // traverse through all the loaded data:
      for (unsigned int k=0; k<blockDim.x; ++k)
      {
        if (current_row < size && i+k == next_row) //current row is finished. Write back result
        {
          vector[current_row] = current_vector_entry;
          ++current_row;
          if (current_row < size) //load next row's data
          {
            next_row = row_indices[current_row+1];
            current_vector_entry = vector[current_row];
          }
        }

        if (current_row < size && col_index_buffer[k] < current_row) //substitute
        {
          if (col_index_buffer[k] < row_at_window_start) //row solved before this window: buffered copy is valid
            current_vector_entry -= element_buffer[k] * vector_buffer[k];
          else if (col_index_buffer[k] < current_row) //row solved within this window: re-read from global memory
            current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]];
        }

      } // for k

      row_at_window_start = current_row;
    } // if (threadIdx.x == 0)

    __syncthreads();
  } //for i
}



// In-place forward substitution for a lower triangular CSR matrix with
// non-unit diagonal: same windowed scheme as above, but each finished row
// is divided by its diagonal entry.
template <typename T>
__global__ void csr_lu_forward_kernel(
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          const T * elements,
          T * vector,
          unsigned int size)
{
  __shared__ unsigned int col_index_buffer[128];
  __shared__ T element_buffer[128];
  __shared__ T vector_buffer[128];

  unsigned int nnz = row_indices[size];
  unsigned int current_row = 0;
  unsigned int row_at_window_start = 0;
  T current_vector_entry = vector[0];
  T diagonal_entry = 0;
  unsigned int loop_end = (nnz / blockDim.x + 1) * blockDim.x;
  unsigned int next_row = row_indices[1];

  for (unsigned int i = threadIdx.x; i < loop_end; i += blockDim.x)
  {
    //load into shared memory (coalesced access):
    if (i < nnz)
    {
      element_buffer[threadIdx.x] = elements[i];
      unsigned int tmp = column_indices[i];
      col_index_buffer[threadIdx.x] = tmp;
      vector_buffer[threadIdx.x] = vector[tmp];
    }

    __syncthreads();

    //now a single thread does the remaining work in shared memory:
    if (threadIdx.x == 0)
    {
      // traverse through all the loaded data:
      for (unsigned int k=0; k<blockDim.x; ++k)
      {
        if (current_row < size && i+k == next_row) //current row is finished. Write back result
        {
          vector[current_row] = current_vector_entry / diagonal_entry;
          ++current_row;
          if (current_row < size) //load next row's data
          {
            next_row = row_indices[current_row+1];
            current_vector_entry = vector[current_row];
          }
        }

        if (current_row < size && col_index_buffer[k] < current_row) //substitute
        {
          if (col_index_buffer[k] < row_at_window_start) //row solved before this window: buffered copy is valid
            current_vector_entry -= element_buffer[k] * vector_buffer[k];
          else if (col_index_buffer[k] < current_row) //row solved within this window: re-read from global memory
            current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]];
        }
        else if (col_index_buffer[k] == current_row)
          diagonal_entry = element_buffer[k];

      } // for k

      row_at_window_start = current_row;
    } // if (threadIdx.x == 0)

    __syncthreads();
  } //for i
}


// In-place backward substitution for a unit upper triangular CSR matrix:
// the nonzeros are streamed through shared memory from back to front.
template <typename T>
__global__ void csr_unit_lu_backward_kernel(
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          const T * elements,
          T * vector,
          unsigned int size)
{
  __shared__ unsigned int col_index_buffer[128];
  __shared__ T element_buffer[128];
  __shared__ T vector_buffer[128];

  unsigned int nnz = row_indices[size];
  unsigned int current_row = size-1;
  unsigned int row_at_window_start = size-1;
  T current_vector_entry = vector[size-1];
  unsigned int loop_end = ( (nnz - 1) / blockDim.x) * blockDim.x;
  unsigned int next_row = row_indices[size-1];

  unsigned int i = loop_end + threadIdx.x;
  while (1)
  {
    //load into shared memory (coalesced access):
    if (i < nnz)
    {
      element_buffer[threadIdx.x] = elements[i];
      unsigned int tmp = column_indices[i];
      col_index_buffer[threadIdx.x] = tmp;
      vector_buffer[threadIdx.x] = vector[tmp];
    }

    __syncthreads();

    //now a single thread does the remaining work in shared memory:
    if (threadIdx.x == 0)
    {
      // traverse through all the loaded data from back to front:
      for (unsigned int k2=0; k2<blockDim.x; ++k2)
      {
        unsigned int k = (blockDim.x - k2) - 1;

        if (i+k >= nnz)
          continue;

        if (col_index_buffer[k] > row_at_window_start) //row solved before this window: buffered copy is valid
          current_vector_entry -= element_buffer[k] * vector_buffer[k];
        else if (col_index_buffer[k] > current_row) //row solved within this window: re-read from global memory
          current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]];

        if (i+k == next_row) //current row is finished. Write back result
        {
          vector[current_row] = current_vector_entry;
          if (current_row > 0) //load next row's data
          {
            --current_row;
            next_row = row_indices[current_row];
            current_vector_entry = vector[current_row];
          }
        }

      } // for k

      row_at_window_start = current_row;
    } // if (threadIdx.x == 0)

    __syncthreads();

    if (i < blockDim.x)
      break;

    i -= blockDim.x;
  } //for i
}



// In-place backward substitution for an upper triangular CSR matrix with
// non-unit diagonal: each finished row is divided by its diagonal entry.
template <typename T>
__global__ void csr_lu_backward_kernel(
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          const T * elements,
          T * vector,
          unsigned int size)
{
  __shared__ unsigned int col_index_buffer[128];
  __shared__ T element_buffer[128];
  __shared__ T vector_buffer[128];

  unsigned int nnz = row_indices[size];
  unsigned int current_row = size-1;
  unsigned int row_at_window_start = size-1;
  T current_vector_entry = vector[size-1];
  T diagonal_entry;
  unsigned int loop_end = ( (nnz - 1) / blockDim.x) * blockDim.x;
  unsigned int next_row = row_indices[size-1];

  unsigned int i = loop_end + threadIdx.x;
  while (1)
  {
    //load into shared memory (coalesced access):
    if (i < nnz)
    {
      element_buffer[threadIdx.x] = elements[i];
      unsigned int tmp = column_indices[i];
      col_index_buffer[threadIdx.x] = tmp;
      vector_buffer[threadIdx.x] = vector[tmp];
    }

    __syncthreads();

    //now a single thread does the remaining work in shared memory:
    if (threadIdx.x == 0)
    {
      // traverse through all the loaded data from back to front:
      for (unsigned int k2=0; k2<blockDim.x; ++k2)
      {
        unsigned int k = (blockDim.x - k2) - 1;

        if (i+k >= nnz)
          continue;

        if (col_index_buffer[k] > row_at_window_start) //row solved before this window: buffered copy is valid
          current_vector_entry -= element_buffer[k] * vector_buffer[k];
        else if (col_index_buffer[k] > current_row) //row solved within this window: re-read from global memory
          current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]];
        else if (col_index_buffer[k] == current_row)
          diagonal_entry = element_buffer[k];

        if (i+k == next_row) //current row is finished. Write back result
        {
          vector[current_row] = current_vector_entry / diagonal_entry;
          if (current_row > 0) //load next row's data
          {
            --current_row;
            next_row = row_indices[current_row];
            current_vector_entry = vector[current_row];
          }
        }

      } // for k

      row_at_window_start = current_row;
    } // if (threadIdx.x == 0)

    __syncthreads();

    if (i < blockDim.x)
      break;

    i -= blockDim.x;
  } //for i
}



//
// transposed
//


// Reference variant of the transposed forward elimination: a single thread
// block walks all rows sequentially and scatters updates to the entries
// right of the diagonal (no diagonal division).
template <typename T>
__global__ void csr_trans_lu_forward_kernel2(
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          const T * elements,
          T * vector,
          unsigned int size)
{
  for (unsigned int row = 0; row < size; ++row)
  {
    T result_entry = vector[row];

    unsigned int row_start = row_indices[row];
    unsigned int row_stop  = row_indices[row + 1];
    for (unsigned int entry_index = row_start + threadIdx.x; entry_index < row_stop; entry_index += blockDim.x)
    {
      unsigned int col_index = column_indices[entry_index];
      if (col_index > row)
        vector[col_index] -= result_entry * elements[entry_index];
    }

    __syncthreads();
  }
}

// Forward elimination with the transpose of a unit lower triangular CSR matrix:
// the nonzeros are processed in windows of blockDim.x entries, each thread
// determines the row of its entry via a shared-memory lookahead and scatters
// its update once that row becomes active.
template <typename T>
__global__ void csr_trans_unit_lu_forward_kernel(
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          const T * elements,
          T * vector,
          unsigned int size)
{
  __shared__ unsigned int row_index_lookahead[256];
  __shared__ unsigned int row_index_buffer[256];

  unsigned int row_index;
  unsigned int col_index;
  T matrix_entry;
  unsigned int nnz = row_indices[size];
  unsigned int row_at_window_start = 0;
  unsigned int row_at_window_end = 0;
  unsigned int loop_end = ( (nnz - 1) / blockDim.x + 1) * blockDim.x;

  for (unsigned int i = threadIdx.x; i < loop_end; i += blockDim.x)
  {
    col_index    = (i < nnz) ? column_indices[i] : 0;
    matrix_entry = (i < nnz) ? elements[i] : 0;
    row_index_lookahead[threadIdx.x] = (row_at_window_start + threadIdx.x < size) ? row_indices[row_at_window_start + threadIdx.x] : size - 1;

    __syncthreads();

    if (i < nnz)
    {
      unsigned int row_index_inc = 0;
      while (i >= row_index_lookahead[row_index_inc + 1])
        ++row_index_inc;
      row_index = row_at_window_start + row_index_inc;
      row_index_buffer[threadIdx.x] = row_index;
    }
    else
    {
      row_index = size+1;
      row_index_buffer[threadIdx.x] = size - 1;
    }

    __syncthreads();

    row_at_window_start = row_index_buffer[0];
    row_at_window_end   = row_index_buffer[blockDim.x - 1];

    //forward elimination
    for (unsigned int row = row_at_window_start; row <= row_at_window_end; ++row)
    {
      T result_entry = vector[row];

      if ( (row_index == row) && (col_index > row) )
        vector[col_index] -= result_entry * matrix_entry;

      __syncthreads();
    }

    row_at_window_start = row_at_window_end;
  }

}

// Forward elimination with the transpose of a lower triangular CSR matrix with
// non-unit diagonal: same windowed scheme, but row values are divided by the
// separately supplied diagonal entries, and the vector is rescaled at the end.
template <typename T>
__global__ void csr_trans_lu_forward_kernel(
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          const T * elements,
          const T * diagonal_entries,
          T * vector,
          unsigned int size)
{
  __shared__ unsigned int row_index_lookahead[256];
  __shared__ unsigned int row_index_buffer[256];

  unsigned int row_index;
  unsigned int col_index;
  T matrix_entry;
  unsigned int nnz = row_indices[size];
  unsigned int row_at_window_start = 0;
  unsigned int row_at_window_end = 0;
  unsigned int loop_end = ( (nnz - 1) / blockDim.x + 1) * blockDim.x;

  for (unsigned int i = threadIdx.x; i < loop_end; i += blockDim.x)
  {
    col_index    = (i < nnz) ? column_indices[i] : 0;
    matrix_entry = (i < nnz) ? elements[i] : 0;
    row_index_lookahead[threadIdx.x] = (row_at_window_start + threadIdx.x < size) ? row_indices[row_at_window_start + threadIdx.x] : size - 1;

    __syncthreads();

    if (i < nnz)
    {
      unsigned int row_index_inc = 0;
      while (i >= row_index_lookahead[row_index_inc + 1])
        ++row_index_inc;
      row_index = row_at_window_start + row_index_inc;
      row_index_buffer[threadIdx.x] = row_index;
    }
    else
    {
      row_index = size+1;
      row_index_buffer[threadIdx.x] = size - 1;
    }

    __syncthreads();

    row_at_window_start = row_index_buffer[0];
    row_at_window_end   = row_index_buffer[blockDim.x - 1];

    //forward elimination
    for (unsigned int row = row_at_window_start; row <= row_at_window_end; ++row)
    {
      T result_entry = vector[row] / diagonal_entries[row];

      if ( (row_index == row) && (col_index > row) )
        vector[col_index] -= result_entry * matrix_entry;

      __syncthreads();
    }

    row_at_window_start = row_at_window_end;
  }

  // final step: Divide vector by diagonal entries:
  for (unsigned int i = threadIdx.x; i < size; i += blockDim.x)
    vector[i] /= diagonal_entries[i];

}


// Backward elimination with the transpose of a unit upper triangular CSR matrix:
// the nonzeros are processed from back to front in windows of blockDim.x entries.
template <typename T>
__global__ void csr_trans_unit_lu_backward_kernel(
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          const T * elements,
          T * vector,
          unsigned int size)
{
  __shared__ unsigned int row_index_lookahead[256];
  __shared__ unsigned int row_index_buffer[256];

  unsigned int row_index;
  unsigned int col_index;
  T matrix_entry;
  unsigned int nnz = row_indices[size];
  unsigned int row_at_window_start = size;
  unsigned int row_at_window_end;
  unsigned int loop_end = ( (nnz - 1) / blockDim.x + 1) * blockDim.x;

  for (unsigned int i2 = threadIdx.x; i2 < loop_end; i2 += blockDim.x)
  {
    unsigned int i = (nnz - i2) - 1;
    col_index    = (i2 < nnz) ? column_indices[i] : 0;
    matrix_entry = (i2 < nnz) ? elements[i] : 0;
    row_index_lookahead[threadIdx.x] = (row_at_window_start >= threadIdx.x) ? row_indices[row_at_window_start - threadIdx.x] : 0;

    __syncthreads();

    if (i2 < nnz)
    {
      unsigned int row_index_dec = 0;
      while (row_index_lookahead[row_index_dec] > i)
        ++row_index_dec;
      row_index = row_at_window_start - row_index_dec;
      row_index_buffer[threadIdx.x] = row_index;
    }
    else
    {
      row_index = size+1;
      row_index_buffer[threadIdx.x] = 0;
    }

    __syncthreads();

    row_at_window_start = row_index_buffer[0];
    row_at_window_end   = row_index_buffer[blockDim.x - 1];

    //backward elimination
    for (unsigned int row2 = 0; row2 <= (row_at_window_start - row_at_window_end); ++row2)
    {
      unsigned int row = row_at_window_start - row2;
      T result_entry = vector[row];

      if ( (row_index == row) && (col_index < row) )
        vector[col_index] -= result_entry * matrix_entry;

      __syncthreads();
    }

    row_at_window_start = row_at_window_end;
  }

}



// Reference variant of the transposed backward elimination: a single thread
// block walks the rows from last to first, divides each row value by its
// diagonal entry, and scatters updates to the entries left of the diagonal.
template <typename T>
__global__ void csr_trans_lu_backward_kernel2(
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          const T * elements,
          const T * diagonal_entries,
          T * vector,
          unsigned int size)
{
  T result_entry = 0;

  //backward elimination, using U and D:
  for (unsigned int row2 = 0; row2 < size; ++row2)
  {
    unsigned int row = (size - row2) - 1;
    result_entry = vector[row] / diagonal_entries[row];

    unsigned int row_start = row_indices[row];
    unsigned int row_stop  = row_indices[row + 1];
    for (unsigned int entry_index = row_start + threadIdx.x; entry_index < row_stop; entry_index += blockDim.x) //each thread handles a strided subset of the row
    {
      unsigned int col_index = column_indices[entry_index];
      if (col_index < row)
        vector[col_index] -= result_entry * elements[entry_index];
    }

    __syncthreads();

    if (threadIdx.x == 0)
      vector[row] = result_entry;
  }
}


// Backward elimination with the transpose of an upper triangular CSR matrix
// with non-unit diagonal: windowed variant, with a final rescaling of the
// vector by the diagonal entries.
template <typename T>
__global__ void csr_trans_lu_backward_kernel(
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          const T * elements,
          const T * diagonal_entries,
          T * vector,
          unsigned int size)
{
  __shared__ unsigned int row_index_lookahead[256];
  __shared__ unsigned int row_index_buffer[256];

  unsigned int row_index;
  unsigned int col_index;
  T matrix_entry;
  unsigned int nnz = row_indices[size];
  unsigned int row_at_window_start = size;
  unsigned int row_at_window_end;
  unsigned int loop_end = ( (nnz - 1) / blockDim.x + 1) * blockDim.x;

  for (unsigned int i2 = threadIdx.x; i2 < loop_end; i2 += blockDim.x)
  {
    unsigned int i = (nnz - i2) - 1;
    col_index    = (i2 < nnz) ? column_indices[i] : 0;
    matrix_entry = (i2 < nnz) ? elements[i] : 0;
    row_index_lookahead[threadIdx.x] = (row_at_window_start >= threadIdx.x) ? row_indices[row_at_window_start - threadIdx.x] : 0;

    __syncthreads();

    if (i2 < nnz)
    {
      unsigned int row_index_dec = 0;
      while (row_index_lookahead[row_index_dec] > i)
        ++row_index_dec;
      row_index = row_at_window_start - row_index_dec;
      row_index_buffer[threadIdx.x] = row_index;
    }
    else
    {
      row_index = size+1;
      row_index_buffer[threadIdx.x] = 0;
    }

    __syncthreads();

    row_at_window_start = row_index_buffer[0];
    row_at_window_end   = row_index_buffer[blockDim.x - 1];

    //backward elimination
    for (unsigned int row2 = 0; row2 <= (row_at_window_start - row_at_window_end); ++row2)
    {
      unsigned int row = row_at_window_start - row2;
      T result_entry = vector[row] / diagonal_entries[row];

      if ( (row_index == row) && (col_index < row) )
        vector[col_index] -= result_entry * matrix_entry;

      __syncthreads();
    }

    row_at_window_start = row_at_window_end;
  }


  // final step: Divide vector by diagonal entries:
  for (unsigned int i = threadIdx.x; i < size; i += blockDim.x)
    vector[i] /= diagonal_entries[i];

}


// Block variant of the transposed unit-lower forward substitution, used by the
// block-ILU preconditioners: each CUDA block processes the columns of one
// diagonal block, given by consecutive pairs of entries in block_offsets.
template <typename T>
__global__ void csr_block_trans_unit_lu_forward(
          const unsigned int * row_jumper_L,      //L part (note that L is transposed in memory)
          const unsigned int * column_indices_L,
          const T * elements_L,
          const unsigned int * block_offsets,
          T * result,
          unsigned int size)
{
  unsigned int col_start = block_offsets[2*blockIdx.x];
  unsigned int col_stop  = block_offsets[2*blockIdx.x+1];
  unsigned int row_start = row_jumper_L[col_start];
  unsigned int row_stop;
  T result_entry = 0;

  if (col_start >= col_stop)
    return;

  //forward elimination, using L:
  for (unsigned int col = col_start; col < col_stop; ++col)
  {
    result_entry = result[col];
    row_stop = row_jumper_L[col + 1];
    for (unsigned int buffer_index = row_start + threadIdx.x; buffer_index < row_stop; buffer_index += blockDim.x)
      result[column_indices_L[buffer_index]] -= result_entry * elements_L[buffer_index];
    row_start = row_stop; //for next iteration (avoid unnecessary loads from GPU RAM)
    __syncthreads();
  }

}


// Block variant of the transposed backward substitution with non-unit diagonal,
// used by the block-ILU preconditioners: each CUDA block processes one diagonal
// block and finally rescales its part of the result by the diagonal of U.
template <typename T>
__global__ void csr_block_trans_lu_backward(
          const unsigned int * row_jumper_U,      //U part (note that U is transposed in memory)
          const unsigned int * column_indices_U,
          const T * elements_U,
          const T * diagonal_U,
          const unsigned int * block_offsets,
          T * result,
          unsigned int size)
{
  unsigned int col_start = block_offsets[2*blockIdx.x];
  unsigned int col_stop  = block_offsets[2*blockIdx.x+1];
  unsigned int row_start;
  unsigned int row_stop;
  T result_entry = 0;

  if (col_start >= col_stop)
    return;

  //backward elimination, using U and diagonal_U
  for (unsigned int iter = 0; iter < col_stop - col_start; ++iter)
  {
    unsigned int col = (col_stop - iter) - 1;
    result_entry = result[col] / diagonal_U[col];
    row_start = row_jumper_U[col];
    row_stop  = row_jumper_U[col + 1];
    for (unsigned int buffer_index = row_start + threadIdx.x; buffer_index < row_stop; buffer_index += blockDim.x)
      result[column_indices_U[buffer_index]] -= result_entry * elements_U[buffer_index];

    __syncthreads();
  }

  //divide result vector by diagonal:
  for (unsigned int col = col_start + threadIdx.x; col < col_stop; col += blockDim.x)
    result[col] /= diagonal_U[col];
}



//
// Coordinate Matrix
//




//
// ELL Matrix
//



//
// Hybrid Matrix
//



} // namespace cuda
} //namespace linalg
} //namespace viennacl


#endif
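
Usage note: the non-transposed kernels above keep the entire substitution state inside a single thread block, so they are meant to be launched with exactly one block whose size does not exceed the 128-entry shared-memory buffers. The following host-side sketch is illustrative only: the helper name csr_lower_solve_inplace and the raw-pointer setup are assumptions made for this example, and the single block of 128 threads is chosen to match the shared-memory window size. ViennaCL's actual dispatch happens inside its inplace_solve() routines.

  #include <cuda_runtime.h>
  #include "viennacl/linalg/cuda/sparse_matrix_operations_solve.hpp"

  // Illustrative helper (hypothetical, not part of ViennaCL): solve L*x = b in place,
  // where the CSR arrays describe a lower triangular matrix with non-unit diagonal.
  template <typename T>
  void csr_lower_solve_inplace(const unsigned int * row_indices,    // device pointer, n+1 entries
                               const unsigned int * column_indices, // device pointer, nnz entries
                               const T            * elements,       // device pointer, nnz entries
                               T                  * rhs,            // device pointer, overwritten with x
                               unsigned int         n)
  {
    // One block only: thread 0 serializes the substitution while the remaining
    // threads of the block buffer nonzeros in shared memory; additional blocks
    // would redundantly repeat (and race on) the same work.
    viennacl::linalg::cuda::csr_lu_forward_kernel<<<1, 128>>>(row_indices, column_indices, elements, rhs, n);
    cudaDeviceSynchronize();
  }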