viennacl/linalg/cuda/sparse_matrix_operations_solve.hpp
#ifndef VIENNACL_LINALG_CUDA_SPARSE_MATRIX_OPERATIONS_SOLVE_HPP_
#define VIENNACL_LINALG_CUDA_SPARSE_MATRIX_OPERATIONS_SOLVE_HPP_

/* =========================================================================
   Copyright (c) 2010-2014, Institute for Microelectronics,
                            Institute for Analysis and Scientific Computing,
                            TU Wien.
   Portions of this software are copyright by UChicago Argonne, LLC.

                            -----------------
                  ViennaCL - The Vienna Computing Library
                            -----------------

   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at

   (A list of authors and contributors can be found in the PDF manual)

   License:         MIT (X11), see file LICENSE in the base directory
============================================================================= */

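/** @file viennacl/linalg/cuda/sparse_matrix_operations_solve.hpp
    @brief Implementations of direct triangular solvers for sparse matrices using CUDA
*/
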
#include "viennacl/forwards.h"

namespace viennacl
{
  namespace linalg
  {
    namespace cuda
    {
      //
      // Compressed matrix
      //

      //
      // non-transposed
      //

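      /** @brief In-place forward substitution for a unit lower triangular CSR matrix.
      *
      * Intended for launch with a single thread block of at most 128 threads (the
      * shared-memory buffers hold 128 entries): the block stages windows of
      * nonzeros in shared memory, and thread 0 carries out the inherently
      * sequential substitution within each window.
      */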
      template <typename T>
      __global__ void csr_unit_lu_forward_kernel(
                const unsigned int * row_indices,
                const unsigned int * column_indices,
                const T * elements,
                      T * vector,
                unsigned int size)
      {
        __shared__  unsigned int col_index_buffer[128];
        __shared__  T element_buffer[128];
        __shared__  T vector_buffer[128];

        unsigned int nnz = row_indices[size];
        unsigned int current_row = 0;
        unsigned int row_at_window_start = 0;
        T current_vector_entry = vector[0];
        unsigned int loop_end = (nnz / blockDim.x + 1) * blockDim.x;
        unsigned int next_row = row_indices[1];

        for (unsigned int i = threadIdx.x; i < loop_end; i += blockDim.x)
        {
          //load into shared memory (coalesced access):
          if (i < nnz)
          {
            element_buffer[threadIdx.x] = elements[i];
            unsigned int tmp = column_indices[i];
            col_index_buffer[threadIdx.x] = tmp;
            vector_buffer[threadIdx.x] = vector[tmp];
          }

          __syncthreads();

          //now a single thread does the remaining work in shared memory:
          if (threadIdx.x == 0)
          {
            // traverse through all the loaded data:
            for (unsigned int k=0; k<blockDim.x; ++k)
            {
              if (current_row < size && i+k == next_row) //current row is finished. Write back result
              {
                vector[current_row] = current_vector_entry;
                ++current_row;
                if (current_row < size) //load next row's data
                {
                  next_row = row_indices[current_row+1];
                  current_vector_entry = vector[current_row];
                }
              }

              if (current_row < size && col_index_buffer[k] < current_row) //substitute
              {
                if (col_index_buffer[k] < row_at_window_start) //entry was finalized before this window, buffered copy is valid
                  current_vector_entry -= element_buffer[k] * vector_buffer[k];
                else if (col_index_buffer[k] < current_row) //entry was updated within this window, re-read from global memory
                  current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]];
              }

            } // for k

            row_at_window_start = current_row;
          } // if (threadIdx.x == 0)

          __syncthreads();
        } //for i
      }



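      /** @brief In-place forward substitution for a lower triangular CSR matrix with non-unit diagonal.
      *
      * Same single-block windowing scheme as the unit variant above; each finished
      * row is additionally divided by its diagonal entry, which is picked up while
      * scanning the window.
      */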
      template <typename T>
      __global__ void csr_lu_forward_kernel(
                const unsigned int * row_indices,
                const unsigned int * column_indices,
                const T * elements,
                      T * vector,
                unsigned int size)
      {
        __shared__  unsigned int col_index_buffer[128];
        __shared__  T element_buffer[128];
        __shared__  T vector_buffer[128];

        unsigned int nnz = row_indices[size];
        unsigned int current_row = 0;
        unsigned int row_at_window_start = 0;
        T current_vector_entry = vector[0];
        T diagonal_entry = 0;
        unsigned int loop_end = (nnz / blockDim.x + 1) * blockDim.x;
        unsigned int next_row = row_indices[1];

        for (unsigned int i = threadIdx.x; i < loop_end; i += blockDim.x)
        {
          //load into shared memory (coalesced access):
          if (i < nnz)
          {
            element_buffer[threadIdx.x] = elements[i];
            unsigned int tmp = column_indices[i];
            col_index_buffer[threadIdx.x] = tmp;
            vector_buffer[threadIdx.x] = vector[tmp];
          }

          __syncthreads();

          //now a single thread does the remaining work in shared memory:
          if (threadIdx.x == 0)
          {
            // traverse through all the loaded data:
            for (unsigned int k=0; k<blockDim.x; ++k)
            {
              if (current_row < size && i+k == next_row) //current row is finished. Write back result
              {
                vector[current_row] = current_vector_entry / diagonal_entry;
                ++current_row;
                if (current_row < size) //load next row's data
                {
                  next_row = row_indices[current_row+1];
                  current_vector_entry = vector[current_row];
                }
              }

              if (current_row < size && col_index_buffer[k] < current_row) //substitute
              {
                if (col_index_buffer[k] < row_at_window_start) //entry was finalized before this window, buffered copy is valid
                  current_vector_entry -= element_buffer[k] * vector_buffer[k];
                else if (col_index_buffer[k] < current_row) //entry was updated within this window, re-read from global memory
                  current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]];
              }
              else if (col_index_buffer[k] == current_row)
                diagonal_entry = element_buffer[k];

            } // for k

            row_at_window_start = current_row;
          } // if (threadIdx.x == 0)

          __syncthreads();
        } //for i
      }


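      /** @brief In-place backward substitution for a unit upper triangular CSR matrix.
      *
      * The windows of nonzeros are processed from back to front by a single thread
      * block, again with thread 0 performing the sequential substitution.
      */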
      template <typename T>
      __global__ void csr_unit_lu_backward_kernel(
                const unsigned int * row_indices,
                const unsigned int * column_indices,
                const T * elements,
                      T * vector,
                unsigned int size)
      {
        __shared__  unsigned int col_index_buffer[128];
        __shared__  T element_buffer[128];
        __shared__  T vector_buffer[128];

        unsigned int nnz = row_indices[size];
        unsigned int current_row = size-1;
        unsigned int row_at_window_start = size-1;
        T current_vector_entry = vector[size-1];
        unsigned int loop_end = ( (nnz - 1) / blockDim.x) * blockDim.x;
        unsigned int next_row = row_indices[size-1];

        unsigned int i = loop_end + threadIdx.x;
        while (1)
        {
          //load into shared memory (coalesced access):
          if (i < nnz)
          {
            element_buffer[threadIdx.x] = elements[i];
            unsigned int tmp = column_indices[i];
            col_index_buffer[threadIdx.x] = tmp;
            vector_buffer[threadIdx.x] = vector[tmp];
          }

          __syncthreads();

          //now a single thread does the remaining work in shared memory:
          if (threadIdx.x == 0)
          {
            // traverse through all the loaded data from back to front:
            for (unsigned int k2=0; k2<blockDim.x; ++k2)
            {
              unsigned int k = (blockDim.x - k2) - 1;

              if (i+k >= nnz)
                continue;

              if (col_index_buffer[k] > row_at_window_start) //entry was finalized before this window, buffered copy is valid
                current_vector_entry -= element_buffer[k] * vector_buffer[k];
              else if (col_index_buffer[k] > current_row) //entry was updated within this window, re-read from global memory
                current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]];

              if (i+k == next_row) //current row is finished. Write back result
              {
                vector[current_row] = current_vector_entry;
                if (current_row > 0) //load next row's data
                {
                  --current_row;
                  next_row = row_indices[current_row];
                  current_vector_entry = vector[current_row];
                }
              }


            } // for k

            row_at_window_start = current_row;
          } // if (threadIdx.x == 0)

          __syncthreads();

          if (i < blockDim.x)
            break;

          i -= blockDim.x;
        } //while
      }



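      /** @brief In-place backward substitution for an upper triangular CSR matrix with non-unit diagonal; finished rows are divided by the diagonal entry captured while scanning the window. */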
      template <typename T>
      __global__ void csr_lu_backward_kernel(
                const unsigned int * row_indices,
                const unsigned int * column_indices,
                const T * elements,
                      T * vector,
                unsigned int size)
      {
        __shared__  unsigned int col_index_buffer[128];
        __shared__  T element_buffer[128];
        __shared__  T vector_buffer[128];

        unsigned int nnz = row_indices[size];
        unsigned int current_row = size-1;
        unsigned int row_at_window_start = size-1;
        T current_vector_entry = vector[size-1];
        T diagonal_entry;
        unsigned int loop_end = ( (nnz - 1) / blockDim.x) * blockDim.x;
        unsigned int next_row = row_indices[size-1];

        unsigned int i = loop_end + threadIdx.x;
        while (1)
        {
          //load into shared memory (coalesced access):
          if (i < nnz)
          {
            element_buffer[threadIdx.x] = elements[i];
            unsigned int tmp = column_indices[i];
            col_index_buffer[threadIdx.x] = tmp;
            vector_buffer[threadIdx.x] = vector[tmp];
          }

          __syncthreads();

          //now a single thread does the remaining work in shared memory:
          if (threadIdx.x == 0)
          {
            // traverse through all the loaded data from back to front:
            for (unsigned int k2=0; k2<blockDim.x; ++k2)
            {
              unsigned int k = (blockDim.x - k2) - 1;

              if (i+k >= nnz)
                continue;

              if (col_index_buffer[k] > row_at_window_start) //entry was finalized before this window, buffered copy is valid
                current_vector_entry -= element_buffer[k] * vector_buffer[k];
              else if (col_index_buffer[k] > current_row) //entry was updated within this window, re-read from global memory
                current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]];
              else if (col_index_buffer[k] == current_row)
                diagonal_entry = element_buffer[k];

              if (i+k == next_row) //current row is finished. Write back result
              {
                vector[current_row] = current_vector_entry / diagonal_entry;
                if (current_row > 0) //load next row's data
                {
                  --current_row;
                  next_row = row_indices[current_row];
                  current_vector_entry = vector[current_row];
                }
              }


            } // for k

            row_at_window_start = current_row;
          } // if (threadIdx.x == 0)

          __syncthreads();

          if (i < blockDim.x)
            break;

          i -= blockDim.x;
        } //while
      }

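      // Usage sketch (hypothetical, not part of the original header): since the
      // four kernels above serialize the substitution in thread 0, they are meant
      // to be launched with a single thread block whose size matches the 128-entry
      // shared-memory window. Assuming device pointers row_indices, column_indices,
      // elements and rhs (placeholder name) for an n-by-n CSR matrix, a host-side
      // call could read:
      //
      //   csr_unit_lu_forward_kernel<<<1, 128>>>(row_indices, column_indices,
      //                                          elements, rhs, n);
      //   cudaDeviceSynchronize();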


      //
      // transposed
      //


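      /** @brief Simple variant of transposed forward substitution: rows are visited in order, and once vector[row] is final, the threads of the block scatter its contribution to all entries with column index greater than row. Note that no division by a diagonal entry takes place, i.e. a unit diagonal is assumed. */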
      template <typename T>
      __global__ void csr_trans_lu_forward_kernel2(
                const unsigned int * row_indices,
                const unsigned int * column_indices,
                const T * elements,
                      T * vector,
                unsigned int size)
      {
        for (unsigned int row = 0; row < size; ++row)
        {
          T result_entry = vector[row];

          unsigned int row_start = row_indices[row];
          unsigned int row_stop  = row_indices[row + 1];
          for (unsigned int entry_index = row_start + threadIdx.x; entry_index < row_stop; entry_index += blockDim.x)
          {
            unsigned int col_index = column_indices[entry_index];
            if (col_index > row)
              vector[col_index] -= result_entry * elements[entry_index];
          }

          __syncthreads();
        }
      }

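      /** @brief Transposed forward substitution with unit diagonal.
      *
      * A single thread block streams the nonzeros once; each thread locates the row
      * of its nonzero via a shared-memory lookahead of row_indices, and the rows of
      * the current window are then eliminated in order.
      */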
      template <typename T>
      __global__ void csr_trans_unit_lu_forward_kernel(
                const unsigned int * row_indices,
                const unsigned int * column_indices,
                const T * elements,
                      T * vector,
                unsigned int size)
      {
        __shared__  unsigned int row_index_lookahead[256];
        __shared__  unsigned int row_index_buffer[256];

        unsigned int row_index;
        unsigned int col_index;
        T matrix_entry;
        unsigned int nnz = row_indices[size];
        unsigned int row_at_window_start = 0;
        unsigned int row_at_window_end = 0;
        unsigned int loop_end = ( (nnz - 1) / blockDim.x + 1) * blockDim.x;

        for (unsigned int i = threadIdx.x; i < loop_end; i += blockDim.x)
        {
          col_index    = (i < nnz) ? column_indices[i] : 0;
          matrix_entry = (i < nnz) ? elements[i]       : 0;
          row_index_lookahead[threadIdx.x] = (row_at_window_start + threadIdx.x < size) ? row_indices[row_at_window_start + threadIdx.x] : size - 1;

          __syncthreads();

          if (i < nnz)
          {
            unsigned int row_index_inc = 0;
            while (i >= row_index_lookahead[row_index_inc + 1])
              ++row_index_inc;
            row_index = row_at_window_start + row_index_inc;
            row_index_buffer[threadIdx.x] = row_index;
          }
          else
          {
            row_index = size+1;
            row_index_buffer[threadIdx.x] = size - 1;
          }

          __syncthreads();

          row_at_window_start = row_index_buffer[0];
          row_at_window_end   = row_index_buffer[blockDim.x - 1];

          //forward elimination
          for (unsigned int row = row_at_window_start; row <= row_at_window_end; ++row)
          {
            T result_entry = vector[row];

            if ( (row_index == row) && (col_index > row) )
              vector[col_index] -= result_entry * matrix_entry;

            __syncthreads();
          }

          row_at_window_start = row_at_window_end;
        }

      }

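      /** @brief Transposed forward substitution with non-unit diagonal: elimination scales each result entry by the reciprocal of its diagonal entry, and a final sweep divides the vector by the diagonal entries. */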
      template <typename T>
      __global__ void csr_trans_lu_forward_kernel(
                const unsigned int * row_indices,
                const unsigned int * column_indices,
                const T * elements,
                const T * diagonal_entries,
                      T * vector,
                unsigned int size)
      {
        __shared__  unsigned int row_index_lookahead[256];
        __shared__  unsigned int row_index_buffer[256];

        unsigned int row_index;
        unsigned int col_index;
        T matrix_entry;
        unsigned int nnz = row_indices[size];
        unsigned int row_at_window_start = 0;
        unsigned int row_at_window_end = 0;
        unsigned int loop_end = ( (nnz - 1) / blockDim.x + 1) * blockDim.x;

        for (unsigned int i = threadIdx.x; i < loop_end; i += blockDim.x)
        {
          col_index    = (i < nnz) ? column_indices[i] : 0;
          matrix_entry = (i < nnz) ? elements[i]       : 0;
          row_index_lookahead[threadIdx.x] = (row_at_window_start + threadIdx.x < size) ? row_indices[row_at_window_start + threadIdx.x] : size - 1;

          __syncthreads();

          if (i < nnz)
          {
            unsigned int row_index_inc = 0;
            while (i >= row_index_lookahead[row_index_inc + 1])
              ++row_index_inc;
            row_index = row_at_window_start + row_index_inc;
            row_index_buffer[threadIdx.x] = row_index;
          }
          else
          {
            row_index = size+1;
            row_index_buffer[threadIdx.x] = size - 1;
          }

          __syncthreads();

          row_at_window_start = row_index_buffer[0];
          row_at_window_end   = row_index_buffer[blockDim.x - 1];

          //forward elimination
          for (unsigned int row = row_at_window_start; row <= row_at_window_end; ++row)
          {
            T result_entry = vector[row] / diagonal_entries[row];

            if ( (row_index == row) && (col_index > row) )
              vector[col_index] -= result_entry * matrix_entry;

            __syncthreads();
          }

          row_at_window_start = row_at_window_end;
        }

        // final step: Divide vector by diagonal entries:
        for (unsigned int i = threadIdx.x; i < size; i += blockDim.x)
          vector[i] /= diagonal_entries[i];

      }

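      /** @brief Transposed backward substitution with unit diagonal; the nonzeros are streamed from back to front, using the same shared-memory lookahead to map each nonzero to its row. */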
      template <typename T>
      __global__ void csr_trans_unit_lu_backward_kernel(
                const unsigned int * row_indices,
                const unsigned int * column_indices,
                const T * elements,
                      T * vector,
                unsigned int size)
      {
        __shared__  unsigned int row_index_lookahead[256];
        __shared__  unsigned int row_index_buffer[256];

        unsigned int row_index;
        unsigned int col_index;
        T matrix_entry;
        unsigned int nnz = row_indices[size];
        unsigned int row_at_window_start = size;
        unsigned int row_at_window_end;
        unsigned int loop_end = ( (nnz - 1) / blockDim.x + 1) * blockDim.x;

        for (unsigned int i2 = threadIdx.x; i2 < loop_end; i2 += blockDim.x)
        {
          unsigned int i = (nnz - i2) - 1;
          col_index    = (i2 < nnz) ? column_indices[i] : 0;
          matrix_entry = (i2 < nnz) ? elements[i]       : 0;
          row_index_lookahead[threadIdx.x] = (row_at_window_start >= threadIdx.x) ? row_indices[row_at_window_start - threadIdx.x] : 0;

          __syncthreads();

          if (i2 < nnz)
          {
            unsigned int row_index_dec = 0;
            while (row_index_lookahead[row_index_dec] > i)
              ++row_index_dec;
            row_index = row_at_window_start - row_index_dec;
            row_index_buffer[threadIdx.x] = row_index;
          }
          else
          {
            row_index = size+1;
            row_index_buffer[threadIdx.x] = 0;
          }

          __syncthreads();

          row_at_window_start = row_index_buffer[0];
          row_at_window_end   = row_index_buffer[blockDim.x - 1];

          //backward elimination
          for (unsigned int row2 = 0; row2 <= (row_at_window_start - row_at_window_end); ++row2)
          {
            unsigned int row = row_at_window_start - row2;
            T result_entry = vector[row];

            if ( (row_index == row) && (col_index < row) )
              vector[col_index] -= result_entry * matrix_entry;

            __syncthreads();
          }

          row_at_window_start = row_at_window_end;
        }

      }



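      /** @brief Simple variant of transposed backward substitution with non-unit diagonal (using U and D): rows are visited from last to first, and the block scatters each scaled result entry to the entries with column index less than row. */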
      template <typename T>
      __global__ void csr_trans_lu_backward_kernel2(
                const unsigned int * row_indices,
                const unsigned int * column_indices,
                const T * elements,
                const T * diagonal_entries,
                      T * vector,
                unsigned int size)
      {
        T result_entry = 0;

        //backward elimination, using U and D:
        for (unsigned int row2 = 0; row2 < size; ++row2)
        {
          unsigned int row = (size - row2) - 1;
          result_entry = vector[row] / diagonal_entries[row];

          unsigned int row_start = row_indices[row];
          unsigned int row_stop  = row_indices[row + 1];
          for (unsigned int entry_index = row_start + threadIdx.x; entry_index < row_stop; entry_index += blockDim.x)
          {
            unsigned int col_index = column_indices[entry_index];
            if (col_index < row)
              vector[col_index] -= result_entry * elements[entry_index];
          }

          __syncthreads();

          if (threadIdx.x == 0)
            vector[row] = result_entry;
        }
      }

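      /** @brief Transposed backward substitution with non-unit diagonal, using the windowed lookahead scheme of the forward variant; a final sweep divides the vector by the diagonal entries. */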
      template <typename T>
      __global__ void csr_trans_lu_backward_kernel(
                const unsigned int * row_indices,
                const unsigned int * column_indices,
                const T * elements,
                const T * diagonal_entries,
                      T * vector,
                unsigned int size)
      {
        __shared__  unsigned int row_index_lookahead[256];
        __shared__  unsigned int row_index_buffer[256];

        unsigned int row_index;
        unsigned int col_index;
        T matrix_entry;
        unsigned int nnz = row_indices[size];
        unsigned int row_at_window_start = size;
        unsigned int row_at_window_end;
        unsigned int loop_end = ( (nnz - 1) / blockDim.x + 1) * blockDim.x;

        for (unsigned int i2 = threadIdx.x; i2 < loop_end; i2 += blockDim.x)
        {
          unsigned int i = (nnz - i2) - 1;
          col_index    = (i2 < nnz) ? column_indices[i] : 0;
          matrix_entry = (i2 < nnz) ? elements[i]       : 0;
          row_index_lookahead[threadIdx.x] = (row_at_window_start >= threadIdx.x) ? row_indices[row_at_window_start - threadIdx.x] : 0;

          __syncthreads();

          if (i2 < nnz)
          {
            unsigned int row_index_dec = 0;
            while (row_index_lookahead[row_index_dec] > i)
              ++row_index_dec;
            row_index = row_at_window_start - row_index_dec;
            row_index_buffer[threadIdx.x] = row_index;
          }
          else
          {
            row_index = size+1;
            row_index_buffer[threadIdx.x] = 0;
          }

          __syncthreads();

          row_at_window_start = row_index_buffer[0];
          row_at_window_end   = row_index_buffer[blockDim.x - 1];

          //backward elimination
          for (unsigned int row2 = 0; row2 <= (row_at_window_start - row_at_window_end); ++row2)
          {
            unsigned int row = row_at_window_start - row2;
            T result_entry = vector[row] / diagonal_entries[row];

            if ( (row_index == row) && (col_index < row) )
              vector[col_index] -= result_entry * matrix_entry;

            __syncthreads();
          }

          row_at_window_start = row_at_window_end;
        }


        // final step: Divide vector by diagonal entries:
        for (unsigned int i = threadIdx.x; i < size; i += blockDim.x)
          vector[i] /= diagonal_entries[i];

      }

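      /** @brief Forward substitution with the transposed unit lower triangular factor of a block factorization (e.g. block ILU): each CUDA block independently processes the index range [block_offsets[2*blockIdx.x], block_offsets[2*blockIdx.x+1]) of its diagonal block. */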
      template <typename T>
      __global__ void csr_block_trans_unit_lu_forward(
                const unsigned int * row_jumper_L,      //L part (note that L is transposed in memory)
                const unsigned int * column_indices_L,
                const T * elements_L,
                const unsigned int * block_offsets,
                T * result,
                unsigned int size)
      {
        unsigned int col_start = block_offsets[2*blockIdx.x];
        unsigned int col_stop  = block_offsets[2*blockIdx.x+1];
        unsigned int row_start = row_jumper_L[col_start];
        unsigned int row_stop;
        T result_entry = 0;

        if (col_start >= col_stop)
          return;

        //forward elimination, using L:
        for (unsigned int col = col_start; col < col_stop; ++col)
        {
          result_entry = result[col];
          row_stop = row_jumper_L[col + 1];
          for (unsigned int buffer_index = row_start + threadIdx.x; buffer_index < row_stop; buffer_index += blockDim.x)
            result[column_indices_L[buffer_index]] -= result_entry * elements_L[buffer_index];
          row_start = row_stop; //for next iteration (avoid unnecessary loads from GPU RAM)
          __syncthreads();
        }

      }


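      /** @brief Backward substitution with the transposed upper triangular factor of a block factorization; mirrors the forward kernel above and finally divides its part of the result vector by the diagonal entries. */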
      template <typename T>
      __global__ void csr_block_trans_lu_backward(
                const unsigned int * row_jumper_U,      //U part (note that U is transposed in memory)
                const unsigned int * column_indices_U,
                const T * elements_U,
                const T * diagonal_U,
                const unsigned int * block_offsets,
                T * result,
                unsigned int size)
      {
        unsigned int col_start = block_offsets[2*blockIdx.x];
        unsigned int col_stop  = block_offsets[2*blockIdx.x+1];
        unsigned int row_start;
        unsigned int row_stop;
        T result_entry = 0;

        if (col_start >= col_stop)
          return;

        //backward elimination, using U and diagonal_U
        for (unsigned int iter = 0; iter < col_stop - col_start; ++iter)
        {
          unsigned int col = (col_stop - iter) - 1;
          result_entry = result[col] / diagonal_U[col];
          row_start = row_jumper_U[col];
          row_stop  = row_jumper_U[col + 1];
          for (unsigned int buffer_index = row_start + threadIdx.x; buffer_index < row_stop; buffer_index += blockDim.x)
            result[column_indices_U[buffer_index]] -= result_entry * elements_U[buffer_index];
          __syncthreads();
        }

        //divide result vector by diagonal:
        for (unsigned int col = col_start + threadIdx.x; col < col_stop; col += blockDim.x)
          result[col] /= diagonal_U[col];
      }
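      // Usage sketch (hypothetical, not part of the original header): the two block
      // kernels above run one CUDA block per diagonal block, with block_offsets
      // holding a (start, stop) index pair for each block. Assuming num_blocks
      // (placeholder name) such blocks and an n-by-n system, a host-side launch of
      // the forward sweep could read:
      //
      //   csr_block_trans_unit_lu_forward<<<num_blocks, 128>>>(
      //       row_jumper_L, column_indices_L, elements_L,
      //       block_offsets, result, n);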



      //
      // Coordinate Matrix
      //




      //
      // ELL Matrix
      //



      //
      // Hybrid Matrix
      //



    } // namespace cuda
  } //namespace linalg
} //namespace viennacl


#endif