ViennaCL - The Vienna Computing Library
1.5.1
#ifndef VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_PROD_HPP_
#define VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_PROD_HPP_

/* =========================================================================
   Copyright (c) 2010-2014, Institute for Microelectronics,
                            Institute for Analysis and Scientific Computing,
                            TU Wien.
   Portions of this software are copyright by UChicago Argonne, LLC.

                            -----------------
                  ViennaCL - The Vienna Computing Library
                            -----------------

   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at

   (A list of authors and contributors can be found in the PDF manual)

   License:         MIT (X11), see file LICENSE in the base directory
============================================================================= */

namespace viennacl
{
namespace linalg
{
namespace cuda
{

// matrix-matrix multiplication C = A * B
// matrix layouts: C...col_major, A...col_major, B...col_major
template <typename T>
__global__ void matrix_matrix_col_col_col_prod_AA_kernel(
          T alpha,
          const T * A,
          unsigned int A_row_start,
          unsigned int A_col_start,
          unsigned int A_row_inc,
          unsigned int A_col_inc,
          unsigned int A_row_size,
          unsigned int A_col_size,
          unsigned int A_internal_rows,
          unsigned int A_internal_cols,
          const T * B,
          unsigned int B_row_start,
          unsigned int B_col_start,
          unsigned int B_row_inc,
          unsigned int B_col_inc,
          unsigned int B_row_size,
          unsigned int B_col_size,
          unsigned int B_internal_rows,
          unsigned int B_internal_cols,
          T beta,
          T * C,
          unsigned int C_row_start,
          unsigned int C_col_start,
          unsigned int C_row_inc,
          unsigned int C_col_inc,
          unsigned int C_row_size,
          unsigned int C_col_size,
          unsigned int C_internal_rows,
          unsigned int C_internal_cols)
{

  __shared__ T bufA[272];
  __shared__ T bufB[272];

  vcl_size_t block_size = 16;//get_local_size(0);
  vcl_size_t row_block_id = blockIdx.x;
  vcl_size_t col_block_id = blockIdx.y;
  vcl_size_t row_thread_id = threadIdx.x;
  vcl_size_t col_thread_id = threadIdx.y;
  vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows;
  vcl_size_t aStep = block_size * A_col_inc * A_internal_rows;
  vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start;
  vcl_size_t bStep = block_size * B_row_inc;
  vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
  T Csub = 0;
  vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows;
  vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows;

  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
  for (vcl_size_t block = 0;
       block < block_num;
       ++block)
  {
    bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
    bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
    __syncthreads();
    T * bufAptr = bufA + row_thread_id_times_block_size;
    T * bufBptr = bufB + col_thread_id_times_block_size;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
    __syncthreads();
    aBegin += aStep;
    bBegin += bStep;
  }
  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
    C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
}

// matrix-matrix multiplication C = A * B^T
// matrix layouts: C...col_major, A...col_major, B...col_major
template <typename T>
__global__ void matrix_matrix_col_col_col_prod_AT_kernel(
          T alpha,
          const T * A,
          unsigned int A_row_start,
          unsigned int A_col_start,
          unsigned int A_row_inc,
          unsigned int A_col_inc,
          unsigned int A_row_size,
          unsigned int A_col_size,
          unsigned int A_internal_rows,
          unsigned int A_internal_cols,
          const T * B,
          unsigned int B_row_start,
          unsigned int B_col_start,
          unsigned int B_row_inc,
          unsigned int B_col_inc,
          unsigned int B_row_size,
          unsigned int B_col_size,
          unsigned int B_internal_rows,
          unsigned int B_internal_cols,
          T beta,
          T * C,
          unsigned int C_row_start,
          unsigned int C_col_start,
          unsigned int C_row_inc,
          unsigned int C_col_inc,
          unsigned int C_row_size,
          unsigned int C_col_size,
          unsigned int C_internal_rows,
          unsigned int C_internal_cols)
{

  __shared__ T bufA[272];
  __shared__ T bufB[272];

  vcl_size_t block_size = 16;//get_local_size(0);
  vcl_size_t row_block_id = blockIdx.x;
  vcl_size_t col_block_id = blockIdx.y;
  vcl_size_t row_thread_id = threadIdx.x;
  vcl_size_t col_thread_id = threadIdx.y;
  vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows;
  vcl_size_t aStep = block_size * A_col_inc * A_internal_rows;
  vcl_size_t bBegin = (col_block_id * block_size * B_row_inc +
B_row_start) + B_col_start * B_internal_rows; 00168 vcl_size_t bStep = block_size * B_internal_rows * B_col_inc; 00169 vcl_size_t block_num = (A_col_size + block_size - 1) / block_size; 00170 T Csub = 0; 00171 vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows; 00172 vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows; 00173 00174 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1); 00175 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1); 00176 for (vcl_size_t block = 0; 00177 block < block_num; 00178 ++block) 00179 { 00180 bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0; 00181 bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0; 00182 __syncthreads(); 00183 T * bufAptr = bufA + row_thread_id_times_block_size; 00184 T * bufBptr = bufB + col_thread_id_times_block_size; 00185 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00186 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00187 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00188 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00189 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00190 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00191 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00192 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00193 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00194 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00195 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00196 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00197 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00198 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00199 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00200 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00201 __syncthreads(); 00202 aBegin += aStep; 00203 bBegin += bStep; 00204 } 00205 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size) 00206 C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? 
alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows]; 00207 } 00208 00209 // matrix-matrix multiplication C = A^T * B 00210 // matrix layouts: C...col_major, A...col_major, B...col_major 00211 template <typename T> 00212 __global__ void matrix_matrix_col_col_col_prod_TA_kernel( 00213 T alpha, 00214 const T * A, 00215 unsigned int A_row_start, 00216 unsigned int A_col_start, 00217 unsigned int A_row_inc, 00218 unsigned int A_col_inc, 00219 unsigned int A_row_size, 00220 unsigned int A_col_size, 00221 unsigned int A_internal_rows, 00222 unsigned int A_internal_cols, 00223 const T * B, 00224 unsigned int B_row_start, 00225 unsigned int B_col_start, 00226 unsigned int B_row_inc, 00227 unsigned int B_col_inc, 00228 unsigned int B_row_size, 00229 unsigned int B_col_size, 00230 unsigned int B_internal_rows, 00231 unsigned int B_internal_cols, 00232 T beta, 00233 T * C, 00234 unsigned int C_row_start, 00235 unsigned int C_col_start, 00236 unsigned int C_row_inc, 00237 unsigned int C_col_inc, 00238 unsigned int C_row_size, 00239 unsigned int C_col_size, 00240 unsigned int C_internal_rows, 00241 unsigned int C_internal_cols) 00242 { 00243 00244 __shared__ T bufA[272]; 00245 __shared__ T bufB[272]; 00246 00247 vcl_size_t block_size = 16;//get_local_size(0); 00248 vcl_size_t row_block_id = blockIdx.x; 00249 vcl_size_t col_block_id = blockIdx.y; 00250 vcl_size_t row_thread_id = threadIdx.x; 00251 vcl_size_t col_thread_id = threadIdx.y; 00252 vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start; 00253 vcl_size_t aStep = block_size * A_row_inc; 00254 vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start; 00255 vcl_size_t bStep = block_size * B_row_inc; 00256 vcl_size_t block_num = (A_row_size + block_size - 1) / block_size; 00257 T Csub = 0; 00258 vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows; 00259 vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows; 00260 00261 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1); 00262 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1); 00263 for (vcl_size_t block = 0; 00264 block < block_num; 00265 ++block) 00266 { 00267 bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0; 00268 bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? 
B[bBegin + bOffset] : 0; 00269 __syncthreads(); 00270 T * bufAptr = bufA + row_thread_id_times_block_size; 00271 T * bufBptr = bufB + col_thread_id_times_block_size; 00272 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00273 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00274 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00275 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00276 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00277 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00278 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00279 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00280 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00281 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00282 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00283 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00284 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00285 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00286 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00287 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00288 __syncthreads(); 00289 aBegin += aStep; 00290 bBegin += bStep; 00291 } 00292 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size) 00293 C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows]; 00294 } 00295 00296 // matrix-matrix multiplication C = A^T * B^T 00297 // matrix layouts: C...col_major, A...col_major, B...col_major 00298 template <typename T> 00299 __global__ void matrix_matrix_col_col_col_prod_TT_kernel( 00300 T alpha, 00301 const T * A, 00302 unsigned int A_row_start, 00303 unsigned int A_col_start, 00304 unsigned int A_row_inc, 00305 unsigned int A_col_inc, 00306 unsigned int A_row_size, 00307 unsigned int A_col_size, 00308 unsigned int A_internal_rows, 00309 unsigned int A_internal_cols, 00310 const T * B, 00311 unsigned int B_row_start, 00312 unsigned int B_col_start, 00313 unsigned int B_row_inc, 00314 unsigned int B_col_inc, 00315 unsigned int B_row_size, 00316 unsigned int B_col_size, 00317 unsigned int B_internal_rows, 00318 unsigned int B_internal_cols, 00319 T beta, 00320 T * C, 00321 unsigned int C_row_start, 00322 unsigned int C_col_start, 00323 unsigned int C_row_inc, 00324 unsigned int C_col_inc, 00325 unsigned int C_row_size, 00326 unsigned int C_col_size, 00327 unsigned int C_internal_rows, 00328 unsigned int C_internal_cols) 00329 { 00330 00331 __shared__ T bufA[272]; 00332 __shared__ T bufB[272]; 00333 00334 vcl_size_t block_size = 16;//get_local_size(0); 00335 vcl_size_t row_block_id = blockIdx.x; 00336 vcl_size_t col_block_id = blockIdx.y; 00337 vcl_size_t row_thread_id = threadIdx.x; 00338 vcl_size_t col_thread_id = threadIdx.y; 00339 vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start; 00340 vcl_size_t aStep = block_size * A_row_inc; 00341 vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows; 00342 vcl_size_t bStep = block_size * B_internal_rows * B_col_inc; 00343 vcl_size_t block_num = (A_row_size + block_size - 1) / block_size; 00344 T Csub = 0; 00345 
vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows; 00346 vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows; 00347 00348 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1); 00349 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1); 00350 for (vcl_size_t block = 0; 00351 block < block_num; 00352 ++block) 00353 { 00354 bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0; 00355 bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0; 00356 __syncthreads(); 00357 T * bufAptr = bufA + row_thread_id_times_block_size; 00358 T * bufBptr = bufB + col_thread_id_times_block_size; 00359 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00360 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00361 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00362 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00363 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00364 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00365 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00366 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00367 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00368 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00369 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00370 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00371 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00372 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00373 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00374 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00375 __syncthreads(); 00376 aBegin += aStep; 00377 bBegin += bStep; 00378 } 00379 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size) 00380 C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? 
alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows]; 00381 } 00382 00383 00384 00386 00387 00388 00389 00390 // matrix-matrix multiplication C = A * B 00391 // matrix layouts: C...row_major, A...col_major, B...col_major 00392 template <typename T> 00393 __global__ void matrix_matrix_row_col_col_prod_AA_kernel( 00394 T alpha, 00395 const T * A, 00396 unsigned int A_row_start, 00397 unsigned int A_col_start, 00398 unsigned int A_row_inc, 00399 unsigned int A_col_inc, 00400 unsigned int A_row_size, 00401 unsigned int A_col_size, 00402 unsigned int A_internal_rows, 00403 unsigned int A_internal_cols, 00404 const T * B, 00405 unsigned int B_row_start, 00406 unsigned int B_col_start, 00407 unsigned int B_row_inc, 00408 unsigned int B_col_inc, 00409 unsigned int B_row_size, 00410 unsigned int B_col_size, 00411 unsigned int B_internal_rows, 00412 unsigned int B_internal_cols, 00413 T beta, 00414 T * C, 00415 unsigned int C_row_start, 00416 unsigned int C_col_start, 00417 unsigned int C_row_inc, 00418 unsigned int C_col_inc, 00419 unsigned int C_row_size, 00420 unsigned int C_col_size, 00421 unsigned int C_internal_rows, 00422 unsigned int C_internal_cols) 00423 { 00424 00425 __shared__ T bufA[272]; 00426 __shared__ T bufB[272]; 00427 00428 vcl_size_t block_size = 16;//get_local_size(0); 00429 vcl_size_t row_block_id = blockIdx.x; 00430 vcl_size_t col_block_id = blockIdx.y; 00431 vcl_size_t row_thread_id = threadIdx.x; 00432 vcl_size_t col_thread_id = threadIdx.y; 00433 vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows; 00434 vcl_size_t aStep = block_size * A_col_inc * A_internal_rows; 00435 vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start; 00436 vcl_size_t bStep = block_size * B_row_inc; 00437 vcl_size_t block_num = (A_col_size + block_size - 1) / block_size; 00438 T Csub = 0; 00439 vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows; 00440 vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows; 00441 00442 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1); 00443 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1); 00444 for (vcl_size_t block = 0; 00445 block < block_num; 00446 ++block) 00447 { 00448 bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0; 00449 bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? 
B[bBegin + bOffset] : 0; 00450 __syncthreads(); 00451 T * bufAptr = bufA + row_thread_id_times_block_size; 00452 T * bufBptr = bufB + col_thread_id_times_block_size; 00453 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00454 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00455 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00456 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00457 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00458 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00459 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00460 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00461 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00462 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00463 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00464 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00465 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00466 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00467 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00468 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00469 __syncthreads(); 00470 aBegin += aStep; 00471 bBegin += bStep; 00472 } 00473 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size) 00474 C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start]; 00475 } 00476 00477 // matrix-matrix multiplication C = A * B^T 00478 // matrix layouts: C...row_major, A...col_major, B...col_major 00479 template <typename T> 00480 __global__ void matrix_matrix_row_col_col_prod_AT_kernel( 00481 T alpha, 00482 const T * A, 00483 unsigned int A_row_start, 00484 unsigned int A_col_start, 00485 unsigned int A_row_inc, 00486 unsigned int A_col_inc, 00487 unsigned int A_row_size, 00488 unsigned int A_col_size, 00489 unsigned int A_internal_rows, 00490 unsigned int A_internal_cols, 00491 const T * B, 00492 unsigned int B_row_start, 00493 unsigned int B_col_start, 00494 unsigned int B_row_inc, 00495 unsigned int B_col_inc, 00496 unsigned int B_row_size, 00497 unsigned int B_col_size, 00498 unsigned int B_internal_rows, 00499 unsigned int B_internal_cols, 00500 T beta, 00501 T * C, 00502 unsigned int C_row_start, 00503 unsigned int C_col_start, 00504 unsigned int C_row_inc, 00505 unsigned int C_col_inc, 00506 unsigned int C_row_size, 00507 unsigned int C_col_size, 00508 unsigned int C_internal_rows, 00509 unsigned int C_internal_cols) 00510 { 00511 00512 __shared__ T bufA[272]; 00513 __shared__ T bufB[272]; 00514 00515 vcl_size_t block_size = 16;//get_local_size(0); 00516 vcl_size_t row_block_id = blockIdx.x; 00517 vcl_size_t col_block_id = blockIdx.y; 00518 vcl_size_t row_thread_id = threadIdx.x; 00519 vcl_size_t col_thread_id = threadIdx.y; 00520 vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows; 00521 vcl_size_t aStep = block_size * A_col_inc * A_internal_rows; 00522 vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows; 00523 vcl_size_t bStep = block_size * B_internal_rows * B_col_inc; 00524 vcl_size_t block_num = (A_col_size + block_size - 1) / block_size; 00525 T Csub = 
0; 00526 vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows; 00527 vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows; 00528 00529 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1); 00530 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1); 00531 for (vcl_size_t block = 0; 00532 block < block_num; 00533 ++block) 00534 { 00535 bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0; 00536 bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0; 00537 __syncthreads(); 00538 T * bufAptr = bufA + row_thread_id_times_block_size; 00539 T * bufBptr = bufB + col_thread_id_times_block_size; 00540 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00541 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00542 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00543 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00544 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00545 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00546 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00547 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00548 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00549 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00550 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00551 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00552 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00553 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00554 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00555 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00556 __syncthreads(); 00557 aBegin += aStep; 00558 bBegin += bStep; 00559 } 00560 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size) 00561 C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? 
alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start]; 00562 } 00563 00564 // matrix-matrix multiplication C = A^T * B 00565 // matrix layouts: C...row_major, A...col_major, B...col_major 00566 template <typename T> 00567 __global__ void matrix_matrix_row_col_col_prod_TA_kernel( 00568 T alpha, 00569 const T * A, 00570 unsigned int A_row_start, 00571 unsigned int A_col_start, 00572 unsigned int A_row_inc, 00573 unsigned int A_col_inc, 00574 unsigned int A_row_size, 00575 unsigned int A_col_size, 00576 unsigned int A_internal_rows, 00577 unsigned int A_internal_cols, 00578 const T * B, 00579 unsigned int B_row_start, 00580 unsigned int B_col_start, 00581 unsigned int B_row_inc, 00582 unsigned int B_col_inc, 00583 unsigned int B_row_size, 00584 unsigned int B_col_size, 00585 unsigned int B_internal_rows, 00586 unsigned int B_internal_cols, 00587 T beta, 00588 T * C, 00589 unsigned int C_row_start, 00590 unsigned int C_col_start, 00591 unsigned int C_row_inc, 00592 unsigned int C_col_inc, 00593 unsigned int C_row_size, 00594 unsigned int C_col_size, 00595 unsigned int C_internal_rows, 00596 unsigned int C_internal_cols) 00597 { 00598 00599 __shared__ T bufA[272]; 00600 __shared__ T bufB[272]; 00601 00602 vcl_size_t block_size = 16;//get_local_size(0); 00603 vcl_size_t row_block_id = blockIdx.x; 00604 vcl_size_t col_block_id = blockIdx.y; 00605 vcl_size_t row_thread_id = threadIdx.x; 00606 vcl_size_t col_thread_id = threadIdx.y; 00607 vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start; 00608 vcl_size_t aStep = block_size * A_row_inc; 00609 vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start; 00610 vcl_size_t bStep = block_size * B_row_inc; 00611 vcl_size_t block_num = (A_row_size + block_size - 1) / block_size; 00612 T Csub = 0; 00613 vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows; 00614 vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows; 00615 00616 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1); 00617 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1); 00618 for (vcl_size_t block = 0; 00619 block < block_num; 00620 ++block) 00621 { 00622 bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0; 00623 bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? 
B[bBegin + bOffset] : 0; 00624 __syncthreads(); 00625 T * bufAptr = bufA + row_thread_id_times_block_size; 00626 T * bufBptr = bufB + col_thread_id_times_block_size; 00627 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00628 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00629 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00630 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00631 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00632 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00633 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00634 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00635 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00636 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00637 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00638 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00639 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00640 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00641 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00642 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00643 __syncthreads(); 00644 aBegin += aStep; 00645 bBegin += bStep; 00646 } 00647 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size) 00648 C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start]; 00649 } 00650 00651 // matrix-matrix multiplication C = A^T * B^T 00652 // matrix layouts: C...row_major, A...col_major, B...col_major 00653 template <typename T> 00654 __global__ void matrix_matrix_row_col_col_prod_TT_kernel( 00655 T alpha, 00656 const T * A, 00657 unsigned int A_row_start, 00658 unsigned int A_col_start, 00659 unsigned int A_row_inc, 00660 unsigned int A_col_inc, 00661 unsigned int A_row_size, 00662 unsigned int A_col_size, 00663 unsigned int A_internal_rows, 00664 unsigned int A_internal_cols, 00665 const T * B, 00666 unsigned int B_row_start, 00667 unsigned int B_col_start, 00668 unsigned int B_row_inc, 00669 unsigned int B_col_inc, 00670 unsigned int B_row_size, 00671 unsigned int B_col_size, 00672 unsigned int B_internal_rows, 00673 unsigned int B_internal_cols, 00674 T beta, 00675 T * C, 00676 unsigned int C_row_start, 00677 unsigned int C_col_start, 00678 unsigned int C_row_inc, 00679 unsigned int C_col_inc, 00680 unsigned int C_row_size, 00681 unsigned int C_col_size, 00682 unsigned int C_internal_rows, 00683 unsigned int C_internal_cols) 00684 { 00685 00686 __shared__ T bufA[272]; 00687 __shared__ T bufB[272]; 00688 00689 vcl_size_t block_size = 16;//get_local_size(0); 00690 vcl_size_t row_block_id = blockIdx.x; 00691 vcl_size_t col_block_id = blockIdx.y; 00692 vcl_size_t row_thread_id = threadIdx.x; 00693 vcl_size_t col_thread_id = threadIdx.y; 00694 vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start; 00695 vcl_size_t aStep = block_size * A_row_inc; 00696 vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows; 00697 vcl_size_t bStep = block_size * B_internal_rows * B_col_inc; 00698 vcl_size_t block_num = (A_row_size + block_size - 1) / block_size; 00699 T Csub = 0; 00700 
vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows; 00701 vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows; 00702 00703 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1); 00704 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1); 00705 for (vcl_size_t block = 0; 00706 block < block_num; 00707 ++block) 00708 { 00709 bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0; 00710 bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0; 00711 __syncthreads(); 00712 T * bufAptr = bufA + row_thread_id_times_block_size; 00713 T * bufBptr = bufB + col_thread_id_times_block_size; 00714 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00715 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00716 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00717 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00718 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00719 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00720 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00721 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00722 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00723 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00724 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00725 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00726 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00727 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00728 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00729 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00730 __syncthreads(); 00731 aBegin += aStep; 00732 bBegin += bStep; 00733 } 00734 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size) 00735 C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? 
alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start]; 00736 } 00737 00738 00739 00740 00742 00743 00744 00745 00746 // matrix-matrix multiplication C = A * B 00747 // matrix layouts: C...col_major, A...col_major, B...row_major 00748 template <typename T> 00749 __global__ void matrix_matrix_col_col_row_prod_AA_kernel( 00750 T alpha, 00751 const T * A, 00752 unsigned int A_row_start, 00753 unsigned int A_col_start, 00754 unsigned int A_row_inc, 00755 unsigned int A_col_inc, 00756 unsigned int A_row_size, 00757 unsigned int A_col_size, 00758 unsigned int A_internal_rows, 00759 unsigned int A_internal_cols, 00760 const T * B, 00761 unsigned int B_row_start, 00762 unsigned int B_col_start, 00763 unsigned int B_row_inc, 00764 unsigned int B_col_inc, 00765 unsigned int B_row_size, 00766 unsigned int B_col_size, 00767 unsigned int B_internal_rows, 00768 unsigned int B_internal_cols, 00769 T beta, 00770 T * C, 00771 unsigned int C_row_start, 00772 unsigned int C_col_start, 00773 unsigned int C_row_inc, 00774 unsigned int C_col_inc, 00775 unsigned int C_row_size, 00776 unsigned int C_col_size, 00777 unsigned int C_internal_rows, 00778 unsigned int C_internal_cols) 00779 { 00780 00781 __shared__ T bufA[272]; 00782 __shared__ T bufB[272]; 00783 00784 vcl_size_t block_size = 16;//get_local_size(0); 00785 vcl_size_t row_block_id = blockIdx.x; 00786 vcl_size_t col_block_id = blockIdx.y; 00787 vcl_size_t row_thread_id = threadIdx.x; 00788 vcl_size_t col_thread_id = threadIdx.y; 00789 vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows; 00790 vcl_size_t aStep = block_size * A_col_inc * A_internal_rows; 00791 vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + B_row_start * B_internal_cols; 00792 vcl_size_t bStep = block_size * B_internal_cols * B_row_inc; 00793 vcl_size_t block_num = (A_col_size + block_size - 1) / block_size; 00794 T Csub = 0; 00795 vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows; 00796 vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols; 00797 00798 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1); 00799 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1); 00800 for (vcl_size_t block = 0; 00801 block < block_num; 00802 ++block) 00803 { 00804 bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0; 00805 bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? 
B[bBegin + bOffset] : 0; 00806 __syncthreads(); 00807 T * bufAptr = bufA + row_thread_id_times_block_size; 00808 T * bufBptr = bufB + col_thread_id_times_block_size; 00809 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00810 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00811 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00812 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00813 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00814 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00815 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00816 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00817 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00818 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00819 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00820 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00821 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00822 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00823 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00824 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00825 __syncthreads(); 00826 aBegin += aStep; 00827 bBegin += bStep; 00828 } 00829 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size) 00830 C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows]; 00831 } 00832 00833 // matrix-matrix multiplication C = A * B^T 00834 // matrix layouts: C...col_major, A...col_major, B...row_major 00835 template <typename T> 00836 __global__ void matrix_matrix_col_col_row_prod_AT_kernel( 00837 T alpha, 00838 const T * A, 00839 unsigned int A_row_start, 00840 unsigned int A_col_start, 00841 unsigned int A_row_inc, 00842 unsigned int A_col_inc, 00843 unsigned int A_row_size, 00844 unsigned int A_col_size, 00845 unsigned int A_internal_rows, 00846 unsigned int A_internal_cols, 00847 const T * B, 00848 unsigned int B_row_start, 00849 unsigned int B_col_start, 00850 unsigned int B_row_inc, 00851 unsigned int B_col_inc, 00852 unsigned int B_row_size, 00853 unsigned int B_col_size, 00854 unsigned int B_internal_rows, 00855 unsigned int B_internal_cols, 00856 T beta, 00857 T * C, 00858 unsigned int C_row_start, 00859 unsigned int C_col_start, 00860 unsigned int C_row_inc, 00861 unsigned int C_col_inc, 00862 unsigned int C_row_size, 00863 unsigned int C_col_size, 00864 unsigned int C_internal_rows, 00865 unsigned int C_internal_cols) 00866 { 00867 00868 __shared__ T bufA[272]; 00869 __shared__ T bufB[272]; 00870 00871 vcl_size_t block_size = 16;//get_local_size(0); 00872 vcl_size_t row_block_id = blockIdx.x; 00873 vcl_size_t col_block_id = blockIdx.y; 00874 vcl_size_t row_thread_id = threadIdx.x; 00875 vcl_size_t col_thread_id = threadIdx.y; 00876 vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows; 00877 vcl_size_t aStep = block_size * A_col_inc * A_internal_rows; 00878 vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * B_internal_cols + B_col_start; 00879 vcl_size_t bStep = block_size * B_col_inc; 00880 vcl_size_t block_num = (A_col_size + block_size - 1) / block_size; 00881 T Csub = 0; 00882 
vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows; 00883 vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols; 00884 00885 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1); 00886 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1); 00887 for (vcl_size_t block = 0; 00888 block < block_num; 00889 ++block) 00890 { 00891 bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0; 00892 bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0; 00893 __syncthreads(); 00894 T * bufAptr = bufA + row_thread_id_times_block_size; 00895 T * bufBptr = bufB + col_thread_id_times_block_size; 00896 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00897 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00898 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00899 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00900 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00901 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00902 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00903 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00904 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00905 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00906 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00907 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00908 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00909 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00910 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00911 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00912 __syncthreads(); 00913 aBegin += aStep; 00914 bBegin += bStep; 00915 } 00916 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size) 00917 C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? 
alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows]; 00918 } 00919 00920 // matrix-matrix multiplication C = A^T * B 00921 // matrix layouts: C...col_major, A...col_major, B...row_major 00922 template <typename T> 00923 __global__ void matrix_matrix_col_col_row_prod_TA_kernel( 00924 T alpha, 00925 const T * A, 00926 unsigned int A_row_start, 00927 unsigned int A_col_start, 00928 unsigned int A_row_inc, 00929 unsigned int A_col_inc, 00930 unsigned int A_row_size, 00931 unsigned int A_col_size, 00932 unsigned int A_internal_rows, 00933 unsigned int A_internal_cols, 00934 const T * B, 00935 unsigned int B_row_start, 00936 unsigned int B_col_start, 00937 unsigned int B_row_inc, 00938 unsigned int B_col_inc, 00939 unsigned int B_row_size, 00940 unsigned int B_col_size, 00941 unsigned int B_internal_rows, 00942 unsigned int B_internal_cols, 00943 T beta, 00944 T * C, 00945 unsigned int C_row_start, 00946 unsigned int C_col_start, 00947 unsigned int C_row_inc, 00948 unsigned int C_col_inc, 00949 unsigned int C_row_size, 00950 unsigned int C_col_size, 00951 unsigned int C_internal_rows, 00952 unsigned int C_internal_cols) 00953 { 00954 00955 __shared__ T bufA[272]; 00956 __shared__ T bufB[272]; 00957 00958 vcl_size_t block_size = 16;//get_local_size(0); 00959 vcl_size_t row_block_id = blockIdx.x; 00960 vcl_size_t col_block_id = blockIdx.y; 00961 vcl_size_t row_thread_id = threadIdx.x; 00962 vcl_size_t col_thread_id = threadIdx.y; 00963 vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start; 00964 vcl_size_t aStep = block_size * A_row_inc; 00965 vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + B_row_start * B_internal_cols; 00966 vcl_size_t bStep = block_size * B_internal_cols * B_row_inc; 00967 vcl_size_t block_num = (A_row_size + block_size - 1) / block_size; 00968 T Csub = 0; 00969 vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows; 00970 vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols; 00971 00972 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1); 00973 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1); 00974 for (vcl_size_t block = 0; 00975 block < block_num; 00976 ++block) 00977 { 00978 bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0; 00979 bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? 
B[bBegin + bOffset] : 0; 00980 __syncthreads(); 00981 T * bufAptr = bufA + row_thread_id_times_block_size; 00982 T * bufBptr = bufB + col_thread_id_times_block_size; 00983 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00984 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00985 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00986 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00987 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00988 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00989 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00990 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00991 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00992 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00993 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00994 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00995 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00996 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00997 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00998 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 00999 __syncthreads(); 01000 aBegin += aStep; 01001 bBegin += bStep; 01002 } 01003 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size) 01004 C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows]; 01005 } 01006 01007 // matrix-matrix multiplication C = A^T * B^T 01008 // matrix layouts: C...col_major, A...col_major, B...row_major 01009 template <typename T> 01010 __global__ void matrix_matrix_col_col_row_prod_TT_kernel( 01011 T alpha, 01012 const T * A, 01013 unsigned int A_row_start, 01014 unsigned int A_col_start, 01015 unsigned int A_row_inc, 01016 unsigned int A_col_inc, 01017 unsigned int A_row_size, 01018 unsigned int A_col_size, 01019 unsigned int A_internal_rows, 01020 unsigned int A_internal_cols, 01021 const T * B, 01022 unsigned int B_row_start, 01023 unsigned int B_col_start, 01024 unsigned int B_row_inc, 01025 unsigned int B_col_inc, 01026 unsigned int B_row_size, 01027 unsigned int B_col_size, 01028 unsigned int B_internal_rows, 01029 unsigned int B_internal_cols, 01030 T beta, 01031 T * C, 01032 unsigned int C_row_start, 01033 unsigned int C_col_start, 01034 unsigned int C_row_inc, 01035 unsigned int C_col_inc, 01036 unsigned int C_row_size, 01037 unsigned int C_col_size, 01038 unsigned int C_internal_rows, 01039 unsigned int C_internal_cols) 01040 { 01041 01042 __shared__ T bufA[272]; 01043 __shared__ T bufB[272]; 01044 01045 vcl_size_t block_size = 16;//get_local_size(0); 01046 vcl_size_t row_block_id = blockIdx.x; 01047 vcl_size_t col_block_id = blockIdx.y; 01048 vcl_size_t row_thread_id = threadIdx.x; 01049 vcl_size_t col_thread_id = threadIdx.y; 01050 vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start; 01051 vcl_size_t aStep = block_size * A_row_inc; 01052 vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * B_internal_cols + B_col_start; 01053 vcl_size_t bStep = block_size * B_col_inc; 01054 vcl_size_t block_num = (A_row_size + block_size - 1) / block_size; 01055 T Csub = 0; 01056 vcl_size_t aOffset = 
row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows; 01057 vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols; 01058 01059 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1); 01060 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1); 01061 for (vcl_size_t block = 0; 01062 block < block_num; 01063 ++block) 01064 { 01065 bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0; 01066 bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0; 01067 __syncthreads(); 01068 T * bufAptr = bufA + row_thread_id_times_block_size; 01069 T * bufBptr = bufB + col_thread_id_times_block_size; 01070 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01071 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01072 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01073 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01074 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01075 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01076 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01077 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01078 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01079 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01080 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01081 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01082 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01083 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01084 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01085 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01086 __syncthreads(); 01087 aBegin += aStep; 01088 bBegin += bStep; 01089 } 01090 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size) 01091 C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? 
alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows]; 01092 } 01093 01094 01095 01097 01098 01099 01100 01101 // matrix-matrix multiplication C = A * B 01102 // matrix layouts: C...row_major, A...col_major, B...row_major 01103 template <typename T> 01104 __global__ void matrix_matrix_row_col_row_prod_AA_kernel( 01105 T alpha, 01106 const T * A, 01107 unsigned int A_row_start, 01108 unsigned int A_col_start, 01109 unsigned int A_row_inc, 01110 unsigned int A_col_inc, 01111 unsigned int A_row_size, 01112 unsigned int A_col_size, 01113 unsigned int A_internal_rows, 01114 unsigned int A_internal_cols, 01115 const T * B, 01116 unsigned int B_row_start, 01117 unsigned int B_col_start, 01118 unsigned int B_row_inc, 01119 unsigned int B_col_inc, 01120 unsigned int B_row_size, 01121 unsigned int B_col_size, 01122 unsigned int B_internal_rows, 01123 unsigned int B_internal_cols, 01124 T beta, 01125 T * C, 01126 unsigned int C_row_start, 01127 unsigned int C_col_start, 01128 unsigned int C_row_inc, 01129 unsigned int C_col_inc, 01130 unsigned int C_row_size, 01131 unsigned int C_col_size, 01132 unsigned int C_internal_rows, 01133 unsigned int C_internal_cols) 01134 { 01135 01136 __shared__ T bufA[272]; 01137 __shared__ T bufB[272]; 01138 01139 vcl_size_t block_size = 16;//get_local_size(0); 01140 vcl_size_t row_block_id = blockIdx.x; 01141 vcl_size_t col_block_id = blockIdx.y; 01142 vcl_size_t row_thread_id = threadIdx.x; 01143 vcl_size_t col_thread_id = threadIdx.y; 01144 vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows; 01145 vcl_size_t aStep = block_size * A_col_inc * A_internal_rows; 01146 vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + B_row_start * B_internal_cols; 01147 vcl_size_t bStep = block_size * B_internal_cols * B_row_inc; 01148 vcl_size_t block_num = (A_col_size + block_size - 1) / block_size; 01149 T Csub = 0; 01150 vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows; 01151 vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols; 01152 01153 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1); 01154 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1); 01155 for (vcl_size_t block = 0; 01156 block < block_num; 01157 ++block) 01158 { 01159 bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0; 01160 bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? 
B[bBegin + bOffset] : 0; 01161 __syncthreads(); 01162 T * bufAptr = bufA + row_thread_id_times_block_size; 01163 T * bufBptr = bufB + col_thread_id_times_block_size; 01164 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01165 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01166 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01167 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01168 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01169 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01170 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01171 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01172 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01173 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01174 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01175 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01176 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01177 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01178 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01179 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01180 __syncthreads(); 01181 aBegin += aStep; 01182 bBegin += bStep; 01183 } 01184 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size) 01185 C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start]; 01186 } 01187 01188 // matrix-matrix multiplication C = A * B^T 01189 // matrix layouts: C...row_major, A...col_major, B...row_major 01190 template <typename T> 01191 __global__ void matrix_matrix_row_col_row_prod_AT_kernel( 01192 T alpha, 01193 const T * A, 01194 unsigned int A_row_start, 01195 unsigned int A_col_start, 01196 unsigned int A_row_inc, 01197 unsigned int A_col_inc, 01198 unsigned int A_row_size, 01199 unsigned int A_col_size, 01200 unsigned int A_internal_rows, 01201 unsigned int A_internal_cols, 01202 const T * B, 01203 unsigned int B_row_start, 01204 unsigned int B_col_start, 01205 unsigned int B_row_inc, 01206 unsigned int B_col_inc, 01207 unsigned int B_row_size, 01208 unsigned int B_col_size, 01209 unsigned int B_internal_rows, 01210 unsigned int B_internal_cols, 01211 T beta, 01212 T * C, 01213 unsigned int C_row_start, 01214 unsigned int C_col_start, 01215 unsigned int C_row_inc, 01216 unsigned int C_col_inc, 01217 unsigned int C_row_size, 01218 unsigned int C_col_size, 01219 unsigned int C_internal_rows, 01220 unsigned int C_internal_cols) 01221 { 01222 01223 __shared__ T bufA[272]; 01224 __shared__ T bufB[272]; 01225 01226 vcl_size_t block_size = 16;//get_local_size(0); 01227 vcl_size_t row_block_id = blockIdx.x; 01228 vcl_size_t col_block_id = blockIdx.y; 01229 vcl_size_t row_thread_id = threadIdx.x; 01230 vcl_size_t col_thread_id = threadIdx.y; 01231 vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows; 01232 vcl_size_t aStep = block_size * A_col_inc * A_internal_rows; 01233 vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * B_internal_cols + B_col_start; 01234 vcl_size_t bStep = block_size * B_col_inc; 01235 vcl_size_t block_num = (A_col_size + block_size - 1) / block_size; 01236 T Csub = 0; 01237 
vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows; 01238 vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols; 01239 01240 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1); 01241 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1); 01242 for (vcl_size_t block = 0; 01243 block < block_num; 01244 ++block) 01245 { 01246 bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0; 01247 bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0; 01248 __syncthreads(); 01249 T * bufAptr = bufA + row_thread_id_times_block_size; 01250 T * bufBptr = bufB + col_thread_id_times_block_size; 01251 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01252 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01253 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01254 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01255 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01256 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01257 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01258 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01259 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01260 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01261 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01262 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01263 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01264 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01265 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01266 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01267 __syncthreads(); 01268 aBegin += aStep; 01269 bBegin += bStep; 01270 } 01271 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size) 01272 C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? 
alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start]; 01273 } 01274 01275 // matrix-matrix multiplication C = A^T * B 01276 // matrix layouts: C...row_major, A...col_major, B...row_major 01277 template <typename T> 01278 __global__ void matrix_matrix_row_col_row_prod_TA_kernel( 01279 T alpha, 01280 const T * A, 01281 unsigned int A_row_start, 01282 unsigned int A_col_start, 01283 unsigned int A_row_inc, 01284 unsigned int A_col_inc, 01285 unsigned int A_row_size, 01286 unsigned int A_col_size, 01287 unsigned int A_internal_rows, 01288 unsigned int A_internal_cols, 01289 const T * B, 01290 unsigned int B_row_start, 01291 unsigned int B_col_start, 01292 unsigned int B_row_inc, 01293 unsigned int B_col_inc, 01294 unsigned int B_row_size, 01295 unsigned int B_col_size, 01296 unsigned int B_internal_rows, 01297 unsigned int B_internal_cols, 01298 T beta, 01299 T * C, 01300 unsigned int C_row_start, 01301 unsigned int C_col_start, 01302 unsigned int C_row_inc, 01303 unsigned int C_col_inc, 01304 unsigned int C_row_size, 01305 unsigned int C_col_size, 01306 unsigned int C_internal_rows, 01307 unsigned int C_internal_cols) 01308 { 01309 01310 __shared__ T bufA[272]; 01311 __shared__ T bufB[272]; 01312 01313 vcl_size_t block_size = 16;//get_local_size(0); 01314 vcl_size_t row_block_id = blockIdx.x; 01315 vcl_size_t col_block_id = blockIdx.y; 01316 vcl_size_t row_thread_id = threadIdx.x; 01317 vcl_size_t col_thread_id = threadIdx.y; 01318 vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start; 01319 vcl_size_t aStep = block_size * A_row_inc; 01320 vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + B_row_start * B_internal_cols; 01321 vcl_size_t bStep = block_size * B_internal_cols * B_row_inc; 01322 vcl_size_t block_num = (A_row_size + block_size - 1) / block_size; 01323 T Csub = 0; 01324 vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows; 01325 vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols; 01326 01327 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1); 01328 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1); 01329 for (vcl_size_t block = 0; 01330 block < block_num; 01331 ++block) 01332 { 01333 bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0; 01334 bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? 
B[bBegin + bOffset] : 0; 01335 __syncthreads(); 01336 T * bufAptr = bufA + row_thread_id_times_block_size; 01337 T * bufBptr = bufB + col_thread_id_times_block_size; 01338 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01339 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01340 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01341 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01342 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01343 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01344 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01345 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01346 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01347 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01348 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01349 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01350 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01351 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01352 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01353 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01354 __syncthreads(); 01355 aBegin += aStep; 01356 bBegin += bStep; 01357 } 01358 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size) 01359 C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start]; 01360 } 01361 01362 // matrix-matrix multiplication C = A^T * B^T 01363 // matrix layouts: C...row_major, A...col_major, B...row_major 01364 template <typename T> 01365 __global__ void matrix_matrix_row_col_row_prod_TT_kernel( 01366 T alpha, 01367 const T * A, 01368 unsigned int A_row_start, 01369 unsigned int A_col_start, 01370 unsigned int A_row_inc, 01371 unsigned int A_col_inc, 01372 unsigned int A_row_size, 01373 unsigned int A_col_size, 01374 unsigned int A_internal_rows, 01375 unsigned int A_internal_cols, 01376 const T * B, 01377 unsigned int B_row_start, 01378 unsigned int B_col_start, 01379 unsigned int B_row_inc, 01380 unsigned int B_col_inc, 01381 unsigned int B_row_size, 01382 unsigned int B_col_size, 01383 unsigned int B_internal_rows, 01384 unsigned int B_internal_cols, 01385 T beta, 01386 T * C, 01387 unsigned int C_row_start, 01388 unsigned int C_col_start, 01389 unsigned int C_row_inc, 01390 unsigned int C_col_inc, 01391 unsigned int C_row_size, 01392 unsigned int C_col_size, 01393 unsigned int C_internal_rows, 01394 unsigned int C_internal_cols) 01395 { 01396 01397 __shared__ T bufA[272]; 01398 __shared__ T bufB[272]; 01399 01400 vcl_size_t block_size = 16;//get_local_size(0); 01401 vcl_size_t row_block_id = blockIdx.x; 01402 vcl_size_t col_block_id = blockIdx.y; 01403 vcl_size_t row_thread_id = threadIdx.x; 01404 vcl_size_t col_thread_id = threadIdx.y; 01405 vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start; 01406 vcl_size_t aStep = block_size * A_row_inc; 01407 vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * B_internal_cols + B_col_start; 01408 vcl_size_t bStep = block_size * B_col_inc; 01409 vcl_size_t block_num = (A_row_size + block_size - 1) / block_size; 01410 T Csub = 0; 01411 vcl_size_t aOffset = 
row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows; 01412 vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols; 01413 01414 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1); 01415 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1); 01416 for (vcl_size_t block = 0; 01417 block < block_num; 01418 ++block) 01419 { 01420 bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0; 01421 bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0; 01422 __syncthreads(); 01423 T * bufAptr = bufA + row_thread_id_times_block_size; 01424 T * bufBptr = bufB + col_thread_id_times_block_size; 01425 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01426 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01427 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01428 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01429 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01430 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01431 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01432 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01433 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01434 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01435 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01436 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01437 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01438 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01439 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01440 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01441 __syncthreads(); 01442 aBegin += aStep; 01443 bBegin += bStep; 01444 } 01445 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size) 01446 C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? 
alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start]; 01447 } 01448 01449 01450 01451 01452 01454 01455 01456 01457 01458 01459 01460 // matrix-matrix multiplication C = A * B 01461 // matrix layouts: C...col_major, A...row_major, B...col_major 01462 template <typename T> 01463 __global__ void matrix_matrix_col_row_col_prod_AA_kernel( 01464 T alpha, 01465 const T * A, 01466 unsigned int A_row_start, 01467 unsigned int A_col_start, 01468 unsigned int A_row_inc, 01469 unsigned int A_col_inc, 01470 unsigned int A_row_size, 01471 unsigned int A_col_size, 01472 unsigned int A_internal_rows, 01473 unsigned int A_internal_cols, 01474 const T * B, 01475 unsigned int B_row_start, 01476 unsigned int B_col_start, 01477 unsigned int B_row_inc, 01478 unsigned int B_col_inc, 01479 unsigned int B_row_size, 01480 unsigned int B_col_size, 01481 unsigned int B_internal_rows, 01482 unsigned int B_internal_cols, 01483 T beta, 01484 T * C, 01485 unsigned int C_row_start, 01486 unsigned int C_col_start, 01487 unsigned int C_row_inc, 01488 unsigned int C_col_inc, 01489 unsigned int C_row_size, 01490 unsigned int C_col_size, 01491 unsigned int C_internal_rows, 01492 unsigned int C_internal_cols) 01493 { 01494 01495 __shared__ T bufA[272]; 01496 __shared__ T bufB[272]; 01497 01498 vcl_size_t block_size = 16;//get_local_size(0); 01499 vcl_size_t row_block_id = blockIdx.x; 01500 vcl_size_t col_block_id = blockIdx.y; 01501 vcl_size_t row_thread_id = threadIdx.x; 01502 vcl_size_t col_thread_id = threadIdx.y; 01503 vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) * A_internal_cols + A_col_start; 01504 vcl_size_t aStep = block_size * A_col_inc; 01505 vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start; 01506 vcl_size_t bStep = block_size * B_row_inc; 01507 vcl_size_t block_num = (A_col_size + block_size - 1) / block_size; 01508 T Csub = 0; 01509 vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols; 01510 vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows; 01511 01512 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1); 01513 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1); 01514 for (vcl_size_t block = 0; 01515 block < block_num; 01516 ++block) 01517 { 01518 bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_col_size) && (row_block_id * block_size + col_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0; 01519 bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? 
B[bBegin + bOffset] : 0; 01520 __syncthreads(); 01521 T * bufAptr = bufA + row_thread_id_times_block_size; 01522 T * bufBptr = bufB + col_thread_id_times_block_size; 01523 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01524 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01525 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01526 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01527 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01528 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01529 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01530 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01531 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01532 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01533 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01534 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01535 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01536 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01537 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01538 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01539 __syncthreads(); 01540 aBegin += aStep; 01541 bBegin += bStep; 01542 } 01543 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size) 01544 C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows]; 01545 } 01546 01547 // matrix-matrix multiplication C = A * B^T 01548 // matrix layouts: C...col_major, A...row_major, B...col_major 01549 template <typename T> 01550 __global__ void matrix_matrix_col_row_col_prod_AT_kernel( 01551 T alpha, 01552 const T * A, 01553 unsigned int A_row_start, 01554 unsigned int A_col_start, 01555 unsigned int A_row_inc, 01556 unsigned int A_col_inc, 01557 unsigned int A_row_size, 01558 unsigned int A_col_size, 01559 unsigned int A_internal_rows, 01560 unsigned int A_internal_cols, 01561 const T * B, 01562 unsigned int B_row_start, 01563 unsigned int B_col_start, 01564 unsigned int B_row_inc, 01565 unsigned int B_col_inc, 01566 unsigned int B_row_size, 01567 unsigned int B_col_size, 01568 unsigned int B_internal_rows, 01569 unsigned int B_internal_cols, 01570 T beta, 01571 T * C, 01572 unsigned int C_row_start, 01573 unsigned int C_col_start, 01574 unsigned int C_row_inc, 01575 unsigned int C_col_inc, 01576 unsigned int C_row_size, 01577 unsigned int C_col_size, 01578 unsigned int C_internal_rows, 01579 unsigned int C_internal_cols) 01580 { 01581 01582 __shared__ T bufA[272]; 01583 __shared__ T bufB[272]; 01584 01585 vcl_size_t block_size = 16;//get_local_size(0); 01586 vcl_size_t row_block_id = blockIdx.x; 01587 vcl_size_t col_block_id = blockIdx.y; 01588 vcl_size_t row_thread_id = threadIdx.x; 01589 vcl_size_t col_thread_id = threadIdx.y; 01590 vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) * A_internal_cols + A_col_start; 01591 vcl_size_t aStep = block_size * A_col_inc; 01592 vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows; 01593 vcl_size_t bStep = block_size * B_internal_rows * B_col_inc; 01594 vcl_size_t block_num = (A_col_size + block_size - 1) / block_size; 01595 T Csub = 0; 01596 
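/* Note: aBegin and bBegin address the first 16x16 tiles of A and B assigned
   to this thread block; aStep and bStep advance them by one full tile along
   the reduction dimension after every loop iteration, using the row/column
   increments and internal leading dimensions of the (possibly strided)
   submatrices. block_num is the number of such tiles, rounded up, and Csub
   accumulates this thread's single element of the result tile. */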
vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols; 01597 vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows; 01598 01599 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1); 01600 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1); 01601 for (vcl_size_t block = 0; 01602 block < block_num; 01603 ++block) 01604 { 01605 bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_col_size) && (row_block_id * block_size + col_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0; 01606 bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0; 01607 __syncthreads(); 01608 T * bufAptr = bufA + row_thread_id_times_block_size; 01609 T * bufBptr = bufB + col_thread_id_times_block_size; 01610 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01611 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01612 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01613 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01614 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01615 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01616 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01617 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01618 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01619 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01620 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01621 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01622 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01623 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01624 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01625 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01626 __syncthreads(); 01627 aBegin += aStep; 01628 bBegin += bStep; 01629 } 01630 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size) 01631 C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? 
alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows]; 01632 } 01633 01634 // matrix-matrix multiplication C = A^T * B 01635 // matrix layouts: C...col_major, A...row_major, B...col_major 01636 template <typename T> 01637 __global__ void matrix_matrix_col_row_col_prod_TA_kernel( 01638 T alpha, 01639 const T * A, 01640 unsigned int A_row_start, 01641 unsigned int A_col_start, 01642 unsigned int A_row_inc, 01643 unsigned int A_col_inc, 01644 unsigned int A_row_size, 01645 unsigned int A_col_size, 01646 unsigned int A_internal_rows, 01647 unsigned int A_internal_cols, 01648 const T * B, 01649 unsigned int B_row_start, 01650 unsigned int B_col_start, 01651 unsigned int B_row_inc, 01652 unsigned int B_col_inc, 01653 unsigned int B_row_size, 01654 unsigned int B_col_size, 01655 unsigned int B_internal_rows, 01656 unsigned int B_internal_cols, 01657 T beta, 01658 T * C, 01659 unsigned int C_row_start, 01660 unsigned int C_col_start, 01661 unsigned int C_row_inc, 01662 unsigned int C_col_inc, 01663 unsigned int C_row_size, 01664 unsigned int C_col_size, 01665 unsigned int C_internal_rows, 01666 unsigned int C_internal_cols) 01667 { 01668 01669 __shared__ T bufA[272]; 01670 __shared__ T bufB[272]; 01671 01672 vcl_size_t block_size = 16;//get_local_size(0); 01673 vcl_size_t row_block_id = blockIdx.x; 01674 vcl_size_t col_block_id = blockIdx.y; 01675 vcl_size_t row_thread_id = threadIdx.x; 01676 vcl_size_t col_thread_id = threadIdx.y; 01677 vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) + A_row_start * A_internal_cols; 01678 vcl_size_t aStep = block_size * A_row_inc * A_internal_cols; 01679 vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start; 01680 vcl_size_t bStep = block_size * B_row_inc; 01681 vcl_size_t block_num = (A_row_size + block_size - 1) / block_size; 01682 T Csub = 0; 01683 vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols; 01684 vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows; 01685 01686 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1); 01687 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1); 01688 for (vcl_size_t block = 0; 01689 block < block_num; 01690 ++block) 01691 { 01692 bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_row_size) && (row_block_id * block_size + row_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0; 01693 bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? 
B[bBegin + bOffset] : 0; 01694 __syncthreads(); 01695 T * bufAptr = bufA + row_thread_id_times_block_size; 01696 T * bufBptr = bufB + col_thread_id_times_block_size; 01697 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01698 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01699 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01700 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01701 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01702 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01703 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01704 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01705 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01706 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01707 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01708 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01709 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01710 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01711 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01712 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01713 __syncthreads(); 01714 aBegin += aStep; 01715 bBegin += bStep; 01716 } 01717 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size) 01718 C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows]; 01719 } 01720 01721 // matrix-matrix multiplication C = A^T * B^T 01722 // matrix layouts: C...col_major, A...row_major, B...col_major 01723 template <typename T> 01724 __global__ void matrix_matrix_col_row_col_prod_TT_kernel( 01725 T alpha, 01726 const T * A, 01727 unsigned int A_row_start, 01728 unsigned int A_col_start, 01729 unsigned int A_row_inc, 01730 unsigned int A_col_inc, 01731 unsigned int A_row_size, 01732 unsigned int A_col_size, 01733 unsigned int A_internal_rows, 01734 unsigned int A_internal_cols, 01735 const T * B, 01736 unsigned int B_row_start, 01737 unsigned int B_col_start, 01738 unsigned int B_row_inc, 01739 unsigned int B_col_inc, 01740 unsigned int B_row_size, 01741 unsigned int B_col_size, 01742 unsigned int B_internal_rows, 01743 unsigned int B_internal_cols, 01744 T beta, 01745 T * C, 01746 unsigned int C_row_start, 01747 unsigned int C_col_start, 01748 unsigned int C_row_inc, 01749 unsigned int C_col_inc, 01750 unsigned int C_row_size, 01751 unsigned int C_col_size, 01752 unsigned int C_internal_rows, 01753 unsigned int C_internal_cols) 01754 { 01755 01756 __shared__ T bufA[272]; 01757 __shared__ T bufB[272]; 01758 01759 vcl_size_t block_size = 16;//get_local_size(0); 01760 vcl_size_t row_block_id = blockIdx.x; 01761 vcl_size_t col_block_id = blockIdx.y; 01762 vcl_size_t row_thread_id = threadIdx.x; 01763 vcl_size_t col_thread_id = threadIdx.y; 01764 vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) + A_row_start * A_internal_cols; 01765 vcl_size_t aStep = block_size * A_row_inc * A_internal_cols; 01766 vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows; 01767 vcl_size_t bStep = block_size * B_internal_rows * B_col_inc; 01768 vcl_size_t block_num = (A_row_size + block_size - 1) / block_size; 01769 T Csub 
= 0; 01770 vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols; 01771 vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows; 01772 01773 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1); 01774 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1); 01775 for (vcl_size_t block = 0; 01776 block < block_num; 01777 ++block) 01778 { 01779 bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_row_size) && (row_block_id * block_size + row_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0; 01780 bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0; 01781 __syncthreads(); 01782 T * bufAptr = bufA + row_thread_id_times_block_size; 01783 T * bufBptr = bufB + col_thread_id_times_block_size; 01784 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01785 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01786 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01787 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01788 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01789 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01790 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01791 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01792 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01793 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01794 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01795 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01796 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01797 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01798 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01799 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01800 __syncthreads(); 01801 aBegin += aStep; 01802 bBegin += bStep; 01803 } 01804 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size) 01805 C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? 
alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows]; 01806 } 01807 01808 01809 01810 01812 01813 01814 01815 01816 // matrix-matrix multiplication C = A * B 01817 // matrix layouts: C...row_major, A...row_major, B...col_major 01818 template <typename T> 01819 __global__ void matrix_matrix_row_row_col_prod_AA_kernel( 01820 T alpha, 01821 const T * A, 01822 unsigned int A_row_start, 01823 unsigned int A_col_start, 01824 unsigned int A_row_inc, 01825 unsigned int A_col_inc, 01826 unsigned int A_row_size, 01827 unsigned int A_col_size, 01828 unsigned int A_internal_rows, 01829 unsigned int A_internal_cols, 01830 const T * B, 01831 unsigned int B_row_start, 01832 unsigned int B_col_start, 01833 unsigned int B_row_inc, 01834 unsigned int B_col_inc, 01835 unsigned int B_row_size, 01836 unsigned int B_col_size, 01837 unsigned int B_internal_rows, 01838 unsigned int B_internal_cols, 01839 T beta, 01840 T * C, 01841 unsigned int C_row_start, 01842 unsigned int C_col_start, 01843 unsigned int C_row_inc, 01844 unsigned int C_col_inc, 01845 unsigned int C_row_size, 01846 unsigned int C_col_size, 01847 unsigned int C_internal_rows, 01848 unsigned int C_internal_cols) 01849 { 01850 01851 __shared__ T bufA[272]; 01852 __shared__ T bufB[272]; 01853 01854 vcl_size_t block_size = 16;//get_local_size(0); 01855 vcl_size_t row_block_id = blockIdx.x; 01856 vcl_size_t col_block_id = blockIdx.y; 01857 vcl_size_t row_thread_id = threadIdx.x; 01858 vcl_size_t col_thread_id = threadIdx.y; 01859 vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) * A_internal_cols + A_col_start; 01860 vcl_size_t aStep = block_size * A_col_inc; 01861 vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start; 01862 vcl_size_t bStep = block_size * B_row_inc; 01863 vcl_size_t block_num = (A_col_size + block_size - 1) / block_size; 01864 T Csub = 0; 01865 vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols; 01866 vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows; 01867 01868 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1); 01869 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1); 01870 for (vcl_size_t block = 0; 01871 block < block_num; 01872 ++block) 01873 { 01874 bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_col_size) && (row_block_id * block_size + col_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0; 01875 bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? 
B[bBegin + bOffset] : 0; 01876 __syncthreads(); 01877 T * bufAptr = bufA + row_thread_id_times_block_size; 01878 T * bufBptr = bufB + col_thread_id_times_block_size; 01879 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01880 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01881 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01882 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01883 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01884 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01885 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01886 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01887 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01888 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01889 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01890 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01891 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01892 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01893 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01894 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01895 __syncthreads(); 01896 aBegin += aStep; 01897 bBegin += bStep; 01898 } 01899 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size) 01900 C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start]; 01901 } 01902 01903 // matrix-matrix multiplication C = A * B^T 01904 // matrix layouts: C...row_major, A...row_major, B...col_major 01905 template <typename T> 01906 __global__ void matrix_matrix_row_row_col_prod_AT_kernel( 01907 T alpha, 01908 const T * A, 01909 unsigned int A_row_start, 01910 unsigned int A_col_start, 01911 unsigned int A_row_inc, 01912 unsigned int A_col_inc, 01913 unsigned int A_row_size, 01914 unsigned int A_col_size, 01915 unsigned int A_internal_rows, 01916 unsigned int A_internal_cols, 01917 const T * B, 01918 unsigned int B_row_start, 01919 unsigned int B_col_start, 01920 unsigned int B_row_inc, 01921 unsigned int B_col_inc, 01922 unsigned int B_row_size, 01923 unsigned int B_col_size, 01924 unsigned int B_internal_rows, 01925 unsigned int B_internal_cols, 01926 T beta, 01927 T * C, 01928 unsigned int C_row_start, 01929 unsigned int C_col_start, 01930 unsigned int C_row_inc, 01931 unsigned int C_col_inc, 01932 unsigned int C_row_size, 01933 unsigned int C_col_size, 01934 unsigned int C_internal_rows, 01935 unsigned int C_internal_cols) 01936 { 01937 01938 __shared__ T bufA[272]; 01939 __shared__ T bufB[272]; 01940 01941 vcl_size_t block_size = 16;//get_local_size(0); 01942 vcl_size_t row_block_id = blockIdx.x; 01943 vcl_size_t col_block_id = blockIdx.y; 01944 vcl_size_t row_thread_id = threadIdx.x; 01945 vcl_size_t col_thread_id = threadIdx.y; 01946 vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) * A_internal_cols + A_col_start; 01947 vcl_size_t aStep = block_size * A_col_inc; 01948 vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows; 01949 vcl_size_t bStep = block_size * B_internal_rows * B_col_inc; 01950 vcl_size_t block_num = (A_col_size + block_size - 1) / block_size; 01951 T Csub = 0; 01952 
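/* Note: each iteration of the loop below stages one 16x16 tile of A and one
   of B in shared memory (out-of-range elements are loaded as zero),
   synchronizes, and then accumulates a fully unrolled 16-term dot product
   over the two staged tiles into Csub. The second __syncthreads() at the end
   of the body keeps the next iteration from overwriting the tiles while they
   are still being read. */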
vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols; 01953 vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows; 01954 01955 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1); 01956 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1); 01957 for (vcl_size_t block = 0; 01958 block < block_num; 01959 ++block) 01960 { 01961 bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_col_size) && (row_block_id * block_size + col_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0; 01962 bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0; 01963 __syncthreads(); 01964 T * bufAptr = bufA + row_thread_id_times_block_size; 01965 T * bufBptr = bufB + col_thread_id_times_block_size; 01966 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01967 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01968 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01969 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01970 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01971 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01972 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01973 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01974 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01975 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01976 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01977 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01978 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01979 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01980 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01981 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 01982 __syncthreads(); 01983 aBegin += aStep; 01984 bBegin += bStep; 01985 } 01986 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size) 01987 C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? 
alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start]; 01988 } 01989 01990 // matrix-matrix multiplication C = A^T * B 01991 // matrix layouts: C...row_major, A...row_major, B...col_major 01992 template <typename T> 01993 __global__ void matrix_matrix_row_row_col_prod_TA_kernel( 01994 T alpha, 01995 const T * A, 01996 unsigned int A_row_start, 01997 unsigned int A_col_start, 01998 unsigned int A_row_inc, 01999 unsigned int A_col_inc, 02000 unsigned int A_row_size, 02001 unsigned int A_col_size, 02002 unsigned int A_internal_rows, 02003 unsigned int A_internal_cols, 02004 const T * B, 02005 unsigned int B_row_start, 02006 unsigned int B_col_start, 02007 unsigned int B_row_inc, 02008 unsigned int B_col_inc, 02009 unsigned int B_row_size, 02010 unsigned int B_col_size, 02011 unsigned int B_internal_rows, 02012 unsigned int B_internal_cols, 02013 T beta, 02014 T * C, 02015 unsigned int C_row_start, 02016 unsigned int C_col_start, 02017 unsigned int C_row_inc, 02018 unsigned int C_col_inc, 02019 unsigned int C_row_size, 02020 unsigned int C_col_size, 02021 unsigned int C_internal_rows, 02022 unsigned int C_internal_cols) 02023 { 02024 02025 __shared__ T bufA[272]; 02026 __shared__ T bufB[272]; 02027 02028 vcl_size_t block_size = 16;//get_local_size(0); 02029 vcl_size_t row_block_id = blockIdx.x; 02030 vcl_size_t col_block_id = blockIdx.y; 02031 vcl_size_t row_thread_id = threadIdx.x; 02032 vcl_size_t col_thread_id = threadIdx.y; 02033 vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) + A_row_start * A_internal_cols; 02034 vcl_size_t aStep = block_size * A_row_inc * A_internal_cols; 02035 vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start; 02036 vcl_size_t bStep = block_size * B_row_inc; 02037 vcl_size_t block_num = (A_row_size + block_size - 1) / block_size; 02038 T Csub = 0; 02039 vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols; 02040 vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows; 02041 02042 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1); 02043 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1); 02044 for (vcl_size_t block = 0; 02045 block < block_num; 02046 ++block) 02047 { 02048 bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_row_size) && (row_block_id * block_size + row_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0; 02049 bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? 
B[bBegin + bOffset] : 0; 02050 __syncthreads(); 02051 T * bufAptr = bufA + row_thread_id_times_block_size; 02052 T * bufBptr = bufB + col_thread_id_times_block_size; 02053 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 02054 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 02055 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 02056 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 02057 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 02058 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 02059 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 02060 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 02061 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 02062 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 02063 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 02064 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 02065 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 02066 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 02067 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 02068 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 02069 __syncthreads(); 02070 aBegin += aStep; 02071 bBegin += bStep; 02072 } 02073 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size) 02074 C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start]; 02075 } 02076 02077 // matrix-matrix multiplication C = A^T * B^T 02078 // matrix layouts: C...row_major, A...row_major, B...col_major 02079 template <typename T> 02080 __global__ void matrix_matrix_row_row_col_prod_TT_kernel( 02081 T alpha, 02082 const T * A, 02083 unsigned int A_row_start, 02084 unsigned int A_col_start, 02085 unsigned int A_row_inc, 02086 unsigned int A_col_inc, 02087 unsigned int A_row_size, 02088 unsigned int A_col_size, 02089 unsigned int A_internal_rows, 02090 unsigned int A_internal_cols, 02091 const T * B, 02092 unsigned int B_row_start, 02093 unsigned int B_col_start, 02094 unsigned int B_row_inc, 02095 unsigned int B_col_inc, 02096 unsigned int B_row_size, 02097 unsigned int B_col_size, 02098 unsigned int B_internal_rows, 02099 unsigned int B_internal_cols, 02100 T beta, 02101 T * C, 02102 unsigned int C_row_start, 02103 unsigned int C_col_start, 02104 unsigned int C_row_inc, 02105 unsigned int C_col_inc, 02106 unsigned int C_row_size, 02107 unsigned int C_col_size, 02108 unsigned int C_internal_rows, 02109 unsigned int C_internal_cols) 02110 { 02111 02112 __shared__ T bufA[272]; 02113 __shared__ T bufB[272]; 02114 02115 vcl_size_t block_size = 16;//get_local_size(0); 02116 vcl_size_t row_block_id = blockIdx.x; 02117 vcl_size_t col_block_id = blockIdx.y; 02118 vcl_size_t row_thread_id = threadIdx.x; 02119 vcl_size_t col_thread_id = threadIdx.y; 02120 vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) + A_row_start * A_internal_cols; 02121 vcl_size_t aStep = block_size * A_row_inc * A_internal_cols; 02122 vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows; 02123 vcl_size_t bStep = block_size * B_internal_rows * B_col_inc; 02124 vcl_size_t block_num = (A_row_size + block_size - 1) / block_size; 02125 T Csub 
= 0; 02126 vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols; 02127 vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows; 02128 02129 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1); 02130 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1); 02131 for (vcl_size_t block = 0; 02132 block < block_num; 02133 ++block) 02134 { 02135 bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_row_size) && (row_block_id * block_size + row_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0; 02136 bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0; 02137 __syncthreads(); 02138 T * bufAptr = bufA + row_thread_id_times_block_size; 02139 T * bufBptr = bufB + col_thread_id_times_block_size; 02140 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 02141 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 02142 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 02143 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 02144 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 02145 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 02146 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 02147 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 02148 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 02149 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 02150 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 02151 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 02152 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 02153 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 02154 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 02155 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 02156 __syncthreads(); 02157 aBegin += aStep; 02158 bBegin += bStep; 02159 } 02160 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size) 02161 C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? 
alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start]; 02162 } 02163 02164 02165 02166 02167 02169 02170 02171 02172 02173 02174 02175 // matrix-matrix multiplication C = A * B 02176 // matrix layouts: C...col_major, A...row_major, B...row_major 02177 template <typename T> 02178 __global__ void matrix_matrix_col_row_row_prod_AA_kernel( 02179 T alpha, 02180 const T * A, 02181 unsigned int A_row_start, 02182 unsigned int A_col_start, 02183 unsigned int A_row_inc, 02184 unsigned int A_col_inc, 02185 unsigned int A_row_size, 02186 unsigned int A_col_size, 02187 unsigned int A_internal_rows, 02188 unsigned int A_internal_cols, 02189 const T * B, 02190 unsigned int B_row_start, 02191 unsigned int B_col_start, 02192 unsigned int B_row_inc, 02193 unsigned int B_col_inc, 02194 unsigned int B_row_size, 02195 unsigned int B_col_size, 02196 unsigned int B_internal_rows, 02197 unsigned int B_internal_cols, 02198 T beta, 02199 T * C, 02200 unsigned int C_row_start, 02201 unsigned int C_col_start, 02202 unsigned int C_row_inc, 02203 unsigned int C_col_inc, 02204 unsigned int C_row_size, 02205 unsigned int C_col_size, 02206 unsigned int C_internal_rows, 02207 unsigned int C_internal_cols) 02208 { 02209 02210 __shared__ T bufA[272]; 02211 __shared__ T bufB[272]; 02212 02213 vcl_size_t block_size = 16;//get_local_size(0); 02214 vcl_size_t row_block_id = blockIdx.x; 02215 vcl_size_t col_block_id = blockIdx.y; 02216 vcl_size_t row_thread_id = threadIdx.x; 02217 vcl_size_t col_thread_id = threadIdx.y; 02218 vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) * A_internal_cols + A_col_start; 02219 vcl_size_t aStep = block_size * A_col_inc; 02220 vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + B_row_start * B_internal_cols; 02221 vcl_size_t bStep = block_size * B_internal_cols * B_row_inc; 02222 vcl_size_t block_num = (A_col_size + block_size - 1) / block_size; 02223 T Csub = 0; 02224 vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols; 02225 vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols; 02226 02227 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1); 02228 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1); 02229 for (vcl_size_t block = 0; 02230 block < block_num; 02231 ++block) 02232 { 02233 bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_col_size) && (row_block_id * block_size + col_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0; 02234 bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? 
B[bBegin + bOffset] : 0; 02235 __syncthreads(); 02236 T * bufAptr = bufA + row_thread_id_times_block_size; 02237 T * bufBptr = bufB + col_thread_id_times_block_size; 02238 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 02239 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 02240 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 02241 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 02242 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 02243 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 02244 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 02245 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 02246 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 02247 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 02248 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 02249 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 02250 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 02251 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 02252 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 02253 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 02254 __syncthreads(); 02255 aBegin += aStep; 02256 bBegin += bStep; 02257 } 02258 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size) 02259 C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows]; 02260 } 02261 02262 // matrix-matrix multiplication C = A * B^T 02263 // matrix layouts: C...col_major, A...row_major, B...row_major 02264 template <typename T> 02265 __global__ void matrix_matrix_col_row_row_prod_AT_kernel( 02266 T alpha, 02267 const T * A, 02268 unsigned int A_row_start, 02269 unsigned int A_col_start, 02270 unsigned int A_row_inc, 02271 unsigned int A_col_inc, 02272 unsigned int A_row_size, 02273 unsigned int A_col_size, 02274 unsigned int A_internal_rows, 02275 unsigned int A_internal_cols, 02276 const T * B, 02277 unsigned int B_row_start, 02278 unsigned int B_col_start, 02279 unsigned int B_row_inc, 02280 unsigned int B_col_inc, 02281 unsigned int B_row_size, 02282 unsigned int B_col_size, 02283 unsigned int B_internal_rows, 02284 unsigned int B_internal_cols, 02285 T beta, 02286 T * C, 02287 unsigned int C_row_start, 02288 unsigned int C_col_start, 02289 unsigned int C_row_inc, 02290 unsigned int C_col_inc, 02291 unsigned int C_row_size, 02292 unsigned int C_col_size, 02293 unsigned int C_internal_rows, 02294 unsigned int C_internal_cols) 02295 { 02296 02297 __shared__ T bufA[272]; 02298 __shared__ T bufB[272]; 02299 02300 vcl_size_t block_size = 16;//get_local_size(0); 02301 vcl_size_t row_block_id = blockIdx.x; 02302 vcl_size_t col_block_id = blockIdx.y; 02303 vcl_size_t row_thread_id = threadIdx.x; 02304 vcl_size_t col_thread_id = threadIdx.y; 02305 vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) * A_internal_cols + A_col_start; 02306 vcl_size_t aStep = block_size * A_col_inc; 02307 vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * B_internal_cols + B_col_start; 02308 vcl_size_t bStep = block_size * B_col_inc; 02309 vcl_size_t block_num = (A_col_size + block_size - 1) / block_size; 02310 T Csub = 0; 02311 vcl_size_t aOffset = 
row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols; 02312 vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols; 02313 02314 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1); 02315 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1); 02316 for (vcl_size_t block = 0; 02317 block < block_num; 02318 ++block) 02319 { 02320 bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_col_size) && (row_block_id * block_size + col_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0; 02321 bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0; 02322 __syncthreads(); 02323 T * bufAptr = bufA + row_thread_id_times_block_size; 02324 T * bufBptr = bufB + col_thread_id_times_block_size; 02325 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 02326 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 02327 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 02328 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 02329 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 02330 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 02331 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 02332 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 02333 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 02334 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 02335 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 02336 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 02337 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 02338 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 02339 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 02340 Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; 02341 __syncthreads(); 02342 aBegin += aStep; 02343 bBegin += bStep; 02344 } 02345 if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size) 02346 C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? 
alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows]; 02347 } 02348 02349 // matrix-matrix multiplication C = A^T * B 02350 // matrix layouts: C...col_major, A...row_major, B...row_major 02351 template <typename T> 02352 __global__ void matrix_matrix_col_row_row_prod_TA_kernel( 02353 T alpha, 02354 const T * A, 02355 unsigned int A_row_start, 02356 unsigned int A_col_start, 02357 unsigned int A_row_inc, 02358 unsigned int A_col_inc, 02359 unsigned int A_row_size, 02360 unsigned int A_col_size, 02361 unsigned int A_internal_rows, 02362 unsigned int A_internal_cols, 02363 const T * B, 02364 unsigned int B_row_start, 02365 unsigned int B_col_start, 02366 unsigned int B_row_inc, 02367 unsigned int B_col_inc, 02368 unsigned int B_row_size, 02369 unsigned int B_col_size, 02370 unsigned int B_internal_rows, 02371 unsigned int B_internal_cols, 02372 T beta, 02373 T * C, 02374 unsigned int C_row_start, 02375 unsigned int C_col_start, 02376 unsigned int C_row_inc, 02377 unsigned int C_col_inc, 02378 unsigned int C_row_size, 02379 unsigned int C_col_size, 02380 unsigned int C_internal_rows, 02381 unsigned int C_internal_cols) 02382 { 02383 02384 __shared__ T bufA[272]; 02385 __shared__ T bufB[272]; 02386 02387 vcl_size_t block_size = 16;//get_local_size(0); 02388 vcl_size_t row_block_id = blockIdx.x; 02389 vcl_size_t col_block_id = blockIdx.y; 02390 vcl_size_t row_thread_id = threadIdx.x; 02391 vcl_size_t col_thread_id = threadIdx.y; 02392 vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) + A_row_start * A_internal_cols; 02393 vcl_size_t aStep = block_size * A_row_inc * A_internal_cols; 02394 vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + B_row_start * B_internal_cols; 02395 vcl_size_t bStep = block_size * B_internal_cols * B_row_inc; 02396 vcl_size_t block_num = (A_row_size + block_size - 1) / block_size; 02397 T Csub = 0; 02398 vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols; 02399 vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols; 02400 02401 vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1); 02402 vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1); 02403 for (vcl_size_t block = 0; 02404 block < block_num; 02405 ++block) 02406 { 02407 bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_row_size) && (row_block_id * block_size + row_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0; 02408 bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? 
// matrix-matrix multiplication C = A^T * B
// matrix layouts: C...col_major, A...row_major, B...row_major
template <typename T>
__global__ void matrix_matrix_col_row_row_prod_TA_kernel(
          T alpha,
          const T * A,
          unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc,
          unsigned int A_row_size,  unsigned int A_col_size,  unsigned int A_internal_rows, unsigned int A_internal_cols,
          const T * B,
          unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc,
          unsigned int B_row_size,  unsigned int B_col_size,  unsigned int B_internal_rows, unsigned int B_internal_cols,
          T beta,
          T * C,
          unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc,
          unsigned int C_row_size,  unsigned int C_col_size,  unsigned int C_internal_rows, unsigned int C_internal_cols)
{
  __shared__ T bufA[272];   // 16 x 17 shared-memory tile of A
  __shared__ T bufB[272];   // 16 x 17 shared-memory tile of B

  vcl_size_t block_size = 16; //get_local_size(0);
  vcl_size_t row_block_id = blockIdx.x;
  vcl_size_t col_block_id = blockIdx.y;
  vcl_size_t row_thread_id = threadIdx.x;
  vcl_size_t col_thread_id = threadIdx.y;
  vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) + A_row_start * A_internal_cols;
  vcl_size_t aStep  = block_size * A_row_inc * A_internal_cols;
  vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + B_row_start * B_internal_cols;
  vcl_size_t bStep  = block_size * B_internal_cols * B_row_inc;
  vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
  T Csub = 0;
  vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
  vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;

  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
  for (vcl_size_t block = 0; block < block_num; ++block)
  {
    // load the next 16 x 16 tiles of A and B, zero-padding entries outside the sub-matrices
    bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_row_size) && (row_block_id * block_size + row_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
    bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
    __syncthreads();
    T * bufAptr = bufA + row_thread_id_times_block_size;
    T * bufBptr = bufB + col_thread_id_times_block_size;
    // multiply-accumulate over the tile (the original source spells out these 16 steps by hand)
    for (vcl_size_t k = 0; k < block_size; ++k, ++bufAptr, ++bufBptr)
      Csub += (*bufAptr) * (*bufBptr);
    __syncthreads();
    aBegin += aStep;
    bBegin += bStep;
  }
  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
    C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] =
        (beta == 0) ? alpha * Csub
                    : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
}

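// Illustrative host-side launch for the kernel above. The wrapper name, the raw device
// pointers and the grid sizing are assumptions made for this sketch only; the actual
// ViennaCL dispatch code is not part of this listing. The kernels in this file hard-code
// a 16 x 16 work group, so the launch uses dim3(16, 16) thread blocks and one block per
// 16 x 16 tile of the result C = A^T * B (which has A_cols rows and B_cols columns).
template <typename T>
void example_launch_col_row_row_TA(T alpha,
                                   const T * d_A, unsigned int A_rows, unsigned int A_cols,
                                   unsigned int A_internal_rows, unsigned int A_internal_cols,
                                   const T * d_B, unsigned int B_rows, unsigned int B_cols,
                                   unsigned int B_internal_rows, unsigned int B_internal_cols,
                                   T beta,
                                   T * d_C, unsigned int C_internal_rows, unsigned int C_internal_cols)
{
  dim3 threads(16, 16);
  dim3 grid((A_cols + 15) / 16,    // rows of C = columns of A (A enters transposed)
            (B_cols + 15) / 16);   // columns of C = columns of B
  matrix_matrix_col_row_row_prod_TA_kernel<<<grid, threads>>>(
      alpha,
      d_A, 0, 0, 1, 1, A_rows, A_cols, A_internal_rows, A_internal_cols,
      d_B, 0, 0, 1, 1, B_rows, B_cols, B_internal_rows, B_internal_cols,
      beta,
      d_C, 0, 0, 1, 1, A_cols, B_cols, C_internal_rows, C_internal_cols);
}
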
// matrix-matrix multiplication C = A^T * B^T
// matrix layouts: C...col_major, A...row_major, B...row_major
template <typename T>
__global__ void matrix_matrix_col_row_row_prod_TT_kernel(
          T alpha,
          const T * A,
          unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc,
          unsigned int A_row_size,  unsigned int A_col_size,  unsigned int A_internal_rows, unsigned int A_internal_cols,
          const T * B,
          unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc,
          unsigned int B_row_size,  unsigned int B_col_size,  unsigned int B_internal_rows, unsigned int B_internal_cols,
          T beta,
          T * C,
          unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc,
          unsigned int C_row_size,  unsigned int C_col_size,  unsigned int C_internal_rows, unsigned int C_internal_cols)
{
  __shared__ T bufA[272];   // 16 x 17 shared-memory tile of A
  __shared__ T bufB[272];   // 16 x 17 shared-memory tile of B

  vcl_size_t block_size = 16; //get_local_size(0);
  vcl_size_t row_block_id = blockIdx.x;
  vcl_size_t col_block_id = blockIdx.y;
  vcl_size_t row_thread_id = threadIdx.x;
  vcl_size_t col_thread_id = threadIdx.y;
  vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) + A_row_start * A_internal_cols;
  vcl_size_t aStep  = block_size * A_row_inc * A_internal_cols;
  vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * B_internal_cols + B_col_start;
  vcl_size_t bStep  = block_size * B_col_inc;
  vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
  T Csub = 0;
  vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
  vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;

  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
  for (vcl_size_t block = 0; block < block_num; ++block)
  {
    // load the next 16 x 16 tiles of A and B, zero-padding entries outside the sub-matrices
    bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_row_size) && (row_block_id * block_size + row_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
    bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
    __syncthreads();
    T * bufAptr = bufA + row_thread_id_times_block_size;
    T * bufBptr = bufB + col_thread_id_times_block_size;
    // multiply-accumulate over the tile (the original source spells out these 16 steps by hand)
    for (vcl_size_t k = 0; k < block_size; ++k, ++bufAptr, ++bufBptr)
      Csub += (*bufAptr) * (*bufBptr);
    __syncthreads();
    aBegin += aStep;
    bBegin += bStep;
  }
  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
    C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] =
        (beta == 0) ? alpha * Csub
                    : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows];
}

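// Hypothetical CPU reference for the kernel above, handy when validating the device result.
// It mirrors the same sub-matrix addressing: the row-major operands A and B are indexed as
// (i * row_inc + row_start) * internal_cols + j * col_inc + col_start, and the column-major
// result C as (i * row_inc + row_start) + (j * col_inc + col_start) * internal_rows.
// This helper is not part of the original source; names and parameter choices are assumptions.
template <typename T>
void reference_col_row_row_TT(T alpha,
                              const T * A, unsigned int A_row_start, unsigned int A_col_start,
                              unsigned int A_row_inc, unsigned int A_col_inc,
                              unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_cols,
                              const T * B, unsigned int B_row_start, unsigned int B_col_start,
                              unsigned int B_row_inc, unsigned int B_col_inc,
                              unsigned int B_row_size, unsigned int B_internal_cols,
                              T beta,
                              T * C, unsigned int C_row_start, unsigned int C_col_start,
                              unsigned int C_row_inc, unsigned int C_col_inc, unsigned int C_internal_rows)
{
  // C = alpha * A^T * B^T + beta * C, so C is (A_col_size x B_row_size)
  for (unsigned int i = 0; i < A_col_size; ++i)
    for (unsigned int j = 0; j < B_row_size; ++j)
    {
      T sum = 0;
      for (unsigned int k = 0; k < A_row_size; ++k)
        sum += A[(k * A_row_inc + A_row_start) * A_internal_cols + i * A_col_inc + A_col_start]     // A^T(i,k) = A(k,i)
             * B[(j * B_row_inc + B_row_start) * B_internal_cols + k * B_col_inc + B_col_start];    // B^T(k,j) = B(j,k)
      T & c = C[(i * C_row_inc + C_row_start) + (j * C_col_inc + C_col_start) * C_internal_rows];
      c = (beta == 0) ? alpha * sum : alpha * sum + beta * c;
    }
}
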
// matrix-matrix multiplication C = A * B
// matrix layouts: C...row_major, A...row_major, B...row_major
template <typename T>
__global__ void matrix_matrix_row_row_row_prod_AA_kernel(
          T alpha,
          const T * A,
          unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc,
          unsigned int A_row_size,  unsigned int A_col_size,  unsigned int A_internal_rows, unsigned int A_internal_cols,
          const T * B,
          unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc,
          unsigned int B_row_size,  unsigned int B_col_size,  unsigned int B_internal_rows, unsigned int B_internal_cols,
          T beta,
          T * C,
          unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc,
          unsigned int C_row_size,  unsigned int C_col_size,  unsigned int C_internal_rows, unsigned int C_internal_cols)
{
  __shared__ T bufA[272];   // 16 x 17 shared-memory tile of A
  __shared__ T bufB[272];   // 16 x 17 shared-memory tile of B

  vcl_size_t block_size = 16; //get_local_size(0);
  vcl_size_t row_block_id = blockIdx.x;
  vcl_size_t col_block_id = blockIdx.y;
  vcl_size_t row_thread_id = threadIdx.x;
  vcl_size_t col_thread_id = threadIdx.y;
  vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) * A_internal_cols + A_col_start;
  vcl_size_t aStep  = block_size * A_col_inc;
  vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + B_row_start * B_internal_cols;
  vcl_size_t bStep  = block_size * B_internal_cols * B_row_inc;
  vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
  T Csub = 0;
  vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
  vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;

  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
  for (vcl_size_t block = 0; block < block_num; ++block)
  {
    // load the next 16 x 16 tiles of A and B, zero-padding entries outside the sub-matrices
    bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_col_size) && (row_block_id * block_size + col_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
    bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
    __syncthreads();
    T * bufAptr = bufA + row_thread_id_times_block_size;
    T * bufBptr = bufB + col_thread_id_times_block_size;
    // multiply-accumulate over the tile (the original source spells out these 16 steps by hand)
    for (vcl_size_t k = 0; k < block_size; ++k, ++bufAptr, ++bufBptr)
      Csub += (*bufAptr) * (*bufBptr);
    __syncthreads();
    aBegin += aStep;
    bBegin += bStep;
  }
  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
    C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] =
        (beta == 0) ? alpha * Csub
                    : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
}

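// The 272-element shared buffers used by the kernels in this file hold one 16 x 16 tile each,
// stored with a row stride of block_size + 1 = 17 (16 * 17 = 272). The extra padding element
// per row is the usual trick for keeping the strided tile accesses in the inner product free of
// shared-memory bank conflicts. A small sketch of the indexing (illustrative only, not part of
// the original source):
__device__ __forceinline__ unsigned int padded_tile_index(unsigned int i, unsigned int j)
{
  const unsigned int block_size = 16;
  return i * (block_size + 1) + j;   // leading dimension 17 instead of 16
}
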
// matrix-matrix multiplication C = A * B^T
// matrix layouts: C...row_major, A...row_major, B...row_major
template <typename T>
__global__ void matrix_matrix_row_row_row_prod_AT_kernel(
          T alpha,
          const T * A,
          unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc,
          unsigned int A_row_size,  unsigned int A_col_size,  unsigned int A_internal_rows, unsigned int A_internal_cols,
          const T * B,
          unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc,
          unsigned int B_row_size,  unsigned int B_col_size,  unsigned int B_internal_rows, unsigned int B_internal_cols,
          T beta,
          T * C,
          unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc,
          unsigned int C_row_size,  unsigned int C_col_size,  unsigned int C_internal_rows, unsigned int C_internal_cols)
{
  __shared__ T bufA[272];   // 16 x 17 shared-memory tile of A
  __shared__ T bufB[272];   // 16 x 17 shared-memory tile of B

  vcl_size_t block_size = 16; //get_local_size(0);
  vcl_size_t row_block_id = blockIdx.x;
  vcl_size_t col_block_id = blockIdx.y;
  vcl_size_t row_thread_id = threadIdx.x;
  vcl_size_t col_thread_id = threadIdx.y;
  vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) * A_internal_cols + A_col_start;
  vcl_size_t aStep  = block_size * A_col_inc;
  vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * B_internal_cols + B_col_start;
  vcl_size_t bStep  = block_size * B_col_inc;
  vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
  T Csub = 0;
  vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
  vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;

  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
  for (vcl_size_t block = 0; block < block_num; ++block)
  {
    // load the next 16 x 16 tiles of A and B, zero-padding entries outside the sub-matrices
    bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_col_size) && (row_block_id * block_size + col_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
    bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
    __syncthreads();
    T * bufAptr = bufA + row_thread_id_times_block_size;
    T * bufBptr = bufB + col_thread_id_times_block_size;
    // multiply-accumulate over the tile (the original source spells out these 16 steps by hand)
    for (vcl_size_t k = 0; k < block_size; ++k, ++bufAptr, ++bufBptr)
      Csub += (*bufAptr) * (*bufBptr);
    __syncthreads();
    aBegin += aStep;
    bBegin += bStep;
  }
  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
    C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] =
        (beta == 0) ? alpha * Csub
                    : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
}

// matrix-matrix multiplication C = A^T * B
// matrix layouts: C...row_major, A...row_major, B...row_major
template <typename T>
__global__ void matrix_matrix_row_row_row_prod_TA_kernel(
          T alpha,
          const T * A,
          unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc,
          unsigned int A_row_size,  unsigned int A_col_size,  unsigned int A_internal_rows, unsigned int A_internal_cols,
          const T * B,
          unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc,
          unsigned int B_row_size,  unsigned int B_col_size,  unsigned int B_internal_rows, unsigned int B_internal_cols,
          T beta,
          T * C,
          unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc,
          unsigned int C_row_size,  unsigned int C_col_size,  unsigned int C_internal_rows, unsigned int C_internal_cols)
{
  __shared__ T bufA[272];   // 16 x 17 shared-memory tile of A
  __shared__ T bufB[272];   // 16 x 17 shared-memory tile of B

  vcl_size_t block_size = 16; //get_local_size(0);
  vcl_size_t row_block_id = blockIdx.x;
  vcl_size_t col_block_id = blockIdx.y;
  vcl_size_t row_thread_id = threadIdx.x;
  vcl_size_t col_thread_id = threadIdx.y;
  vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) + A_row_start * A_internal_cols;
  vcl_size_t aStep  = block_size * A_row_inc * A_internal_cols;
  vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + B_row_start * B_internal_cols;
  vcl_size_t bStep  = block_size * B_internal_cols * B_row_inc;
  vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
  T Csub = 0;
  vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
  vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;

  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
  for (vcl_size_t block = 0; block < block_num; ++block)
  {
    // load the next 16 x 16 tiles of A and B, zero-padding entries outside the sub-matrices
    bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_row_size) && (row_block_id * block_size + row_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
    bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
    __syncthreads();
    T * bufAptr = bufA + row_thread_id_times_block_size;
    T * bufBptr = bufB + col_thread_id_times_block_size;
    // multiply-accumulate over the tile (the original source spells out these 16 steps by hand)
    for (vcl_size_t k = 0; k < block_size; ++k, ++bufAptr, ++bufBptr)
      Csub += (*bufAptr) * (*bufBptr);
    __syncthreads();
    aBegin += aStep;
    bBegin += bStep;
  }
  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size)
    C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] =
        (beta == 0) ? alpha * Csub
                    : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
}

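// The *_row_size / *_col_size parameters describe the logical sub-matrix, while
// *_internal_rows / *_internal_cols describe the padded storage the kernels index into.
// As a plausible illustration (the library's exact padding policy is not shown in this file
// and is an assumption here), each dimension could be rounded up to the next multiple of the
// 16-element block so that whole tiles can always be addressed:
inline unsigned int example_padded_size(unsigned int logical_size, unsigned int alignment = 16)
{
  return ((logical_size + alignment - 1) / alignment) * alignment;
}
// e.g. example_padded_size(100) == 112, so a 100 x 100 matrix would occupy 112 x 112 entries.
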
// matrix-matrix multiplication C = A^T * B^T
// matrix layouts: C...row_major, A...row_major, B...row_major
template <typename T>
__global__ void matrix_matrix_row_row_row_prod_TT_kernel(
          T alpha,
          const T * A,
          unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc,
          unsigned int A_row_size,  unsigned int A_col_size,  unsigned int A_internal_rows, unsigned int A_internal_cols,
          const T * B,
          unsigned int B_row_start, unsigned int B_col_start, unsigned int B_row_inc, unsigned int B_col_inc,
          unsigned int B_row_size,  unsigned int B_col_size,  unsigned int B_internal_rows, unsigned int B_internal_cols,
          T beta,
          T * C,
          unsigned int C_row_start, unsigned int C_col_start, unsigned int C_row_inc, unsigned int C_col_inc,
          unsigned int C_row_size,  unsigned int C_col_size,  unsigned int C_internal_rows, unsigned int C_internal_cols)
{
  __shared__ T bufA[272];   // 16 x 17 shared-memory tile of A
  __shared__ T bufB[272];   // 16 x 17 shared-memory tile of B

  vcl_size_t block_size = 16; //get_local_size(0);
  vcl_size_t row_block_id = blockIdx.x;
  vcl_size_t col_block_id = blockIdx.y;
  vcl_size_t row_thread_id = threadIdx.x;
  vcl_size_t col_thread_id = threadIdx.y;
  vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) + A_row_start * A_internal_cols;
  vcl_size_t aStep  = block_size * A_row_inc * A_internal_cols;
  vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * B_internal_cols + B_col_start;
  vcl_size_t bStep  = block_size * B_col_inc;
  vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
  T Csub = 0;
  vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols;
  vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols;

  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
  for (vcl_size_t block = 0; block < block_num; ++block)
  {
    // load the next 16 x 16 tiles of A and B, zero-padding entries outside the sub-matrices
    bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_row_size) && (row_block_id * block_size + row_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
    bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
    __syncthreads();
    T * bufAptr = bufA + row_thread_id_times_block_size;
    T * bufBptr = bufB + col_thread_id_times_block_size;
    // multiply-accumulate over the tile (the original source spells out these 16 steps by hand)
    for (vcl_size_t k = 0; k < block_size; ++k, ++bufAptr, ++bufBptr)
      Csub += (*bufAptr) * (*bufBptr);
    __syncthreads();
    aBegin += aStep;
    bBegin += bStep;
  }
  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size)
    C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] =
        (beta == 0) ? alpha * Csub
                    : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start];
}


} // namespace cuda
} // namespace linalg
} // namespace viennacl


#endif
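// In application code these kernels are not launched directly: they back the CUDA path of the
// dense matrix product viennacl::linalg::prod(). A minimal usage sketch (host code; assumes
// ViennaCL is built with VIENNACL_WITH_CUDA so that this backend is selected):
//
//   #include "viennacl/matrix.hpp"
//   #include "viennacl/linalg/prod.hpp"
//
//   viennacl::matrix<float> A(64, 32), B(32, 48), C(64, 48), D(48, 64);
//   // ... fill A and B, e.g. with viennacl::copy() from host data ...
//   C = viennacl::linalg::prod(A, B);                                    // C = A * B
//   D = viennacl::linalg::prod(viennacl::trans(B), viennacl::trans(A));  // D = B^T * A^T
//
// Which of the kernels above gets dispatched depends on the row-/column-major layouts of the
// three matrices and on whether the operands appear transposed in the expression.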