ViennaCL - The Vienna Computing Library
1.5.1
|
00001 #ifndef VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_COL_HPP_ 00002 #define VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_COL_HPP_ 00003 00004 /* ========================================================================= 00005 Copyright (c) 2010-2014, Institute for Microelectronics, 00006 Institute for Analysis and Scientific Computing, 00007 TU Wien. 00008 Portions of this software are copyright by UChicago Argonne, LLC. 00009 00010 ----------------- 00011 ViennaCL - The Vienna Computing Library 00012 ----------------- 00013 00014 Project Head: Karl Rupp rupp@iue.tuwien.ac.at 00015 00016 (A list of authors and contributors can be found in the PDF manual) 00017 00018 License: MIT (X11), see file LICENSE in the base directory 00019 ============================================================================= */ 00020 00026 namespace viennacl 00027 { 00028 namespace linalg 00029 { 00030 namespace cuda 00031 { 00032 // 00033 // am 00034 // 00035 00036 // alpha on CPU 00037 template <typename T> 00038 __global__ void am_col_kernel( 00039 T * A, 00040 unsigned int A_start1, unsigned int A_start2, 00041 unsigned int A_inc1, unsigned int A_inc2, 00042 unsigned int A_size1, unsigned int A_size2, 00043 unsigned int A_internal_size1, unsigned int A_internal_size2, 00044 00045 T fac2, 00046 unsigned int options2, 00047 const T * B, 00048 unsigned int B_start1, unsigned int B_start2, 00049 unsigned int B_inc1, unsigned int B_inc2, 00050 unsigned int B_internal_size1, unsigned int B_internal_size2) 00051 { 00052 T alpha = fac2; 00053 if (options2 & (1 << 0)) 00054 alpha = -alpha; 00055 00056 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x; 00057 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x; 00058 00059 if (options2 & (1 << 1)) 00060 { 00061 for (unsigned int col = col_gid; col < A_size2; col += gridDim.x) 00062 for (unsigned int row = row_gid; row < A_size1; row += blockDim.x) 00063 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha; 00064 } 00065 else 00066 { 00067 for (unsigned int col = col_gid; col < A_size2; col += gridDim.x) 00068 for (unsigned int row = row_gid; row < A_size1; row += blockDim.x) 00069 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha; 00070 } 00071 } 00072 00073 // alpha on GPU 00074 template <typename T> 00075 __global__ void am_col_kernel( 00076 T * A, 00077 unsigned int A_start1, unsigned int A_start2, 00078 unsigned int A_inc1, unsigned int A_inc2, 00079 unsigned int A_size1, unsigned int A_size2, 00080 unsigned int A_internal_size1, unsigned int A_internal_size2, 00081 00082 const T * fac2, 00083 unsigned int options2, 00084 const T * B, 00085 unsigned int B_start1, unsigned int B_start2, 00086 unsigned int B_inc1, unsigned int B_inc2, 00087 unsigned int B_internal_size1, unsigned int B_internal_size2) 00088 { 00089 T alpha = *fac2; 00090 if (options2 & (1 << 0)) 00091 alpha = -alpha; 00092 00093 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x; 00094 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x; 00095 00096 if (options2 & (1 << 1)) 00097 { 00098 for (unsigned int col = col_gid; col < A_size2; col += gridDim.x) 00099 for (unsigned int row = row_gid; row < A_size1; row += blockDim.x) 00100 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha; 00101 } 00102 else 00103 { 00104 for (unsigned int col = col_gid; col < A_size2; col += gridDim.x) 00105 for (unsigned int row = row_gid; row < A_size1; row += blockDim.x) 00106 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha; 00107 } 00108 } 00109 00110 00111 // 00112 // ambm 00113 // 00114 00115 // alpha and beta on CPU 00116 template <typename T> 00117 __global__ void ambm_col_kernel( 00118 T * A, 00119 unsigned int A_start1, unsigned int A_start2, 00120 unsigned int A_inc1, unsigned int A_inc2, 00121 unsigned int A_size1, unsigned int A_size2, 00122 unsigned int A_internal_size1, unsigned int A_internal_size2, 00123 00124 T fac2, 00125 unsigned int options2, 00126 const T * B, 00127 unsigned int B_start1, unsigned int B_start2, 00128 unsigned int B_inc1, unsigned int B_inc2, 00129 unsigned int B_internal_size1, unsigned int B_internal_size2, 00130 00131 T fac3, 00132 unsigned int options3, 00133 const T * C, 00134 unsigned int C_start1, unsigned int C_start2, 00135 unsigned int C_inc1, unsigned int C_inc2, 00136 unsigned int C_internal_size1, unsigned int C_internal_size2) 00137 { 00138 T alpha = fac2; 00139 if (options2 & (1 << 0)) 00140 alpha = -alpha; 00141 00142 T beta = fac3; 00143 if (options3 & (1 << 0)) 00144 beta = -beta; 00145 00146 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x; 00147 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x; 00148 00149 if (options2 & (1 << 1)) 00150 { 00151 if (options3 & (1 << 1)) 00152 { 00153 for (unsigned int col = col_gid; col < A_size2; col += gridDim.x) 00154 for (unsigned int row = row_gid; row < A_size1; row += blockDim.x) 00155 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] 00156 = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha 00157 + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta; 00158 } 00159 else 00160 { 00161 for (unsigned int col = col_gid; col < A_size2; col += gridDim.x) 00162 for (unsigned int row = row_gid; row < A_size1; row += blockDim.x) 00163 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] 00164 = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha 00165 + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta; 00166 } 00167 } 00168 else 00169 { 00170 if (options3 & (1 << 1)) 00171 { 00172 for (unsigned int col = col_gid; col < A_size2; col += gridDim.x) 00173 for (unsigned int row = row_gid; row < A_size1; row += blockDim.x) 00174 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] 00175 = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha 00176 + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta; 00177 } 00178 else 00179 { 00180 for (unsigned int col = col_gid; col < A_size2; col += gridDim.x) 00181 for (unsigned int row = row_gid; row < A_size1; row += blockDim.x) 00182 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] 00183 = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha 00184 + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta; 00185 } 00186 } 00187 } 00188 00189 00190 // alpha on CPU, beta on GPU 00191 template <typename T> 00192 __global__ void ambm_col_kernel( 00193 T * A, 00194 unsigned int A_start1, unsigned int A_start2, 00195 unsigned int A_inc1, unsigned int A_inc2, 00196 unsigned int A_size1, unsigned int A_size2, 00197 unsigned int A_internal_size1, unsigned int A_internal_size2, 00198 00199 T fac2, 00200 unsigned int options2, 00201 const T * B, 00202 unsigned int B_start1, unsigned int B_start2, 00203 unsigned int B_inc1, unsigned int B_inc2, 00204 unsigned int B_internal_size1, unsigned int B_internal_size2, 00205 00206 const T * fac3, 00207 unsigned int options3, 00208 const T * C, 00209 unsigned int C_start1, unsigned int C_start2, 00210 unsigned int C_inc1, unsigned int C_inc2, 00211 unsigned int C_internal_size1, unsigned int C_internal_size2) 00212 { 00213 T alpha = fac2; 00214 if (options2 & (1 << 0)) 00215 alpha = -alpha; 00216 00217 T beta = *fac3; 00218 if (options3 & (1 << 0)) 00219 beta = -beta; 00220 00221 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x; 00222 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x; 00223 00224 if (options2 & (1 << 1)) 00225 { 00226 if (options3 & (1 << 1)) 00227 { 00228 for (unsigned int col = col_gid; col < A_size2; col += gridDim.x) 00229 for (unsigned int row = row_gid; row < A_size1; row += blockDim.x) 00230 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] 00231 = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha 00232 + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta; 00233 } 00234 else 00235 { 00236 for (unsigned int col = col_gid; col < A_size2; col += gridDim.x) 00237 for (unsigned int row = row_gid; row < A_size1; row += blockDim.x) 00238 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] 00239 = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha 00240 + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta; 00241 } 00242 } 00243 else 00244 { 00245 if (options3 & (1 << 1)) 00246 { 00247 for (unsigned int col = col_gid; col < A_size2; col += gridDim.x) 00248 for (unsigned int row = row_gid; row < A_size1; row += blockDim.x) 00249 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] 00250 = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha 00251 + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta; 00252 } 00253 else 00254 { 00255 for (unsigned int col = col_gid; col < A_size2; col += gridDim.x) 00256 for (unsigned int row = row_gid; row < A_size1; row += blockDim.x) 00257 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] 00258 = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha 00259 + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta; 00260 } 00261 } 00262 } 00263 00264 // alpha on GPU, beta on CPU 00265 template <typename T> 00266 __global__ void ambm_col_kernel( 00267 T * A, 00268 unsigned int A_start1, unsigned int A_start2, 00269 unsigned int A_inc1, unsigned int A_inc2, 00270 unsigned int A_size1, unsigned int A_size2, 00271 unsigned int A_internal_size1, unsigned int A_internal_size2, 00272 00273 const T * fac2, 00274 unsigned int options2, 00275 const T * B, 00276 unsigned int B_start1, unsigned int B_start2, 00277 unsigned int B_inc1, unsigned int B_inc2, 00278 unsigned int B_internal_size1, unsigned int B_internal_size2, 00279 00280 T fac3, 00281 unsigned int options3, 00282 const T * C, 00283 unsigned int C_start1, unsigned int C_start2, 00284 unsigned int C_inc1, unsigned int C_inc2, 00285 unsigned int C_internal_size1, unsigned int C_internal_size2) 00286 { 00287 T alpha = *fac2; 00288 if (options2 & (1 << 0)) 00289 alpha = -alpha; 00290 00291 T beta = fac3; 00292 if (options3 & (1 << 0)) 00293 beta = -beta; 00294 00295 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x; 00296 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x; 00297 00298 if (options2 & (1 << 1)) 00299 { 00300 if (options3 & (1 << 1)) 00301 { 00302 for (unsigned int col = col_gid; col < A_size2; col += gridDim.x) 00303 for (unsigned int row = row_gid; row < A_size1; row += blockDim.x) 00304 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] 00305 = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha 00306 + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta; 00307 } 00308 else 00309 { 00310 for (unsigned int col = col_gid; col < A_size2; col += gridDim.x) 00311 for (unsigned int row = row_gid; row < A_size1; row += blockDim.x) 00312 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] 00313 = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha 00314 + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta; 00315 } 00316 } 00317 else 00318 { 00319 if (options3 & (1 << 1)) 00320 { 00321 for (unsigned int col = col_gid; col < A_size2; col += gridDim.x) 00322 for (unsigned int row = row_gid; row < A_size1; row += blockDim.x) 00323 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] 00324 = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha 00325 + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta; 00326 } 00327 else 00328 { 00329 for (unsigned int col = col_gid; col < A_size2; col += gridDim.x) 00330 for (unsigned int row = row_gid; row < A_size1; row += blockDim.x) 00331 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] 00332 = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha 00333 + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta; 00334 } 00335 } 00336 } 00337 00338 00339 // alpha and beta on GPU 00340 template <typename T> 00341 __global__ void ambm_col_kernel( 00342 T * A, 00343 unsigned int A_start1, unsigned int A_start2, 00344 unsigned int A_inc1, unsigned int A_inc2, 00345 unsigned int A_size1, unsigned int A_size2, 00346 unsigned int A_internal_size1, unsigned int A_internal_size2, 00347 00348 const T * fac2, 00349 unsigned int options2, 00350 const T * B, 00351 unsigned int B_start1, unsigned int B_start2, 00352 unsigned int B_inc1, unsigned int B_inc2, 00353 unsigned int B_internal_size1, unsigned int B_internal_size2, 00354 00355 const T * fac3, 00356 unsigned int options3, 00357 const T * C, 00358 unsigned int C_start1, unsigned int C_start2, 00359 unsigned int C_inc1, unsigned int C_inc2, 00360 unsigned int C_internal_size1, unsigned int C_internal_size2) 00361 { 00362 T alpha = *fac2; 00363 if (options2 & (1 << 0)) 00364 alpha = -alpha; 00365 00366 T beta = *fac3; 00367 if (options3 & (1 << 0)) 00368 beta = -beta; 00369 00370 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x; 00371 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x; 00372 00373 if (options2 & (1 << 1)) 00374 { 00375 if (options3 & (1 << 1)) 00376 { 00377 for (unsigned int col = col_gid; col < A_size2; col += gridDim.x) 00378 for (unsigned int row = row_gid; row < A_size1; row += blockDim.x) 00379 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] 00380 = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha 00381 + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta; 00382 } 00383 else 00384 { 00385 for (unsigned int col = col_gid; col < A_size2; col += gridDim.x) 00386 for (unsigned int row = row_gid; row < A_size1; row += blockDim.x) 00387 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] 00388 = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha 00389 + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta; 00390 } 00391 } 00392 else 00393 { 00394 if (options3 & (1 << 1)) 00395 { 00396 for (unsigned int col = col_gid; col < A_size2; col += gridDim.x) 00397 for (unsigned int row = row_gid; row < A_size1; row += blockDim.x) 00398 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] 00399 = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha 00400 + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta; 00401 } 00402 else 00403 { 00404 for (unsigned int col = col_gid; col < A_size2; col += gridDim.x) 00405 for (unsigned int row = row_gid; row < A_size1; row += blockDim.x) 00406 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] 00407 = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha 00408 + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta; 00409 } 00410 } 00411 } 00412 00413 00414 // 00415 // ambm_m 00416 // 00417 00418 // alpha and beta on CPU 00419 template <typename T> 00420 __global__ void ambm_m_col_kernel( 00421 T * A, 00422 unsigned int A_start1, unsigned int A_start2, 00423 unsigned int A_inc1, unsigned int A_inc2, 00424 unsigned int A_size1, unsigned int A_size2, 00425 unsigned int A_internal_size1, unsigned int A_internal_size2, 00426 00427 T fac2, 00428 unsigned int options2, 00429 const T * B, 00430 unsigned int B_start1, unsigned int B_start2, 00431 unsigned int B_inc1, unsigned int B_inc2, 00432 unsigned int B_internal_size1, unsigned int B_internal_size2, 00433 00434 T fac3, 00435 unsigned int options3, 00436 const T * C, 00437 unsigned int C_start1, unsigned int C_start2, 00438 unsigned int C_inc1, unsigned int C_inc2, 00439 unsigned int C_internal_size1, unsigned int C_internal_size2) 00440 { 00441 T alpha = fac2; 00442 if (options2 & (1 << 0)) 00443 alpha = -alpha; 00444 00445 T beta = fac3; 00446 if (options3 & (1 << 0)) 00447 beta = -beta; 00448 00449 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x; 00450 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x; 00451 00452 if (options2 & (1 << 1)) 00453 { 00454 if (options3 & (1 << 1)) 00455 { 00456 for (unsigned int col = col_gid; col < A_size2; col += gridDim.x) 00457 for (unsigned int row = row_gid; row < A_size1; row += blockDim.x) 00458 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] 00459 += B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha 00460 + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta; 00461 } 00462 else 00463 { 00464 for (unsigned int col = col_gid; col < A_size2; col += gridDim.x) 00465 for (unsigned int row = row_gid; row < A_size1; row += blockDim.x) 00466 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] 00467 += B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha 00468 + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta; 00469 } 00470 } 00471 else 00472 { 00473 if (options3 & (1 << 1)) 00474 { 00475 for (unsigned int col = col_gid; col < A_size2; col += gridDim.x) 00476 for (unsigned int row = row_gid; row < A_size1; row += blockDim.x) 00477 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] 00478 += B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha 00479 + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta; 00480 } 00481 else 00482 { 00483 for (unsigned int col = col_gid; col < A_size2; col += gridDim.x) 00484 for (unsigned int row = row_gid; row < A_size1; row += blockDim.x) 00485 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] 00486 += B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha 00487 + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta; 00488 } 00489 } 00490 } 00491 00492 00493 // alpha on CPU, beta on GPU 00494 template <typename T> 00495 __global__ void ambm_m_col_kernel( 00496 T * A, 00497 unsigned int A_start1, unsigned int A_start2, 00498 unsigned int A_inc1, unsigned int A_inc2, 00499 unsigned int A_size1, unsigned int A_size2, 00500 unsigned int A_internal_size1, unsigned int A_internal_size2, 00501 00502 T fac2, 00503 unsigned int options2, 00504 const T * B, 00505 unsigned int B_start1, unsigned int B_start2, 00506 unsigned int B_inc1, unsigned int B_inc2, 00507 unsigned int B_internal_size1, unsigned int B_internal_size2, 00508 00509 const T * fac3, 00510 unsigned int options3, 00511 const T * C, 00512 unsigned int C_start1, unsigned int C_start2, 00513 unsigned int C_inc1, unsigned int C_inc2, 00514 unsigned int C_internal_size1, unsigned int C_internal_size2) 00515 { 00516 T alpha = fac2; 00517 if (options2 & (1 << 0)) 00518 alpha = -alpha; 00519 00520 T beta = *fac3; 00521 if (options3 & (1 << 0)) 00522 beta = -beta; 00523 00524 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x; 00525 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x; 00526 00527 if (options2 & (1 << 1)) 00528 { 00529 if (options3 & (1 << 1)) 00530 { 00531 for (unsigned int col = col_gid; col < A_size2; col += gridDim.x) 00532 for (unsigned int row = row_gid; row < A_size1; row += blockDim.x) 00533 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] 00534 = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha 00535 + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta; 00536 } 00537 else 00538 { 00539 for (unsigned int col = col_gid; col < A_size2; col += gridDim.x) 00540 for (unsigned int row = row_gid; row < A_size1; row += blockDim.x) 00541 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] 00542 = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha 00543 + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta; 00544 } 00545 } 00546 else 00547 { 00548 if (options3 & (1 << 1)) 00549 { 00550 for (unsigned int col = col_gid; col < A_size2; col += gridDim.x) 00551 for (unsigned int row = row_gid; row < A_size1; row += blockDim.x) 00552 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] 00553 = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha 00554 + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta; 00555 } 00556 else 00557 { 00558 for (unsigned int col = col_gid; col < A_size2; col += gridDim.x) 00559 for (unsigned int row = row_gid; row < A_size1; row += blockDim.x) 00560 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] 00561 = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha 00562 + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta; 00563 } 00564 } 00565 } 00566 00567 // alpha on GPU, beta on CPU 00568 template <typename T> 00569 __global__ void ambm_m_col_kernel( 00570 T * A, 00571 unsigned int A_start1, unsigned int A_start2, 00572 unsigned int A_inc1, unsigned int A_inc2, 00573 unsigned int A_size1, unsigned int A_size2, 00574 unsigned int A_internal_size1, unsigned int A_internal_size2, 00575 00576 const T * fac2, 00577 unsigned int options2, 00578 const T * B, 00579 unsigned int B_start1, unsigned int B_start2, 00580 unsigned int B_inc1, unsigned int B_inc2, 00581 unsigned int B_internal_size1, unsigned int B_internal_size2, 00582 00583 T fac3, 00584 unsigned int options3, 00585 const T * C, 00586 unsigned int C_start1, unsigned int C_start2, 00587 unsigned int C_inc1, unsigned int C_inc2, 00588 unsigned int C_internal_size1, unsigned int C_internal_size2) 00589 { 00590 T alpha = *fac2; 00591 if (options2 & (1 << 0)) 00592 alpha = -alpha; 00593 00594 T beta = fac3; 00595 if (options3 & (1 << 0)) 00596 beta = -beta; 00597 00598 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x; 00599 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x; 00600 00601 if (options2 & (1 << 1)) 00602 { 00603 if (options3 & (1 << 1)) 00604 { 00605 for (unsigned int col = col_gid; col < A_size2; col += gridDim.x) 00606 for (unsigned int row = row_gid; row < A_size1; row += blockDim.x) 00607 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] 00608 = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha 00609 + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta; 00610 } 00611 else 00612 { 00613 for (unsigned int col = col_gid; col < A_size2; col += gridDim.x) 00614 for (unsigned int row = row_gid; row < A_size1; row += blockDim.x) 00615 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] 00616 = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha 00617 + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta; 00618 } 00619 } 00620 else 00621 { 00622 if (options3 & (1 << 1)) 00623 { 00624 for (unsigned int col = col_gid; col < A_size2; col += gridDim.x) 00625 for (unsigned int row = row_gid; row < A_size1; row += blockDim.x) 00626 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] 00627 = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha 00628 + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta; 00629 } 00630 else 00631 { 00632 for (unsigned int col = col_gid; col < A_size2; col += gridDim.x) 00633 for (unsigned int row = row_gid; row < A_size1; row += blockDim.x) 00634 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] 00635 = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha 00636 + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta; 00637 } 00638 } 00639 } 00640 00641 00642 // alpha and beta on GPU 00643 template <typename T> 00644 __global__ void ambm_m_col_kernel( 00645 T * A, 00646 unsigned int A_start1, unsigned int A_start2, 00647 unsigned int A_inc1, unsigned int A_inc2, 00648 unsigned int A_size1, unsigned int A_size2, 00649 unsigned int A_internal_size1, unsigned int A_internal_size2, 00650 00651 const T * fac2, 00652 unsigned int options2, 00653 const T * B, 00654 unsigned int B_start1, unsigned int B_start2, 00655 unsigned int B_inc1, unsigned int B_inc2, 00656 unsigned int B_internal_size1, unsigned int B_internal_size2, 00657 00658 const T * fac3, 00659 unsigned int options3, 00660 const T * C, 00661 unsigned int C_start1, unsigned int C_start2, 00662 unsigned int C_inc1, unsigned int C_inc2, 00663 unsigned int C_internal_size1, unsigned int C_internal_size2) 00664 { 00665 T alpha = *fac2; 00666 if (options2 & (1 << 0)) 00667 alpha = -alpha; 00668 00669 T beta = *fac3; 00670 if (options3 & (1 << 0)) 00671 beta = -beta; 00672 00673 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x; 00674 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x; 00675 00676 if (options2 & (1 << 1)) 00677 { 00678 if (options3 & (1 << 1)) 00679 { 00680 for (unsigned int col = col_gid; col < A_size2; col += gridDim.x) 00681 for (unsigned int row = row_gid; row < A_size1; row += blockDim.x) 00682 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] 00683 = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha 00684 + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta; 00685 } 00686 else 00687 { 00688 for (unsigned int col = col_gid; col < A_size2; col += gridDim.x) 00689 for (unsigned int row = row_gid; row < A_size1; row += blockDim.x) 00690 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] 00691 = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha 00692 + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta; 00693 } 00694 } 00695 else 00696 { 00697 if (options3 & (1 << 1)) 00698 { 00699 for (unsigned int col = col_gid; col < A_size2; col += gridDim.x) 00700 for (unsigned int row = row_gid; row < A_size1; row += blockDim.x) 00701 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] 00702 = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha 00703 + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta; 00704 } 00705 else 00706 { 00707 for (unsigned int col = col_gid; col < A_size2; col += gridDim.x) 00708 for (unsigned int row = row_gid; row < A_size1; row += blockDim.x) 00709 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] 00710 = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha 00711 + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta; 00712 } 00713 } 00714 } 00715 00716 00717 00718 // 00719 // assignments 00720 // 00721 00722 template <typename T> 00723 __global__ void matrix_col_assign_kernel( 00724 T * A, 00725 unsigned int A_start1, unsigned int A_start2, 00726 unsigned int A_inc1, unsigned int A_inc2, 00727 unsigned int A_size1, unsigned int A_size2, 00728 unsigned int A_internal_size1, unsigned int A_internal_size2, 00729 T alpha) 00730 { 00731 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x; 00732 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x; 00733 00734 for (unsigned int col = col_gid; col < A_size2; col += gridDim.x) 00735 for (unsigned int row = row_gid; row < A_size1; row += blockDim.x) 00736 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = alpha; 00737 } 00738 00739 00740 template <typename T> 00741 __global__ void matrix_col_diagonal_assign_kernel( 00742 T * A, 00743 unsigned int A_start1, unsigned int A_start2, 00744 unsigned int A_inc1, unsigned int A_inc2, 00745 unsigned int A_size1, unsigned int A_size2, 00746 unsigned int A_internal_size1, unsigned int A_internal_size2, 00747 T alpha) 00748 { 00749 unsigned int gid = (blockIdx.x * blockDim.x + threadIdx.x); 00750 00751 for (unsigned int row = gid; row < A_size1; row += blockDim.x * gridDim.x) 00752 A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1] = alpha; 00753 } 00754 00755 // 00756 // binary element-wise operations 00757 // 00758 00759 template <typename T> 00760 __global__ void element_op_col_kernel( 00761 T * A, 00762 unsigned int A_start1, unsigned int A_start2, 00763 unsigned int A_inc1, unsigned int A_inc2, 00764 unsigned int A_size1, unsigned int A_size2, 00765 unsigned int A_internal_size1, unsigned int A_internal_size2, 00766 00767 const T * B, 00768 unsigned int B_start1, unsigned int B_start2, 00769 unsigned int B_inc1, unsigned int B_inc2, 00770 unsigned int B_internal_size1, unsigned int B_internal_size2, 00771 00772 const T * C, 00773 unsigned int C_start1, unsigned int C_start2, 00774 unsigned int C_inc1, unsigned int C_inc2, 00775 unsigned int C_internal_size1, unsigned int C_internal_size2, 00776 00777 unsigned int op_type) //0: product, 1: division, 2: pow 00778 { 00779 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x; 00780 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x; 00781 00782 if (op_type == 2) 00783 { 00784 for (unsigned int col = col_gid; col < A_size2; col += gridDim.x) 00785 for (unsigned int row = row_gid; row < A_size1; row += blockDim.x) 00786 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] 00787 = pow(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1], 00788 C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1]); 00789 } 00790 else if (op_type == 1) 00791 { 00792 for (unsigned int col = col_gid; col < A_size2; col += gridDim.x) 00793 for (unsigned int row = row_gid; row < A_size1; row += blockDim.x) 00794 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] 00795 = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] 00796 / C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1]; 00797 } 00798 else if (op_type == 0) 00799 { 00800 for (unsigned int col = col_gid; col < A_size2; col += gridDim.x) 00801 for (unsigned int row = row_gid; row < A_size1; row += blockDim.x) 00802 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] 00803 = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] 00804 * C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1]; 00805 } 00806 } 00807 00808 template <typename T> 00809 __global__ void element_op_int_col_kernel( 00810 T * A, 00811 unsigned int A_start1, unsigned int A_start2, 00812 unsigned int A_inc1, unsigned int A_inc2, 00813 unsigned int A_size1, unsigned int A_size2, 00814 unsigned int A_internal_size1, unsigned int A_internal_size2, 00815 00816 const T * B, 00817 unsigned int B_start1, unsigned int B_start2, 00818 unsigned int B_inc1, unsigned int B_inc2, 00819 unsigned int B_internal_size1, unsigned int B_internal_size2, 00820 00821 const T * C, 00822 unsigned int C_start1, unsigned int C_start2, 00823 unsigned int C_inc1, unsigned int C_inc2, 00824 unsigned int C_internal_size1, unsigned int C_internal_size2, 00825 00826 unsigned int op_type) //0: product, 1: division, 2: pow 00827 { 00828 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x; 00829 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x; 00830 00831 if (op_type == 1) 00832 { 00833 for (unsigned int col = col_gid; col < A_size2; col += gridDim.x) 00834 for (unsigned int row = row_gid; row < A_size1; row += blockDim.x) 00835 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] 00836 = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] 00837 / C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1]; 00838 } 00839 else if (op_type == 0) 00840 { 00841 for (unsigned int col = col_gid; col < A_size2; col += gridDim.x) 00842 for (unsigned int row = row_gid; row < A_size1; row += blockDim.x) 00843 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] 00844 = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] 00845 * C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1]; 00846 } 00847 } 00848 00849 00850 // 00851 // unary element-wise operations 00852 // 00853 00854 // abs 00855 template <typename T> 00856 __global__ void matrix_col_element_abs_kernel( 00857 T * A, 00858 unsigned int A_start1, unsigned int A_start2, 00859 unsigned int A_inc1, unsigned int A_inc2, 00860 unsigned int A_size1, unsigned int A_size2, 00861 unsigned int A_internal_size1, unsigned int A_internal_size2, 00862 00863 const T * B, 00864 unsigned int B_start1, unsigned int B_start2, 00865 unsigned int B_inc1, unsigned int B_inc2, 00866 unsigned int B_internal_size1, unsigned int B_internal_size2) 00867 { 00868 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x; 00869 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x; 00870 00871 for (unsigned int col = col_gid; col < A_size2; col += gridDim.x) 00872 for (unsigned int row = row_gid; row < A_size1; row += blockDim.x) 00873 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = abs(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]); 00874 } 00875 00876 00877 // acos 00878 template <typename T> 00879 __global__ void matrix_col_element_acos_kernel( 00880 T * A, 00881 unsigned int A_start1, unsigned int A_start2, 00882 unsigned int A_inc1, unsigned int A_inc2, 00883 unsigned int A_size1, unsigned int A_size2, 00884 unsigned int A_internal_size1, unsigned int A_internal_size2, 00885 00886 const T * B, 00887 unsigned int B_start1, unsigned int B_start2, 00888 unsigned int B_inc1, unsigned int B_inc2, 00889 unsigned int B_internal_size1, unsigned int B_internal_size2) 00890 { 00891 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x; 00892 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x; 00893 00894 for (unsigned int col = col_gid; col < A_size2; col += gridDim.x) 00895 for (unsigned int row = row_gid; row < A_size1; row += blockDim.x) 00896 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = acos(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]); 00897 } 00898 00899 00900 // asin 00901 template <typename T> 00902 __global__ void matrix_col_element_asin_kernel( 00903 T * A, 00904 unsigned int A_start1, unsigned int A_start2, 00905 unsigned int A_inc1, unsigned int A_inc2, 00906 unsigned int A_size1, unsigned int A_size2, 00907 unsigned int A_internal_size1, unsigned int A_internal_size2, 00908 00909 const T * B, 00910 unsigned int B_start1, unsigned int B_start2, 00911 unsigned int B_inc1, unsigned int B_inc2, 00912 unsigned int B_internal_size1, unsigned int B_internal_size2) 00913 { 00914 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x; 00915 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x; 00916 00917 for (unsigned int col = col_gid; col < A_size2; col += gridDim.x) 00918 for (unsigned int row = row_gid; row < A_size1; row += blockDim.x) 00919 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = asin(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]); 00920 } 00921 00922 00923 // atan 00924 template <typename T> 00925 __global__ void matrix_col_element_atan_kernel( 00926 T * A, 00927 unsigned int A_start1, unsigned int A_start2, 00928 unsigned int A_inc1, unsigned int A_inc2, 00929 unsigned int A_size1, unsigned int A_size2, 00930 unsigned int A_internal_size1, unsigned int A_internal_size2, 00931 00932 const T * B, 00933 unsigned int B_start1, unsigned int B_start2, 00934 unsigned int B_inc1, unsigned int B_inc2, 00935 unsigned int B_internal_size1, unsigned int B_internal_size2) 00936 { 00937 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x; 00938 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x; 00939 00940 for (unsigned int col = col_gid; col < A_size2; col += gridDim.x) 00941 for (unsigned int row = row_gid; row < A_size1; row += blockDim.x) 00942 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = atan(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]); 00943 } 00944 00945 00946 // ceil 00947 template <typename T> 00948 __global__ void matrix_col_element_ceil_kernel( 00949 T * A, 00950 unsigned int A_start1, unsigned int A_start2, 00951 unsigned int A_inc1, unsigned int A_inc2, 00952 unsigned int A_size1, unsigned int A_size2, 00953 unsigned int A_internal_size1, unsigned int A_internal_size2, 00954 00955 const T * B, 00956 unsigned int B_start1, unsigned int B_start2, 00957 unsigned int B_inc1, unsigned int B_inc2, 00958 unsigned int B_internal_size1, unsigned int B_internal_size2) 00959 { 00960 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x; 00961 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x; 00962 00963 for (unsigned int col = col_gid; col < A_size2; col += gridDim.x) 00964 for (unsigned int row = row_gid; row < A_size1; row += blockDim.x) 00965 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = ceil(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]); 00966 } 00967 00968 00969 // cos 00970 template <typename T> 00971 __global__ void matrix_col_element_cos_kernel( 00972 T * A, 00973 unsigned int A_start1, unsigned int A_start2, 00974 unsigned int A_inc1, unsigned int A_inc2, 00975 unsigned int A_size1, unsigned int A_size2, 00976 unsigned int A_internal_size1, unsigned int A_internal_size2, 00977 00978 const T * B, 00979 unsigned int B_start1, unsigned int B_start2, 00980 unsigned int B_inc1, unsigned int B_inc2, 00981 unsigned int B_internal_size1, unsigned int B_internal_size2) 00982 { 00983 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x; 00984 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x; 00985 00986 for (unsigned int col = col_gid; col < A_size2; col += gridDim.x) 00987 for (unsigned int row = row_gid; row < A_size1; row += blockDim.x) 00988 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = cos(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]); 00989 } 00990 00991 00992 // cosh 00993 template <typename T> 00994 __global__ void matrix_col_element_cosh_kernel( 00995 T * A, 00996 unsigned int A_start1, unsigned int A_start2, 00997 unsigned int A_inc1, unsigned int A_inc2, 00998 unsigned int A_size1, unsigned int A_size2, 00999 unsigned int A_internal_size1, unsigned int A_internal_size2, 01000 01001 const T * B, 01002 unsigned int B_start1, unsigned int B_start2, 01003 unsigned int B_inc1, unsigned int B_inc2, 01004 unsigned int B_internal_size1, unsigned int B_internal_size2) 01005 { 01006 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x; 01007 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x; 01008 01009 for (unsigned int col = col_gid; col < A_size2; col += gridDim.x) 01010 for (unsigned int row = row_gid; row < A_size1; row += blockDim.x) 01011 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = cosh(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]); 01012 } 01013 01014 01015 // exp 01016 template <typename T> 01017 __global__ void matrix_col_element_exp_kernel( 01018 T * A, 01019 unsigned int A_start1, unsigned int A_start2, 01020 unsigned int A_inc1, unsigned int A_inc2, 01021 unsigned int A_size1, unsigned int A_size2, 01022 unsigned int A_internal_size1, unsigned int A_internal_size2, 01023 01024 const T * B, 01025 unsigned int B_start1, unsigned int B_start2, 01026 unsigned int B_inc1, unsigned int B_inc2, 01027 unsigned int B_internal_size1, unsigned int B_internal_size2) 01028 { 01029 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x; 01030 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x; 01031 01032 for (unsigned int col = col_gid; col < A_size2; col += gridDim.x) 01033 for (unsigned int row = row_gid; row < A_size1; row += blockDim.x) 01034 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = exp(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]); 01035 } 01036 01037 01038 // fabs 01039 template <typename T> 01040 __global__ void matrix_col_element_fabs_kernel( 01041 T * A, 01042 unsigned int A_start1, unsigned int A_start2, 01043 unsigned int A_inc1, unsigned int A_inc2, 01044 unsigned int A_size1, unsigned int A_size2, 01045 unsigned int A_internal_size1, unsigned int A_internal_size2, 01046 01047 const T * B, 01048 unsigned int B_start1, unsigned int B_start2, 01049 unsigned int B_inc1, unsigned int B_inc2, 01050 unsigned int B_internal_size1, unsigned int B_internal_size2) 01051 { 01052 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x; 01053 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x; 01054 01055 for (unsigned int col = col_gid; col < A_size2; col += gridDim.x) 01056 for (unsigned int row = row_gid; row < A_size1; row += blockDim.x) 01057 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = fabs(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]); 01058 } 01059 01060 01061 // floor 01062 template <typename T> 01063 __global__ void matrix_col_element_floor_kernel( 01064 T * A, 01065 unsigned int A_start1, unsigned int A_start2, 01066 unsigned int A_inc1, unsigned int A_inc2, 01067 unsigned int A_size1, unsigned int A_size2, 01068 unsigned int A_internal_size1, unsigned int A_internal_size2, 01069 01070 const T * B, 01071 unsigned int B_start1, unsigned int B_start2, 01072 unsigned int B_inc1, unsigned int B_inc2, 01073 unsigned int B_internal_size1, unsigned int B_internal_size2) 01074 { 01075 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x; 01076 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x; 01077 01078 for (unsigned int col = col_gid; col < A_size2; col += gridDim.x) 01079 for (unsigned int row = row_gid; row < A_size1; row += blockDim.x) 01080 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = floor(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]); 01081 } 01082 01083 01084 // log 01085 template <typename T> 01086 __global__ void matrix_col_element_log_kernel( 01087 T * A, 01088 unsigned int A_start1, unsigned int A_start2, 01089 unsigned int A_inc1, unsigned int A_inc2, 01090 unsigned int A_size1, unsigned int A_size2, 01091 unsigned int A_internal_size1, unsigned int A_internal_size2, 01092 01093 const T * B, 01094 unsigned int B_start1, unsigned int B_start2, 01095 unsigned int B_inc1, unsigned int B_inc2, 01096 unsigned int B_internal_size1, unsigned int B_internal_size2) 01097 { 01098 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x; 01099 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x; 01100 01101 for (unsigned int col = col_gid; col < A_size2; col += gridDim.x) 01102 for (unsigned int row = row_gid; row < A_size1; row += blockDim.x) 01103 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = log(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]); 01104 } 01105 01106 01107 // log10 01108 template <typename T> 01109 __global__ void matrix_col_element_log10_kernel( 01110 T * A, 01111 unsigned int A_start1, unsigned int A_start2, 01112 unsigned int A_inc1, unsigned int A_inc2, 01113 unsigned int A_size1, unsigned int A_size2, 01114 unsigned int A_internal_size1, unsigned int A_internal_size2, 01115 01116 const T * B, 01117 unsigned int B_start1, unsigned int B_start2, 01118 unsigned int B_inc1, unsigned int B_inc2, 01119 unsigned int B_internal_size1, unsigned int B_internal_size2) 01120 { 01121 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x; 01122 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x; 01123 01124 for (unsigned int col = col_gid; col < A_size2; col += gridDim.x) 01125 for (unsigned int row = row_gid; row < A_size1; row += blockDim.x) 01126 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = log10(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]); 01127 } 01128 01129 01130 // sin 01131 template <typename T> 01132 __global__ void matrix_col_element_sin_kernel( 01133 T * A, 01134 unsigned int A_start1, unsigned int A_start2, 01135 unsigned int A_inc1, unsigned int A_inc2, 01136 unsigned int A_size1, unsigned int A_size2, 01137 unsigned int A_internal_size1, unsigned int A_internal_size2, 01138 01139 const T * B, 01140 unsigned int B_start1, unsigned int B_start2, 01141 unsigned int B_inc1, unsigned int B_inc2, 01142 unsigned int B_internal_size1, unsigned int B_internal_size2) 01143 { 01144 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x; 01145 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x; 01146 01147 for (unsigned int col = col_gid; col < A_size2; col += gridDim.x) 01148 for (unsigned int row = row_gid; row < A_size1; row += blockDim.x) 01149 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = sin(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]); 01150 } 01151 01152 01153 // sinh 01154 template <typename T> 01155 __global__ void matrix_col_element_sinh_kernel( 01156 T * A, 01157 unsigned int A_start1, unsigned int A_start2, 01158 unsigned int A_inc1, unsigned int A_inc2, 01159 unsigned int A_size1, unsigned int A_size2, 01160 unsigned int A_internal_size1, unsigned int A_internal_size2, 01161 01162 const T * B, 01163 unsigned int B_start1, unsigned int B_start2, 01164 unsigned int B_inc1, unsigned int B_inc2, 01165 unsigned int B_internal_size1, unsigned int B_internal_size2) 01166 { 01167 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x; 01168 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x; 01169 01170 for (unsigned int col = col_gid; col < A_size2; col += gridDim.x) 01171 for (unsigned int row = row_gid; row < A_size1; row += blockDim.x) 01172 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = sinh(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]); 01173 } 01174 01175 01176 // sqrt 01177 template <typename T> 01178 __global__ void matrix_col_element_sqrt_kernel( 01179 T * A, 01180 unsigned int A_start1, unsigned int A_start2, 01181 unsigned int A_inc1, unsigned int A_inc2, 01182 unsigned int A_size1, unsigned int A_size2, 01183 unsigned int A_internal_size1, unsigned int A_internal_size2, 01184 01185 const T * B, 01186 unsigned int B_start1, unsigned int B_start2, 01187 unsigned int B_inc1, unsigned int B_inc2, 01188 unsigned int B_internal_size1, unsigned int B_internal_size2) 01189 { 01190 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x; 01191 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x; 01192 01193 for (unsigned int col = col_gid; col < A_size2; col += gridDim.x) 01194 for (unsigned int row = row_gid; row < A_size1; row += blockDim.x) 01195 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = sqrt(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]); 01196 } 01197 01198 01199 // tan 01200 template <typename T> 01201 __global__ void matrix_col_element_tan_kernel( 01202 T * A, 01203 unsigned int A_start1, unsigned int A_start2, 01204 unsigned int A_inc1, unsigned int A_inc2, 01205 unsigned int A_size1, unsigned int A_size2, 01206 unsigned int A_internal_size1, unsigned int A_internal_size2, 01207 01208 const T * B, 01209 unsigned int B_start1, unsigned int B_start2, 01210 unsigned int B_inc1, unsigned int B_inc2, 01211 unsigned int B_internal_size1, unsigned int B_internal_size2) 01212 { 01213 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x; 01214 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x; 01215 01216 for (unsigned int col = col_gid; col < A_size2; col += gridDim.x) 01217 for (unsigned int row = row_gid; row < A_size1; row += blockDim.x) 01218 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = tan(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]); 01219 } 01220 01221 01222 // tanh 01223 template <typename T> 01224 __global__ void matrix_col_element_tanh_kernel( 01225 T * A, 01226 unsigned int A_start1, unsigned int A_start2, 01227 unsigned int A_inc1, unsigned int A_inc2, 01228 unsigned int A_size1, unsigned int A_size2, 01229 unsigned int A_internal_size1, unsigned int A_internal_size2, 01230 01231 const T * B, 01232 unsigned int B_start1, unsigned int B_start2, 01233 unsigned int B_inc1, unsigned int B_inc2, 01234 unsigned int B_internal_size1, unsigned int B_internal_size2) 01235 { 01236 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x; 01237 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x; 01238 01239 for (unsigned int col = col_gid; col < A_size2; col += gridDim.x) 01240 for (unsigned int row = row_gid; row < A_size1; row += blockDim.x) 01241 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = tanh(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]); 01242 } 01243 01244 01245 01246 // 01247 // matrix-vector product 01248 // 01249 01250 template <typename T> 01251 __global__ void vec_mul_col_kernel( 01252 const T * A, 01253 unsigned int A_row_start, 01254 unsigned int A_col_start, 01255 unsigned int A_row_inc, 01256 unsigned int A_col_inc, 01257 unsigned int A_row_size, 01258 unsigned int A_col_size, 01259 unsigned int A_internal_rows, 01260 unsigned int A_internal_cols, 01261 const T * v, 01262 unsigned int v_start, 01263 unsigned int v_inc, 01264 unsigned int v_size, 01265 T * result, 01266 unsigned int result_start, 01267 unsigned int result_inc, 01268 unsigned int result_size) 01269 { 01270 01271 for (unsigned int row = blockIdx.x * blockDim.x + threadIdx.x; row < A_row_size; row += gridDim.x * blockDim.x) 01272 { 01273 T dot_prod = 0; 01274 for (unsigned int col = 0; col < A_col_size; ++col) 01275 dot_prod += A[(row * A_row_inc + A_row_start) + (col * A_col_inc + A_col_start) * A_internal_rows] * v[v_start + v_inc * col]; 01276 result[row * result_inc + result_start] = dot_prod; 01277 } 01278 } 01279 01280 01281 template <typename T> 01282 __global__ void trans_vec_mul_col_kernel( 01283 const T * A, 01284 unsigned int A_row_start, 01285 unsigned int A_col_start, 01286 unsigned int A_row_inc, 01287 unsigned int A_col_inc, 01288 unsigned int A_row_size, 01289 unsigned int A_col_size, 01290 unsigned int A_internal_rows, 01291 unsigned int A_internal_cols, 01292 const T * v, 01293 unsigned int v_start, 01294 unsigned int v_inc, 01295 unsigned int v_size, 01296 T * result, 01297 unsigned int result_start, 01298 unsigned int result_inc, 01299 unsigned int result_size) 01300 { 01301 __shared__ T work[128]; 01302 01303 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x; 01304 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x; 01305 unsigned int lid = threadIdx.x; 01306 01307 for (unsigned int row = row_gid; row < A_col_size; row += gridDim.x) 01308 { 01309 T dot_prod = 0; 01310 for (unsigned int col = col_gid; col < A_row_size; col += blockDim.x) 01311 dot_prod += A[(row * A_col_inc + A_col_start) * A_internal_rows + col * A_row_inc + A_row_start] * v[v_start + v_inc * col]; 01312 work[lid] = dot_prod; 01313 01314 for(unsigned int stride = blockDim.x/2 ; stride>0 ; stride>>=1){ 01315 __syncthreads(); 01316 if(lid < stride) 01317 work[lid] += work[lid+stride]; 01318 } 01319 01320 if(lid == 0) 01321 result[row * result_inc + result_start] = work[0]; 01322 } 01323 } 01324 01325 01326 // 01327 // matrix-matrix products 01328 // 01329 01330 01331 01332 01333 // 01334 // scaled rank-1-update 01335 // 01336 01337 // alpha on CPU 01338 template <typename T> 01339 __global__ void scaled_rank1_update_col_kernel( 01340 T * A, 01341 unsigned int A_start1, unsigned int A_start2, 01342 unsigned int A_inc1, unsigned int A_inc2, 01343 unsigned int A_size1, unsigned int A_size2, 01344 unsigned int A_internal_size1, unsigned int A_internal_size2, 01345 01346 T val, 01347 unsigned int options2, 01348 01349 const T * vec1, 01350 unsigned int start1, 01351 unsigned int inc1, 01352 unsigned int size1, 01353 01354 const T * vec2, 01355 unsigned int start2, 01356 unsigned int inc2, 01357 unsigned int size2) 01358 { 01359 T alpha = val; 01360 if (options2 & (1 << 0)) 01361 alpha = -alpha; 01362 if (options2 & (1 << 1)) 01363 alpha = ((T)(1)) / alpha; 01364 01365 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x; 01366 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x; 01367 01368 for (unsigned int row = row_gid; row < A_size1; row += gridDim.x) 01369 { 01370 T tmp = alpha * vec1[row * inc1 + start1]; 01371 for (unsigned int col = col_gid; col < A_size2; col += blockDim.x) 01372 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] += tmp * vec2[col * inc2 + start2]; 01373 } 01374 } 01375 01376 01377 // alpha on GPU 01378 template <typename T> 01379 __global__ void scaled_rank1_update_col_kernel( 01380 T * A, 01381 unsigned int A_start1, unsigned int A_start2, 01382 unsigned int A_inc1, unsigned int A_inc2, 01383 unsigned int A_size1, unsigned int A_size2, 01384 unsigned int A_internal_size1, unsigned int A_internal_size2, 01385 01386 const T * val, 01387 unsigned int options2, 01388 01389 const T * vec1, 01390 unsigned int start1, 01391 unsigned int inc1, 01392 unsigned int size1, 01393 01394 const T * vec2, 01395 unsigned int start2, 01396 unsigned int inc2, 01397 unsigned int size2) 01398 { 01399 T alpha = *val; 01400 if (options2 & (1 << 0)) 01401 alpha = -alpha; 01402 if (options2 & (1 << 1)) 01403 alpha = ((T)(1)) / alpha; 01404 01405 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x; 01406 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x; 01407 01408 for (unsigned int row = row_gid; row < A_size1; row += gridDim.x) 01409 { 01410 T tmp = alpha * vec1[row * inc1 + start1]; 01411 for (unsigned int col = col_gid; col < A_size2; col += blockDim.x) 01412 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] += tmp * vec2[col * inc2 + start2]; 01413 } 01414 } 01415 01416 01417 01418 } // namespace cuda 01419 } //namespace linalg 01420 } //namespace viennacl 01421 01422 01423 #endif