ViennaCL - The Vienna Computing Library  1.5.1
viennacl/linalg/cuda/matrix_operations_col.hpp
Go to the documentation of this file.
00001 #ifndef VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_COL_HPP_
00002 #define VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_COL_HPP_
00003 
00004 /* =========================================================================
00005    Copyright (c) 2010-2014, Institute for Microelectronics,
00006                             Institute for Analysis and Scientific Computing,
00007                             TU Wien.
00008    Portions of this software are copyright by UChicago Argonne, LLC.
00009 
00010                             -----------------
00011                   ViennaCL - The Vienna Computing Library
00012                             -----------------
00013 
00014    Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
00015 
00016    (A list of authors and contributors can be found in the PDF manual)
00017 
00018    License:         MIT (X11), see file LICENSE in the base directory
00019 ============================================================================= */
00020 
00026 namespace viennacl
00027 {
00028   namespace linalg
00029   {
00030     namespace cuda
00031     {
00032       //
00033       // am
00034       //
00035 
00036       // alpha on CPU
00037       template <typename T>
00038       __global__ void am_col_kernel(
00039                 T * A,
00040                 unsigned int A_start1, unsigned int A_start2,
00041                 unsigned int A_inc1,   unsigned int A_inc2,
00042                 unsigned int A_size1,  unsigned int A_size2,
00043                 unsigned int A_internal_size1,  unsigned int A_internal_size2,
00044 
00045                 T fac2,
00046                 unsigned int options2,
00047                 const T * B,
00048                 unsigned int B_start1, unsigned int B_start2,
00049                 unsigned int B_inc1,   unsigned int B_inc2,
00050                 unsigned int B_internal_size1,  unsigned int B_internal_size2)
00051       {
00052         T alpha = fac2;
00053         if (options2 & (1 << 0))
00054           alpha = -alpha;
00055 
00056         unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
00057         unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
00058 
00059         if (options2 & (1 << 1))
00060         {
00061           for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
00062             for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
00063               A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha;
00064         }
00065         else
00066         {
00067           for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
00068             for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
00069               A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha;
00070         }
00071       }
00072 
00073       // alpha on GPU
00074       template <typename T>
00075       __global__ void am_col_kernel(
00076                 T * A,
00077                 unsigned int A_start1, unsigned int A_start2,
00078                 unsigned int A_inc1,   unsigned int A_inc2,
00079                 unsigned int A_size1,  unsigned int A_size2,
00080                 unsigned int A_internal_size1,  unsigned int A_internal_size2,
00081 
00082                 const T * fac2,
00083                 unsigned int options2,
00084                 const T * B,
00085                 unsigned int B_start1, unsigned int B_start2,
00086                 unsigned int B_inc1,   unsigned int B_inc2,
00087                 unsigned int B_internal_size1,  unsigned int B_internal_size2)
00088       {
00089         T alpha = *fac2;
00090         if (options2 & (1 << 0))
00091           alpha = -alpha;
00092 
00093         unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
00094         unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
00095 
00096         if (options2 & (1 << 1))
00097         {
00098           for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
00099             for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
00100               A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha;
00101         }
00102         else
00103         {
00104           for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
00105             for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
00106               A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha;
00107         }
00108       }
00109 
00110 
00111       //
00112       // ambm
00113       //
00114 
00115       // alpha and beta on CPU
00116       template <typename T>
00117       __global__ void ambm_col_kernel(
00118                 T * A,
00119                 unsigned int A_start1, unsigned int A_start2,
00120                 unsigned int A_inc1,   unsigned int A_inc2,
00121                 unsigned int A_size1,  unsigned int A_size2,
00122                 unsigned int A_internal_size1,  unsigned int A_internal_size2,
00123 
00124                 T fac2,
00125                 unsigned int options2,
00126                 const T * B,
00127                 unsigned int B_start1, unsigned int B_start2,
00128                 unsigned int B_inc1,   unsigned int B_inc2,
00129                 unsigned int B_internal_size1,  unsigned int B_internal_size2,
00130 
00131                 T fac3,
00132                 unsigned int options3,
00133                 const T * C,
00134                 unsigned int C_start1, unsigned int C_start2,
00135                 unsigned int C_inc1,   unsigned int C_inc2,
00136                 unsigned int C_internal_size1,  unsigned int C_internal_size2)
00137       {
00138         T alpha = fac2;
00139         if (options2 & (1 << 0))
00140           alpha = -alpha;
00141 
00142         T beta = fac3;
00143         if (options3 & (1 << 0))
00144           beta = -beta;
00145 
00146         unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
00147         unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
00148 
00149         if (options2 & (1 << 1))
00150         {
00151           if (options3 & (1 << 1))
00152           {
00153             for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
00154               for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
00155                 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
00156               = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
00157               + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
00158           }
00159           else
00160           {
00161             for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
00162               for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
00163                 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
00164               = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
00165               + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
00166           }
00167         }
00168         else
00169         {
00170           if (options3 & (1 << 1))
00171           {
00172             for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
00173               for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
00174                 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
00175               = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
00176               + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
00177           }
00178           else
00179           {
00180             for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
00181               for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
00182                 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
00183               = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
00184               + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
00185           }
00186         }
00187       }
00188 
00189 
00190       // alpha on CPU, beta on GPU
00191       template <typename T>
00192       __global__ void ambm_col_kernel(
00193                 T * A,
00194                 unsigned int A_start1, unsigned int A_start2,
00195                 unsigned int A_inc1,   unsigned int A_inc2,
00196                 unsigned int A_size1,  unsigned int A_size2,
00197                 unsigned int A_internal_size1,  unsigned int A_internal_size2,
00198 
00199                 T fac2,
00200                 unsigned int options2,
00201                 const T * B,
00202                 unsigned int B_start1, unsigned int B_start2,
00203                 unsigned int B_inc1,   unsigned int B_inc2,
00204                 unsigned int B_internal_size1,  unsigned int B_internal_size2,
00205 
00206                 const T * fac3,
00207                 unsigned int options3,
00208                 const T * C,
00209                 unsigned int C_start1, unsigned int C_start2,
00210                 unsigned int C_inc1,   unsigned int C_inc2,
00211                 unsigned int C_internal_size1,  unsigned int C_internal_size2)
00212       {
00213         T alpha = fac2;
00214         if (options2 & (1 << 0))
00215           alpha = -alpha;
00216 
00217         T beta = *fac3;
00218         if (options3 & (1 << 0))
00219           beta = -beta;
00220 
00221         unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
00222         unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
00223 
00224         if (options2 & (1 << 1))
00225         {
00226           if (options3 & (1 << 1))
00227           {
00228             for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
00229               for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
00230                 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
00231               = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
00232               + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
00233           }
00234           else
00235           {
00236             for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
00237               for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
00238                 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
00239               = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
00240               + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
00241           }
00242         }
00243         else
00244         {
00245           if (options3 & (1 << 1))
00246           {
00247             for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
00248               for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
00249                 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
00250               = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
00251               + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
00252           }
00253           else
00254           {
00255             for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
00256               for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
00257                 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
00258               = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
00259               + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
00260           }
00261         }
00262       }
00263 
00264       // alpha on GPU, beta on CPU
00265       template <typename T>
00266       __global__ void ambm_col_kernel(
00267                 T * A,
00268                 unsigned int A_start1, unsigned int A_start2,
00269                 unsigned int A_inc1,   unsigned int A_inc2,
00270                 unsigned int A_size1,  unsigned int A_size2,
00271                 unsigned int A_internal_size1,  unsigned int A_internal_size2,
00272 
00273                 const T * fac2,
00274                 unsigned int options2,
00275                 const T * B,
00276                 unsigned int B_start1, unsigned int B_start2,
00277                 unsigned int B_inc1,   unsigned int B_inc2,
00278                 unsigned int B_internal_size1,  unsigned int B_internal_size2,
00279 
00280                 T fac3,
00281                 unsigned int options3,
00282                 const T * C,
00283                 unsigned int C_start1, unsigned int C_start2,
00284                 unsigned int C_inc1,   unsigned int C_inc2,
00285                 unsigned int C_internal_size1,  unsigned int C_internal_size2)
00286       {
00287         T alpha = *fac2;
00288         if (options2 & (1 << 0))
00289           alpha = -alpha;
00290 
00291         T beta = fac3;
00292         if (options3 & (1 << 0))
00293           beta = -beta;
00294 
00295         unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
00296         unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
00297 
00298         if (options2 & (1 << 1))
00299         {
00300           if (options3 & (1 << 1))
00301           {
00302             for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
00303               for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
00304                 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
00305               = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
00306               + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
00307           }
00308           else
00309           {
00310             for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
00311               for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
00312                 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
00313               = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
00314               + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
00315           }
00316         }
00317         else
00318         {
00319           if (options3 & (1 << 1))
00320           {
00321             for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
00322               for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
00323                 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
00324               = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
00325               + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
00326           }
00327           else
00328           {
00329             for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
00330               for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
00331                 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
00332               = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
00333               + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
00334           }
00335         }
00336       }
00337 
00338 
00339       // alpha and beta on GPU
00340       template <typename T>
00341       __global__ void ambm_col_kernel(
00342                 T * A,
00343                 unsigned int A_start1, unsigned int A_start2,
00344                 unsigned int A_inc1,   unsigned int A_inc2,
00345                 unsigned int A_size1,  unsigned int A_size2,
00346                 unsigned int A_internal_size1,  unsigned int A_internal_size2,
00347 
00348                 const T * fac2,
00349                 unsigned int options2,
00350                 const T * B,
00351                 unsigned int B_start1, unsigned int B_start2,
00352                 unsigned int B_inc1,   unsigned int B_inc2,
00353                 unsigned int B_internal_size1,  unsigned int B_internal_size2,
00354 
00355                 const T * fac3,
00356                 unsigned int options3,
00357                 const T * C,
00358                 unsigned int C_start1, unsigned int C_start2,
00359                 unsigned int C_inc1,   unsigned int C_inc2,
00360                 unsigned int C_internal_size1,  unsigned int C_internal_size2)
00361       {
00362         T alpha = *fac2;
00363         if (options2 & (1 << 0))
00364           alpha = -alpha;
00365 
00366         T beta = *fac3;
00367         if (options3 & (1 << 0))
00368           beta = -beta;
00369 
00370         unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
00371         unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
00372 
00373         if (options2 & (1 << 1))
00374         {
00375           if (options3 & (1 << 1))
00376           {
00377             for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
00378               for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
00379                 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
00380               = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
00381               + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
00382           }
00383           else
00384           {
00385             for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
00386               for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
00387                 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
00388               = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
00389               + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
00390           }
00391         }
00392         else
00393         {
00394           if (options3 & (1 << 1))
00395           {
00396             for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
00397               for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
00398                 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
00399               = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
00400               + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
00401           }
00402           else
00403           {
00404             for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
00405               for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
00406                 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
00407               = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
00408               + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
00409           }
00410         }
00411       }
00412 
00413 
00414       //
00415       // ambm_m
00416       //
00417 
00418       // alpha and beta on CPU
00419       template <typename T>
00420       __global__ void ambm_m_col_kernel(
00421                 T * A,
00422                 unsigned int A_start1, unsigned int A_start2,
00423                 unsigned int A_inc1,   unsigned int A_inc2,
00424                 unsigned int A_size1,  unsigned int A_size2,
00425                 unsigned int A_internal_size1,  unsigned int A_internal_size2,
00426 
00427                 T fac2,
00428                 unsigned int options2,
00429                 const T * B,
00430                 unsigned int B_start1, unsigned int B_start2,
00431                 unsigned int B_inc1,   unsigned int B_inc2,
00432                 unsigned int B_internal_size1,  unsigned int B_internal_size2,
00433 
00434                 T fac3,
00435                 unsigned int options3,
00436                 const T * C,
00437                 unsigned int C_start1, unsigned int C_start2,
00438                 unsigned int C_inc1,   unsigned int C_inc2,
00439                 unsigned int C_internal_size1,  unsigned int C_internal_size2)
00440       {
00441         T alpha = fac2;
00442         if (options2 & (1 << 0))
00443           alpha = -alpha;
00444 
00445         T beta = fac3;
00446         if (options3 & (1 << 0))
00447           beta = -beta;
00448 
00449         unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
00450         unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
00451 
00452         if (options2 & (1 << 1))
00453         {
00454           if (options3 & (1 << 1))
00455           {
00456             for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
00457               for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
00458                 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
00459              += B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
00460               + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
00461           }
00462           else
00463           {
00464             for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
00465               for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
00466                 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
00467              += B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
00468               + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
00469           }
00470         }
00471         else
00472         {
00473           if (options3 & (1 << 1))
00474           {
00475             for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
00476               for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
00477                 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
00478              += B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
00479               + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
00480           }
00481           else
00482           {
00483             for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
00484               for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
00485                 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
00486              += B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
00487               + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
00488           }
00489         }
00490       }
00491 
00492 
00493       // alpha on CPU, beta on GPU
00494       template <typename T>
00495       __global__ void ambm_m_col_kernel(
00496                 T * A,
00497                 unsigned int A_start1, unsigned int A_start2,
00498                 unsigned int A_inc1,   unsigned int A_inc2,
00499                 unsigned int A_size1,  unsigned int A_size2,
00500                 unsigned int A_internal_size1,  unsigned int A_internal_size2,
00501 
00502                 T fac2,
00503                 unsigned int options2,
00504                 const T * B,
00505                 unsigned int B_start1, unsigned int B_start2,
00506                 unsigned int B_inc1,   unsigned int B_inc2,
00507                 unsigned int B_internal_size1,  unsigned int B_internal_size2,
00508 
00509                 const T * fac3,
00510                 unsigned int options3,
00511                 const T * C,
00512                 unsigned int C_start1, unsigned int C_start2,
00513                 unsigned int C_inc1,   unsigned int C_inc2,
00514                 unsigned int C_internal_size1,  unsigned int C_internal_size2)
00515       {
00516         T alpha = fac2;
00517         if (options2 & (1 << 0))
00518           alpha = -alpha;
00519 
00520         T beta = *fac3;
00521         if (options3 & (1 << 0))
00522           beta = -beta;
00523 
00524         unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
00525         unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
00526 
00527         if (options2 & (1 << 1))
00528         {
00529           if (options3 & (1 << 1))
00530           {
00531             for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
00532               for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
00533                 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
00534               = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
00535               + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
00536           }
00537           else
00538           {
00539             for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
00540               for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
00541                 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
00542               = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
00543               + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
00544           }
00545         }
00546         else
00547         {
00548           if (options3 & (1 << 1))
00549           {
00550             for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
00551               for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
00552                 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
00553               = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
00554               + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
00555           }
00556           else
00557           {
00558             for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
00559               for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
00560                 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
00561               = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
00562               + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
00563           }
00564         }
00565       }
00566 
00567       // alpha on GPU, beta on CPU
00568       template <typename T>
00569       __global__ void ambm_m_col_kernel(
00570                 T * A,
00571                 unsigned int A_start1, unsigned int A_start2,
00572                 unsigned int A_inc1,   unsigned int A_inc2,
00573                 unsigned int A_size1,  unsigned int A_size2,
00574                 unsigned int A_internal_size1,  unsigned int A_internal_size2,
00575 
00576                 const T * fac2,
00577                 unsigned int options2,
00578                 const T * B,
00579                 unsigned int B_start1, unsigned int B_start2,
00580                 unsigned int B_inc1,   unsigned int B_inc2,
00581                 unsigned int B_internal_size1,  unsigned int B_internal_size2,
00582 
00583                 T fac3,
00584                 unsigned int options3,
00585                 const T * C,
00586                 unsigned int C_start1, unsigned int C_start2,
00587                 unsigned int C_inc1,   unsigned int C_inc2,
00588                 unsigned int C_internal_size1,  unsigned int C_internal_size2)
00589       {
00590         T alpha = *fac2;
00591         if (options2 & (1 << 0))
00592           alpha = -alpha;
00593 
00594         T beta = fac3;
00595         if (options3 & (1 << 0))
00596           beta = -beta;
00597 
00598         unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
00599         unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
00600 
00601         if (options2 & (1 << 1))
00602         {
00603           if (options3 & (1 << 1))
00604           {
00605             for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
00606               for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
00607                 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
00608               = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
00609               + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
00610           }
00611           else
00612           {
00613             for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
00614               for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
00615                 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
00616               = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
00617               + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
00618           }
00619         }
00620         else
00621         {
00622           if (options3 & (1 << 1))
00623           {
00624             for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
00625               for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
00626                 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
00627               = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
00628               + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
00629           }
00630           else
00631           {
00632             for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
00633               for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
00634                 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
00635               = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
00636               + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
00637           }
00638         }
00639       }
00640 
00641 
00642       // alpha and beta on GPU
00643       template <typename T>
00644       __global__ void ambm_m_col_kernel(
00645                 T * A,
00646                 unsigned int A_start1, unsigned int A_start2,
00647                 unsigned int A_inc1,   unsigned int A_inc2,
00648                 unsigned int A_size1,  unsigned int A_size2,
00649                 unsigned int A_internal_size1,  unsigned int A_internal_size2,
00650 
00651                 const T * fac2,
00652                 unsigned int options2,
00653                 const T * B,
00654                 unsigned int B_start1, unsigned int B_start2,
00655                 unsigned int B_inc1,   unsigned int B_inc2,
00656                 unsigned int B_internal_size1,  unsigned int B_internal_size2,
00657 
00658                 const T * fac3,
00659                 unsigned int options3,
00660                 const T * C,
00661                 unsigned int C_start1, unsigned int C_start2,
00662                 unsigned int C_inc1,   unsigned int C_inc2,
00663                 unsigned int C_internal_size1,  unsigned int C_internal_size2)
00664       {
00665         T alpha = *fac2;
00666         if (options2 & (1 << 0))
00667           alpha = -alpha;
00668 
00669         T beta = *fac3;
00670         if (options3 & (1 << 0))
00671           beta = -beta;
00672 
00673         unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
00674         unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
00675 
00676         if (options2 & (1 << 1))
00677         {
00678           if (options3 & (1 << 1))
00679           {
00680             for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
00681               for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
00682                 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
00683               = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
00684               + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
00685           }
00686           else
00687           {
00688             for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
00689               for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
00690                 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
00691               = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
00692               + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
00693           }
00694         }
00695         else
00696         {
00697           if (options3 & (1 << 1))
00698           {
00699             for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
00700               for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
00701                 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
00702               = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
00703               + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
00704           }
00705           else
00706           {
00707             for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
00708               for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
00709                 A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
00710               = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
00711               + C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
00712           }
00713         }
00714       }
00715 
00716 
00717 
00718       //
00719       // assignments
00720       //
00721 
00722       template <typename T>
00723       __global__ void matrix_col_assign_kernel(
00724                 T * A,
00725                 unsigned int A_start1, unsigned int A_start2,
00726                 unsigned int A_inc1,   unsigned int A_inc2,
00727                 unsigned int A_size1,  unsigned int A_size2,
00728                 unsigned int A_internal_size1,  unsigned int A_internal_size2,
00729                 T alpha)
00730       {
00731         unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
00732         unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
00733 
00734         for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
00735           for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
00736             A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = alpha;
00737       }
00738 
00739 
00740       template <typename T>
00741       __global__ void matrix_col_diagonal_assign_kernel(
00742                 T * A,
00743                 unsigned int A_start1, unsigned int A_start2,
00744                 unsigned int A_inc1,   unsigned int A_inc2,
00745                 unsigned int A_size1,  unsigned int A_size2,
00746                 unsigned int A_internal_size1,  unsigned int A_internal_size2,
00747                 T alpha)
00748       {
00749         unsigned int gid = (blockIdx.x * blockDim.x + threadIdx.x);
00750 
00751         for (unsigned int row = gid; row < A_size1; row += blockDim.x * gridDim.x)
00752           A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1] = alpha;
00753       }
00754 
00755       //
00756       // binary element-wise operations
00757       //
00758 
00759       template <typename T>
00760       __global__ void element_op_col_kernel(
00761                 T * A,
00762                 unsigned int A_start1, unsigned int A_start2,
00763                 unsigned int A_inc1,   unsigned int A_inc2,
00764                 unsigned int A_size1,  unsigned int A_size2,
00765                 unsigned int A_internal_size1,  unsigned int A_internal_size2,
00766 
00767                 const T * B,
00768                 unsigned int B_start1, unsigned int B_start2,
00769                 unsigned int B_inc1,   unsigned int B_inc2,
00770                 unsigned int B_internal_size1,  unsigned int B_internal_size2,
00771 
00772                 const T * C,
00773                 unsigned int C_start1, unsigned int C_start2,
00774                 unsigned int C_inc1,   unsigned int C_inc2,
00775                 unsigned int C_internal_size1,  unsigned int C_internal_size2,
00776 
00777                 unsigned int op_type) //0: product, 1: division, 2: pow
00778       {
00779         unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
00780         unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
00781 
00782         if (op_type == 2)
00783         {
00784           for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
00785             for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
00786               A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
00787             = pow(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1],
00788                   C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1]);
00789         }
00790         else if (op_type == 1)
00791         {
00792           for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
00793             for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
00794               A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
00795             = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]
00796             / C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1];
00797         }
00798         else if (op_type == 0)
00799         {
00800           for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
00801             for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
00802               A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
00803             = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]
00804             * C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1];
00805         }
00806       }
00807 
00808       template <typename T>
00809       __global__ void element_op_int_col_kernel(
00810                 T * A,
00811                 unsigned int A_start1, unsigned int A_start2,
00812                 unsigned int A_inc1,   unsigned int A_inc2,
00813                 unsigned int A_size1,  unsigned int A_size2,
00814                 unsigned int A_internal_size1,  unsigned int A_internal_size2,
00815 
00816                 const T * B,
00817                 unsigned int B_start1, unsigned int B_start2,
00818                 unsigned int B_inc1,   unsigned int B_inc2,
00819                 unsigned int B_internal_size1,  unsigned int B_internal_size2,
00820 
00821                 const T * C,
00822                 unsigned int C_start1, unsigned int C_start2,
00823                 unsigned int C_inc1,   unsigned int C_inc2,
00824                 unsigned int C_internal_size1,  unsigned int C_internal_size2,
00825 
00826                 unsigned int op_type) //0: product, 1: division, 2: pow
00827       {
00828         unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
00829         unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
00830 
00831         if (op_type == 1)
00832         {
00833           for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
00834             for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
00835               A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
00836             = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]
00837             / C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1];
00838         }
00839         else if (op_type == 0)
00840         {
00841           for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
00842             for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
00843               A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
00844             = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]
00845             * C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1];
00846         }
00847       }
00848 
00849 
00850       //
00851       // unary element-wise operations
00852       //
00853 
00854       // abs
00855       template <typename T>
00856       __global__ void matrix_col_element_abs_kernel(
00857                 T * A,
00858                 unsigned int A_start1, unsigned int A_start2,
00859                 unsigned int A_inc1,   unsigned int A_inc2,
00860                 unsigned int A_size1,  unsigned int A_size2,
00861                 unsigned int A_internal_size1,  unsigned int A_internal_size2,
00862 
00863                 const T * B,
00864                 unsigned int B_start1, unsigned int B_start2,
00865                 unsigned int B_inc1,   unsigned int B_inc2,
00866                 unsigned int B_internal_size1,  unsigned int B_internal_size2)
00867       {
00868         unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
00869         unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
00870 
00871         for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
00872           for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
00873             A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = abs(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
00874       }
00875 
00876 
00877       // acos
00878       template <typename T>
00879       __global__ void matrix_col_element_acos_kernel(
00880                 T * A,
00881                 unsigned int A_start1, unsigned int A_start2,
00882                 unsigned int A_inc1,   unsigned int A_inc2,
00883                 unsigned int A_size1,  unsigned int A_size2,
00884                 unsigned int A_internal_size1,  unsigned int A_internal_size2,
00885 
00886                 const T * B,
00887                 unsigned int B_start1, unsigned int B_start2,
00888                 unsigned int B_inc1,   unsigned int B_inc2,
00889                 unsigned int B_internal_size1,  unsigned int B_internal_size2)
00890       {
00891         unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
00892         unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
00893 
00894         for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
00895           for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
00896             A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = acos(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
00897       }
00898 
00899 
00900       // asin
00901       template <typename T>
00902       __global__ void matrix_col_element_asin_kernel(
00903                 T * A,
00904                 unsigned int A_start1, unsigned int A_start2,
00905                 unsigned int A_inc1,   unsigned int A_inc2,
00906                 unsigned int A_size1,  unsigned int A_size2,
00907                 unsigned int A_internal_size1,  unsigned int A_internal_size2,
00908 
00909                 const T * B,
00910                 unsigned int B_start1, unsigned int B_start2,
00911                 unsigned int B_inc1,   unsigned int B_inc2,
00912                 unsigned int B_internal_size1,  unsigned int B_internal_size2)
00913       {
00914         unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
00915         unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
00916 
00917         for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
00918           for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
00919             A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = asin(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
00920       }
00921 
00922 
00923       // atan
00924       template <typename T>
00925       __global__ void matrix_col_element_atan_kernel(
00926                 T * A,
00927                 unsigned int A_start1, unsigned int A_start2,
00928                 unsigned int A_inc1,   unsigned int A_inc2,
00929                 unsigned int A_size1,  unsigned int A_size2,
00930                 unsigned int A_internal_size1,  unsigned int A_internal_size2,
00931 
00932                 const T * B,
00933                 unsigned int B_start1, unsigned int B_start2,
00934                 unsigned int B_inc1,   unsigned int B_inc2,
00935                 unsigned int B_internal_size1,  unsigned int B_internal_size2)
00936       {
00937         unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
00938         unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
00939 
00940         for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
00941           for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
00942             A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = atan(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
00943       }
00944 
00945 
00946       // ceil
00947       template <typename T>
00948       __global__ void matrix_col_element_ceil_kernel(
00949                 T * A,
00950                 unsigned int A_start1, unsigned int A_start2,
00951                 unsigned int A_inc1,   unsigned int A_inc2,
00952                 unsigned int A_size1,  unsigned int A_size2,
00953                 unsigned int A_internal_size1,  unsigned int A_internal_size2,
00954 
00955                 const T * B,
00956                 unsigned int B_start1, unsigned int B_start2,
00957                 unsigned int B_inc1,   unsigned int B_inc2,
00958                 unsigned int B_internal_size1,  unsigned int B_internal_size2)
00959       {
00960         unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
00961         unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
00962 
00963         for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
00964           for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
00965             A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = ceil(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
00966       }
00967 
00968 
00969       // cos
00970       template <typename T>
00971       __global__ void matrix_col_element_cos_kernel(
00972                 T * A,
00973                 unsigned int A_start1, unsigned int A_start2,
00974                 unsigned int A_inc1,   unsigned int A_inc2,
00975                 unsigned int A_size1,  unsigned int A_size2,
00976                 unsigned int A_internal_size1,  unsigned int A_internal_size2,
00977 
00978                 const T * B,
00979                 unsigned int B_start1, unsigned int B_start2,
00980                 unsigned int B_inc1,   unsigned int B_inc2,
00981                 unsigned int B_internal_size1,  unsigned int B_internal_size2)
00982       {
00983         unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
00984         unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
00985 
00986         for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
00987           for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
00988             A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = cos(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
00989       }
00990 
00991 
00992       // cosh
00993       template <typename T>
00994       __global__ void matrix_col_element_cosh_kernel(
00995                 T * A,
00996                 unsigned int A_start1, unsigned int A_start2,
00997                 unsigned int A_inc1,   unsigned int A_inc2,
00998                 unsigned int A_size1,  unsigned int A_size2,
00999                 unsigned int A_internal_size1,  unsigned int A_internal_size2,
01000 
01001                 const T * B,
01002                 unsigned int B_start1, unsigned int B_start2,
01003                 unsigned int B_inc1,   unsigned int B_inc2,
01004                 unsigned int B_internal_size1,  unsigned int B_internal_size2)
01005       {
01006         unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
01007         unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
01008 
01009         for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
01010           for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
01011             A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = cosh(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
01012       }
01013 
01014 
01015       // exp
01016       template <typename T>
01017       __global__ void matrix_col_element_exp_kernel(
01018                 T * A,
01019                 unsigned int A_start1, unsigned int A_start2,
01020                 unsigned int A_inc1,   unsigned int A_inc2,
01021                 unsigned int A_size1,  unsigned int A_size2,
01022                 unsigned int A_internal_size1,  unsigned int A_internal_size2,
01023 
01024                 const T * B,
01025                 unsigned int B_start1, unsigned int B_start2,
01026                 unsigned int B_inc1,   unsigned int B_inc2,
01027                 unsigned int B_internal_size1,  unsigned int B_internal_size2)
01028       {
01029         unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
01030         unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
01031 
01032         for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
01033           for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
01034             A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = exp(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
01035       }
01036 
01037 
01038       // fabs
01039       template <typename T>
01040       __global__ void matrix_col_element_fabs_kernel(
01041                 T * A,
01042                 unsigned int A_start1, unsigned int A_start2,
01043                 unsigned int A_inc1,   unsigned int A_inc2,
01044                 unsigned int A_size1,  unsigned int A_size2,
01045                 unsigned int A_internal_size1,  unsigned int A_internal_size2,
01046 
01047                 const T * B,
01048                 unsigned int B_start1, unsigned int B_start2,
01049                 unsigned int B_inc1,   unsigned int B_inc2,
01050                 unsigned int B_internal_size1,  unsigned int B_internal_size2)
01051       {
01052         unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
01053         unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
01054 
01055         for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
01056           for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
01057             A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = fabs(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
01058       }
01059 
01060 
01061       // floor
01062       template <typename T>
01063       __global__ void matrix_col_element_floor_kernel(
01064                 T * A,
01065                 unsigned int A_start1, unsigned int A_start2,
01066                 unsigned int A_inc1,   unsigned int A_inc2,
01067                 unsigned int A_size1,  unsigned int A_size2,
01068                 unsigned int A_internal_size1,  unsigned int A_internal_size2,
01069 
01070                 const T * B,
01071                 unsigned int B_start1, unsigned int B_start2,
01072                 unsigned int B_inc1,   unsigned int B_inc2,
01073                 unsigned int B_internal_size1,  unsigned int B_internal_size2)
01074       {
01075         unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
01076         unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
01077 
01078         for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
01079           for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
01080             A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = floor(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
01081       }
01082 
01083 
01084       // log
01085       template <typename T>
01086       __global__ void matrix_col_element_log_kernel(
01087                 T * A,
01088                 unsigned int A_start1, unsigned int A_start2,
01089                 unsigned int A_inc1,   unsigned int A_inc2,
01090                 unsigned int A_size1,  unsigned int A_size2,
01091                 unsigned int A_internal_size1,  unsigned int A_internal_size2,
01092 
01093                 const T * B,
01094                 unsigned int B_start1, unsigned int B_start2,
01095                 unsigned int B_inc1,   unsigned int B_inc2,
01096                 unsigned int B_internal_size1,  unsigned int B_internal_size2)
01097       {
01098         unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
01099         unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
01100 
01101         for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
01102           for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
01103             A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = log(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
01104       }
01105 
01106 
01107       // log10
01108       template <typename T>
01109       __global__ void matrix_col_element_log10_kernel(
01110                 T * A,
01111                 unsigned int A_start1, unsigned int A_start2,
01112                 unsigned int A_inc1,   unsigned int A_inc2,
01113                 unsigned int A_size1,  unsigned int A_size2,
01114                 unsigned int A_internal_size1,  unsigned int A_internal_size2,
01115 
01116                 const T * B,
01117                 unsigned int B_start1, unsigned int B_start2,
01118                 unsigned int B_inc1,   unsigned int B_inc2,
01119                 unsigned int B_internal_size1,  unsigned int B_internal_size2)
01120       {
01121         unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
01122         unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
01123 
01124         for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
01125           for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
01126             A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = log10(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
01127       }
01128 
01129 
01130       // sin
01131       template <typename T>
01132       __global__ void matrix_col_element_sin_kernel(
01133                 T * A,
01134                 unsigned int A_start1, unsigned int A_start2,
01135                 unsigned int A_inc1,   unsigned int A_inc2,
01136                 unsigned int A_size1,  unsigned int A_size2,
01137                 unsigned int A_internal_size1,  unsigned int A_internal_size2,
01138 
01139                 const T * B,
01140                 unsigned int B_start1, unsigned int B_start2,
01141                 unsigned int B_inc1,   unsigned int B_inc2,
01142                 unsigned int B_internal_size1,  unsigned int B_internal_size2)
01143       {
01144         unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
01145         unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
01146 
01147         for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
01148           for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
01149             A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = sin(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
01150       }
01151 
01152 
01153       // sinh
01154       template <typename T>
01155       __global__ void matrix_col_element_sinh_kernel(
01156                 T * A,
01157                 unsigned int A_start1, unsigned int A_start2,
01158                 unsigned int A_inc1,   unsigned int A_inc2,
01159                 unsigned int A_size1,  unsigned int A_size2,
01160                 unsigned int A_internal_size1,  unsigned int A_internal_size2,
01161 
01162                 const T * B,
01163                 unsigned int B_start1, unsigned int B_start2,
01164                 unsigned int B_inc1,   unsigned int B_inc2,
01165                 unsigned int B_internal_size1,  unsigned int B_internal_size2)
01166       {
01167         unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
01168         unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
01169 
01170         for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
01171           for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
01172             A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = sinh(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
01173       }
01174 
01175 
01176       // sqrt
01177       template <typename T>
01178       __global__ void matrix_col_element_sqrt_kernel(
01179                 T * A,
01180                 unsigned int A_start1, unsigned int A_start2,
01181                 unsigned int A_inc1,   unsigned int A_inc2,
01182                 unsigned int A_size1,  unsigned int A_size2,
01183                 unsigned int A_internal_size1,  unsigned int A_internal_size2,
01184 
01185                 const T * B,
01186                 unsigned int B_start1, unsigned int B_start2,
01187                 unsigned int B_inc1,   unsigned int B_inc2,
01188                 unsigned int B_internal_size1,  unsigned int B_internal_size2)
01189       {
01190         unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
01191         unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
01192 
01193         for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
01194           for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
01195             A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = sqrt(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
01196       }
01197 
01198 
01199       // tan
01200       template <typename T>
01201       __global__ void matrix_col_element_tan_kernel(
01202                 T * A,
01203                 unsigned int A_start1, unsigned int A_start2,
01204                 unsigned int A_inc1,   unsigned int A_inc2,
01205                 unsigned int A_size1,  unsigned int A_size2,
01206                 unsigned int A_internal_size1,  unsigned int A_internal_size2,
01207 
01208                 const T * B,
01209                 unsigned int B_start1, unsigned int B_start2,
01210                 unsigned int B_inc1,   unsigned int B_inc2,
01211                 unsigned int B_internal_size1,  unsigned int B_internal_size2)
01212       {
01213         unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
01214         unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
01215 
01216         for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
01217           for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
01218             A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = tan(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
01219       }
01220 
01221 
01222       // tanh
01223       template <typename T>
01224       __global__ void matrix_col_element_tanh_kernel(
01225                 T * A,
01226                 unsigned int A_start1, unsigned int A_start2,
01227                 unsigned int A_inc1,   unsigned int A_inc2,
01228                 unsigned int A_size1,  unsigned int A_size2,
01229                 unsigned int A_internal_size1,  unsigned int A_internal_size2,
01230 
01231                 const T * B,
01232                 unsigned int B_start1, unsigned int B_start2,
01233                 unsigned int B_inc1,   unsigned int B_inc2,
01234                 unsigned int B_internal_size1,  unsigned int B_internal_size2)
01235       {
01236         unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
01237         unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
01238 
01239         for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
01240           for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
01241             A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = tanh(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
01242       }
01243 
01244 
01245 
01246       //
01247       // matrix-vector product
01248       //
01249 
01250       template <typename T>
01251       __global__ void vec_mul_col_kernel(
01252                 const T * A,
01253                 unsigned int A_row_start,
01254                 unsigned int A_col_start,
01255                 unsigned int A_row_inc,
01256                 unsigned int A_col_inc,
01257                 unsigned int A_row_size,
01258                 unsigned int A_col_size,
01259                 unsigned int A_internal_rows,
01260                 unsigned int A_internal_cols,
01261                 const T * v,
01262                 unsigned int v_start,
01263                 unsigned int v_inc,
01264                 unsigned int v_size,
01265                 T * result,
01266                 unsigned int result_start,
01267                 unsigned int result_inc,
01268                 unsigned int result_size)
01269       {
01270 
01271         for (unsigned int row = blockIdx.x * blockDim.x + threadIdx.x; row < A_row_size; row += gridDim.x * blockDim.x)
01272         {
01273           T dot_prod = 0;
01274           for (unsigned int col = 0; col < A_col_size; ++col)
01275             dot_prod += A[(row * A_row_inc + A_row_start) + (col * A_col_inc + A_col_start) * A_internal_rows] * v[v_start + v_inc * col];
01276           result[row * result_inc + result_start] = dot_prod;
01277         }
01278       }
01279 
01280 
01281       template <typename T>
01282       __global__ void trans_vec_mul_col_kernel(
01283                 const T * A,
01284                 unsigned int A_row_start,
01285                 unsigned int A_col_start,
01286                 unsigned int A_row_inc,
01287                 unsigned int A_col_inc,
01288                 unsigned int A_row_size,
01289                 unsigned int A_col_size,
01290                 unsigned int A_internal_rows,
01291                 unsigned int A_internal_cols,
01292                 const T * v,
01293                 unsigned int v_start,
01294                 unsigned int v_inc,
01295                 unsigned int v_size,
01296                 T * result,
01297                 unsigned int result_start,
01298                 unsigned int result_inc,
01299                 unsigned int result_size)
01300       {
01301         __shared__ T work[128];
01302 
01303         unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
01304         unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
01305         unsigned int lid = threadIdx.x;
01306 
01307         for (unsigned int row = row_gid; row < A_col_size; row += gridDim.x)
01308         {
01309           T dot_prod = 0;
01310           for (unsigned int col = col_gid; col < A_row_size; col += blockDim.x)
01311             dot_prod += A[(row * A_col_inc + A_col_start) * A_internal_rows + col * A_row_inc + A_row_start] * v[v_start + v_inc * col];
01312           work[lid] = dot_prod;
01313 
01314           for(unsigned int stride = blockDim.x/2 ; stride>0 ; stride>>=1){
01315             __syncthreads();
01316             if(lid < stride)
01317               work[lid] += work[lid+stride];
01318           }
01319 
01320           if(lid == 0)
01321             result[row * result_inc + result_start] = work[0];
01322         }
01323       }
01324 
01325 
01326       //
01327       // matrix-matrix products
01328       //
01329 
01330 
01331 
01332 
01333       //
01334       // scaled rank-1-update
01335       //
01336 
01337       // alpha on CPU
01338       template <typename T>
01339       __global__ void scaled_rank1_update_col_kernel(
01340                 T * A,
01341                 unsigned int A_start1, unsigned int A_start2,
01342                 unsigned int A_inc1,   unsigned int A_inc2,
01343                 unsigned int A_size1,  unsigned int A_size2,
01344                 unsigned int A_internal_size1,  unsigned int A_internal_size2,
01345 
01346                 T val,
01347                 unsigned int options2,
01348 
01349                 const T * vec1,
01350                 unsigned int start1,
01351                 unsigned int inc1,
01352                 unsigned int size1,
01353 
01354                 const T * vec2,
01355                 unsigned int start2,
01356                 unsigned int inc2,
01357                 unsigned int size2)
01358       {
01359         T alpha = val;
01360         if (options2 & (1 << 0))
01361           alpha = -alpha;
01362         if (options2 & (1 << 1))
01363           alpha = ((T)(1)) / alpha;
01364 
01365         unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
01366         unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
01367 
01368         for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
01369         {
01370           T tmp = alpha * vec1[row * inc1 + start1];
01371           for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
01372             A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] += tmp * vec2[col * inc2 + start2];
01373         }
01374       }
01375 
01376 
01377       // alpha on GPU
01378       template <typename T>
01379       __global__ void scaled_rank1_update_col_kernel(
01380                 T * A,
01381                 unsigned int A_start1, unsigned int A_start2,
01382                 unsigned int A_inc1,   unsigned int A_inc2,
01383                 unsigned int A_size1,  unsigned int A_size2,
01384                 unsigned int A_internal_size1,  unsigned int A_internal_size2,
01385 
01386                 const T * val,
01387                 unsigned int options2,
01388 
01389                 const T * vec1,
01390                 unsigned int start1,
01391                 unsigned int inc1,
01392                 unsigned int size1,
01393 
01394                 const T * vec2,
01395                 unsigned int start2,
01396                 unsigned int inc2,
01397                 unsigned int size2)
01398       {
01399         T alpha = *val;
01400         if (options2 & (1 << 0))
01401           alpha = -alpha;
01402         if (options2 & (1 << 1))
01403           alpha = ((T)(1)) / alpha;
01404 
01405         unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
01406         unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
01407 
01408         for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
01409         {
01410           T tmp = alpha * vec1[row * inc1 + start1];
01411           for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
01412             A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] += tmp * vec2[col * inc2 + start2];
01413         }
01414       }
01415 
01416 
01417 
01418     } // namespace cuda
01419   } //namespace linalg
01420 } //namespace viennacl
01421 
01422 
01423 #endif