ViennaCL - The Vienna Computing Library
1.5.1
|
00001 #ifndef VIENNACL_LINALG_OPENCL_KERNELS_MATRIX_HPP 00002 #define VIENNACL_LINALG_OPENCL_KERNELS_MATRIX_HPP 00003 00004 #include "viennacl/tools/tools.hpp" 00005 #include "viennacl/ocl/kernel.hpp" 00006 #include "viennacl/ocl/platform.hpp" 00007 #include "viennacl/ocl/utils.hpp" 00008 00011 namespace viennacl 00012 { 00013 namespace linalg 00014 { 00015 namespace opencl 00016 { 00017 namespace kernels 00018 { 00019 00021 00023 enum ambm_scalar_type 00024 { 00025 VIENNACL_AMBM_NONE = 0, // vector does not exist/contribute 00026 VIENNACL_AMBM_CPU, 00027 VIENNACL_AMBM_GPU 00028 }; 00029 00031 struct ambm_config 00032 { 00033 ambm_config() : with_stride_and_range(true), is_row_major(true), a(VIENNACL_AMBM_CPU), b(VIENNACL_AMBM_NONE) {} 00034 00035 bool with_stride_and_range; 00036 bool is_row_major; 00037 std::string assign_op; 00038 ambm_scalar_type a; 00039 ambm_scalar_type b; 00040 }; 00041 00042 // just returns the for-loop 00043 template <typename StringType> 00044 void generate_ambm_impl2(StringType & source, ambm_config const & cfg, bool mult_alpha, bool mult_beta) 00045 { 00046 if (cfg.is_row_major) 00047 { 00048 source.append(" unsigned int row_gid = get_global_id(0) / get_local_size(0);\n"); 00049 source.append(" unsigned int col_gid = get_global_id(0) % get_local_size(0);\n"); 00050 source.append(" for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))\n"); 00051 source.append(" for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))\n"); 00052 } 00053 else 00054 { 00055 source.append(" unsigned int col_gid = get_global_id(0) / get_local_size(0);\n"); 00056 source.append(" unsigned int row_gid = get_global_id(0) % get_local_size(0);\n"); 00057 source.append(" for (unsigned int col = col_gid; col < A_size2; col += get_num_groups(0))\n"); 00058 source.append(" for (unsigned int row = row_gid; row < A_size1; row += get_local_size(0))\n"); 00059 } 00060 00061 if (cfg.with_stride_and_range) 00062 { 00063 if (cfg.is_row_major) 00064 source.append(" A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] "); 00065 else 00066 source.append(" A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] "); 00067 source.append(cfg.assign_op); 00068 if (cfg.is_row_major) 00069 source.append(" B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] "); 00070 else 00071 source.append(" B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] "); 00072 00073 if (mult_alpha) 00074 source.append("* alpha "); 00075 else 00076 source.append("/ alpha "); 00077 if (cfg.b != VIENNACL_AMBM_NONE) 00078 { 00079 if (cfg.is_row_major) 00080 source.append("+ C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)] "); 00081 else 00082 source.append("+ C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] "); 00083 if (mult_beta) 00084 source.append("* beta"); 00085 else 00086 source.append("/ beta"); 00087 } 00088 } 00089 else 00090 { 00091 if (cfg.is_row_major) 00092 source.append(" A[row * A_internal_size2 + col] "); 00093 else 00094 source.append(" A[row + col * A_internal_size1] "); 00095 source.append(cfg.assign_op); 00096 if (cfg.is_row_major) 00097 source.append(" B[row * B_internal_size2 + col] "); 00098 else 00099 source.append(" B[row + col * B_internal_size1] "); 00100 00101 if (mult_alpha) 00102 source.append("* alpha "); 00103 else 00104 source.append("/ alpha "); 00105 if (cfg.b != VIENNACL_AMBM_NONE) 00106 { 00107 if (cfg.is_row_major) 00108 source.append("+ C[row * C_internal_size2 + col] "); 00109 else 00110 source.append("+ C[row + col * C_internal_size2] "); 00111 if (mult_beta) 00112 source.append("* beta"); 00113 else 00114 source.append("/ beta"); 00115 } 00116 } 00117 source.append("; \n"); 00118 } 00119 00120 template <typename StringType> 00121 void generate_ambm_impl(StringType & source, std::string const & numeric_string, ambm_config const & cfg) 00122 { 00123 source.append("__kernel void am"); 00124 if (cfg.b != VIENNACL_AMBM_NONE) 00125 source.append("bm"); 00126 if (cfg.assign_op != "=") 00127 source.append("_m"); 00128 00129 if (cfg.a == VIENNACL_AMBM_CPU) 00130 source.append("_cpu"); 00131 else if (cfg.a == VIENNACL_AMBM_GPU) 00132 source.append("_gpu"); 00133 00134 if (cfg.b == VIENNACL_AMBM_CPU) 00135 source.append("_cpu"); 00136 else if (cfg.b == VIENNACL_AMBM_GPU) 00137 source.append("_gpu"); 00138 source.append("( \n"); 00139 source.append(" __global "); source.append(numeric_string); source.append(" * A, \n"); 00140 source.append(" unsigned int A_start1, unsigned int A_start2, \n"); 00141 source.append(" unsigned int A_inc1, unsigned int A_inc2, \n"); 00142 source.append(" unsigned int A_size1, unsigned int A_size2, \n"); 00143 source.append(" unsigned int A_internal_size1, unsigned int A_internal_size2, \n"); 00144 if (cfg.a == VIENNACL_AMBM_CPU) 00145 { 00146 source.append(" "); source.append(numeric_string); source.append(" fac2, \n"); 00147 } 00148 else if (cfg.a == VIENNACL_AMBM_GPU) 00149 { 00150 source.append(" __global "); source.append(numeric_string); source.append(" * fac2, \n"); 00151 } 00152 source.append(" unsigned int options2, \n"); // 0: no action, 1: flip sign, 2: take inverse, 3: flip sign and take inverse 00153 source.append(" __global const "); source.append(numeric_string); source.append(" * B, \n"); 00154 source.append(" unsigned int B_start1, unsigned int B_start2, \n"); 00155 source.append(" unsigned int B_inc1, unsigned int B_inc2, \n"); 00156 source.append(" unsigned int B_internal_size1, unsigned int B_internal_size2"); 00157 00158 if (cfg.b != VIENNACL_AMBM_NONE) 00159 { 00160 source.append(", \n\n"); 00161 if (cfg.b == VIENNACL_AMBM_CPU) 00162 { 00163 source.append(" "); source.append(numeric_string); source.append(" fac3, \n"); 00164 } 00165 else if (cfg.b == VIENNACL_AMBM_GPU) 00166 { 00167 source.append(" __global "); source.append(numeric_string); source.append(" * fac3, \n"); 00168 } 00169 source.append(" unsigned int options3, \n"); // 0: no action, 1: flip sign, 2: take inverse, 3: flip sign and take inverse 00170 source.append(" __global const "); source.append(numeric_string); source.append(" * C, \n"); 00171 source.append(" unsigned int C_start1, unsigned int C_start2, \n"); 00172 source.append(" unsigned int C_inc1, unsigned int C_inc2, \n"); 00173 source.append(" unsigned int C_internal_size1, unsigned int C_internal_size2 \n"); 00174 } 00175 source.append(") { \n"); 00176 00177 if (cfg.a == VIENNACL_AMBM_CPU) 00178 { 00179 source.append(" "); source.append(numeric_string); source.append(" alpha = fac2; \n"); 00180 } 00181 else if (cfg.a == VIENNACL_AMBM_GPU) 00182 { 00183 source.append(" "); source.append(numeric_string); source.append(" alpha = fac2[0]; \n"); 00184 } 00185 source.append(" if (options2 & (1 << 0)) \n"); 00186 source.append(" alpha = -alpha; \n"); 00187 source.append(" \n"); 00188 00189 if (cfg.b == VIENNACL_AMBM_CPU) 00190 { 00191 source.append(" "); source.append(numeric_string); source.append(" beta = fac3; \n"); 00192 } 00193 else if (cfg.b == VIENNACL_AMBM_GPU) 00194 { 00195 source.append(" "); source.append(numeric_string); source.append(" beta = fac3[0]; \n"); 00196 } 00197 if (cfg.b != VIENNACL_AMBM_NONE) 00198 { 00199 source.append(" if (options3 & (1 << 0)) \n"); 00200 source.append(" beta = -beta; \n"); 00201 source.append(" \n"); 00202 } 00203 source.append(" if (options2 & (1 << 1)) { \n"); 00204 if (cfg.b != VIENNACL_AMBM_NONE) 00205 { 00206 source.append(" if (options3 & (1 << 1)) {\n"); 00207 generate_ambm_impl2(source, cfg, false, false); 00208 source.append(" } else {\n"); 00209 generate_ambm_impl2(source, cfg, false, true); 00210 source.append(" } \n"); 00211 } 00212 else 00213 generate_ambm_impl2(source, cfg, false, true); 00214 source.append(" } else { \n"); 00215 if (cfg.b != VIENNACL_AMBM_NONE) 00216 { 00217 source.append(" if (options3 & (1 << 1)) {\n"); 00218 generate_ambm_impl2(source, cfg, true, false); 00219 source.append(" } else {\n"); 00220 generate_ambm_impl2(source, cfg, true, true); 00221 source.append(" } \n"); 00222 } 00223 else 00224 generate_ambm_impl2(source, cfg, true, true); 00225 source.append(" } \n"); 00226 source.append("} \n"); 00227 } 00228 00229 template <typename StringType> 00230 void generate_ambm(StringType & source, std::string const & numeric_string, bool is_row_major) 00231 { 00232 ambm_config cfg; 00233 cfg.assign_op = "="; 00234 cfg.with_stride_and_range = true; 00235 cfg.is_row_major = is_row_major; 00236 00237 // am 00238 cfg.b = VIENNACL_AMBM_NONE; cfg.a = VIENNACL_AMBM_CPU; generate_ambm_impl(source, numeric_string, cfg); 00239 cfg.b = VIENNACL_AMBM_NONE; cfg.a = VIENNACL_AMBM_GPU; generate_ambm_impl(source, numeric_string, cfg); 00240 00241 // ambm 00242 cfg.a = VIENNACL_AMBM_CPU; cfg.b = VIENNACL_AMBM_CPU; generate_ambm_impl(source, numeric_string, cfg); 00243 cfg.a = VIENNACL_AMBM_CPU; cfg.b = VIENNACL_AMBM_GPU; generate_ambm_impl(source, numeric_string, cfg); 00244 cfg.a = VIENNACL_AMBM_GPU; cfg.b = VIENNACL_AMBM_CPU; generate_ambm_impl(source, numeric_string, cfg); 00245 cfg.a = VIENNACL_AMBM_GPU; cfg.b = VIENNACL_AMBM_GPU; generate_ambm_impl(source, numeric_string, cfg); 00246 00247 // ambm_m 00248 cfg.assign_op = "+="; 00249 00250 cfg.a = VIENNACL_AMBM_CPU; cfg.b = VIENNACL_AMBM_CPU; generate_ambm_impl(source, numeric_string, cfg); 00251 cfg.a = VIENNACL_AMBM_CPU; cfg.b = VIENNACL_AMBM_GPU; generate_ambm_impl(source, numeric_string, cfg); 00252 cfg.a = VIENNACL_AMBM_GPU; cfg.b = VIENNACL_AMBM_CPU; generate_ambm_impl(source, numeric_string, cfg); 00253 cfg.a = VIENNACL_AMBM_GPU; cfg.b = VIENNACL_AMBM_GPU; generate_ambm_impl(source, numeric_string, cfg); 00254 } 00255 00256 template <typename StringType> 00257 void generate_assign_cpu(StringType & source, std::string const & numeric_string, bool is_row_major) 00258 { 00259 source.append("__kernel void assign_cpu( \n"); 00260 source.append(" __global "); source.append(numeric_string); source.append(" * A, \n"); 00261 source.append(" unsigned int A_start1, unsigned int A_start2, \n"); 00262 source.append(" unsigned int A_inc1, unsigned int A_inc2, \n"); 00263 source.append(" unsigned int A_size1, unsigned int A_size2, \n"); 00264 source.append(" unsigned int A_internal_size1, unsigned int A_internal_size2, \n"); 00265 source.append(" "); source.append(numeric_string); source.append(" alpha) \n"); 00266 source.append("{ \n"); 00267 if (is_row_major) 00268 { 00269 source.append(" unsigned int row_gid = get_global_id(0) / get_local_size(0);\n"); 00270 source.append(" unsigned int col_gid = get_global_id(0) % get_local_size(0);\n"); 00271 source.append(" for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))\n"); 00272 source.append(" for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))\n"); 00273 source.append(" A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = alpha; \n"); 00274 } 00275 else 00276 { 00277 source.append(" unsigned int row_gid = get_global_id(0) % get_local_size(0);\n"); 00278 source.append(" unsigned int col_gid = get_global_id(0) / get_local_size(0);\n"); 00279 source.append(" for (unsigned int col = col_gid; col < A_size2; col += get_num_groups(0))\n"); 00280 source.append(" for (unsigned int row = row_gid; row < A_size1; row += get_local_size(0))\n"); 00281 source.append(" A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = alpha; \n"); 00282 } 00283 source.append("} \n"); 00284 } 00285 00286 template <typename StringType> 00287 void generate_diagonal_assign_cpu(StringType & source, std::string const & numeric_string, bool is_row_major) 00288 { 00289 source.append("__kernel void diagonal_assign_cpu( \n"); 00290 source.append(" __global "); source.append(numeric_string); source.append(" * A, \n"); 00291 source.append(" unsigned int A_start1, unsigned int A_start2, \n"); 00292 source.append(" unsigned int A_inc1, unsigned int A_inc2, \n"); 00293 source.append(" unsigned int A_size1, unsigned int A_size2, \n"); 00294 source.append(" unsigned int A_internal_size1, unsigned int A_internal_size2, \n"); 00295 source.append(" "); source.append(numeric_string); source.append(" alpha) \n"); 00296 source.append("{ \n"); 00297 source.append(" for (unsigned int idx = get_global_id(0); idx < min(A_size1, A_size2); idx += get_global_size(0))\n"); 00298 if (is_row_major) 00299 source.append(" A[(idx * A_inc1 + A_start1) * A_internal_size2 + (idx * A_inc2 + A_start2)] = alpha; \n"); 00300 else 00301 source.append(" A[(idx * A_inc1 + A_start1) + (idx * A_inc2 + A_start2) * A_internal_size1] = alpha; \n"); 00302 source.append("} \n"); 00303 } 00304 00305 template <typename StringType> 00306 void generate_element_op(StringType & source, std::string const & numeric_string, bool is_row_major) 00307 { 00308 source.append("__kernel void element_op( \n"); 00309 source.append(" __global "); source.append(numeric_string); source.append(" * A, \n"); 00310 source.append(" unsigned int A_start1, unsigned int A_start2, \n"); 00311 source.append(" unsigned int A_inc1, unsigned int A_inc2, \n"); 00312 source.append(" unsigned int A_size1, unsigned int A_size2, \n"); 00313 source.append(" unsigned int A_internal_size1, unsigned int A_internal_size2, \n"); 00314 source.append(" __global "); source.append(numeric_string); source.append(" * B, \n"); 00315 source.append(" unsigned int B_start1, unsigned int B_start2, \n"); 00316 source.append(" unsigned int B_inc1, unsigned int B_inc2, \n"); 00317 source.append(" unsigned int B_internal_size1, unsigned int B_internal_size2, \n"); 00318 source.append(" __global "); source.append(numeric_string); source.append(" * C, \n"); 00319 source.append(" unsigned int C_start1, unsigned int C_start2, \n"); 00320 source.append(" unsigned int C_inc1, unsigned int C_inc2, \n"); 00321 source.append(" unsigned int C_internal_size1, unsigned int C_internal_size2, \n"); 00322 source.append(" unsigned int op_type) \n"); //0: product, 1: division, 2: pow 00323 source.append("{ \n"); 00324 if (is_row_major) 00325 { 00326 source.append(" unsigned int row_gid = get_global_id(0) / get_local_size(0);\n"); 00327 source.append(" unsigned int col_gid = get_global_id(0) % get_local_size(0);\n"); 00328 source.append(" if (op_type == 2) {"); 00329 if (numeric_string == "float" || numeric_string == "double") 00330 { 00331 source.append(" for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))\n"); 00332 source.append(" for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))\n"); 00333 source.append(" A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = \n"); 00334 source.append(" pow(B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)], \n"); 00335 source.append(" C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)]); \n"); 00336 } 00337 source.append(" } else if (op_type == 1) {"); 00338 source.append(" for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))\n"); 00339 source.append(" for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))\n"); 00340 source.append(" A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = \n"); 00341 source.append(" B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] / \n"); 00342 source.append(" C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)]; \n"); 00343 source.append(" } else if (op_type == 0) {"); 00344 source.append(" for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))\n"); 00345 source.append(" for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))\n"); 00346 source.append(" A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = \n"); 00347 source.append(" B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] * \n"); 00348 source.append(" C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)]; \n"); 00349 source.append(" }"); 00350 } 00351 else 00352 { 00353 source.append(" unsigned int row_gid = get_global_id(0) % get_local_size(0);\n"); 00354 source.append(" unsigned int col_gid = get_global_id(0) / get_local_size(0);\n"); 00355 source.append(" if (op_type == 2) {"); 00356 if (numeric_string == "float" || numeric_string == "double") 00357 { 00358 source.append(" for (unsigned int col = col_gid; col < A_size2; col += get_num_groups(0))\n"); 00359 source.append(" for (unsigned int row = row_gid; row < A_size1; row += get_local_size(0))\n"); 00360 source.append(" A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = \n"); 00361 source.append(" pow(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1], \n"); 00362 source.append(" C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1]); \n"); 00363 } 00364 source.append(" } else if (op_type == 1) {"); 00365 source.append(" for (unsigned int col = col_gid; col < A_size2; col += get_num_groups(0))\n"); 00366 source.append(" for (unsigned int row = row_gid; row < A_size1; row += get_local_size(0))\n"); 00367 source.append(" A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = \n"); 00368 source.append(" B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / \n"); 00369 source.append(" C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1]; \n"); 00370 source.append(" } else if (op_type == 0) {"); 00371 source.append(" for (unsigned int col = col_gid; col < A_size2; col += get_num_groups(0))\n"); 00372 source.append(" for (unsigned int row = row_gid; row < A_size1; row += get_local_size(0))\n"); 00373 source.append(" A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = \n"); 00374 source.append(" B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * \n"); 00375 source.append(" C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1]; \n"); 00376 source.append(" }"); 00377 } 00378 source.append("} \n"); 00379 } 00380 00381 00382 template <typename StringType> 00383 void generate_fft(StringType & source, std::string const & numeric_string, bool is_row_major) 00384 { 00385 // naive fourier transform (quadratic complexity, use for reference only) 00386 source.append("__kernel void fft_direct(__global "); source.append(numeric_string); source.append("2 *input, \n"); 00387 source.append(" __global "); source.append(numeric_string); source.append("2 *output, \n"); 00388 source.append(" unsigned int size, \n"); 00389 source.append(" unsigned int stride, \n"); 00390 source.append(" unsigned int batch_num, \n"); 00391 source.append(" "); source.append(numeric_string); source.append(" sign) { \n"); 00392 source.append(" const "); source.append(numeric_string); source.append(" NUM_PI = 3.14159265358979323846; \n"); 00393 source.append(" \n"); 00394 source.append(" for(unsigned int batch_id = 0; batch_id < batch_num; batch_id++) { \n"); 00395 source.append(" for(unsigned int k = get_global_id(0); k < size; k += get_global_size(0)) { \n"); 00396 source.append(" "); source.append(numeric_string); source.append("2 f = 0.0f; \n"); 00397 source.append(" \n"); 00398 source.append(" for(unsigned int n = 0; n < size; n++) { \n"); 00399 source.append(" "); source.append(numeric_string); source.append("2 in = "); 00400 if (is_row_major) 00401 source.append("input[batch_id * stride + n]; \n"); //input index here 00402 else 00403 source.append("input[n * stride + batch_id]; \n"); //input index here 00404 source.append(" \n"); 00405 source.append(" "); source.append(numeric_string); source.append(" sn, cs; \n"); 00406 source.append(" "); source.append(numeric_string); source.append(" arg = sign * 2 * NUM_PI * k / size * n; \n"); 00407 source.append(" sn = sincos(arg, &cs); \n"); 00408 source.append(" \n"); 00409 source.append(" "); source.append(numeric_string); source.append("2 ex = ("); source.append(numeric_string); source.append("2)(cs, sn); \n"); 00410 source.append(" f = f + ("); source.append(numeric_string); source.append("2)(in.x * ex.x - in.y * ex.y, in.x * ex.y + in.y * ex.x); \n"); 00411 source.append(" } \n"); 00412 source.append(" \n"); 00413 if (is_row_major) 00414 source.append(" output[batch_id * stride + k] = f; \n"); // output index here 00415 else 00416 source.append(" output[k * stride + batch_id] = f; \n"); // output index here 00417 source.append(" } \n"); 00418 source.append(" } \n"); 00419 source.append("} \n"); 00420 00421 source.append(" \n"); 00422 00423 source.append("__kernel void fft_radix2(__global "); source.append(numeric_string); source.append("2* input, \n"); 00424 source.append(" unsigned int s, \n"); 00425 source.append(" unsigned int bit_size, \n"); 00426 source.append(" unsigned int size, \n"); 00427 source.append(" unsigned int stride, \n"); 00428 source.append(" unsigned int batch_num, \n"); 00429 source.append(" "); source.append(numeric_string); source.append(" sign) { \n"); 00430 source.append(" \n"); 00431 source.append(" unsigned int ss = 1 << s; \n"); 00432 source.append(" unsigned int half_size = size >> 1; \n"); 00433 source.append(" \n"); 00434 source.append(" "); source.append(numeric_string); source.append(" cs, sn; \n"); 00435 source.append(" const "); source.append(numeric_string); source.append(" NUM_PI = 3.14159265358979323846; \n"); 00436 source.append(" \n"); 00437 source.append(" unsigned int glb_id = get_global_id(0); \n"); 00438 source.append(" unsigned int glb_sz = get_global_size(0); \n"); 00439 00440 source.append(" for(unsigned int batch_id = 0; batch_id < batch_num; batch_id++) { \n"); 00441 source.append(" for(unsigned int tid = glb_id; tid < half_size; tid += glb_sz) { \n"); 00442 source.append(" unsigned int group = (tid & (ss - 1)); \n"); 00443 source.append(" unsigned int pos = ((tid >> s) << (s + 1)) + group; \n"); 00444 00445 if (is_row_major) 00446 { 00447 source.append(" unsigned int offset = batch_id * stride + pos; \n"); 00448 source.append(" "); source.append(numeric_string); source.append("2 in1 = input[offset]; \n"); //index 00449 source.append(" "); source.append(numeric_string); source.append("2 in2 = input[offset + ss]; \n");//index 00450 } 00451 else 00452 { 00453 source.append(" unsigned int offset = pos * stride + batch_id; \n"); 00454 source.append(" "); source.append(numeric_string); source.append("2 in1 = input[offset]; \n"); //index 00455 source.append(" "); source.append(numeric_string); source.append("2 in2 = input[offset + ss * stride]; \n");//index 00456 } 00457 00458 source.append(" "); source.append(numeric_string); source.append(" arg = group * sign * NUM_PI / ss; \n"); 00459 00460 source.append(" sn = sincos(arg, &cs); \n"); 00461 00462 source.append(" "); source.append(numeric_string); source.append("2 ex = ("); source.append(numeric_string); source.append("2)(cs, sn); \n"); 00463 00464 source.append(" "); source.append(numeric_string); source.append("2 tmp = ("); source.append(numeric_string); source.append("2)(in2.x * ex.x - in2.y * ex.y, in2.x * ex.y + in2.y * ex.x); \n"); 00465 00466 if (is_row_major) 00467 source.append(" input[offset + ss] = in1 - tmp; \n");//index 00468 else 00469 source.append(" input[offset + ss * stride] = in1 - tmp; \n");//index 00470 source.append(" input[offset] = in1 + tmp; \n");//index 00471 source.append(" } \n"); 00472 source.append(" } \n"); 00473 source.append("} \n"); 00474 00475 source.append(" \n"); 00476 00477 source.append(" unsigned int get_reorder_num(unsigned int v, unsigned int bit_size) { \n"); 00478 source.append(" v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1); \n"); 00479 source.append(" v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2); \n"); 00480 source.append(" v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4); \n"); 00481 source.append(" v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8); \n"); 00482 source.append(" v = (v >> 16) | (v << 16); \n"); 00483 source.append(" \n"); 00484 source.append(" v = v >> (32 - bit_size); \n"); 00485 source.append(" \n"); 00486 source.append(" return v; \n"); 00487 source.append(" } \n"); 00488 00489 source.append(" __kernel void fft_radix2_local(__global "); source.append(numeric_string); source.append("2* input, \n"); 00490 source.append(" __local "); source.append(numeric_string); source.append("2* lcl_input, \n"); 00491 source.append(" unsigned int bit_size, \n"); 00492 source.append(" unsigned int size, \n"); 00493 source.append(" unsigned int stride, \n"); 00494 source.append(" unsigned int batch_num, \n"); 00495 source.append(" "); source.append(numeric_string); source.append(" sign) { \n"); 00496 00497 source.append(" unsigned int grp_id = get_group_id(0); \n"); 00498 source.append(" unsigned int grp_num = get_num_groups(0); \n"); 00499 00500 source.append(" unsigned int lcl_sz = get_local_size(0); \n"); 00501 source.append(" unsigned int lcl_id = get_local_id(0); \n"); 00502 source.append(" const "); source.append(numeric_string); source.append(" NUM_PI = 3.14159265358979323846; \n"); 00503 00504 source.append(" for(unsigned int batch_id = grp_id; batch_id < batch_num; batch_id += grp_num) { \n"); 00505 //unsigned int base_offset = stride * batch_id; \n"); 00506 //copy chunk of global memory to local \n"); 00507 source.append(" for(unsigned int p = lcl_id; p < size; p += lcl_sz) { \n"); 00508 source.append(" unsigned int v = get_reorder_num(p, bit_size); \n"); 00509 if (is_row_major) 00510 source.append(" lcl_input[v] = input[batch_id * stride + p]; \n"); //index 00511 else 00512 source.append(" lcl_input[v] = input[p * stride + batch_id]; \n"); //index 00513 source.append(" } \n"); 00514 00515 source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n"); 00516 00517 //performs Cooley-Tukey FFT on local array 00518 source.append(" for(unsigned int s = 0; s < bit_size; s++) { \n"); 00519 source.append(" unsigned int ss = 1 << s; \n"); 00520 00521 source.append(" "); source.append(numeric_string); source.append(" cs, sn; \n"); 00522 00523 source.append(" for(unsigned int tid = lcl_id; tid < size; tid += lcl_sz) { \n"); 00524 source.append(" unsigned int group = (tid & (ss - 1)); \n"); 00525 source.append(" unsigned int pos = ((tid >> s) << (s + 1)) + group; \n"); 00526 00527 source.append(" "); source.append(numeric_string); source.append("2 in1 = lcl_input[pos]; \n"); 00528 source.append(" "); source.append(numeric_string); source.append("2 in2 = lcl_input[pos + ss]; \n"); 00529 00530 source.append(" "); source.append(numeric_string); source.append(" arg = group * sign * NUM_PI / ss; \n"); 00531 00532 source.append(" sn = sincos(arg, &cs); \n"); 00533 source.append(" "); source.append(numeric_string); source.append("2 ex = ("); source.append(numeric_string); source.append("2)(cs, sn); \n"); 00534 00535 source.append(" "); source.append(numeric_string); source.append("2 tmp = ("); source.append(numeric_string); source.append("2)(in2.x * ex.x - in2.y * ex.y, in2.x * ex.y + in2.y * ex.x); \n"); 00536 00537 source.append(" lcl_input[pos + ss] = in1 - tmp; \n"); 00538 source.append(" lcl_input[pos] = in1 + tmp; \n"); 00539 source.append(" } \n"); 00540 00541 source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n"); 00542 source.append(" } \n"); 00543 00544 //copy local array back to global memory 00545 source.append(" for(unsigned int p = lcl_id; p < size; p += lcl_sz) { \n"); 00546 if (is_row_major) 00547 source.append(" input[batch_id * stride + p] = lcl_input[p]; \n");//index 00548 else 00549 source.append(" input[p * stride + batch_id] = lcl_input[p]; \n");//index 00550 source.append(" } \n"); 00551 source.append(" } \n"); 00552 source.append(" } \n"); 00553 00554 source.append(" \n"); 00555 00556 // 00557 // Performs reordering of input data in bit-reversal order 00558 // Probably it's better to do in host side, 00559 // 00560 source.append("unsigned int get_reorder_num_2(unsigned int v, unsigned int bit_size) { \n"); 00561 source.append(" v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1); \n"); 00562 source.append(" v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2); \n"); 00563 source.append(" v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4); \n"); 00564 source.append(" v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8); \n"); 00565 source.append(" v = (v >> 16) | (v << 16); \n"); 00566 00567 source.append(" v = v >> (32 - bit_size); \n"); 00568 00569 source.append(" return v; \n"); 00570 source.append("} \n"); 00571 00572 source.append("__kernel void fft_reorder(__global "); source.append(numeric_string); source.append("2* input, \n"); 00573 source.append(" unsigned int bit_size, \n"); 00574 source.append(" unsigned int size, \n"); 00575 source.append(" unsigned int stride, \n"); 00576 source.append(" int batch_num) { \n"); 00577 00578 source.append(" unsigned int glb_id = get_global_id(0); \n"); 00579 source.append(" unsigned int glb_sz = get_global_size(0); \n"); 00580 00581 source.append(" for(unsigned int batch_id = 0; batch_id < batch_num; batch_id++) { \n"); 00582 source.append(" for(unsigned int i = glb_id; i < size; i += glb_sz) { \n"); 00583 source.append(" unsigned int v = get_reorder_num_2(i, bit_size); \n"); 00584 00585 source.append(" if(i < v) {\n"); 00586 if (is_row_major) 00587 { 00588 source.append(" "); source.append(numeric_string); source.append("2 tmp = input[batch_id * stride + i]; \n"); // index 00589 source.append(" input[batch_id * stride + i] = input[batch_id * stride + v]; \n"); //index 00590 source.append(" input[batch_id * stride + v] = tmp; \n"); //index 00591 } 00592 else 00593 { 00594 source.append(" "); source.append(numeric_string); source.append("2 tmp = input[i * stride + batch_id]; \n"); // index 00595 source.append(" input[i * stride + batch_id] = input[v * stride + batch_id]; \n"); //index 00596 source.append(" input[v * stride + batch_id] = tmp; \n"); //index 00597 } 00598 source.append(" } \n"); 00599 source.append(" } \n"); 00600 source.append(" } \n"); 00601 source.append("} \n"); 00602 } 00603 00604 template <typename StringType> 00605 void generate_lu(StringType & source, std::string const & numeric_string, bool is_row_major) 00606 { 00607 source.append("__kernel void lu_factorize( \n"); 00608 source.append(" __global "); source.append(numeric_string); source.append(" * matrix, \n"); 00609 source.append(" unsigned int matrix_rows, \n"); 00610 source.append(" unsigned int matrix_cols, \n"); 00611 source.append(" unsigned int matrix_internal_rows, \n"); 00612 source.append(" unsigned int matrix_internal_cols) \n"); 00613 source.append("{ \n"); 00614 source.append(" "); source.append(numeric_string); source.append(" temp; \n"); 00615 00616 if (is_row_major) 00617 { 00618 source.append(" unsigned rowi; \n"); 00619 source.append(" unsigned rowk; \n"); 00620 source.append(" for (unsigned int i=1; i<matrix_rows; ++i) \n"); 00621 source.append(" { \n"); 00622 source.append(" rowi = i * matrix_internal_cols; \n"); 00623 source.append(" for (unsigned int k=0; k<i; ++k) \n"); 00624 source.append(" { \n"); 00625 source.append(" rowk = k * matrix_internal_cols; \n"); 00626 source.append(" if (get_global_id(0) == 0) \n"); 00627 source.append(" matrix[rowi + k] /= matrix[rowk + k]; \n"); 00628 00629 source.append(" barrier(CLK_GLOBAL_MEM_FENCE); \n"); 00630 source.append(" temp = matrix[rowi + k]; \n"); 00631 00632 //parallel subtraction: 00633 source.append(" for (unsigned int j=k+1 + get_global_id(0); j<matrix_rows; j += get_global_size(0)) \n"); 00634 source.append(" matrix[rowi + j] -= temp * matrix[rowk + j]; \n"); 00635 } 00636 else 00637 { 00638 source.append(" for (unsigned int i=1; i<matrix_rows; ++i) \n"); 00639 source.append(" { \n"); 00640 source.append(" for (unsigned int k=0; k<i; ++k) \n"); 00641 source.append(" { \n"); 00642 00643 source.append(" if (get_global_id(0) == 0) \n"); 00644 source.append(" matrix[i + k*matrix_internal_rows] /= matrix[k + k*matrix_internal_rows]; \n"); 00645 00646 source.append(" barrier(CLK_GLOBAL_MEM_FENCE); \n"); 00647 source.append(" temp = matrix[i + k*matrix_internal_rows]; \n"); 00648 00649 //parallel subtraction: 00650 source.append(" for (unsigned int j=k+1 + get_global_id(0); j<matrix_cols; j += get_global_size(0)) \n"); 00651 source.append(" matrix[i + j*matrix_internal_rows] -= temp * matrix[k + j*matrix_internal_rows]; \n"); 00652 } 00653 source.append(" }"); 00654 source.append(" }"); 00655 source.append("}"); 00656 } 00657 00658 00659 template <typename StringType> 00660 void generate_scaled_rank1_update(StringType & source, std::string const & numeric_string, bool is_row_major, bool alpha_on_cpu) 00661 { 00662 source.append("__kernel void scaled_rank1_update_"); alpha_on_cpu ? source.append("cpu") : source.append("gpu"); source.append("( \n"); 00663 source.append(" __global "); source.append(numeric_string); source.append(" * A, \n"); 00664 source.append(" unsigned int A_start1, unsigned int A_start2, \n"); 00665 source.append(" unsigned int A_inc1, unsigned int A_inc2, \n"); 00666 source.append(" unsigned int A_size1, unsigned int A_size2, \n"); 00667 source.append(" unsigned int A_internal_size1, unsigned int A_internal_size2, \n"); 00668 00669 if (alpha_on_cpu) { 00670 source.append(" "); source.append(numeric_string); source.append(" val, \n"); 00671 } else { 00672 source.append(" __global const "); source.append(numeric_string); source.append(" *val, \n"); 00673 } 00674 source.append(" unsigned int options2, \n"); 00675 00676 source.append(" __global const "); source.append(numeric_string); source.append(" * vec1, \n"); 00677 source.append(" unsigned int start1, \n"); 00678 source.append(" unsigned int inc1, \n"); 00679 source.append(" unsigned int size1, \n"); 00680 00681 source.append(" __global const "); source.append(numeric_string); source.append(" * vec2, \n"); 00682 source.append(" unsigned int start2, \n"); 00683 source.append(" unsigned int inc2, \n"); 00684 source.append(" unsigned int size2) \n"); 00685 source.append("{ \n"); 00686 00687 if (alpha_on_cpu) { 00688 source.append(" "); source.append(numeric_string); source.append(" alpha = val; \n"); 00689 } else { 00690 source.append(" "); source.append(numeric_string); source.append(" alpha = val[0]; \n"); 00691 } 00692 source.append(" if (options2 & (1 << 0)) \n"); 00693 source.append(" alpha = -alpha; \n"); 00694 00695 source.append(" unsigned int row_gid = get_global_id(0) / get_local_size(0); \n"); 00696 source.append(" unsigned int col_gid = get_global_id(0) % get_local_size(0); \n"); 00697 00698 source.append(" for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0)) \n"); 00699 source.append(" { \n"); 00700 source.append(" "); source.append(numeric_string); source.append(" tmp = vec1[row * inc1 + start1];"); 00701 source.append(" tmp = (options2 & (1 << 1)) ? tmp / alpha : tmp * alpha;"); 00702 source.append(" for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0)) \n"); 00703 if (is_row_major) 00704 source.append(" A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] += tmp * vec2[col * inc2 + start2]; \n"); 00705 else 00706 source.append(" A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] += tmp * vec2[col * inc2 + start2]; \n"); 00707 source.append(" } \n"); 00708 source.append("} \n"); 00709 } 00710 00711 template <typename StringType> 00712 void generate_trans_vec_mul(StringType & source, std::string const & numeric_string, bool is_row_major) 00713 { 00714 source.append("__kernel void trans_vec_mul( \n"); 00715 source.append(" __global const "); source.append(numeric_string); source.append(" * A, \n"); 00716 source.append(" unsigned int A_row_start, unsigned int A_col_start, \n"); 00717 source.append(" unsigned int A_row_inc, unsigned int A_col_inc, \n"); 00718 source.append(" unsigned int A_row_size, unsigned int A_col_size, \n"); 00719 source.append(" unsigned int A_internal_rows, unsigned int A_internal_cols, \n"); 00720 source.append(" __global const "); source.append(numeric_string); source.append(" * v, \n"); 00721 source.append(" unsigned int v_start, unsigned int v_inc, unsigned int v_size, \n"); 00722 source.append(" __global "); source.append(numeric_string); source.append(" * result, \n"); 00723 source.append(" unsigned int result_start, unsigned int result_inc, unsigned int result_size, \n"); 00724 source.append(" __local "); source.append(numeric_string); source.append(" * work) \n"); 00725 source.append("{ \n"); 00726 if (is_row_major) 00727 { 00728 source.append(" for (unsigned int row = get_global_id(0); row < A_col_size; row += get_global_size(0)) \n"); 00729 source.append(" { \n"); 00730 source.append(" "); source.append(numeric_string); source.append(" dot_prod = 0; \n"); 00731 source.append(" for (unsigned int col = 0; col < A_row_size; ++col) \n"); 00732 source.append(" dot_prod += A[(row * A_col_inc + A_col_start) + (col * A_row_inc + A_row_start) * A_internal_cols] * v[v_start + v_inc * col]; \n"); 00733 source.append(" result[row * result_inc + result_start] = dot_prod; \n"); 00734 } 00735 else 00736 { 00737 source.append(" unsigned int row_gid = get_global_id(0) / get_local_size(0); \n"); 00738 source.append(" unsigned int col_gid = get_global_id(0) % get_local_size(0); \n"); 00739 source.append(" unsigned int lid = get_local_id(0); \n"); 00740 00741 source.append(" for (unsigned int row = row_gid; row < A_col_size; row += get_num_groups(0)) \n"); 00742 source.append(" { \n"); 00743 source.append(" "); source.append(numeric_string); source.append(" dot_prod = 0; \n"); 00744 source.append(" for (unsigned int col = col_gid; col < A_row_size; col+=get_local_size(0)) \n"); 00745 source.append(" dot_prod += A[(row * A_col_inc + A_col_start) * A_internal_rows + col * A_row_inc + A_row_start] * v[v_start + v_inc * col]; \n"); 00746 source.append(" work[lid] = dot_prod; \n"); 00747 00748 source.append(" for(unsigned int stride=get_local_size(0)/2 ; stride>0 ; stride>>=1){ \n"); 00749 source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n"); 00750 source.append(" if(lid < stride) \n"); 00751 source.append(" work[lid] += work[lid+stride]; \n"); 00752 source.append(" } \n"); 00753 00754 source.append(" if(lid == 0) \n"); 00755 source.append(" result[row * result_inc + result_start] = work[0]; \n"); 00756 } 00757 source.append(" } \n"); 00758 source.append("} \n"); 00759 } 00760 00761 template <typename StringType> 00762 void generate_triangular_substitute_inplace(StringType & source, std::string const & numeric_string, bool is_row_major) 00763 { 00764 source.append("__kernel void triangular_substitute_inplace( \n"); 00765 source.append(" __global "); source.append(numeric_string); source.append(" * A, \n"); 00766 source.append(" unsigned int A_start1, unsigned int A_start2, \n"); 00767 source.append(" unsigned int A_inc1, unsigned int A_inc2, \n"); 00768 source.append(" unsigned int A_size1, unsigned int A_size2, \n"); 00769 source.append(" unsigned int A_internal_size1, unsigned int A_internal_size2, \n"); 00770 source.append(" __global "); source.append(numeric_string); source.append(" * v, \n"); 00771 source.append(" unsigned int v_start, \n"); 00772 source.append(" unsigned int v_inc, \n"); 00773 source.append(" unsigned int v_size, \n"); 00774 source.append(" unsigned int options) \n"); 00775 source.append("{ \n"); 00776 source.append(" "); source.append(numeric_string); source.append(" temp; \n"); 00777 source.append(" unsigned int unit_diagonal_flag = (options & (1 << 0)); \n"); 00778 source.append(" unsigned int transposed_access_A = (options & (1 << 1)); \n"); 00779 source.append(" unsigned int is_lower_solve = (options & (1 << 2)); \n"); 00780 source.append(" unsigned int row; \n"); 00781 source.append(" for (unsigned int rows_processed = 0; rows_processed < A_size1; ++rows_processed) \n"); //Note: A required to be square 00782 source.append(" { \n"); 00783 source.append(" row = is_lower_solve ? rows_processed : ((A_size1 - rows_processed) - 1); \n"); 00784 source.append(" if (!unit_diagonal_flag) \n"); 00785 source.append(" { \n"); 00786 source.append(" barrier(CLK_GLOBAL_MEM_FENCE); \n"); 00787 source.append(" if (get_global_id(0) == 0) \n"); 00788 if (is_row_major) 00789 source.append(" v[row * v_inc + v_start] /= A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)]; \n"); 00790 else 00791 source.append(" v[row * v_inc + v_start] /= A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1]; \n"); 00792 source.append(" } \n"); 00793 00794 source.append(" barrier(CLK_GLOBAL_MEM_FENCE); \n"); 00795 00796 source.append(" temp = v[row * v_inc + v_start]; \n"); 00797 00798 source.append(" for (int elim = (is_lower_solve ? (row + get_global_id(0) + 1) : get_global_id(0)); \n"); 00799 source.append(" elim < (is_lower_solve ? A_size1 : row); \n"); 00800 source.append(" elim += get_global_size(0)) \n"); 00801 if (is_row_major) 00802 { 00803 source.append(" v[elim * v_inc + v_start] -= temp * A[transposed_access_A ? ((row * A_inc1 + A_start1) * A_internal_size2 + (elim * A_inc2 + A_start2)) \n"); 00804 source.append(" : ((elim * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2))]; \n"); 00805 } 00806 else 00807 { 00808 source.append(" v[elim * v_inc + v_start] -= temp * A[transposed_access_A ? ((row * A_inc1 + A_start1) + (elim * A_inc2 + A_start2) * A_internal_size1) \n"); 00809 source.append(" : ((elim * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1)]; \n"); 00810 } 00811 source.append(" } \n"); 00812 source.append("} \n"); 00813 } 00814 00815 template <typename StringType> 00816 void generate_vec_mul(StringType & source, std::string const & numeric_string, bool is_row_major) 00817 { 00818 source.append("__kernel void vec_mul( \n"); 00819 source.append(" __global const "); source.append(numeric_string); source.append(" * A, \n"); 00820 source.append(" unsigned int A_row_start, unsigned int A_col_start, \n"); 00821 source.append(" unsigned int A_row_inc, unsigned int A_col_inc, \n"); 00822 source.append(" unsigned int A_row_size, unsigned int A_col_size, \n"); 00823 source.append(" unsigned int A_internal_rows, unsigned int A_internal_cols, \n"); 00824 source.append(" __global const "); source.append(numeric_string); source.append(" * v, \n"); 00825 source.append(" unsigned int v_start, unsigned int v_inc, unsigned int v_size, \n"); 00826 source.append(" __global "); source.append(numeric_string); source.append(" * result, \n"); 00827 source.append(" unsigned int result_start, unsigned int result_inc, unsigned int result_size, \n"); 00828 source.append(" __local "); source.append(numeric_string); source.append(" * work) \n"); 00829 source.append("{ \n"); 00830 if (is_row_major) 00831 { 00832 source.append(" unsigned int row_gid = get_global_id(0) / get_local_size(0); \n"); 00833 source.append(" unsigned int col_gid = get_global_id(0) % get_local_size(0); \n"); 00834 source.append(" unsigned int lid = get_local_id(0); \n"); 00835 00836 source.append(" for (unsigned int row = row_gid; row < A_row_size; row += get_num_groups(0)) \n"); 00837 source.append(" { \n"); 00838 source.append(" "); source.append(numeric_string); source.append(" dot_prod = 0; \n"); 00839 source.append(" for (unsigned int col = col_gid; col < A_col_size; col+=get_local_size(0)) \n"); 00840 source.append(" dot_prod += A[(row * A_row_inc + A_row_start) * A_internal_cols + col * A_col_inc + A_col_start] * v[v_start + v_inc * col]; \n"); 00841 source.append(" work[lid] = dot_prod; \n"); 00842 00843 source.append(" for(unsigned int stride=get_local_size(0)/2 ; stride>0 ; stride>>=1){ \n"); 00844 source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n"); 00845 source.append(" if(lid < stride) \n"); 00846 source.append(" work[lid] += work[lid+stride]; \n"); 00847 source.append(" } \n"); 00848 00849 source.append(" if(lid == 0) \n"); 00850 source.append(" result[row * result_inc + result_start] = work[0]; \n"); 00851 00852 } 00853 else 00854 { 00855 source.append(" for (unsigned int row = get_global_id(0); row < A_row_size; row += get_global_size(0)) \n"); 00856 source.append(" { \n"); 00857 source.append(" "); source.append(numeric_string); source.append(" dot_prod = 0; \n"); 00858 source.append(" for (unsigned int col = 0; col < A_col_size; ++col) \n"); 00859 source.append(" dot_prod += A[(row * A_row_inc + A_row_start) + (col * A_col_inc + A_col_start) * A_internal_rows] * v[v_start + v_inc * col]; \n"); 00860 source.append(" result[row * result_inc + result_start] = dot_prod; \n"); 00861 } 00862 source.append(" } \n"); 00863 source.append("} \n"); 00864 } 00865 00866 namespace detail 00867 { 00868 inline std::string type_to_string(viennacl::row_major) { return "row"; } 00869 inline std::string type_to_string(viennacl::column_major) { return "col"; } 00870 } 00871 00873 00874 // main kernel class 00876 template <typename NumericT, typename F> 00877 struct matrix 00878 { 00879 static std::string program_name() 00880 { 00881 return viennacl::ocl::type_to_string<NumericT>::apply() + "_matrix_" + detail::type_to_string(F()); 00882 } 00883 00884 static void init(viennacl::ocl::context & ctx) 00885 { 00886 viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx); 00887 std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply(); 00888 bool is_row_major = viennacl::is_row_major<F>::value; 00889 00890 static std::map<cl_context, bool> init_done; 00891 if (!init_done[ctx.handle().get()]) 00892 { 00893 std::string source; 00894 source.reserve(8192); 00895 00896 viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source); 00897 00898 // fully parametrized kernels: 00899 generate_ambm(source, numeric_string, is_row_major); 00900 00901 // kernels with mostly predetermined skeleton: 00902 generate_assign_cpu(source, numeric_string, is_row_major); 00903 generate_diagonal_assign_cpu(source, numeric_string, is_row_major); 00904 generate_element_op(source, numeric_string, is_row_major); 00905 generate_scaled_rank1_update(source, numeric_string, is_row_major, true); 00906 generate_scaled_rank1_update(source, numeric_string, is_row_major, false); 00907 generate_trans_vec_mul(source, numeric_string, is_row_major); 00908 generate_vec_mul(source, numeric_string, is_row_major); 00909 00910 if (numeric_string == "float" || numeric_string == "double") 00911 { 00912 generate_fft(source, numeric_string, is_row_major); 00913 generate_lu(source, numeric_string, is_row_major); 00914 generate_triangular_substitute_inplace(source, numeric_string, is_row_major); 00915 } 00916 00917 std::string prog_name = program_name(); 00918 #ifdef VIENNACL_BUILD_INFO 00919 std::cout << "Creating program " << prog_name << std::endl; 00920 #endif 00921 ctx.add_program(source, prog_name); 00922 init_done[ctx.handle().get()] = true; 00923 } //if 00924 } //init 00925 }; 00926 00927 } // namespace kernels 00928 } // namespace opencl 00929 } // namespace linalg 00930 } // namespace viennacl 00931 #endif 00932