ViennaCL - The Vienna Computing Library  1.2.0
matrix_row_source.h
Go to the documentation of this file.
1 #ifndef VIENNACL_LINALG_KERNELS_MATRIX_ROW_SOURCE_HPP_
2 #define VIENNACL_LINALG_KERNELS_MATRIX_ROW_SOURCE_HPP_
3 //Automatically generated file from auxiliary-directory, do not edit manually!
4 namespace viennacl
5 {
6  namespace linalg
7  {
8  namespace kernels
9  {
// OpenCL C source text of the kernel `inplace_divide`.
// The kernel reads the scalar behind `fac` once, then every work-item divides
// vec[i] by it for i = get_global_id(0), advancing by get_global_size(0)
// while i < size.
10 const char * const matrix_row_align1_inplace_divide =
11 "__kernel void inplace_divide(\n"
12 " __global float * vec,\n"
13 " __global const float * fac, //note: CPU variant is mapped to prod_scalar\n"
14 " unsigned int size) \n"
15 "{ \n"
16 " float factor = *fac;\n"
17 " for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
18 " vec[i] /= factor;\n"
19 "}\n"
20 ; //matrix_row_align1_inplace_divide
21 
// OpenCL C source text of the kernel `trans_lower_triangular_substitute_inplace`.
// Rows are visited in ascending order. For each row, work-item 0 divides
// vector[row] by the diagonal entry matrix[row + row*matrix_internal_cols];
// afterwards every work-item subtracts
// vector[row] * matrix[row*matrix_internal_cols + elim] from vector[elim] for
// the entries elim > row assigned to it. matrix_cols and matrix_internal_rows
// are accepted but not read by the kernel body.
// NOTE(review): barrier() only synchronizes work-items of one work-group, so
// this presumably relies on a single-work-group launch - confirm against the
// host code.
const char * const matrix_row_align1_trans_lower_triangular_substitute_inplace =
"__kernel void trans_lower_triangular_substitute_inplace(\n"
" __global const float * matrix,\n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols,\n"
" __global float * vector)\n"
"{\n"
" float temp;\n"
" for (int row = 0; row < matrix_rows; ++row)\n"
" {\n"
" barrier(CLK_GLOBAL_MEM_FENCE);\n"
" if (get_global_id(0) == 0)\n"
" vector[row] /= matrix[row+row*matrix_internal_cols];\n"
" barrier(CLK_GLOBAL_MEM_FENCE);\n"
" temp = vector[row];\n"
" for (int elim = row + get_global_id(0) + 1; elim < matrix_rows; elim += get_global_size(0))\n"
" vector[elim] -= temp * matrix[row * matrix_internal_cols + elim];\n"
" }\n"
"}\n"
; //matrix_row_align1_trans_lower_triangular_substitute_inplace
44 
// OpenCL C source text of the kernel
// `trans_unit_upper_triangular_substitute_inplace`.
// Rows are visited from matrix_rows-1 down to 0. No division by the diagonal
// is performed; for each row every work-item subtracts
// vector[row] * matrix[row*matrix_internal_cols + elim] from vector[elim] for
// the entries elim < row assigned to it. matrix_cols and matrix_internal_rows
// are accepted but not read by the kernel body.
// NOTE(review): barrier() only synchronizes work-items of one work-group, so
// this presumably relies on a single-work-group launch - confirm against the
// host code.
const char * const matrix_row_align1_trans_unit_upper_triangular_substitute_inplace =
"//transposed lower triangular matrix\n"
"__kernel void trans_unit_upper_triangular_substitute_inplace(\n"
" __global const float * matrix, \n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols,\n"
" __global float * vector) \n"
"{ \n"
" float temp; \n"
" for (int row = matrix_rows-1; row > -1; --row) \n"
" { \n"
" barrier(CLK_GLOBAL_MEM_FENCE); \n"
" temp = vector[row]; \n"
" //eliminate column with index 'row' in parallel: \n"
" for (int elim = get_global_id(0); elim < row; elim += get_global_size(0)) \n"
" vector[elim] -= temp * matrix[row * matrix_internal_cols + elim]; \n"
" } \n"
" \n"
"}\n"
; //matrix_row_align1_trans_unit_upper_triangular_substitute_inplace
67 
// OpenCL C source text of the kernel `sub`.
// Every work-item writes result[i] = vec1[i] - vec2[i] for
// i = get_global_id(0), advancing by get_global_size(0) while i < size.
68 const char * const matrix_row_align1_sub =
69 "__kernel void sub(\n"
70 " __global const float * vec1,\n"
71 " __global const float * vec2, \n"
72 " __global float * result,\n"
73 " unsigned int size)\n"
74 "{ \n"
75 " for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
76 " result[i] = vec1[i] - vec2[i];\n"
77 "}\n"
78 ; //matrix_row_align1_sub
79 
// OpenCL C source text of the kernel `scaled_rank1_update`.
// Each work-item handles the rows get_global_id(0), +get_global_size(0), ...
// below matrix_rows and, for every col < matrix_cols, adds
// val * vector1[row] * vector2[col] to matrix[row*matrix_internal_cols + col].
// matrix_internal_rows is accepted but not read by the kernel body.
const char * const matrix_row_align1_scaled_rank1_update =
"__kernel void scaled_rank1_update(\n"
" __global float * matrix,\n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols,\n"
" float val,\n"
" __global const float * vector1, \n"
" __global const float * vector2) \n"
"{ \n"
" float tmp;\n"
" unsigned int offset;\n"
" for (unsigned int row = get_global_id(0); row < matrix_rows; row += get_global_size(0))\n"
" {\n"
" tmp = val * vector1[row];\n"
" offset = row * matrix_internal_cols;\n"
" for (unsigned int col = 0; col < matrix_cols; ++col)\n"
" matrix[offset+col] += tmp * vector2[col];\n"
" }\n"
"}\n"
; //matrix_row_align1_scaled_rank1_update
102 
// OpenCL C source text of the kernel `inplace_sub`.
// Every work-item performs vec1[i] -= vec2[i] for i = get_global_id(0),
// advancing by get_global_size(0) while i < size.
103 const char * const matrix_row_align1_inplace_sub =
104 "__kernel void inplace_sub(\n"
105 " __global float * vec1,\n"
106 " __global const float * vec2,\n"
107 " unsigned int size) \n"
108 "{ \n"
109 " for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
110 " vec1[i] -= vec2[i];\n"
111 "}\n"
112 ; //matrix_row_align1_inplace_sub
113 
// OpenCL C source text of the kernel `lower_triangular_substitute_inplace`.
// Rows are visited in ascending order. For each row, work-item 0 divides
// vector[row] by the diagonal entry matrix[row + row*matrix_internal_cols];
// afterwards every work-item subtracts
// vector[row] * matrix[elim*matrix_internal_cols + row] from vector[elim] for
// the entries elim > row assigned to it. matrix_cols and matrix_internal_rows
// are accepted but not read by the kernel body.
// NOTE(review): barrier() only synchronizes work-items of one work-group, so
// this presumably relies on a single-work-group launch - confirm against the
// host code.
const char * const matrix_row_align1_lower_triangular_substitute_inplace =
"__kernel void lower_triangular_substitute_inplace(\n"
" __global const float * matrix,\n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols,\n"
" __global float * vector)\n"
"{\n"
" float temp;\n"
" for (int row = 0; row < matrix_rows; ++row)\n"
" {\n"
" barrier(CLK_GLOBAL_MEM_FENCE);\n"
" if (get_global_id(0) == 0)\n"
" vector[row] /= matrix[row+row*matrix_internal_cols];\n"
" barrier(CLK_GLOBAL_MEM_FENCE);\n"
" temp = vector[row];\n"
" for (int elim = row + get_global_id(0) + 1; elim < matrix_rows; elim += get_global_size(0))\n"
" vector[elim] -= temp * matrix[elim * matrix_internal_cols + row];\n"
" }\n"
"}\n"
; //matrix_row_align1_lower_triangular_substitute_inplace
136 
// OpenCL C source text of the kernel `vec_mul`.
// Each work-item handles the rows get_global_id(0), +get_global_size(0), ...
// below matrix_rows: it accumulates
// matrix[row*matrix_internal_cols + col] * vector[col] over col < matrix_cols
// and stores the sum in result[row]. matrix_internal_rows is accepted but not
// read by the kernel body.
137 const char * const matrix_row_align1_vec_mul =
138 "\n"
139 "\n"
140 "\n"
141 "__kernel void vec_mul(\n"
142 " __global const float * matrix,\n"
143 " unsigned int matrix_rows,\n"
144 " unsigned int matrix_cols,\n"
145 " unsigned int matrix_internal_rows,\n"
146 " unsigned int matrix_internal_cols,\n"
147 " __global const float * vector, \n"
148 " __global float * result) \n"
149 "{ \n"
150 " for (unsigned int row = get_global_id(0); row < matrix_rows; row += get_global_size(0))\n"
151 " {\n"
152 " float dot_prod = 0.0f;\n"
153 " for (unsigned int col = 0; col < matrix_cols; ++col)\n"
154 " dot_prod += matrix[row*matrix_internal_cols + col] * vector[col];\n"
155 " result[row] = dot_prod;\n"
156 " }\n"
157 "}\n"
158 "\n"
159 "\n"
160 ; //matrix_row_align1_vec_mul
161 
// OpenCL C source text of the kernel `lu_factorize`.
// The matrix is overwritten in place. For every row i >= 1 and every k < i,
// work-item 0 divides matrix[i][k] by the pivot matrix[k][k]; after the
// barrier every work-item subtracts matrix[i][k] * matrix[k][j] from
// matrix[i][j] for the columns j > k assigned to it. The kernel body contains
// no row exchange (no pivoting).
// The column loop is bounded by matrix_rows, not matrix_cols; matrix_cols and
// matrix_internal_rows are accepted but not read.
// NOTE(review): presumably only square matrices are passed - confirm.
// NOTE(review): barrier() only synchronizes work-items of one work-group, so
// this presumably relies on a single-work-group launch - confirm against the
// host code.
162 const char * const matrix_row_align1_lu_factorize =
163 "__kernel void lu_factorize(\n"
164 " __global float * matrix,\n"
165 " unsigned int matrix_rows,\n"
166 " unsigned int matrix_cols,\n"
167 " unsigned int matrix_internal_rows,\n"
168 " unsigned int matrix_internal_cols) \n"
169 "{ \n"
170 " float temp;\n"
171 " unsigned rowi;\n"
172 " unsigned rowk;\n"
173 " for (unsigned int i=1; i<matrix_rows; ++i)\n"
174 " {\n"
175 " rowi = i * matrix_internal_cols;\n"
176 " for (unsigned int k=0; k<i; ++k)\n"
177 " {\n"
178 " rowk = k * matrix_internal_cols;\n"
179 " if (get_global_id(0) == 0)\n"
180 " matrix[rowi + k] /= matrix[rowk + k];\n"
181 " barrier(CLK_GLOBAL_MEM_FENCE);\n"
182 " temp = matrix[rowi + k];\n"
183 " \n"
184 " //parallel subtraction:\n"
185 " for (unsigned int j=k+1 + get_global_id(0); j<matrix_rows; j += get_global_size(0))\n"
186 " matrix[rowi + j] -= temp * matrix[rowk + j];\n"
187 " }\n"
188 " }\n"
189 "} \n"
190 ; //matrix_row_align1_lu_factorize
191 
// OpenCL C source text of the kernel `rank1_update`.
// Each work-item handles the rows get_global_id(0), +get_global_size(0), ...
// below matrix_rows and, for every col < matrix_cols, adds
// vector1[row] * vector2[col] to matrix[row*matrix_internal_cols + col].
// Note that the comment embedded in the kernel text says "A += x * x^T", while
// the code takes two separate vectors (vector1 and vector2).
// matrix_internal_rows is accepted but not read by the kernel body.
192 const char * const matrix_row_align1_rank1_update =
193 "//perform a rank-1 update of the matrix, i.e. A += x * x^T\n"
194 "__kernel void rank1_update(\n"
195 " __global float * matrix,\n"
196 " unsigned int matrix_rows,\n"
197 " unsigned int matrix_cols,\n"
198 " unsigned int matrix_internal_rows,\n"
199 " unsigned int matrix_internal_cols,\n"
200 " __global const float * vector1, \n"
201 " __global const float * vector2) \n"
202 "{ \n"
203 " float tmp;\n"
204 " unsigned int offset;\n"
205 " for (unsigned int row = get_global_id(0); row < matrix_rows; row += get_global_size(0))\n"
206 " {\n"
207 " tmp = vector1[row];\n"
208 " offset = row*matrix_internal_cols;\n"
209 " for (unsigned int col = 0; col < matrix_cols; ++col)\n"
210 " matrix[offset+col] += tmp * vector2[col];\n"
211 " }\n"
212 "}\n"
213 ; //matrix_row_align1_rank1_update
214 
// OpenCL C source text of the kernel
// `trans_upper_triangular_substitute_inplace`.
// Rows are visited from matrix_rows-1 down to 0. For each row, work-item 0
// divides vector[row] by the diagonal entry
// matrix[row*matrix_internal_cols + row]; afterwards every work-item subtracts
// vector[row] * matrix[row*matrix_internal_cols + elim] from vector[elim] for
// the entries elim < row assigned to it. matrix_cols and matrix_internal_rows
// are accepted but not read by the kernel body.
// NOTE(review): barrier() only synchronizes work-items of one work-group, so
// this presumably relies on a single-work-group launch - confirm against the
// host code.
const char * const matrix_row_align1_trans_upper_triangular_substitute_inplace =
"//transposed lower triangular matrix\n"
"__kernel void trans_upper_triangular_substitute_inplace(\n"
" __global const float * matrix, \n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols,\n"
" __global float * vector) \n"
"{ \n"
" float temp; \n"
" for (int row = matrix_rows-1; row > -1; --row) \n"
" { \n"
" barrier(CLK_GLOBAL_MEM_FENCE); \n"
" if (get_global_id(0) == 0) \n"
" vector[row] /= matrix[row*matrix_internal_cols + row]; \n"
" \n"
" barrier(CLK_GLOBAL_MEM_FENCE); \n"
" temp = vector[row]; \n"
" //eliminate column with index 'row' in parallel: \n"
" for (int elim = get_global_id(0); elim < row; elim += get_global_size(0)) \n"
" vector[elim] -= temp * matrix[row * matrix_internal_cols + elim]; \n"
" } \n"
" \n"
"}\n"
; //matrix_row_align1_trans_upper_triangular_substitute_inplace
241 
// OpenCL C source text containing the helper `get_reorder_num` and the kernel
// `fft_radix2_local`.
// get_reorder_num(v, bit_size) reverses the 32 bits of v with swap-and-mask
// steps and then shifts the result right by (32 - bit_size).
// fft_radix2_local assigns batches to work-groups (batch_id starts at the
// group id and advances by the number of groups). For each batch it
//   1. copies `size` elements from input[batch_id*stride + p] into
//      lcl_input at the bit-reversed index of p,
//   2. runs bit_size butterfly stages on lcl_input, using the twiddle
//      (cos(arg), sin(arg)) with arg = group * sign * NUM_PI / ss and a local
//      barrier after every stage,
//   3. copies lcl_input[0..size) back to the same global positions.
// NOTE(review): the stage loop runs tid up to `size`, so pos + ss can exceed
// `size`; presumably the host allocates lcl_input large enough for this -
// confirm against the host code.
const char * const matrix_row_align1_fft_radix2_local =
"unsigned int get_reorder_num(unsigned int v, unsigned int bit_size) {\n"
" v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1);\n"
" v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2);\n"
" v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4);\n"
" v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8);\n"
" v = (v >> 16) | (v << 16);\n"
" v = v >> (32 - bit_size);\n"
" return v;\n"
"}\n"
"__kernel void fft_radix2_local(__global float2* input,\n"
" __local float2* lcl_input,\n"
" unsigned int bit_size,\n"
" unsigned int size,\n"
" unsigned int stride,\n"
" unsigned int batch_num,\n"
" float sign) {\n"
" unsigned int grp_id = get_group_id(0);\n"
" unsigned int grp_num = get_num_groups(0);\n"
" unsigned int lcl_sz = get_local_size(0);\n"
" unsigned int lcl_id = get_local_id(0);\n"
" const float NUM_PI = 3.14159265358979323846;\n"
" for(unsigned int batch_id = grp_id; batch_id < batch_num; batch_id += grp_num) {\n"
" //unsigned int base_offset = stride * batch_id;\n"
" //copy chunk of global memory to local\n"
" for(unsigned int p = lcl_id; p < size; p += lcl_sz) {\n"
" unsigned int v = get_reorder_num(p, bit_size);\n"
" lcl_input[v] = input[batch_id * stride + p];//index\n"
" }\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" \n"
" //performs Cooley-Tukey FFT on local array\n"
" for(unsigned int s = 0; s < bit_size; s++) {\n"
" unsigned int ss = 1 << s;\n"
" float cs, sn;\n"
" for(unsigned int tid = lcl_id; tid < size; tid += lcl_sz) {\n"
" unsigned int group = (tid & (ss - 1));\n"
" unsigned int pos = ((tid >> s) << (s + 1)) + group;\n"
" float2 in1 = lcl_input[pos];\n"
" float2 in2 = lcl_input[pos + ss];\n"
" float arg = group * sign * NUM_PI / ss;\n"
" sn = sincos(arg, &cs);\n"
" float2 ex = (float2)(cs, sn);\n"
" float2 tmp = (float2)(in2.x * ex.x - in2.y * ex.y, in2.x * ex.y + in2.y * ex.x);\n"
" lcl_input[pos + ss] = in1 - tmp;\n"
" lcl_input[pos] = in1 + tmp;\n"
" }\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" }\n"
" \n"
" //copy local array back to global memory\n"
" for(unsigned int p = lcl_id; p < size; p += lcl_sz) {\n"
" input[batch_id * stride + p] = lcl_input[p];//index\n"
" }\n"
" }\n"
"}\n"
; //matrix_row_align1_fft_radix2_local
299 
// OpenCL C source text of the kernel `fft_direct`.
// For every batch (all work-items loop over all batches) and every output
// index k assigned to the work-item (k = get_global_id(0), advancing by
// get_global_size(0) while k < size), the kernel sums
// input[batch_id*stride + n] * (cos(arg), sin(arg)) over n < size with
// arg = sign * 2 * NUM_PI * k / size * n, using a complex multiplication
// written out on the .x/.y components, and stores the sum in
// output[batch_id*stride + k].
300 const char * const matrix_row_align1_fft_direct =
301 "// naive fourier transform (quadratic complexity, use for reference only)\n"
302 "__kernel void fft_direct(__global float2* input,\n"
303 " __global float2* output,\n"
304 " unsigned int size,\n"
305 " unsigned int stride,\n"
306 " unsigned int batch_num,\n"
307 " float sign) {\n"
308 "// unsigned int base_offset = 0;\n"
309 " const float NUM_PI = 3.14159265358979323846;\n"
310 " \n"
311 " for(unsigned int batch_id = 0; batch_id < batch_num; batch_id++) {\n"
312 " for(unsigned int k = get_global_id(0); k < size; k += get_global_size(0)) {\n"
313 " float2 f = 0.0f;\n"
314 " for(unsigned int n = 0; n < size; n++) {\n"
315 " float2 in = input[batch_id * stride + n]; //input index here\n"
316 " float sn, cs;\n"
317 " float arg = sign * 2 * NUM_PI * k / size * n;\n"
318 " sn = sincos(arg, &cs);\n"
319 " float2 ex = (float2)(cs, sn);\n"
320 " f = f + (float2)(in.x * ex.x - in.y * ex.y, in.x * ex.y + in.y * ex.x);\n"
321 " }\n"
322 " output[batch_id * stride + k] = f;// output index here\n"
323 " }\n"
324 "// base_offset += stride;\n"
325 " }\n"
326 "}\n"
327 ; //matrix_row_align1_fft_direct
328 
// OpenCL C source text of the kernel
// `unit_lower_triangular_substitute_inplace`.
// Rows are visited in ascending order. No division by the diagonal is
// performed; for each row every work-item subtracts
// vector[row] * matrix[elim*matrix_internal_cols + row] from vector[elim] for
// the entries elim > row assigned to it. matrix_cols and matrix_internal_rows
// are accepted but not read by the kernel body.
// NOTE(review): barrier() only synchronizes work-items of one work-group, so
// this presumably relies on a single-work-group launch - confirm against the
// host code.
const char * const matrix_row_align1_unit_lower_triangular_substitute_inplace =
"__kernel void unit_lower_triangular_substitute_inplace(\n"
" __global const float * matrix,\n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols,\n"
" __global float * vector)\n"
"{\n"
" float temp;\n"
" for (int row = 0; row < matrix_rows; ++row)\n"
" {\n"
" barrier(CLK_GLOBAL_MEM_FENCE);\n"
" temp = vector[row];\n"
" for (int elim = row + get_global_id(0) + 1; elim < matrix_rows; elim += get_global_size(0))\n"
" vector[elim] -= temp * matrix[elim * matrix_internal_cols + row];\n"
" }\n"
"}\n"
; //matrix_row_align1_unit_lower_triangular_substitute_inplace
348 
// OpenCL C source text containing the helper `get_reorder_num_2` and the
// kernel `fft_reorder`.
// get_reorder_num_2(v, bit_size) reverses the 32 bits of v with swap-and-mask
// steps and then shifts the result right by (32 - bit_size).
// fft_reorder loops over all batches in every work-item; for each index i
// assigned to the work-item (i = get_global_id(0), advancing by
// get_global_size(0) while i < size) it computes the bit-reversed index v and,
// only when i < v, swaps input[batch_id*stride + i] with
// input[batch_id*stride + v].
// Note that batch_num is declared as int here while the loop counter it is
// compared with is unsigned int.
349 const char * const matrix_row_align1_fft_reorder =
350 "/*\n"
351 "* Performs reordering of input data in bit-reversal order\n"
352 "* Probably it's better to do in host side,\n"
353 "*/\n"
354 "unsigned int get_reorder_num_2(unsigned int v, unsigned int bit_size) {\n"
355 " v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1);\n"
356 " v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2);\n"
357 " v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4);\n"
358 " v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8);\n"
359 " v = (v >> 16) | (v << 16);\n"
360 " v = v >> (32 - bit_size);\n"
361 " return v;\n"
362 "}\n"
363 "__kernel void fft_reorder(__global float2* input,\n"
364 " unsigned int bit_size,\n"
365 " unsigned int size,\n"
366 " unsigned int stride,\n"
367 " int batch_num) {\n"
368 " //unsigned int base_offset = 0;\n"
369 " unsigned int glb_id = get_global_id(0);\n"
370 " unsigned int glb_sz = get_global_size(0);\n"
371 " \n"
372 " for(unsigned int batch_id = 0; batch_id < batch_num; batch_id++) {\n"
373 " for(unsigned int i = glb_id; i < size; i += glb_sz) {\n"
374 " unsigned int v = get_reorder_num_2(i, bit_size);\n"
375 " if(i < v) {\n"
376 " float2 tmp = input[batch_id * stride + i]; // index\n"
377 " input[batch_id * stride + i] = input[batch_id * stride + v]; //index\n"
378 " input[batch_id * stride + v] = tmp; //index\n"
379 " }\n"
380 " }\n"
381 " //base_offset += stride;\n"
382 " }\n"
383 "}\n"
384 ; //matrix_row_align1_fft_reorder
385 
// OpenCL C source text of the kernel `upper_triangular_substitute_inplace`.
// Rows are visited from matrix_rows-1 down to 0. For each row, work-item 0
// divides vector[row] by the diagonal entry
// matrix[row*matrix_internal_cols + row]; afterwards every work-item subtracts
// vector[row] * matrix[elim*matrix_internal_cols + row] from vector[elim] for
// the entries elim < row assigned to it. matrix_cols and matrix_internal_rows
// are accepted but not read by the kernel body.
// NOTE(review): barrier() only synchronizes work-items of one work-group, so
// this presumably relies on a single-work-group launch - confirm against the
// host code.
const char * const matrix_row_align1_upper_triangular_substitute_inplace =
"__kernel void upper_triangular_substitute_inplace( \n"
" __global const float * matrix, \n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols,\n"
" __global float * vector) \n"
"{ \n"
" float temp; \n"
" for (int row = matrix_rows-1; row > -1; --row) \n"
" { \n"
" barrier(CLK_GLOBAL_MEM_FENCE); \n"
" if (get_global_id(0) == 0) \n"
" vector[row] /= matrix[row*matrix_internal_cols + row]; \n"
" \n"
" barrier(CLK_GLOBAL_MEM_FENCE); \n"
" temp = vector[row]; \n"
" //eliminate column with index 'row' in parallel: \n"
" for (int elim = get_global_id(0); elim < row; elim += get_global_size(0)) \n"
" vector[elim] -= temp * matrix[elim * matrix_internal_cols + row]; \n"
" } \n"
" \n"
"}\n"
; //matrix_row_align1_upper_triangular_substitute_inplace
411 
// OpenCL C source text of the kernel `inplace_add`.
// The kernel uses a two-dimensional index: get_global_id(0) selects the row
// and get_global_id(1) the column. A work-item whose indices lie below
// A_row_size and A_col_size adds
// B[(row + B_row_start)*B_internal_cols + (col + B_col_start)] to
// A[(row + A_row_start)*A_internal_cols + (col + A_col_start)].
// There is no loop, so each work-item updates at most one element.
// A_internal_rows, B_row_size, B_col_size and B_internal_rows are accepted but
// not read by the kernel body.
412 const char * const matrix_row_align1_inplace_add =
413 "__kernel void inplace_add(\n"
414 " __global float * A,\n"
415 " unsigned int A_row_start,\n"
416 " unsigned int A_col_start,\n"
417 " unsigned int A_row_size,\n"
418 " unsigned int A_col_size,\n"
419 " unsigned int A_internal_rows,\n"
420 " unsigned int A_internal_cols,\n"
421 " __global const float * B, \n"
422 " unsigned int B_row_start,\n"
423 " unsigned int B_col_start,\n"
424 " unsigned int B_row_size,\n"
425 " unsigned int B_col_size,\n"
426 " unsigned int B_internal_rows,\n"
427 " unsigned int B_internal_cols)\n"
428 "{ \n"
429 " if ( get_global_id(0) < A_row_size\n"
430 " && get_global_id(1) < A_col_size\n"
431 " )\n"
432 " A[ (get_global_id(0) + A_row_start) * A_internal_cols\n"
433 " + (get_global_id(1) + A_col_start)] \n"
434 " += B[ (get_global_id(0) + B_row_start) * B_internal_cols\n"
435 " + (get_global_id(1) + B_col_start)];\n"
436 "}\n"
437 ; //matrix_row_align1_inplace_add
438 
// OpenCL C source text of the kernel `fft_radix2`.
// One call performs the butterflies of a single stage `s` in place on global
// memory, with ss = 1 << s. All work-items loop over all batches; for each
// tid assigned to the work-item (tid = get_global_id(0), advancing by
// get_global_size(0) while tid < size/2) the kernel computes
//   group  = tid & (ss - 1)
//   pos    = ((tid >> s) << (s + 1)) + group
//   offset = batch_id * stride + pos
// and combines input[offset] and input[offset + ss] using the twiddle
// (cos(arg), sin(arg)) with arg = group * sign * NUM_PI / ss.
// bit_size is accepted but not read by the kernel body.
439 const char * const matrix_row_align1_fft_radix2 =
440 "__kernel void fft_radix2(__global float2* input,\n"
441 " unsigned int s,\n"
442 " unsigned int bit_size,\n"
443 " unsigned int size,\n"
444 " unsigned int stride,\n"
445 " unsigned int batch_num,\n"
446 " float sign) {\n"
447 " unsigned int ss = 1 << s;\n"
448 " unsigned int half_size = size >> 1;\n"
449 " float cs, sn;\n"
450 " const float NUM_PI = 3.14159265358979323846;\n"
451 " unsigned int glb_id = get_global_id(0);\n"
452 " unsigned int glb_sz = get_global_size(0);\n"
453 " \n"
454 "// unsigned int base_offset = 0;\n"
455 " \n"
456 " for(unsigned int batch_id = 0; batch_id < batch_num; batch_id++) {\n"
457 " for(unsigned int tid = glb_id; tid < half_size; tid += glb_sz) {\n"
458 " unsigned int group = (tid & (ss - 1));\n"
459 " unsigned int pos = ((tid >> s) << (s + 1)) + group;\n"
460 " unsigned int offset = batch_id * stride + pos;\n"
461 " float2 in1 = input[offset];//index\n"
462 " float2 in2 = input[offset + ss];//index\n"
463 " float arg = group * sign * NUM_PI / ss;\n"
464 " sn = sincos(arg, &cs);\n"
465 " //sn = native_sin(arg);\n"
466 " //cs = native_cos(arg);\n"
467 " float2 ex = (float2)(cs, sn);\n"
468 " float2 tmp = (float2)(in2.x * ex.x - in2.y * ex.y, in2.x * ex.y + in2.y * ex.x);\n"
469 " input[offset + ss] = in1 - tmp;//index\n"
470 " input[offset] = in1 + tmp;//index\n"
471 " }\n"
472 "// base_offset += stride;\n"
473 " }\n"
474 "}\n"
475 ; //matrix_row_align1_fft_radix2
476 
// OpenCL C source text of the kernel `trans_vec_mul`.
// `row` and `col` index the transposed matrix: `row` runs over matrix_cols
// (starting at get_global_id(0), advancing by get_global_size(0)) and `col`
// runs over matrix_rows. For each `row` the kernel accumulates
// matrix[row + col*matrix_internal_cols] * vector[col] and stores the sum in
// result[row]. matrix_internal_rows is accepted but not read by the kernel
// body.
477 const char * const matrix_row_align1_trans_vec_mul =
478 "__kernel void trans_vec_mul(\n"
479 " __global const float * matrix,\n"
480 " unsigned int matrix_rows,\n"
481 " unsigned int matrix_cols,\n"
482 " unsigned int matrix_internal_rows,\n"
483 " unsigned int matrix_internal_cols,\n"
484 " __global const float * vector, \n"
485 " __global float * result) \n"
486 "{ \n"
487 " //row and col indicate indices within transposed matrix\n"
488 " for (unsigned int row = get_global_id(0); row < matrix_cols; row += get_global_size(0))\n"
489 " {\n"
490 " float dot_prod2 = 0.0f;\n"
491 " for (unsigned int col = 0; col < matrix_rows; ++col)\n"
492 " dot_prod2 += matrix[row + col*matrix_internal_cols] * vector[col];\n"
493 " result[row] = dot_prod2;\n"
494 " }\n"
495 "}\n"
496 ; //matrix_row_align1_trans_vec_mul
497 
// OpenCL C source text of the kernel
// `trans_unit_lower_triangular_substitute_inplace`.
// Rows are visited in ascending order. No division by the diagonal is
// performed; for each row every work-item subtracts
// vector[row] * matrix[row*matrix_internal_cols + elim] from vector[elim] for
// the entries elim > row assigned to it. matrix_cols and matrix_internal_rows
// are accepted but not read by the kernel body.
// NOTE(review): barrier() only synchronizes work-items of one work-group, so
// this presumably relies on a single-work-group launch - confirm against the
// host code.
const char * const matrix_row_align1_trans_unit_lower_triangular_substitute_inplace =
"\n"
"__kernel void trans_unit_lower_triangular_substitute_inplace(\n"
" __global const float * matrix,\n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols,\n"
" __global float * vector)\n"
"{\n"
" float temp;\n"
" for (int row = 0; row < matrix_rows; ++row)\n"
" {\n"
" barrier(CLK_GLOBAL_MEM_FENCE);\n"
"\n"
" temp = vector[row];\n"
"\n"
" for (int elim = row + get_global_id(0) + 1; elim < matrix_rows; elim += get_global_size(0))\n"
" vector[elim] -= temp * matrix[row * matrix_internal_cols + elim];\n"
" }\n"
"}\n"
"\n"
"\n"
; //matrix_row_align1_trans_unit_lower_triangular_substitute_inplace
522 
// OpenCL C source text of the kernel `cpu_inplace_mult`.
// The scale factor is passed by value. Every work-item performs
// vec[i] *= factor for i = get_global_id(0), advancing by
// get_global_size(0) while i < size.
const char * const matrix_row_align1_cpu_inplace_mult =
"__kernel void cpu_inplace_mult(\n"
" __global float * vec,\n"
" float factor, \n"
" unsigned int size) \n"
"{ \n"
" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
" vec[i] *= factor;\n"
"}\n"
; //matrix_row_align1_cpu_inplace_mult
533 
// OpenCL C source text of the kernel `clear`.
// Every work-item stores 0 into vec[i] for i = get_global_id(0), advancing by
// get_global_size(0) while i < size.
534 const char * const matrix_row_align1_clear =
535 "__kernel void clear(\n"
536 " __global float * vec,\n"
537 " unsigned int size) \n"
538 "{ \n"
539 " for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
540 " vec[i] = 0;\n"
541 "}\n"
542 ; //matrix_row_align1_clear
543 
// OpenCL C source text of the kernel `inplace_mult`.
// The kernel reads the scalar behind `fac` once, then every work-item
// multiplies vec[i] by it for i = get_global_id(0), advancing by
// get_global_size(0) while i < size.
544 const char * const matrix_row_align1_inplace_mult =
545 "__kernel void inplace_mult(\n"
546 " __global float * vec,\n"
547 " __global const float * fac, \n"
548 " unsigned int size) \n"
549 "{ \n"
550 " float factor = *fac;\n"
551 " for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
552 " vec[i] *= factor;\n"
553 "}\n"
554 ; //matrix_row_align1_inplace_mult
555 
// OpenCL C source text of the kernel
// `unit_upper_triangular_substitute_inplace`.
// Rows are visited from matrix_rows-1 down to 0. No division by the diagonal
// is performed; for each row every work-item subtracts
// vector[row] * matrix[elim*matrix_internal_cols + row] from vector[elim] for
// the entries elim < row assigned to it. matrix_cols and matrix_internal_rows
// are accepted but not read by the kernel body.
// NOTE(review): barrier() only synchronizes work-items of one work-group, so
// this presumably relies on a single-work-group launch - confirm against the
// host code.
const char * const matrix_row_align1_unit_upper_triangular_substitute_inplace =
"__kernel void unit_upper_triangular_substitute_inplace( \n"
" __global const float * matrix, \n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols,\n"
" __global float * vector) \n"
"{ \n"
" float temp; \n"
" for (int row = matrix_rows-1; row > -1; --row) \n"
" { \n"
" barrier(CLK_GLOBAL_MEM_FENCE); \n"
" temp = vector[row]; \n"
" //eliminate column with index 'row' in parallel: \n"
" for (int elim = get_global_id(0); elim < row; elim += get_global_size(0)) \n"
" vector[elim] -= temp * matrix[elim * matrix_internal_cols + row]; \n"
" } \n"
" \n"
"}\n"
; //matrix_row_align1_unit_upper_triangular_substitute_inplace
577 
// OpenCL C source text of the kernel `add`.
// Every work-item writes result[i] = vec1[i] + vec2[i] for
// i = get_global_id(0), advancing by get_global_size(0) while i < size.
578 const char * const matrix_row_align1_add =
579 "__kernel void add(\n"
580 " __global const float * vec1,\n"
581 " __global const float * vec2, \n"
582 " __global float * result,\n"
583 " unsigned int size) \n"
584 "{ \n"
585 " for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
586 " result[i] = vec1[i] + vec2[i];\n"
587 "}\n"
588 ; //matrix_row_align1_add
589 
590  } //namespace kernels
591  } //namespace linalg
592 } //namespace viennacl
593 #endif