ViennaCL - The Vienna Computing Library  1.2.0
matrix_col_source.h
Go to the documentation of this file.
1 #ifndef VIENNACL_LINALG_KERNELS_MATRIX_COL_SOURCE_HPP_
2 #define VIENNACL_LINALG_KERNELS_MATRIX_COL_SOURCE_HPP_
3 //Automatically generated file from auxiliary-directory, do not edit manually!
4 namespace viennacl
5 {
6  namespace linalg
7  {
8  namespace kernels
9  {
// OpenCL C source of kernel `inplace_divide`.
// The kernel reads the divisor once from *fac and divides vec[0..size) by it.
// Each work-item starts at get_global_id(0) and advances by get_global_size(0).
// The string is runtime data handed to the OpenCL compiler: do not reformat it.
const char * const matrix_col_align1_inplace_divide =
"__kernel void inplace_divide(\n"
" __global float * vec,\n"
" __global const float * fac, //note: CPU variant is mapped to prod_scalar\n"
" unsigned int size) \n"
"{ \n"
" float factor = *fac;\n"
" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
" vec[i] /= factor;\n"
"}\n"
; //matrix_col_align1_inplace_divide
21 
// OpenCL C source of kernel `trans_lower_triangular_substitute_inplace`.
// The kernel walks row = 0 .. matrix_rows-1. For each row, work-item 0 divides
// vector[row] by the diagonal entry matrix[row + row*matrix_internal_rows];
// then all work-items subtract vector[row] * matrix[elim*matrix_internal_rows + row]
// from the entries vector[elim] with elim > row.
// NOTE(review): barrier() synchronizes only the work-items of one work-group,
// so this presumably must be launched with a single work-group - confirm
// against the host-side launch code.
// The declaration line was missing from the listing; the name is taken from
// the trailing comment below.
const char * const matrix_col_align1_trans_lower_triangular_substitute_inplace =
"__kernel void trans_lower_triangular_substitute_inplace(\n"
" __global const float * matrix,\n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols,\n"
" __global float * vector)\n"
"{\n"
" float temp;\n"
" for (int row = 0; row < matrix_rows; ++row)\n"
" {\n"
" barrier(CLK_GLOBAL_MEM_FENCE);\n"
" if (get_global_id(0) == 0)\n"
" vector[row] /= matrix[row+row*matrix_internal_rows];\n"
" barrier(CLK_GLOBAL_MEM_FENCE);\n"
" temp = vector[row];\n"
" for (int elim = row + get_global_id(0) + 1; elim < matrix_rows; elim += get_global_size(0))\n"
" vector[elim] -= temp * matrix[elim * matrix_internal_rows + row];\n"
" }\n"
"}\n"
; //matrix_col_align1_trans_lower_triangular_substitute_inplace
44 
// OpenCL C source of kernel `trans_unit_upper_triangular_substitute_inplace`.
// The kernel walks row = matrix_rows-1 down to 0. For each row it reads
// temp = vector[row] (no division by the diagonal) and subtracts
// temp * matrix[row + elim*matrix_internal_rows] from every vector[elim]
// with elim < row.
// NOTE(review): the in-kernel comment says "transposed lower triangular
// matrix" although the kernel is named *upper*; it is part of the runtime
// string and is left untouched - confirm which wording is intended.
// NOTE(review): barrier() synchronizes only within one work-group, so a
// single-work-group launch is presumably required - confirm with the host code.
// The declaration line was missing from the listing; the name is taken from
// the trailing comment below.
const char * const matrix_col_align1_trans_unit_upper_triangular_substitute_inplace =
"//transposed lower triangular matrix\n"
"__kernel void trans_unit_upper_triangular_substitute_inplace(\n"
" __global const float * matrix, \n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols,\n"
" __global float * vector) \n"
"{ \n"
" float temp; \n"
" for (int row = matrix_rows-1; row > -1; --row) \n"
" { \n"
" barrier(CLK_GLOBAL_MEM_FENCE); \n"
" \n"
" temp = vector[row]; \n"
" //eliminate column with index 'row' in parallel: \n"
" for (int elim = get_global_id(0); elim < row; elim += get_global_size(0)) \n"
" vector[elim] -= temp * matrix[row + elim * matrix_internal_rows]; \n"
" } \n"
" \n"
"}\n"
; //matrix_col_align1_trans_unit_upper_triangular_substitute_inplace
68 
// OpenCL C source of kernel `sub`.
// The kernel writes result[i] = vec1[i] - vec2[i] for i in [0, size).
// Each work-item starts at get_global_id(0) and advances by get_global_size(0).
const char * const matrix_col_align1_sub =
"__kernel void sub(\n"
" __global const float * vec1,\n"
" __global const float * vec2, \n"
" __global float * result,\n"
" unsigned int size)\n"
"{ \n"
" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
" result[i] = vec1[i] - vec2[i];\n"
"}\n"
; //matrix_col_align1_sub
80 
// OpenCL C source of kernel `scaled_rank1_update`.
// For every (row, col) with row < matrix_rows and col < matrix_cols the kernel
// adds val * vector1[row] * vector2[col] to matrix[row + col*matrix_internal_rows].
// Rows are distributed over work-items (start get_global_id(0), step
// get_global_size(0)); each work-item loops over all columns of its rows.
// The declaration line was missing from the listing; the name is taken from
// the trailing comment below.
const char * const matrix_col_align1_scaled_rank1_update =
"__kernel void scaled_rank1_update(\n"
" __global float * matrix,\n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols,\n"
" float val,\n"
" __global const float * vector1, \n"
" __global const float * vector2) \n"
"{ \n"
" float tmp;\n"
" for (unsigned int row = get_global_id(0); row < matrix_rows; row += get_global_size(0))\n"
" {\n"
" tmp = val * vector1[row];\n"
" for (unsigned int col = 0; col < matrix_cols; ++col)\n"
" matrix[row + col*matrix_internal_rows] += tmp * vector2[col];\n"
" }\n"
"}\n"
; //matrix_col_align1_scaled_rank1_update
101 
// OpenCL C source of kernel `inplace_sub`.
// The kernel performs vec1[i] -= vec2[i] for i in [0, size).
// Each work-item starts at get_global_id(0) and advances by get_global_size(0).
const char * const matrix_col_align1_inplace_sub =
"__kernel void inplace_sub(\n"
" __global float * vec1,\n"
" __global const float * vec2,\n"
" unsigned int size) \n"
"{ \n"
" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
" vec1[i] -= vec2[i];\n"
"}\n"
; //matrix_col_align1_inplace_sub
112 
// OpenCL C source of kernel `lower_triangular_substitute_inplace`.
// The kernel walks row = 0 .. matrix_rows-1. For each row, work-item 0 divides
// vector[row] by the diagonal entry matrix[row + row*matrix_internal_rows];
// then all work-items subtract vector[row] * matrix[row*matrix_internal_rows + elim]
// from the entries vector[elim] with elim > row.
// NOTE(review): barrier() synchronizes only within one work-group, so a
// single-work-group launch is presumably required - confirm with the host code.
// The declaration line was missing from the listing; the name is taken from
// the trailing comment below.
const char * const matrix_col_align1_lower_triangular_substitute_inplace =
"__kernel void lower_triangular_substitute_inplace(\n"
" __global const float * matrix,\n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols,\n"
" __global float * vector)\n"
"{\n"
" float temp;\n"
" for (int row = 0; row < matrix_rows; ++row)\n"
" {\n"
" barrier(CLK_GLOBAL_MEM_FENCE);\n"
" if (get_global_id(0) == 0)\n"
" vector[row] /= matrix[row+row*matrix_internal_rows];\n"
" barrier(CLK_GLOBAL_MEM_FENCE);\n"
" temp = vector[row];\n"
" for (int elim = row + get_global_id(0) + 1; elim < matrix_rows; elim += get_global_size(0))\n"
" vector[elim] -= temp * matrix[row * matrix_internal_rows + elim];\n"
" }\n"
"}\n"
; //matrix_col_align1_lower_triangular_substitute_inplace
135 
// OpenCL C source of kernel `vec_mul`.
// For each row < matrix_rows the kernel accumulates
// matrix[row + col*matrix_internal_rows] * vector[col] over col < matrix_cols
// and stores the sum in result[row].
// Rows are distributed over work-items (start get_global_id(0), step
// get_global_size(0)).
const char * const matrix_col_align1_vec_mul =
"__kernel void vec_mul(\n"
" __global const float * matrix,\n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols,\n"
" __global const float * vector, \n"
" __global float * result) \n"
"{ \n"
" for (unsigned int row = get_global_id(0); row < matrix_rows; row += get_global_size(0))\n"
" {\n"
" float dot_prod = 0.0f;\n"
" for (unsigned int col = 0; col < matrix_cols; ++col)\n"
" dot_prod += matrix[row + col*matrix_internal_rows] * vector[col];\n"
" result[row] = dot_prod;\n"
" }\n"
"}\n"
; //matrix_col_align1_vec_mul
155 
// OpenCL C source of kernel `lu_factorize`.
// In-place elimination on `matrix` with no row exchanges: for i = 1 ..
// matrix_rows-1 and k = 0 .. i-1, work-item 0 divides
// matrix[i + k*matrix_internal_rows] by the diagonal entry
// matrix[k + k*matrix_internal_rows]; after a barrier every work-item
// subtracts that quotient times matrix[k + j*matrix_internal_rows] from
// matrix[i + j*matrix_internal_rows] for its share of the columns j > k.
// NOTE(review): barrier() synchronizes only within one work-group, so a
// single-work-group launch is presumably required - confirm with the host code.
const char * const matrix_col_align1_lu_factorize =
"__kernel void lu_factorize(\n"
" __global float * matrix,\n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols) \n"
"{ \n"
" float temp;\n"
" for (unsigned int i=1; i<matrix_rows; ++i)\n"
" {\n"
" for (unsigned int k=0; k<i; ++k)\n"
" {\n"
" if (get_global_id(0) == 0)\n"
" matrix[i + k*matrix_internal_rows] /= matrix[k + k*matrix_internal_rows];\n"
" barrier(CLK_GLOBAL_MEM_FENCE);\n"
" temp = matrix[i + k*matrix_internal_rows];\n"
" \n"
" //parallel subtraction:\n"
" for (unsigned int j=k+1 + get_global_id(0); j<matrix_cols; j += get_global_size(0))\n"
" matrix[i + j*matrix_internal_rows] -= temp * matrix[k + j*matrix_internal_rows];\n"
" }\n"
" }\n"
"} \n"
; //matrix_col_align1_lu_factorize
181 
// OpenCL C source of kernel `rank1_update`.
// For every (row, col) with row < matrix_rows and col < matrix_cols the kernel
// adds vector1[row] * vector2[col] to matrix[row + col*matrix_internal_rows].
// Rows are distributed over work-items (start get_global_id(0), step
// get_global_size(0)).
// NOTE(review): the in-kernel comment reads "A += x * x^T", but the kernel
// takes two separate vectors; the comment is part of the runtime string and
// is left untouched.
const char * const matrix_col_align1_rank1_update =
"//perform a rank-1 update of the matrix, i.e. A += x * x^T\n"
"__kernel void rank1_update(\n"
" __global float * matrix,\n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols,\n"
" __global const float * vector1, \n"
" __global const float * vector2) \n"
"{ \n"
" float tmp;\n"
" for (unsigned int row= get_global_id(0); row < matrix_rows; row += get_global_size(0))\n"
" {\n"
" tmp = vector1[row];\n"
" for (unsigned int col = 0; col < matrix_cols; ++col)\n"
" matrix[row + col * matrix_internal_rows] += tmp * vector2[col];\n"
" }\n"
"}\n"
; //matrix_col_align1_rank1_update
202 
// OpenCL C source of kernel `trans_upper_triangular_substitute_inplace`.
// The kernel walks row = matrix_rows-1 down to 0. For each row, work-item 0
// divides vector[row] by the diagonal entry matrix[row + row*matrix_internal_rows];
// then all work-items subtract vector[row] * matrix[row + elim*matrix_internal_rows]
// from every vector[elim] with elim < row.
// NOTE(review): the in-kernel comment says "transposed lower triangular
// matrix" although the kernel is named *upper*; it is part of the runtime
// string and is left untouched - confirm which wording is intended.
// NOTE(review): barrier() synchronizes only within one work-group, so a
// single-work-group launch is presumably required - confirm with the host code.
// The declaration line was missing from the listing; the name is taken from
// the trailing comment below.
const char * const matrix_col_align1_trans_upper_triangular_substitute_inplace =
"//transposed lower triangular matrix\n"
"__kernel void trans_upper_triangular_substitute_inplace(\n"
" __global const float * matrix, \n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols,\n"
" __global float * vector) \n"
"{ \n"
" float temp; \n"
" for (int row = matrix_rows-1; row > -1; --row) \n"
" { \n"
" barrier(CLK_GLOBAL_MEM_FENCE); \n"
" if (get_global_id(0) == 0) \n"
" vector[row] /= matrix[row + row*matrix_internal_rows]; \n"
" \n"
" barrier(CLK_GLOBAL_MEM_FENCE); \n"
" temp = vector[row]; \n"
" //eliminate column with index 'row' in parallel: \n"
" for (int elim = get_global_id(0); elim < row; elim += get_global_size(0)) \n"
" vector[elim] -= temp * matrix[row + elim * matrix_internal_rows]; \n"
" } \n"
" \n"
"}\n"
; //matrix_col_align1_trans_upper_triangular_substitute_inplace
229 
// OpenCL C source of helper `get_reorder_num` and kernel `fft_radix2_local`.
//
// get_reorder_num(v, bit_size): reverses all 32 bits of v with the usual
// swap-by-halves masks, then shifts right by (32 - bit_size), i.e. it returns
// the low bit_size bits of v in reversed order.
// NOTE(review): bit_size == 0 would shift by 32, which is undefined for a
// 32-bit value - presumably never passed; confirm with the caller.
//
// fft_radix2_local: work-groups iterate over batches (start get_group_id(0),
// step get_num_groups(0)). For each batch the kernel
//   1. copies input[p*stride + batch_id] into lcl_input at the bit-reversed
//      index of p,
//   2. runs bit_size butterfly stages on lcl_input, with a local barrier
//      after each stage,
//   3. copies lcl_input[p] back to input[p*stride + batch_id].
// NOTE(review): the butterfly loop runs tid up to size-1, so pos + ss can
// reach 2*size-1 when size == 1 << bit_size; lcl_input presumably has to hold
// at least 2*size float2 elements - confirm against the host-side allocation.
// The declaration line was missing from the listing; the name is taken from
// the trailing comment below.
const char * const matrix_col_align1_fft_radix2_local =
"unsigned int get_reorder_num(unsigned int v, unsigned int bit_size) {\n"
" v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1);\n"
" v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2);\n"
" v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4);\n"
" v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8);\n"
" v = (v >> 16) | (v << 16);\n"
" v = v >> (32 - bit_size);\n"
" return v;\n"
"}\n"
"__kernel void fft_radix2_local(__global float2* input,\n"
" __local float2* lcl_input,\n"
" unsigned int bit_size,\n"
" unsigned int size,\n"
" unsigned int stride,\n"
" unsigned int batch_num,\n"
" float sign) {\n"
" unsigned int grp_id = get_group_id(0);\n"
" unsigned int grp_num = get_num_groups(0);\n"
" unsigned int lcl_sz = get_local_size(0);\n"
" unsigned int lcl_id = get_local_id(0);\n"
" const float NUM_PI = 3.14159265358979323846;\n"
" for(unsigned int batch_id = grp_id; batch_id < batch_num; batch_id += grp_num) {\n"
" //unsigned int base_offset = stride * batch_id;\n"
" //copy chunk of global memory to local\n"
" for(unsigned int p = lcl_id; p < size; p += lcl_sz) {\n"
" unsigned int v = get_reorder_num(p, bit_size);\n"
" lcl_input[v] = input[p * stride + batch_id];//index\n"
" }\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" //performs Cooley-Tukey FFT on local array\n"
" for(unsigned int s = 0; s < bit_size; s++) {\n"
" unsigned int ss = 1 << s;\n"
" float cs, sn;\n"
" for(unsigned int tid = lcl_id; tid < size; tid += lcl_sz) {\n"
" unsigned int group = (tid & (ss - 1));\n"
" unsigned int pos = ((tid >> s) << (s + 1)) + group;\n"
" float2 in1 = lcl_input[pos];\n"
" float2 in2 = lcl_input[pos + ss];\n"
" float arg = group * sign * NUM_PI / ss;\n"
" sn = sincos(arg, &cs);\n"
" float2 ex = (float2)(cs, sn);\n"
" float2 tmp = (float2)(in2.x * ex.x - in2.y * ex.y, in2.x * ex.y + in2.y * ex.x);\n"
" lcl_input[pos + ss] = in1 - tmp;\n"
" lcl_input[pos] = in1 + tmp;\n"
" }\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" }\n"
" //copy local array back to global memory\n"
" for(unsigned int p = lcl_id; p < size; p += lcl_sz) {\n"
" input[p * stride + batch_id] = lcl_input[p];//index\n"
" }\n"
" }\n"
"}\n"
; //matrix_col_align1_fft_radix2_local
285 
// OpenCL C source of kernel `fft_direct`.
// For every batch and every output index k < size, the kernel sums over
// n < size the complex product of input[n*stride + batch_id] with
// (cos(arg), sin(arg)), arg = sign * 2 * pi * k / size * n, and stores the
// sum in output[k*stride + batch_id].
// Output indices k are distributed over work-items (start get_global_id(0),
// step get_global_size(0)); every work-item loops over all batches.
// The nested k/n loops make the work quadratic in `size`, as the in-kernel
// comment states.
const char * const matrix_col_align1_fft_direct =
"// Direct FFT computation (quadratic complexity - use for reference only)\n"
"__kernel void fft_direct(__global float2* input,\n"
" __global float2* output,\n"
" unsigned int size,\n"
" unsigned int stride,\n"
" unsigned int batch_num,\n"
" float sign) {\n"
" \n"
" const float NUM_PI = 3.14159265358979323846;\n"
" \n"
" for(unsigned int batch_id = 0; batch_id < batch_num; batch_id++) {\n"
" for(unsigned int k = get_global_id(0); k < size; k += get_global_size(0)) {\n"
" float2 f = 0.0f;\n"
" for(unsigned int n = 0; n < size; n++) {\n"
" float2 in = input[n * stride + batch_id]; //input index here\n"
" float sn, cs;\n"
" float arg = sign * 2 * NUM_PI * k / size * n;\n"
" sn = sincos(arg, &cs);\n"
" float2 ex = (float2)(cs, sn);\n"
" f = f + (float2)(in.x * ex.x - in.y * ex.y, in.x * ex.y + in.y * ex.x);\n"
" }\n"
" output[k * stride + batch_id] = f;// output index here\n"
" }\n"
" }\n"
"}\n"
; //matrix_col_align1_fft_direct
313 
// OpenCL C source of kernel `unit_lower_triangular_substitute_inplace`.
// The kernel walks row = 0 .. matrix_rows-1. For each row it reads
// temp = vector[row] (no division by the diagonal) and subtracts
// temp * matrix[row*matrix_internal_rows + elim] from every vector[elim]
// with elim > row.
// NOTE(review): barrier() synchronizes only within one work-group, so a
// single-work-group launch is presumably required - confirm with the host code.
// The declaration line was missing from the listing; the name is taken from
// the trailing comment below.
const char * const matrix_col_align1_unit_lower_triangular_substitute_inplace =
"__kernel void unit_lower_triangular_substitute_inplace(\n"
" __global const float * matrix,\n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols,\n"
" __global float * vector)\n"
"{\n"
" float temp;\n"
" for (int row = 0; row < matrix_rows; ++row)\n"
" {\n"
" barrier(CLK_GLOBAL_MEM_FENCE);\n"
" temp = vector[row];\n"
" for (int elim = row + get_global_id(0) + 1; elim < matrix_rows; elim += get_global_size(0))\n"
" vector[elim] -= temp * matrix[row * matrix_internal_rows + elim];\n"
" }\n"
"}\n"
; //matrix_col_align1_unit_lower_triangular_substitute_inplace
333 
// OpenCL C source of helper `get_reorder_num_2` and kernel `fft_reorder`.
//
// get_reorder_num_2(v, bit_size): reverses all 32 bits of v, then shifts
// right by (32 - bit_size), i.e. returns the low bit_size bits of v reversed.
//
// fft_reorder: for every batch and every index i < size, computes the
// bit-reversed index v and, only when i < v, swaps input[i*stride + batch_id]
// with input[v*stride + batch_id] (so each pair is exchanged once).
// Indices i are distributed over work-items (start get_global_id(0), step
// get_global_size(0)).
// NOTE(review): batch_num is declared `int` here but compared against an
// unsigned batch_id, while the sibling FFT kernels declare it unsigned; the
// parameter type is part of the kernel interface and is left untouched.
const char * const matrix_col_align1_fft_reorder =
"/*\n"
"* Performs reordering of input data in bit-reversal order\n"
"* Probably it's better to do in host side,\n"
"*/\n"
"unsigned int get_reorder_num_2(unsigned int v, unsigned int bit_size) {\n"
" v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1);\n"
" v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2);\n"
" v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4);\n"
" v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8);\n"
" v = (v >> 16) | (v << 16);\n"
" v = v >> (32 - bit_size);\n"
" return v;\n"
"}\n"
"__kernel void fft_reorder(__global float2* input,\n"
" unsigned int bit_size,\n"
" unsigned int size,\n"
" unsigned int stride,\n"
" int batch_num) {\n"
" unsigned int glb_id = get_global_id(0);\n"
" unsigned int glb_sz = get_global_size(0);\n"
" \n"
" for(unsigned int batch_id = 0; batch_id < batch_num; batch_id++) {\n"
" for(unsigned int i = glb_id; i < size; i += glb_sz) {\n"
" unsigned int v = get_reorder_num_2(i, bit_size);\n"
" if(i < v) {\n"
" float2 tmp = input[i * stride + batch_id]; // index\n"
" input[i * stride + batch_id] = input[v * stride + batch_id]; //index\n"
" input[v * stride + batch_id] = tmp; //index\n"
" }\n"
" }\n"
" }\n"
"}\n"
; //matrix_col_align1_fft_reorder
368 
// OpenCL C source of kernel `upper_triangular_substitute_inplace`.
// The kernel walks row = matrix_rows-1 down to 0. For each row, work-item 0
// divides vector[row] by the diagonal entry matrix[row + row*matrix_internal_rows];
// then all work-items subtract vector[row] * matrix[elim + row*matrix_internal_rows]
// from every vector[elim] with elim < row.
// NOTE(review): barrier() synchronizes only within one work-group, so a
// single-work-group launch is presumably required - confirm with the host code.
// The declaration line was missing from the listing; the name is taken from
// the trailing comment below.
const char * const matrix_col_align1_upper_triangular_substitute_inplace =
"__kernel void upper_triangular_substitute_inplace( \n"
" __global const float * matrix, \n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols,\n"
" __global float * vector) \n"
"{ \n"
" float temp; \n"
" for (int row = matrix_rows-1; row > -1; --row) \n"
" { \n"
" barrier(CLK_GLOBAL_MEM_FENCE); \n"
" if (get_global_id(0) == 0) \n"
" vector[row] /= matrix[row + row*matrix_internal_rows]; \n"
" \n"
" barrier(CLK_GLOBAL_MEM_FENCE); \n"
" temp = vector[row]; \n"
" //eliminate column with index 'row' in parallel: \n"
" for (int elim = get_global_id(0); elim < row; elim += get_global_size(0)) \n"
" vector[elim] -= temp * matrix[elim + row * matrix_internal_rows]; \n"
" } \n"
" \n"
"}\n"
; //matrix_col_align1_upper_triangular_substitute_inplace
394 
// OpenCL C source of kernel `inplace_add`.
// Two-dimensional kernel: the work-item with ids (get_global_id(0),
// get_global_id(1)) = (r, c) does nothing unless r < A_row_size and
// c < A_col_size; otherwise it adds
//   B[(r + B_row_start) + (c + B_col_start) * B_internal_rows]
// to
//   A[(r + A_row_start) + (c + A_col_start) * A_internal_rows].
// There is no striding loop, so one work-item handles at most one element.
// B_row_size, B_col_size and both *_internal_cols parameters are not read by
// the kernel body.
const char * const matrix_col_align1_inplace_add =
"__kernel void inplace_add(\n"
" __global float * A,\n"
" unsigned int A_row_start,\n"
" unsigned int A_col_start,\n"
" unsigned int A_row_size,\n"
" unsigned int A_col_size,\n"
" unsigned int A_internal_rows,\n"
" unsigned int A_internal_cols,\n"
" __global const float * B, \n"
" unsigned int B_row_start,\n"
" unsigned int B_col_start,\n"
" unsigned int B_row_size,\n"
" unsigned int B_col_size,\n"
" unsigned int B_internal_rows,\n"
" unsigned int B_internal_cols)\n"
"{ \n"
" if ( get_global_id(0) < A_row_size\n"
" && get_global_id(1) < A_col_size\n"
" )\n"
" A[ (get_global_id(0) + A_row_start)\n"
" + (get_global_id(1) + A_col_start) * A_internal_rows] \n"
" += B[ (get_global_id(0) + B_row_start)\n"
" + (get_global_id(1) + B_col_start) * B_internal_rows];\n"
"}\n"
; //matrix_col_align1_inplace_add
421 
// OpenCL C source of kernel `fft_radix2`.
// Performs the butterflies of one stage `s` directly on `input`: for every
// batch and every tid < size/2 it pairs the elements at
//   offset             = pos*stride + batch_id
//   offset + ss*stride   (ss = 1 << s, pos = ((tid >> s) << (s + 1)) + (tid & (ss-1)))
// multiplies the second by (cos(arg), sin(arg)) with
// arg = group * sign * pi / ss, and writes back the sum and the difference.
// Butterfly indices are distributed over work-items (start get_global_id(0),
// step get_global_size(0)). bit_size is not read by the kernel body.
const char * const matrix_col_align1_fft_radix2 =
"__kernel void fft_radix2(__global float2* input,\n"
" unsigned int s,\n"
" unsigned int bit_size,\n"
" unsigned int size,\n"
" unsigned int stride,\n"
" unsigned int batch_num,\n"
" float sign) {\n"
" unsigned int ss = 1 << s;\n"
" unsigned int half_size = size >> 1;\n"
" float cs, sn;\n"
" const float NUM_PI = 3.14159265358979323846;\n"
" unsigned int glb_id = get_global_id(0);\n"
" unsigned int glb_sz = get_global_size(0);\n"
" \n"
" for(unsigned int batch_id = 0; batch_id < batch_num; batch_id++) {\n"
" for(unsigned int tid = glb_id; tid < half_size; tid += glb_sz) {\n"
" unsigned int group = (tid & (ss - 1));\n"
" unsigned int pos = ((tid >> s) << (s + 1)) + group;\n"
" unsigned int offset = pos * stride + batch_id;\n"
" float2 in1 = input[offset];//index\n"
" float2 in2 = input[offset + ss * stride];//index\n"
" float arg = group * sign * NUM_PI / ss;\n"
" sn = sincos(arg, &cs);\n"
" float2 ex = (float2)(cs, sn);\n"
" float2 tmp = (float2)(in2.x * ex.x - in2.y * ex.y, in2.x * ex.y + in2.y * ex.x);\n"
" input[offset + ss * stride] = in1 - tmp;//index\n"
" input[offset] = in1 + tmp;//index\n"
" }\n"
" }\n"
"}\n"
; //matrix_col_align1_fft_radix2
454 
// OpenCL C source of kernel `trans_vec_mul`.
// For each row < matrix_cols the kernel accumulates
// matrix[row*matrix_internal_rows + col] * vector[col] over col < matrix_rows
// and stores the sum in result[row]; as the in-kernel comment notes, row and
// col here index the transposed matrix.
// Rows are distributed over work-items (start get_global_id(0), step
// get_global_size(0)).
const char * const matrix_col_align1_trans_vec_mul =
"__kernel void trans_vec_mul(\n"
" __global const float * matrix,\n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols,\n"
" __global const float * vector, \n"
" __global float * result) \n"
"{ \n"
" //row and col indicate indices within transposed matrix\n"
" for (unsigned int row = get_global_id(0); row < matrix_cols; row += get_global_size(0))\n"
" {\n"
" float dot_prod2 = 0.0f;\n"
" for (unsigned int col = 0; col < matrix_rows; ++col)\n"
" dot_prod2 += matrix[row * matrix_internal_rows + col] * vector[col];\n"
" result[row] = dot_prod2;\n"
" }\n"
"}\n"
; //matrix_col_align1_trans_vec_mul
475 
// OpenCL C source of kernel `trans_unit_lower_triangular_substitute_inplace`.
// The kernel walks row = 0 .. matrix_rows-1. For each row it reads
// temp = vector[row] (no division by the diagonal) and subtracts
// temp * matrix[elim*matrix_internal_rows + row] from every vector[elim]
// with elim > row.
// The leading and trailing "\n" pieces are part of the runtime string and are
// kept as they are.
// NOTE(review): barrier() synchronizes only within one work-group, so a
// single-work-group launch is presumably required - confirm with the host code.
// The declaration line was missing from the listing; the name is taken from
// the trailing comment below.
const char * const matrix_col_align1_trans_unit_lower_triangular_substitute_inplace =
"\n"
"__kernel void trans_unit_lower_triangular_substitute_inplace(\n"
" __global const float * matrix,\n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols,\n"
" __global float * vector)\n"
"{\n"
" float temp;\n"
" for (int row = 0; row < matrix_rows; ++row)\n"
" {\n"
" barrier(CLK_GLOBAL_MEM_FENCE);\n"
"\n"
" temp = vector[row];\n"
"\n"
" for (int elim = row + get_global_id(0) + 1; elim < matrix_rows; elim += get_global_size(0))\n"
" vector[elim] -= temp * matrix[elim * matrix_internal_rows + row];\n"
" }\n"
"}\n"
"\n"
"\n"
; //matrix_col_align1_trans_unit_lower_triangular_substitute_inplace
500 
// OpenCL C source of kernel `cpu_inplace_mult`.
// The kernel multiplies vec[0..size) by `factor`, which is passed by value
// (the sibling `inplace_mult` reads it through a __global pointer instead).
// Each work-item starts at get_global_id(0) and advances by get_global_size(0).
// The declaration line was missing from the listing; the name is taken from
// the trailing comment below.
const char * const matrix_col_align1_cpu_inplace_mult =
"__kernel void cpu_inplace_mult(\n"
" __global float * vec,\n"
" float factor, \n"
" unsigned int size) \n"
"{ \n"
" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
" vec[i] *= factor;\n"
"}\n"
; //matrix_col_align1_cpu_inplace_mult
511 
// OpenCL C source of kernel `clear`.
// The kernel stores 0 into vec[i] for i in [0, size).
// Each work-item starts at get_global_id(0) and advances by get_global_size(0).
const char * const matrix_col_align1_clear =
"__kernel void clear(\n"
" __global float * vec,\n"
" unsigned int size) \n"
"{ \n"
" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
" vec[i] = 0;\n"
"}\n"
; //matrix_col_align1_clear
521 
// OpenCL C source of kernel `inplace_mult`.
// The kernel reads the factor once from *fac and multiplies vec[0..size) by it.
// Each work-item starts at get_global_id(0) and advances by get_global_size(0).
const char * const matrix_col_align1_inplace_mult =
"__kernel void inplace_mult(\n"
" __global float * vec,\n"
" __global const float * fac, \n"
" unsigned int size) \n"
"{ \n"
" float factor = *fac;\n"
" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
" vec[i] *= factor;\n"
"}\n"
; //matrix_col_align1_inplace_mult
533 
// OpenCL C source of kernel `unit_upper_triangular_substitute_inplace`.
// The kernel walks row = matrix_rows-1 down to 0. For each row it reads
// temp = vector[row] (no division by the diagonal) and subtracts
// temp * matrix[elim + row*matrix_internal_rows] from every vector[elim]
// with elim < row.
// NOTE(review): barrier() synchronizes only within one work-group, so a
// single-work-group launch is presumably required - confirm with the host code.
// The declaration line was missing from the listing; the name is taken from
// the trailing comment below.
const char * const matrix_col_align1_unit_upper_triangular_substitute_inplace =
"__kernel void unit_upper_triangular_substitute_inplace( \n"
" __global const float * matrix, \n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols,\n"
" __global float * vector) \n"
"{ \n"
" float temp; \n"
" for (int row = matrix_rows-1; row > -1; --row) \n"
" { \n"
" barrier(CLK_GLOBAL_MEM_FENCE); \n"
" \n"
" temp = vector[row]; \n"
" //eliminate column with index 'row' in parallel: \n"
" for (int elim = get_global_id(0); elim < row; elim += get_global_size(0)) \n"
" vector[elim] -= temp * matrix[elim + row * matrix_internal_rows]; \n"
" } \n"
" \n"
"}\n"
; //matrix_col_align1_unit_upper_triangular_substitute_inplace
556 
// OpenCL C source of kernel `add`.
// The kernel writes result[i] = vec1[i] + vec2[i] for i in [0, size).
// Each work-item starts at get_global_id(0) and advances by get_global_size(0).
const char * const matrix_col_align1_add =
"__kernel void add(\n"
" __global const float * vec1,\n"
" __global const float * vec2, \n"
" __global float * result,\n"
" unsigned int size) \n"
"{ \n"
" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
" result[i] = vec1[i] + vec2[i];\n"
"}\n"
; //matrix_col_align1_add
568 
569  } //namespace kernels
570  } //namespace linalg
571 } //namespace viennacl
572 #endif