ViennaCL - The Vienna Computing Library  1.2.0
vector_source.h
1 #ifndef VIENNACL_LINALG_KERNELS_VECTOR_SOURCE_HPP_
2 #define VIENNACL_LINALG_KERNELS_VECTOR_SOURCE_HPP_
3 //Automatically generated file from auxiliary-directory, do not edit manually!
4 namespace viennacl
5 {
6  namespace linalg
7  {
8  namespace kernels
9  {
10 const char * const vector_align4_inplace_div_sub =
11 "__kernel void inplace_div_sub(\n"
12 " __global float4 * vec1,\n"
13 " unsigned int start1,\n"
14 " unsigned int size1,\n"
15 " __global const float4 * vec2,\n"
16 " unsigned int start2,\n"
17 " unsigned int size2,\n"
18 " __global const float * fac) //CPU variant is mapped to mult_add\n"
19 "{ \n"
20 " float factor = *fac;\n"
21 " unsigned int i_end = size1/4;\n"
22 " for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
23 " vec1[i+start1] -= vec2[i+start2] / factor;\n"
24 "}\n"
25 ; //vector_align4_inplace_div_sub
26 
27 const char * const vector_align4_mul_add =
28 "__kernel void mul_add(\n"
29 " __global const float4 * vec1,\n"
30 " unsigned int start1,\n"
31 " unsigned int size1,\n"
32 " __global const float * fac,\n"
33 " __global const float4 * vec2,\n"
34 " unsigned int start2,\n"
35 " unsigned int size2,\n"
36 " __global float4 * result,\n"
37 " unsigned int start3,\n"
38 " unsigned int size3) \n"
39 "{ \n"
40 " float factor = *fac;\n"
41 " unsigned int i_end = size1/4;\n"
42 " for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
43 " result[i+start3] = vec1[i+start1] * factor + vec2[i+start2];\n"
44 "}\n"
45 ; //vector_align4_mul_add
46 
47 const char * const vector_align4_inplace_mul_add =
48 "__kernel void inplace_mul_add(\n"
49 " __global float4 * vec1,\n"
50 " unsigned int start1,\n"
51 " unsigned int size1,\n"
52 " __global const float4 * vec2,\n"
53 " unsigned int start2,\n"
54 " unsigned int size2,\n"
55 " __global const float * fac) \n"
56 "{ \n"
57 " float factor = *fac;\n"
58 " unsigned int size_div_4 = size1/4;\n"
59 " for (unsigned int i = get_global_id(0); i < size_div_4; i += get_global_size(0))\n"
60 " vec1[i+start1] += vec2[i+start2] * factor;\n"
61 "}\n"
62 ; //vector_align4_inplace_mul_add
63 
64 const char * const vector_align4_inplace_div_add =
65 "__kernel void inplace_div_add(\n"
66 " __global float4 * vec1,\n"
67 " unsigned int start1,\n"
68 " unsigned int size1,\n"
69 " __global const float4 * vec2,\n"
70 " unsigned int start2,\n"
71 " unsigned int size2,\n"
72 " __global const float * fac) //CPU variant is mapped to mult_add\n"
73 "{ \n"
74 " float factor = *fac;\n"
75 " unsigned int i_end = size1 / 4;\n"
76 " for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
77 " vec1[i+start1] -= vec2[i+start2] / factor;\n"
78 "}\n"
79 ; //vector_align4_inplace_div_add
80 
81 const char * const vector_align4_cpu_mul_add =
82 "__kernel void cpu_mul_add(\n"
83 " __global const float4 * vec1,\n"
84 " unsigned int start1,\n"
85 " unsigned int size1,\n"
86 " float factor,\n"
87 " __global const float4 * vec2,\n"
88 " unsigned int start2,\n"
89 " unsigned int size2,\n"
90 " __global float4 * result,\n"
91 " unsigned int start3,\n"
92 " unsigned int size3) \n"
93 "{ \n"
94 " unsigned int i_end = size1/4;\n"
95 " for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
96 " result[i+start3] = vec1[i+start1] * factor + vec2[i+start2];\n"
97 "}\n"
98 ; //vector_align4_cpu_mul_add
99 
101 "__kernel void cpu_inplace_mul_add(\n"
102 " __global float4 * vec1,\n"
103 " unsigned int start1,\n"
104 " unsigned int size1,\n"
105 " __global const float4 * vec2,\n"
106 " unsigned int start2,\n"
107 " unsigned int size2,\n"
108 " float factor) \n"
109 "{ \n"
110 " unsigned int i_end = size1/4;\n"
111 " for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
112 " vec1[i+start1] += vec2[i+start2] * factor;\n"
113 "}\n"
114 ; //vector_align4_cpu_inplace_mul_add
115 
116 const char * const vector_align4_inplace_mul_sub =
117 "__kernel void inplace_mul_sub(\n"
118 " __global float4 * vec1,\n"
119 " unsigned int start1,\n"
120 " unsigned int size1,\n"
121 " __global const float4 * vec2,\n"
122 " unsigned int start2,\n"
123 " unsigned int size2,\n"
124 " __global const float * fac) //CPU variant is mapped to mult_add\n"
125 "{ \n"
126 " float factor = *fac;\n"
127 " unsigned int i_end = size/4;\n"
128 " for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
129 " vec1[i+start1] -= vec2[i+start2] * factor;\n"
130 "}\n"
131 ; //vector_align4_inplace_mul_sub
132 
133 const char * const vector_align1_inplace_divide =
134 "__kernel void inplace_divide(\n"
135 " __global float * vec,\n"
136 " unsigned int start1,\n"
137 " unsigned int size1,\n"
138 " __global const float * fac) //note: CPU variant is mapped to prod_scalar\n"
139 "{ \n"
140 " float factor = *fac;\n"
141 " for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
142 " vec[i+start1] /= factor;\n"
143 "}\n"
144 ; //vector_align1_inplace_divide
145 
146 const char * const vector_align1_inplace_div_sub =
147 "///// divide substract:\n"
148 "__kernel void inplace_div_sub(\n"
149 " __global float * vec1,\n"
150 " unsigned int start1,\n"
151 " unsigned int size1,\n"
152 " __global const float * vec2,\n"
153 " unsigned int start2,\n"
154 " unsigned int size2,\n"
155 " __global const float * fac) //CPU variant is mapped to mult_add\n"
156 "{ \n"
157 " float factor = *fac;\n"
158 " for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
159 " vec1[i+start1] -= vec2[i+start2] / factor;\n"
160 "}\n"
161 ; //vector_align1_inplace_div_sub
162 
163 const char * const vector_align1_vmax =
164 "__kernel void vmax(\n"
165 " __global float * vec1,\n"
166 " unsigned int start1,\n"
167 " unsigned int size1,\n"
168 " __global float * result) \n"
169 "{ \n"
170 " //parallel reduction on global memory (make sure that size is a power of 2)\n"
171 " for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2)\n"
172 " {\n"
173 " if (get_global_id(0) < stride)\n"
174 " vec1[get_global_id(0)+start1] = fmax(vec1[get_global_id(0)+start1+stride], vec1[get_global_id(0)+start1]);\n"
175 " barrier(CLK_GLOBAL_MEM_FENCE);\n"
176 " }\n"
177 " \n"
178 " if (get_global_id(0) == 0)\n"
179 " *result = vec1[start1];\n"
180 "}\n"
181 ; //vector_align1_vmax
182 
183 const char * const vector_align1_index_norm_inf =
184 "//index_norm_inf:\n"
185 "unsigned int float_vector1_index_norm_inf_impl(\n"
186 " __global const float * vec,\n"
187 " unsigned int start1,\n"
188 " unsigned int size1,\n"
189 " __local float * float_buffer,\n"
190 " __local unsigned int * index_buffer)\n"
191 "{\n"
192 " //step 1: fill buffer:\n"
193 " float cur_max = 0.0f;\n"
194 " float tmp;\n"
195 " for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
196 " {\n"
197 " tmp = fabs(vec[i+start1]);\n"
198 " if (cur_max < tmp)\n"
199 " {\n"
200 " float_buffer[get_global_id(0)] = tmp;\n"
201 " index_buffer[get_global_id(0)] = i;\n"
202 " cur_max = tmp;\n"
203 " }\n"
204 " }\n"
205 " \n"
206 " //step 2: parallel reduction:\n"
207 " for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2)\n"
208 " {\n"
209 " barrier(CLK_LOCAL_MEM_FENCE);\n"
210 " if (get_global_id(0) < stride)\n"
211 " {\n"
212 " //find the first occurring index\n"
213 " if (float_buffer[get_global_id(0)] < float_buffer[get_global_id(0)+stride])\n"
214 " {\n"
215 " index_buffer[get_global_id(0)] = index_buffer[get_global_id(0)+stride];\n"
216 " float_buffer[get_global_id(0)] = float_buffer[get_global_id(0)+stride];\n"
217 " }\n"
218 " \n"
219 " //index_buffer[get_global_id(0)] = float_buffer[get_global_id(0)] < float_buffer[get_global_id(0)+stride] ? index_buffer[get_global_id(0)+stride] : index_buffer[get_global_id(0)];\n"
220 " //float_buffer[get_global_id(0)] = max(float_buffer[get_global_id(0)], float_buffer[get_global_id(0)+stride]);\n"
221 " }\n"
222 " }\n"
223 " \n"
224 " return index_buffer[0];\n"
225 "}\n"
226 "\n"
227 "__kernel void index_norm_inf(\n"
228 " __global float * vec,\n"
229 " unsigned int start1,\n"
230 " unsigned int size1,\n"
231 " __local float * float_buffer,\n"
232 " __local unsigned int * index_buffer,\n"
233 " global unsigned int * result) \n"
234 "{ \n"
235 " unsigned int tmp = float_vector1_index_norm_inf_impl(vec, start1, size1, float_buffer, index_buffer);\n"
236 " if (get_global_id(0) == 0) *result = tmp;\n"
237 "}\n"
238 "\n"
239 "\n"
240 ; //vector_align1_index_norm_inf
241 
242 const char * const vector_align1_sub =
243 "__kernel void sub(\n"
244 " __global const float * vec1,\n"
245 " unsigned int start1,\n"
246 " unsigned int size1,\n"
247 " __global const float * vec2, \n"
248 " unsigned int start2,\n"
249 " unsigned int size2,\n"
250 " __global float * result,\n"
251 " unsigned int start3,\n"
252 " unsigned int size3)\n"
253 "{ \n"
254 " for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
255 " result[i+start3] = vec1[i+start1] - vec2[i+start2];\n"
256 "}\n"
257 ; //vector_align1_sub
258 
259 const char * const vector_align1_mul_add =
260 "__kernel void mul_add(\n"
261 " __global const float * vec1,\n"
262 " unsigned int start1,\n"
263 " unsigned int size1,\n"
264 " __global const float * fac,\n"
265 " __global const float * vec2,\n"
266 " unsigned int start2,\n"
267 " unsigned int size2,\n"
268 " __global float * result,\n"
269 " unsigned int start3,\n"
270 " unsigned int size3\n"
271 " ) \n"
272 "{ \n"
273 " float factor = *fac;\n"
274 " for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
275 " result[i+start3] = vec1[i+start1] * factor + vec2[i+start2];\n"
276 "}\n"
277 ; //vector_align1_mul_add
278 
279 const char * const vector_align1_inplace_sub =
280 "__kernel void inplace_sub(\n"
281 " __global float * vec1,\n"
282 " unsigned int start1,\n"
283 " unsigned int size1,\n"
284 " __global const float * vec2,\n"
285 " unsigned int start2,\n"
286 " unsigned int size2) \n"
287 "{ \n"
288 " for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
289 " vec1[i+start1] -= vec2[i+start2];\n"
290 "}\n"
291 ; //vector_align1_inplace_sub
292 
293 const char * const vector_align1_inner_prod =
294 "//helper:\n"
295 "void helper_inner_prod_parallel_reduction( __local float * tmp_buffer )\n"
296 "{\n"
297 " for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2)\n"
298 " {\n"
299 " barrier(CLK_LOCAL_MEM_FENCE);\n"
300 " if (get_local_id(0) < stride)\n"
301 " tmp_buffer[get_local_id(0)] += tmp_buffer[get_local_id(0)+stride];\n"
302 " }\n"
303 "}\n"
304 "//////// inner products:\n"
305 "float impl_inner_prod(\n"
306 " __global const float * vec1,\n"
307 " unsigned int start1,\n"
308 " unsigned int size1,\n"
309 " __global const float * vec2,\n"
310 " unsigned int start2,\n"
311 " unsigned int size2,\n"
312 " __local float * tmp_buffer)\n"
313 "{\n"
314 " float tmp = 0;\n"
315 " for (unsigned int i = get_local_id(0); i < size1; i += get_local_size(0))\n"
316 " tmp += vec1[i+start1] * vec2[i+start2];\n"
317 " tmp_buffer[get_local_id(0)] = tmp;\n"
318 " \n"
319 " helper_inner_prod_parallel_reduction(tmp_buffer);\n"
320 " \n"
321 " return tmp_buffer[0];\n"
322 "}\n"
323 "__kernel void inner_prod(\n"
324 " __global const float * vec1,\n"
325 " unsigned int start1,\n"
326 " unsigned int size1,\n"
327 " __global const float * vec2,\n"
328 " unsigned int start2,\n"
329 " unsigned int size2,\n"
330 " __local float * tmp_buffer,\n"
331 " global float * group_buffer)\n"
332 "{\n"
333 " float tmp = impl_inner_prod(vec1,\n"
334 " ( get_group_id(0) * size1) / get_num_groups(0) + start1,\n"
335 " ((get_group_id(0) + 1) * size1) / get_num_groups(0) - ( get_group_id(0) * size1) / get_num_groups(0),\n"
336 " vec2,\n"
337 " ( get_group_id(0) * size2) / get_num_groups(0) + start2,\n"
338 " ((get_group_id(0) + 1) * size2) / get_num_groups(0) - ( get_group_id(0) * size2) / get_num_groups(0),\n"
339 " tmp_buffer);\n"
340 " \n"
341 " if (get_local_id(0) == 0)\n"
342 " group_buffer[get_group_id(0)] = tmp;\n"
343 " \n"
344 "}\n"
345 ; //vector_align1_inner_prod
346 
347 const char * const vector_align1_mult =
348 "__kernel void mult(\n"
349 " __global const float * vec,\n"
350 " unsigned int start1,\n"
351 " unsigned int size1,\n"
352 " __global const float * fac, \n"
353 " __global float * result,\n"
354 " unsigned int start3,\n"
355 " unsigned int size3) \n"
356 "{ \n"
357 " float factor = *fac;\n"
358 " for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
359 " result[i+start3] = vec[i+start1] * factor;\n"
360 "}\n"
361 ; //vector_align1_mult
362 
363 const char * const vector_align1_diag_precond =
364 "__kernel void diag_precond(\n"
365 " __global const float * diag_A_inv, \n"
366 " unsigned int start1,\n"
367 " unsigned int size1,\n"
368 " __global float * x, \n"
369 " unsigned int start2,\n"
370 " unsigned int size2) \n"
371 "{ \n"
372 " for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
373 " x[i+start2] *= diag_A_inv[i+start1];\n"
374 "}\n"
375 ; //vector_align1_diag_precond
376 
377 const char * const vector_align1_inplace_mul_add =
378 "__kernel void inplace_mul_add(\n"
379 " __global float * vec1,\n"
380 " unsigned int start1,\n"
381 " unsigned int size1,\n"
382 " __global const float * vec2,\n"
383 " unsigned int start2,\n"
384 " unsigned int size2,\n"
385 " __global const float * fac) \n"
386 "{ \n"
387 " float factor = *fac;\n"
388 " for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
389 " vec1[i+start1] += vec2[i+start2] * factor;\n"
390 "}\n"
391 ; //vector_align1_inplace_mul_add
392 
393 const char * const vector_align1_norm_1 =
394 "//helper:\n"
395 "void helper_norm1_parallel_reduction( __local float * tmp_buffer )\n"
396 "{\n"
397 " for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2)\n"
398 " {\n"
399 " barrier(CLK_LOCAL_MEM_FENCE);\n"
400 " if (get_global_id(0) < stride)\n"
401 " tmp_buffer[get_global_id(0)] += tmp_buffer[get_global_id(0)+stride];\n"
402 " }\n"
403 "}\n"
404 "\n"
405 "////// norm_1\n"
406 "float impl_norm_1(\n"
407 " __global const float * vec,\n"
408 " unsigned int start_index,\n"
409 " unsigned int end_index,\n"
410 " __local float * tmp_buffer)\n"
411 "{\n"
412 " float tmp = 0;\n"
413 " for (unsigned int i = start_index + get_local_id(0); i < end_index; i += get_local_size(0))\n"
414 " tmp += fabs(vec[i]);\n"
415 " \n"
416 " tmp_buffer[get_local_id(0)] = tmp;\n"
417 " \n"
418 " helper_norm1_parallel_reduction(tmp_buffer);\n"
419 " \n"
420 " return tmp_buffer[0];\n"
421 "};\n"
422 "\n"
423 "__kernel void norm_1(\n"
424 " __global const float * vec,\n"
425 " unsigned int start1,\n"
426 " unsigned int size1,\n"
427 " __local float * tmp_buffer,\n"
428 " global float * group_buffer)\n"
429 "{\n"
430 " float tmp = impl_norm_1(vec,\n"
431 " ( get_group_id(0) * size1) / get_num_groups(0) + start1,\n"
432 " ((get_group_id(0) + 1) * size1) / get_num_groups(0) + start1,\n"
433 " tmp_buffer);\n"
434 " \n"
435 " if (get_local_id(0) == 0)\n"
436 " group_buffer[get_group_id(0)] = tmp; \n"
437 "}\n"
438 "\n"
439 ; //vector_align1_norm_1
440 
441 const char * const vector_align1_divide =
442 "// Note: name 'div' is not allowed by the jit-compiler\n"
443 "__kernel void divide(\n"
444 " __global const float * vec,\n"
445 " unsigned int start1,\n"
446 " unsigned int size1,\n"
447 " __global const float * fac, //note: CPU variant is mapped to prod_scalar\n"
448 " __global float * result,\n"
449 " unsigned int start3,\n"
450 " unsigned int size3) \n"
451 "{ \n"
452 " float factor = *fac;\n"
453 " for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
454 " result[i+start3] = vec[i+start1] / factor;\n"
455 "}\n"
456 ; //vector_align1_divide
457 
458 const char * const vector_align1_swap =
459 "////// swap:\n"
460 "__kernel void swap(\n"
461 " __global float * vec1,\n"
462 " unsigned int start1,\n"
463 " unsigned int size1,\n"
464 " __global float * vec2,\n"
465 " unsigned int start2,\n"
466 " unsigned int size2\n"
467 " ) \n"
468 "{ \n"
469 " float tmp;\n"
470 " for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
471 " {\n"
472 " tmp = vec2[i+start2];\n"
473 " vec2[i+start2] = vec1[i+start1];\n"
474 " vec1[i+start1] = tmp;\n"
475 " }\n"
476 "}\n"
477 " \n"
478 ; //vector_align1_swap
479 
480 const char * const vector_align1_norm_inf =
481 "\n"
482 "////// norm_inf\n"
483 "float impl_norm_inf(\n"
484 " __global const float * vec,\n"
485 " unsigned int start_index,\n"
486 " unsigned int end_index,\n"
487 " __local float * tmp_buffer)\n"
488 "{\n"
489 " float tmp = 0;\n"
490 " for (unsigned int i = start_index + get_local_id(0); i < end_index; i += get_local_size(0))\n"
491 " tmp = fmax(fabs(vec[i]), tmp);\n"
492 " tmp_buffer[get_local_id(0)] = tmp;\n"
493 " \n"
494 " //step 2: parallel reduction:\n"
495 " for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2)\n"
496 " {\n"
497 " barrier(CLK_LOCAL_MEM_FENCE);\n"
498 " if (get_global_id(0) < stride)\n"
499 " tmp_buffer[get_global_id(0)] = fmax(tmp_buffer[get_global_id(0)], tmp_buffer[get_global_id(0)+stride]);\n"
500 " }\n"
501 " \n"
502 " return tmp_buffer[0];\n"
503 "}\n"
504 "\n"
505 "__kernel void norm_inf(\n"
506 " __global const float * vec,\n"
507 " unsigned int start1,\n"
508 " unsigned int size1,\n"
509 " __local float * tmp_buffer,\n"
510 " global float * group_buffer)\n"
511 "{\n"
512 " float tmp = impl_norm_inf(vec,\n"
513 " ( get_group_id(0) * size1) / get_num_groups(0) + start1,\n"
514 " ((get_group_id(0) + 1) * size1) / get_num_groups(0) + start1,\n"
515 " tmp_buffer);\n"
516 " \n"
517 " if (get_local_id(0) == 0)\n"
518 " group_buffer[get_group_id(0)] = tmp; \n"
519 "}\n"
520 ; //vector_align1_norm_inf
521 
522 const char * const vector_align1_inplace_div_add =
523 "///// divide add:\n"
524 "__kernel void inplace_div_add(\n"
525 " __global float * vec1,\n"
526 " unsigned int start1,\n"
527 " unsigned int size1,\n"
528 " __global const float * vec2,\n"
529 " unsigned int start2,\n"
530 " unsigned int size2,\n"
531 " __global const float * fac) //CPU variant is mapped to mult_add\n"
532 "{ \n"
533 " float factor = *fac;\n"
534 " for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
535 " vec1[i+start1] -= vec2[i+start2] / factor;\n"
536 "}\n"
537 ; //vector_align1_inplace_div_add
538 
539 const char * const vector_align1_sqrt_sum =
540 "__kernel void sqrt_sum(\n"
541 " __global float * vec1,\n"
542 " unsigned int start1,\n"
543 " unsigned int size1,\n"
544 " __global float * result) \n"
545 "{ \n"
546 " //parallel reduction on global memory: (make sure get_global_size(0) is a power of 2)\n"
547 " for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2)\n"
548 " {\n"
549 " if (get_global_id(0) < stride)\n"
550 " vec1[get_global_id(0)+start1] += vec1[get_global_id(0)+start1+stride];\n"
551 " barrier(CLK_GLOBAL_MEM_FENCE);\n"
552 " }\n"
553 " \n"
554 " if (get_global_id(0) == 0)\n"
555 " *result = sqrt(vec1[start1]);\n"
556 " \n"
557 "}\n"
558 ; //vector_align1_sqrt_sum
559 
560 const char * const vector_align1_inplace_add =
561 "__kernel void inplace_add(\n"
562 " __global float * vec1,\n"
563 " unsigned int start1,\n"
564 " unsigned int size1,\n"
565 " __global const float * vec2,\n"
566 " unsigned int start2,\n"
567 " unsigned int size2) \n"
568 "{ \n"
569 " for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
570 " vec1[i+start1] += vec2[i+start2];\n"
571 "}\n"
572 ; //vector_align1_inplace_add
573 
574 const char * const vector_align1_mul_sub =
575 "///// multiply subtract:\n"
576 "__kernel void mul_sub(\n"
577 " __global const float * vec1,\n"
578 " unsigned int start1,\n"
579 " unsigned int size1,\n"
580 " __global const float * fac,\n"
581 " __global const float * vec2,\n"
582 " unsigned int start2,\n"
583 " unsigned int size2,\n"
584 " __global float * result,\n"
585 " unsigned int start3,\n"
586 " unsigned int size3\n"
587 " ) \n"
588 "{ \n"
589 " float factor = *fac;\n"
590 " for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
591 " result[i+start3] = vec1[i+start1] * factor - vec2[i+start2];\n"
592 "}\n"
593 ; //vector_align1_mul_sub
594 
595 const char * const vector_align1_sum =
596 "__kernel void sum(\n"
597 " __global float * vec1,\n"
598 " unsigned int start1,\n"
599 " unsigned int size1,\n"
600 " __global float * result) \n"
601 "{ \n"
602 " //parallel reduction on global memory (make sure get_global_size(0) is a power of 2)\n"
603 " for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2)\n"
604 " {\n"
605 " if (get_global_id(0) < stride)\n"
606 " vec1[get_global_id(0)+start1] += vec1[get_global_id(0)+start1+stride];\n"
607 " barrier(CLK_GLOBAL_MEM_FENCE);\n"
608 " }\n"
609 " \n"
610 " if (get_global_id(0) == 0)\n"
611 " *result = vec1[0]; \n"
612 "}\n"
613 ; //vector_align1_sum
614 
615 const char * const vector_align1_cpu_mul_add =
616 "__kernel void cpu_mul_add(\n"
617 " __global const float * vec1,\n"
618 " unsigned int start1,\n"
619 " unsigned int size1,\n"
620 " float factor,\n"
621 " __global const float * vec2,\n"
622 " unsigned int start2,\n"
623 " unsigned int size2,\n"
624 " __global float * result,\n"
625 " unsigned int start3,\n"
626 " unsigned int size3\n"
627 " ) \n"
628 "{ \n"
629 " for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
630 " result[i+start3] = vec1[i+start1] * factor + vec2[i+start2];\n"
631 "}\n"
632 ; //vector_align1_cpu_mul_add
633 
634 const char * const vector_align1_cpu_mult =
635 "__kernel void cpu_mult(\n"
636 " __global const float * vec,\n"
637 " unsigned int start1,\n"
638 " unsigned int size1,\n"
639 " float factor, \n"
640 " __global float * result,\n"
641 " unsigned int start2,\n"
642 " unsigned int size2) \n"
643 "{ \n"
644 " for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
645 " result[i+start2] = vec[i+start1] * factor;\n"
646 "}\n"
647 ; //vector_align1_cpu_mult
648 
650 "__kernel void cpu_inplace_mul_add(\n"
651 " __global float * vec1,\n"
652 " unsigned int start1,\n"
653 " unsigned int size1,\n"
654 " __global const float * vec2,\n"
655 " unsigned int start2,\n"
656 " unsigned int size2,\n"
657 " float factor) \n"
658 "{ \n"
659 " for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
660 " vec1[i+start1] += vec2[i+start2] * factor;\n"
661 "}\n"
662 ; //vector_align1_cpu_inplace_mul_add
663 
664 const char * const vector_align1_cpu_inplace_mult =
665 "__kernel void cpu_inplace_mult(\n"
666 " __global float * vec,\n"
667 " unsigned int start1,\n"
668 " unsigned int size1,\n"
669 " float factor) \n"
670 "{ \n"
671 " for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
672 " vec[i+start1] *= factor;\n"
673 "}\n"
674 ; //vector_align1_cpu_inplace_mult
675 
676 const char * const vector_align1_plane_rotation =
677 "////// plane rotation: (x,y) <- (\alpha x + \beta y, -\beta x + \alpha y)\n"
678 "__kernel void plane_rotation(\n"
679 " __global float * vec1,\n"
680 " unsigned int start1,\n"
681 " unsigned int size1,\n"
682 " __global float * vec2, \n"
683 " unsigned int start2,\n"
684 " unsigned int size2,\n"
685 " float alpha,\n"
686 " float beta) \n"
687 "{ \n"
688 " float tmp1 = 0;\n"
689 " float tmp2 = 0;\n"
690 " for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
691 " {\n"
692 " tmp1 = vec1[i+start1];\n"
693 " tmp2 = vec2[i+start2];\n"
694 " \n"
695 " vec1[i+start1] = alpha * tmp1 + beta * tmp2;\n"
696 " vec2[i+start2] = alpha * tmp2 - beta * tmp1;\n"
697 " }\n"
698 "}\n"
699 ; //vector_align1_plane_rotation
700 
701 const char * const vector_align1_clear =
702 "__kernel void clear(\n"
703 " __global float * vec,\n"
704 " unsigned int start1,\n"
705 " unsigned int size1) \n"
706 "{ \n"
707 " for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
708 " vec[i+start1] = 0;\n"
709 "}\n"
710 ; //vector_align1_clear
711 
712 const char * const vector_align1_inplace_mult =
713 "__kernel void inplace_mult(\n"
714 " __global float * vec,\n"
715 " unsigned int start1,\n"
716 " unsigned int size1,\n"
717 " __global const float * fac) \n"
718 "{ \n"
719 " float factor = *fac;\n"
720 " for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
721 " vec[i+start1] *= factor;\n"
722 "}\n"
723 ; //vector_align1_inplace_mult
724 
725 const char * const vector_align1_inplace_mul_sub =
726 "__kernel void inplace_mul_sub(\n"
727 " __global float * vec1,\n"
728 " unsigned int start1,\n"
729 " unsigned int size1,\n"
730 " __global const float * vec2,\n"
731 " unsigned int start2,\n"
732 " unsigned int size2,\n"
733 " __global const float * fac) //CPU variant is mapped to mult_add\n"
734 "{ \n"
735 " float factor = *fac;\n"
736 " for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
737 " vec1[i+start1] -= vec2[i+start2] * factor;\n"
738 "}\n"
739 ; //vector_align1_inplace_mul_sub
740 
741 const char * const vector_align1_norm_2 =
742 "//helper:\n"
743 "void helper_norm2_parallel_reduction( __local float * tmp_buffer )\n"
744 "{\n"
745 " for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2)\n"
746 " {\n"
747 " barrier(CLK_LOCAL_MEM_FENCE);\n"
748 " if (get_global_id(0) < stride)\n"
749 " tmp_buffer[get_global_id(0)] += tmp_buffer[get_global_id(0)+stride];\n"
750 " }\n"
751 "}\n"
752 "\n"
753 "////// norm_2\n"
754 "float impl_norm_2(\n"
755 " __global const float * vec,\n"
756 " unsigned int start_index,\n"
757 " unsigned int end_index,\n"
758 " __local float * tmp_buffer)\n"
759 "{\n"
760 " float tmp = 0;\n"
761 " float vec_entry = 0;\n"
762 " for (unsigned int i = start_index + get_local_id(0); i < end_index; i += get_local_size(0))\n"
763 " {\n"
764 " vec_entry = vec[i];\n"
765 " tmp += vec_entry * vec_entry;\n"
766 " }\n"
767 " tmp_buffer[get_local_id(0)] = tmp;\n"
768 " \n"
769 " helper_norm2_parallel_reduction(tmp_buffer);\n"
770 " \n"
771 " return tmp_buffer[0];\n"
772 "};\n"
773 "\n"
774 "__kernel void norm_2(\n"
775 " __global const float * vec,\n"
776 " unsigned int start1,\n"
777 " unsigned int size1,\n"
778 " __local float * tmp_buffer,\n"
779 " global float * group_buffer)\n"
780 "{\n"
781 " float tmp = impl_norm_2(vec,\n"
782 " ( get_group_id(0) * size1) / get_num_groups(0) + start1,\n"
783 " ((get_group_id(0) + 1) * size1) / get_num_groups(0) + start1,\n"
784 " tmp_buffer);\n"
785 " \n"
786 " if (get_local_id(0) == 0)\n"
787 " group_buffer[get_group_id(0)] = tmp; \n"
788 "}\n"
789 "\n"
790 ; //vector_align1_norm_2
791 
792 const char * const vector_align1_add =
793 "__kernel void add(\n"
794 " __global const float * vec1,\n"
795 " unsigned int start1,\n"
796 " unsigned int size1,\n"
797 " __global const float * vec2,\n"
798 " unsigned int start2,\n"
799 " unsigned int size2,\n"
800 " __global float * result,\n"
801 " unsigned int start3,\n"
802 " unsigned int size3) \n"
803 "{ \n"
804 " for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
805 " result[i+start3] = vec1[i+start1] + vec2[i+start2];\n"
806 "}\n"
807 ; //vector_align1_add
808 
809 const char * const vector_align16_inplace_divide =
810 "__kernel void inplace_divide(\n"
811 " __global float16 * vec,\n"
812 " unsigned int start1,\n"
813 " unsigned int size1,\n"
814 " __global const float * fac) //note: CPU variant is mapped to prod_scalar\n"
815 "{ \n"
816 " float factor = *fac;\n"
817 " unsigned int i_end = size1/16;\n"
818 " for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
819 " vec[i+start1] /= factor;\n"
820 "}\n"
821 ; //vector_align16_inplace_divide
822 
823 const char * const vector_align16_sub =
824 "__kernel void sub(\n"
825 " __global const float16 * vec1,\n"
826 " unsigned int start1,\n"
827 " unsigned int size1,\n"
828 " __global const float16 * vec2, \n"
829 " unsigned int start2,\n"
830 " unsigned int size2,\n"
831 " __global float16 * result,\n"
832 " unsigned int start3,\n"
833 " unsigned int size3)\n"
834 "{ \n"
835 " unsigned int i_end = size1 / 16;\n"
836 " for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
837 " result[i+start3] = vec1[i+start1] - vec2[i+start2];\n"
838 "}\n"
839 ; //vector_align16_sub
840 
841 const char * const vector_align16_cpu_inplace_mul =
842 "\n"
843 "__kernel void cpu_inplace_mult(\n"
844 " __global float16 * vec,\n"
845 " unsigned int start1,\n"
846 " unsigned int size1,\n"
847 " float factor) \n"
848 "{ \n"
849 " unsigned int i_end = size1/16;\n"
850 " for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
851 " vec[i+start1] *= factor;\n"
852 "}\n"
853 "\n"
854 ; //vector_align16_cpu_inplace_mul
855 
856 const char * const vector_align16_inplace_sub =
857 "__kernel void inplace_sub(\n"
858 " __global float16 * vec1,\n"
859 " unsigned int start1,\n"
860 " unsigned int size1,\n"
861 " __global const float16 * vec2,\n"
862 " unsigned int start2,\n"
863 " unsigned int size2) \n"
864 "{ \n"
865 " unsigned int i_end = size1/16;\n"
866 " for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
867 " vec1[i+start1] -= vec2[i+start2];\n"
868 "}\n"
869 ; //vector_align16_inplace_sub
870 
871 const char * const vector_align16_mult =
872 "__kernel void mult(\n"
873 " __global const float16 * vec,\n"
874 " unsigned int start1,\n"
875 " unsigned int size1,\n"
876 " __global const float * fac, \n"
877 " __global float16 * result,\n"
878 " unsigned int start2,\n"
879 " unsigned int size2) \n"
880 "{ \n"
881 " float factor = *fac;\n"
882 " unsigned int i_end = size1/16;\n"
883 " for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
884 " result[i+start2] = vec[i+start1] * factor;\n"
885 "}\n"
886 ; //vector_align16_mult
887 
888 const char * const vector_align16_divide =
889 "//Note: 'div' cannot be used because of complaints by the jit-compiler\n"
890 "__kernel void divide(\n"
891 " __global const float16 * vec,\n"
892 " unsigned int start1,\n"
893 " unsigned int size1,\n"
894 " __global const float * fac, //note: CPU variant is mapped to prod_scalar\n"
895 " __global float16 * result,\n"
896 " unsigned int start2,\n"
897 " unsigned int size2) \n"
898 "{ \n"
899 " float factor = *fac;\n"
900 " unsigned int i_end = size/16;\n"
901 " for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
902 " result[i+start2] = vec[i+start1] / factor;\n"
903 "}\n"
904 ; //vector_align16_divide
905 
906 const char * const vector_align16_inplace_add =
907 "__kernel void inplace_add(\n"
908 " __global float16 * vec1,\n"
909 " unsigned int start1,\n"
910 " unsigned int size1,\n"
911 " __global const float16 * vec2,\n"
912 " unsigned int start2,\n"
913 " unsigned int size2) \n"
914 "{ \n"
915 " unsigned int i_end = size1/16;\n"
916 " for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
917 " vec1[i+start1] += vec2[i+start2];\n"
918 "}\n"
919 ; //vector_align16_inplace_add
920 
921 const char * const vector_align16_cpu_mult =
922 "__kernel void cpu_mult(\n"
923 " __global const float16 * vec,\n"
924 " unsigned int start1,\n"
925 " unsigned int size1,\n"
926 " float factor, \n"
927 " __global float16 * result,\n"
928 " unsigned int start2,\n"
929 " unsigned int size2) \n"
930 "{ \n"
931 " unsigned int i_end = size1/16;\n"
932 " for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
933 " result[i+start2] = vec[i+start1] * factor;\n"
934 "}\n"
935 ; //vector_align16_cpu_mult
936 
937 const char * const vector_align16_inplace_mult =
938 "__kernel void inplace_mult(\n"
939 " __global float16 * vec,\n"
940 " unsigned int start1,\n"
941 " unsigned int size1,\n"
942 " __global const float * fac) \n"
943 "{ \n"
944 " float factor = *fac;\n"
945 " unsigned int i_end = size1/16;\n"
946 " for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
947 " vec[i+start1] *= factor;\n"
948 "}\n"
949 ; //vector_align16_inplace_mult
950 
951 const char * const vector_align16_add =
952 "__kernel void add(\n"
953 " __global const float16 * vec1,\n"
954 " unsigned int start1,\n"
955 " unsigned int size1,\n"
956 " __global const float16 * vec2, \n"
957 " unsigned int start2,\n"
958 " unsigned int size2,\n"
959 " __global float16 * result,\n"
960 " unsigned int start3,\n"
961 " unsigned int size3)\n"
962 "{ \n"
963 " unsigned int i_end = size/16;\n"
964 " for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
965 " result[i+start3] = vec1[i+start1] + vec2[i+start2];\n"
966 "}\n"
967 ; //vector_align16_add
968 
969  } //namespace kernels
970  } //namespace linalg
971 } //namespace viennacl
972 #endif
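
The constants above hold raw OpenCL C sources; ViennaCL compiles them at run time through its own OpenCL handling layer. As a rough orientation only, the sketch below shows how one of these strings (here vector_align1_add) could be built and launched with the plain OpenCL host API. The names run_vector_add, ctx, dev, queue, v1, v2 and result are hypothetical, the include path is inferred from the include guard, a ready-made context, queue and device buffers of 'size' floats each are assumed, and error checking is omitted.

#include <CL/cl.h>
#include "viennacl/linalg/kernels/vector_source.h"

// Hypothetical helper: computes result = v1 + v2 using the embedded 'add' kernel.
// Assumes ctx, dev, queue and the three cl_mem buffers (each 'size' floats) already exist.
void run_vector_add(cl_context ctx, cl_device_id dev, cl_command_queue queue,
                    cl_mem v1, cl_mem v2, cl_mem result, cl_uint size)
{
  const char * src = viennacl::linalg::kernels::vector_align1_add;

  // JIT-compile the source string and extract the kernel it defines:
  cl_int err;
  cl_program prog = clCreateProgramWithSource(ctx, 1, &src, NULL, &err);
  clBuildProgram(prog, 1, &dev, NULL, NULL, NULL);
  cl_kernel k = clCreateKernel(prog, "add", &err);

  // Argument order follows the kernel signature:
  // (vec1, start1, size1, vec2, start2, size2, result, start3, size3)
  cl_uint start = 0;
  clSetKernelArg(k, 0, sizeof(cl_mem),  &v1);
  clSetKernelArg(k, 1, sizeof(cl_uint), &start);
  clSetKernelArg(k, 2, sizeof(cl_uint), &size);
  clSetKernelArg(k, 3, sizeof(cl_mem),  &v2);
  clSetKernelArg(k, 4, sizeof(cl_uint), &start);
  clSetKernelArg(k, 5, sizeof(cl_uint), &size);
  clSetKernelArg(k, 6, sizeof(cl_mem),  &result);
  clSetKernelArg(k, 7, sizeof(cl_uint), &start);
  clSetKernelArg(k, 8, sizeof(cl_uint), &size);

  // The kernels use a grid-stride loop, so the global size need not match 'size':
  size_t global = 16384, local = 128;
  clEnqueueNDRangeKernel(queue, k, 1, NULL, &global, &local, 0, NULL, NULL);
  clFinish(queue);

  clReleaseKernel(k);
  clReleaseProgram(prog);
}

Note that the reduction kernels (inner_prod, norm_1, norm_2, norm_inf) only write one partial result per work group into group_buffer, so a second pass over group_buffer is still needed to obtain the final scalar.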