ViennaCL - The Vienna Computing Library
1.2.0
Main Page
Namespaces
Data Structures
Files
File List
Globals
viennacl
linalg
kernels
vector_source.h
Go to the documentation of this file.
1
#ifndef VIENNACL_LINALG_KERNELS_VECTOR_SOURCE_HPP_
2
#define VIENNACL_LINALG_KERNELS_VECTOR_SOURCE_HPP_
3
//Automatically generated file from auxiliary-directory, do not edit manually!
4
namespace
viennacl
5
{
6
namespace
linalg
7
{
8
namespace
kernels
9
{
10
const
char
*
const
vector_align4_inplace_div_sub
=
11
"__kernel void inplace_div_sub(\n"
12
" __global float4 * vec1,\n"
13
" unsigned int start1,\n"
14
" unsigned int size1,\n"
15
" __global const float4 * vec2,\n"
16
" unsigned int start2,\n"
17
" unsigned int size2,\n"
18
" __global const float * fac) //CPU variant is mapped to mult_add\n"
19
"{ \n"
20
" float factor = *fac;\n"
21
" unsigned int i_end = size1/4;\n"
22
" for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
23
" vec1[i+start1] -= vec2[i+start2] / factor;\n"
24
"}\n"
25
;
//vector_align4_inplace_div_sub
26
27
const
char
*
const
vector_align4_mul_add
=
28
"__kernel void mul_add(\n"
29
" __global const float4 * vec1,\n"
30
" unsigned int start1,\n"
31
" unsigned int size1,\n"
32
" __global const float * fac,\n"
33
" __global const float4 * vec2,\n"
34
" unsigned int start2,\n"
35
" unsigned int size2,\n"
36
" __global float4 * result,\n"
37
" unsigned int start3,\n"
38
" unsigned int size3) \n"
39
"{ \n"
40
" float factor = *fac;\n"
41
" unsigned int i_end = size1/4;\n"
42
" for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
43
" result[i+start3] = vec1[i+start1] * factor + vec2[i+start2];\n"
44
"}\n"
45
;
//vector_align4_mul_add
46
47
const
char
*
const
vector_align4_inplace_mul_add
=
48
"__kernel void inplace_mul_add(\n"
49
" __global float4 * vec1,\n"
50
" unsigned int start1,\n"
51
" unsigned int size1,\n"
52
" __global const float4 * vec2,\n"
53
" unsigned int start2,\n"
54
" unsigned int size2,\n"
55
" __global const float * fac) \n"
56
"{ \n"
57
" float factor = *fac;\n"
58
" unsigned int size_div_4 = size1/4;\n"
59
" for (unsigned int i = get_global_id(0); i < size_div_4; i += get_global_size(0))\n"
60
" vec1[i+start1] += vec2[i+start2] * factor;\n"
61
"}\n"
62
;
//vector_align4_inplace_mul_add
63
64
const
char
*
const
vector_align4_inplace_div_add
=
65
"__kernel void inplace_div_add(\n"
66
" __global float4 * vec1,\n"
67
" unsigned int start1,\n"
68
" unsigned int size1,\n"
69
" __global const float4 * vec2,\n"
70
" unsigned int start2,\n"
71
" unsigned int size2,\n"
72
" __global const float * fac) //CPU variant is mapped to mult_add\n"
73
"{ \n"
74
" float factor = *fac;\n"
75
" unsigned int i_end = size1 / 4;\n"
76
" for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
77
" vec1[i+start1] -= vec2[i+start2] / factor;\n"
78
"}\n"
79
;
//vector_align4_inplace_div_add
80
81
const
char
*
const
vector_align4_cpu_mul_add
=
82
"__kernel void cpu_mul_add(\n"
83
" __global const float4 * vec1,\n"
84
" unsigned int start1,\n"
85
" unsigned int size1,\n"
86
" float factor,\n"
87
" __global const float4 * vec2,\n"
88
" unsigned int start2,\n"
89
" unsigned int size2,\n"
90
" __global float4 * result,\n"
91
" unsigned int start3,\n"
92
" unsigned int size3) \n"
93
"{ \n"
94
" unsigned int i_end = size1/4;\n"
95
" for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
96
" result[i+start3] = vec1[i+start1] * factor + vec2[i+start2];\n"
97
"}\n"
98
;
//vector_align4_cpu_mul_add
99
100
const
char
*
const
vector_align4_cpu_inplace_mul_add
=
101
"__kernel void cpu_inplace_mul_add(\n"
102
" __global float4 * vec1,\n"
103
" unsigned int start1,\n"
104
" unsigned int size1,\n"
105
" __global const float4 * vec2,\n"
106
" unsigned int start2,\n"
107
" unsigned int size2,\n"
108
" float factor) \n"
109
"{ \n"
110
" unsigned int i_end = size1/4;\n"
111
" for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
112
" vec1[i+start1] += vec2[i+start2] * factor;\n"
113
"}\n"
114
;
//vector_align4_cpu_inplace_mul_add
115
116
const
char
*
const
vector_align4_inplace_mul_sub
=
117
"__kernel void inplace_mul_sub(\n"
118
" __global float4 * vec1,\n"
119
" unsigned int start1,\n"
120
" unsigned int size1,\n"
121
" __global const float4 * vec2,\n"
122
" unsigned int start2,\n"
123
" unsigned int size2,\n"
124
" __global const float * fac) //CPU variant is mapped to mult_add\n"
125
"{ \n"
126
" float factor = *fac;\n"
127
" unsigned int i_end = size/4;\n"
128
" for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
129
" vec1[i+start1] -= vec2[i+start2] * factor;\n"
130
"}\n"
131
;
//vector_align4_inplace_mul_sub
132
133
const
char
*
const
vector_align1_inplace_divide
=
134
"__kernel void inplace_divide(\n"
135
" __global float * vec,\n"
136
" unsigned int start1,\n"
137
" unsigned int size1,\n"
138
" __global const float * fac) //note: CPU variant is mapped to prod_scalar\n"
139
"{ \n"
140
" float factor = *fac;\n"
141
" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
142
" vec[i+start1] /= factor;\n"
143
"}\n"
144
;
//vector_align1_inplace_divide
145
146
const
char
*
const
vector_align1_inplace_div_sub
=
147
"///// divide substract:\n"
148
"__kernel void inplace_div_sub(\n"
149
" __global float * vec1,\n"
150
" unsigned int start1,\n"
151
" unsigned int size1,\n"
152
" __global const float * vec2,\n"
153
" unsigned int start2,\n"
154
" unsigned int size2,\n"
155
" __global const float * fac) //CPU variant is mapped to mult_add\n"
156
"{ \n"
157
" float factor = *fac;\n"
158
" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
159
" vec1[i+start1] -= vec2[i+start2] / factor;\n"
160
"}\n"
161
;
//vector_align1_inplace_div_sub
162
163
const
char
*
const
vector_align1_vmax
=
164
"__kernel void vmax(\n"
165
" __global float * vec1,\n"
166
" unsigned int start1,\n"
167
" unsigned int size1,\n"
168
" __global float * result) \n"
169
"{ \n"
170
" //parallel reduction on global memory (make sure that size is a power of 2)\n"
171
" for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2)\n"
172
" {\n"
173
" if (get_global_id(0) < stride)\n"
174
" vec1[get_global_id(0)+start1] = fmax(vec1[get_global_id(0)+start1+stride], vec1[get_global_id(0)+start1]);\n"
175
" barrier(CLK_GLOBAL_MEM_FENCE);\n"
176
" }\n"
177
" \n"
178
" if (get_global_id(0) == 0)\n"
179
" *result = vec1[start1];\n"
180
"}\n"
181
;
//vector_align1_vmax
182
183
const
char
*
const
vector_align1_index_norm_inf
=
184
"//index_norm_inf:\n"
185
"unsigned int float_vector1_index_norm_inf_impl(\n"
186
" __global const float * vec,\n"
187
" unsigned int start1,\n"
188
" unsigned int size1,\n"
189
" __local float * float_buffer,\n"
190
" __local unsigned int * index_buffer)\n"
191
"{\n"
192
" //step 1: fill buffer:\n"
193
" float cur_max = 0.0f;\n"
194
" float tmp;\n"
195
" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
196
" {\n"
197
" tmp = fabs(vec[i+start1]);\n"
198
" if (cur_max < tmp)\n"
199
" {\n"
200
" float_buffer[get_global_id(0)] = tmp;\n"
201
" index_buffer[get_global_id(0)] = i;\n"
202
" cur_max = tmp;\n"
203
" }\n"
204
" }\n"
205
" \n"
206
" //step 2: parallel reduction:\n"
207
" for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2)\n"
208
" {\n"
209
" barrier(CLK_LOCAL_MEM_FENCE);\n"
210
" if (get_global_id(0) < stride)\n"
211
" {\n"
212
" //find the first occurring index\n"
213
" if (float_buffer[get_global_id(0)] < float_buffer[get_global_id(0)+stride])\n"
214
" {\n"
215
" index_buffer[get_global_id(0)] = index_buffer[get_global_id(0)+stride];\n"
216
" float_buffer[get_global_id(0)] = float_buffer[get_global_id(0)+stride];\n"
217
" }\n"
218
" \n"
219
" //index_buffer[get_global_id(0)] = float_buffer[get_global_id(0)] < float_buffer[get_global_id(0)+stride] ? index_buffer[get_global_id(0)+stride] : index_buffer[get_global_id(0)];\n"
220
" //float_buffer[get_global_id(0)] = max(float_buffer[get_global_id(0)], float_buffer[get_global_id(0)+stride]);\n"
221
" }\n"
222
" }\n"
223
" \n"
224
" return index_buffer[0];\n"
225
"}\n"
226
"\n"
227
"__kernel void index_norm_inf(\n"
228
" __global float * vec,\n"
229
" unsigned int start1,\n"
230
" unsigned int size1,\n"
231
" __local float * float_buffer,\n"
232
" __local unsigned int * index_buffer,\n"
233
" global unsigned int * result) \n"
234
"{ \n"
235
" unsigned int tmp = float_vector1_index_norm_inf_impl(vec, start1, size1, float_buffer, index_buffer);\n"
236
" if (get_global_id(0) == 0) *result = tmp;\n"
237
"}\n"
238
"\n"
239
"\n"
240
;
//vector_align1_index_norm_inf
241
242
const
char
*
const
vector_align1_sub
=
243
"__kernel void sub(\n"
244
" __global const float * vec1,\n"
245
" unsigned int start1,\n"
246
" unsigned int size1,\n"
247
" __global const float * vec2, \n"
248
" unsigned int start2,\n"
249
" unsigned int size2,\n"
250
" __global float * result,\n"
251
" unsigned int start3,\n"
252
" unsigned int size3)\n"
253
"{ \n"
254
" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
255
" result[i+start3] = vec1[i+start1] - vec2[i+start2];\n"
256
"}\n"
257
;
//vector_align1_sub
258
259
const
char
*
const
vector_align1_mul_add
=
260
"__kernel void mul_add(\n"
261
" __global const float * vec1,\n"
262
" unsigned int start1,\n"
263
" unsigned int size1,\n"
264
" __global const float * fac,\n"
265
" __global const float * vec2,\n"
266
" unsigned int start2,\n"
267
" unsigned int size2,\n"
268
" __global float * result,\n"
269
" unsigned int start3,\n"
270
" unsigned int size3\n"
271
" ) \n"
272
"{ \n"
273
" float factor = *fac;\n"
274
" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
275
" result[i+start3] = vec1[i+start1] * factor + vec2[i+start2];\n"
276
"}\n"
277
;
//vector_align1_mul_add
278
279
const
char
*
const
vector_align1_inplace_sub
=
280
"__kernel void inplace_sub(\n"
281
" __global float * vec1,\n"
282
" unsigned int start1,\n"
283
" unsigned int size1,\n"
284
" __global const float * vec2,\n"
285
" unsigned int start2,\n"
286
" unsigned int size2) \n"
287
"{ \n"
288
" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
289
" vec1[i+start1] -= vec2[i+start2];\n"
290
"}\n"
291
;
//vector_align1_inplace_sub
292
293
const
char
*
const
vector_align1_inner_prod
=
294
"//helper:\n"
295
"void helper_inner_prod_parallel_reduction( __local float * tmp_buffer )\n"
296
"{\n"
297
" for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2)\n"
298
" {\n"
299
" barrier(CLK_LOCAL_MEM_FENCE);\n"
300
" if (get_local_id(0) < stride)\n"
301
" tmp_buffer[get_local_id(0)] += tmp_buffer[get_local_id(0)+stride];\n"
302
" }\n"
303
"}\n"
304
"//////// inner products:\n"
305
"float impl_inner_prod(\n"
306
" __global const float * vec1,\n"
307
" unsigned int start1,\n"
308
" unsigned int size1,\n"
309
" __global const float * vec2,\n"
310
" unsigned int start2,\n"
311
" unsigned int size2,\n"
312
" __local float * tmp_buffer)\n"
313
"{\n"
314
" float tmp = 0;\n"
315
" for (unsigned int i = get_local_id(0); i < size1; i += get_local_size(0))\n"
316
" tmp += vec1[i+start1] * vec2[i+start2];\n"
317
" tmp_buffer[get_local_id(0)] = tmp;\n"
318
" \n"
319
" helper_inner_prod_parallel_reduction(tmp_buffer);\n"
320
" \n"
321
" return tmp_buffer[0];\n"
322
"}\n"
323
"__kernel void inner_prod(\n"
324
" __global const float * vec1,\n"
325
" unsigned int start1,\n"
326
" unsigned int size1,\n"
327
" __global const float * vec2,\n"
328
" unsigned int start2,\n"
329
" unsigned int size2,\n"
330
" __local float * tmp_buffer,\n"
331
" global float * group_buffer)\n"
332
"{\n"
333
" float tmp = impl_inner_prod(vec1,\n"
334
" ( get_group_id(0) * size1) / get_num_groups(0) + start1,\n"
335
" ((get_group_id(0) + 1) * size1) / get_num_groups(0) - ( get_group_id(0) * size1) / get_num_groups(0),\n"
336
" vec2,\n"
337
" ( get_group_id(0) * size2) / get_num_groups(0) + start2,\n"
338
" ((get_group_id(0) + 1) * size2) / get_num_groups(0) - ( get_group_id(0) * size2) / get_num_groups(0),\n"
339
" tmp_buffer);\n"
340
" \n"
341
" if (get_local_id(0) == 0)\n"
342
" group_buffer[get_group_id(0)] = tmp;\n"
343
" \n"
344
"}\n"
345
;
//vector_align1_inner_prod
346
347
const
char
*
const
vector_align1_mult
=
348
"__kernel void mult(\n"
349
" __global const float * vec,\n"
350
" unsigned int start1,\n"
351
" unsigned int size1,\n"
352
" __global const float * fac, \n"
353
" __global float * result,\n"
354
" unsigned int start3,\n"
355
" unsigned int size3) \n"
356
"{ \n"
357
" float factor = *fac;\n"
358
" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
359
" result[i+start3] = vec[i+start1] * factor;\n"
360
"}\n"
361
;
//vector_align1_mult
362
363
const
char
*
const
vector_align1_diag_precond
=
364
"__kernel void diag_precond(\n"
365
" __global const float * diag_A_inv, \n"
366
" unsigned int start1,\n"
367
" unsigned int size1,\n"
368
" __global float * x, \n"
369
" unsigned int start2,\n"
370
" unsigned int size2) \n"
371
"{ \n"
372
" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
373
" x[i+start2] *= diag_A_inv[i+start1];\n"
374
"}\n"
375
;
//vector_align1_diag_precond
376
377
const
char
*
const
vector_align1_inplace_mul_add
=
378
"__kernel void inplace_mul_add(\n"
379
" __global float * vec1,\n"
380
" unsigned int start1,\n"
381
" unsigned int size1,\n"
382
" __global const float * vec2,\n"
383
" unsigned int start2,\n"
384
" unsigned int size2,\n"
385
" __global const float * fac) \n"
386
"{ \n"
387
" float factor = *fac;\n"
388
" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
389
" vec1[i+start1] += vec2[i+start2] * factor;\n"
390
"}\n"
391
;
//vector_align1_inplace_mul_add
392
393
const
char
*
const
vector_align1_norm_1
=
394
"//helper:\n"
395
"void helper_norm1_parallel_reduction( __local float * tmp_buffer )\n"
396
"{\n"
397
" for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2)\n"
398
" {\n"
399
" barrier(CLK_LOCAL_MEM_FENCE);\n"
400
" if (get_global_id(0) < stride)\n"
401
" tmp_buffer[get_global_id(0)] += tmp_buffer[get_global_id(0)+stride];\n"
402
" }\n"
403
"}\n"
404
"\n"
405
"////// norm_1\n"
406
"float impl_norm_1(\n"
407
" __global const float * vec,\n"
408
" unsigned int start_index,\n"
409
" unsigned int end_index,\n"
410
" __local float * tmp_buffer)\n"
411
"{\n"
412
" float tmp = 0;\n"
413
" for (unsigned int i = start_index + get_local_id(0); i < end_index; i += get_local_size(0))\n"
414
" tmp += fabs(vec[i]);\n"
415
" \n"
416
" tmp_buffer[get_local_id(0)] = tmp;\n"
417
" \n"
418
" helper_norm1_parallel_reduction(tmp_buffer);\n"
419
" \n"
420
" return tmp_buffer[0];\n"
421
"};\n"
422
"\n"
423
"__kernel void norm_1(\n"
424
" __global const float * vec,\n"
425
" unsigned int start1,\n"
426
" unsigned int size1,\n"
427
" __local float * tmp_buffer,\n"
428
" global float * group_buffer)\n"
429
"{\n"
430
" float tmp = impl_norm_1(vec,\n"
431
" ( get_group_id(0) * size1) / get_num_groups(0) + start1,\n"
432
" ((get_group_id(0) + 1) * size1) / get_num_groups(0) + start1,\n"
433
" tmp_buffer);\n"
434
" \n"
435
" if (get_local_id(0) == 0)\n"
436
" group_buffer[get_group_id(0)] = tmp; \n"
437
"}\n"
438
"\n"
439
;
//vector_align1_norm_1
440
441
const
char
*
const
vector_align1_divide
=
442
"// Note: name 'div' is not allowed by the jit-compiler\n"
443
"__kernel void divide(\n"
444
" __global const float * vec,\n"
445
" unsigned int start1,\n"
446
" unsigned int size1,\n"
447
" __global const float * fac, //note: CPU variant is mapped to prod_scalar\n"
448
" __global float * result,\n"
449
" unsigned int start3,\n"
450
" unsigned int size3) \n"
451
"{ \n"
452
" float factor = *fac;\n"
453
" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
454
" result[i+start3] = vec[i+start1] / factor;\n"
455
"}\n"
456
;
//vector_align1_divide
457
458
const
char
*
const
vector_align1_swap
=
459
"////// swap:\n"
460
"__kernel void swap(\n"
461
" __global float * vec1,\n"
462
" unsigned int start1,\n"
463
" unsigned int size1,\n"
464
" __global float * vec2,\n"
465
" unsigned int start2,\n"
466
" unsigned int size2\n"
467
" ) \n"
468
"{ \n"
469
" float tmp;\n"
470
" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
471
" {\n"
472
" tmp = vec2[i+start2];\n"
473
" vec2[i+start2] = vec1[i+start1];\n"
474
" vec1[i+start1] = tmp;\n"
475
" }\n"
476
"}\n"
477
" \n"
478
;
//vector_align1_swap
479
480
const
char
*
const
vector_align1_norm_inf
=
481
"\n"
482
"////// norm_inf\n"
483
"float impl_norm_inf(\n"
484
" __global const float * vec,\n"
485
" unsigned int start_index,\n"
486
" unsigned int end_index,\n"
487
" __local float * tmp_buffer)\n"
488
"{\n"
489
" float tmp = 0;\n"
490
" for (unsigned int i = start_index + get_local_id(0); i < end_index; i += get_local_size(0))\n"
491
" tmp = fmax(fabs(vec[i]), tmp);\n"
492
" tmp_buffer[get_local_id(0)] = tmp;\n"
493
" \n"
494
" //step 2: parallel reduction:\n"
495
" for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2)\n"
496
" {\n"
497
" barrier(CLK_LOCAL_MEM_FENCE);\n"
498
" if (get_global_id(0) < stride)\n"
499
" tmp_buffer[get_global_id(0)] = fmax(tmp_buffer[get_global_id(0)], tmp_buffer[get_global_id(0)+stride]);\n"
500
" }\n"
501
" \n"
502
" return tmp_buffer[0];\n"
503
"}\n"
504
"\n"
505
"__kernel void norm_inf(\n"
506
" __global const float * vec,\n"
507
" unsigned int start1,\n"
508
" unsigned int size1,\n"
509
" __local float * tmp_buffer,\n"
510
" global float * group_buffer)\n"
511
"{\n"
512
" float tmp = impl_norm_inf(vec,\n"
513
" ( get_group_id(0) * size1) / get_num_groups(0) + start1,\n"
514
" ((get_group_id(0) + 1) * size1) / get_num_groups(0) + start1,\n"
515
" tmp_buffer);\n"
516
" \n"
517
" if (get_local_id(0) == 0)\n"
518
" group_buffer[get_group_id(0)] = tmp; \n"
519
"}\n"
520
;
//vector_align1_norm_inf
521
522
const
char
*
const
vector_align1_inplace_div_add
=
523
"///// divide add:\n"
524
"__kernel void inplace_div_add(\n"
525
" __global float * vec1,\n"
526
" unsigned int start1,\n"
527
" unsigned int size1,\n"
528
" __global const float * vec2,\n"
529
" unsigned int start2,\n"
530
" unsigned int size2,\n"
531
" __global const float * fac) //CPU variant is mapped to mult_add\n"
532
"{ \n"
533
" float factor = *fac;\n"
534
" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
535
" vec1[i+start1] -= vec2[i+start2] / factor;\n"
536
"}\n"
537
;
//vector_align1_inplace_div_add
538
539
const
char
*
const
vector_align1_sqrt_sum
=
540
"__kernel void sqrt_sum(\n"
541
" __global float * vec1,\n"
542
" unsigned int start1,\n"
543
" unsigned int size1,\n"
544
" __global float * result) \n"
545
"{ \n"
546
" //parallel reduction on global memory: (make sure get_global_size(0) is a power of 2)\n"
547
" for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2)\n"
548
" {\n"
549
" if (get_global_id(0) < stride)\n"
550
" vec1[get_global_id(0)+start1] += vec1[get_global_id(0)+start1+stride];\n"
551
" barrier(CLK_GLOBAL_MEM_FENCE);\n"
552
" }\n"
553
" \n"
554
" if (get_global_id(0) == 0)\n"
555
" *result = sqrt(vec1[start1]);\n"
556
" \n"
557
"}\n"
558
;
//vector_align1_sqrt_sum
559
560
const
char
*
const
vector_align1_inplace_add
=
561
"__kernel void inplace_add(\n"
562
" __global float * vec1,\n"
563
" unsigned int start1,\n"
564
" unsigned int size1,\n"
565
" __global const float * vec2,\n"
566
" unsigned int start2,\n"
567
" unsigned int size2) \n"
568
"{ \n"
569
" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
570
" vec1[i+start1] += vec2[i+start2];\n"
571
"}\n"
572
;
//vector_align1_inplace_add
573
574
const
char
*
const
vector_align1_mul_sub
=
575
"///// multiply subtract:\n"
576
"__kernel void mul_sub(\n"
577
" __global const float * vec1,\n"
578
" unsigned int start1,\n"
579
" unsigned int size1,\n"
580
" __global const float * fac,\n"
581
" __global const float * vec2,\n"
582
" unsigned int start2,\n"
583
" unsigned int size2,\n"
584
" __global float * result,\n"
585
" unsigned int start3,\n"
586
" unsigned int size3\n"
587
" ) \n"
588
"{ \n"
589
" float factor = *fac;\n"
590
" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
591
" result[i+start3] = vec1[i+start1] * factor - vec2[i+start2];\n"
592
"}\n"
593
;
//vector_align1_mul_sub
594
595
const
char
*
const
vector_align1_sum
=
596
"__kernel void sum(\n"
597
" __global float * vec1,\n"
598
" unsigned int start1,\n"
599
" unsigned int size1,\n"
600
" __global float * result) \n"
601
"{ \n"
602
" //parallel reduction on global memory (make sure get_global_size(0) is a power of 2)\n"
603
" for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2)\n"
604
" {\n"
605
" if (get_global_id(0) < stride)\n"
606
" vec1[get_global_id(0)+start1] += vec1[get_global_id(0)+start1+stride];\n"
607
" barrier(CLK_GLOBAL_MEM_FENCE);\n"
608
" }\n"
609
" \n"
610
" if (get_global_id(0) == 0)\n"
611
" *result = vec1[0]; \n"
612
"}\n"
613
;
//vector_align1_sum
614
615
const
char
*
const
vector_align1_cpu_mul_add
=
616
"__kernel void cpu_mul_add(\n"
617
" __global const float * vec1,\n"
618
" unsigned int start1,\n"
619
" unsigned int size1,\n"
620
" float factor,\n"
621
" __global const float * vec2,\n"
622
" unsigned int start2,\n"
623
" unsigned int size2,\n"
624
" __global float * result,\n"
625
" unsigned int start3,\n"
626
" unsigned int size3\n"
627
" ) \n"
628
"{ \n"
629
" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
630
" result[i+start3] = vec1[i+start1] * factor + vec2[i+start2];\n"
631
"}\n"
632
;
//vector_align1_cpu_mul_add
633
634
const
char
*
const
vector_align1_cpu_mult
=
635
"__kernel void cpu_mult(\n"
636
" __global const float * vec,\n"
637
" unsigned int start1,\n"
638
" unsigned int size1,\n"
639
" float factor, \n"
640
" __global float * result,\n"
641
" unsigned int start2,\n"
642
" unsigned int size2) \n"
643
"{ \n"
644
" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
645
" result[i+start2] = vec[i+start1] * factor;\n"
646
"}\n"
647
;
//vector_align1_cpu_mult
648
649
const
char
*
const
vector_align1_cpu_inplace_mul_add
=
650
"__kernel void cpu_inplace_mul_add(\n"
651
" __global float * vec1,\n"
652
" unsigned int start1,\n"
653
" unsigned int size1,\n"
654
" __global const float * vec2,\n"
655
" unsigned int start2,\n"
656
" unsigned int size2,\n"
657
" float factor) \n"
658
"{ \n"
659
" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
660
" vec1[i+start1] += vec2[i+start2] * factor;\n"
661
"}\n"
662
;
//vector_align1_cpu_inplace_mul_add
663
664
const
char
*
const
vector_align1_cpu_inplace_mult
=
665
"__kernel void cpu_inplace_mult(\n"
666
" __global float * vec,\n"
667
" unsigned int start1,\n"
668
" unsigned int size1,\n"
669
" float factor) \n"
670
"{ \n"
671
" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
672
" vec[i+start1] *= factor;\n"
673
"}\n"
674
;
//vector_align1_cpu_inplace_mult
675
676
const
char
*
const
vector_align1_plane_rotation
=
677
"////// plane rotation: (x,y) <- (\alpha x + \beta y, -\beta x + \alpha y)\n"
678
"__kernel void plane_rotation(\n"
679
" __global float * vec1,\n"
680
" unsigned int start1,\n"
681
" unsigned int size1,\n"
682
" __global float * vec2, \n"
683
" unsigned int start2,\n"
684
" unsigned int size2,\n"
685
" float alpha,\n"
686
" float beta) \n"
687
"{ \n"
688
" float tmp1 = 0;\n"
689
" float tmp2 = 0;\n"
690
" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
691
" {\n"
692
" tmp1 = vec1[i+start1];\n"
693
" tmp2 = vec2[i+start2];\n"
694
" \n"
695
" vec1[i+start1] = alpha * tmp1 + beta * tmp2;\n"
696
" vec2[i+start2] = alpha * tmp2 - beta * tmp1;\n"
697
" }\n"
698
"}\n"
699
;
//vector_align1_plane_rotation
700
701
const
char
*
const
vector_align1_clear
=
702
"__kernel void clear(\n"
703
" __global float * vec,\n"
704
" unsigned int start1,\n"
705
" unsigned int size1) \n"
706
"{ \n"
707
" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
708
" vec[i+start1] = 0;\n"
709
"}\n"
710
;
//vector_align1_clear
711
712
const
char
*
const
vector_align1_inplace_mult
=
713
"__kernel void inplace_mult(\n"
714
" __global float * vec,\n"
715
" unsigned int start1,\n"
716
" unsigned int size1,\n"
717
" __global const float * fac) \n"
718
"{ \n"
719
" float factor = *fac;\n"
720
" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
721
" vec[i+start1] *= factor;\n"
722
"}\n"
723
;
//vector_align1_inplace_mult
724
725
const
char
*
const
vector_align1_inplace_mul_sub
=
726
"__kernel void inplace_mul_sub(\n"
727
" __global float * vec1,\n"
728
" unsigned int start1,\n"
729
" unsigned int size1,\n"
730
" __global const float * vec2,\n"
731
" unsigned int start2,\n"
732
" unsigned int size2,\n"
733
" __global const float * fac) //CPU variant is mapped to mult_add\n"
734
"{ \n"
735
" float factor = *fac;\n"
736
" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
737
" vec1[i+start1] -= vec2[i+start2] * factor;\n"
738
"}\n"
739
;
//vector_align1_inplace_mul_sub
740
741
const
char
*
const
vector_align1_norm_2
=
742
"//helper:\n"
743
"void helper_norm2_parallel_reduction( __local float * tmp_buffer )\n"
744
"{\n"
745
" for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2)\n"
746
" {\n"
747
" barrier(CLK_LOCAL_MEM_FENCE);\n"
748
" if (get_global_id(0) < stride)\n"
749
" tmp_buffer[get_global_id(0)] += tmp_buffer[get_global_id(0)+stride];\n"
750
" }\n"
751
"}\n"
752
"\n"
753
"////// norm_2\n"
754
"float impl_norm_2(\n"
755
" __global const float * vec,\n"
756
" unsigned int start_index,\n"
757
" unsigned int end_index,\n"
758
" __local float * tmp_buffer)\n"
759
"{\n"
760
" float tmp = 0;\n"
761
" float vec_entry = 0;\n"
762
" for (unsigned int i = start_index + get_local_id(0); i < end_index; i += get_local_size(0))\n"
763
" {\n"
764
" vec_entry = vec[i];\n"
765
" tmp += vec_entry * vec_entry;\n"
766
" }\n"
767
" tmp_buffer[get_local_id(0)] = tmp;\n"
768
" \n"
769
" helper_norm2_parallel_reduction(tmp_buffer);\n"
770
" \n"
771
" return tmp_buffer[0];\n"
772
"};\n"
773
"\n"
774
"__kernel void norm_2(\n"
775
" __global const float * vec,\n"
776
" unsigned int start1,\n"
777
" unsigned int size1,\n"
778
" __local float * tmp_buffer,\n"
779
" global float * group_buffer)\n"
780
"{\n"
781
" float tmp = impl_norm_2(vec,\n"
782
" ( get_group_id(0) * size1) / get_num_groups(0) + start1,\n"
783
" ((get_group_id(0) + 1) * size1) / get_num_groups(0) + start1,\n"
784
" tmp_buffer);\n"
785
" \n"
786
" if (get_local_id(0) == 0)\n"
787
" group_buffer[get_group_id(0)] = tmp; \n"
788
"}\n"
789
"\n"
790
;
//vector_align1_norm_2
791
792
const
char
*
const
vector_align1_add
=
793
"__kernel void add(\n"
794
" __global const float * vec1,\n"
795
" unsigned int start1,\n"
796
" unsigned int size1,\n"
797
" __global const float * vec2,\n"
798
" unsigned int start2,\n"
799
" unsigned int size2,\n"
800
" __global float * result,\n"
801
" unsigned int start3,\n"
802
" unsigned int size3) \n"
803
"{ \n"
804
" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0))\n"
805
" result[i+start3] = vec1[i+start1] + vec2[i+start2];\n"
806
"}\n"
807
;
//vector_align1_add
808
809
const
char
*
const
vector_align16_inplace_divide
=
810
"__kernel void inplace_divide(\n"
811
" __global float16 * vec,\n"
812
" unsigned int start1,\n"
813
" unsigned int size1,\n"
814
" __global const float * fac) //note: CPU variant is mapped to prod_scalar\n"
815
"{ \n"
816
" float factor = *fac;\n"
817
" unsigned int i_end = size1/16;\n"
818
" for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
819
" vec[i+start1] /= factor;\n"
820
"}\n"
821
;
//vector_align16_inplace_divide
822
823
const
char
*
const
vector_align16_sub
=
824
"__kernel void sub(\n"
825
" __global const float16 * vec1,\n"
826
" unsigned int start1,\n"
827
" unsigned int size1,\n"
828
" __global const float16 * vec2, \n"
829
" unsigned int start2,\n"
830
" unsigned int size2,\n"
831
" __global float16 * result,\n"
832
" unsigned int start3,\n"
833
" unsigned int size3)\n"
834
"{ \n"
835
" unsigned int i_end = size1 / 16;\n"
836
" for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
837
" result[i+start3] = vec1[i+start1] - vec2[i+start2];\n"
838
"}\n"
839
;
//vector_align16_sub
840
841
const
char
*
const
vector_align16_cpu_inplace_mul
=
842
"\n"
843
"__kernel void cpu_inplace_mult(\n"
844
" __global float16 * vec,\n"
845
" unsigned int start1,\n"
846
" unsigned int size1,\n"
847
" float factor) \n"
848
"{ \n"
849
" unsigned int i_end = size1/16;\n"
850
" for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
851
" vec[i+start1] *= factor;\n"
852
"}\n"
853
"\n"
854
;
//vector_align16_cpu_inplace_mul
855
856
const
char
*
const
vector_align16_inplace_sub
=
857
"__kernel void inplace_sub(\n"
858
" __global float16 * vec1,\n"
859
" unsigned int start1,\n"
860
" unsigned int size1,\n"
861
" __global const float16 * vec2,\n"
862
" unsigned int start2,\n"
863
" unsigned int size2) \n"
864
"{ \n"
865
" unsigned int i_end = size1/16;\n"
866
" for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
867
" vec1[i+start1] -= vec2[i+start2];\n"
868
"}\n"
869
;
//vector_align16_inplace_sub
870
871
const
char
*
const
vector_align16_mult
=
872
"__kernel void mult(\n"
873
" __global const float16 * vec,\n"
874
" unsigned int start1,\n"
875
" unsigned int size1,\n"
876
" __global const float * fac, \n"
877
" __global float16 * result,\n"
878
" unsigned int start2,\n"
879
" unsigned int size2) \n"
880
"{ \n"
881
" float factor = *fac;\n"
882
" unsigned int i_end = size1/16;\n"
883
" for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
884
" result[i+start2] = vec[i+start1] * factor;\n"
885
"}\n"
886
;
//vector_align16_mult
887
888
const
char
*
const
vector_align16_divide
=
889
"//Note: 'div' cannot be used because of complaints by the jit-compiler\n"
890
"__kernel void divide(\n"
891
" __global const float16 * vec,\n"
892
" unsigned int start1,\n"
893
" unsigned int size1,\n"
894
" __global const float * fac, //note: CPU variant is mapped to prod_scalar\n"
895
" __global float16 * result,\n"
896
" unsigned int start2,\n"
897
" unsigned int size2) \n"
898
"{ \n"
899
" float factor = *fac;\n"
900
" unsigned int i_end = size/16;\n"
901
" for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
902
" result[i+start2] = vec[i+start1] / factor;\n"
903
"}\n"
904
;
//vector_align16_divide
905
906
const
char
*
const
vector_align16_inplace_add
=
907
"__kernel void inplace_add(\n"
908
" __global float16 * vec1,\n"
909
" unsigned int start1,\n"
910
" unsigned int size1,\n"
911
" __global const float16 * vec2,\n"
912
" unsigned int start2,\n"
913
" unsigned int size2) \n"
914
"{ \n"
915
" unsigned int i_end = size1/16;\n"
916
" for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
917
" vec1[i+start1] += vec2[i+start2];\n"
918
"}\n"
919
;
//vector_align16_inplace_add
920
921
const
char
*
const
vector_align16_cpu_mult
=
922
"__kernel void cpu_mult(\n"
923
" __global const float16 * vec,\n"
924
" unsigned int start1,\n"
925
" unsigned int size1,\n"
926
" float factor, \n"
927
" __global float16 * result,\n"
928
" unsigned int start2,\n"
929
" unsigned int size2) \n"
930
"{ \n"
931
" unsigned int i_end = size1/16;\n"
932
" for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
933
" result[i+start2] = vec[i+start1] * factor;\n"
934
"}\n"
935
;
//vector_align16_cpu_mult
936
937
const
char
*
const
vector_align16_inplace_mult
=
938
"__kernel void inplace_mult(\n"
939
" __global float16 * vec,\n"
940
" unsigned int start1,\n"
941
" unsigned int size1,\n"
942
" __global const float * fac) \n"
943
"{ \n"
944
" float factor = *fac;\n"
945
" unsigned int i_end = size1/16;\n"
946
" for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
947
" vec[i+start1] *= factor;\n"
948
"}\n"
949
;
//vector_align16_inplace_mult
950
951
const
char
*
const
vector_align16_add
=
952
"__kernel void add(\n"
953
" __global const float16 * vec1,\n"
954
" unsigned int start1,\n"
955
" unsigned int size1,\n"
956
" __global const float16 * vec2, \n"
957
" unsigned int start2,\n"
958
" unsigned int size2,\n"
959
" __global float16 * result,\n"
960
" unsigned int start3,\n"
961
" unsigned int size3)\n"
962
"{ \n"
963
" unsigned int i_end = size/16;\n"
964
" for (unsigned int i = get_global_id(0); i < i_end; i += get_global_size(0))\n"
965
" result[i+start3] = vec1[i+start1] + vec2[i+start2];\n"
966
"}\n"
967
;
//vector_align16_add
968
969
}
//namespace kernels
970
}
//namespace linalg
971
}
//namespace viennacl
972
#endif
Generated on Wed Oct 10 2012 09:58:14 for ViennaCL - The Vienna Computing Library by
1.8.1.2