ViennaCL - The Vienna Computing Library
1.2.0
Main Page
Namespaces
Data Structures
Files
File List
Globals
viennacl
linalg
kernels
matrix_row_source.h
Go to the documentation of this file.
1
#ifndef VIENNACL_LINALG_KERNELS_MATRIX_ROW_SOURCE_HPP_
2
#define VIENNACL_LINALG_KERNELS_MATRIX_ROW_SOURCE_HPP_
3
//Automatically generated file from auxiliary-directory, do not edit manually!
4
namespace
viennacl
5
{
6
namespace
linalg
7
{
8
namespace
kernels
9
{
10
/**
 * @brief OpenCL C source of the kernel `inplace_divide`.
 *
 * The kernel loads one scalar from `fac` and divides the first `size`
 * entries of `vec` by it. Each work-item starts at get_global_id(0) and
 * advances by get_global_size(0), so any global size covers all entries.
 */
const char * const matrix_row_align1_inplace_divide =
"__kernel void inplace_divide(\n"
" __global float * vec,\n"
" __global const float * fac, //note: CPU variant is mapped to prod_scalar\n"
" unsigned int size) \n"
"{ \n"
" float factor = *fac;\n"
" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
" vec[i] /= factor;\n"
"}\n"
; //matrix_row_align1_inplace_divide
21
22
/**
 * @brief OpenCL C source of the kernel `trans_lower_triangular_substitute_inplace`.
 *
 * Walks `row` from 0 up to `matrix_rows - 1`. In every step work-item 0
 * divides `vector[row]` by the diagonal entry
 * `matrix[row + row*matrix_internal_cols]`; afterwards all work-items
 * subtract `vector[row] * matrix[row * matrix_internal_cols + elim]` from
 * `vector[elim]` for every `elim > row`. The result overwrites `vector`.
 *
 * `matrix_cols` and `matrix_internal_rows` are accepted but not read.
 *
 * NOTE(review): the steps are separated by barrier(CLK_GLOBAL_MEM_FENCE),
 * which orders work-items of one work-group only; the kernel presumably
 * expects to be launched with a single work-group - confirm in the host code.
 */
const char * const matrix_row_align1_trans_lower_triangular_substitute_inplace =
"__kernel void trans_lower_triangular_substitute_inplace(\n"
" __global const float * matrix,\n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols,\n"
" __global float * vector)\n"
"{\n"
" float temp;\n"
" for (int row = 0; row < matrix_rows; ++row)\n"
" {\n"
" barrier(CLK_GLOBAL_MEM_FENCE);\n"
" if (get_global_id(0) == 0)\n"
" vector[row] /= matrix[row+row*matrix_internal_cols];\n"
" barrier(CLK_GLOBAL_MEM_FENCE);\n"
" temp = vector[row];\n"
" for (int elim = row + get_global_id(0) + 1; elim < matrix_rows; elim += get_global_size(0))\n"
" vector[elim] -= temp * matrix[row * matrix_internal_cols + elim];\n"
" }\n"
"}\n"
; //matrix_row_align1_trans_lower_triangular_substitute_inplace
44
45
/**
 * @brief OpenCL C source of the kernel `trans_unit_upper_triangular_substitute_inplace`.
 *
 * Walks `row` from `matrix_rows - 1` down to 0. In every step all
 * work-items subtract `vector[row] * matrix[row * matrix_internal_cols + elim]`
 * from `vector[elim]` for every `elim < row`. No division by a diagonal
 * entry is performed. The result overwrites `vector`.
 *
 * `matrix_cols` and `matrix_internal_rows` are accepted but not read.
 *
 * NOTE(review): steps are separated by barrier(CLK_GLOBAL_MEM_FENCE), which
 * orders work-items of one work-group only; presumably a single work-group
 * launch is expected - confirm in the host code.
 */
const char * const matrix_row_align1_trans_unit_upper_triangular_substitute_inplace =
"//transposed lower triangular matrix\n"
"__kernel void trans_unit_upper_triangular_substitute_inplace(\n"
" __global const float * matrix, \n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols,\n"
" __global float * vector) \n"
"{ \n"
" float temp; \n"
" for (int row = matrix_rows-1; row > -1; --row) \n"
" { \n"
" barrier(CLK_GLOBAL_MEM_FENCE); \n"
" temp = vector[row]; \n"
" //eliminate column with index 'row' in parallel: \n"
" for (int elim = get_global_id(0); elim < row; elim += get_global_size(0)) \n"
" vector[elim] -= temp * matrix[row * matrix_internal_cols + elim]; \n"
" } \n"
" \n"
"}\n"
; //matrix_row_align1_trans_unit_upper_triangular_substitute_inplace
67
68
/**
 * @brief OpenCL C source of the kernel `sub`.
 *
 * Writes `result[i] = vec1[i] - vec2[i]` for every `i < size`. Each
 * work-item starts at get_global_id(0) and advances by get_global_size(0).
 */
const char * const matrix_row_align1_sub =
"__kernel void sub(\n"
" __global const float * vec1,\n"
" __global const float * vec2, \n"
" __global float * result,\n"
" unsigned int size)\n"
"{ \n"
" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
" result[i] = vec1[i] - vec2[i];\n"
"}\n"
; //matrix_row_align1_sub
79
80
/**
 * @brief OpenCL C source of the kernel `scaled_rank1_update`.
 *
 * Adds `val * vector1[row] * vector2[col]` to
 * `matrix[row * matrix_internal_cols + col]` for every `row < matrix_rows`
 * and `col < matrix_cols`. Rows are distributed over work-items
 * (start get_global_id(0), step get_global_size(0)); each work-item loops
 * over all columns of its rows.
 *
 * `matrix_internal_rows` is accepted but not read.
 */
const char * const matrix_row_align1_scaled_rank1_update =
"__kernel void scaled_rank1_update(\n"
" __global float * matrix,\n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols,\n"
" float val,\n"
" __global const float * vector1, \n"
" __global const float * vector2) \n"
"{ \n"
" float tmp;\n"
" unsigned int offset;\n"
" for (unsigned int row = get_global_id(0); row < matrix_rows; row += get_global_size(0))\n"
" {\n"
" tmp = val * vector1[row];\n"
" offset = row * matrix_internal_cols;\n"
" for (unsigned int col = 0; col < matrix_cols; ++col)\n"
" matrix[offset+col] += tmp * vector2[col];\n"
" }\n"
"}\n"
; //matrix_row_align1_scaled_rank1_update
102
103
/**
 * @brief OpenCL C source of the kernel `inplace_sub`.
 *
 * Subtracts `vec2[i]` from `vec1[i]` for every `i < size`. Each work-item
 * starts at get_global_id(0) and advances by get_global_size(0).
 */
const char * const matrix_row_align1_inplace_sub =
"__kernel void inplace_sub(\n"
" __global float * vec1,\n"
" __global const float * vec2,\n"
" unsigned int size) \n"
"{ \n"
" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
" vec1[i] -= vec2[i];\n"
"}\n"
; //matrix_row_align1_inplace_sub
113
114
/**
 * @brief OpenCL C source of the kernel `lower_triangular_substitute_inplace`.
 *
 * Walks `row` from 0 up to `matrix_rows - 1`. In every step work-item 0
 * divides `vector[row]` by the diagonal entry
 * `matrix[row + row*matrix_internal_cols]`; afterwards all work-items
 * subtract `vector[row] * matrix[elim * matrix_internal_cols + row]` from
 * `vector[elim]` for every `elim > row`. The result overwrites `vector`.
 *
 * `matrix_cols` and `matrix_internal_rows` are accepted but not read.
 *
 * NOTE(review): steps are separated by barrier(CLK_GLOBAL_MEM_FENCE), which
 * orders work-items of one work-group only; presumably a single work-group
 * launch is expected - confirm in the host code.
 */
const char * const matrix_row_align1_lower_triangular_substitute_inplace =
"__kernel void lower_triangular_substitute_inplace(\n"
" __global const float * matrix,\n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols,\n"
" __global float * vector)\n"
"{\n"
" float temp;\n"
" for (int row = 0; row < matrix_rows; ++row)\n"
" {\n"
" barrier(CLK_GLOBAL_MEM_FENCE);\n"
" if (get_global_id(0) == 0)\n"
" vector[row] /= matrix[row+row*matrix_internal_cols];\n"
" barrier(CLK_GLOBAL_MEM_FENCE);\n"
" temp = vector[row];\n"
" for (int elim = row + get_global_id(0) + 1; elim < matrix_rows; elim += get_global_size(0))\n"
" vector[elim] -= temp * matrix[elim * matrix_internal_cols + row];\n"
" }\n"
"}\n"
; //matrix_row_align1_lower_triangular_substitute_inplace
136
137
/**
 * @brief OpenCL C source of the kernel `vec_mul`.
 *
 * For every `row < matrix_rows` the kernel accumulates
 * `matrix[row*matrix_internal_cols + col] * vector[col]` over
 * `col < matrix_cols` and stores the sum in `result[row]`. Rows are
 * distributed over work-items (start get_global_id(0), step
 * get_global_size(0)).
 *
 * `matrix_internal_rows` is accepted but not read. The leading and trailing
 * blank lines are part of the stored source text.
 */
const char * const matrix_row_align1_vec_mul =
"\n"
"\n"
"\n"
"__kernel void vec_mul(\n"
" __global const float * matrix,\n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols,\n"
" __global const float * vector, \n"
" __global float * result) \n"
"{ \n"
" for (unsigned int row = get_global_id(0); row < matrix_rows; row += get_global_size(0))\n"
" {\n"
" float dot_prod = 0.0f;\n"
" for (unsigned int col = 0; col < matrix_cols; ++col)\n"
" dot_prod += matrix[row*matrix_internal_cols + col] * vector[col];\n"
" result[row] = dot_prod;\n"
" }\n"
"}\n"
"\n"
"\n"
; //matrix_row_align1_vec_mul
161
162
/**
 * @brief OpenCL C source of the kernel `lu_factorize`.
 *
 * Factorizes `matrix` in place. For every row `i` from 1 to
 * `matrix_rows - 1` and every `k < i`:
 *  - work-item 0 divides `matrix[i][k]` by the diagonal entry `matrix[k][k]`;
 *  - after a barrier, all work-items subtract `matrix[i][k] * matrix[k][j]`
 *    from `matrix[i][j]` for `j > k`.
 * Element `[r][c]` is addressed as `matrix[r * matrix_internal_cols + c]`.
 * The code contains no row exchange, i.e. no pivoting is performed.
 *
 * `matrix_cols` and `matrix_internal_rows` are accepted but not read.
 *
 * NOTE(review): the column loop is bounded by `matrix_rows`, so the kernel
 * presumably assumes a square matrix - confirm against callers.
 * NOTE(review): barrier(CLK_GLOBAL_MEM_FENCE) orders work-items of one
 * work-group only; presumably a single work-group launch is expected.
 */
const char * const matrix_row_align1_lu_factorize =
"__kernel void lu_factorize(\n"
" __global float * matrix,\n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols) \n"
"{ \n"
" float temp;\n"
" unsigned rowi;\n"
" unsigned rowk;\n"
" for (unsigned int i=1; i<matrix_rows; ++i)\n"
" {\n"
" rowi = i * matrix_internal_cols;\n"
" for (unsigned int k=0; k<i; ++k)\n"
" {\n"
" rowk = k * matrix_internal_cols;\n"
" if (get_global_id(0) == 0)\n"
" matrix[rowi + k] /= matrix[rowk + k];\n"
" barrier(CLK_GLOBAL_MEM_FENCE);\n"
" temp = matrix[rowi + k];\n"
" \n"
" //parallel subtraction:\n"
" for (unsigned int j=k+1 + get_global_id(0); j<matrix_rows; j += get_global_size(0))\n"
" matrix[rowi + j] -= temp * matrix[rowk + j];\n"
" }\n"
" }\n"
"} \n"
; //matrix_row_align1_lu_factorize
191
192
/**
 * @brief OpenCL C source of the kernel `rank1_update`.
 *
 * Adds `vector1[row] * vector2[col]` to
 * `matrix[row*matrix_internal_cols + col]` for every `row < matrix_rows`
 * and `col < matrix_cols`. Rows are distributed over work-items (start
 * get_global_id(0), step get_global_size(0)).
 *
 * The comment embedded in the kernel text describes the update as
 * `A += x * x^T`; the code itself takes two separate vectors.
 * `matrix_internal_rows` is accepted but not read.
 */
const char * const matrix_row_align1_rank1_update =
"//perform a rank-1 update of the matrix, i.e. A += x * x^T\n"
"__kernel void rank1_update(\n"
" __global float * matrix,\n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols,\n"
" __global const float * vector1, \n"
" __global const float * vector2) \n"
"{ \n"
" float tmp;\n"
" unsigned int offset;\n"
" for (unsigned int row = get_global_id(0); row < matrix_rows; row += get_global_size(0))\n"
" {\n"
" tmp = vector1[row];\n"
" offset = row*matrix_internal_cols;\n"
" for (unsigned int col = 0; col < matrix_cols; ++col)\n"
" matrix[offset+col] += tmp * vector2[col];\n"
" }\n"
"}\n"
; //matrix_row_align1_rank1_update
214
215
/**
 * @brief OpenCL C source of the kernel `trans_upper_triangular_substitute_inplace`.
 *
 * Walks `row` from `matrix_rows - 1` down to 0. In every step work-item 0
 * divides `vector[row]` by the diagonal entry
 * `matrix[row*matrix_internal_cols + row]`; afterwards all work-items
 * subtract `vector[row] * matrix[row * matrix_internal_cols + elim]` from
 * `vector[elim]` for every `elim < row`. The result overwrites `vector`.
 *
 * `matrix_cols` and `matrix_internal_rows` are accepted but not read.
 *
 * NOTE(review): steps are separated by barrier(CLK_GLOBAL_MEM_FENCE), which
 * orders work-items of one work-group only; presumably a single work-group
 * launch is expected - confirm in the host code.
 */
const char * const matrix_row_align1_trans_upper_triangular_substitute_inplace =
"//transposed lower triangular matrix\n"
"__kernel void trans_upper_triangular_substitute_inplace(\n"
" __global const float * matrix, \n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols,\n"
" __global float * vector) \n"
"{ \n"
" float temp; \n"
" for (int row = matrix_rows-1; row > -1; --row) \n"
" { \n"
" barrier(CLK_GLOBAL_MEM_FENCE); \n"
" if (get_global_id(0) == 0) \n"
" vector[row] /= matrix[row*matrix_internal_cols + row]; \n"
" \n"
" barrier(CLK_GLOBAL_MEM_FENCE); \n"
" temp = vector[row]; \n"
" //eliminate column with index 'row' in parallel: \n"
" for (int elim = get_global_id(0); elim < row; elim += get_global_size(0)) \n"
" vector[elim] -= temp * matrix[row * matrix_internal_cols + elim]; \n"
" } \n"
" \n"
"}\n"
; //matrix_row_align1_trans_upper_triangular_substitute_inplace
241
242
/**
 * @brief OpenCL C source of the kernel `fft_radix2_local` and its helper
 *        `get_reorder_num`.
 *
 * `get_reorder_num(v, bit_size)` reverses the 32 bits of `v` and shifts the
 * result right by `32 - bit_size`, i.e. it returns the bit-reversed value of
 * the lowest `bit_size` bits.
 *
 * The kernel transforms `batch_num` sequences of `size` complex values
 * (float2) stored in `input`, sequence `b` starting at `b * stride`.
 * Batches are distributed over work-groups (start get_group_id(0), step
 * get_num_groups(0)). For each batch the work-group
 *  1. copies the sequence into `lcl_input` in bit-reversed order,
 *  2. runs `bit_size` butterfly stages on `lcl_input`, separated by local
 *     barriers, using the twiddle angle `group * sign * pi / ss`,
 *  3. copies `lcl_input` back to `input`.
 *
 * Each butterfly iteration updates two elements (`pos` and `pos + ss`), so
 * the stage loop runs over `size / 2` indices; this keeps every access
 * inside the first `size` elements of `lcl_input`.
 *
 * The barrier at the end of the batch loop prevents a work-item from
 * refilling `lcl_input` for the next batch while other work-items of the
 * group are still copying the current result back. All work-items of a
 * group execute the same number of batch iterations, so the barrier is
 * reached uniformly.
 */
const char * const matrix_row_align1_fft_radix2_local =
"unsigned int get_reorder_num(unsigned int v, unsigned int bit_size) {\n"
" v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1);\n"
" v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2);\n"
" v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4);\n"
" v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8);\n"
" v = (v >> 16) | (v << 16);\n"
" v = v >> (32 - bit_size);\n"
" return v;\n"
"}\n"
"__kernel void fft_radix2_local(__global float2* input,\n"
" __local float2* lcl_input,\n"
" unsigned int bit_size,\n"
" unsigned int size,\n"
" unsigned int stride,\n"
" unsigned int batch_num,\n"
" float sign) {\n"
" unsigned int grp_id = get_group_id(0);\n"
" unsigned int grp_num = get_num_groups(0);\n"
" unsigned int lcl_sz = get_local_size(0);\n"
" unsigned int lcl_id = get_local_id(0);\n"
" const float NUM_PI = 3.14159265358979323846;\n"
" for(unsigned int batch_id = grp_id; batch_id < batch_num; batch_id += grp_num) {\n"
" //unsigned int base_offset = stride * batch_id;\n"
" //copy chunk of global memory to local\n"
" for(unsigned int p = lcl_id; p < size; p += lcl_sz) {\n"
" unsigned int v = get_reorder_num(p, bit_size);\n"
" lcl_input[v] = input[batch_id * stride + p];//index\n"
" }\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" \n"
" //performs Cooley-Tukey FFT on local array\n"
" for(unsigned int s = 0; s < bit_size; s++) {\n"
" unsigned int ss = 1 << s;\n"
" float cs, sn;\n"
" //each butterfly handles two elements, hence size/2 iterations\n"
" for(unsigned int tid = lcl_id; tid < (size >> 1); tid += lcl_sz) {\n"
" unsigned int group = (tid & (ss - 1));\n"
" unsigned int pos = ((tid >> s) << (s + 1)) + group;\n"
" float2 in1 = lcl_input[pos];\n"
" float2 in2 = lcl_input[pos + ss];\n"
" float arg = group * sign * NUM_PI / ss;\n"
" sn = sincos(arg, &cs);\n"
" float2 ex = (float2)(cs, sn);\n"
" float2 tmp = (float2)(in2.x * ex.x - in2.y * ex.y, in2.x * ex.y + in2.y * ex.x);\n"
" lcl_input[pos + ss] = in1 - tmp;\n"
" lcl_input[pos] = in1 + tmp;\n"
" }\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" }\n"
" \n"
" //copy local array back to global memory\n"
" for(unsigned int p = lcl_id; p < size; p += lcl_sz) {\n"
" input[batch_id * stride + p] = lcl_input[p];//index\n"
" }\n"
" //local buffer is reused by the next batch: wait until all reads finished\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" }\n"
"}\n"
; //matrix_row_align1_fft_radix2_local
299
300
/**
 * @brief OpenCL C source of the kernel `fft_direct`.
 *
 * Evaluates the Fourier sum directly: for every batch `b < batch_num` and
 * every output index `k < size` it accumulates
 * `input[b * stride + n] * (cos(arg), sin(arg))` over all `n < size`, with
 * `arg = sign * 2 * pi * k / size * n`, and stores the sum in
 * `output[b * stride + k]`. The cost per batch is therefore proportional to
 * `size * size`, as the comment inside the kernel text states.
 *
 * Output indices are distributed over work-items (start get_global_id(0),
 * step get_global_size(0)); every work-item loops over all batches.
 */
const char * const matrix_row_align1_fft_direct =
"// naive fourier transform (quadratic complexity, use for reference only)\n"
"__kernel void fft_direct(__global float2* input,\n"
" __global float2* output,\n"
" unsigned int size,\n"
" unsigned int stride,\n"
" unsigned int batch_num,\n"
" float sign) {\n"
"// unsigned int base_offset = 0;\n"
" const float NUM_PI = 3.14159265358979323846;\n"
" \n"
" for(unsigned int batch_id = 0; batch_id < batch_num; batch_id++) {\n"
" for(unsigned int k = get_global_id(0); k < size; k += get_global_size(0)) {\n"
" float2 f = 0.0f;\n"
" for(unsigned int n = 0; n < size; n++) {\n"
" float2 in = input[batch_id * stride + n]; //input index here\n"
" float sn, cs;\n"
" float arg = sign * 2 * NUM_PI * k / size * n;\n"
" sn = sincos(arg, &cs);\n"
" float2 ex = (float2)(cs, sn);\n"
" f = f + (float2)(in.x * ex.x - in.y * ex.y, in.x * ex.y + in.y * ex.x);\n"
" }\n"
" output[batch_id * stride + k] = f;// output index here\n"
" }\n"
"// base_offset += stride;\n"
" }\n"
"}\n"
; //matrix_row_align1_fft_direct
328
329
/**
 * @brief OpenCL C source of the kernel `unit_lower_triangular_substitute_inplace`.
 *
 * Walks `row` from 0 up to `matrix_rows - 1`. In every step all work-items
 * subtract `vector[row] * matrix[elim * matrix_internal_cols + row]` from
 * `vector[elim]` for every `elim > row`. No division by a diagonal entry is
 * performed. The result overwrites `vector`.
 *
 * `matrix_cols` and `matrix_internal_rows` are accepted but not read.
 *
 * NOTE(review): steps are separated by barrier(CLK_GLOBAL_MEM_FENCE), which
 * orders work-items of one work-group only; presumably a single work-group
 * launch is expected - confirm in the host code.
 */
const char * const matrix_row_align1_unit_lower_triangular_substitute_inplace =
"__kernel void unit_lower_triangular_substitute_inplace(\n"
" __global const float * matrix,\n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols,\n"
" __global float * vector)\n"
"{\n"
" float temp;\n"
" for (int row = 0; row < matrix_rows; ++row)\n"
" {\n"
" barrier(CLK_GLOBAL_MEM_FENCE);\n"
" temp = vector[row];\n"
" for (int elim = row + get_global_id(0) + 1; elim < matrix_rows; elim += get_global_size(0))\n"
" vector[elim] -= temp * matrix[elim * matrix_internal_cols + row];\n"
" }\n"
"}\n"
; //matrix_row_align1_unit_lower_triangular_substitute_inplace
348
349
/**
 * @brief OpenCL C source of the kernel `fft_reorder` and its helper
 *        `get_reorder_num_2`.
 *
 * `get_reorder_num_2(v, bit_size)` reverses the 32 bits of `v` and shifts
 * the result right by `32 - bit_size`.
 *
 * For every batch `b < batch_num` and every index `i < size` the kernel
 * computes the bit-reversed index `v` and, when `i < v`, swaps
 * `input[b * stride + i]` with `input[b * stride + v]`. The `i < v` test
 * makes sure each pair is exchanged once. Indices are distributed over
 * work-items (start get_global_id(0), step get_global_size(0)).
 *
 * `batch_num` is declared as `int` in this kernel and is compared with an
 * `unsigned int` loop counter.
 */
const char * const matrix_row_align1_fft_reorder =
"/*\n"
"* Performs reordering of input data in bit-reversal order\n"
"* Probably it's better to do in host side,\n"
"*/\n"
"unsigned int get_reorder_num_2(unsigned int v, unsigned int bit_size) {\n"
" v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1);\n"
" v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2);\n"
" v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4);\n"
" v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8);\n"
" v = (v >> 16) | (v << 16);\n"
" v = v >> (32 - bit_size);\n"
" return v;\n"
"}\n"
"__kernel void fft_reorder(__global float2* input,\n"
" unsigned int bit_size,\n"
" unsigned int size,\n"
" unsigned int stride,\n"
" int batch_num) {\n"
" //unsigned int base_offset = 0;\n"
" unsigned int glb_id = get_global_id(0);\n"
" unsigned int glb_sz = get_global_size(0);\n"
" \n"
" for(unsigned int batch_id = 0; batch_id < batch_num; batch_id++) {\n"
" for(unsigned int i = glb_id; i < size; i += glb_sz) {\n"
" unsigned int v = get_reorder_num_2(i, bit_size);\n"
" if(i < v) {\n"
" float2 tmp = input[batch_id * stride + i]; // index\n"
" input[batch_id * stride + i] = input[batch_id * stride + v]; //index\n"
" input[batch_id * stride + v] = tmp; //index\n"
" }\n"
" }\n"
" //base_offset += stride;\n"
" }\n"
"}\n"
; //matrix_row_align1_fft_reorder
385
386
/**
 * @brief OpenCL C source of the kernel `upper_triangular_substitute_inplace`.
 *
 * Walks `row` from `matrix_rows - 1` down to 0. In every step work-item 0
 * divides `vector[row]` by the diagonal entry
 * `matrix[row*matrix_internal_cols + row]`; afterwards all work-items
 * subtract `vector[row] * matrix[elim * matrix_internal_cols + row]` from
 * `vector[elim]` for every `elim < row`. The result overwrites `vector`.
 *
 * `matrix_cols` and `matrix_internal_rows` are accepted but not read.
 *
 * NOTE(review): steps are separated by barrier(CLK_GLOBAL_MEM_FENCE), which
 * orders work-items of one work-group only; presumably a single work-group
 * launch is expected - confirm in the host code.
 */
const char * const matrix_row_align1_upper_triangular_substitute_inplace =
"__kernel void upper_triangular_substitute_inplace( \n"
" __global const float * matrix, \n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols,\n"
" __global float * vector) \n"
"{ \n"
" float temp; \n"
" for (int row = matrix_rows-1; row > -1; --row) \n"
" { \n"
" barrier(CLK_GLOBAL_MEM_FENCE); \n"
" if (get_global_id(0) == 0) \n"
" vector[row] /= matrix[row*matrix_internal_cols + row]; \n"
" \n"
" barrier(CLK_GLOBAL_MEM_FENCE); \n"
" temp = vector[row]; \n"
" //eliminate column with index 'row' in parallel: \n"
" for (int elim = get_global_id(0); elim < row; elim += get_global_size(0)) \n"
" vector[elim] -= temp * matrix[elim * matrix_internal_cols + row]; \n"
" } \n"
" \n"
"}\n"
; //matrix_row_align1_upper_triangular_substitute_inplace
411
412
/**
 * @brief OpenCL C source of the kernel `inplace_add`.
 *
 * Uses a two-dimensional index space: the work-item with global ids
 * `(r, c)` acts only when `r < A_row_size` and `c < A_col_size`, and then
 * adds `B[(r + B_row_start) * B_internal_cols + (c + B_col_start)]` to
 * `A[(r + A_row_start) * A_internal_cols + (c + A_col_start)]`.
 * There is no striding loop, so entries beyond the launched global size
 * are not touched.
 *
 * `A_internal_rows`, `B_row_size`, `B_col_size` and `B_internal_rows` are
 * accepted but not read.
 */
const char * const matrix_row_align1_inplace_add =
"__kernel void inplace_add(\n"
" __global float * A,\n"
" unsigned int A_row_start,\n"
" unsigned int A_col_start,\n"
" unsigned int A_row_size,\n"
" unsigned int A_col_size,\n"
" unsigned int A_internal_rows,\n"
" unsigned int A_internal_cols,\n"
" __global const float * B, \n"
" unsigned int B_row_start,\n"
" unsigned int B_col_start,\n"
" unsigned int B_row_size,\n"
" unsigned int B_col_size,\n"
" unsigned int B_internal_rows,\n"
" unsigned int B_internal_cols)\n"
"{ \n"
" if ( get_global_id(0) < A_row_size\n"
" && get_global_id(1) < A_col_size\n"
" )\n"
" A[ (get_global_id(0) + A_row_start) * A_internal_cols\n"
" + (get_global_id(1) + A_col_start)] \n"
" += B[ (get_global_id(0) + B_row_start) * B_internal_cols\n"
" + (get_global_id(1) + B_col_start)];\n"
"}\n"
; //matrix_row_align1_inplace_add
438
439
/**
 * @brief OpenCL C source of the kernel `fft_radix2`.
 *
 * Performs one butterfly stage, selected by `s`, on `batch_num` sequences
 * of `size` complex values (float2) stored in `input`, sequence `b`
 * starting at `b * stride`. With `ss = 1 << s`, butterfly `tid`
 * (`tid < size / 2`) combines the elements at `pos` and `pos + ss`, where
 * `pos = ((tid >> s) << (s + 1)) + (tid & (ss - 1))`, using the twiddle
 * angle `(tid & (ss - 1)) * sign * pi / ss`. Both elements are written
 * back in place.
 *
 * Butterflies are distributed over work-items (start get_global_id(0),
 * step get_global_size(0)); every work-item loops over all batches.
 * `bit_size` is accepted but not read.
 */
const char * const matrix_row_align1_fft_radix2 =
"__kernel void fft_radix2(__global float2* input,\n"
" unsigned int s,\n"
" unsigned int bit_size,\n"
" unsigned int size,\n"
" unsigned int stride,\n"
" unsigned int batch_num,\n"
" float sign) {\n"
" unsigned int ss = 1 << s;\n"
" unsigned int half_size = size >> 1;\n"
" float cs, sn;\n"
" const float NUM_PI = 3.14159265358979323846;\n"
" unsigned int glb_id = get_global_id(0);\n"
" unsigned int glb_sz = get_global_size(0);\n"
" \n"
"// unsigned int base_offset = 0;\n"
" \n"
" for(unsigned int batch_id = 0; batch_id < batch_num; batch_id++) {\n"
" for(unsigned int tid = glb_id; tid < half_size; tid += glb_sz) {\n"
" unsigned int group = (tid & (ss - 1));\n"
" unsigned int pos = ((tid >> s) << (s + 1)) + group;\n"
" unsigned int offset = batch_id * stride + pos;\n"
" float2 in1 = input[offset];//index\n"
" float2 in2 = input[offset + ss];//index\n"
" float arg = group * sign * NUM_PI / ss;\n"
" sn = sincos(arg, &cs);\n"
" //sn = native_sin(arg);\n"
" //cs = native_cos(arg);\n"
" float2 ex = (float2)(cs, sn);\n"
" float2 tmp = (float2)(in2.x * ex.x - in2.y * ex.y, in2.x * ex.y + in2.y * ex.x);\n"
" input[offset + ss] = in1 - tmp;//index\n"
" input[offset] = in1 + tmp;//index\n"
" }\n"
"// base_offset += stride;\n"
" }\n"
"}\n"
; //matrix_row_align1_fft_radix2
476
477
/**
 * @brief OpenCL C source of the kernel `trans_vec_mul`.
 *
 * For every `row < matrix_cols` the kernel accumulates
 * `matrix[row + col*matrix_internal_cols] * vector[col]` over
 * `col < matrix_rows` and stores the sum in `result[row]`. As the comment in
 * the kernel text says, `row` and `col` are indices of the transposed
 * matrix, so `result` receives one entry per stored matrix column.
 * Entries of `result` are distributed over work-items (start
 * get_global_id(0), step get_global_size(0)).
 *
 * `matrix_internal_rows` is accepted but not read.
 */
const char * const matrix_row_align1_trans_vec_mul =
"__kernel void trans_vec_mul(\n"
" __global const float * matrix,\n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols,\n"
" __global const float * vector, \n"
" __global float * result) \n"
"{ \n"
" //row and col indicate indices within transposed matrix\n"
" for (unsigned int row = get_global_id(0); row < matrix_cols; row += get_global_size(0))\n"
" {\n"
" float dot_prod2 = 0.0f;\n"
" for (unsigned int col = 0; col < matrix_rows; ++col)\n"
" dot_prod2 += matrix[row + col*matrix_internal_cols] * vector[col];\n"
" result[row] = dot_prod2;\n"
" }\n"
"}\n"
; //matrix_row_align1_trans_vec_mul
497
498
/**
 * @brief OpenCL C source of the kernel `trans_unit_lower_triangular_substitute_inplace`.
 *
 * Walks `row` from 0 up to `matrix_rows - 1`. In every step all work-items
 * subtract `vector[row] * matrix[row * matrix_internal_cols + elim]` from
 * `vector[elim]` for every `elim > row`. No division by a diagonal entry is
 * performed. The result overwrites `vector`.
 *
 * `matrix_cols` and `matrix_internal_rows` are accepted but not read. The
 * blank lines at the start, inside and at the end are part of the stored
 * source text.
 *
 * NOTE(review): steps are separated by barrier(CLK_GLOBAL_MEM_FENCE), which
 * orders work-items of one work-group only; presumably a single work-group
 * launch is expected - confirm in the host code.
 */
const char * const matrix_row_align1_trans_unit_lower_triangular_substitute_inplace =
"\n"
"__kernel void trans_unit_lower_triangular_substitute_inplace(\n"
" __global const float * matrix,\n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols,\n"
" __global float * vector)\n"
"{\n"
" float temp;\n"
" for (int row = 0; row < matrix_rows; ++row)\n"
" {\n"
" barrier(CLK_GLOBAL_MEM_FENCE);\n"
"\n"
" temp = vector[row];\n"
"\n"
" for (int elim = row + get_global_id(0) + 1; elim < matrix_rows; elim += get_global_size(0))\n"
" vector[elim] -= temp * matrix[row * matrix_internal_cols + elim];\n"
" }\n"
"}\n"
"\n"
"\n"
; //matrix_row_align1_trans_unit_lower_triangular_substitute_inplace
522
523
/**
 * @brief OpenCL C source of the kernel `cpu_inplace_mult`.
 *
 * Multiplies the first `size` entries of `vec` by `factor`, which is passed
 * by value as a kernel argument. Each work-item starts at get_global_id(0)
 * and advances by get_global_size(0).
 */
const char * const matrix_row_align1_cpu_inplace_mult =
"__kernel void cpu_inplace_mult(\n"
" __global float * vec,\n"
" float factor, \n"
" unsigned int size) \n"
"{ \n"
" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
" vec[i] *= factor;\n"
"}\n"
; //matrix_row_align1_cpu_inplace_mult
533
534
/**
 * @brief OpenCL C source of the kernel `clear`.
 *
 * Sets the first `size` entries of `vec` to 0. Each work-item starts at
 * get_global_id(0) and advances by get_global_size(0).
 */
const char * const matrix_row_align1_clear =
"__kernel void clear(\n"
" __global float * vec,\n"
" unsigned int size) \n"
"{ \n"
" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
" vec[i] = 0;\n"
"}\n"
; //matrix_row_align1_clear
543
544
/**
 * @brief OpenCL C source of the kernel `inplace_mult`.
 *
 * The kernel loads one scalar from the `__global` pointer `fac` and
 * multiplies the first `size` entries of `vec` by it. Each work-item starts
 * at get_global_id(0) and advances by get_global_size(0).
 */
const char * const matrix_row_align1_inplace_mult =
"__kernel void inplace_mult(\n"
" __global float * vec,\n"
" __global const float * fac, \n"
" unsigned int size) \n"
"{ \n"
" float factor = *fac;\n"
" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
" vec[i] *= factor;\n"
"}\n"
; //matrix_row_align1_inplace_mult
555
556
/**
 * @brief OpenCL C source of the kernel `unit_upper_triangular_substitute_inplace`.
 *
 * Walks `row` from `matrix_rows - 1` down to 0. In every step all
 * work-items subtract `vector[row] * matrix[elim * matrix_internal_cols + row]`
 * from `vector[elim]` for every `elim < row`. No division by a diagonal
 * entry is performed. The result overwrites `vector`.
 *
 * `matrix_cols` and `matrix_internal_rows` are accepted but not read.
 *
 * NOTE(review): steps are separated by barrier(CLK_GLOBAL_MEM_FENCE), which
 * orders work-items of one work-group only; presumably a single work-group
 * launch is expected - confirm in the host code.
 */
const char * const matrix_row_align1_unit_upper_triangular_substitute_inplace =
"__kernel void unit_upper_triangular_substitute_inplace( \n"
" __global const float * matrix, \n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols,\n"
" __global float * vector) \n"
"{ \n"
" float temp; \n"
" for (int row = matrix_rows-1; row > -1; --row) \n"
" { \n"
" barrier(CLK_GLOBAL_MEM_FENCE); \n"
" temp = vector[row]; \n"
" //eliminate column with index 'row' in parallel: \n"
" for (int elim = get_global_id(0); elim < row; elim += get_global_size(0)) \n"
" vector[elim] -= temp * matrix[elim * matrix_internal_cols + row]; \n"
" } \n"
" \n"
"}\n"
; //matrix_row_align1_unit_upper_triangular_substitute_inplace
577
578
/**
 * @brief OpenCL C source of the kernel `add`.
 *
 * Writes `result[i] = vec1[i] + vec2[i]` for every `i < size`. Each
 * work-item starts at get_global_id(0) and advances by get_global_size(0).
 */
const char * const matrix_row_align1_add =
"__kernel void add(\n"
" __global const float * vec1,\n"
" __global const float * vec2, \n"
" __global float * result,\n"
" unsigned int size) \n"
"{ \n"
" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
" result[i] = vec1[i] + vec2[i];\n"
"}\n"
; //matrix_row_align1_add
589
590
}
//namespace kernels
591
}
//namespace linalg
592
}
//namespace viennacl
593
#endif
Generated on Wed Oct 10 2012 09:58:14 for ViennaCL - The Vienna Computing Library by Doxygen 1.8.1.2