ViennaCL - The Vienna Computing Library
1.2.0
Main Page
Namespaces
Data Structures
Files
File List
Globals
viennacl
linalg
kernels
matrix_col_source.h
Go to the documentation of this file.
1
#ifndef VIENNACL_LINALG_KERNELS_MATRIX_COL_SOURCE_HPP_
2
#define VIENNACL_LINALG_KERNELS_MATRIX_COL_SOURCE_HPP_
3
//Automatically generated file from auxiliary-directory, do not edit manually!
4
namespace
viennacl
5
{
6
namespace
linalg
7
{
8
namespace
kernels
9
{
// OpenCL source of the `inplace_divide` kernel.
// The kernel loads the divisor once from `fac` and divides vec[i] by it for
// every i < size; a work-item starts at get_global_id(0) and advances by
// get_global_size(0).
const char * const matrix_col_align1_inplace_divide =
  "__kernel void inplace_divide(\n"
  " __global float * vec,\n"
  " __global const float * fac, //note: CPU variant is mapped to prod_scalar\n"
  " unsigned int size) \n"
  "{ \n"
  " float factor = *fac;\n"
  " for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
  " vec[i] /= factor;\n"
  "}\n"
  ; //matrix_col_align1_inplace_divide
// OpenCL source of the `trans_lower_triangular_substitute_inplace` kernel.
// For row = 0 .. matrix_rows-1 the kernel
//   1. lets work-item 0 divide vector[row] by the diagonal entry
//      matrix[row + row*matrix_internal_rows],
//   2. then subtracts vector[row] * matrix[elim*matrix_internal_rows + row]
//      from every vector[elim] with elim > row, spread over the work-items.
// The barriers separate the division from the elimination that reads its result.
// matrix_cols and matrix_internal_cols are part of the signature but unused.
const char * const matrix_col_align1_trans_lower_triangular_substitute_inplace =
  "__kernel void trans_lower_triangular_substitute_inplace(\n"
  " __global const float * matrix,\n"
  " unsigned int matrix_rows,\n"
  " unsigned int matrix_cols,\n"
  " unsigned int matrix_internal_rows,\n"
  " unsigned int matrix_internal_cols,\n"
  " __global float * vector)\n"
  "{\n"
  " float temp;\n"
  " for (int row = 0; row < matrix_rows; ++row)\n"
  " {\n"
  " barrier(CLK_GLOBAL_MEM_FENCE);\n"
  " if (get_global_id(0) == 0)\n"
  " vector[row] /= matrix[row+row*matrix_internal_rows];\n"
  " barrier(CLK_GLOBAL_MEM_FENCE);\n"
  " temp = vector[row];\n"
  " for (int elim = row + get_global_id(0) + 1; elim < matrix_rows; elim += get_global_size(0))\n"
  " vector[elim] -= temp * matrix[elim * matrix_internal_rows + row];\n"
  " }\n"
  "}\n"
  ; //matrix_col_align1_trans_lower_triangular_substitute_inplace
// OpenCL source of the `trans_unit_upper_triangular_substitute_inplace` kernel.
// Rows are visited from matrix_rows-1 down to 0. For each row the kernel
// subtracts vector[row] * matrix[row + elim*matrix_internal_rows] from every
// vector[elim] with elim < row, spread over the work-items. No division by a
// diagonal entry is performed. A barrier at the top of each row iteration
// separates consecutive elimination steps.
// matrix_cols and matrix_internal_cols are part of the signature but unused.
const char * const matrix_col_align1_trans_unit_upper_triangular_substitute_inplace =
  "//transposed lower triangular matrix\n"
  "__kernel void trans_unit_upper_triangular_substitute_inplace(\n"
  " __global const float * matrix, \n"
  " unsigned int matrix_rows,\n"
  " unsigned int matrix_cols,\n"
  " unsigned int matrix_internal_rows,\n"
  " unsigned int matrix_internal_cols,\n"
  " __global float * vector) \n"
  "{ \n"
  " float temp; \n"
  " for (int row = matrix_rows-1; row > -1; --row) \n"
  " { \n"
  " barrier(CLK_GLOBAL_MEM_FENCE); \n"
  " \n"
  " temp = vector[row]; \n"
  " //eliminate column with index 'row' in parallel: \n"
  " for (int elim = get_global_id(0); elim < row; elim += get_global_size(0)) \n"
  " vector[elim] -= temp * matrix[row + elim * matrix_internal_rows]; \n"
  " } \n"
  " \n"
  "}\n"
  ; //matrix_col_align1_trans_unit_upper_triangular_substitute_inplace
// OpenCL source of the `sub` kernel.
// Writes result[i] = vec1[i] - vec2[i] for every i < size; a work-item starts
// at get_global_id(0) and advances by get_global_size(0).
const char * const matrix_col_align1_sub =
  "__kernel void sub(\n"
  " __global const float * vec1,\n"
  " __global const float * vec2, \n"
  " __global float * result,\n"
  " unsigned int size)\n"
  "{ \n"
  " for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
  " result[i] = vec1[i] - vec2[i];\n"
  "}\n"
  ; //matrix_col_align1_sub
// OpenCL source of the `scaled_rank1_update` kernel.
// For every (row, col) with row < matrix_rows and col < matrix_cols the kernel
// adds val * vector1[row] * vector2[col] to matrix[row + col*matrix_internal_rows].
// Rows are distributed over the work-items; each work-item walks all columns
// of its rows. matrix_internal_cols is part of the signature but unused.
const char * const matrix_col_align1_scaled_rank1_update =
  "__kernel void scaled_rank1_update(\n"
  " __global float * matrix,\n"
  " unsigned int matrix_rows,\n"
  " unsigned int matrix_cols,\n"
  " unsigned int matrix_internal_rows,\n"
  " unsigned int matrix_internal_cols,\n"
  " float val,\n"
  " __global const float * vector1, \n"
  " __global const float * vector2) \n"
  "{ \n"
  " float tmp;\n"
  " for (unsigned int row = get_global_id(0); row < matrix_rows; row += get_global_size(0))\n"
  " {\n"
  " tmp = val * vector1[row];\n"
  " for (unsigned int col = 0; col < matrix_cols; ++col)\n"
  " matrix[row + col*matrix_internal_rows] += tmp * vector2[col];\n"
  " }\n"
  "}\n"
  ; //matrix_col_align1_scaled_rank1_update
// OpenCL source of the `inplace_sub` kernel.
// Subtracts vec2[i] from vec1[i] for every i < size; a work-item starts at
// get_global_id(0) and advances by get_global_size(0).
const char * const matrix_col_align1_inplace_sub =
  "__kernel void inplace_sub(\n"
  " __global float * vec1,\n"
  " __global const float * vec2,\n"
  " unsigned int size) \n"
  "{ \n"
  " for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
  " vec1[i] -= vec2[i];\n"
  "}\n"
  ; //matrix_col_align1_inplace_sub
// OpenCL source of the `lower_triangular_substitute_inplace` kernel.
// For row = 0 .. matrix_rows-1 the kernel
//   1. lets work-item 0 divide vector[row] by the diagonal entry
//      matrix[row + row*matrix_internal_rows],
//   2. then subtracts vector[row] * matrix[row*matrix_internal_rows + elim]
//      from every vector[elim] with elim > row, spread over the work-items.
// The barriers separate the division from the elimination that reads its result.
// matrix_cols and matrix_internal_cols are part of the signature but unused.
const char * const matrix_col_align1_lower_triangular_substitute_inplace =
  "__kernel void lower_triangular_substitute_inplace(\n"
  " __global const float * matrix,\n"
  " unsigned int matrix_rows,\n"
  " unsigned int matrix_cols,\n"
  " unsigned int matrix_internal_rows,\n"
  " unsigned int matrix_internal_cols,\n"
  " __global float * vector)\n"
  "{\n"
  " float temp;\n"
  " for (int row = 0; row < matrix_rows; ++row)\n"
  " {\n"
  " barrier(CLK_GLOBAL_MEM_FENCE);\n"
  " if (get_global_id(0) == 0)\n"
  " vector[row] /= matrix[row+row*matrix_internal_rows];\n"
  " barrier(CLK_GLOBAL_MEM_FENCE);\n"
  " temp = vector[row];\n"
  " for (int elim = row + get_global_id(0) + 1; elim < matrix_rows; elim += get_global_size(0))\n"
  " vector[elim] -= temp * matrix[row * matrix_internal_rows + elim];\n"
  " }\n"
  "}\n"
  ; //matrix_col_align1_lower_triangular_substitute_inplace
// OpenCL source of the `vec_mul` kernel (matrix-vector product).
// For every row < matrix_rows the kernel accumulates
//   sum over col < matrix_cols of matrix[row + col*matrix_internal_rows] * vector[col]
// and stores it in result[row]. Rows are distributed over the work-items.
// matrix_internal_cols is part of the signature but unused.
const char * const matrix_col_align1_vec_mul =
  "__kernel void vec_mul(\n"
  " __global const float * matrix,\n"
  " unsigned int matrix_rows,\n"
  " unsigned int matrix_cols,\n"
  " unsigned int matrix_internal_rows,\n"
  " unsigned int matrix_internal_cols,\n"
  " __global const float * vector, \n"
  " __global float * result) \n"
  "{ \n"
  " for (unsigned int row = get_global_id(0); row < matrix_rows; row += get_global_size(0))\n"
  " {\n"
  " float dot_prod = 0.0f;\n"
  " for (unsigned int col = 0; col < matrix_cols; ++col)\n"
  " dot_prod += matrix[row + col*matrix_internal_rows] * vector[col];\n"
  " result[row] = dot_prod;\n"
  " }\n"
  "}\n"
  ; //matrix_col_align1_vec_mul
// OpenCL source of the `lu_factorize` kernel (in-place, no row exchanges).
// For each row i >= 1 and each k < i:
//   1. work-item 0 divides matrix[i + k*matrix_internal_rows] by the diagonal
//      entry matrix[k + k*matrix_internal_rows];
//   2. after a barrier every work-item reads that quotient and subtracts
//      quotient * matrix[k + j*matrix_internal_rows] from
//      matrix[i + j*matrix_internal_rows] for its share of the columns j > k.
// matrix_internal_cols is part of the signature but unused.
// NOTE(review): barrier() only synchronizes within one work-group, so the
// kernel presumably expects to be launched with a single work-group - confirm
// against the host-side launch code.
const char * const matrix_col_align1_lu_factorize =
  "__kernel void lu_factorize(\n"
  " __global float * matrix,\n"
  " unsigned int matrix_rows,\n"
  " unsigned int matrix_cols,\n"
  " unsigned int matrix_internal_rows,\n"
  " unsigned int matrix_internal_cols) \n"
  "{ \n"
  " float temp;\n"
  " for (unsigned int i=1; i<matrix_rows; ++i)\n"
  " {\n"
  " for (unsigned int k=0; k<i; ++k)\n"
  " {\n"
  " if (get_global_id(0) == 0)\n"
  " matrix[i + k*matrix_internal_rows] /= matrix[k + k*matrix_internal_rows];\n"
  " barrier(CLK_GLOBAL_MEM_FENCE);\n"
  " temp = matrix[i + k*matrix_internal_rows];\n"
  " \n"
  " //parallel subtraction:\n"
  " for (unsigned int j=k+1 + get_global_id(0); j<matrix_cols; j += get_global_size(0))\n"
  " matrix[i + j*matrix_internal_rows] -= temp * matrix[k + j*matrix_internal_rows];\n"
  " }\n"
  " }\n"
  "} \n"
  ; //matrix_col_align1_lu_factorize
// OpenCL source of the `rank1_update` kernel.
// For every (row, col) with row < matrix_rows and col < matrix_cols the kernel
// adds vector1[row] * vector2[col] to matrix[row + col*matrix_internal_rows].
// Rows are distributed over the work-items. The comment embedded in the kernel
// text names the two factors x and y because the kernel takes two independent
// vectors. matrix_internal_cols is part of the signature but unused.
const char * const matrix_col_align1_rank1_update =
  "//perform a rank-1 update of the matrix, i.e. A += x * y^T\n"
  "__kernel void rank1_update(\n"
  " __global float * matrix,\n"
  " unsigned int matrix_rows,\n"
  " unsigned int matrix_cols,\n"
  " unsigned int matrix_internal_rows,\n"
  " unsigned int matrix_internal_cols,\n"
  " __global const float * vector1, \n"
  " __global const float * vector2) \n"
  "{ \n"
  " float tmp;\n"
  " for (unsigned int row= get_global_id(0); row < matrix_rows; row += get_global_size(0))\n"
  " {\n"
  " tmp = vector1[row];\n"
  " for (unsigned int col = 0; col < matrix_cols; ++col)\n"
  " matrix[row + col * matrix_internal_rows] += tmp * vector2[col];\n"
  " }\n"
  "}\n"
  ; //matrix_col_align1_rank1_update
// OpenCL source of the `trans_upper_triangular_substitute_inplace` kernel.
// Rows are visited from matrix_rows-1 down to 0. For each row the kernel
//   1. lets work-item 0 divide vector[row] by the diagonal entry
//      matrix[row + row*matrix_internal_rows],
//   2. then subtracts vector[row] * matrix[row + elim*matrix_internal_rows]
//      from every vector[elim] with elim < row, spread over the work-items.
// The barriers separate the division from the elimination that reads its result.
// matrix_cols and matrix_internal_cols are part of the signature but unused.
const char * const matrix_col_align1_trans_upper_triangular_substitute_inplace =
  "//transposed lower triangular matrix\n"
  "__kernel void trans_upper_triangular_substitute_inplace(\n"
  " __global const float * matrix, \n"
  " unsigned int matrix_rows,\n"
  " unsigned int matrix_cols,\n"
  " unsigned int matrix_internal_rows,\n"
  " unsigned int matrix_internal_cols,\n"
  " __global float * vector) \n"
  "{ \n"
  " float temp; \n"
  " for (int row = matrix_rows-1; row > -1; --row) \n"
  " { \n"
  " barrier(CLK_GLOBAL_MEM_FENCE); \n"
  " if (get_global_id(0) == 0) \n"
  " vector[row] /= matrix[row + row*matrix_internal_rows]; \n"
  " \n"
  " barrier(CLK_GLOBAL_MEM_FENCE); \n"
  " temp = vector[row]; \n"
  " //eliminate column with index 'row' in parallel: \n"
  " for (int elim = get_global_id(0); elim < row; elim += get_global_size(0)) \n"
  " vector[elim] -= temp * matrix[row + elim * matrix_internal_rows]; \n"
  " } \n"
  " \n"
  "}\n"
  ; //matrix_col_align1_trans_upper_triangular_substitute_inplace
// OpenCL source of the `fft_radix2_local` kernel and its helper
// `get_reorder_num` (reverses the bit order of a 32-bit value and keeps the
// top bit_size bits of the reversed word).
//
// Each work-group handles the batches grp_id, grp_id + grp_num, ... For one
// batch the kernel
//   1. copies the `size` input values into lcl_input at bit-reversed positions,
//   2. runs bit_size butterfly stages on lcl_input, with a local barrier after
//      every stage,
//   3. copies lcl_input back to input[p * stride + batch_id].
//
// Two corrections relative to the listed kernel text:
//   * a stage consists of size/2 butterflies (each touches pos and pos + ss),
//     so the butterfly loop is bounded by size >> 1; with the previous bound
//     of `size` the extra iterations accessed lcl_input[size .. 2*size-1],
//     which step 1 never fills;
//   * a local barrier follows the copy-back, so a work-item cannot begin
//     refilling lcl_input for the next batch while other work-items of the
//     group are still reading the current result.
const char * const matrix_col_align1_fft_radix2_local =
  "unsigned int get_reorder_num(unsigned int v, unsigned int bit_size) {\n"
  " v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1);\n"
  " v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2);\n"
  " v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4);\n"
  " v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8);\n"
  " v = (v >> 16) | (v << 16);\n"
  " v = v >> (32 - bit_size);\n"
  " return v;\n"
  "}\n"
  "__kernel void fft_radix2_local(__global float2* input,\n"
  " __local float2* lcl_input,\n"
  " unsigned int bit_size,\n"
  " unsigned int size,\n"
  " unsigned int stride,\n"
  " unsigned int batch_num,\n"
  " float sign) {\n"
  " unsigned int grp_id = get_group_id(0);\n"
  " unsigned int grp_num = get_num_groups(0);\n"
  " unsigned int lcl_sz = get_local_size(0);\n"
  " unsigned int lcl_id = get_local_id(0);\n"
  " const float NUM_PI = 3.14159265358979323846;\n"
  " for(unsigned int batch_id = grp_id; batch_id < batch_num; batch_id += grp_num) {\n"
  " //unsigned int base_offset = stride * batch_id;\n"
  " //copy chunk of global memory to local\n"
  " for(unsigned int p = lcl_id; p < size; p += lcl_sz) {\n"
  " unsigned int v = get_reorder_num(p, bit_size);\n"
  " lcl_input[v] = input[p * stride + batch_id];//index\n"
  " }\n"
  " barrier(CLK_LOCAL_MEM_FENCE);\n"
  " //performs Cooley-Tukey FFT on local array\n"
  " for(unsigned int s = 0; s < bit_size; s++) {\n"
  " unsigned int ss = 1 << s;\n"
  " float cs, sn;\n"
  " //one stage has size/2 butterflies, each touching pos and pos + ss\n"
  " for(unsigned int tid = lcl_id; tid < (size >> 1); tid += lcl_sz) {\n"
  " unsigned int group = (tid & (ss - 1));\n"
  " unsigned int pos = ((tid >> s) << (s + 1)) + group;\n"
  " float2 in1 = lcl_input[pos];\n"
  " float2 in2 = lcl_input[pos + ss];\n"
  " float arg = group * sign * NUM_PI / ss;\n"
  " sn = sincos(arg, &cs);\n"
  " float2 ex = (float2)(cs, sn);\n"
  " float2 tmp = (float2)(in2.x * ex.x - in2.y * ex.y, in2.x * ex.y + in2.y * ex.x);\n"
  " lcl_input[pos + ss] = in1 - tmp;\n"
  " lcl_input[pos] = in1 + tmp;\n"
  " }\n"
  " barrier(CLK_LOCAL_MEM_FENCE);\n"
  " }\n"
  " //copy local array back to global memory\n"
  " for(unsigned int p = lcl_id; p < size; p += lcl_sz) {\n"
  " input[p * stride + batch_id] = lcl_input[p];//index\n"
  " }\n"
  " //lcl_input is reused by the next batch: wait until everyone has copied back\n"
  " barrier(CLK_LOCAL_MEM_FENCE);\n"
  " }\n"
  "}\n"
  ; //matrix_col_align1_fft_radix2_local
// OpenCL source of the `fft_direct` kernel.
// For every batch and every output index k < size the kernel sums, over all
// n < size, input[n * stride + batch_id] multiplied by the complex factor
// (cos(arg), sin(arg)) with arg = sign * 2 * pi * k / size * n, and writes the
// sum to output[k * stride + batch_id]. Output indices are distributed over
// the work-items; every work-item loops over all batches.
const char * const matrix_col_align1_fft_direct =
  "// Direct FFT computation (quadratic complexity - use for reference only)\n"
  "__kernel void fft_direct(__global float2* input,\n"
  " __global float2* output,\n"
  " unsigned int size,\n"
  " unsigned int stride,\n"
  " unsigned int batch_num,\n"
  " float sign) {\n"
  " \n"
  " const float NUM_PI = 3.14159265358979323846;\n"
  " \n"
  " for(unsigned int batch_id = 0; batch_id < batch_num; batch_id++) {\n"
  " for(unsigned int k = get_global_id(0); k < size; k += get_global_size(0)) {\n"
  " float2 f = 0.0f;\n"
  " for(unsigned int n = 0; n < size; n++) {\n"
  " float2 in = input[n * stride + batch_id]; //input index here\n"
  " float sn, cs;\n"
  " float arg = sign * 2 * NUM_PI * k / size * n;\n"
  " sn = sincos(arg, &cs);\n"
  " float2 ex = (float2)(cs, sn);\n"
  " f = f + (float2)(in.x * ex.x - in.y * ex.y, in.x * ex.y + in.y * ex.x);\n"
  " }\n"
  " output[k * stride + batch_id] = f;// output index here\n"
  " }\n"
  " }\n"
  "}\n"
  ; //matrix_col_align1_fft_direct
// OpenCL source of the `unit_lower_triangular_substitute_inplace` kernel.
// For row = 0 .. matrix_rows-1 the kernel subtracts
// vector[row] * matrix[row*matrix_internal_rows + elim] from every
// vector[elim] with elim > row, spread over the work-items. No division by a
// diagonal entry is performed. A barrier at the top of each row iteration
// separates consecutive elimination steps.
// matrix_cols and matrix_internal_cols are part of the signature but unused.
const char * const matrix_col_align1_unit_lower_triangular_substitute_inplace =
  "__kernel void unit_lower_triangular_substitute_inplace(\n"
  " __global const float * matrix,\n"
  " unsigned int matrix_rows,\n"
  " unsigned int matrix_cols,\n"
  " unsigned int matrix_internal_rows,\n"
  " unsigned int matrix_internal_cols,\n"
  " __global float * vector)\n"
  "{\n"
  " float temp;\n"
  " for (int row = 0; row < matrix_rows; ++row)\n"
  " {\n"
  " barrier(CLK_GLOBAL_MEM_FENCE);\n"
  " temp = vector[row];\n"
  " for (int elim = row + get_global_id(0) + 1; elim < matrix_rows; elim += get_global_size(0))\n"
  " vector[elim] -= temp * matrix[row * matrix_internal_rows + elim];\n"
  " }\n"
  "}\n"
  ; //matrix_col_align1_unit_lower_triangular_substitute_inplace
// OpenCL source of the `fft_reorder` kernel and its helper `get_reorder_num_2`
// (reverses the bit order of a 32-bit value and keeps the top bit_size bits of
// the reversed word).
//
// For every batch and every index i < size the kernel computes the
// bit-reversed index v and, only when i < v, swaps input[i * stride + batch_id]
// with input[v * stride + batch_id], so each pair is exchanged once.
//
// batch_num is declared `unsigned int` here: the loop counter it is compared
// with is unsigned, and the other FFT kernels in this file declare it the same
// way. The argument keeps its 32-bit size.
const char * const matrix_col_align1_fft_reorder =
  "/*\n"
  "* Performs reordering of input data in bit-reversal order\n"
  "* Probably it's better to do in host side,\n"
  "*/\n"
  "unsigned int get_reorder_num_2(unsigned int v, unsigned int bit_size) {\n"
  " v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1);\n"
  " v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2);\n"
  " v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4);\n"
  " v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8);\n"
  " v = (v >> 16) | (v << 16);\n"
  " v = v >> (32 - bit_size);\n"
  " return v;\n"
  "}\n"
  "__kernel void fft_reorder(__global float2* input,\n"
  " unsigned int bit_size,\n"
  " unsigned int size,\n"
  " unsigned int stride,\n"
  " unsigned int batch_num) {\n"
  " unsigned int glb_id = get_global_id(0);\n"
  " unsigned int glb_sz = get_global_size(0);\n"
  " \n"
  " for(unsigned int batch_id = 0; batch_id < batch_num; batch_id++) {\n"
  " for(unsigned int i = glb_id; i < size; i += glb_sz) {\n"
  " unsigned int v = get_reorder_num_2(i, bit_size);\n"
  " if(i < v) {\n"
  " float2 tmp = input[i * stride + batch_id]; // index\n"
  " input[i * stride + batch_id] = input[v * stride + batch_id]; //index\n"
  " input[v * stride + batch_id] = tmp; //index\n"
  " }\n"
  " }\n"
  " }\n"
  "}\n"
  ; //matrix_col_align1_fft_reorder
// OpenCL source of the `upper_triangular_substitute_inplace` kernel.
// Rows are visited from matrix_rows-1 down to 0. For each row the kernel
//   1. lets work-item 0 divide vector[row] by the diagonal entry
//      matrix[row + row*matrix_internal_rows],
//   2. then subtracts vector[row] * matrix[elim + row*matrix_internal_rows]
//      from every vector[elim] with elim < row, spread over the work-items.
// The barriers separate the division from the elimination that reads its result.
// matrix_cols and matrix_internal_cols are part of the signature but unused.
const char * const matrix_col_align1_upper_triangular_substitute_inplace =
  "__kernel void upper_triangular_substitute_inplace( \n"
  " __global const float * matrix, \n"
  " unsigned int matrix_rows,\n"
  " unsigned int matrix_cols,\n"
  " unsigned int matrix_internal_rows,\n"
  " unsigned int matrix_internal_cols,\n"
  " __global float * vector) \n"
  "{ \n"
  " float temp; \n"
  " for (int row = matrix_rows-1; row > -1; --row) \n"
  " { \n"
  " barrier(CLK_GLOBAL_MEM_FENCE); \n"
  " if (get_global_id(0) == 0) \n"
  " vector[row] /= matrix[row + row*matrix_internal_rows]; \n"
  " \n"
  " barrier(CLK_GLOBAL_MEM_FENCE); \n"
  " temp = vector[row]; \n"
  " //eliminate column with index 'row' in parallel: \n"
  " for (int elim = get_global_id(0); elim < row; elim += get_global_size(0)) \n"
  " vector[elim] -= temp * matrix[elim + row * matrix_internal_rows]; \n"
  " } \n"
  " \n"
  "}\n"
  ; //matrix_col_align1_upper_triangular_substitute_inplace
// OpenCL source of the `inplace_add` kernel (two-dimensional launch).
// The work-item with ids (g0, g1) = (get_global_id(0), get_global_id(1)) acts
// only if g0 < A_row_size and g1 < A_col_size; it then adds
//   B[(g0 + B_row_start) + (g1 + B_col_start) * B_internal_rows]
// to
//   A[(g0 + A_row_start) + (g1 + A_col_start) * A_internal_rows].
// There is no loop: work-items outside the A_row_size x A_col_size range do
// nothing, and entries beyond the launched global range are not touched.
// A_internal_cols, B_row_size, B_col_size and B_internal_cols are part of the
// signature but unused.
const char * const matrix_col_align1_inplace_add =
  "__kernel void inplace_add(\n"
  " __global float * A,\n"
  " unsigned int A_row_start,\n"
  " unsigned int A_col_start,\n"
  " unsigned int A_row_size,\n"
  " unsigned int A_col_size,\n"
  " unsigned int A_internal_rows,\n"
  " unsigned int A_internal_cols,\n"
  " __global const float * B, \n"
  " unsigned int B_row_start,\n"
  " unsigned int B_col_start,\n"
  " unsigned int B_row_size,\n"
  " unsigned int B_col_size,\n"
  " unsigned int B_internal_rows,\n"
  " unsigned int B_internal_cols)\n"
  "{ \n"
  " if ( get_global_id(0) < A_row_size\n"
  " && get_global_id(1) < A_col_size\n"
  " )\n"
  " A[ (get_global_id(0) + A_row_start)\n"
  " + (get_global_id(1) + A_col_start) * A_internal_rows] \n"
  " += B[ (get_global_id(0) + B_row_start)\n"
  " + (get_global_id(1) + B_col_start) * B_internal_rows];\n"
  "}\n"
  ; //matrix_col_align1_inplace_add
// OpenCL source of the `fft_radix2` kernel: one radix-2 butterfly stage `s`
// applied directly to global memory.
// For every batch the size/2 butterflies of the stage are distributed over the
// work-items. Butterfly `tid` combines input[offset] and
// input[offset + ss * stride], where ss = 1 << s and
// offset = pos * stride + batch_id, using the complex factor
// (cos(arg), sin(arg)) with arg = group * sign * pi / ss.
// bit_size is part of the signature but unused.
const char * const matrix_col_align1_fft_radix2 =
  "__kernel void fft_radix2(__global float2* input,\n"
  " unsigned int s,\n"
  " unsigned int bit_size,\n"
  " unsigned int size,\n"
  " unsigned int stride,\n"
  " unsigned int batch_num,\n"
  " float sign) {\n"
  " unsigned int ss = 1 << s;\n"
  " unsigned int half_size = size >> 1;\n"
  " float cs, sn;\n"
  " const float NUM_PI = 3.14159265358979323846;\n"
  " unsigned int glb_id = get_global_id(0);\n"
  " unsigned int glb_sz = get_global_size(0);\n"
  " \n"
  " for(unsigned int batch_id = 0; batch_id < batch_num; batch_id++) {\n"
  " for(unsigned int tid = glb_id; tid < half_size; tid += glb_sz) {\n"
  " unsigned int group = (tid & (ss - 1));\n"
  " unsigned int pos = ((tid >> s) << (s + 1)) + group;\n"
  " unsigned int offset = pos * stride + batch_id;\n"
  " float2 in1 = input[offset];//index\n"
  " float2 in2 = input[offset + ss * stride];//index\n"
  " float arg = group * sign * NUM_PI / ss;\n"
  " sn = sincos(arg, &cs);\n"
  " float2 ex = (float2)(cs, sn);\n"
  " float2 tmp = (float2)(in2.x * ex.x - in2.y * ex.y, in2.x * ex.y + in2.y * ex.x);\n"
  " input[offset + ss * stride] = in1 - tmp;//index\n"
  " input[offset] = in1 + tmp;//index\n"
  " }\n"
  " }\n"
  "}\n"
  ; //matrix_col_align1_fft_radix2
// OpenCL source of the `trans_vec_mul` kernel (product of the transposed
// matrix with a vector).
// For every row < matrix_cols the kernel accumulates
//   sum over col < matrix_rows of matrix[row*matrix_internal_rows + col] * vector[col]
// and stores it in result[row]; `row` and `col` refer to the transposed
// matrix. Result entries are distributed over the work-items.
// matrix_internal_cols is part of the signature but unused.
const char * const matrix_col_align1_trans_vec_mul =
  "__kernel void trans_vec_mul(\n"
  " __global const float * matrix,\n"
  " unsigned int matrix_rows,\n"
  " unsigned int matrix_cols,\n"
  " unsigned int matrix_internal_rows,\n"
  " unsigned int matrix_internal_cols,\n"
  " __global const float * vector, \n"
  " __global float * result) \n"
  "{ \n"
  " //row and col indicate indices within transposed matrix\n"
  " for (unsigned int row = get_global_id(0); row < matrix_cols; row += get_global_size(0))\n"
  " {\n"
  " float dot_prod2 = 0.0f;\n"
  " for (unsigned int col = 0; col < matrix_rows; ++col)\n"
  " dot_prod2 += matrix[row * matrix_internal_rows + col] * vector[col];\n"
  " result[row] = dot_prod2;\n"
  " }\n"
  "}\n"
  ; //matrix_col_align1_trans_vec_mul
// OpenCL source of the `trans_unit_lower_triangular_substitute_inplace` kernel.
// For row = 0 .. matrix_rows-1 the kernel subtracts
// vector[row] * matrix[elim*matrix_internal_rows + row] from every
// vector[elim] with elim > row, spread over the work-items. No division by a
// diagonal entry is performed. A barrier at the top of each row iteration
// separates consecutive elimination steps.
// matrix_cols and matrix_internal_cols are part of the signature but unused.
// The kernel text starts with one empty line and ends with two.
const char * const matrix_col_align1_trans_unit_lower_triangular_substitute_inplace =
  "\n"
  "__kernel void trans_unit_lower_triangular_substitute_inplace(\n"
  " __global const float * matrix,\n"
  " unsigned int matrix_rows,\n"
  " unsigned int matrix_cols,\n"
  " unsigned int matrix_internal_rows,\n"
  " unsigned int matrix_internal_cols,\n"
  " __global float * vector)\n"
  "{\n"
  " float temp;\n"
  " for (int row = 0; row < matrix_rows; ++row)\n"
  " {\n"
  " barrier(CLK_GLOBAL_MEM_FENCE);\n"
  "\n"
  " temp = vector[row];\n"
  "\n"
  " for (int elim = row + get_global_id(0) + 1; elim < matrix_rows; elim += get_global_size(0))\n"
  " vector[elim] -= temp * matrix[elim * matrix_internal_rows + row];\n"
  " }\n"
  "}\n"
  "\n"
  "\n"
  ; //matrix_col_align1_trans_unit_lower_triangular_substitute_inplace
// OpenCL source of the `cpu_inplace_mult` kernel.
// Multiplies vec[i] by `factor` for every i < size; the factor is passed by
// value as a kernel argument. A work-item starts at get_global_id(0) and
// advances by get_global_size(0).
const char * const matrix_col_align1_cpu_inplace_mult =
  "__kernel void cpu_inplace_mult(\n"
  " __global float * vec,\n"
  " float factor, \n"
  " unsigned int size) \n"
  "{ \n"
  " for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
  " vec[i] *= factor;\n"
  "}\n"
  ; //matrix_col_align1_cpu_inplace_mult
// OpenCL source of the `clear` kernel.
// Sets vec[i] to 0 for every i < size; a work-item starts at get_global_id(0)
// and advances by get_global_size(0).
const char * const matrix_col_align1_clear =
  "__kernel void clear(\n"
  " __global float * vec,\n"
  " unsigned int size) \n"
  "{ \n"
  " for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
  " vec[i] = 0;\n"
  "}\n"
  ; //matrix_col_align1_clear
// OpenCL source of the `inplace_mult` kernel.
// The kernel loads the factor once from `fac` and multiplies vec[i] by it for
// every i < size; a work-item starts at get_global_id(0) and advances by
// get_global_size(0).
const char * const matrix_col_align1_inplace_mult =
  "__kernel void inplace_mult(\n"
  " __global float * vec,\n"
  " __global const float * fac, \n"
  " unsigned int size) \n"
  "{ \n"
  " float factor = *fac;\n"
  " for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
  " vec[i] *= factor;\n"
  "}\n"
  ; //matrix_col_align1_inplace_mult
// OpenCL source of the `unit_upper_triangular_substitute_inplace` kernel.
// Rows are visited from matrix_rows-1 down to 0. For each row the kernel
// subtracts vector[row] * matrix[elim + row*matrix_internal_rows] from every
// vector[elim] with elim < row, spread over the work-items. No division by a
// diagonal entry is performed. A barrier at the top of each row iteration
// separates consecutive elimination steps.
// matrix_cols and matrix_internal_cols are part of the signature but unused.
const char * const matrix_col_align1_unit_upper_triangular_substitute_inplace =
  "__kernel void unit_upper_triangular_substitute_inplace( \n"
  " __global const float * matrix, \n"
  " unsigned int matrix_rows,\n"
  " unsigned int matrix_cols,\n"
  " unsigned int matrix_internal_rows,\n"
  " unsigned int matrix_internal_cols,\n"
  " __global float * vector) \n"
  "{ \n"
  " float temp; \n"
  " for (int row = matrix_rows-1; row > -1; --row) \n"
  " { \n"
  " barrier(CLK_GLOBAL_MEM_FENCE); \n"
  " \n"
  " temp = vector[row]; \n"
  " //eliminate column with index 'row' in parallel: \n"
  " for (int elim = get_global_id(0); elim < row; elim += get_global_size(0)) \n"
  " vector[elim] -= temp * matrix[elim + row * matrix_internal_rows]; \n"
  " } \n"
  " \n"
  "}\n"
  ; //matrix_col_align1_unit_upper_triangular_substitute_inplace
// OpenCL source of the `add` kernel.
// Writes result[i] = vec1[i] + vec2[i] for every i < size; a work-item starts
// at get_global_id(0) and advances by get_global_size(0).
const char * const matrix_col_align1_add =
  "__kernel void add(\n"
  " __global const float * vec1,\n"
  " __global const float * vec2, \n"
  " __global float * result,\n"
  " unsigned int size) \n"
  "{ \n"
  " for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
  " result[i] = vec1[i] + vec2[i];\n"
  "}\n"
  ; //matrix_col_align1_add
568
569
}
//namespace kernels
570
}
//namespace linalg
571
}
//namespace viennacl
572
#endif
Generated on Wed Oct 10 2012 09:58:14 for ViennaCL - The Vienna Computing Library by
1.8.1.2