ViennaCL - The Vienna Computing Library 1.2.0
compressed_matrix_source.h
#ifndef VIENNACL_LINALG_KERNELS_COMPRESSED_MATRIX_SOURCE_HPP_
#define VIENNACL_LINALG_KERNELS_COMPRESSED_MATRIX_SOURCE_HPP_
//Automatically generated file from auxiliary-directory, do not edit manually!
namespace viennacl
{
namespace linalg
{
namespace kernels
{
const char * const compressed_matrix_align4_vec_mul =
"__kernel void vec_mul(\n"
" __global const unsigned int * row_indices,\n"
" __global const uint4 * column_indices, \n"
" __global const float4 * elements,\n"
" __global const float * vector, \n"
" __global float * result,\n"
" unsigned int size)\n"
"{ \n"
" float dot_prod;\n"
" unsigned int start, next_stop;\n"
" uint4 col_idx;\n"
" float4 tmp_vec;\n"
" float4 tmp_entries;\n"
" for (unsigned int row = get_global_id(0); row < size; row += get_global_size(0))\n"
" {\n"
" dot_prod = 0.0f;\n"
" start = row_indices[row] / 4;\n"
" next_stop = row_indices[row+1] / 4;\n"
" for (unsigned int i = start; i < next_stop; ++i)\n"
" {\n"
" col_idx = column_indices[i];\n"
" tmp_entries = elements[i];\n"
" tmp_vec.x = vector[col_idx.x];\n"
" tmp_vec.y = vector[col_idx.y];\n"
" tmp_vec.z = vector[col_idx.z];\n"
" tmp_vec.w = vector[col_idx.w];\n"
" dot_prod += dot(tmp_entries, tmp_vec);\n"
" }\n"
" result[row] = dot_prod;\n"
" }\n"
"}\n"
; //compressed_matrix_align4_vec_mul
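/* Illustrative sketch (editorial addition, not part of the generated kernel strings):
   the align4 kernel above reads four (column, value) pairs per iteration through uint4/float4
   loads and divides the row pointers by 4, so it expects a CSR layout in which every row is
   padded to a multiple of four entries. The helper below shows one way such a padded layout
   could be built from plain CSR data; the struct and function names are hypothetical and the
   zero-padding scheme is an assumption, not necessarily what ViennaCL itself does.

   #include <vector>

   struct csr
   {
     std::vector<unsigned int> row_ptr;  // size rows + 1
     std::vector<unsigned int> cols;
     std::vector<float>        vals;
   };

   // Pad each row to a multiple of 4 entries; padded lanes carry value 0 and a valid
   // (repeated) column index, so they contribute nothing to the dot product.
   inline csr pad_to_align4(const csr & a, unsigned int rows)
   {
     csr p;
     p.row_ptr.push_back(0);
     for (unsigned int r = 0; r < rows; ++r)
     {
       unsigned int begin = a.row_ptr[r], end = a.row_ptr[r+1];
       for (unsigned int i = begin; i < end; ++i)
       {
         p.cols.push_back(a.cols[i]);
         p.vals.push_back(a.vals[i]);
       }
       unsigned int fill = (4 - (end - begin) % 4) % 4;
       unsigned int pad_col = (end > begin) ? a.cols[end - 1] : 0;
       for (unsigned int i = 0; i < fill; ++i)
       {
         p.cols.push_back(pad_col);
         p.vals.push_back(0.0f);
       }
       p.row_ptr.push_back(static_cast<unsigned int>(p.cols.size()));
     }
     return p;
   }
*/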

const char * const compressed_matrix_align1_jacobi_precond =
"__kernel void jacobi_precond(\n"
" __global const unsigned int * row_indices,\n"
" __global const unsigned int * column_indices, \n"
" __global const float * elements,\n"
" __global float * diag_M_inv,\n"
" unsigned int size) \n"
"{ \n"
" for (unsigned int row = get_global_id(0); row < size; row += get_global_size(0))\n"
" {\n"
" float diag = 1.0f;\n"
" unsigned int row_end = row_indices[row+1];\n"
" for (unsigned int i = row_indices[row]; i < row_end; ++i)\n"
" {\n"
" if (row == column_indices[i])\n"
" {\n"
" diag = elements[i];\n"
" break;\n"
" }\n"
" }\n"
" diag_M_inv[row] = 1.0f / diag;\n"
" }\n"
"}\n"
; //compressed_matrix_align1_jacobi_precond
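/* Illustrative sketch (editorial addition): a hypothetical sequential equivalent of the
   jacobi_precond kernel above, which stores the inverse of each row's diagonal entry
   (defaulting to 1 when a row has no stored diagonal). Parameter names are illustrative.

   inline void jacobi_precond_reference(const unsigned int * row_ptr,
                                        const unsigned int * cols,
                                        const float * vals,
                                        float * diag_M_inv,
                                        unsigned int size)
   {
     for (unsigned int row = 0; row < size; ++row)
     {
       float diag = 1.0f;
       for (unsigned int i = row_ptr[row]; i < row_ptr[row+1]; ++i)
         if (cols[i] == row) { diag = vals[i]; break; }
       diag_M_inv[row] = 1.0f / diag;  // inverse diagonal used by the Jacobi preconditioner
     }
   }
*/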

const char * const compressed_matrix_align1_lu_forward =
" \n"
"// compute y in Ly = z for incomplete LU factorizations of a sparse matrix in compressed format\n"
"__kernel void lu_forward(\n"
" __global const unsigned int * row_indices,\n"
" __global const unsigned int * column_indices, \n"
" __global const float * elements,\n"
" __local int * buffer, \n"
" __local float * vec_entries, //a memory block from vector\n"
" __global float * vector,\n"
" unsigned int size) \n"
"{\n"
" int waiting_for; //block index that must be finished before the current thread can start\n"
" unsigned int waiting_for_index;\n"
" int block_offset;\n"
" unsigned int col;\n"
" unsigned int row;\n"
" unsigned int row_index_end;\n"
" \n"
" //forward substitution: one thread per row in blocks of get_global_size(0)\n"
" for (unsigned int block_num = 0; block_num <= size / get_global_size(0); ++block_num)\n"
" {\n"
" block_offset = block_num * get_global_size(0);\n"
" row = block_offset + get_global_id(0);\n"
" buffer[get_global_id(0)] = 0; //set flag to 'undone'\n"
" waiting_for = -1;\n"
" if (row < size)\n"
" {\n"
" vec_entries[get_global_id(0)] = vector[row];\n"
" waiting_for_index = row_indices[row];\n"
" row_index_end = row_indices[row+1];\n"
" }\n"
" \n"
" if (get_global_id(0) == 0)\n"
" buffer[get_global_size(0)] = 1;\n"
" //try to eliminate all lines in the block. \n"
" //in worst case scenarios, in each step only one line can be substituted, thus loop\n"
" for (unsigned int k = 0; k<get_global_size(0); ++k)\n"
" {\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" if (row < size) //valid index?\n"
" {\n"
" if (waiting_for >= 0)\n"
" {\n"
" if (buffer[waiting_for] == 1)\n"
" waiting_for = -1;\n"
" }\n"
" \n"
" if (waiting_for == -1) //substitution not yet done, check whether possible\n"
" {\n"
" //check whether reduction is possible:\n"
" for (unsigned int j = waiting_for_index; j < row_index_end; ++j)\n"
" {\n"
" col = column_indices[j];\n"
" if (col < block_offset) //index valid, but not from current block\n"
" vec_entries[get_global_id(0)] -= elements[j] * vector[col];\n"
" else if (col < row) //index is from current block\n"
" {\n"
" if (buffer[col - block_offset] == 0) //entry is not yet calculated\n"
" {\n"
" waiting_for = col - block_offset;\n"
" waiting_for_index = j;\n"
" break;\n"
" }\n"
" else //updated entry is available in shared memory:\n"
" vec_entries[get_global_id(0)] -= elements[j] * vec_entries[col - block_offset];\n"
" }\n"
" }\n"
" \n"
" if (waiting_for == -1) //this row is done\n"
" {\n"
" buffer[get_global_id(0)] = 1;\n"
" waiting_for = -2; //magic number: thread is finished\n"
" }\n"
" } \n"
" } //row < size\n"
" else\n"
" buffer[get_global_id(0)] = 1; //work done (because there is no work to be done at all...)\n"
" ///////// check whether all threads are done. If yes, exit loop /////////////\n"
" \n"
" if (buffer[get_global_id(0)] == 0)\n"
" buffer[get_global_size(0)] = 0;\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" \n"
" if (buffer[get_global_size(0)] > 0) //all threads break this loop simultaneously\n"
" break;\n"
" if (get_global_id(0) == 0)\n"
" buffer[get_global_size(0)] = 1;\n"
" } //for k\n"
" \n"
" //write to vector:\n"
" if (row < size)\n"
" vector[row] = vec_entries[get_global_id(0)];\n"
" \n"
" barrier(CLK_GLOBAL_MEM_FENCE);\n"
" } //for block_num\n"
"}\n"
; //compressed_matrix_align1_lu_forward
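/* Illustrative sketch (editorial addition): the lu_forward kernel above resolves the row
   dependencies of a triangular solve inside each block via a local-memory "done" flag array.
   The underlying computation is ordinary forward substitution L*y = z on a CSR matrix; since
   the kernel never divides by a diagonal entry, a unit diagonal in L appears to be assumed.
   A hypothetical sequential reference:

   inline void lu_forward_reference(const unsigned int * row_ptr,
                                    const unsigned int * cols,
                                    const float * vals,
                                    float * vec,          // holds z on entry, y on exit
                                    unsigned int size)
   {
     for (unsigned int row = 0; row < size; ++row)
       for (unsigned int i = row_ptr[row]; i < row_ptr[row+1]; ++i)
         if (cols[i] < row)                     // strictly lower-triangular part only
           vec[row] -= vals[i] * vec[cols[i]];
   }
*/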

const char * const compressed_matrix_align1_bicgstab_kernel1 =
"void helper_bicgstab_kernel1_parallel_reduction( __local float * tmp_buffer )\n"
"{\n"
" for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2)\n"
" {\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" if (get_local_id(0) < stride)\n"
" tmp_buffer[get_local_id(0)] += tmp_buffer[get_local_id(0)+stride];\n"
" }\n"
"}\n"
"//////// inner products:\n"
"float bicgstab_kernel1_inner_prod(\n"
" __global const float * vec1,\n"
" __global const float * vec2,\n"
" unsigned int size,\n"
" __local float * tmp_buffer)\n"
"{\n"
" float tmp = 0;\n"
" unsigned int i_end = ((size - 1) / get_local_size(0) + 1) * get_local_size(0);\n"
" for (unsigned int i = get_local_id(0); i < i_end; i += get_local_size(0))\n"
" {\n"
" if (i < size)\n"
" tmp += vec1[i] * vec2[i];\n"
" }\n"
" tmp_buffer[get_local_id(0)] = tmp;\n"
" \n"
" helper_bicgstab_kernel1_parallel_reduction(tmp_buffer);\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" return tmp_buffer[0];\n"
"}\n"
"__kernel void bicgstab_kernel1(\n"
" __global const float * tmp0,\n"
" __global const float * r0star, \n"
" __global const float * residual,\n"
" __global float * s,\n"
" __global float * alpha,\n"
" __global const float * ip_rr0star,\n"
" __local float * tmp_buffer,\n"
" unsigned int size) \n"
"{ \n"
" float alpha_local = ip_rr0star[0] / bicgstab_kernel1_inner_prod(tmp0, r0star, size, tmp_buffer);\n"
" \n"
" for (unsigned int i = get_local_id(0); i < size; i += get_local_size(0))\n"
" s[i] = residual[i] - alpha_local * tmp0[i];\n"
" \n"
" if (get_global_id(0) == 0)\n"
" alpha[0] = alpha_local;\n"
"}\n"
; //compressed_matrix_align1_bicgstab_kernel1
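/* Illustrative sketch (editorial addition): a hypothetical sequential equivalent of
   bicgstab_kernel1. Here tmp0 presumably holds A*p, residual holds r, and ip_rr0star holds
   <r, r0*>; the kernel computes alpha = <r, r0*> / <A*p, r0*> and s = r - alpha*A*p.

   inline float bicgstab_step1_reference(const float * Ap,       // corresponds to tmp0
                                         const float * r0star,
                                         const float * residual,
                                         float * s,
                                         float ip_rr0star,
                                         unsigned int size)
   {
     float denom = 0.0f;
     for (unsigned int i = 0; i < size; ++i)
       denom += Ap[i] * r0star[i];
     float alpha = ip_rr0star / denom;
     for (unsigned int i = 0; i < size; ++i)
       s[i] = residual[i] - alpha * Ap[i];
     return alpha;
   }
*/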

const char * const compressed_matrix_align1_lu_backward =
"// compute x in Ux = y for incomplete LU factorizations of a sparse matrix in compressed format\n"
"__kernel void lu_backward(\n"
" __global const unsigned int * row_indices,\n"
" __global const unsigned int * column_indices, \n"
" __global const float * elements,\n"
" __local int * buffer, \n"
" __local float * vec_entries, //a memory block from vector\n"
" __global float * vector,\n"
" unsigned int size) \n"
"{\n"
" int waiting_for; //block index that must be finished before the current thread can start\n"
" unsigned int waiting_for_index;\n"
" unsigned int block_offset;\n"
" unsigned int col;\n"
" unsigned int row;\n"
" unsigned int row_index_end;\n"
" float diagonal_entry = 42;\n"
" \n"
" //backward substitution: one thread per row in blocks of get_global_size(0)\n"
" for (int block_num = size / get_global_size(0); block_num > -1; --block_num)\n"
" {\n"
" block_offset = block_num * get_global_size(0);\n"
" row = block_offset + get_global_id(0);\n"
" buffer[get_global_id(0)] = 0; //set flag to 'undone'\n"
" waiting_for = -1;\n"
" \n"
" if (row < size)\n"
" {\n"
" vec_entries[get_global_id(0)] = vector[row];\n"
" waiting_for_index = row_indices[row];\n"
" row_index_end = row_indices[row+1];\n"
" diagonal_entry = column_indices[waiting_for_index];\n"
" }\n"
" \n"
" if (get_global_id(0) == 0)\n"
" buffer[get_global_size(0)] = 1;\n"
" //try to eliminate all lines in the block. \n"
" //in worst case scenarios, in each step only one line can be substituted, thus loop\n"
" for (unsigned int k = 0; k<get_global_size(0); ++k)\n"
" {\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" if (row < size) //valid index?\n"
" {\n"
" if (waiting_for >= 0)\n"
" {\n"
" if (buffer[waiting_for] == 1)\n"
" waiting_for = -1;\n"
" }\n"
" \n"
" if (waiting_for == -1) //substitution not yet done, check whether possible\n"
" {\n"
" //check whether reduction is possible:\n"
" for (unsigned int j = waiting_for_index; j < row_index_end; ++j)\n"
" {\n"
" col = column_indices[j];\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" if (col >= block_offset + get_global_size(0)) //index valid, but not from current block\n"
" vec_entries[get_global_id(0)] -= elements[j] * vector[col];\n"
" else if (col > row) //index is from current block\n"
" {\n"
" if (buffer[col - block_offset] == 0) //entry is not yet calculated\n"
" {\n"
" waiting_for = col - block_offset;\n"
" waiting_for_index = j;\n"
" break;\n"
" }\n"
" else //updated entry is available in shared memory:\n"
" vec_entries[get_global_id(0)] -= elements[j] * vec_entries[col - block_offset];\n"
" }\n"
" else if (col == row)\n"
" diagonal_entry = elements[j];\n"
" }\n"
" \n"
" if (waiting_for == -1) //this row is done\n"
" {\n"
" if (row == 0)\n"
" vec_entries[get_global_id(0)] /= elements[0];\n"
" else\n"
" vec_entries[get_global_id(0)] /= diagonal_entry;\n"
" buffer[get_global_id(0)] = 1;\n"
" waiting_for = -2; //magic number: thread is finished\n"
" }\n"
" } \n"
" } //row < size\n"
" else\n"
" buffer[get_global_id(0)] = 1; //work done (because there is no work to be done at all...)\n"
" \n"
" ///////// check whether all threads are done. If yes, exit loop /////////////\n"
" if (buffer[get_global_id(0)] == 0)\n"
" buffer[get_global_size(0)] = 0;\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" \n"
" if (buffer[get_global_size(0)] > 0) //all threads break the loop simultaneously\n"
" break;\n"
" if (get_global_id(0) == 0)\n"
" buffer[get_global_size(0)] = 1;\n"
" } //for k\n"
" if (row < size)\n"
" vector[row] = vec_entries[get_global_id(0)];\n"
" //vector[row] = diagonal_entry;\n"
" \n"
" //if (row == 0)\n"
" //vector[0] = diagonal_entry;\n"
" //vector[0] = elements[0];\n"
" barrier(CLK_GLOBAL_MEM_FENCE);\n"
" } //for block_num\n"
"}\n"
; //compressed_matrix_align1_lu_backward
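/* Illustrative sketch (editorial addition): the lu_backward kernel parallelizes backward
   substitution U*x = y, dividing each finished row by its stored diagonal entry. A
   hypothetical sequential reference of the same computation:

   inline void lu_backward_reference(const unsigned int * row_ptr,
                                     const unsigned int * cols,
                                     const float * vals,
                                     float * vec,          // holds y on entry, x on exit
                                     unsigned int size)
   {
     for (unsigned int row = size; row-- > 0; )   // process rows from last to first
     {
       float diag = 1.0f;
       for (unsigned int i = row_ptr[row]; i < row_ptr[row+1]; ++i)
       {
         if (cols[i] > row)                       // strictly upper-triangular part
           vec[row] -= vals[i] * vec[cols[i]];
         else if (cols[i] == row)
           diag = vals[i];
       }
       vec[row] /= diag;
     }
   }
*/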

const char * const compressed_matrix_align1_row_scaling_2 =
"__kernel void row_scaling_2(\n"
" __global const unsigned int * row_indices,\n"
" __global const unsigned int * column_indices, \n"
" __global const float * elements,\n"
" __global float * diag_M_inv,\n"
" unsigned int size) \n"
"{ \n"
" for (unsigned int row = get_global_id(0); row < size; row += get_global_size(0))\n"
" {\n"
" float dot_prod = 0.0f;\n"
" float temp = 0.0f;\n"
" unsigned int row_end = row_indices[row+1];\n"
" for (unsigned int i = row_indices[row]; i < row_end; ++i)\n"
" {\n"
" temp = elements[i];\n"
" dot_prod += temp * temp;\n"
" }\n"
" diag_M_inv[row] = 1.0f / sqrt(dot_prod);\n"
" }\n"
"}\n"
; //compressed_matrix_align1_row_scaling_2

const char * const compressed_matrix_align1_vec_mul =
"__kernel void vec_mul(\n"
" __global const unsigned int * row_indices,\n"
" __global const unsigned int * column_indices, \n"
" __global const float * elements,\n"
" __global const float * vector, \n"
" __global float * result,\n"
" unsigned int size) \n"
"{ \n"
" for (unsigned int row = get_global_id(0); row < size; row += get_global_size(0))\n"
" {\n"
" float dot_prod = 0.0f;\n"
" unsigned int row_end = row_indices[row+1];\n"
" for (unsigned int i = row_indices[row]; i < row_end; ++i)\n"
" dot_prod += elements[i] * vector[column_indices[i]];\n"
" result[row] = dot_prod;\n"
" }\n"
"}\n"
; //compressed_matrix_align1_vec_mul
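/* Illustrative sketch (editorial addition): the align1 vec_mul kernel above is a standard
   CSR sparse matrix-vector product with one work item per row (strided over rows). A
   hypothetical sequential reference, with illustrative parameter names:

   inline void csr_spmv_reference(const unsigned int * row_ptr,
                                  const unsigned int * cols,
                                  const float * vals,
                                  const float * x,
                                  float * y,
                                  unsigned int size)
   {
     for (unsigned int row = 0; row < size; ++row)
     {
       float dot_prod = 0.0f;
       for (unsigned int i = row_ptr[row]; i < row_ptr[row+1]; ++i)
         dot_prod += vals[i] * x[cols[i]];
       y[row] = dot_prod;
     }
   }
*/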

const char * const compressed_matrix_align1_jacobi =
"__kernel void jacobi(\n"
" __global const unsigned int * row_indices,\n"
" __global const unsigned int * column_indices,\n"
" __global const float * elements,\n"
" float weight,\n"
" __global const float * old_result,\n"
" __global float * new_result,\n"
" __global const float * rhs,\n"
" unsigned int size)\n"
" {\n"
" float sum, diag=1;\n"
" int col;\n"
" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
" {\n"
" sum = 0;\n"
" for (unsigned int j = row_indices[i]; j<row_indices[i+1]; j++)\n"
" {\n"
" col = column_indices[j];\n"
" if (i == col)\n"
" diag = elements[j];\n"
" else \n"
" sum += elements[j] * old_result[col]; \n"
" } \n"
" new_result[i] = weight * (rhs[i]-sum) / diag + (1-weight) * old_result[i]; \n"
" } \n"
" } \n"
; //compressed_matrix_align1_jacobi
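/* Illustrative sketch (editorial addition): the jacobi kernel above performs one weighted
   Jacobi sweep, x_new = weight * D^{-1} * (b - (A - D) * x_old) + (1 - weight) * x_old.
   A hypothetical sequential reference:

   inline void jacobi_sweep_reference(const unsigned int * row_ptr,
                                      const unsigned int * cols,
                                      const float * vals,
                                      float weight,
                                      const float * x_old,
                                      float * x_new,
                                      const float * rhs,
                                      unsigned int size)
   {
     for (unsigned int i = 0; i < size; ++i)
     {
       float sum = 0.0f;
       float diag = 1.0f;
       for (unsigned int j = row_ptr[i]; j < row_ptr[i+1]; ++j)
       {
         if (cols[j] == i)
           diag = vals[j];
         else
           sum += vals[j] * x_old[cols[j]];
       }
       x_new[i] = weight * (rhs[i] - sum) / diag + (1.0f - weight) * x_old[i];
     }
   }
*/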

const char * const compressed_matrix_align1_bicgstab_kernel2 =
"void helper_bicgstab_kernel2_parallel_reduction( __local float * tmp_buffer )\n"
"{\n"
" for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2)\n"
" {\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" if (get_local_id(0) < stride)\n"
" tmp_buffer[get_local_id(0)] += tmp_buffer[get_local_id(0)+stride];\n"
" }\n"
"}\n"
"//////// inner products:\n"
"float bicgstab_kernel2_inner_prod(\n"
" __global const float * vec1,\n"
" __global const float * vec2,\n"
" unsigned int size,\n"
" __local float * tmp_buffer)\n"
"{\n"
" float tmp = 0;\n"
" unsigned int i_end = ((size - 1) / get_local_size(0) + 1) * get_local_size(0);\n"
" for (unsigned int i = get_local_id(0); i < i_end; i += get_local_size(0))\n"
" {\n"
" if (i < size)\n"
" tmp += vec1[i] * vec2[i];\n"
" }\n"
" tmp_buffer[get_local_id(0)] = tmp;\n"
" \n"
" helper_bicgstab_kernel2_parallel_reduction(tmp_buffer);\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" return tmp_buffer[0];\n"
"}\n"
"__kernel void bicgstab_kernel2(\n"
" __global const float * tmp0,\n"
" __global const float * tmp1,\n"
" __global const float * r0star, \n"
" __global const float * s, \n"
" __global float * p, \n"
" __global float * result,\n"
" __global float * residual,\n"
" __global const float * alpha,\n"
" __global float * ip_rr0star,\n"
" __global float * error_estimate,\n"
" __local float * tmp_buffer,\n"
" unsigned int size) \n"
"{ \n"
" float omega_local = bicgstab_kernel2_inner_prod(tmp1, s, size, tmp_buffer) / bicgstab_kernel2_inner_prod(tmp1, tmp1, size, tmp_buffer);\n"
" float alpha_local = alpha[0];\n"
" \n"
" //result += alpha * p + omega * s;\n"
" for (unsigned int i = get_local_id(0); i < size; i += get_local_size(0))\n"
" result[i] += alpha_local * p[i] + omega_local * s[i];\n"
" //residual = s - omega * tmp1;\n"
" for (unsigned int i = get_local_id(0); i < size; i += get_local_size(0))\n"
" residual[i] = s[i] - omega_local * tmp1[i];\n"
" //new_ip_rr0star = viennacl::linalg::inner_prod(residual, r0star);\n"
" float new_ip_rr0star = bicgstab_kernel2_inner_prod(residual, r0star, size, tmp_buffer);\n"
" float beta = (new_ip_rr0star / ip_rr0star[0]) * (alpha_local / omega_local);\n"
" \n"
" //p = residual + beta * (p - omega*tmp0);\n"
" for (unsigned int i = get_local_id(0); i < size; i += get_local_size(0))\n"
" p[i] = residual[i] + beta * (p[i] - omega_local * tmp0[i]);\n"
" //compute norm of residual:\n"
" float new_error_estimate = bicgstab_kernel2_inner_prod(residual, residual, size, tmp_buffer);\n"
" barrier(CLK_GLOBAL_MEM_FENCE);\n"
" //update values:\n"
" if (get_global_id(0) == 0)\n"
" {\n"
" error_estimate[0] = new_error_estimate;\n"
" ip_rr0star[0] = new_ip_rr0star;\n"
" }\n"
"}\n"
; //compressed_matrix_align1_bicgstab_kernel2
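/* Illustrative sketch (editorial addition): a hypothetical sequential equivalent of
   bicgstab_kernel2. Here tmp0 presumably holds A*p and tmp1 holds A*s; the kernel computes
   omega = <A*s, s> / <A*s, A*s>, updates the solution and the residual, forms the new
   <r, r0*>, beta, the new search direction p, and the squared residual norm used as the
   error estimate.

   inline void bicgstab_step2_reference(const float * Ap,      // corresponds to tmp0
                                        const float * As,      // corresponds to tmp1
                                        const float * r0star,
                                        const float * s,
                                        float * p,
                                        float * x,             // corresponds to result
                                        float * r,             // corresponds to residual
                                        float alpha,
                                        float & ip_rr0star,
                                        float & error_estimate,
                                        unsigned int size)
   {
     float ts = 0.0f, tt = 0.0f;
     for (unsigned int i = 0; i < size; ++i) { ts += As[i] * s[i]; tt += As[i] * As[i]; }
     float omega = ts / tt;

     for (unsigned int i = 0; i < size; ++i) x[i] += alpha * p[i] + omega * s[i];
     for (unsigned int i = 0; i < size; ++i) r[i] = s[i] - omega * As[i];

     float new_ip = 0.0f, res_norm2 = 0.0f;
     for (unsigned int i = 0; i < size; ++i) { new_ip += r[i] * r0star[i]; res_norm2 += r[i] * r[i]; }
     float beta = (new_ip / ip_rr0star) * (alpha / omega);

     for (unsigned int i = 0; i < size; ++i) p[i] = r[i] + beta * (p[i] - omega * Ap[i]);

     ip_rr0star = new_ip;
     error_estimate = res_norm2;
   }
*/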

const char * const compressed_matrix_align1_row_scaling_1 =
"__kernel void row_scaling_1(\n"
" __global const unsigned int * row_indices,\n"
" __global const unsigned int * column_indices, \n"
" __global const float * elements,\n"
" __global float * diag_M_inv,\n"
" unsigned int size) \n"
"{ \n"
" for (unsigned int row = get_global_id(0); row < size; row += get_global_size(0))\n"
" {\n"
" float dot_prod = 0.0f;\n"
" unsigned int row_end = row_indices[row+1];\n"
" for (unsigned int i = row_indices[row]; i < row_end; ++i)\n"
" dot_prod += fabs(elements[i]);\n"
" diag_M_inv[row] = 1.0f / dot_prod;\n"
" }\n"
"}\n"
; //compressed_matrix_align1_row_scaling_1
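/* Illustrative sketch (editorial addition): row_scaling_1 above stores the inverse l1 norm
   of each row, while row_scaling_2 further up stores the inverse l2 norm. A hypothetical
   sequential reference covering both variants:

   #include <cmath>

   inline void row_scaling_reference(const unsigned int * row_ptr,
                                     const float * vals,
                                     float * diag_M_inv,
                                     unsigned int size,
                                     bool use_l2_norm)
   {
     for (unsigned int row = 0; row < size; ++row)
     {
       float norm = 0.0f;
       for (unsigned int i = row_ptr[row]; i < row_ptr[row+1]; ++i)
         norm += use_l2_norm ? vals[i] * vals[i] : std::fabs(vals[i]);
       diag_M_inv[row] = 1.0f / (use_l2_norm ? std::sqrt(norm) : norm);
     }
   }
*/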

const char * const compressed_matrix_align8_vec_mul =
"__kernel void vec_mul(\n"
" __global const unsigned int * row_indices,\n"
" __global const uint8 * column_indices, \n"
" __global const float8 * elements,\n"
" __global const float * vector, \n"
" __global float * result,\n"
" unsigned int size)\n"
"{ \n"
" float dot_prod;\n"
" unsigned int start, next_stop;\n"
" uint8 col_idx;\n"
" float8 tmp_vec;\n"
" float8 tmp_entries;\n"
" for (unsigned int row = get_global_id(0); row < size; row += get_global_size(0))\n"
" {\n"
" dot_prod = 0.0f;\n"
" start = row_indices[row] / 8;\n"
" next_stop = row_indices[row+1] / 8;\n"
" for (unsigned int i = start; i < next_stop; ++i)\n"
" {\n"
" col_idx = column_indices[i];\n"
" tmp_entries = elements[i];\n"
" tmp_vec.s0 = vector[col_idx.s0];\n"
" tmp_vec.s1 = vector[col_idx.s1];\n"
" tmp_vec.s2 = vector[col_idx.s2];\n"
" tmp_vec.s3 = vector[col_idx.s3];\n"
" tmp_vec.s4 = vector[col_idx.s4];\n"
" tmp_vec.s5 = vector[col_idx.s5];\n"
" tmp_vec.s6 = vector[col_idx.s6];\n"
" tmp_vec.s7 = vector[col_idx.s7];\n"
" dot_prod += dot(tmp_entries.lo, tmp_vec.lo);\n"
" dot_prod += dot(tmp_entries.hi, tmp_vec.hi);\n"
" }\n"
" result[row] = dot_prod;\n"
" }\n"
"}\n"
; //compressed_matrix_align8_vec_mul

} //namespace kernels
} //namespace linalg
} //namespace viennacl
#endif