ViennaCL - The Vienna Computing Library  1.2.0
spai_source.h
Go to the documentation of this file.
1 #ifndef VIENNACL_LINALG_KERNELS_SPAI_SOURCE_HPP_
2 #define VIENNACL_LINALG_KERNELS_SPAI_SOURCE_HPP_
3 //Automatically generated file from auxiliary-directory, do not edit manually!
4 namespace viennacl
5 {
6  namespace linalg
7  {
8  namespace kernels
9  {
10 const char * const spai_align1_block_qr_assembly =
11 "void assemble_upper_part(__global float * R_q,\n"
12 " unsigned int row_n_q, unsigned int col_n_q, __global float * R_u, \n"
13 " unsigned int row_n_u, unsigned int col_n_u,\n"
14 " unsigned int col_n, unsigned int diff){\n"
15 " for(unsigned int i = 0; i < col_n_q; ++i){\n"
16 " for(unsigned int j = 0; j < diff; ++j){\n"
17 " R_q[ i*row_n_q + j] = R_u[ i*row_n_u + j + col_n ];\n"
18 " }\n"
19 " }\n"
20 " }\n"
21 "void assemble_lower_part(__global float * R_q, unsigned int row_n_q, unsigned int col_n_q, __global float * R_u_u, \n"
22 " unsigned int row_n_u_u, unsigned int col_n_u_u, \n"
23 " unsigned int diff){\n"
24 " for(unsigned int i = 0; i < col_n_u_u; ++i){\n"
25 " for(unsigned int j = 0; j < row_n_u_u; ++j){\n"
26 " R_q[i*row_n_q + j + diff] = R_u_u[i*row_n_u_u + j];\n"
27 " }\n"
28 " } \n"
29 "}\n"
30 "void assemble_qr_block(__global float * R_q, unsigned int row_n_q, unsigned int col_n_q, __global float * R_u, unsigned int row_n_u,\n"
31 " unsigned int col_n_u, __global float * R_u_u, unsigned int row_n_u_u, unsigned int col_n_u_u, unsigned int col_n){\n"
32 " unsigned int diff = row_n_u - col_n;\n"
33 " assemble_upper_part(R_q, row_n_q, col_n_q, R_u, row_n_u, col_n_u, col_n, diff);\n"
34 " if(diff > 0){\n"
35 " assemble_lower_part(R_q, row_n_q, col_n_q, R_u_u, row_n_u_u, col_n_u_u, diff);\n"
36 " }\n"
37 "}\n"
38 "__kernel void block_qr_assembly(\n"
39 " __global unsigned int * matrix_dimensions,\n"
40 " __global float * R_u,\n"
41 " __global unsigned int * block_ind_u,\n"
42 " __global unsigned int * matrix_dimensions_u,\n"
43 " __global float * R_u_u,\n"
44 " __global unsigned int * block_ind_u_u,\n"
45 " __global unsigned int * matrix_dimensions_u_u,\n"
46 " __global float * R_q,\n"
47 " __global unsigned int * block_ind_q,\n"
48 " __global unsigned int * matrix_dimensions_q,\n"
49 " __global unsigned int * g_is_update,\n"
50 " //__local float * local_R_q,\n"
51 " unsigned int block_elems_num) \n"
52 "{ \n"
53 " for(unsigned int i = get_global_id(0); i < block_elems_num; i += get_global_size(0)){\n"
54 " if((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){\n"
55 " //\n"
56 " assemble_qr_block(R_q + block_ind_q[i], matrix_dimensions_q[2*i], matrix_dimensions_q[2*i + 1], R_u + block_ind_u[i], matrix_dimensions_u[2*i], \n"
57 " matrix_dimensions_u[2*i + 1], R_u_u + block_ind_u_u[i], matrix_dimensions_u_u[2*i], matrix_dimensions_u_u[2*i + 1], matrix_dimensions[2*i + 1]);\n"
58 " }\n"
59 " }\n"
60 "}\n"
61 ; //spai_align1_block_qr_assembly
62 
63 const char * const spai_align1_block_r_assembly =
64 "void assemble_r(__global float * gR, unsigned int row_n_r, unsigned int col_n_r, __global float * R, \n"
65 " unsigned int row_n, unsigned int col_n)\n"
66 "{\n"
67 " for(unsigned int i = 0; i < col_n; ++i){\n"
68 " for(unsigned int j = 0; j < row_n; ++j){\n"
69 " gR[i*row_n_r + j] = R[i*row_n + j ];\n"
70 " }\n"
71 " }\n"
72 "}\n"
73 "void assemble_r_u(__global float * gR,\n"
74 " unsigned int row_n_r, unsigned int col_n_r, __global float * R_u, unsigned int row_n_u, unsigned int col_n_u, \n"
75 " unsigned int col_n)\n"
76 "{\n"
77 " for(unsigned int i = 0; i < col_n_u; ++i){\n"
78 " for(unsigned int j = 0; j < col_n; ++j){\n"
79 " gR[ (i+col_n)*row_n_r + j] = R_u[ i*row_n_u + j];\n"
80 " }\n"
81 " } \n"
82 "}\n"
83 "void assemble_r_u_u(__global float * gR, unsigned int row_n_r, unsigned int col_n_r, __global float * R_u_u, unsigned int row_n_u_u, \n"
84 " unsigned int col_n_u_u, unsigned int col_n)\n"
85 "{\n"
86 " for(unsigned int i = 0; i < col_n_u_u; ++i){\n"
87 " for(unsigned int j = 0; j < row_n_u_u; ++j){\n"
88 " gR[(col_n+i)*row_n_r + j + col_n] = R_u_u[i*row_n_u_u + j];\n"
89 " }\n"
90 " } \n"
91 "}\n"
92 "void assemble_r_block(__global float * gR, unsigned int row_n_r, unsigned int col_n_r, __global float * R, unsigned int row_n, \n"
93 " unsigned int col_n, __global float * R_u, unsigned int row_n_u, unsigned int col_n_u, __global float * R_u_u, \n"
94 " unsigned int row_n_u_u, unsigned int col_n_u_u){\n"
95 " assemble_r(gR, row_n_r, col_n_r, R, row_n, col_n); \n"
96 " assemble_r_u(gR, row_n_r, col_n_r, R_u, row_n_u, col_n_u, col_n);\n"
97 " assemble_r_u_u(gR, row_n_r, col_n_r, R_u_u, row_n_u_u, col_n_u_u, col_n);\n"
98 "}\n"
99 "__kernel void block_r_assembly(\n"
100 " __global float * R,\n"
101 " __global unsigned int * block_ind,\n"
102 " __global unsigned int * matrix_dimensions,\n"
103 " __global float * R_u,\n"
104 " __global unsigned int * block_ind_u,\n"
105 " __global unsigned int * matrix_dimensions_u,\n"
106 " __global float * R_u_u,\n"
107 " __global unsigned int * block_ind_u_u,\n"
108 " __global unsigned int * matrix_dimensions_u_u,\n"
109 " __global float * g_R,\n"
110 " __global unsigned int * block_ind_r,\n"
111 " __global unsigned int * matrix_dimensions_r,\n"
112 " __global unsigned int * g_is_update,\n"
113 " //__local float * local_gR,\n"
114 " unsigned int block_elems_num) \n"
115 "{ \n"
116 " for(unsigned int i = get_global_id(0); i < block_elems_num; i += get_global_size(0)){\n"
117 " if((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){\n"
118 " \n"
119 " assemble_r_block(g_R + block_ind_r[i], matrix_dimensions_r[2*i], matrix_dimensions_r[2*i + 1], R + block_ind[i], matrix_dimensions[2*i], \n"
120 " matrix_dimensions[2*i + 1], R_u + block_ind_u[i], matrix_dimensions_u[2*i], matrix_dimensions_u[2*i + 1],\n"
121 " R_u_u + block_ind_u_u[i], matrix_dimensions_u_u[2*i], matrix_dimensions_u_u[2*i + 1]);\n"
122 " \n"
123 " }\n"
124 " }\n"
125 "}\n"
126 ; //spai_align1_block_r_assembly
127 
128 const char * const spai_align1_block_least_squares =
129 "void custom_dot_prod_ls(__global float * A, unsigned int row_n, __global float * v, unsigned int ind, float *res){\n"
130 " *res = 0.0;\n"
131 " for(unsigned int j = ind; j < row_n; ++j){\n"
132 " if(j == ind){\n"
133 " *res += v[ j];\n"
134 " }else{\n"
135 " *res += A[ j + ind*row_n]*v[ j];\n"
136 " }\n"
137 " }\n"
138 " }\n"
139 "void backwardSolve(__global float * R, unsigned int row_n, unsigned int col_n, __global float * y, __global float * x){\n"
140 " for (int i = col_n-1; i >= 0 ; i--) {\n"
141 " x[ i] = y[ i];\n"
142 " for (int j = i+1; j < col_n; ++j) {\n"
143 " x[ i] -= R[ i + j*row_n]*x[ j];\n"
144 " }\n"
145 " x[i] /= R[ i + i*row_n];\n"
146 " }\n"
147 " \n"
148 "}\n"
149 " \n"
150 "void apply_q_trans_vec_ls(__global float * R, unsigned int row_n, unsigned int col_n, __global const float * b_v, __global float * y){\n"
151 " float inn_prod = 0;\n"
152 " for(unsigned int i = 0; i < col_n; ++i){\n"
153 " custom_dot_prod_ls(R, row_n, y, i, &inn_prod);\n"
154 " for(unsigned int j = i; j < row_n; ++j){\n"
155 " if(i == j){\n"
156 " y[ j] -= b_v[ i]*inn_prod;\n"
157 " }\n"
158 " else{\n"
159 " y[j] -= b_v[ i]*inn_prod*R[ j +i*row_n];\n"
160 " }\n"
161 " }\n"
162 " //std::cout<<y<<std::endl;\n"
163 " }\n"
164 " }\n"
165 "void ls(__global float * R, unsigned int row_n, unsigned int col_n, __global float * b_v, __global float * m_v, __global float * y_v){\n"
166 " \n"
167 " apply_q_trans_vec_ls(R, row_n, col_n, b_v, y_v);\n"
168 " //m_new - is m_v now\n"
169 " backwardSolve(R, row_n, col_n, y_v, m_v);\n"
170 "}\n"
171 "__kernel void block_least_squares(\n"
172 " __global float * global_R,\n"
173 " __global unsigned int * block_ind,\n"
174 " __global float * b_v,\n"
175 " __global unsigned int * start_bv_inds,\n"
176 " __global float * m_v,\n"
177 " __global float * y_v,\n"
178 " __global unsigned int * start_y_inds,\n"
179 " __global unsigned int * matrix_dimensions,\n"
180 " __global unsigned int * g_is_update,\n"
181 " //__local float * local_R,\n"
182 " unsigned int block_elems_num) \n"
183 "{ \n"
184 " for(unsigned int i = get_global_id(0); i < block_elems_num; i += get_global_size(0)){\n"
185 " if((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){\n"
186 " \n"
187 " ls(global_R + block_ind[i], matrix_dimensions[2*i], matrix_dimensions[2*i + 1], b_v +start_bv_inds[i], m_v + start_bv_inds[i], y_v + start_y_inds[i] );\n"
188 " \n"
189 " }\n"
190 " }\n"
191 "}\n"
192 ; //spai_align1_block_least_squares
193 
194 const char * const spai_align1_block_qr_assembly_1 =
195 "void assemble_upper_part_1(__global float * R_q, unsigned int row_n_q, unsigned int col_n_q, __global float * R_u, \n"
196 " unsigned int row_n_u, unsigned int col_n_u,\n"
197 " unsigned int col_n, unsigned int diff){\n"
198 " for(unsigned int i = 0; i < col_n_q; ++i){\n"
199 " for(unsigned int j = 0; j < diff; ++j){\n"
200 " R_q[ i*row_n_q + j] = R_u[i*row_n_u + j + col_n ];\n"
201 " }\n"
202 " }\n"
203 " }\n"
204 "void assemble_qr_block_1(__global float * R_q, unsigned int row_n_q, unsigned int col_n_q, __global float * R_u, unsigned int row_n_u,\n"
205 " unsigned int col_n_u, unsigned int col_n){\n"
206 " unsigned int diff = row_n_u - col_n;\n"
207 " assemble_upper_part_1(R_q, row_n_q, col_n_q, R_u, row_n_u, col_n_u, col_n, diff);\n"
208 "}\n"
209 "__kernel void block_qr_assembly_1(\n"
210 " __global unsigned int * matrix_dimensions,\n"
211 " __global float * R_u,\n"
212 " __global unsigned int * block_ind_u,\n"
213 " __global unsigned int * matrix_dimensions_u,\n"
214 " __global float * R_q,\n"
215 " __global unsigned int * block_ind_q,\n"
216 " __global unsigned int * matrix_dimensions_q,\n"
217 " __global unsigned int * g_is_update,\n"
218 " //__local float * local_R_q,\n"
219 " unsigned int block_elems_num) \n"
220 "{ \n"
221 " for(unsigned int i = get_global_id(0); i < block_elems_num; i += get_global_size(0)){\n"
222 " if((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){\n"
223 " assemble_qr_block_1(R_q + block_ind_q[i], matrix_dimensions_q[2*i], matrix_dimensions_q[2*i + 1], R_u + block_ind_u[i], matrix_dimensions_u[2*i], \n"
224 " matrix_dimensions_u[2*i + 1], matrix_dimensions[2*i + 1]);\n"
225 " }\n"
226 " }\n"
227 "}\n"
228 ; //spai_align1_block_qr_assembly_1
229 
230 const char * const spai_align1_block_q_mult =
231 "void custom_dot_prod(__global float * A, unsigned int row_n, __local float * v, unsigned int ind, float *res){\n"
232 " *res = 0.0;\n"
233 " for(unsigned int j = ind; j < row_n; ++j){\n"
234 " if(j == ind){\n"
235 " *res += v[j];\n"
236 " }else{\n"
237 " *res += A[j + ind*row_n]*v[j];\n"
238 " }\n"
239 " }\n"
240 " }\n"
241 "void apply_q_trans_vec(__global float * R, unsigned int row_n, unsigned int col_n, __global float * b_v, __local float * y){\n"
242 " float inn_prod = 0;\n"
243 " for(unsigned int i = 0; i < col_n; ++i){\n"
244 " custom_dot_prod(R, row_n, y, i, &inn_prod);\n"
245 " for(unsigned int j = i; j < row_n; ++j){\n"
246 " if(i == j){\n"
247 " y[j] -= b_v[ i]*inn_prod;\n"
248 " }\n"
249 " else{\n"
250 " y[j] -= b_v[ i]*inn_prod*R[ j + i*row_n];\n"
251 " }\n"
252 " }\n"
253 " }\n"
254 " }\n"
255 "void q_mult(__global float * R, unsigned int row_n, unsigned int col_n, __global float * b_v, __local float * R_u, unsigned int col_n_u){\n"
256 " for(unsigned int i = get_local_id(0); i < col_n_u; i+= get_local_size(0)){\n"
257 " apply_q_trans_vec(R, row_n, col_n, b_v, R_u + row_n*i);\n"
258 " } \n"
259 "}\n"
260 "void matrix_from_global_to_local(__global float* g_M, __local float* l_M, unsigned int row_n, unsigned int col_n, unsigned int mat_start_ind){\n"
261 " for(unsigned int i = get_local_id(0); i < col_n; i+= get_local_size(0)){\n"
262 " for(unsigned int j = 0; j < row_n; ++j){\n"
263 " l_M[i*row_n + j] = g_M[mat_start_ind + i*row_n + j];\n"
264 " }\n"
265 " }\n"
266 "}\n"
267 "void matrix_from_local_to_global(__global float* g_M, __local float* l_M, unsigned int row_n, unsigned int col_n, unsigned int mat_start_ind){\n"
268 " for(unsigned int i = get_local_id(0); i < col_n; i+= get_local_size(0)){\n"
269 " for(unsigned int j = 0; j < row_n; ++j){\n"
270 " g_M[mat_start_ind + i*row_n + j] = l_M[i*row_n + j];\n"
271 " }\n"
272 " }\n"
273 "}\n"
274 "__kernel void block_q_mult(__global float * global_R,\n"
275 " __global unsigned int * block_ind,\n"
276 " __global float * global_R_u,\n"
277 " __global unsigned int *block_ind_u,\n"
278 " __global float * b_v,\n"
279 " __global unsigned int * start_bv_inds,\n"
280 " __global unsigned int * matrix_dimensions,\n"
281 " __global unsigned int * matrix_dimensions_u,\n"
282 " __global unsigned int * g_is_update,\n"
283 " __local float * local_R_u,\n"
284 " unsigned int block_elems_num){\n"
285 " for(unsigned int i = get_group_id(0); i < block_elems_num; i += get_num_groups(0)){\n"
286 " if((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && (g_is_update[i] > 0)){\n"
287 " //matrix_from_global_to_local(R, local_buff_R, matrix_dimensions[2*i], matrix_dimensions[2*i + 1], start_matrix_inds[i]);\n"
288 " matrix_from_global_to_local(global_R_u, local_R_u, matrix_dimensions_u[2*i], matrix_dimensions_u[2*i+ 1], block_ind_u[i]);\n"
289 " barrier(CLK_LOCAL_MEM_FENCE);\n"
290 " q_mult(global_R + block_ind[i], matrix_dimensions[2*i], matrix_dimensions[2*i + 1], b_v + start_bv_inds[i], local_R_u, \n"
291 " matrix_dimensions_u[2*i + 1]);\n"
292 " barrier(CLK_LOCAL_MEM_FENCE);\n"
293 " matrix_from_local_to_global(global_R_u, local_R_u, matrix_dimensions_u[2*i], matrix_dimensions_u[2*i + 1], block_ind_u[i]);\n"
294 " }\n"
295 " }\n"
296 "}\n"
297 ; //spai_align1_block_q_mult
298 
299 const char * const spai_align1_block_qr =
300 "void dot_prod(__local const float* A, unsigned int n, unsigned int beg_ind, float* res){\n"
301 " *res = 0;\n"
302 " for(unsigned int i = beg_ind; i < n; ++i){\n"
303 " *res += A[(beg_ind-1)*n + i]*A[(beg_ind-1)*n + i];\n"
304 " }\n"
305 "}\n"
306 " \n"
307 "void vector_div(__global float* v, unsigned int beg_ind, float b, unsigned int n){\n"
308 " for(unsigned int i = beg_ind; i < n; ++i){\n"
309 " v[i] /= b;\n"
310 " }\n"
311 "}\n"
312 "void copy_vector(__local const float* A, __global float* v, const unsigned int beg_ind, const unsigned int n){\n"
313 " for(unsigned int i = beg_ind; i < n; ++i){\n"
314 " v[i] = A[(beg_ind-1)*n + i];\n"
315 " }\n"
316 "}\n"
317 " \n"
318 " \n"
319 "void householder_vector(__local const float* A, unsigned int j, unsigned int n, __global float* v, __global float* b){\n"
320 " float sg;\n"
321 " dot_prod(A, n, j+1, &sg); \n"
322 " copy_vector(A, v, j+1, n);\n"
323 " float mu;\n"
324 " v[j] = 1.0;\n"
325 " //print_contigious_vector(v, v_start_ind, n);\n"
326 " if(sg == 0){\n"
327 " *b = 0;\n"
328 " }\n"
329 " else{\n"
330 " mu = sqrt(A[j*n + j]*A[ j*n + j] + sg);\n"
331 " if(A[ j*n + j] <= 0){\n"
332 " v[j] = A[ j*n + j] - mu;\n"
333 " }else{\n"
334 " v[j] = -sg/(A[ j*n + j] + mu);\n"
335 " }\n"
336 " *b = 2*(v[j]*v[j])/(sg + v[j]*v[j]);\n"
337 " //*b = (2*v[j]*v[j])/(sg + (v[j])*(v[j]));\n"
338 " vector_div(v, j, v[j], n);\n"
339 " //print_contigious_vector(v, v_start_ind, n);\n"
340 " }\n"
341 "}\n"
342 "void custom_inner_prod(__local const float* A, __global float* v, unsigned int col_ind, unsigned int row_num, unsigned int start_ind, float* res){\n"
343 " for(unsigned int i = start_ind; i < row_num; ++i){\n"
344 " *res += A[col_ind*row_num + i]*v[i]; \n"
345 " }\n"
346 "}\n"
347 "// \n"
348 "void apply_householder_reflection(__local float* A, unsigned int row_n, unsigned int col_n, unsigned int iter_cnt, __global float* v, float b){\n"
349 " float in_prod_res;\n"
350 " for(unsigned int i= iter_cnt + get_local_id(0); i < col_n; i+=get_local_size(0)){\n"
351 " in_prod_res = 0.0;\n"
352 " custom_inner_prod(A, v, i, row_n, iter_cnt, &in_prod_res);\n"
353 " for(unsigned int j = iter_cnt; j < row_n; ++j){\n"
354 " A[ i*row_n + j] -= b*in_prod_res* v[j];\n"
355 " }\n"
356 " }\n"
357 " \n"
358 "}\n"
359 "void store_householder_vector(__local float* A, unsigned int ind, unsigned int n, __global float* v){\n"
360 " for(unsigned int i = ind; i < n; ++i){\n"
361 " A[ (ind-1)*n + i] = v[i];\n"
362 " }\n"
363 "}\n"
364 "void single_qr( __local float* R, __global unsigned int* matrix_dimensions, __global float* b_v, __global float* v, unsigned int matrix_ind){\n"
365 " //matrix_dimensions[0] - number of rows\n"
366 " //matrix_dimensions[1] - number of columns\n"
367 " unsigned int col_n = matrix_dimensions[2*matrix_ind + 1];\n"
368 " unsigned int row_n = matrix_dimensions[2*matrix_ind];\n"
369 " \n"
370 " if((col_n == row_n)&&(row_n == 1)){\n"
371 " b_v[0] = 0.0;\n"
372 " return;\n"
373 " }\n"
374 " for(unsigned int i = 0; i < col_n; ++i){\n"
375 " if(get_local_id(0) == 0){\n"
376 " householder_vector(R, i, row_n, v, b_v + i);\n"
377 " }\n"
378 " barrier(CLK_LOCAL_MEM_FENCE);\n"
379 " apply_householder_reflection(R, row_n, col_n, i, v, b_v[i]);\n"
380 " barrier(CLK_LOCAL_MEM_FENCE);\n"
381 " if(get_local_id(0) == 0){\n"
382 " if(i < matrix_dimensions[2*matrix_ind]){\n"
383 " store_householder_vector(R, i+1, row_n, v);\n"
384 " }\n"
385 " }\n"
386 " }\n"
387 "}\n"
388 "void matrix_from_global_to_local_qr(__global float* g_M, __local float* l_M, unsigned int row_n, unsigned int col_n, unsigned int mat_start_ind){\n"
389 " for(unsigned int i = get_local_id(0); i < col_n; i+= get_local_size(0)){\n"
390 " for(unsigned int j = 0; j < row_n; ++j){\n"
391 " l_M[i*row_n + j] = g_M[mat_start_ind + i*row_n + j];\n"
392 " }\n"
393 " }\n"
394 "}\n"
395 "void matrix_from_local_to_global_qr(__global float* g_M, __local float* l_M, unsigned int row_n, unsigned int col_n, unsigned int mat_start_ind){\n"
396 " for(unsigned int i = get_local_id(0); i < col_n; i+= get_local_size(0)){\n"
397 " for(unsigned int j = 0; j < row_n; ++j){\n"
398 " g_M[mat_start_ind + i*row_n + j] = l_M[i*row_n + j];\n"
399 " }\n"
400 " }\n"
401 "}\n"
402 "__kernel void block_qr(\n"
403 " __global float* R, \n"
404 " __global unsigned int* matrix_dimensions, \n"
405 " __global float* b_v, \n"
406 " __global float* v, \n"
407 " __global unsigned int* start_matrix_inds, \n"
408 " __global unsigned int* start_bv_inds, \n"
409 " __global unsigned int* start_v_inds,\n"
410 " __global unsigned int * g_is_update, \n"
411 " __local float* local_buff_R,\n"
412 " unsigned int block_elems_num){\n"
413 " for(unsigned int i = get_group_id(0); i < block_elems_num; i += get_num_groups(0)){\n"
414 " if((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){\n"
415 " matrix_from_global_to_local_qr(R, local_buff_R, matrix_dimensions[2*i], matrix_dimensions[2*i + 1], start_matrix_inds[i]);\n"
416 " barrier(CLK_LOCAL_MEM_FENCE);\n"
417 " single_qr(local_buff_R, matrix_dimensions, b_v + start_bv_inds[i], v + start_v_inds[i], i);\n"
418 " barrier(CLK_LOCAL_MEM_FENCE);\n"
419 " matrix_from_local_to_global_qr(R, local_buff_R, matrix_dimensions[2*i], matrix_dimensions[2*i + 1], start_matrix_inds[i]);\n"
420 " }\n"
421 " }\n"
422 "}\n"
423 ; //spai_align1_block_qr
424 
425 const char * const spai_align1_block_bv_assembly =
426 "void assemble_bv(__global float * g_bv_r, __global float * g_bv, unsigned int col_n){\n"
427 " for(unsigned int i = 0; i < col_n; ++i){\n"
428 " g_bv_r[i] = g_bv[ i];\n"
429 " }\n"
430 "}\n"
431 "void assemble_bv_block(__global float * g_bv_r, __global float * g_bv, unsigned int col_n,\n"
432 " __global float * g_bv_u, unsigned int col_n_u)\n"
433 "{\n"
434 " assemble_bv(g_bv_r, g_bv, col_n);\n"
435 " assemble_bv(g_bv_r + col_n, g_bv_u, col_n_u);\n"
436 " \n"
437 "}\n"
438 "__kernel void block_bv_assembly(__global float * g_bv,\n"
439 " __global unsigned int * start_bv_ind,\n"
440 " __global unsigned int * matrix_dimensions,\n"
441 " __global float * g_bv_u,\n"
442 " __global unsigned int * start_bv_u_ind,\n"
443 " __global unsigned int * matrix_dimensions_u,\n"
444 " __global float * g_bv_r,\n"
445 " __global unsigned int * start_bv_r_ind,\n"
446 " __global unsigned int * matrix_dimensions_r,\n"
447 " __global unsigned int * g_is_update,\n"
448 " //__local float * local_gb,\n"
449 " unsigned int block_elems_num)\n"
450 "{ \n"
451 " for(unsigned int i = get_global_id(0); i < block_elems_num; i += get_global_size(0)){\n"
452 " if((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){\n"
453 " assemble_bv_block(g_bv_r + start_bv_r_ind[i], g_bv + start_bv_ind[i], matrix_dimensions[2*i + 1], g_bv_u + start_bv_u_ind[i], matrix_dimensions_u[2*i + 1]);\n"
454 " }\n"
455 " }\n"
456 "}\n"
457 ; //spai_align1_block_bv_assembly
458 
459 const char * const spai_align1_assemble_blocks =
460 "float get_element(__global const unsigned int * row_indices,\n"
461 " __global const unsigned int * column_indices,\n"
462 " __global const float * elements,\n"
463 " unsigned int row,\n"
464 " unsigned int col\n"
465 " )\n"
466 "{\n"
467 " unsigned int row_end = row_indices[row+1];\n"
468 " for(unsigned int i = row_indices[row]; i < row_end; ++i){\n"
469 " if(column_indices[i] == col)\n"
470 " return elements[i];\n"
471 " if(column_indices[i] > col)\n"
472 " return 0.0;\n"
473 " }\n"
474 " return 0.0; \n"
475 "}\n"
476 "void block_assembly(__global const unsigned int * row_indices,\n"
477 " __global const unsigned int * column_indices, \n"
478 " __global const float * elements,\n"
479 " __global const unsigned int * matrix_dimensions,\n"
480 " __global const unsigned int * set_I,\n"
481 " __global const unsigned int * set_J, \n"
482 " unsigned int matrix_ind,\n"
483 " __global float * com_A_I_J)\n"
484 "{\n"
485 " unsigned int row_n = matrix_dimensions[2*matrix_ind];\n"
486 " unsigned int col_n = matrix_dimensions[2*matrix_ind + 1];\n"
487 " \n"
488 " for(unsigned int i = 0; i < col_n; ++i){\n"
489 " //start row index\n"
490 " for(unsigned int j = 0; j < row_n; j++){\n"
491 " com_A_I_J[ i*row_n + j] = get_element(row_indices, column_indices, elements, set_I[j], set_J[i]);\n"
492 " }\n"
493 " }\n"
494 " \n"
495 "}\n"
496 "__kernel void assemble_blocks(\n"
497 " __global const unsigned int * row_indices,\n"
498 " __global const unsigned int * column_indices, \n"
499 " __global const float * elements,\n"
500 " __global const unsigned int * set_I,\n"
501 " __global const unsigned int * set_J,\n"
502 " __global const unsigned int * i_ind,\n"
503 " __global const unsigned int * j_ind,\n"
504 " __global const unsigned int * block_ind,\n"
505 " __global const unsigned int * matrix_dimensions,\n"
506 " __global float * com_A_I_J,\n"
507 " __global unsigned int * g_is_update,\n"
508 " unsigned int block_elems_num) \n"
509 "{ \n"
510 " for(unsigned int i = get_global_id(0); i < block_elems_num; i += get_global_size(0)){\n"
511 " if((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){\n"
512 " \n"
513 " block_assembly(row_indices, column_indices, elements, matrix_dimensions, set_I + i_ind[i], set_J + j_ind[i], i, com_A_I_J + block_ind[i]);\n"
514 " }\n"
515 " }\n"
516 "}\n"
517 ; //spai_align1_assemble_blocks
518 
519  } //namespace kernels
520  } //namespace linalg
521 } //namespace viennacl
522 #endif