#ifndef VIENNACL_LINALG_DETAIL_SPAI_SPAI_HPP
#define VIENNACL_LINALG_DETAIL_SPAI_SPAI_HPP

/* =========================================================================
   Copyright (c) 2010-2011, Institute for Microelectronics,
                            Institute for Analysis and Scientific Computing,
                            TU Wien.

                            -----------------
                  ViennaCL - The Vienna Computing Library
                            -----------------

   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at

   (A list of authors and contributors can be found in the PDF manual)

   License:         MIT (X11), see file LICENSE in the base directory
============================================================================= */

#include <utility>
#include <iostream>
#include <fstream>
#include <string>
#include <algorithm>
#include <vector>
#include <math.h>
#include <map>

//local includes: SPAI support headers (tag, static/dynamic pattern updates, QR, block containers)
#include "viennacl/linalg/detail/spai/spai_tag.hpp"
#include "viennacl/linalg/qr.hpp"
#include "viennacl/linalg/detail/spai/spai-dynamic.hpp"
#include "viennacl/linalg/detail/spai/spai-static.hpp"
#include "viennacl/linalg/detail/spai/sparse_vector.hpp"
#include "viennacl/linalg/detail/spai/block_matrix.hpp"
#include "viennacl/linalg/detail/spai/block_vector.hpp"

//boost includes
#include "boost/numeric/ublas/vector.hpp"
#include "boost/numeric/ublas/matrix.hpp"
#include "boost/numeric/ublas/matrix_proxy.hpp"
#include "boost/numeric/ublas/vector_proxy.hpp"
#include "boost/numeric/ublas/storage.hpp"
#include "boost/numeric/ublas/io.hpp"
#include "boost/numeric/ublas/lu.hpp"
#include "boost/numeric/ublas/triangular.hpp"
#include "boost/numeric/ublas/matrix_expression.hpp"

// ViennaCL includes
#include "viennacl/linalg/prod.hpp"
#include "viennacl/matrix.hpp"
#include "viennacl/compressed_matrix.hpp"
#include "viennacl/scalar.hpp"
#include "viennacl/linalg/ilu.hpp"
#include "viennacl/ocl/backend.hpp"



//number of columns processed per chunk in computeSPAI()
#define VIENNACL_SPAI_K_b 20

namespace viennacl
{
  namespace linalg
  {
    namespace detail
    {
      namespace spai
      {

        //debug function for printing a sparse vector
        template<typename SparseVectorType>
        void print_sparse_vector(const SparseVectorType& v){
          for(typename SparseVectorType::const_iterator vec_it = v.begin(); vec_it != v.end(); ++vec_it){
            std::cout<<"[ "<<vec_it->first<<" ]:"<<vec_it->second<<std::endl;
          }
        }

        //debug function for printing a dense matrix, one column per output line
        template<typename DenseMatrixType>
        void print_matrix(const DenseMatrixType& m){
          for(std::size_t i = 0; i < m.size2(); ++i){
            for(std::size_t j = 0; j < m.size1(); ++j){
              std::cout<<m(j, i)<<" ";
            }
            std::cout<<std::endl;
          }
        }

        /** Sparse axpy: computes res_v += b * v for sparse vectors */
        template<typename SparseVectorType, typename ScalarType>
        void add_sparse_vectors(const SparseVectorType& v, const ScalarType b, SparseVectorType& res_v){
          for(typename SparseVectorType::const_iterator v_it = v.begin(); v_it != v.end(); ++v_it){
            res_v[v_it->first] += b*v_it->second;
          }
        }

        //sparse matrix - sparse vector product; accumulates res = A*v - e_ind
        template<typename SparseVectorType, typename ScalarType>
        void compute_spai_residual(const std::vector<SparseVectorType>& A_v_c, const SparseVectorType& v,
                                   const unsigned int ind, SparseVectorType& res){
          for(typename SparseVectorType::const_iterator v_it = v.begin(); v_it != v.end(); ++v_it){
            add_sparse_vectors(A_v_c[v_it->first], v_it->second, res);
          }
          res[ind] -= static_cast<ScalarType>(1);
        }
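
        // For column k of M (k == ind), the loop above forms A*v as the sparse
        // linear combination sum_j v[j] * A[:,j] over the stored entries of v;
        // subtracting 1 at position ind then yields the SPAI residual r = A*v - e_k.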

        /** Composition of the index sets I and J for the least-squares block */
        template<typename SparseVectorType>
        void build_index_set(const std::vector<SparseVectorType>& A_v_c, const SparseVectorType& v, std::vector<unsigned int>& J,
                             std::vector<unsigned int>& I){
          buildColumnIndexSet(v, J);
          projectRows(A_v_c, J, I);
        }
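
        // J collects the nonzero positions of the current column of M; I is the
        // set of rows of A that carry a nonzero in at least one of the columns in J,
        // so the dense subproblem assembled from (I, J) is exactly A(I, J).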

        /************************************************** CPU BLOCK SET UP ***************************************/
        template<typename SparseMatrixType, typename DenseMatrixType, typename SparseVectorType, typename VectorType>
        void block_set_up(const SparseMatrixType& A,
                          const std::vector<SparseVectorType>& A_v_c,
                          const std::vector<SparseVectorType>& M_v,
                          std::vector<std::vector<unsigned int> >& g_I,
                          std::vector<std::vector<unsigned int> >& g_J,
                          std::vector<DenseMatrixType>& g_A_I_J,
                          std::vector<VectorType>& g_b_v){
#ifdef _OPENMP
          #pragma omp parallel for
#endif
          for(std::size_t i = 0; i < M_v.size(); ++i){
            build_index_set(A_v_c, M_v[i], g_J[i], g_I[i]);
            initProjectSubMatrix(A, g_J[i], g_I[i], g_A_I_J[i]);
            //print_matrix(g_A_I_J[i]);
            single_qr(g_A_I_J[i], g_b_v[i]);
            //print_matrix(g_A_I_J[i]);
          }
        }
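
        // After single_qr(), g_A_I_J[i] holds the QR factorization of the dense
        // block A(I_i, J_i) in factored (in-place) form, with the auxiliary
        // coefficients in g_b_v[i]; apply_q_trans_vec() and backwardSolve() in the
        // CPU least-squares stage below consume exactly this data.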

        /** Setup of the index sets I and J for all blocks */
        template<typename SparseVectorType>
        void index_set_up(const std::vector<SparseVectorType>& A_v_c,
                          const std::vector<SparseVectorType>& M_v,
                          std::vector<std::vector<unsigned int> >& g_J,
                          std::vector<std::vector<unsigned int> >& g_I)
        {
#ifdef _OPENMP
          #pragma omp parallel for
#endif
          for(std::size_t i = 0; i < M_v.size(); ++i){
            build_index_set(A_v_c, M_v[i], g_J[i], g_I[i]);
          }
        }

        /************************************************** GPU BLOCK SET UP ***************************************/
        template<typename ScalarType, unsigned int MAT_ALIGNMENT, typename SparseVectorType>
        void block_set_up(const viennacl::compressed_matrix<ScalarType, MAT_ALIGNMENT>& A,
                          const std::vector<SparseVectorType>& A_v_c,
                          const std::vector<SparseVectorType>& M_v,
                          std::vector<cl_uint> g_is_update,
                          std::vector<std::vector<unsigned int> >& g_I,
                          std::vector<std::vector<unsigned int> >& g_J,
                          block_matrix & g_A_I_J,
                          block_vector & g_bv,
                          const unsigned int cur_iter)
        {
          bool is_empty_block;
          //build the index sets on the CPU
          index_set_up(A_v_c, M_v, g_J, g_I);
          //assemble and QR-factorize all blocks on the GPU
          block_assembly(A, g_J, g_I, g_A_I_J, g_is_update, is_empty_block, cur_iter);
          block_qr<ScalarType>(g_I, g_J, g_A_I_J, g_bv, g_is_update, cur_iter);
        }
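
        // In contrast to the CPU path, the blocks are assembled into one
        // contiguous buffer by a single kernel launch (block_assembly) and then
        // factorized in a batched QR sweep (block_qr), avoiding per-block
        // host-device round trips.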


        /***************************************************************************************************/
        /******************************** SOLVING LS PROBLEMS ON GPU **************************************/
        /***************************************************************************************************/

        /** Scatters the flat GPU result vector back into the sparse column m: entries
            m_in[start_m_ind .. start_m_ind + J.size() - 1] are written to the positions listed in J */
        template<typename ScalarType, typename SparseVectorType>
        void custom_fan_out(const std::vector<ScalarType> & m_in,
                            unsigned int start_m_ind,
                            const std::vector<unsigned int> & J,
                            SparseVectorType & m)
        {
          unsigned int cnt = 0;
          for (std::size_t i = 0; i < J.size(); ++i) {
            m[J[i]] = m_in[start_m_ind + cnt++];
          }
        }
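
        // Example: with J = {2, 5, 7} and start_m_ind = 4 this writes
        //   m[2] = m_in[4];  m[5] = m_in[5];  m[7] = m_in[6];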


        //GPU-based least squares problem
        template<typename SparseVectorType, typename ScalarType>
        void least_square_solve(std::vector<SparseVectorType> & A_v_c,
                                std::vector<SparseVectorType> & M_v,
                                std::vector<std::vector<unsigned int> >& g_I,
                                std::vector<std::vector<unsigned int> > & g_J,
                                block_matrix & g_A_I_J_vcl,
                                block_vector & g_bv_vcl,
                                std::vector<SparseVectorType> & g_res,
                                std::vector<cl_uint> & g_is_update,
                                const spai_tag & tag,
                                const unsigned int cur_iter){
          unsigned int y_sz, m_sz;
          std::vector<cl_uint> y_inds(M_v.size() + 1, static_cast<cl_uint>(0));
          std::vector<cl_uint> m_inds(M_v.size() + 1, static_cast<cl_uint>(0));
          get_size(g_I, y_sz);
          init_start_inds(g_I, y_inds);
          init_start_inds(g_J, m_inds);
          //create y_v: concatenation of the right-hand sides e_i restricted to the sets I_i
          std::vector<ScalarType> y_v(y_sz, static_cast<ScalarType>(0));
          for(std::size_t i = 0; i < M_v.size(); ++i){
            for(std::size_t j = 0; j < g_I[i].size(); ++j){
              if(g_I[i][j] == i)
                y_v[y_inds[i] + j] = static_cast<ScalarType>(1.0);
            }
          }
          //compute m_v
          get_size(g_J, m_sz);
          std::vector<ScalarType> m_v(m_sz, static_cast<ScalarType>(0));

          //acquire kernel
          /*if(cur_iter == 0){
            std::string ls_kernel_file_name = "kernels/spai/ls_g.cl";
            std::string ls_kernel_source;
            read_kernel_from_file(ls_kernel_file_name, ls_kernel_source);
            //compilation of a kernel
            viennacl::ocl::program & ls_prog = viennacl::ocl::current_context().add_program(ls_kernel_source.c_str(), "ls_kernel_source");
            //least squares kernel
            ls_prog.add_kernel("block_least_squares");
          }*/
          block_vector y_v_vcl;
          block_vector m_v_vcl;
          //preparing memory for the least squares problem on the GPU
          y_v_vcl.handle() = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE,
                                                                            static_cast<unsigned int>(sizeof(ScalarType)*y_v.size()),
                                                                            &(y_v[0]));
          m_v_vcl.handle() = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE,
                                                                            static_cast<unsigned int>(sizeof(ScalarType)*m_v.size()),
                                                                            &(m_v[0]));
          y_v_vcl.handle1() = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE,
                                                                             static_cast<unsigned int>(sizeof(cl_uint)*(g_I.size() + 1)),
                                                                             &(y_inds[0]));
          viennacl::ocl::handle<cl_mem> g_is_update_vcl = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE,
                                                                                                         static_cast<unsigned int>(sizeof(cl_uint)*(g_is_update.size())),
                                                                                                         &(g_is_update[0]));
          viennacl::ocl::kernel& ls_kernel = viennacl::ocl::get_kernel(viennacl::linalg::kernels::spai<ScalarType, 1>::program_name(), "block_least_squares");
          ls_kernel.local_work_size(0, 1);
          ls_kernel.global_work_size(0, 256);
          viennacl::ocl::enqueue(ls_kernel(g_A_I_J_vcl.handle(), g_A_I_J_vcl.handle2(), g_bv_vcl.handle(), g_bv_vcl.handle1(), m_v_vcl.handle(),
                                           y_v_vcl.handle(), y_v_vcl.handle1(),
                                           g_A_I_J_vcl.handle1(), g_is_update_vcl,
                                           //viennacl::ocl::local_mem(static_cast<unsigned int>(sizeof(ScalarType)*(local_r_n*local_c_n))),
                                           static_cast<unsigned int>(M_v.size())));
          //copy vector m_v back from GPU to CPU
          cl_int vcl_err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle(),
                                               m_v_vcl.handle(), CL_TRUE, 0,
                                               sizeof(ScalarType)*(m_v.size()),
                                               &(m_v[0]), 0, NULL, NULL);
          VIENNACL_ERR_CHECK(vcl_err);
          //fan out the result vector
          //#pragma omp parallel for
          for(std::size_t i = 0; i < M_v.size(); ++i){
            if(g_is_update[i]){
              //fan out onto the sparse column of M
              custom_fan_out(m_v, m_inds[i], g_J[i], M_v[i]);
              g_res[i].clear();
              compute_spai_residual<SparseVectorType, ScalarType>(A_v_c, M_v[i], static_cast<unsigned int>(i), g_res[i]);
              ScalarType res_norm = 0;
              //compute the norm of the residual to check whether this column is accurate enough
              sparse_norm_2(g_res[i], res_norm);
              //std::cout<<"Residual norm of column #: "<<i<<std::endl;
              //std::cout<<res_norm<<std::endl;
              //std::cout<<"************************"<<std::endl;
              g_is_update[i] = ((res_norm > tag.getResidualNormThreshold()) && !tag.getIsStatic()) ? 1 : 0;
            }
          }
        }
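
        // Launch configuration note: the kernel is enqueued with a global work
        // size of 256 and a local work size of 1, while the block count
        // M_v.size() is passed as an argument, so the kernel presumably strides
        // the available work items over the blocks. The blocking read
        // (CL_TRUE) afterwards also serves as the synchronization point before
        // the residuals are evaluated on the host.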

        //CPU-based least squares problems
        template<typename SparseVectorType, typename DenseMatrixType, typename VectorType>
        void least_square_solve(const std::vector<SparseVectorType>& A_v_c,
                                std::vector<DenseMatrixType>& g_R,
                                std::vector<VectorType>& g_b_v,
                                std::vector<std::vector<unsigned int> >& g_I,
                                std::vector<std::vector<unsigned int> >& g_J,
                                std::vector<SparseVectorType>& g_res,
                                std::vector<bool>& g_is_update,
                                std::vector<SparseVectorType>& M_v,
                                const spai_tag& tag){
          typedef typename DenseMatrixType::value_type ScalarType;
#ifdef _OPENMP
          #pragma omp parallel for
#endif
          for(std::size_t i = 0; i < M_v.size(); ++i){
            if(g_is_update[i]){
              //right-hand side: the unit vector e_k restricted to the row index set I_i
              VectorType y = boost::numeric::ublas::zero_vector<ScalarType>(g_I[i].size());
              projectI<VectorType, ScalarType>(g_I[i], y, static_cast<unsigned int>(tag.getBegInd() + i));
              //y <- Q^T y, then solve the triangular system R * m_new = y
              apply_q_trans_vec(g_R[i], g_b_v[i], y);
              VectorType m_new = boost::numeric::ublas::zero_vector<ScalarType>(g_R[i].size2());
              backwardSolve(g_R[i], y, m_new);
              fanOutVector(m_new, g_J[i], M_v[i]);
              g_res[i].clear();
              compute_spai_residual<SparseVectorType, ScalarType>(A_v_c, M_v[i], static_cast<unsigned int>(tag.getBegInd() + i), g_res[i]);
              ScalarType res_norm = 0;
              sparse_norm_2(g_res[i], res_norm);
              //std::cout<<"Residual norm of column #: "<<i<<std::endl;
              //std::cout<<res_norm<<std::endl;
              //std::cout<<"************************"<<std::endl;
              g_is_update[i] = (res_norm > tag.getResidualNormThreshold()) && !tag.getIsStatic();
            }
          }
        }
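
        // Each column solves min || A(I_i, J_i) * m - e_k(I_i) ||_2 with k = getBegInd() + i:
        // apply_q_trans_vec() forms Q^T * e_k from the stored QR data, and
        // backwardSolve() back-substitutes with the triangular factor R.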

        //************************************ UPDATE CHECK ***************************************************//
        //returns true as long as at least one column is still flagged for an update
        //(an "any" check, despite the name)
        template<typename VectorType>
        bool is_all_update(VectorType& parallel_is_update){
          for(std::size_t i = 0; i < parallel_is_update.size(); ++i){
            if(parallel_is_update[i])
              return true;
          }
          return false;
        }
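
        // is_all_update() serves as the termination criterion of the outer loops in
        // computeSPAI() below: iteration continues while any column's residual norm
        // still exceeds tag.getResidualNormThreshold().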

        //********************************** MATRIX VECTORIZATION ***********************************************//
        //matrix vectorization, column-based approach: M_v[j] collects column j of M_in
        template<typename SparseMatrixType, typename SparseVectorType>
        void vectorize_column_matrix(const SparseMatrixType& M_in, std::vector<SparseVectorType>& M_v){
          for(typename SparseMatrixType::const_iterator1 row_it = M_in.begin1(); row_it != M_in.end1(); ++row_it){
            for(typename SparseMatrixType::const_iterator2 col_it = row_it.begin(); col_it != row_it.end(); ++col_it){
              M_v[static_cast<unsigned int>(col_it.index2())][static_cast<unsigned int>(col_it.index1())] = *col_it;
            }
          }
        }
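
        // Example: for M_in = [[a, 0], [b, c]] the result is
        //   M_v[0] = { 0:a, 1:b }   and   M_v[1] = { 1:c },
        // i.e. each sparse vector maps row index -> value for one column.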

        //matrix vectorization, row-based approach
        template<typename SparseMatrixType, typename SparseVectorType>
        void vectorize_row_matrix(const SparseMatrixType& M_in, std::vector<SparseVectorType>& M_v){
          for(typename SparseMatrixType::const_iterator1 row_it = M_in.begin1(); row_it != M_in.end1(); ++row_it){
            for(typename SparseMatrixType::const_iterator2 col_it = row_it.begin(); col_it != row_it.end(); ++col_it){
              M_v[static_cast<unsigned int>(col_it.index1())][static_cast<unsigned int>(col_it.index2())] = *col_it;
            }
          }
        }

        //************************************* BLOCK ASSEMBLY CODE *********************************************//

        //flattens a set of index sets into one contiguous array
        inline void write_set_to_array(const std::vector<std::vector<unsigned int> >& ind_set, std::vector<cl_uint>& a){
          unsigned int cnt = 0;
          for(size_t i = 0; i < ind_set.size(); ++i){
            for(size_t j = 0; j < ind_set[i].size(); ++j){
              a[cnt++] = static_cast<cl_uint>(ind_set[i][j]);
            }
          }
        }


        //assembling the blocks on the GPU
        template<typename ScalarType, unsigned int MAT_ALIGNMENT>
        void block_assembly(const viennacl::compressed_matrix<ScalarType, MAT_ALIGNMENT>& A, const std::vector<std::vector<unsigned int> >& g_J,
                            const std::vector<std::vector<unsigned int> >& g_I,
                            block_matrix& g_A_I_J_vcl,
                            std::vector<cl_uint>& g_is_update,
                            bool& is_empty_block,
                            const unsigned int cur_iter){
          //computing start indices for the index sets and for the block matrices
          unsigned int sz_I, sz_J, sz_blocks;
          std::vector<cl_uint> matrix_dims(g_I.size()*2, static_cast<cl_uint>(0));
          std::vector<cl_uint> i_ind(g_I.size() + 1, static_cast<cl_uint>(0));
          std::vector<cl_uint> j_ind(g_I.size() + 1, static_cast<cl_uint>(0));
          std::vector<cl_uint> blocks_ind(g_I.size() + 1, static_cast<cl_uint>(0));
          init_start_inds(g_J, j_ind);
          init_start_inds(g_I, i_ind);
          get_size(g_J, sz_J);
          get_size(g_I, sz_I);
          std::vector<cl_uint> I_set(sz_I, static_cast<cl_uint>(0));
          std::vector<cl_uint> J_set(sz_J, static_cast<cl_uint>(0));
          //writing the index sets to flat arrays
          write_set_to_array(g_I, I_set);
          write_set_to_array(g_J, J_set);
          //proceed only if there are blocks to assemble
          if(I_set.size() > 0 && J_set.size() > 0){
            compute_blocks_size(g_I, g_J, sz_blocks, blocks_ind, matrix_dims);
            std::vector<ScalarType> con_A_I_J(sz_blocks, static_cast<ScalarType>(0));

            /*if(cur_iter == 0){
              std::string block_asm_file_name = "kernels/spai/block_assembly_g.cl";
              std::string block_asm_source;
              read_kernel_from_file(block_asm_file_name, block_asm_source);
              viennacl::ocl::program & block_asm_prog = viennacl::ocl::current_context().add_program(block_asm_source.c_str(),
                                                                                                     "block_assembly_kernel_source");
              block_asm_prog.add_kernel("assemble_blocks");
            }*/
            block_vector set_I_vcl, set_J_vcl;
            //init memory on the GPU
            //contiguous g_A_I_J
            g_A_I_J_vcl.handle() = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE,
                                                                                  static_cast<unsigned int>(sizeof(ScalarType)*(sz_blocks)),
                                                                                  &(con_A_I_J[0]));
            //matrix dimensions
            g_A_I_J_vcl.handle1() = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE,
                                                                                   static_cast<unsigned int>(sizeof(cl_uint)*2*static_cast<cl_uint>(g_I.size())),
                                                                                   &(matrix_dims[0]));
            //start indices of the blocks
            g_A_I_J_vcl.handle2() = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE,
                                                                                   static_cast<unsigned int>(sizeof(cl_uint)*(g_I.size() + 1)),
                                                                                   &(blocks_ind[0]));
            //set_I
            set_I_vcl.handle() = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE,
                                                                                static_cast<unsigned int>(sizeof(cl_uint)*sz_I),
                                                                                &(I_set[0]));
            //set_J
            set_J_vcl.handle() = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE,
                                                                                static_cast<unsigned int>(sizeof(cl_uint)*sz_J),
                                                                                &(J_set[0]));
            //i_ind
            set_I_vcl.handle1() = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE,
                                                                                 static_cast<unsigned int>(sizeof(cl_uint)*(g_I.size() + 1)),
                                                                                 &(i_ind[0]));
            //j_ind
            set_J_vcl.handle1() = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE,
                                                                                 static_cast<unsigned int>(sizeof(cl_uint)*(g_I.size() + 1)),
                                                                                 &(j_ind[0]));

            viennacl::ocl::handle<cl_mem> g_is_update_vcl = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE,
                                                                                                           static_cast<unsigned int>(sizeof(cl_uint)*g_is_update.size()),
                                                                                                           &(g_is_update[0]));
            viennacl::ocl::kernel& assembly_kernel = viennacl::ocl::get_kernel(viennacl::linalg::kernels::spai<ScalarType, 1>::program_name(), "assemble_blocks");
            assembly_kernel.local_work_size(0, 1);
            assembly_kernel.global_work_size(0, 256);
            viennacl::ocl::enqueue(assembly_kernel(A.handle1(), A.handle2(), A.handle(),
                                                   set_I_vcl.handle(), set_J_vcl.handle(), set_I_vcl.handle1(),
                                                   set_J_vcl.handle1(),
                                                   g_A_I_J_vcl.handle2(), g_A_I_J_vcl.handle1(), g_A_I_J_vcl.handle(),
                                                   g_is_update_vcl,
                                                   static_cast<unsigned int>(g_I.size())));
            is_empty_block = false;
          }else{
            is_empty_block = true;
          }
        }
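
        // Storage layout produced here: all dense blocks A(I_i, J_i) live back to
        // back in one buffer (handle()), blocks_ind (handle2()) gives the start
        // offset of block i, and matrix_dims (handle1()) stores its row and column
        // counts as consecutive cl_uint pairs.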

        /************************************************************************************************************************/

        /** Inserts the columns of M_v into the sparse matrix M; for a left (transposed)
            preconditioner, the roles of row and column index are swapped */
        template<typename SparseMatrixType, typename SparseVectorType>
        void insert_sparse_columns(const std::vector<SparseVectorType>& M_v,
                                   SparseMatrixType& M,
                                   bool is_right){
          if (is_right)
          {
            for(unsigned int i = 0; i < M_v.size(); ++i){
              for(typename SparseVectorType::const_iterator vec_it = M_v[i].begin(); vec_it != M_v[i].end(); ++vec_it){
                M(vec_it->first, i) = vec_it->second;
              }
            }
          }
          else //transposed fill of M
          {
            for(unsigned int i = 0; i < M_v.size(); ++i){
              for(typename SparseVectorType::const_iterator vec_it = M_v[i].begin(); vec_it != M_v[i].end(); ++vec_it){
                M(i, vec_it->first) = vec_it->second;
              }
            }
          }
        }

        /** Transposes a sparse matrix: A = A_in^T */
        template<typename MatrixType>
        void sparse_transpose(const MatrixType& A_in, MatrixType& A){
          typedef typename MatrixType::value_type ScalarType;
          std::vector<std::map<size_t, ScalarType> > temp_A(A_in.size2());
          A.resize(A_in.size2(), A_in.size1(), false);

          for (typename MatrixType::const_iterator1 row_it = A_in.begin1();
               row_it != A_in.end1();
               ++row_it)
          {
            for (typename MatrixType::const_iterator2 col_it = row_it.begin();
                 col_it != row_it.end();
                 ++col_it)
            {
              temp_A[col_it.index2()][col_it.index1()] = *col_it;
            }
          }

          for (size_t i = 0; i < temp_A.size(); ++i)
          {
            for (typename std::map<size_t, ScalarType>::const_iterator it = temp_A[i].begin();
                 it != temp_A[i].end();
                 ++it)
              A(i, it->first) = it->second;
          }
        }
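
        // The intermediate std::map stage sorts each transposed row by column
        // index, so the final writes into A occur in lexicographic order, which
        // keeps insertion into a uBLAS compressed_matrix cheap.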




        // template<typename SparseVectorType>
        // void custom_copy(std::vector<SparseVectorType> & M_v, std::vector<SparseVectorType> & l_M_v, const unsigned int beg_ind){
        //   for(int i = 0; i < l_M_v.size(); ++i){
        //     l_M_v[i] = M_v[i + beg_ind];
        //   }
        // }

        //CPU version
        /** Computes the SPAI preconditioner M for the sparse matrix A, processing
            VIENNACL_SPAI_K_b columns per chunk */
        template <typename MatrixType>
        void computeSPAI(const MatrixType & A, MatrixType & M, spai_tag & tag){
          typedef typename MatrixType::value_type ScalarType;
          typedef typename boost::numeric::ublas::vector<ScalarType> VectorType;
          typedef typename viennacl::linalg::detail::spai::sparse_vector<ScalarType> SparseVectorType;
          typedef typename boost::numeric::ublas::matrix<ScalarType> DenseMatrixType;
          unsigned int cur_iter = 0;
          bool go_on = true;
          //vectorization of the matrices
          std::vector<SparseVectorType> A_v_c(M.size2());
          std::vector<SparseVectorType> M_v(M.size2());
          vectorize_column_matrix(A, A_v_c);
          vectorize_column_matrix(M, M_v);

          while(go_on){
            go_on = (tag.getEndInd() < static_cast<long>(M.size2()));
            cur_iter = 0;
            unsigned int l_sz = tag.getEndInd() - tag.getBegInd();
            std::vector<bool> g_is_update(l_sz, true);
            //local copy of the columns of the current chunk
            std::vector<SparseVectorType> l_M_v(l_sz);
            //custom_copy(M_v, l_M_v, beg_ind);
            std::copy(M_v.begin() + tag.getBegInd(), M_v.begin() + tag.getEndInd(), l_M_v.begin());
            //working blocks
            std::vector<DenseMatrixType> g_A_I_J(l_sz);
            std::vector<VectorType> g_b_v(l_sz);
            std::vector<SparseVectorType> g_res(l_sz);
            std::vector<std::vector<unsigned int> > g_I(l_sz);
            std::vector<std::vector<unsigned int> > g_J(l_sz);
            while((cur_iter < tag.getIterationLimit()) && is_all_update(g_is_update)){
              //PHASE ONE: set up the blocks
              if(cur_iter == 0) block_set_up(A, A_v_c, l_M_v, g_I, g_J, g_A_I_J, g_b_v);
              else block_update(A, A_v_c, g_res, g_is_update, g_I, g_J, g_b_v, g_A_I_J, tag);

              //PHASE TWO: least squares solution
              least_square_solve(A_v_c, g_A_I_J, g_b_v, g_I, g_J, g_res, g_is_update, l_M_v, tag);

              if(tag.getIsStatic()) break;
              cur_iter++;
            }
            std::copy(l_M_v.begin(), l_M_v.end(), M_v.begin() + tag.getBegInd());
            //advance to the next chunk of columns
            tag.setBegInd(tag.getEndInd());
            tag.setEndInd(std::min(static_cast<long>(tag.getBegInd() + VIENNACL_SPAI_K_b), static_cast<long>(M.size2())));
          }
          M.resize(M.size1(), M.size2(), false);
          insert_sparse_columns(M_v, M, tag.getIsRight());
        }
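
        // Usage sketch for the CPU path (illustrative only; in practice spai_tag's
        // defaults and the seeding of M are handled by the higher-level SPAI
        // preconditioner wrappers):
        //
        //   boost::numeric::ublas::compressed_matrix<double> A(n, n);
        //   /* ... fill A ... */
        //   boost::numeric::ublas::compressed_matrix<double> M(n, n);
        //   for (std::size_t i = 0; i < n; ++i) M(i, i) = 1.0;  //initial pattern
        //   viennacl::linalg::detail::spai::spai_tag tag;
        //   viennacl::linalg::detail::spai::computeSPAI(A, M, tag);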


        //GPU-based version
        /** Computes the SPAI preconditioner on the GPU; cpu_A and cpu_M are the host
            copies used for the index computations, M receives the final result */
        template <typename ScalarType, unsigned int MAT_ALIGNMENT>
        void computeSPAI(const viennacl::compressed_matrix<ScalarType, MAT_ALIGNMENT>& A,
                         const boost::numeric::ublas::compressed_matrix<ScalarType>& cpu_A,
                         boost::numeric::ublas::compressed_matrix<ScalarType>& cpu_M, //output
                         viennacl::compressed_matrix<ScalarType, MAT_ALIGNMENT>& M,
                         const spai_tag& tag){
          typedef typename boost::numeric::ublas::vector<ScalarType> VectorType;
          typedef typename viennacl::linalg::detail::spai::sparse_vector<ScalarType> SparseVectorType;
          typedef typename boost::numeric::ublas::matrix<ScalarType> DenseMatrixType;
          typedef typename boost::numeric::ublas::compressed_matrix<ScalarType> CPUMatrixType;
          unsigned int cur_iter = 0;
          std::vector<cl_uint> g_is_update(cpu_M.size2(), static_cast<cl_uint>(1));
          std::vector<SparseVectorType> A_v_c(cpu_M.size2());
          std::vector<SparseVectorType> M_v(cpu_M.size2());
          vectorize_column_matrix(cpu_A, A_v_c);
          vectorize_column_matrix(cpu_M, M_v);
          std::vector<SparseVectorType> g_res(cpu_M.size2());
          std::vector<std::vector<unsigned int> > g_I(cpu_M.size2());
          std::vector<std::vector<unsigned int> > g_J(cpu_M.size2());

          //OpenCL variables
          block_matrix g_A_I_J_vcl;
          block_vector g_bv_vcl;
          while((cur_iter < tag.getIterationLimit()) && is_all_update(g_is_update)){
            //PHASE ONE: set up the blocks (index sets on the CPU, assembly and QR on the GPU)
            if(cur_iter == 0) block_set_up(A, A_v_c, M_v, g_is_update, g_I, g_J, g_A_I_J_vcl, g_bv_vcl, cur_iter);
            else block_update(A, A_v_c, g_is_update, g_res, g_J, g_I, g_A_I_J_vcl, g_bv_vcl, tag, cur_iter);

            //PHASE TWO: solve the least squares problems
            least_square_solve<SparseVectorType, ScalarType>(A_v_c, M_v, g_I, g_J, g_A_I_J_vcl, g_bv_vcl, g_res, g_is_update, tag, cur_iter);
            if(tag.getIsStatic()) break;
            cur_iter++;
          }
          cpu_M.resize(cpu_M.size1(), cpu_M.size2(), false);
          insert_sparse_columns(M_v, cpu_M, tag.getIsRight());
          //copy the result back to the GPU
          M.resize(static_cast<unsigned int>(cpu_M.size1()), static_cast<unsigned int>(cpu_M.size2()));
          viennacl::copy(cpu_M, M);
        }
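
        // Usage sketch for the GPU path (illustrative only): both a device and a
        // host copy of A are needed, since the index sets are computed on the host:
        //
        //   boost::numeric::ublas::compressed_matrix<double> cpu_A(n, n), cpu_M(n, n);
        //   /* ... fill cpu_A, seed cpu_M (e.g. with the identity pattern) ... */
        //   viennacl::compressed_matrix<double> A(n, n), M(n, n);
        //   viennacl::copy(cpu_A, A);
        //   viennacl::linalg::detail::spai::spai_tag tag;
        //   viennacl::linalg::detail::spai::computeSPAI(A, cpu_A, cpu_M, M, tag);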

      } //namespace spai
    } //namespace detail
  } //namespace linalg
} //namespace viennacl
#endif