libstdc++
random_shuffle.h
Go to the documentation of this file.
1 // -*- C++ -*-
2 
3 // Copyright (C) 2007, 2008, 2009 Free Software Foundation, Inc.
4 //
5 // This file is part of the GNU ISO C++ Library. This library is free
6 // software; you can redistribute it and/or modify it under the terms
7 // of the GNU General Public License as published by the Free Software
8 // Foundation; either version 3, or (at your option) any later
9 // version.
10 
11 // This library is distributed in the hope that it will be useful, but
12 // WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 // General Public License for more details.
15 
16 // Under Section 7 of GPL version 3, you are granted additional
17 // permissions described in the GCC Runtime Library Exception, version
18 // 3.1, as published by the Free Software Foundation.
19 
20 // You should have received a copy of the GNU General Public License and
21 // a copy of the GCC Runtime Library Exception along with this program;
22 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23 // <http://www.gnu.org/licenses/>.
24 
25 /** @file parallel/random_shuffle.h
26  * @brief Parallel implementation of std::random_shuffle().
27  * This file is a GNU parallel extension to the Standard C++ Library.
28  */
29 
30 // Written by Johannes Singler.
31 
32 #ifndef _GLIBCXX_PARALLEL_RANDOM_SHUFFLE_H
33 #define _GLIBCXX_PARALLEL_RANDOM_SHUFFLE_H 1
34 
35 #include <limits>
36 #include <bits/stl_numeric.h>
37 #include <parallel/parallel.h>
38 #include <parallel/random_number.h>
39 
40 namespace __gnu_parallel
41 {
42 /** @brief Type to hold the index of a bin.
43  *
44  * Since many variables of this type are allocated, it should be
45  * chosen as small as possible.
46  */
47 typedef unsigned short bin_index;
48 
49 /** @brief Data known to every thread participating in
50  __gnu_parallel::parallel_random_shuffle(). */
51 template<typename RandomAccessIterator>
53  {
55  typedef typename traits_type::value_type value_type;
56  typedef typename traits_type::difference_type difference_type;
57 
58  /** @brief Begin iterator of the source. */
59  RandomAccessIterator& source;
60 
61  /** @brief Temporary arrays for each thread. */
62  value_type** temporaries;
63 
64  /** @brief Two-dimensional array to hold the thread-bin distribution.
65  *
66  * Dimensions (num_threads + 1) x (num_bins + 1). */
67  difference_type** dist;
68 
69  /** @brief Start indexes of the threads' chunks. */
70  difference_type* starts;
71 
72  /** @brief Number of the thread that will further process the
73  corresponding bin. */
75 
76  /** @brief Number of bins to distribute to. */
77  int num_bins;
78 
79  /** @brief Number of bits needed to address the bins. */
80  int num_bits;
81 
82  /** @brief Constructor. */
83  DRandomShufflingGlobalData(RandomAccessIterator& _source)
84  : source(_source) { }
85  };
86 
87 /** @brief Local data for a thread participating in
88  __gnu_parallel::parallel_random_shuffle().
89  */
90 template<typename RandomAccessIterator, typename RandomNumberGenerator>
91  struct DRSSorterPU
92  {
93  /** @brief Number of threads participating in total. */
95 
96  /** @brief Begin index for bins taken care of by this thread. */
98 
99  /** @brief End index for bins taken care of by this thread. */
101 
102  /** @brief Random seed for this thread. */
104 
105  /** @brief Pointer to global data. */
107  };
108 
109 /** @brief Generate a random number in @c [0,2^logp).
110  * @param logp Logarithm (basis 2) of the upper range bound.
111  * @param rng Random number generator to use.
112  */
113 template<typename RandomNumberGenerator>
114  inline int
115  random_number_pow2(int logp, RandomNumberGenerator& rng)
116  { return rng.genrand_bits(logp); }
117 
118 /** @brief Random shuffle code executed by each thread.
119  * @param pus Array of thread-local data records. */
120 template<typename RandomAccessIterator, typename RandomNumberGenerator>
121  void
123  RandomNumberGenerator>* pus)
124  {
125  typedef std::iterator_traits<RandomAccessIterator> traits_type;
126  typedef typename traits_type::value_type value_type;
127  typedef typename traits_type::difference_type difference_type;
128 
129  thread_index_t iam = omp_get_thread_num();
132 
133  // Indexing: dist[bin][processor]
134  difference_type length = sd->starts[iam + 1] - sd->starts[iam];
135  bin_index* oracles = new bin_index[length];
136  difference_type* dist = new difference_type[sd->num_bins + 1];
137  bin_index* bin_proc = new bin_index[sd->num_bins];
138  value_type** temporaries = new value_type*[d->num_threads];
139 
140  // Compute oracles and count appearances.
141  for (bin_index b = 0; b < sd->num_bins + 1; ++b)
142  dist[b] = 0;
143  int num_bits = sd->num_bits;
144 
145  random_number rng(d->seed);
146 
147  // First main loop.
148  for (difference_type i = 0; i < length; ++i)
149  {
150  bin_index oracle = random_number_pow2(num_bits, rng);
151  oracles[i] = oracle;
152 
153  // To allow prefix (partial) sum.
154  ++(dist[oracle + 1]);
155  }
156 
157  for (bin_index b = 0; b < sd->num_bins + 1; ++b)
158  sd->dist[b][iam + 1] = dist[b];
159 
160 # pragma omp barrier
161 
162 # pragma omp single
163  {
164  // Sum up bins, sd->dist[s + 1][d->num_threads] now contains the
165  // total number of items in bin s
166  for (bin_index s = 0; s < sd->num_bins; ++s)
167  __gnu_sequential::partial_sum(sd->dist[s + 1],
168  sd->dist[s + 1] + d->num_threads + 1,
169  sd->dist[s + 1]);
170  }
171 
172 # pragma omp barrier
173 
174  sequence_index_t offset = 0, global_offset = 0;
175  for (bin_index s = 0; s < d->bins_begin; ++s)
176  global_offset += sd->dist[s + 1][d->num_threads];
177 
178 # pragma omp barrier
179 
180  for (bin_index s = d->bins_begin; s < d->bins_end; ++s)
181  {
182  for (int t = 0; t < d->num_threads + 1; ++t)
183  sd->dist[s + 1][t] += offset;
184  offset = sd->dist[s + 1][d->num_threads];
185  }
186 
187  sd->temporaries[iam] = static_cast<value_type*>(
188  ::operator new(sizeof(value_type) * offset));
189 
190 # pragma omp barrier
191 
192  // Draw local copies to avoid false sharing.
193  for (bin_index b = 0; b < sd->num_bins + 1; ++b)
194  dist[b] = sd->dist[b][iam];
195  for (bin_index b = 0; b < sd->num_bins; ++b)
196  bin_proc[b] = sd->bin_proc[b];
197  for (thread_index_t t = 0; t < d->num_threads; ++t)
198  temporaries[t] = sd->temporaries[t];
199 
200  RandomAccessIterator source = sd->source;
201  difference_type start = sd->starts[iam];
202 
203  // Distribute according to oracles, second main loop.
204  for (difference_type i = 0; i < length; ++i)
205  {
206  bin_index target_bin = oracles[i];
207  thread_index_t target_p = bin_proc[target_bin];
208 
209  // Last column [d->num_threads] stays unchanged.
210  ::new(&(temporaries[target_p][dist[target_bin + 1]++]))
211  value_type(*(source + i + start));
212  }
213 
214  delete[] oracles;
215  delete[] dist;
216  delete[] bin_proc;
217  delete[] temporaries;
218 
219 # pragma omp barrier
220 
221  // Shuffle bins internally.
222  for (bin_index b = d->bins_begin; b < d->bins_end; ++b)
223  {
224  value_type* begin =
225  sd->temporaries[iam] +
226  ((b == d->bins_begin) ? 0 : sd->dist[b][d->num_threads]),
227  * end =
228  sd->temporaries[iam] + sd->dist[b + 1][d->num_threads];
229  sequential_random_shuffle(begin, end, rng);
230  std::copy(begin, end, sd->source + global_offset +
231  ((b == d->bins_begin) ? 0 : sd->dist[b][d->num_threads]));
232  }
233 
234  ::operator delete(sd->temporaries[iam]);
235  }
236 
237 /** @brief Round up to the next greater power of 2.
238  * @param x Integer to round up */
239 template<typename T>
240  T
242  {
243  if (x <= 1)
244  return 1;
245  else
246  return (T)1 << (__log2(x - 1) + 1);
247  }
248 
249 /** @brief Main parallel random shuffle step.
250  * @param begin Begin iterator of sequence.
251  * @param end End iterator of sequence.
252  * @param n Length of sequence.
253  * @param num_threads Number of threads to use.
254  * @param rng Random number generator to use.
255  */
256 template<typename RandomAccessIterator, typename RandomNumberGenerator>
257  void
258  parallel_random_shuffle_drs(RandomAccessIterator begin,
259  RandomAccessIterator end,
260  typename std::iterator_traits
261  <RandomAccessIterator>::difference_type n,
262  thread_index_t num_threads,
263  RandomNumberGenerator& rng)
264  {
265  typedef std::iterator_traits<RandomAccessIterator> traits_type;
266  typedef typename traits_type::value_type value_type;
267  typedef typename traits_type::difference_type difference_type;
268 
269  _GLIBCXX_CALL(n)
270 
271  const _Settings& __s = _Settings::get();
272 
273  if (num_threads > n)
274  num_threads = static_cast<thread_index_t>(n);
275 
276  bin_index num_bins, num_bins_cache;
277 
278 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
279  // Try the L1 cache first.
280 
281  // Must fit into L1.
282  num_bins_cache = std::max<difference_type>(
283  1, n / (__s.L1_cache_size_lb / sizeof(value_type)));
284  num_bins_cache = round_up_to_pow2(num_bins_cache);
285 
286  // No more buckets than TLB entries, power of 2
287  // Power of 2 and at least one element per bin, at most the TLB size.
288  num_bins = std::min<difference_type>(n, num_bins_cache);
289 
290 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
291  // 2 TLB entries needed per bin.
292  num_bins = std::min<difference_type>(__s.TLB_size / 2, num_bins);
293 #endif
294  num_bins = round_up_to_pow2(num_bins);
295 
296  if (num_bins < num_bins_cache)
297  {
298 #endif
299  // Now try the L2 cache
300  // Must fit into L2
301  num_bins_cache = static_cast<bin_index>(std::max<difference_type>(
302  1, n / (__s.L2_cache_size / sizeof(value_type))));
303  num_bins_cache = round_up_to_pow2(num_bins_cache);
304 
305  // No more buckets than TLB entries, power of 2.
306  num_bins = static_cast<bin_index>(
307  std::min(n, static_cast<difference_type>(num_bins_cache)));
308  // Power of 2 and at least one element per bin, at most the TLB size.
309 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
310  // 2 TLB entries needed per bin.
311  num_bins = std::min(
312  static_cast<difference_type>(__s.TLB_size / 2), num_bins);
313 #endif
314  num_bins = round_up_to_pow2(num_bins);
315 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
316  }
317 #endif
318 
319  num_threads = std::min<bin_index>(num_threads, num_bins);
320 
321  if (num_threads <= 1)
322  return sequential_random_shuffle(begin, end, rng);
323 
326  difference_type* starts;
327 
328 # pragma omp parallel num_threads(num_threads)
329  {
330  thread_index_t num_threads = omp_get_num_threads();
331 # pragma omp single
332  {
334  [num_threads];
335 
336  sd.temporaries = new value_type*[num_threads];
337  sd.dist = new difference_type*[num_bins + 1];
338  sd.bin_proc = new thread_index_t[num_bins];
339  for (bin_index b = 0; b < num_bins + 1; ++b)
340  sd.dist[b] = new difference_type[num_threads + 1];
341  for (bin_index b = 0; b < (num_bins + 1); ++b)
342  {
343  sd.dist[0][0] = 0;
344  sd.dist[b][0] = 0;
345  }
346  starts = sd.starts = new difference_type[num_threads + 1];
347  int bin_cursor = 0;
348  sd.num_bins = num_bins;
349  sd.num_bits = __log2(num_bins);
350 
351  difference_type chunk_length = n / num_threads,
352  split = n % num_threads, start = 0;
353  difference_type bin_chunk_length = num_bins / num_threads,
354  bin_split = num_bins % num_threads;
355  for (thread_index_t i = 0; i < num_threads; ++i)
356  {
357  starts[i] = start;
358  start += (i < split) ? (chunk_length + 1) : chunk_length;
359  int j = pus[i].bins_begin = bin_cursor;
360 
361  // Range of bins for this processor.
362  bin_cursor += (i < bin_split) ?
363  (bin_chunk_length + 1) : bin_chunk_length;
364  pus[i].bins_end = bin_cursor;
365  for (; j < bin_cursor; ++j)
366  sd.bin_proc[j] = i;
367  pus[i].num_threads = num_threads;
368  pus[i].seed = rng(std::numeric_limits<uint32>::max());
369  pus[i].sd = &sd;
370  }
371  starts[num_threads] = start;
372  } //single
373  // Now shuffle in parallel.
375  } // parallel
376 
377  delete[] starts;
378  delete[] sd.bin_proc;
379  for (int s = 0; s < (num_bins + 1); ++s)
380  delete[] sd.dist[s];
381  delete[] sd.dist;
382  delete[] sd.temporaries;
383 
384  delete[] pus;
385  }
386 
387 /** @brief Sequential cache-efficient random shuffle.
388  * @param begin Begin iterator of sequence.
389  * @param end End iterator of sequence.
390  * @param rng Random number generator to use.
391  */
392 template<typename RandomAccessIterator, typename RandomNumberGenerator>
393  void
394  sequential_random_shuffle(RandomAccessIterator begin,
395  RandomAccessIterator end,
396  RandomNumberGenerator& rng)
397  {
398  typedef std::iterator_traits<RandomAccessIterator> traits_type;
399  typedef typename traits_type::value_type value_type;
400  typedef typename traits_type::difference_type difference_type;
401 
402  difference_type n = end - begin;
403  const _Settings& __s = _Settings::get();
404 
405  bin_index num_bins, num_bins_cache;
406 
407 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
408  // Try the L1 cache first, must fit into L1.
409  num_bins_cache =
410  std::max<difference_type>
411  (1, n / (__s.L1_cache_size_lb / sizeof(value_type)));
412  num_bins_cache = round_up_to_pow2(num_bins_cache);
413 
414  // No more buckets than TLB entries, power of 2
415  // Power of 2 and at least one element per bin, at most the TLB size
416  num_bins = std::min(n, (difference_type)num_bins_cache);
417 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
418  // 2 TLB entries needed per bin
419  num_bins = std::min((difference_type)__s.TLB_size / 2, num_bins);
420 #endif
421  num_bins = round_up_to_pow2(num_bins);
422 
423  if (num_bins < num_bins_cache)
424  {
425 #endif
426  // Now try the L2 cache, must fit into L2.
427  num_bins_cache =
428  static_cast<bin_index>(std::max<difference_type>(
429  1, n / (__s.L2_cache_size / sizeof(value_type))));
430  num_bins_cache = round_up_to_pow2(num_bins_cache);
431 
432  // No more buckets than TLB entries, power of 2
433  // Power of 2 and at least one element per bin, at most the TLB size.
434  num_bins = static_cast<bin_index>
435  (std::min(n, static_cast<difference_type>(num_bins_cache)));
436 
437 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
438  // 2 TLB entries needed per bin
439  num_bins =
440  std::min<difference_type>(__s.TLB_size / 2, num_bins);
441 #endif
442  num_bins = round_up_to_pow2(num_bins);
443 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
444  }
445 #endif
446 
447  int num_bits = __log2(num_bins);
448 
449  if (num_bins > 1)
450  {
451  value_type* target = static_cast<value_type*>(
452  ::operator new(sizeof(value_type) * n));
453  bin_index* oracles = new bin_index[n];
454  difference_type* dist0 = new difference_type[num_bins + 1],
455  * dist1 = new difference_type[num_bins + 1];
456 
457  for (int b = 0; b < num_bins + 1; ++b)
458  dist0[b] = 0;
459 
460  random_number bitrng(rng(0xFFFFFFFF));
461 
462  for (difference_type i = 0; i < n; ++i)
463  {
464  bin_index oracle = random_number_pow2(num_bits, bitrng);
465  oracles[i] = oracle;
466 
467  // To allow prefix (partial) sum.
468  ++(dist0[oracle + 1]);
469  }
470 
471  // Sum up bins.
472  __gnu_sequential::partial_sum(dist0, dist0 + num_bins + 1, dist0);
473 
474  for (int b = 0; b < num_bins + 1; ++b)
475  dist1[b] = dist0[b];
476 
477  // Distribute according to oracles.
478  for (difference_type i = 0; i < n; ++i)
479  ::new(&(target[(dist0[oracles[i]])++])) value_type(*(begin + i));
480 
481  for (int b = 0; b < num_bins; ++b)
482  {
483  sequential_random_shuffle(target + dist1[b],
484  target + dist1[b + 1],
485  rng);
486  }
487 
488  // Copy elements back.
489  std::copy(target, target + n, begin);
490 
491  delete[] dist0;
492  delete[] dist1;
493  delete[] oracles;
494  ::operator delete(target);
495  }
496  else
497  __gnu_sequential::random_shuffle(begin, end, rng);
498  }
499 
500 /** @brief Parallel random public call.
501  * @param begin Begin iterator of sequence.
502  * @param end End iterator of sequence.
503  * @param rng Random number generator to use.
504  */
505 template<typename RandomAccessIterator, typename RandomNumberGenerator>
506  inline void
507  parallel_random_shuffle(RandomAccessIterator begin,
508  RandomAccessIterator end,
509  RandomNumberGenerator rng = random_number())
510  {
511  typedef std::iterator_traits<RandomAccessIterator> traits_type;
512  typedef typename traits_type::difference_type difference_type;
513  difference_type n = end - begin;
514  parallel_random_shuffle_drs(begin, end, n, get_max_threads(), rng) ;
515  }
516 
517 }
518 
519 #endif /* _GLIBCXX_PARALLEL_RANDOM_SHUFFLE_H */