Intel® OpenMP* Runtime Library
kmp_runtime.c
1 /*
2  * kmp_runtime.c -- KPTS runtime support library
3  * $Revision: 42642 $
4  * $Date: 2013-09-06 01:57:24 -0500 (Fri, 06 Sep 2013) $
5  */
6 
7 /* <copyright>
8  Copyright (c) 1997-2013 Intel Corporation. All Rights Reserved.
9 
10  Redistribution and use in source and binary forms, with or without
11  modification, are permitted provided that the following conditions
12  are met:
13 
14  * Redistributions of source code must retain the above copyright
15  notice, this list of conditions and the following disclaimer.
16  * Redistributions in binary form must reproduce the above copyright
17  notice, this list of conditions and the following disclaimer in the
18  documentation and/or other materials provided with the distribution.
19  * Neither the name of Intel Corporation nor the names of its
20  contributors may be used to endorse or promote products derived
21  from this software without specific prior written permission.
22 
23  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
27  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
28  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
29  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
30  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
31  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
32  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
33  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34 
35 </copyright> */
36 
37 #include "kmp.h"
38 #include "kmp_atomic.h"
39 #include "kmp_wrapper_getpid.h"
40 #include "kmp_environment.h"
41 #include "kmp_itt.h"
42 #include "kmp_str.h"
43 #include "kmp_settings.h"
44 #include "kmp_i18n.h"
45 #include "kmp_io.h"
46 #include "kmp_error.h"
47 
48 /* these are temporary issues to be dealt with */
49 #define KMP_USE_PRCTL 0
50 #define KMP_USE_POOLED_ALLOC 0
51 
52 #if KMP_MIC
53 #include <immintrin.h>
54 #define USE_NGO_STORES 1
55 #endif // KMP_MIC
56 
57 #if KMP_MIC && USE_NGO_STORES
58 #define load_icvs(src) __m512d Vt_icvs = _mm512_load_pd((void *)(src))
59 #define store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt_icvs)
60 #define sync_icvs() __asm__ volatile ("lock; addl $0,0(%%rsp)" ::: "memory")
61 #else
62 #define load_icvs(src) ((void)0)
63 #define store_icvs(dst, src) copy_icvs((dst), (src))
64 #define sync_icvs() ((void)0)
65 #endif /* KMP_MIC && USE_NGO_STORES */
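/*
 * A minimal sketch of the intended call pattern for these ICV-copy macros
 * (the same shape appears in __kmp_linear_barrier_release further down);
 * master_icvs, worker_icvs and nproc are hypothetical names, not symbols
 * from this file:
 *
 *     load_icvs( &master_icvs );                         // read the ICV block once
 *     for ( i = 1; i < nproc; i++ )
 *         store_icvs( &worker_icvs[ i ], &master_icvs ); // streaming store (or plain copy) per worker
 *     sync_icvs();                                       // fence so workers observe the stores
 */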
66 
67 #if KMP_OS_WINDOWS
68 #include <process.h>
69 #endif
70 
71 
72 #if defined(KMP_GOMP_COMPAT)
73 char const __kmp_version_alt_comp[] = KMP_VERSION_PREFIX "alternative compiler support: yes";
74 #endif /* defined(KMP_GOMP_COMPAT) */
75 
76 char const __kmp_version_omp_api[] = KMP_VERSION_PREFIX "API version: "
77 #if OMP_40_ENABLED
78  "4.0 (201307)";
79 #elif OMP_30_ENABLED
80  "3.1 (201107)";
81 #else
82  "2.5 (200505)";
83 #endif
84 
85 #ifdef KMP_DEBUG
86 
87 char const __kmp_version_lock[] = KMP_VERSION_PREFIX "lock type: run time selectable";
88 
89 char const __kmp_version_perf_v19[] = KMP_VERSION_PREFIX "perf v19: "
90 #if KMP_PERF_V19 == KMP_ON
91  "on";
92 #elif KMP_PERF_V19 == KMP_OFF
93  "off";
94 #else
95  #error "Must specify KMP_PERF_V19 option"
96 #endif
97 
98 char const __kmp_version_perf_v106[] = KMP_VERSION_PREFIX "perf v106: "
99 #if KMP_PERF_V106 == KMP_ON
100  "on";
101 #elif KMP_PERF_V106 == KMP_OFF
102  "off";
103 #else
104  #error "Must specify KMP_PERF_V106 option"
105 #endif
106 
107 #endif /* KMP_DEBUG */
108 
109 
110 /* ------------------------------------------------------------------------ */
111 /* ------------------------------------------------------------------------ */
112 
113 kmp_info_t __kmp_monitor;
114 
115 /* ------------------------------------------------------------------------ */
116 /* ------------------------------------------------------------------------ */
117 
118 /* Forward declarations */
119 
120 void __kmp_cleanup( void );
121 
122 static void __kmp_initialize_info( kmp_info_t *, kmp_team_t *, int tid, int gtid );
123 static void __kmp_initialize_team(
124  kmp_team_t * team,
125  int new_nproc,
126  #if OMP_30_ENABLED
127  kmp_internal_control_t * new_icvs,
128  ident_t * loc
129  #else
130  int new_set_nproc, int new_set_dynamic, int new_set_nested,
131  int new_set_blocktime, int new_bt_intervals, int new_bt_set
132  #endif // OMP_30_ENABLED
133 );
134 static void __kmp_partition_places( kmp_team_t *team );
135 static void __kmp_do_serial_initialize( void );
136 
137 
138 #ifdef USE_LOAD_BALANCE
139 static int __kmp_load_balance_nproc( kmp_root_t * root, int set_nproc );
140 #endif
141 
142 static int __kmp_expand_threads(int nWish, int nNeed);
143 static int __kmp_unregister_root_other_thread( int gtid );
144 static void __kmp_unregister_library( void ); // called by __kmp_internal_end()
145 static void __kmp_reap_thread( kmp_info_t * thread, int is_root );
146 static kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
147 
148 /* ------------------------------------------------------------------------ */
149 /* ------------------------------------------------------------------------ */
150 
151 /* Calculate the identifier of the current thread */
152 /* fast (and somewhat portable) way to get unique */
153 /* identifier of executing thread. */
154 /* returns KMP_GTID_DNE if we haven't been assigned a gtid */
155 
156 int
157 __kmp_get_global_thread_id( )
158 {
159  int i;
160  kmp_info_t **other_threads;
161  size_t stack_data;
162  char *stack_addr;
163  size_t stack_size;
164  char *stack_base;
165 
166  KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n",
167  __kmp_nth, __kmp_all_nth ));
168 
169  /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to a
170  parallel region, made it return KMP_GTID_DNE to force serial_initialize by
171  caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee
172  __kmp_init_gtid for this to work. */
173 
174  if ( !TCR_4(__kmp_init_gtid) ) return KMP_GTID_DNE;
175 
176 #ifdef KMP_TDATA_GTID
177  if ( TCR_4(__kmp_gtid_mode) >= 3) {
178  KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id: using TDATA\n" ));
179  return __kmp_gtid;
180  }
181 #endif
182  if ( TCR_4(__kmp_gtid_mode) >= 2) {
183  KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id: using keyed TLS\n" ));
184  return __kmp_gtid_get_specific();
185  }
186  KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id: using internal alg.\n" ));
187 
188  stack_addr = (char*) & stack_data;
189  other_threads = __kmp_threads;
190 
191  /*
192  ATT: The code below is a source of potential bugs due to unsynchronized access to
193  __kmp_threads array. For example:
194  1. Current thread loads other_threads[i] to thr and checks it, it is non-NULL.
195  2. Current thread is suspended by OS.
196  3. Another thread unregisters and finishes (debug versions of free() may fill memory
197  with something like 0xEF).
198  4. Current thread is resumed.
199  5. Current thread reads junk from *thr.
200  TODO: Fix it.
201  --ln
202  */
203 
204  for( i = 0 ; i < __kmp_threads_capacity ; i++ ) {
205 
206  kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
207  if( !thr ) continue;
208 
209  stack_size = (size_t)TCR_PTR(thr -> th.th_info.ds.ds_stacksize);
210  stack_base = (char *)TCR_PTR(thr -> th.th_info.ds.ds_stackbase);
211 
212  /* stack grows down -- search through all of the active threads */
213 
214  if( stack_addr <= stack_base ) {
215  size_t stack_diff = stack_base - stack_addr;
216 
217  if( stack_diff <= stack_size ) {
218  /* The only way we can be closer than the allocated */
219  /* stack size is if we are running on this thread. */
220  KMP_DEBUG_ASSERT( __kmp_gtid_get_specific() == i );
221  return i;
222  }
223  }
224  }
225 
226  /* get specific to try and determine our gtid */
227  KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id: internal alg. failed to find "
228  "thread, using TLS\n" ));
229  i = __kmp_gtid_get_specific();
230 
231  /*fprintf( stderr, "=== %d\n", i ); */ /* GROO */
232 
233  /* if we haven't been assigned a gtid, then return the error code */
234  if( i<0 ) return i;
235 
236  /* dynamically updated stack window for uber threads to avoid get_specific call */
237  if( ! TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow) ) {
238  KMP_FATAL( StackOverflow, i );
239  }
240 
241  stack_base = (char *) other_threads[i] -> th.th_info.ds.ds_stackbase;
242  if( stack_addr > stack_base ) {
243  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
244  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
245  other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr - stack_base);
246  } else {
247  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize, stack_base - stack_addr);
248  }
249 
250  /* Reprint stack bounds for ubermaster since they have been refined */
251  if ( __kmp_storage_map ) {
252  char *stack_end = (char *) other_threads[i] -> th.th_info.ds.ds_stackbase;
253  char *stack_beg = stack_end - other_threads[i] -> th.th_info.ds.ds_stacksize;
254  __kmp_print_storage_map_gtid( i, stack_beg, stack_end,
255  other_threads[i] -> th.th_info.ds.ds_stacksize,
256  "th_%d stack (refinement)", i );
257  }
258  return i;
259 }
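/*
 * The loop above resolves the gtid with a stack-window test: the address of
 * a local variable must lie inside [stackbase - stacksize, stackbase],
 * because stacks grow downward. A self-contained sketch of that predicate
 * (stack_owns_addr is a hypothetical helper, not a symbol from this file):
 *
 *     #include <stddef.h>
 *
 *     static int stack_owns_addr( char *addr, char *base, size_t size )
 *     {
 *         // base is the high end of the stack; the window extends size bytes downward
 *         return addr <= base && (size_t)( base - addr ) <= size;
 *     }
 */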
260 
261 int
262 __kmp_get_global_thread_id_reg( )
263 {
264  int gtid;
265 
266  if ( !__kmp_init_serial ) {
267  gtid = KMP_GTID_DNE;
268  } else
269 #ifdef KMP_TDATA_GTID
270  if ( TCR_4(__kmp_gtid_mode) >= 3 ) {
271  KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id_reg: using TDATA\n" ));
272  gtid = __kmp_gtid;
273  } else
274 #endif
275  if ( TCR_4(__kmp_gtid_mode) >= 2 ) {
276  KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id_reg: using keyed TLS\n" ));
277  gtid = __kmp_gtid_get_specific();
278  } else {
279  KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id_reg: using internal alg.\n" ));
280  gtid = __kmp_get_global_thread_id();
281  }
282 
283  /* we must be a new uber master sibling thread */
284  if( gtid == KMP_GTID_DNE ) {
285  KA_TRACE( 10, ( "__kmp_get_global_thread_id_reg: Encountered new root thread. "
286  "Registering a new gtid.\n" ));
287  __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
288  if( !__kmp_init_serial ) {
289  __kmp_do_serial_initialize();
290  gtid = __kmp_gtid_get_specific();
291  } else {
292  gtid = __kmp_register_root(FALSE);
293  }
294  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
295  /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
296  }
297 
298  KMP_DEBUG_ASSERT( gtid >=0 );
299 
300  return gtid;
301 }
302 
303 /* caller must hold forkjoin_lock */
304 void
305 __kmp_check_stack_overlap( kmp_info_t *th )
306 {
307  int f;
308  char *stack_beg = NULL;
309  char *stack_end = NULL;
310  int gtid;
311 
312  KA_TRACE(10,("__kmp_check_stack_overlap: called\n"));
313  if ( __kmp_storage_map ) {
314  stack_end = (char *) th -> th.th_info.ds.ds_stackbase;
315  stack_beg = stack_end - th -> th.th_info.ds.ds_stacksize;
316 
317  gtid = __kmp_gtid_from_thread( th );
318 
319  if (gtid == KMP_GTID_MONITOR) {
320  __kmp_print_storage_map_gtid( gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
321  "th_%s stack (%s)", "mon",
322  ( th->th.th_info.ds.ds_stackgrow ) ? "initial" : "actual" );
323  } else {
324  __kmp_print_storage_map_gtid( gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
325  "th_%d stack (%s)", gtid,
326  ( th->th.th_info.ds.ds_stackgrow ) ? "initial" : "actual" );
327  }
328  }
329 
330  /* No point in checking ubermaster threads since they use refinement and cannot overlap */
331  if ( __kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid = __kmp_gtid_from_thread( th )))
332  {
333  KA_TRACE(10,("__kmp_check_stack_overlap: performing extensive checking\n"));
334  if ( stack_beg == NULL ) {
335  stack_end = (char *) th -> th.th_info.ds.ds_stackbase;
336  stack_beg = stack_end - th -> th.th_info.ds.ds_stacksize;
337  }
338 
339  for( f=0 ; f < __kmp_threads_capacity ; f++ ) {
340  kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
341 
342  if( f_th && f_th != th ) {
343  char *other_stack_end = (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
344  char *other_stack_beg = other_stack_end -
345  (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
346  if((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
347  (stack_end > other_stack_beg && stack_end < other_stack_end)) {
348 
349  /* Print the other stack values before the abort */
350  if ( __kmp_storage_map )
351  __kmp_print_storage_map_gtid( -1, other_stack_beg, other_stack_end,
352  (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
353  "th_%d stack (overlapped)",
354  __kmp_gtid_from_thread( f_th ) );
355 
356  __kmp_msg( kmp_ms_fatal, KMP_MSG( StackOverlap ), KMP_HNT( ChangeStackLimit ), __kmp_msg_null );
357  }
358  }
359  }
360  }
361  KA_TRACE(10,("__kmp_check_stack_overlap: returning\n"));
362 }
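/*
 * The check above flags an overlap when either end of this thread's stack
 * falls strictly inside another thread's [beg, end) range. A standalone
 * sketch of that predicate (stacks_overlap is a hypothetical helper):
 *
 *     static int stacks_overlap( char *beg, char *end,
 *                                char *other_beg, char *other_end )
 *     {
 *         return ( beg > other_beg && beg < other_end ) ||
 *                ( end > other_beg && end < other_end );
 *     }
 */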
363 
364 
365 /* ------------------------------------------------------------------------ */
366 
367 #ifndef KMP_DEBUG
368 # define __kmp_static_delay( arg ) /* nothing to do */
369 #else
370 
371 static void
372 __kmp_static_delay( int arg )
373 {
374 /* Work around weird code-gen bug that causes assert to trip */
375 # if KMP_ARCH_X86_64 && KMP_OS_LINUX
376  KMP_ASSERT( arg != 0 );
377 # else
378  KMP_ASSERT( arg >= 0 );
379 # endif
380 }
381 #endif /* KMP_DEBUG */
382 
383 static void
384 __kmp_static_yield( int arg )
385 {
386  __kmp_yield( arg );
387 }
388 
389 /*
390  * Spin wait loop that first does pause, then yield, then sleep.
391  * Wait until spinner is equal to checker to exit.
392  *
393  * A thread that calls __kmp_wait_sleep must make certain that another thread
 394  * calls __kmp_release to wake it back up to prevent deadlocks!
395  */
396 
397 void
398 __kmp_wait_sleep( kmp_info_t *this_thr,
399  volatile kmp_uint *spinner,
400  kmp_uint checker,
401  int final_spin
402  USE_ITT_BUILD_ARG (void * itt_sync_obj)
403 )
404 {
405  /* note: we may not belong to a team at this point */
406  register volatile kmp_uint *spin = spinner;
407  register kmp_uint check = checker;
408  register kmp_uint32 spins;
409  register kmp_uint32 hibernate;
410  int th_gtid, th_tid;
411 #if OMP_30_ENABLED
412  int flag = FALSE;
413 #endif /* OMP_30_ENABLED */
414 
415  KMP_FSYNC_SPIN_INIT( spin, NULL );
416  if( TCR_4(*spin) == check ) {
417  KMP_FSYNC_SPIN_ACQUIRED( spin );
418  return;
419  }
420 
421  th_gtid = this_thr->th.th_info.ds.ds_gtid;
422 
423  KA_TRACE( 20, ("__kmp_wait_sleep: T#%d waiting for spin(%p) == %d\n",
424  th_gtid,
425  spin, check ) );
426 
427  /* setup for waiting */
428  KMP_INIT_YIELD( spins );
429 
430  if ( __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ) {
431  //
432  // The worker threads cannot rely on the team struct existing at this
433  // point. Use the bt values cached in the thread struct instead.
434  //
435  #ifdef KMP_ADJUST_BLOCKTIME
436  if ( __kmp_zero_bt && ! this_thr->th.th_team_bt_set ) {
437  /* force immediate suspend if not set by user and more threads than available procs */
438  hibernate = 0;
439  } else {
440  hibernate = this_thr->th.th_team_bt_intervals;
441  }
442  #else
443  hibernate = this_thr->th.th_team_bt_intervals;
444  #endif /* KMP_ADJUST_BLOCKTIME */
445 
446  //
447  // If the blocktime is nonzero, we want to make sure that we spin
448  // wait for the entirety of the specified #intervals, plus up to
 449  // one interval more. This increment makes certain that this thread
450  // doesn't go to sleep too soon.
451  //
452  if ( hibernate != 0 ) {
453  hibernate++;
454  }
455 
456  //
457  // Add in the current time value.
458  //
459  hibernate += TCR_4( __kmp_global.g.g_time.dt.t_value );
460 
461  KF_TRACE( 20, ("__kmp_wait_sleep: T#%d now=%d, hibernate=%d, intervals=%d\n",
462  th_gtid, __kmp_global.g.g_time.dt.t_value, hibernate,
463  hibernate - __kmp_global.g.g_time.dt.t_value ));
464  }
465 
466  KMP_MB();
467 
468  /* main wait spin loop */
469  while( TCR_4(*spin) != check ) {
470  int in_pool;
471 
472 #if OMP_30_ENABLED
473  //
 474  // If the task team is NULL, it means one of these things:
475  // 1) A newly-created thread is first being released by
476  // __kmp_fork_barrier(), and its task team has not been set up
477  // yet.
478  // 2) All tasks have been executed to completion, this thread has
479  // decremented the task team's ref ct and possibly deallocated
480  // it, and should no longer reference it.
481  // 3) Tasking is off for this region. This could be because we
482  // are in a serialized region (perhaps the outer one), or else
483  // tasking was manually disabled (KMP_TASKING=0).
484  //
485  kmp_task_team_t * task_team = NULL;
486  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
487  task_team = this_thr->th.th_task_team;
488  if ( task_team != NULL ) {
489  if ( ! TCR_SYNC_4( task_team->tt.tt_active ) ) {
490  KMP_DEBUG_ASSERT( ! KMP_MASTER_TID( this_thr->th.th_info.ds.ds_tid ) );
491  __kmp_unref_task_team( task_team, this_thr );
492  } else if ( KMP_TASKING_ENABLED( task_team, this_thr->th.th_task_state ) ) {
493  __kmp_execute_tasks( this_thr, th_gtid, spin, check, final_spin, &flag
494  USE_ITT_BUILD_ARG( itt_sync_obj )
495  );
496  }
497  }; // if
498  }; // if
499 #endif /* OMP_30_ENABLED */
500 
501  KMP_FSYNC_SPIN_PREPARE( spin );
502  if( TCR_4(__kmp_global.g.g_done) ) {
503  if( __kmp_global.g.g_abort )
504  __kmp_abort_thread( );
505  break;
506  }
507 
508  __kmp_static_delay( 1 );
509 
510  /* if we are oversubscribed,
511  or have waited a bit (and KMP_LIBRARY=throughput), then yield */
512  KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
513  // TODO: Should it be number of cores instead of thread contexts? Like:
514  // KMP_YIELD( TCR_4(__kmp_nth) > __kmp_ncores );
515  // Need performance improvement data to make the change...
516  KMP_YIELD_SPIN( spins );
517 
518  //
519  // Check if this thread was transferred from a team
520  // to the thread pool (or vice-versa) while spinning.
521  //
522  in_pool = !!TCR_4(this_thr->th.th_in_pool);
523  if ( in_pool != !!this_thr->th.th_active_in_pool ) {
524  if ( in_pool ) {
525  //
526  // recently transferred from team to pool
527  //
528  KMP_TEST_THEN_INC32(
529  (kmp_int32 *) &__kmp_thread_pool_active_nth );
530  this_thr->th.th_active_in_pool = TRUE;
531 
532  //
533  // Here, we cannot assert that
534  //
535  // KMP_DEBUG_ASSERT( TCR_4(__kmp_thread_pool_active_nth)
536  // <= __kmp_thread_pool_nth );
537  //
538  // __kmp_thread_pool_nth is inc/dec'd by the master thread
539  // while the fork/join lock is held, whereas
540  // __kmp_thread_pool_active_nth is inc/dec'd asynchronously
541  // by the workers. The two can get out of sync for brief
542  // periods of time.
543  //
544  }
545  else {
546  //
547  // recently transferred from pool to team
548  //
549  KMP_TEST_THEN_DEC32(
550  (kmp_int32 *) &__kmp_thread_pool_active_nth );
551  KMP_DEBUG_ASSERT( TCR_4(__kmp_thread_pool_active_nth) >= 0 );
552  this_thr->th.th_active_in_pool = FALSE;
553  }
554  }
555 
556 #if OMP_30_ENABLED
557  // Don't suspend if there is a likelihood of new tasks being spawned.
558  if ( ( task_team != NULL ) && TCR_4(task_team->tt.tt_found_tasks) ) {
559  continue;
560  }
561 #endif /* OMP_30_ENABLED */
562 
563  /* Don't suspend if KMP_BLOCKTIME is set to "infinite" */
564  if ( __kmp_dflt_blocktime == KMP_MAX_BLOCKTIME ) {
565  continue;
566  }
567 
568  /* if we have waited a bit more, fall asleep */
569  if ( TCR_4( __kmp_global.g.g_time.dt.t_value ) < hibernate ) {
570  continue;
571  }
572 
573  KF_TRACE( 50, ("__kmp_wait_sleep: T#%d suspend time reached\n", th_gtid ) );
574 
575  __kmp_suspend( th_gtid, spin, check );
576 
577  if( TCR_4( __kmp_global.g.g_done ) && __kmp_global.g.g_abort ) {
578  __kmp_abort_thread( );
579  }
580 
581  /* TODO */
 582  /* if the thread is done with its work and times out, disband/free it */
583  }
584 
585  KMP_FSYNC_SPIN_ACQUIRED( spin );
586 }
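/*
 * The function above is a three-stage backoff: pause/spin, then yield, then
 * suspend once a deadline derived from KMP_BLOCKTIME has passed. A compressed
 * sketch of that skeleton; blocktime_is_finite, bt_intervals, now(),
 * run_queued_tasks(), spin_pause_and_yield() and suspend_until_released()
 * are placeholders, not library symbols:
 *
 *     if ( blocktime_is_finite ) {
 *         deadline = bt_intervals;          // cached per-thread blocktime intervals
 *         if ( deadline != 0 )
 *             deadline++;                   // spin at least the full blocktime before sleeping
 *         deadline += now();                // convert to an absolute tick count
 *     }
 *     while ( *spin != check ) {
 *         run_queued_tasks();               // keep making progress on OpenMP tasks
 *         spin_pause_and_yield();           // pause, and yield when oversubscribed
 *         if ( blocktime_is_finite && now() >= deadline )
 *             suspend_until_released();     // sleep until __kmp_release() wakes us
 *     }
 */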
587 
588 
589 /*
 590  * Release the thread specified by target_thr from waiting by bumping the location
 591  * specified by spin, and resume the thread if the sleep bit of the old spin value was set.
592  *
593  * A thread that calls __kmp_wait_sleep must call this function to wake up the
594  * potentially sleeping thread and prevent deadlocks!
595  */
596 
597 void
598 __kmp_release( kmp_info_t *target_thr, volatile kmp_uint *spin,
599  enum kmp_mem_fence_type fetchadd_fence )
600 {
601  kmp_uint old_spin;
602  #ifdef KMP_DEBUG
603  int target_gtid = target_thr->th.th_info.ds.ds_gtid;
604  int gtid = TCR_4(__kmp_init_gtid) ? __kmp_get_gtid() : -1;
605  #endif
606 
607  KF_TRACE( 20, ( "__kmp_release: T#%d releasing T#%d spin(%p) fence_type(%d)\n",
608  gtid, target_gtid, spin, fetchadd_fence ));
609 
610  KMP_DEBUG_ASSERT( spin );
611 
612  KMP_DEBUG_ASSERT( fetchadd_fence == kmp_acquire_fence ||
613  fetchadd_fence == kmp_release_fence );
614 
615  KMP_FSYNC_RELEASING( spin );
616 
617  old_spin = ( fetchadd_fence == kmp_acquire_fence )
618  ? KMP_TEST_THEN_ADD4_ACQ32( (volatile kmp_int32 *) spin )
619  : KMP_TEST_THEN_ADD4_32( (volatile kmp_int32 *) spin );
620 
621  KF_TRACE( 100, ( "__kmp_release: T#%d old spin(%p)=%d, set new spin=%d\n",
622  gtid, spin, old_spin, *spin ) );
623 
624  if ( __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ) {
625  /* Only need to check sleep stuff if infinite block time not set */
626  if ( old_spin & KMP_BARRIER_SLEEP_STATE ) {
627  #ifndef KMP_DEBUG
628  int target_gtid = target_thr->th.th_info.ds.ds_gtid;
629  #endif
630  /* wake up thread if needed */
631  KF_TRACE( 50, ( "__kmp_release: T#%d waking up thread T#%d since sleep spin(%p) set\n",
632  gtid, target_gtid, spin ));
633  __kmp_resume( target_gtid, spin );
634  } else {
635  KF_TRACE( 50, ( "__kmp_release: T#%d don't wake up thread T#%d since sleep spin(%p) not set\n",
636  gtid, target_gtid, spin ));
637  }
638  }
639 }
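/*
 * Releasing boils down to a fetch-and-add on the spin location plus a wake-up
 * only when the old value carried the sleep bit. A hedged sketch of that
 * shape; atomic_fetch_add_4(), blocktime_is_finite and wake_up() stand in for
 * KMP_TEST_THEN_ADD4_32, the KMP_MAX_BLOCKTIME check and __kmp_resume:
 *
 *     old = atomic_fetch_add_4( spin );                          // advance the flag by 4
 *     if ( blocktime_is_finite && ( old & KMP_BARRIER_SLEEP_STATE ) )
 *         wake_up( target_gtid, spin );                          // the waiter went to sleep; resume it
 */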
640 
641 /* ------------------------------------------------------------------------ */
642 
643 void
644 __kmp_infinite_loop( void )
645 {
646  static int done = FALSE;
647 
648  while (! done) {
649  KMP_YIELD( 1 );
650  }
651 }
652 
653 #define MAX_MESSAGE 512
654 
655 void
656 __kmp_print_storage_map_gtid( int gtid, void *p1, void *p2, size_t size, char const *format, ...) {
657  char buffer[MAX_MESSAGE];
658  int node;
659  va_list ap;
660 
661  va_start( ap, format);
662  sprintf( buffer, "OMP storage map: %p %p%8lu %s\n", p1, p2, (unsigned long) size, format );
663  __kmp_acquire_bootstrap_lock( & __kmp_stdio_lock );
664  __kmp_vprintf( kmp_err, buffer, ap );
665 #if KMP_PRINT_DATA_PLACEMENT
666  if(gtid >= 0) {
667  if(p1 <= p2 && (char*)p2 - (char*)p1 == size) {
668  if( __kmp_storage_map_verbose ) {
669  node = __kmp_get_host_node(p1);
670  if(node < 0) /* doesn't work, so don't try this next time */
671  __kmp_storage_map_verbose = FALSE;
672  else {
673  char *last;
674  int lastNode;
675  int localProc = __kmp_get_cpu_from_gtid(gtid);
676 
677  p1 = (void *)( (size_t)p1 & ~((size_t)PAGE_SIZE - 1) );
678  p2 = (void *)( ((size_t) p2 - 1) & ~((size_t)PAGE_SIZE - 1) );
679  if(localProc >= 0)
680  __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid, localProc>>1);
681  else
682  __kmp_printf_no_lock(" GTID %d\n", gtid);
683 # if KMP_USE_PRCTL
684 /* The more elaborate format is disabled for now because of the prctl hanging bug. */
685  do {
686  last = p1;
687  lastNode = node;
688  /* This loop collates adjacent pages with the same host node. */
689  do {
 690  p1 = (char*)p1 + PAGE_SIZE;
691  } while(p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
692  __kmp_printf_no_lock(" %p-%p memNode %d\n", last,
693  (char*)p1 - 1, lastNode);
694  } while(p1 <= p2);
695 # else
696  __kmp_printf_no_lock(" %p-%p memNode %d\n", p1,
697  (char*)p1 + (PAGE_SIZE - 1), __kmp_get_host_node(p1));
698  if(p1 < p2) {
699  __kmp_printf_no_lock(" %p-%p memNode %d\n", p2,
700  (char*)p2 + (PAGE_SIZE - 1), __kmp_get_host_node(p2));
701  }
702 # endif
703  }
704  }
705  } else
706  __kmp_printf_no_lock(" %s\n", KMP_I18N_STR( StorageMapWarning ) );
707  }
708 #endif /* KMP_PRINT_DATA_PLACEMENT */
709  __kmp_release_bootstrap_lock( & __kmp_stdio_lock );
710 }
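/*
 * The data-placement branch above rounds both pointers down to page
 * boundaries with a power-of-two mask before querying the host node of each
 * page. A standalone sketch of that rounding (assumes PAGE_SIZE is a power
 * of two; page_align_down is a hypothetical helper):
 *
 *     #include <stdint.h>
 *
 *     static void *page_align_down( void *p, size_t page_size )
 *     {
 *         return (void *)( (uintptr_t)p & ~( (uintptr_t)page_size - 1 ) );
 *     }
 */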
711 
712 void
713 __kmp_warn( char const * format, ... )
714 {
715  char buffer[MAX_MESSAGE];
716  va_list ap;
717 
718  if ( __kmp_generate_warnings == kmp_warnings_off ) {
719  return;
720  }
721 
722  va_start( ap, format );
723 
724  snprintf( buffer, sizeof(buffer) , "OMP warning: %s\n", format );
725  __kmp_acquire_bootstrap_lock( & __kmp_stdio_lock );
726  __kmp_vprintf( kmp_err, buffer, ap );
727  __kmp_release_bootstrap_lock( & __kmp_stdio_lock );
728 
729  va_end( ap );
730 }
731 
732 void
733 __kmp_abort_process()
734 {
735 
736  // Later threads may stall here, but that's ok because abort() will kill them.
737  __kmp_acquire_bootstrap_lock( & __kmp_exit_lock );
738 
739  if ( __kmp_debug_buf ) {
740  __kmp_dump_debug_buffer();
741  }; // if
742 
743  if ( KMP_OS_WINDOWS ) {
744  // Let other threads know of abnormal termination and prevent deadlock
745  // if abort happened during library initialization or shutdown
746  __kmp_global.g.g_abort = SIGABRT;
747 
748  /*
 749  On Windows* OS, abort() by default raises a pop-up error box, which stalls nightly testing.
 750  Unfortunately, we cannot reliably suppress pop-up error boxes. _set_abort_behavior()
 751  works well, but this function is not available in VS7 (this is not a problem for the DLL, but
 752  it is a problem for the static OpenMP RTL). SetErrorMode (and so, the timelimit utility) does
 753  not help, at least in some versions of the MS C RTL.
 754 
 755  It seems the following sequence is the only way to simulate abort() and avoid the pop-up error
 756  box.
757  */
758  raise( SIGABRT );
759  _exit( 3 ); // Just in case, if signal ignored, exit anyway.
760  } else {
761  abort();
762  }; // if
763 
764  __kmp_infinite_loop();
765  __kmp_release_bootstrap_lock( & __kmp_exit_lock );
766 
767 } // __kmp_abort_process
768 
769 void
770 __kmp_abort_thread( void )
771 {
772  // TODO: Eliminate g_abort global variable and this function.
773  // In case of abort just call abort(), it will kill all the threads.
774  __kmp_infinite_loop();
775 } // __kmp_abort_thread
776 
777 /* ------------------------------------------------------------------------ */
778 
779 /*
780  * Print out the storage map for the major kmp_info_t thread data structures
781  * that are allocated together.
782  */
783 
784 static void
785 __kmp_print_thread_storage_map( kmp_info_t *thr, int gtid )
786 {
787  __kmp_print_storage_map_gtid( gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d", gtid );
788 
789  __kmp_print_storage_map_gtid( gtid, &thr->th.th_info, &thr->th.th_team, sizeof(kmp_desc_t),
790  "th_%d.th_info", gtid );
791 
792  __kmp_print_storage_map_gtid( gtid, &thr->th.th_local, &thr->th.th_pri_head, sizeof(kmp_local_t),
793  "th_%d.th_local", gtid );
794 
795  __kmp_print_storage_map_gtid( gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
796  sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid );
797 
798  __kmp_print_storage_map_gtid( gtid, &thr->th.th_bar[bs_plain_barrier],
799  &thr->th.th_bar[bs_plain_barrier+1],
800  sizeof(kmp_balign_t), "th_%d.th_bar[plain]", gtid);
801 
802  __kmp_print_storage_map_gtid( gtid, &thr->th.th_bar[bs_forkjoin_barrier],
803  &thr->th.th_bar[bs_forkjoin_barrier+1],
804  sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]", gtid);
805 
806  #if KMP_FAST_REDUCTION_BARRIER
807  __kmp_print_storage_map_gtid( gtid, &thr->th.th_bar[bs_reduction_barrier],
808  &thr->th.th_bar[bs_reduction_barrier+1],
809  sizeof(kmp_balign_t), "th_%d.th_bar[reduction]", gtid);
810  #endif // KMP_FAST_REDUCTION_BARRIER
811 }
812 
813 /*
814  * Print out the storage map for the major kmp_team_t team data structures
815  * that are allocated together.
816  */
817 
818 static void
819 __kmp_print_team_storage_map( const char *header, kmp_team_t *team, int team_id, int num_thr )
820 {
821  int num_disp_buff = team->t.t_max_nproc > 1 ? KMP_MAX_DISP_BUF : 2;
822  __kmp_print_storage_map_gtid( -1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
823  header, team_id );
824 
825  __kmp_print_storage_map_gtid( -1, &team->t.t_bar[0], &team->t.t_bar[bs_last_barrier],
826  sizeof(kmp_balign_team_t) * bs_last_barrier, "%s_%d.t_bar", header, team_id );
827 
828 
829  __kmp_print_storage_map_gtid( -1, &team->t.t_bar[bs_plain_barrier], &team->t.t_bar[bs_plain_barrier+1],
830  sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]", header, team_id );
831 
832  __kmp_print_storage_map_gtid( -1, &team->t.t_bar[bs_forkjoin_barrier], &team->t.t_bar[bs_forkjoin_barrier+1],
833  sizeof(kmp_balign_team_t), "%s_%d.t_bar[forkjoin]", header, team_id );
834 
835  #if KMP_FAST_REDUCTION_BARRIER
836  __kmp_print_storage_map_gtid( -1, &team->t.t_bar[bs_reduction_barrier], &team->t.t_bar[bs_reduction_barrier+1],
837  sizeof(kmp_balign_team_t), "%s_%d.t_bar[reduction]", header, team_id );
838  #endif // KMP_FAST_REDUCTION_BARRIER
839 
840  __kmp_print_storage_map_gtid( -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
841  sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id );
842 
843  __kmp_print_storage_map_gtid( -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
844  sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id );
845 
846  __kmp_print_storage_map_gtid( -1, &team->t.t_disp_buffer[0], &team->t.t_disp_buffer[num_disp_buff],
847  sizeof(dispatch_shared_info_t) * num_disp_buff, "%s_%d.t_disp_buffer",
848  header, team_id );
849 
850  /*
851  __kmp_print_storage_map_gtid( -1, &team->t.t_set_nproc[0], &team->t.t_set_nproc[num_thr],
852  sizeof(int) * num_thr, "%s_%d.t_set_nproc", header, team_id );
853 
854  __kmp_print_storage_map_gtid( -1, &team->t.t_set_dynamic[0], &team->t.t_set_dynamic[num_thr],
855  sizeof(int) * num_thr, "%s_%d.t_set_dynamic", header, team_id );
856 
857  __kmp_print_storage_map_gtid( -1, &team->t.t_set_nested[0], &team->t.t_set_nested[num_thr],
858  sizeof(int) * num_thr, "%s_%d.t_set_nested", header, team_id );
859 
860  __kmp_print_storage_map_gtid( -1, &team->t.t_set_blocktime[0], &team->t.t_set_blocktime[num_thr],
861  sizeof(int) * num_thr, "%s_%d.t_set_nproc", header, team_id );
862 
863  __kmp_print_storage_map_gtid( -1, &team->t.t_set_bt_intervals[0], &team->t.t_set_bt_intervals[num_thr],
864  sizeof(int) * num_thr, "%s_%d.t_set_dynamic", header, team_id );
865 
866  __kmp_print_storage_map_gtid( -1, &team->t.t_set_bt_set[0], &team->t.t_set_bt_set[num_thr],
867  sizeof(int) * num_thr, "%s_%d.t_set_nested", header, team_id );
868 
869 #if OMP_30_ENABLED
870  //__kmp_print_storage_map_gtid( -1, &team->t.t_set_max_active_levels[0], &team->t.t_set_max_active_levels[num_thr],
871  // sizeof(int) * num_thr, "%s_%d.t_set_max_active_levels", header, team_id );
872 
873  __kmp_print_storage_map_gtid( -1, &team->t.t_set_sched[0], &team->t.t_set_sched[num_thr],
874  sizeof(kmp_r_sched_t) * num_thr, "%s_%d.t_set_sched", header, team_id );
875 #endif // OMP_30_ENABLED
876 #if OMP_40_ENABLED
877  __kmp_print_storage_map_gtid( -1, &team->t.t_set_proc_bind[0], &team->t.t_set_proc_bind[num_thr],
878  sizeof(kmp_proc_bind_t) * num_thr, "%s_%d.t_set_proc_bind", header, team_id );
879 #endif
880  */
881 
882  __kmp_print_storage_map_gtid( -1, &team->t.t_taskq, &team->t.t_copypriv_data,
883  sizeof(kmp_taskq_t), "%s_%d.t_taskq", header, team_id );
884 }
885 
886 static void __kmp_init_allocator() {}
887 static void __kmp_fini_allocator() {}
888 static void __kmp_fini_allocator_thread() {}
889 
890 /* ------------------------------------------------------------------------ */
891 
892 #ifdef GUIDEDLL_EXPORTS
893 # if KMP_OS_WINDOWS
894 
895 
896 static void
897 __kmp_reset_lock( kmp_bootstrap_lock_t* lck ) {
898  // TODO: Change to __kmp_break_bootstrap_lock().
 899  __kmp_init_bootstrap_lock( lck ); // put the lock into the released state
900 }
901 
902 static void
903 __kmp_reset_locks_on_process_detach( int gtid_req ) {
904  int i;
905  int thread_count;
906 
907  // PROCESS_DETACH is expected to be called by a thread
908  // that executes ProcessExit() or FreeLibrary().
 909  // The OS terminates the other threads (except the one calling ProcessExit or FreeLibrary).
 910  // So it might be safe to access __kmp_threads[] without taking the forkjoin_lock.
 911  // However, in practice some threads may still be alive here, even though they are about to be terminated.
 912  // The threads in the array with ds_thread==0 are the most suspicious.
 913  // So it may not actually be safe to access __kmp_threads[].
914 
915  // TODO: does it make sense to check __kmp_roots[] ?
916 
917  // Let's check that there are no other alive threads registered with the OMP lib.
918  while( 1 ) {
919  thread_count = 0;
920  for( i = 0; i < __kmp_threads_capacity; ++i ) {
921  if( !__kmp_threads ) continue;
922  kmp_info_t* th = __kmp_threads[ i ];
923  if( th == NULL ) continue;
924  int gtid = th->th.th_info.ds.ds_gtid;
925  if( gtid == gtid_req ) continue;
926  if( gtid < 0 ) continue;
927  DWORD exit_val;
928  int alive = __kmp_is_thread_alive( th, &exit_val );
929  if( alive ) {
930  ++thread_count;
931  }
932  }
933  if( thread_count == 0 ) break; // success
934  }
935 
936  // Assume that I'm alone.
937 
 938  // Now it is probably safe to check and reset the locks.
939  // __kmp_forkjoin_lock and __kmp_stdio_lock are expected to be reset.
940  __kmp_reset_lock( &__kmp_forkjoin_lock );
941  #ifdef KMP_DEBUG
942  __kmp_reset_lock( &__kmp_stdio_lock );
943  #endif // KMP_DEBUG
944 
945 
946 }
947 
948 BOOL WINAPI
949 DllMain( HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved ) {
950  //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
951 
952  switch( fdwReason ) {
953 
954  case DLL_PROCESS_ATTACH:
955  KA_TRACE( 10, ("DllMain: PROCESS_ATTACH\n" ));
956 
957  return TRUE;
958 
959  case DLL_PROCESS_DETACH:
960  KA_TRACE( 10, ("DllMain: PROCESS_DETACH T#%d\n",
961  __kmp_gtid_get_specific() ));
962 
963  if( lpReserved != NULL )
964  {
965  // lpReserved is used for telling the difference:
966  // lpReserved == NULL when FreeLibrary() was called,
967  // lpReserved != NULL when the process terminates.
968  // When FreeLibrary() is called, worker threads remain alive.
969  // So they will release the forkjoin lock by themselves.
970  // When the process terminates, worker threads disappear triggering
971  // the problem of unreleased forkjoin lock as described below.
972 
973  // A worker thread can take the forkjoin lock
974  // in __kmp_suspend()->__kmp_rml_decrease_load_before_sleep().
975  // The problem comes up if that worker thread becomes dead
976  // before it releases the forkjoin lock.
977  // The forkjoin lock remains taken, while the thread
978  // executing DllMain()->PROCESS_DETACH->__kmp_internal_end_library() below
979  // will try to take the forkjoin lock and will always fail,
980  // so that the application will never finish [normally].
981  // This scenario is possible if __kmpc_end() has not been executed.
 982  // These do not look like corner cases, but rather common ones:
983  // - the main function was compiled by an alternative compiler;
984  // - the main function was compiled by icl but without /Qopenmp (application with plugins);
985  // - application terminates by calling C exit(), Fortran CALL EXIT() or Fortran STOP.
986  // - alive foreign thread prevented __kmpc_end from doing cleanup.
987 
988  // This is a hack to work around the problem.
 989  // TODO: !!! figure out something better.
990  __kmp_reset_locks_on_process_detach( __kmp_gtid_get_specific() );
991  }
992 
993  __kmp_internal_end_library( __kmp_gtid_get_specific() );
994 
995  return TRUE;
996 
997  case DLL_THREAD_ATTACH:
998  KA_TRACE( 10, ("DllMain: THREAD_ATTACH\n" ));
999 
 1000  /* if we wanted to register new siblings all the time, we would call
 1001  * __kmp_get_gtid() here */
1002  return TRUE;
1003 
1004  case DLL_THREAD_DETACH:
1005  KA_TRACE( 10, ("DllMain: THREAD_DETACH T#%d\n",
1006  __kmp_gtid_get_specific() ));
1007 
1008  __kmp_internal_end_thread( __kmp_gtid_get_specific() );
1009  return TRUE;
1010  }
1011 
1012  return TRUE;
1013 }
1014 
1015 # endif /* KMP_OS_WINDOWS */
1016 #endif /* GUIDEDLL_EXPORTS */
1017 
1018 
1019 /* ------------------------------------------------------------------------ */
1020 
1021 /* Change the library type to "status" and return the old type */
1022 /* called from within initialization routines where __kmp_initz_lock is held */
1023 int
1024 __kmp_change_library( int status )
1025 {
1026  int old_status;
1027 
1028  old_status = __kmp_yield_init & 1; // check whether KMP_LIBRARY=throughput (even init count)
1029 
1030  if (status) {
1031  __kmp_yield_init |= 1; // throughput => turnaround (odd init count)
1032  }
1033  else {
1034  __kmp_yield_init &= ~1; // turnaround => throughput (even init count)
1035  }
1036 
1037  return old_status; // return previous setting of whether KMP_LIBRARY=throughput
1038 }
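/*
 * The low bit of __kmp_yield_init does double duty as the library-mode flag:
 * an even count means KMP_LIBRARY=throughput, an odd count means turnaround.
 * A tiny illustrative sketch of the same bit manipulation on a plain counter:
 *
 *     old_mode = counter & 1;                   // 1 == turnaround (odd), 0 == throughput (even)
 *     counter  = status ? ( counter | 1 )       // switch to turnaround
 *                       : ( counter & ~1 );     // switch to throughput
 */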
1039 
1040 /* ------------------------------------------------------------------------ */
1041 /* ------------------------------------------------------------------------ */
1042 
1043 /* __kmp_parallel_deo --
1044  * Wait until it's our turn.
1045  */
1046 void
1047 __kmp_parallel_deo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
1048 {
1049  int gtid = *gtid_ref;
1050 #ifdef BUILD_PARALLEL_ORDERED
1051  kmp_team_t *team = __kmp_team_from_gtid( gtid );
1052 #endif /* BUILD_PARALLEL_ORDERED */
1053 
1054  if( __kmp_env_consistency_check ) {
1055  if( __kmp_threads[gtid] -> th.th_root -> r.r_active )
1056  __kmp_push_sync( gtid, ct_ordered_in_parallel, loc_ref, NULL );
1057  }
1058 #ifdef BUILD_PARALLEL_ORDERED
1059  if( !team -> t.t_serialized ) {
1060  kmp_uint32 spins;
1061 
1062  KMP_MB();
1063  KMP_WAIT_YIELD(&team -> t.t_ordered.dt.t_value, __kmp_tid_from_gtid( gtid ), KMP_EQ, NULL);
1064  KMP_MB();
1065  }
1066 #endif /* BUILD_PARALLEL_ORDERED */
1067 }
1068 
1069 /* __kmp_parallel_dxo --
1070  * Signal the next task.
1071  */
1072 
1073 void
1074 __kmp_parallel_dxo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
1075 {
1076  int gtid = *gtid_ref;
1077 #ifdef BUILD_PARALLEL_ORDERED
1078  int tid = __kmp_tid_from_gtid( gtid );
1079  kmp_team_t *team = __kmp_team_from_gtid( gtid );
1080 #endif /* BUILD_PARALLEL_ORDERED */
1081 
1082  if( __kmp_env_consistency_check ) {
1083  if( __kmp_threads[gtid] -> th.th_root -> r.r_active )
1084  __kmp_pop_sync( gtid, ct_ordered_in_parallel, loc_ref );
1085  }
1086 #ifdef BUILD_PARALLEL_ORDERED
1087  if ( ! team -> t.t_serialized ) {
1088  KMP_MB(); /* Flush all pending memory write invalidates. */
1089 
1090  /* use the tid of the next thread in this team */
 1091  /* TODO: replace with a general release procedure */
1092  team -> t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc );
1093 
1094  KMP_MB(); /* Flush all pending memory write invalidates. */
1095  }
1096 #endif /* BUILD_PARALLEL_ORDERED */
1097 }
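/*
 * Together, __kmp_parallel_deo and __kmp_parallel_dxo implement a simple
 * token scheme for ORDERED sections: a shared counter holds the tid whose
 * turn it is, each thread waits until the counter equals its own tid, runs
 * its ordered chunk, then hands the token to (tid + 1) % nproc. A compact
 * sketch with an assumed volatile token variable:
 *
 *     while ( token != my_tid )             // __kmp_parallel_deo: wait for our turn
 *         ;                                 //   (the real code yields while it waits)
 *     // ... execute the ordered chunk ...
 *     token = ( my_tid + 1 ) % nproc;       // __kmp_parallel_dxo: pass the turn on
 */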
1098 
1099 /* ------------------------------------------------------------------------ */
1100 /* ------------------------------------------------------------------------ */
1101 
1102 /* ------------------------------------------------------------------------ */
1103 /* ------------------------------------------------------------------------ */
1104 
1105 /* The BARRIER for a SINGLE process section is always explicit */
1106 
1107 int
1108 __kmp_enter_single( int gtid, ident_t *id_ref, int push_ws )
1109 {
1110  int status;
1111  kmp_info_t *th;
1112  kmp_team_t *team;
1113 
1114  if( ! TCR_4(__kmp_init_parallel) )
1115  __kmp_parallel_initialize();
1116 
1117  th = __kmp_threads[ gtid ];
1118  team = th -> th.th_team;
1119  status = 0;
1120 
1121  th->th.th_ident = id_ref;
1122 
1123  if ( team -> t.t_serialized ) {
1124  status = 1;
1125  } else {
1126  kmp_int32 old_this = th->th.th_local.this_construct;
1127 
1128  ++th->th.th_local.this_construct;
1129  /* try to set team count to thread count--success means thread got the
1130  single block
1131  */
1132  /* TODO: Should this be acquire or release? */
1133  status = KMP_COMPARE_AND_STORE_ACQ32(&team -> t.t_construct, old_this,
1134  th->th.th_local.this_construct);
1135  }
1136 
1137  if( __kmp_env_consistency_check ) {
1138  if (status && push_ws) {
1139  __kmp_push_workshare( gtid, ct_psingle, id_ref );
1140  } else {
1141  __kmp_check_workshare( gtid, ct_psingle, id_ref );
1142  }
1143  }
1144 #if USE_ITT_BUILD
1145  if ( status ) {
1146  __kmp_itt_single_start( gtid );
1147  }
1148 #endif /* USE_ITT_BUILD */
1149  return status;
1150 }
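/*
 * Winner selection for SINGLE is a compare-and-swap race on the team-wide
 * construct counter: every thread bumps its private counter and tries to
 * advance the shared counter from the old value; exactly one CAS succeeds
 * per construct. A hedged sketch using a generic compare_and_swap placeholder:
 *
 *     old = my_construct_count++;                        // per-thread count of constructs seen
 *     won = compare_and_swap( &team_construct_count,     // shared team counter
 *                             old, my_construct_count );
 *     if ( won ) {
 *         // ... this thread executes the single block, everyone else skips it ...
 *     }
 */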
1151 
1152 void
1153 __kmp_exit_single( int gtid )
1154 {
1155 #if USE_ITT_BUILD
1156  __kmp_itt_single_end( gtid );
1157 #endif /* USE_ITT_BUILD */
1158  if( __kmp_env_consistency_check )
1159  __kmp_pop_workshare( gtid, ct_psingle, NULL );
1160 }
1161 
1162 
1163 /* ------------------------------------------------------------------------ */
1164 /* ------------------------------------------------------------------------ */
1165 
1166 static void
1167 __kmp_linear_barrier_gather( enum barrier_type bt,
1168  kmp_info_t *this_thr,
1169  int gtid,
1170  int tid,
1171  void (*reduce)(void *, void *)
1172  USE_ITT_BUILD_ARG(void * itt_sync_obj)
1173  )
1174 {
1175  register kmp_team_t *team = this_thr -> th.th_team;
1176  register kmp_bstate_t *thr_bar = & this_thr -> th.th_bar[ bt ].bb;
1177  register kmp_info_t **other_threads = team -> t.t_threads;
1178 
1179  KA_TRACE( 20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
1180  gtid, team->t.t_id, tid, bt ) );
1181 
1182  KMP_DEBUG_ASSERT( this_thr == other_threads[this_thr->th.th_info.ds.ds_tid] );
1183 
1184  /*
1185  * We now perform a linear reduction to signal that all
1186  * of the threads have arrived.
1187  *
1188  * Collect all the worker team member threads.
1189  */
1190  if ( ! KMP_MASTER_TID( tid )) {
1191 
1192  KA_TRACE( 20, ( "__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)"
1193  "arrived(%p): %u => %u\n",
1194  gtid, team->t.t_id, tid,
1195  __kmp_gtid_from_tid( 0, team ), team->t.t_id, 0,
1196  &thr_bar -> b_arrived, thr_bar -> b_arrived,
1197  thr_bar -> b_arrived + KMP_BARRIER_STATE_BUMP
1198  ) );
1199 
1200  /* mark arrival to master thread */
1201  //
1202  // After performing this write, a worker thread may not assume that
1203  // the team is valid any more - it could be deallocated by the master
1204  // thread at any time.
1205  //
1206  __kmp_release( other_threads[0], &thr_bar -> b_arrived, kmp_release_fence );
1207 
1208  } else {
1209  register kmp_balign_team_t *team_bar = & team -> t.t_bar[ bt ];
1210  register int nproc = this_thr -> th.th_team_nproc;
1211  register int i;
1212  register kmp_uint new_state;
1213 
1214  /* Don't have to worry about sleep bit here or atomic since team setting */
1215  new_state = team_bar -> b_arrived + KMP_BARRIER_STATE_BUMP;
1216 
1217  /* Collect all the worker team member threads. */
1218  for (i = 1; i < nproc; i++) {
1219 #if KMP_CACHE_MANAGE
1220  /* prefetch next thread's arrived count */
1221  if ( i+1 < nproc )
1222  KMP_CACHE_PREFETCH( &other_threads[ i+1 ] -> th.th_bar[ bt ].bb.b_arrived );
1223 #endif /* KMP_CACHE_MANAGE */
1224  KA_TRACE( 20, ( "__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
1225  "arrived(%p) == %u\n",
1226  gtid, team->t.t_id, tid,
1227  __kmp_gtid_from_tid( i, team ), team->t.t_id, i,
1228  &other_threads[i] -> th.th_bar[ bt ].bb.b_arrived,
1229  new_state ) );
1230 
1231  /* wait for worker thread to arrive */
1232  __kmp_wait_sleep( this_thr,
1233  & other_threads[ i ] -> th.th_bar[ bt ].bb.b_arrived,
1234  new_state, FALSE
1235  USE_ITT_BUILD_ARG( itt_sync_obj )
1236  );
1237 
1238  if (reduce) {
1239 
1240  KA_TRACE( 100, ( "__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
1241  gtid, team->t.t_id, tid,
1242  __kmp_gtid_from_tid( i, team ), team->t.t_id, i ) );
1243 
1244  (*reduce)( this_thr -> th.th_local.reduce_data,
1245  other_threads[ i ] -> th.th_local.reduce_data );
1246 
1247  }
1248 
1249  }
1250 
1251  /* Don't have to worry about sleep bit here or atomic since team setting */
1252  team_bar -> b_arrived = new_state;
1253  KA_TRACE( 20, ( "__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d "
1254  "arrived(%p) = %u\n",
1255  gtid, team->t.t_id, tid, team->t.t_id,
1256  &team_bar -> b_arrived, new_state ) );
1257  }
1258 
1259  KA_TRACE( 20, ( "__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
1260  gtid, team->t.t_id, tid, bt ) );
1261 }
1262 
1263 
1264 static void
1265 __kmp_tree_barrier_gather( enum barrier_type bt,
1266  kmp_info_t *this_thr,
1267  int gtid,
1268  int tid,
1269  void (*reduce) (void *, void *)
1270  USE_ITT_BUILD_ARG( void * itt_sync_obj )
1271  )
1272 {
1273  register kmp_team_t *team = this_thr -> th.th_team;
1274  register kmp_bstate_t *thr_bar = & this_thr -> th.th_bar[ bt ].bb;
1275  register kmp_info_t **other_threads = team -> t.t_threads;
1276  register kmp_uint32 nproc = this_thr -> th.th_team_nproc;
1277  register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[ bt ];
1278  register kmp_uint32 branch_factor = 1 << branch_bits ;
1279  register kmp_uint32 child;
1280  register kmp_uint32 child_tid;
1281  register kmp_uint new_state;
1282 
1283  KA_TRACE( 20, ( "__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
1284  gtid, team->t.t_id, tid, bt ) );
1285 
1286  KMP_DEBUG_ASSERT( this_thr == other_threads[this_thr->th.th_info.ds.ds_tid] );
1287 
1288  /*
1289  * We now perform a tree gather to wait until all
1290  * of the threads have arrived, and reduce any required data
1291  * as we go.
1292  */
1293 
1294  child_tid = (tid << branch_bits) + 1;
1295 
1296  if ( child_tid < nproc ) {
1297 
1298  /* parent threads wait for all their children to arrive */
1299  new_state = team -> t.t_bar[ bt ].b_arrived + KMP_BARRIER_STATE_BUMP;
1300  child = 1;
1301 
1302  do {
1303  register kmp_info_t *child_thr = other_threads[ child_tid ];
1304  register kmp_bstate_t *child_bar = & child_thr -> th.th_bar[ bt ].bb;
1305 #if KMP_CACHE_MANAGE
1306  /* prefetch next thread's arrived count */
1307  if ( child+1 <= branch_factor && child_tid+1 < nproc )
1308  KMP_CACHE_PREFETCH( &other_threads[ child_tid+1 ] -> th.th_bar[ bt ].bb.b_arrived );
1309 #endif /* KMP_CACHE_MANAGE */
1310  KA_TRACE( 20, ( "__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
1311  "arrived(%p) == %u\n",
1312  gtid, team->t.t_id, tid,
1313  __kmp_gtid_from_tid( child_tid, team ), team->t.t_id, child_tid,
1314  &child_bar -> b_arrived, new_state ) );
1315 
1316  /* wait for child to arrive */
1317  __kmp_wait_sleep( this_thr, &child_bar -> b_arrived, new_state, FALSE
1318  USE_ITT_BUILD_ARG( itt_sync_obj)
1319  );
1320 
1321  if (reduce) {
1322 
1323  KA_TRACE( 100, ( "__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
1324  gtid, team->t.t_id, tid,
1325  __kmp_gtid_from_tid( child_tid, team ), team->t.t_id,
1326  child_tid ) );
1327 
1328  (*reduce)( this_thr -> th.th_local.reduce_data,
1329  child_thr -> th.th_local.reduce_data );
1330 
1331  }
1332 
1333  child++;
1334  child_tid++;
1335  }
1336  while ( child <= branch_factor && child_tid < nproc );
1337  }
1338 
1339  if ( !KMP_MASTER_TID(tid) ) {
1340  /* worker threads */
1341  register kmp_int32 parent_tid = (tid - 1) >> branch_bits;
1342 
1343  KA_TRACE( 20, ( "__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
1344  "arrived(%p): %u => %u\n",
1345  gtid, team->t.t_id, tid,
1346  __kmp_gtid_from_tid( parent_tid, team ), team->t.t_id, parent_tid,
1347  &thr_bar -> b_arrived, thr_bar -> b_arrived,
1348  thr_bar -> b_arrived + KMP_BARRIER_STATE_BUMP
1349  ) );
1350 
1351  /* mark arrival to parent thread */
1352  //
1353  // After performing this write, a worker thread may not assume that
1354  // the team is valid any more - it could be deallocated by the master
1355  // thread at any time.
1356  //
1357  __kmp_release( other_threads[parent_tid], &thr_bar -> b_arrived, kmp_release_fence );
1358 
1359  } else {
1360  /* Need to update the team arrived pointer if we are the master thread */
1361 
1362  if ( nproc > 1 )
1363  /* New value was already computed in above loop */
1364  team -> t.t_bar[ bt ].b_arrived = new_state;
1365  else
1366  team -> t.t_bar[ bt ].b_arrived += KMP_BARRIER_STATE_BUMP;
1367 
1368  KA_TRACE( 20, ( "__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %u\n",
1369  gtid, team->t.t_id, tid, team->t.t_id,
1370  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived ) );
1371  }
1372 
1373  KA_TRACE( 20, ( "__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
1374  gtid, team->t.t_id, tid, bt ) );
1375 }
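/*
 * The tree gather lays the team out as an implicit d-ary tree
 * (d = branch_factor = 1 << branch_bits) using only shifts: thread tid's
 * children are tid*d + 1 .. tid*d + d, and its parent is (tid - 1) / d.
 * A small sketch of the two index computations used above:
 *
 *     first_child = ( tid << branch_bits ) + 1;     // == tid * d + 1
 *     parent      = ( tid - 1 ) >> branch_bits;     // == (tid - 1) / d, for tid > 0
 */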
1376 
1377 
1378 static void
1379 __kmp_hyper_barrier_gather( enum barrier_type bt,
1380  kmp_info_t *this_thr,
1381  int gtid,
1382  int tid,
1383  void (*reduce) (void *, void *)
1384  USE_ITT_BUILD_ARG (void * itt_sync_obj)
1385  )
1386 {
1387  register kmp_team_t *team = this_thr -> th.th_team;
1388  register kmp_bstate_t *thr_bar = & this_thr -> th.th_bar[ bt ].bb;
1389  register kmp_info_t **other_threads = team -> t.t_threads;
1390  register kmp_uint new_state = KMP_BARRIER_UNUSED_STATE;
1391  register kmp_uint32 num_threads = this_thr -> th.th_team_nproc;
1392  register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[ bt ];
1393  register kmp_uint32 branch_factor = 1 << branch_bits ;
1394  register kmp_uint32 offset;
1395  register kmp_uint32 level;
1396 
1397  KA_TRACE( 20, ( "__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
1398  gtid, team->t.t_id, tid, bt ) );
1399 
1400  KMP_DEBUG_ASSERT( this_thr == other_threads[this_thr->th.th_info.ds.ds_tid] );
1401 
1402  /*
1403  * We now perform a hypercube-embedded tree gather to wait until all
1404  * of the threads have arrived, and reduce any required data
1405  * as we go.
1406  */
1407 
1408  for ( level=0, offset =1;
1409  offset < num_threads;
1410  level += branch_bits, offset <<= branch_bits )
1411  {
1412  register kmp_uint32 child;
1413  register kmp_uint32 child_tid;
1414 
1415  if ( ((tid >> level) & (branch_factor - 1)) != 0 ) {
1416  register kmp_int32 parent_tid = tid & ~( (1 << (level + branch_bits)) -1 );
1417 
1418  KA_TRACE( 20, ( "__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
1419  "arrived(%p): %u => %u\n",
1420  gtid, team->t.t_id, tid,
1421  __kmp_gtid_from_tid( parent_tid, team ), team->t.t_id, parent_tid,
1422  &thr_bar -> b_arrived, thr_bar -> b_arrived,
1423  thr_bar -> b_arrived + KMP_BARRIER_STATE_BUMP
1424  ) );
1425 
1426  /* mark arrival to parent thread */
1427  //
1428  // After performing this write (in the last iteration of the
1429  // enclosing for loop), a worker thread may not assume that the
1430  // team is valid any more - it could be deallocated by the master
1431  // thread at any time.
1432  //
1433  __kmp_release( other_threads[parent_tid], &thr_bar -> b_arrived, kmp_release_fence );
1434  break;
1435  }
1436 
1437  /* parent threads wait for children to arrive */
1438 
1439  for ( child = 1, child_tid = tid + (1 << level);
1440  child < branch_factor && child_tid < num_threads;
1441  child++, child_tid += (1 << level) )
1442  {
1443  register kmp_info_t *child_thr = other_threads[ child_tid ];
1444  register kmp_bstate_t *child_bar = & child_thr -> th.th_bar[ bt ].bb;
1445 #if KMP_CACHE_MANAGE
1446  register kmp_uint32 next_child_tid = child_tid + (1 << level);
1447  /* prefetch next thread's arrived count */
1448  if ( child+1 < branch_factor && next_child_tid < num_threads )
1449  KMP_CACHE_PREFETCH( &other_threads[ next_child_tid ] -> th.th_bar[ bt ].bb.b_arrived );
1450 #endif /* KMP_CACHE_MANAGE */
1451  /* Only read this arrived flag once per thread that needs it */
1452  if (new_state == KMP_BARRIER_UNUSED_STATE)
1453  new_state = team -> t.t_bar[ bt ].b_arrived + KMP_BARRIER_STATE_BUMP;
1454 
1455  KA_TRACE( 20, ( "__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
1456  "arrived(%p) == %u\n",
1457  gtid, team->t.t_id, tid,
1458  __kmp_gtid_from_tid( child_tid, team ), team->t.t_id, child_tid,
1459  &child_bar -> b_arrived, new_state ) );
1460 
1461  /* wait for child to arrive */
1462  __kmp_wait_sleep( this_thr, &child_bar -> b_arrived, new_state, FALSE
1463  USE_ITT_BUILD_ARG (itt_sync_obj)
1464  );
1465 
1466  if (reduce) {
1467 
1468  KA_TRACE( 100, ( "__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
1469  gtid, team->t.t_id, tid,
1470  __kmp_gtid_from_tid( child_tid, team ), team->t.t_id,
1471  child_tid ) );
1472 
1473  (*reduce)( this_thr -> th.th_local.reduce_data,
1474  child_thr -> th.th_local.reduce_data );
1475 
1476  }
1477  }
1478  }
1479 
1480 
1481  if ( KMP_MASTER_TID(tid) ) {
1482  /* Need to update the team arrived pointer if we are the master thread */
1483 
1484  if (new_state == KMP_BARRIER_UNUSED_STATE)
1485  team -> t.t_bar[ bt ].b_arrived += KMP_BARRIER_STATE_BUMP;
1486  else
1487  team -> t.t_bar[ bt ].b_arrived = new_state;
1488 
1489  KA_TRACE( 20, ( "__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %u\n",
1490  gtid, team->t.t_id, tid, team->t.t_id,
1491  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived ) );
1492  }
1493 
1494  KA_TRACE( 20, ( "__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
1495  gtid, team->t.t_id, tid, bt ) );
1496 
1497 }
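/*
 * In the hypercube-embedded tree, a thread's role at each level is read
 * straight from its tid bits: if the branch_bits-wide digit at the current
 * level is nonzero, the thread is a child and reports to the tid obtained by
 * clearing all bits below the next level; otherwise it gathers from the tids
 * offset by multiples of (1 << level). A sketch of the index formulas used above:
 *
 *     is_child = ( ( tid >> level ) & ( branch_factor - 1 ) ) != 0;
 *     parent   = tid & ~( ( 1 << ( level + branch_bits ) ) - 1 );   // clear the low digits
 *     child_k  = tid + ( k << level );                              // k = 1 .. branch_factor - 1
 */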
1498 
1499 static void
1500 __kmp_linear_barrier_release( enum barrier_type bt,
1501  kmp_info_t *this_thr,
1502  int gtid,
1503  int tid,
1504  int propagate_icvs
1505  USE_ITT_BUILD_ARG(void * itt_sync_obj)
1506  )
1507 {
1508  register kmp_bstate_t *thr_bar = &this_thr -> th.th_bar[ bt ].bb;
1509  register kmp_team_t *team;
1510 
1511  if (KMP_MASTER_TID( tid )) {
1512  register unsigned int i;
1513  register kmp_uint32 nproc = this_thr -> th.th_team_nproc;
1514  register kmp_info_t **other_threads;
1515 
1516  team = __kmp_threads[ gtid ]-> th.th_team;
1517  KMP_DEBUG_ASSERT( team != NULL );
1518  other_threads = team -> t.t_threads;
1519 
1520  KA_TRACE( 20, ( "__kmp_linear_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
1521  gtid, team->t.t_id, tid, bt ) );
1522 
1523  if (nproc > 1) {
1524 #if KMP_BARRIER_ICV_PUSH
1525  if ( propagate_icvs ) {
1526  load_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs);
1527  for (i = 1; i < nproc; i++) {
1528  __kmp_init_implicit_task( team->t.t_ident,
1529  team->t.t_threads[i], team, i, FALSE );
1530  store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs, &team->t.t_implicit_task_taskdata[0].td_icvs);
1531  }
1532  sync_icvs();
1533  }
1534 #endif // KMP_BARRIER_ICV_PUSH
1535 
1536  /* Now, release all of the worker threads */
1537  for (i = 1; i < nproc; i++) {
1538 #if KMP_CACHE_MANAGE
1539  /* prefetch next thread's go flag */
1540  if( i+1 < nproc )
1541  KMP_CACHE_PREFETCH( &other_threads[ i+1 ]-> th.th_bar[ bt ].bb.b_go );
1542 #endif /* KMP_CACHE_MANAGE */
1543  KA_TRACE( 20, ( "__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
1544  "go(%p): %u => %u\n",
1545  gtid, team->t.t_id, tid,
1546  other_threads[i]->th.th_info.ds.ds_gtid, team->t.t_id, i,
1547  &other_threads[i]->th.th_bar[bt].bb.b_go,
1548  other_threads[i]->th.th_bar[bt].bb.b_go,
1549  other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP
1550  ) );
1551 
1552  __kmp_release( other_threads[ i ],
1553  &other_threads[ i ]-> th.th_bar[ bt ].bb.b_go, kmp_acquire_fence );
1554  }
1555  }
1556  } else {
1557  /* Wait for the MASTER thread to release us */
1558 
1559  KA_TRACE( 20, ( "__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
1560  gtid, &thr_bar -> b_go, KMP_BARRIER_STATE_BUMP ) );
1561 
1562  __kmp_wait_sleep( this_thr, &thr_bar -> b_go, KMP_BARRIER_STATE_BUMP, TRUE
1563  USE_ITT_BUILD_ARG(itt_sync_obj)
1564  );
1565 
1566 #if USE_ITT_BUILD && OMP_30_ENABLED && USE_ITT_NOTIFY
1567  if ( ( __itt_sync_create_ptr && itt_sync_obj == NULL ) || KMP_ITT_DEBUG ) {
1568  // we are on a fork barrier where we could not get the object reliably (or ITTNOTIFY is disabled)
1569  itt_sync_obj = __kmp_itt_barrier_object( gtid, bs_forkjoin_barrier, 0, -1 );
1570  // cancel wait on previous parallel region...
1571  __kmp_itt_task_starting( itt_sync_obj );
1572 
1573  if ( bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done) )
1574  return;
1575 
1576  itt_sync_obj = __kmp_itt_barrier_object( gtid, bs_forkjoin_barrier );
1577  if ( itt_sync_obj != NULL )
1578  __kmp_itt_task_finished( itt_sync_obj ); // call prepare as early as possible for "new" barrier
1579 
1580  } else
1581 #endif /* USE_ITT_BUILD && OMP_30_ENABLED && USE_ITT_NOTIFY */
1582  //
1583  // early exit for reaping threads releasing forkjoin barrier
1584  //
1585  if ( bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done) )
1586  return;
1587 
1588  //
1589  // The worker thread may now assume that the team is valid.
1590  //
1591 #if USE_ITT_BUILD && !OMP_30_ENABLED && USE_ITT_NOTIFY
1592  // libguide only code (cannot use *itt_task* routines)
1593  if ( ( __itt_sync_create_ptr && itt_sync_obj == NULL ) || KMP_ITT_DEBUG ) {
1594  // we are on a fork barrier where we could not get the object reliably
1595  itt_sync_obj = __kmp_itt_barrier_object( gtid, bs_forkjoin_barrier );
1596  __kmp_itt_barrier_starting( gtid, itt_sync_obj ); // no need to call releasing, but we have paired calls...
1597  }
1598 #endif /* USE_ITT_BUILD && !OMP_30_ENABLED && USE_ITT_NOTIFY */
1599  #ifdef KMP_DEBUG
1600  tid = __kmp_tid_from_gtid( gtid );
1601  team = __kmp_threads[ gtid ]-> th.th_team;
1602  #endif
1603  KMP_DEBUG_ASSERT( team != NULL );
1604 
1605  TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
1606  KA_TRACE( 20, ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
1607  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE ) );
1608 
1609  KMP_MB(); /* Flush all pending memory write invalidates. */
1610  }
1611 
1612  KA_TRACE( 20, ( "__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
1613  gtid, team->t.t_id, tid, bt ) );
1614 }
1615 
1616 
1617 static void
1618 __kmp_tree_barrier_release( enum barrier_type bt,
1619  kmp_info_t *this_thr,
1620  int gtid,
1621  int tid,
1622  int propagate_icvs
1623  USE_ITT_BUILD_ARG(void * itt_sync_obj)
1624  )
1625 {
1626  /* handle fork barrier workers who aren't part of a team yet */
1627  register kmp_team_t *team;
1628  register kmp_bstate_t *thr_bar = & this_thr -> th.th_bar[ bt ].bb;
1629  register kmp_uint32 nproc;
1630  register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[ bt ];
1631  register kmp_uint32 branch_factor = 1 << branch_bits ;
1632  register kmp_uint32 child;
1633  register kmp_uint32 child_tid;
1634 
1635  /*
1636  * We now perform a tree release for all
1637  * of the threads that have been gathered
1638  */
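 /*
  * Worked sketch of the fan-out below (assuming branch_bits == 2, i.e.
  * branch_factor == 4, and nproc == 16): each parent tid releases children
  * (tid << 2) + 1 .. (tid << 2) + 4, so tid 0 releases 1-4, tid 1 releases
  * 5-8, tid 2 releases 9-12, and tid 3 releases 13-15.
  */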
1639 
1640  if ( ! KMP_MASTER_TID( tid )) {
1641  /* worker threads */
1642 
1643  KA_TRACE( 20, ( "__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n",
1644  gtid, &thr_bar -> b_go, KMP_BARRIER_STATE_BUMP ) );
1645 
1646  /* wait for parent thread to release us */
1647  __kmp_wait_sleep( this_thr, &thr_bar -> b_go, KMP_BARRIER_STATE_BUMP, TRUE
1648  USE_ITT_BUILD_ARG(itt_sync_obj)
1649  );
1650 
1651 #if USE_ITT_BUILD && OMP_30_ENABLED && USE_ITT_NOTIFY
1652  if ( ( __itt_sync_create_ptr && itt_sync_obj == NULL ) || KMP_ITT_DEBUG ) {
1653  // we are on a fork barrier where we could not get the object reliably (or ITTNOTIFY is disabled)
1654  itt_sync_obj = __kmp_itt_barrier_object( gtid, bs_forkjoin_barrier, 0, -1 );
1655  // cancel wait on previous parallel region...
1656  __kmp_itt_task_starting( itt_sync_obj );
1657 
1658  if ( bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done) )
1659  return;
1660 
1661  itt_sync_obj = __kmp_itt_barrier_object( gtid, bs_forkjoin_barrier );
1662  if ( itt_sync_obj != NULL )
1663  __kmp_itt_task_finished( itt_sync_obj ); // call prepare as early as possible for "new" barrier
1664 
1665  } else
1666 #endif /* USE_ITT_BUILD && OMP_30_ENABLED && USE_ITT_NOTIFY */
1667  //
1668  // early exit for reaping threads releasing forkjoin barrier
1669  //
1670  if ( bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done) )
1671  return;
1672 
1673  //
1674  // The worker thread may now assume that the team is valid.
1675  //
1676 #if USE_ITT_BUILD && !OMP_30_ENABLED && USE_ITT_NOTIFY
1677  // libguide only code (cannot use *itt_task* routines)
1678  if ( ( __itt_sync_create_ptr && itt_sync_obj == NULL ) || KMP_ITT_DEBUG ) {
1679  // we are on a fork barrier where we could not get the object reliably
1680  itt_sync_obj = __kmp_itt_barrier_object( gtid, bs_forkjoin_barrier );
1681  __kmp_itt_barrier_starting( gtid, itt_sync_obj ); // no need to call releasing, but we have paired calls...
1682  }
1683 #endif /* USE_ITT_BUILD && !OMP_30_ENABLED && USE_ITT_NOTIFY */
1684  team = __kmp_threads[ gtid ]-> th.th_team;
1685  KMP_DEBUG_ASSERT( team != NULL );
1686  tid = __kmp_tid_from_gtid( gtid );
1687 
1688  TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
1689  KA_TRACE( 20, ( "__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
1690  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE ) );
1691 
1692  KMP_MB(); /* Flush all pending memory write invalidates. */
1693 
1694  } else {
1695  team = __kmp_threads[ gtid ]-> th.th_team;
1696  KMP_DEBUG_ASSERT( team != NULL );
1697 
1698  KA_TRACE( 20, ( "__kmp_tree_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
1699  gtid, team->t.t_id, tid, bt ) );
1700  }
1701 
1702  nproc = this_thr -> th.th_team_nproc;
1703  child_tid = ( tid << branch_bits ) + 1;
1704 
1705  if ( child_tid < nproc ) {
1706  register kmp_info_t **other_threads = team -> t.t_threads;
1707  child = 1;
1708  /* parent threads release all their children */
1709 
1710  do {
1711  register kmp_info_t *child_thr = other_threads[ child_tid ];
1712  register kmp_bstate_t *child_bar = & child_thr -> th.th_bar[ bt ].bb;
1713 #if KMP_CACHE_MANAGE
1714  /* prefetch next thread's go count */
1715  if ( child+1 <= branch_factor && child_tid+1 < nproc )
1716  KMP_CACHE_PREFETCH( &other_threads[ child_tid+1 ] -> th.th_bar[ bt ].bb.b_go );
1717 #endif /* KMP_CACHE_MANAGE */
1718 
1719 #if KMP_BARRIER_ICV_PUSH
1720  if ( propagate_icvs ) {
1721  __kmp_init_implicit_task( team->t.t_ident,
1722  team->t.t_threads[child_tid], team, child_tid, FALSE );
1723  load_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs);
1724  store_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs, &team->t.t_implicit_task_taskdata[0].td_icvs);
1725  sync_icvs();
1726  }
1727 #endif // KMP_BARRIER_ICV_PUSH
1728 
1729  KA_TRACE( 20, ( "__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u) "
1730  "go(%p): %u => %u\n",
1731  gtid, team->t.t_id, tid,
1732  __kmp_gtid_from_tid( child_tid, team ), team->t.t_id,
1733  child_tid, &child_bar -> b_go, child_bar -> b_go,
1734  child_bar -> b_go + KMP_BARRIER_STATE_BUMP ) );
1735 
1736  /* release child from barrier */
1737  __kmp_release( child_thr, &child_bar -> b_go, kmp_acquire_fence );
1738 
1739  child++;
1740  child_tid++;
1741  }
1742  while ( child <= branch_factor && child_tid < nproc );
1743  }
1744 
1745  KA_TRACE( 20, ( "__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
1746  gtid, team->t.t_id, tid, bt ) );
1747 }
1748 
1749 /* The reverse versions seem to beat the forward versions overall */
1750 #define KMP_REVERSE_HYPER_BAR
1751 #ifdef KMP_REVERSE_HYPER_BAR
1752 static void
1753 __kmp_hyper_barrier_release( enum barrier_type bt,
1754  kmp_info_t *this_thr,
1755  int gtid,
1756  int tid,
1757  int propagate_icvs
1758  USE_ITT_BUILD_ARG(void * itt_sync_obj)
1759  )
1760 {
1761  /* handle fork barrier workers who aren't part of a team yet */
1762  register kmp_team_t *team;
1763  register kmp_bstate_t *thr_bar = & this_thr -> th.th_bar[ bt ].bb;
1764  register kmp_info_t **other_threads;
1765  register kmp_uint32 num_threads;
1766  register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[ bt ];
1767  register kmp_uint32 branch_factor = 1 << branch_bits;
1768  register kmp_uint32 child;
1769  register kmp_uint32 child_tid;
1770  register kmp_uint32 offset;
1771  register kmp_uint32 level;
1772 
1773  /*
1774  * We now perform a hypercube-embedded tree release for all
1775  * of the threads that have been gathered, but in the exact
1776  * reverse order from the corresponding gather (for load balance).
1777  */
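 /*
  * Worked sketch of the reverse order (assuming branch_bits == 1, i.e.
  * branch_factor == 2, and 8 threads): the master (tid 0) releases tids
  * 4, 2, 1 in that order; tid 4 then releases 6 and 5; tid 2 releases 3;
  * tid 6 releases 7. This mirrors the hypercube gather, which pairs the
  * lowest levels first.
  */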
1778 
1779  if ( ! KMP_MASTER_TID( tid )) {
1780  /* worker threads */
1781 
1782  KA_TRACE( 20, ( "__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n",
1783  gtid, &thr_bar -> b_go, KMP_BARRIER_STATE_BUMP ) );
1784 
1785  /* wait for parent thread to release us */
1786  __kmp_wait_sleep( this_thr, &thr_bar -> b_go, KMP_BARRIER_STATE_BUMP, TRUE
1787  USE_ITT_BUILD_ARG( itt_sync_obj )
1788  );
1789 
1790 #if USE_ITT_BUILD && OMP_30_ENABLED && USE_ITT_NOTIFY
1791  if ( ( __itt_sync_create_ptr && itt_sync_obj == NULL ) || KMP_ITT_DEBUG ) {
1792  // we are on a fork barrier where we could not get the object reliably
1793  itt_sync_obj = __kmp_itt_barrier_object( gtid, bs_forkjoin_barrier, 0, -1 );
1794  // cancel wait on previous parallel region...
1795  __kmp_itt_task_starting( itt_sync_obj );
1796 
1797  if ( bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done) )
1798  return;
1799 
1800  itt_sync_obj = __kmp_itt_barrier_object( gtid, bs_forkjoin_barrier );
1801  if ( itt_sync_obj != NULL )
1802  __kmp_itt_task_finished( itt_sync_obj ); // call prepare as early as possible for "new" barrier
1803 
1804  } else
1805 #endif /* USE_ITT_BUILD && OMP_30_ENABLED && USE_ITT_NOTIFY */
1806  //
1807  // early exit for reaping threads releasing forkjoin barrier
1808  //
1809  if ( bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done) )
1810  return;
1811 
1812  //
1813  // The worker thread may now assume that the team is valid.
1814  //
1815 #if USE_ITT_BUILD && !OMP_30_ENABLED && USE_ITT_NOTIFY
1816  // libguide only code (cannot use *itt_task* routines)
1817  if ( ( __itt_sync_create_ptr && itt_sync_obj == NULL ) || KMP_ITT_DEBUG ) {
1818  // we are on a fork barrier where we could not get the object reliably
1819  itt_sync_obj = __kmp_itt_barrier_object( gtid, bs_forkjoin_barrier );
1820  __kmp_itt_barrier_starting( gtid, itt_sync_obj ); // no need to call releasing, but we have paired calls...
1821  }
1822 #endif /* USE_ITT_BUILD && !OMP_30_ENABLED && USE_ITT_NOTIFY */
1823  team = __kmp_threads[ gtid ]-> th.th_team;
1824  KMP_DEBUG_ASSERT( team != NULL );
1825  tid = __kmp_tid_from_gtid( gtid );
1826 
1827  TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
1828  KA_TRACE( 20, ( "__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
1829  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE ) );
1830 
1831  KMP_MB(); /* Flush all pending memory write invalidates. */
1832 
1833  } else { /* KMP_MASTER_TID(tid) */
1834  team = __kmp_threads[ gtid ]-> th.th_team;
1835  KMP_DEBUG_ASSERT( team != NULL );
1836 
1837  KA_TRACE( 20, ( "__kmp_hyper_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
1838  gtid, team->t.t_id, tid, bt ) );
1839  }
1840 
1841  num_threads = this_thr -> th.th_team_nproc;
1842  other_threads = team -> t.t_threads;
1843 
1844  /* count up to correct level for parent */
1845  for ( level = 0, offset = 1;
1846  offset < num_threads && (((tid >> level) & (branch_factor-1)) == 0);
1847  level += branch_bits, offset <<= branch_bits );
1848 
1849  /* now go down from there */
1850  for ( level -= branch_bits, offset >>= branch_bits;
1851  offset != 0;
1852  level -= branch_bits, offset >>= branch_bits )
1853  {
1854  /* Now go in reverse order through the children, highest to lowest.
1855  Initial setting of child is conservative here. */
1856  child = num_threads >> ((level==0)?level:level-1);
1857  for ( child = (child < branch_factor-1) ? child : branch_factor-1,
1858  child_tid = tid + (child << level);
1859  child >= 1;
1860  child--, child_tid -= (1 << level) )
1861  {
1862 
1863  if ( child_tid >= num_threads ) continue; /* child doesn't exist so keep going */
1864  else {
1865  register kmp_info_t *child_thr = other_threads[ child_tid ];
1866  register kmp_bstate_t *child_bar = & child_thr -> th.th_bar[ bt ].bb;
1867 #if KMP_CACHE_MANAGE
1868  register kmp_uint32 next_child_tid = child_tid - (1 << level);
1869  /* prefetch next thread's go count */
1870  if ( child-1 >= 1 && next_child_tid < num_threads )
1871  KMP_CACHE_PREFETCH( &other_threads[ next_child_tid ]->th.th_bar[ bt ].bb.b_go );
1872 #endif /* KMP_CACHE_MANAGE */
1873 
1874 #if KMP_BARRIER_ICV_PUSH
1875  if ( propagate_icvs ) {
1876  KMP_DEBUG_ASSERT( team != NULL );
1877  __kmp_init_implicit_task( team->t.t_ident,
1878  team->t.t_threads[child_tid], team, child_tid, FALSE );
1879  load_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs);
1880  store_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs, &team->t.t_implicit_task_taskdata[0].td_icvs);
1881  sync_icvs();
1882  }
1883 #endif // KMP_BARRIER_ICV_PUSH
1884 
1885  KA_TRACE( 20, ( "__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u) "
1886  "go(%p): %u => %u\n",
1887  gtid, team->t.t_id, tid,
1888  __kmp_gtid_from_tid( child_tid, team ), team->t.t_id,
1889  child_tid, &child_bar -> b_go, child_bar -> b_go,
1890  child_bar -> b_go + KMP_BARRIER_STATE_BUMP ) );
1891 
1892  /* release child from barrier */
1893  __kmp_release( child_thr, &child_bar -> b_go, kmp_acquire_fence );
1894  }
1895  }
1896  }
1897 
1898  KA_TRACE( 20, ( "__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
1899  gtid, team->t.t_id, tid, bt ) );
1900 }
1901 
1902 #else /* !KMP_REVERSE_HYPER_BAR */
1903 
1904 static void
1905 __kmp_hyper_barrier_release( enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid, int propagate_icvs )
1906 {
1907  /* handle fork barrier workers who aren't part of a team yet */
1908  register kmp_team_t *team;
1909  register kmp_bstate_t *thr_bar = & this_thr -> th.th_bar[ bt ].bb;
1910  register kmp_info_t **other_threads;
1911  register kmp_uint32 num_threads;
1912  register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[ bt ];
1913  register kmp_uint32 branch_factor = 1 << branch_bits;
1914  register kmp_uint32 child;
1915  register kmp_uint32 child_tid;
1916  register kmp_uint32 offset;
1917  register kmp_uint32 level;
1918 
1919  /*
1920  * We now perform a hypercube-embedded tree release for all
1921  * of the threads that have been gathered, but in the same order
1922  * as the gather.
1923  */
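 /*
  * Worked sketch of the forward order (assuming branch_bits == 1 and
  * 8 threads): tid 0 releases 1, 2, 4 (lowest level first); tid 4 then
  * releases 5 and 6; tid 2 releases 3; tid 6 releases 7.
  */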
1924 
1925  if ( ! KMP_MASTER_TID( tid )) {
1926  /* worker threads */
1927 
1928  KA_TRACE( 20, ( "__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n",
1929  gtid, &thr_bar -> b_go, KMP_BARRIER_STATE_BUMP ) );
1930 
1931  /* wait for parent thread to release us */
1932  __kmp_wait_sleep( this_thr, &thr_bar -> b_go, KMP_BARRIER_STATE_BUMP, TRUE, NULL );
1933 
1934 #if USE_ITT_BUILD && OMP_30_ENABLED && USE_ITT_NOTIFY
1935  if ( ( __itt_sync_create_ptr && itt_sync_obj == NULL ) || KMP_ITT_DEBUG ) {
1936  // we are on a fork barrier where we could not get the object reliably
1937  itt_sync_obj = __kmp_itt_barrier_object( gtid, bs_forkjoin_barrier, 0, -1 );
1938  // cancel wait on previous parallel region...
1939  __kmp_itt_task_starting( itt_sync_obj );
1940 
1941  if ( bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done) )
1942  return;
1943 
1944  itt_sync_obj = __kmp_itt_barrier_object( gtid, bs_forkjoin_barrier );
1945  if ( itt_sync_obj != NULL )
1946  __kmp_itt_task_finished( itt_sync_obj ); // call prepare as early as possible for "new" barrier
1947 
1948  } else
1949 #endif /* USE_ITT_BUILD && OMP_30_ENABLED && USE_ITT_NOTIFY */
1950  //
1951  // early exit for reaping threads releasing forkjoin barrier
1952  //
1953  if ( bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done) )
1954  return;
1955 
1956  //
1957  // The worker thread may now assume that the team is valid.
1958  //
1959 #if USE_ITT_BUILD && !OMP_30_ENABLED && USE_ITT_NOTIFY
1960  // libguide only code (cannot use *itt_task* routines)
1961  if ( ( __itt_sync_create_ptr && itt_sync_obj == NULL ) || KMP_ITT_DEBUG ) {
1962  // we are on a fork barrier where we could not get the object reliably
1963  itt_sync_obj = __kmp_itt_barrier_object( gtid, bs_forkjoin_barrier );
1964  __kmp_itt_barrier_starting( gtid, itt_sync_obj ); // no need to call releasing, but we have paired calls...
1965  }
1966 #endif /* USE_ITT_BUILD && !OMP_30_ENABLED && USE_ITT_NOTIFY */
1967  team = __kmp_threads[ gtid ]-> th.th_team;
1968  KMP_DEBUG_ASSERT( team != NULL );
1969  tid = __kmp_tid_from_gtid( gtid );
1970 
1971  TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
1972  KA_TRACE( 20, ( "__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
1973  gtid, ( team != NULL ) ? team->t.t_id : -1, tid,
1974  &thr_bar->b_go, KMP_INIT_BARRIER_STATE ) );
1975 
1976  KMP_MB(); /* Flush all pending memory write invalidates. */
1977 
1978  } else { /* KMP_MASTER_TID(tid) */
1979  team = __kmp_threads[ gtid ]-> th.th_team;
1980  KMP_DEBUG_ASSERT( team != NULL );
1981 
1982  KA_TRACE( 20, ( "__kmp_hyper_barrier_release: T#%d(%d:%d) enter for barrier type %d\n",
1983  gtid, team->t.t_id, tid, bt ) );
1984  }
1985 
1986  /* Now set up team parameters since workers have been released */
1987  if ( team == NULL ) {
1988  /* handle fork barrier workers who are now part of a team */
1989  tid = __kmp_tid_from_gtid( gtid );
1990  team = __kmp_threads[ gtid ]-> th.th_team;
1991  }
1992  num_threads = this_thr -> th.th_team_nproc;
1993  other_threads = team -> t.t_threads;
1994 
1995  /* Go down the tree, level by level */
1996  for ( level = 0, offset = 1;
1997  offset < num_threads;
1998  level += branch_bits, offset <<= branch_bits )
1999  {
2000  if (((tid >> level) & (branch_factor - 1)) != 0)
2001  /* No need to go any lower than this, since this is the level
2002  at which the parent would be notified */
2003  break;
2004 
2005  /* iterate through children on this level of the tree */
2006  for ( child = 1, child_tid = tid + (1 << level);
2007  child < branch_factor && child_tid < num_threads;
2008  child++, child_tid += (1 << level) )
2009  {
2010  register kmp_info_t *child_thr = other_threads[ child_tid ];
2011  register kmp_bstate_t *child_bar = & child_thr -> th.th_bar[ bt ].bb;
2012 #if KMP_CACHE_MANAGE
2013  {
2014  register kmp_uint32 next_child_tid = child_tid + (1 << level);
2015  /* prefetch next thread's go count */
2016  if ( child+1 < branch_factor && next_child_tid < num_threads )
2017  KMP_CACHE_PREFETCH( &other_threads[ next_child_tid ]->th.th_bar[ bt ].bb.b_go );
2018  }
2019 #endif /* KMP_CACHE_MANAGE */
2020 
2021 #if KMP_BARRIER_ICV_PUSH
2022  if ( propagate_icvs ) {
2023  KMP_DEBUG_ASSERT( team != NULL );
2024  __kmp_init_implicit_task( team->t.t_ident,
2025  team->t.t_threads[child_tid], team, child_tid, FALSE );
2026  load_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs);
2027  store_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs, &team->t.t_implicit_task_taskdata[0].td_icvs);
2028  sync_icvs();
2029  }
2030 #endif // KMP_BARRIER_ICV_PUSH
2031 
2032  KA_TRACE( 20, ( "__kmp_hyper_barrier_release: T#%d(%d:%d) releasing "
2033  "T#%d(%d:%u) go(%p): %u => %u\n",
2034  gtid, team->t.t_id, tid,
2035  __kmp_gtid_from_tid( child_tid, team ), team->t.t_id,
2036  child_tid, &child_bar -> b_go, child_bar -> b_go,
2037  child_bar -> b_go + KMP_BARRIER_STATE_BUMP ) );
2038 
2039  /* release child from barrier */
2040  __kmp_release( child_thr, &child_bar -> b_go, kmp_acquire_fence );
2041  }
2042  }
2043 
2044  KA_TRACE( 20, ( "__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
2045  gtid, team->t.t_id, tid, bt ) );
2046 }
2047 #endif /* KMP_REVERSE_HYPER_BAR */
2048 
2049 
2050 /*
2051  * Internal function to do a barrier.
2052  * If is_split is true, do a split barrier; otherwise, do a plain barrier.
2053  * If reduce is non-NULL, do a split reduction barrier: the reduce callback is applied during the gather phase.
2054  * Returns 0 if master thread, 1 if worker thread.
2055  */
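/*
 * Hedged usage sketch (callers elsewhere in the runtime, e.g. __kmpc_barrier,
 * typically look roughly like this):
 *
 *     // plain, non-split barrier with no reduction
 *     __kmp_barrier( bs_plain_barrier, gtid, FALSE, 0, NULL, NULL );
 *
 * A split reduction barrier instead passes TRUE for is_split together with
 * reduce_size, reduce_data and the reduce callback.
 */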
2056 int
2057 __kmp_barrier( enum barrier_type bt, int gtid, int is_split,
2058  size_t reduce_size, void *reduce_data, void (*reduce)(void *, void *) )
2059 {
2060  register int tid = __kmp_tid_from_gtid( gtid );
2061  register kmp_info_t *this_thr = __kmp_threads[ gtid ];
2062  register kmp_team_t *team = this_thr -> th.th_team;
2063  register int status = 0;
2064 
2065  KA_TRACE( 15, ( "__kmp_barrier: T#%d(%d:%d) has arrived\n",
2066  gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid) ) );
2067 
2068  if ( ! team->t.t_serialized ) {
2069 #if USE_ITT_BUILD
2070  // This value will be used in itt notify events below.
2071  void * itt_sync_obj = NULL;
2072  #if USE_ITT_NOTIFY
2073  if ( __itt_sync_create_ptr || KMP_ITT_DEBUG )
2074  itt_sync_obj = __kmp_itt_barrier_object( gtid, bt, 1 );
2075  #endif
2076 #endif /* USE_ITT_BUILD */
2077  #if OMP_30_ENABLED
2078  if ( __kmp_tasking_mode == tskm_extra_barrier ) {
2079  __kmp_tasking_barrier( team, this_thr, gtid );
2080  KA_TRACE( 15, ( "__kmp_barrier: T#%d(%d:%d) past tasking barrier\n",
2081  gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid) ) );
2082  }
2083  #endif /* OMP_30_ENABLED */
2084 
2085  //
2086  // Copy the blocktime info to the thread, where __kmp_wait_sleep()
2087  // can access it when the team struct is not guaranteed to exist.
2088  //
2089  // See the note about the corresponding code in __kmp_join_barrier()
2090  // being performance-critical.
2091  //
2092  if ( __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ) {
2093  #if OMP_30_ENABLED
2094  this_thr -> th.th_team_bt_intervals = team -> t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
2095  this_thr -> th.th_team_bt_set = team -> t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
2096  #else
2097  this_thr -> th.th_team_bt_intervals = team -> t.t_set_bt_intervals[tid];
2098  this_thr -> th.th_team_bt_set = team -> t.t_set_bt_set[tid];
2099  #endif // OMP_30_ENABLED
2100  }
2101 
2102 #if USE_ITT_BUILD
2103  if ( __itt_sync_create_ptr || KMP_ITT_DEBUG )
2104  __kmp_itt_barrier_starting( gtid, itt_sync_obj );
2105 #endif /* USE_ITT_BUILD */
2106 
2107  if ( reduce != NULL ) {
2108  //KMP_DEBUG_ASSERT( is_split == TRUE ); // #C69956
2109  this_thr -> th.th_local.reduce_data = reduce_data;
2110  }
2111  if ( __kmp_barrier_gather_pattern[ bt ] == bp_linear_bar || __kmp_barrier_gather_branch_bits[ bt ] == 0 ) {
2112  __kmp_linear_barrier_gather( bt, this_thr, gtid, tid, reduce
2113  USE_ITT_BUILD_ARG( itt_sync_obj )
2114  );
2115  } else if ( __kmp_barrier_gather_pattern[ bt ] == bp_tree_bar ) {
2116  __kmp_tree_barrier_gather( bt, this_thr, gtid, tid, reduce
2117  USE_ITT_BUILD_ARG( itt_sync_obj )
2118  );
2119  } else {
2120  __kmp_hyper_barrier_gather( bt, this_thr, gtid, tid, reduce
2121  USE_ITT_BUILD_ARG( itt_sync_obj )
2122  );
2123  }; // if
2124 
2125 #if USE_ITT_BUILD
2126  // TODO: In case of a split reduction barrier, the master thread may send the acquired event early,
2127  // before the final summation into the shared variable is done (final summation can be a
2128  // long operation for array reductions).
2129  if ( __itt_sync_create_ptr || KMP_ITT_DEBUG )
2130  __kmp_itt_barrier_middle( gtid, itt_sync_obj );
2131 #endif /* USE_ITT_BUILD */
2132 
2133  KMP_MB();
2134 
2135  if ( KMP_MASTER_TID( tid ) ) {
2136  status = 0;
2137 
2138  #if OMP_30_ENABLED
2139  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
2140  __kmp_task_team_wait( this_thr, team
2141  USE_ITT_BUILD_ARG( itt_sync_obj )
2142  );
2143  __kmp_task_team_setup( this_thr, team );
2144  }
2145  #endif /* OMP_30_ENABLED */
2146 
2147 
2148  // Barrier - report frame end
2149 #if USE_ITT_BUILD
2150  // Collect information only if the file was opened successfully.
2151  if( __kmp_forkjoin_frames_mode == 1 && __kmp_itt_csv_file )
2152  {
2153  ident_t * loc = this_thr->th.th_ident;
2154  if (loc) {
2155  // Use compiler-generated location to mark the frame:
2156  // "<func>$omp$frame@[file:]<line>[:<col>]"
2157  kmp_str_loc_t str_loc = __kmp_str_loc_init( loc->psource, 1 );
2158 
2159  kmp_uint64 fr_end;
2160 #if defined( __GNUC__ )
2161 # if !defined( __INTEL_COMPILER )
2162  fr_end = __kmp_hardware_timestamp();
2163 # else
2164  fr_end = __rdtsc();
2165 # endif
2166 #else
2167  fr_end = __rdtsc();
2168 #endif
2169  K_DIAG( 3, ( "__kmp_barrier: T#%d(%d:%d) frame_begin = %llu, frame_end = %llu\n",
2170  gtid, ( team != NULL ) ? team->t.t_id : -1, tid, this_thr->th.th_frame_time, fr_end ) );
2171 
2172  __kmp_str_buf_print( &__kmp_itt_frame_buffer, "%s$omp$frame@%s:%d:%d,%llu,%llu,,\n",
2173  str_loc.func, str_loc.file, str_loc.line, str_loc.col, this_thr->th.th_frame_time, fr_end );
2174  __kmp_str_loc_free( &str_loc );
2175  this_thr->th.th_frame_time = fr_end;
2176  }
2177  }
2178 #endif /* USE_ITT_BUILD */
2179  } else {
2180  status = 1;
2181  }
2182  if ( status == 1 || ! is_split ) {
2183  if ( __kmp_barrier_release_pattern[ bt ] == bp_linear_bar || __kmp_barrier_release_branch_bits[ bt ] == 0 ) {
2184  __kmp_linear_barrier_release( bt, this_thr, gtid, tid, FALSE
2185  USE_ITT_BUILD_ARG( itt_sync_obj )
2186  );
2187  } else if ( __kmp_barrier_release_pattern[ bt ] == bp_tree_bar ) {
2188  __kmp_tree_barrier_release( bt, this_thr, gtid, tid, FALSE
2189  USE_ITT_BUILD_ARG( itt_sync_obj )
2190  );
2191  } else {
2192  __kmp_hyper_barrier_release( bt, this_thr, gtid, tid, FALSE
2193  USE_ITT_BUILD_ARG( itt_sync_obj )
2194  );
2195  }
2196  #if OMP_30_ENABLED
2197  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
2198  __kmp_task_team_sync( this_thr, team );
2199  }
2200  #endif /* OMP_30_ENABLED */
2201  }
2202 
2203 #if USE_ITT_BUILD
2204  // GEH: TODO: Move this under if-condition above and also include in __kmp_end_split_barrier().
2205  // This will more accurately represent the actual release time of the threads for split barriers.
2206  if ( __itt_sync_create_ptr || KMP_ITT_DEBUG )
2207  __kmp_itt_barrier_finished( gtid, itt_sync_obj );
2208 #endif /* USE_ITT_BUILD */
2209 
2210  } else { // Team is serialized.
2211 
2212  status = 0;
2213 
2214  #if OMP_30_ENABLED
2215  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
2216  //
2217  // The task team should be NULL for serialized code.
2218  // (tasks will be executed immediately).
2219  //
2220  KMP_DEBUG_ASSERT( team->t.t_task_team == NULL );
2221  KMP_DEBUG_ASSERT( this_thr->th.th_task_team == NULL );
2222  }
2223  #endif /* OMP_30_ENABLED */
2224  }
2225 
2226  KA_TRACE( 15, ( "__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
2227  gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid),
2228  status ) );
2229  return status;
2230 }
2231 
2232 
2233 void
2234 __kmp_end_split_barrier( enum barrier_type bt, int gtid )
2235 {
2236  int tid = __kmp_tid_from_gtid( gtid );
2237  kmp_info_t *this_thr = __kmp_threads[ gtid ];
2238  kmp_team_t *team = this_thr -> th.th_team;
2239 
2240  if( ! team -> t.t_serialized ) {
2241  if( KMP_MASTER_GTID( gtid ) ) {
2242  if ( __kmp_barrier_release_pattern[ bt ] == bp_linear_bar || __kmp_barrier_release_branch_bits[ bt ] == 0 ) {
2243  __kmp_linear_barrier_release( bt, this_thr, gtid, tid, FALSE
2244 #if USE_ITT_BUILD
2245  , NULL
2246 #endif /* USE_ITT_BUILD */
2247  );
2248  } else if ( __kmp_barrier_release_pattern[ bt ] == bp_tree_bar ) {
2249  __kmp_tree_barrier_release( bt, this_thr, gtid, tid, FALSE
2250 #if USE_ITT_BUILD
2251  , NULL
2252 #endif /* USE_ITT_BUILD */
2253  );
2254  } else {
2255  __kmp_hyper_barrier_release( bt, this_thr, gtid, tid, FALSE
2256 #if USE_ITT_BUILD
2257  , NULL
2258 #endif /* USE_ITT_BUILD */
2259  );
2260  }; // if
2261  #if OMP_30_ENABLED
2262  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
2263  __kmp_task_team_sync( this_thr, team );
2264  }; // if
2265  #endif /* OMP_30_ENABLED */
2266  }
2267  }
2268 }
2269 
2270 /* ------------------------------------------------------------------------ */
2271 /* ------------------------------------------------------------------------ */
2272 
2273 /*
2274  * determine if we can go parallel or must use a serialized parallel region and
2275  * how many threads we can use
2276  * set_nthreads is the number of threads requested for the team
2277  * returns 1 if we should serialize or only use one thread,
2278  * otherwise the number of threads to use
2279  * The forkjoin lock is held by the caller.
2280  */
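/*
 * For example, with set_nthreads == 4, dyn-var false and no limits in effect,
 * the function returns 4; any path that decides to serialize (nesting disabled
 * inside an active parallel, __kmp_library == library_serial, or limits that
 * reduce us to one thread) returns 1 and the caller serializes the region.
 */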
2281 static int
2282 __kmp_reserve_threads( kmp_root_t *root, kmp_team_t *parent_team,
2283  int master_tid, int set_nthreads
2284 #if OMP_40_ENABLED
2285  , int enter_teams
2286 #endif /* OMP_40_ENABLED */
2287 )
2288 {
2289  int capacity;
2290  int new_nthreads;
2291  int use_rml_to_adjust_nth;
2292  KMP_DEBUG_ASSERT( __kmp_init_serial );
2293  KMP_DEBUG_ASSERT( root && parent_team );
2294 
2295  //
2296  // Initial check to see if we should use a serialized team.
2297  //
2298  if ( set_nthreads == 1 ) {
2299  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d reserving 1 thread; requested %d threads\n",
2300  __kmp_get_gtid(), set_nthreads ));
2301  return 1;
2302  }
2303  if ( ( !get__nested_2(parent_team,master_tid) && (root->r.r_in_parallel
2304 #if OMP_40_ENABLED
2305  && !enter_teams
2306 #endif /* OMP_40_ENABLED */
2307  ) ) || ( __kmp_library == library_serial ) ) {
2308  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d serializing team; requested %d threads\n",
2309  __kmp_get_gtid(), set_nthreads ));
2310  return 1;
2311  }
2312 
2313  //
2314  // If dyn-var is set, dynamically adjust the number of desired threads,
2315  // according to the method specified by dynamic_mode.
2316  //
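 //
 // For instance, under dynamic_thread_limit with __kmp_avail_proc == 8,
 // __kmp_nth == 5 and an active root, the code below computes
 // new_nthreads = 8 - 5 + 1 = 4, so a request for more than 4 threads
 // is trimmed to 4.
 //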
2317  new_nthreads = set_nthreads;
2318  use_rml_to_adjust_nth = FALSE;
2319  if ( ! get__dynamic_2( parent_team, master_tid ) ) {
2320  ;
2321  }
2322 #ifdef USE_LOAD_BALANCE
2323  else if ( __kmp_global.g.g_dynamic_mode == dynamic_load_balance ) {
2324  new_nthreads = __kmp_load_balance_nproc( root, set_nthreads );
2325  if ( new_nthreads == 1 ) {
2326  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d load balance reduced reservation to 1 thread\n",
2327  master_tid ));
2328  return 1;
2329  }
2330  if ( new_nthreads < set_nthreads ) {
2331  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d load balance reduced reservation to %d threads\n",
2332  master_tid, new_nthreads ));
2333  }
2334  }
2335 #endif /* USE_LOAD_BALANCE */
2336  else if ( __kmp_global.g.g_dynamic_mode == dynamic_thread_limit ) {
2337  new_nthreads = __kmp_avail_proc - __kmp_nth + (root->r.r_active ? 1
2338  : root->r.r_hot_team->t.t_nproc);
2339  if ( new_nthreads <= 1 ) {
2340  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d thread limit reduced reservation to 1 thread\n",
2341  master_tid ));
2342  return 1;
2343  }
2344  if ( new_nthreads < set_nthreads ) {
2345  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d thread limit reduced reservation to %d threads\n",
2346  master_tid, new_nthreads ));
2347  }
2348  else {
2349  new_nthreads = set_nthreads;
2350  }
2351  }
2352  else if ( __kmp_global.g.g_dynamic_mode == dynamic_random ) {
2353  if ( set_nthreads > 2 ) {
2354  new_nthreads = __kmp_get_random( parent_team->t.t_threads[master_tid] );
2355  new_nthreads = ( new_nthreads % set_nthreads ) + 1;
2356  if ( new_nthreads == 1 ) {
2357  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d dynamic random reduced reservation to 1 thread\n",
2358  master_tid ));
2359  return 1;
2360  }
2361  if ( new_nthreads < set_nthreads ) {
2362  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d dynamic random reduced reservation to %d threads\n",
2363  master_tid, new_nthreads ));
2364  }
2365  }
2366  }
2367  else {
2368  KMP_ASSERT( 0 );
2369  }
2370 
2371  //
2372  // Respect KMP_ALL_THREADS, KMP_MAX_THREADS, OMP_THREAD_LIMIT.
2373  //
2374  if ( __kmp_nth + new_nthreads - ( root->r.r_active ? 1 :
2375  root->r.r_hot_team->t.t_nproc ) > __kmp_max_nth ) {
2376  int tl_nthreads = __kmp_max_nth - __kmp_nth + ( root->r.r_active ? 1 :
2377  root->r.r_hot_team->t.t_nproc );
2378  if ( tl_nthreads <= 0 ) {
2379  tl_nthreads = 1;
2380  }
2381 
2382  //
2383  // If dyn-var is false, emit a 1-time warning.
2384  //
2385  if ( ! get__dynamic_2( parent_team, master_tid )
2386  && ( ! __kmp_reserve_warn ) ) {
2387  __kmp_reserve_warn = 1;
2388  __kmp_msg(
2389  kmp_ms_warning,
2390  KMP_MSG( CantFormThrTeam, set_nthreads, tl_nthreads ),
2391  KMP_HNT( Unset_ALL_THREADS ),
2392  __kmp_msg_null
2393  );
2394  }
2395  if ( tl_nthreads == 1 ) {
2396  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d KMP_ALL_THREADS reduced reservation to 1 thread\n",
2397  master_tid ));
2398  return 1;
2399  }
2400  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d KMP_ALL_THREADS reduced reservation to %d threads\n",
2401  master_tid, tl_nthreads ));
2402  new_nthreads = tl_nthreads;
2403  }
2404 
2405 
2406  //
2407  // Check if the threads array is large enough, or needs expanding.
2408  //
2409  // See comment in __kmp_register_root() about the adjustment if
2410  // __kmp_threads[0] == NULL.
2411  //
2412  capacity = __kmp_threads_capacity;
2413  if ( TCR_PTR(__kmp_threads[0]) == NULL ) {
2414  --capacity;
2415  }
2416  if ( __kmp_nth + new_nthreads - ( root->r.r_active ? 1 :
2417  root->r.r_hot_team->t.t_nproc ) > capacity ) {
2418  //
2419  // Expand the threads array.
2420  //
2421  int slotsRequired = __kmp_nth + new_nthreads - ( root->r.r_active ? 1 :
2422  root->r.r_hot_team->t.t_nproc ) - capacity;
2423  int slotsAdded = __kmp_expand_threads(slotsRequired, slotsRequired);
2424  if ( slotsAdded < slotsRequired ) {
2425  //
2426  // The threads array was not expanded enough.
2427  //
2428  new_nthreads -= ( slotsRequired - slotsAdded );
2429  KMP_ASSERT( new_nthreads >= 1 );
2430 
2431  //
2432  // If dyn-var is false, emit a 1-time warning.
2433  //
2434  if ( ! get__dynamic_2( parent_team, master_tid )
2435  && ( ! __kmp_reserve_warn ) ) {
2436  __kmp_reserve_warn = 1;
2437  if ( __kmp_tp_cached ) {
2438  __kmp_msg(
2439  kmp_ms_warning,
2440  KMP_MSG( CantFormThrTeam, set_nthreads, new_nthreads ),
2441  KMP_HNT( Set_ALL_THREADPRIVATE, __kmp_tp_capacity ),
2442  KMP_HNT( PossibleSystemLimitOnThreads ),
2443  __kmp_msg_null
2444  );
2445  }
2446  else {
2447  __kmp_msg(
2448  kmp_ms_warning,
2449  KMP_MSG( CantFormThrTeam, set_nthreads, new_nthreads ),
2450  KMP_HNT( SystemLimitOnThreads ),
2451  __kmp_msg_null
2452  );
2453  }
2454  }
2455  }
2456  }
2457 
2458  if ( new_nthreads == 1 ) {
2459  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d serializing team after reclaiming dead roots and rechecking; requested %d threads\n",
2460  __kmp_get_gtid(), set_nthreads ) );
2461  return 1;
2462  }
2463 
2464  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d allocating %d threads; requested %d threads\n",
2465  __kmp_get_gtid(), new_nthreads, set_nthreads ));
2466  return new_nthreads;
2467 }
2468 
2469 /* ------------------------------------------------------------------------ */
2470 /* ------------------------------------------------------------------------ */
2471 
2472 /* allocate threads from the thread pool and assign them to the new team */
2473 /* we are assured that there are enough threads available, because we
2474  * checked for that earlier while holding the forkjoin lock */
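/* In outline: the master thread is installed in slot 0 of t_threads, workers
 * 1 .. t_nproc-1 are obtained via __kmp_allocate_thread(), and each worker's
 * per-barrier b_arrived counter is aligned with the team's counters so the
 * first barrier behaves as if all threads started together. */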
2475 
2476 static void
2477 __kmp_fork_team_threads( kmp_root_t *root, kmp_team_t *team,
2478  kmp_info_t *master_th, int master_gtid )
2479 {
2480  int i;
2481 
2482  KA_TRACE( 10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc ) );
2483  KMP_DEBUG_ASSERT( master_gtid == __kmp_get_gtid() );
2484  KMP_MB();
2485 
2486  /* first, let's setup the master thread */
2487  master_th -> th.th_info .ds.ds_tid = 0;
2488  master_th -> th.th_team = team;
2489  master_th -> th.th_team_nproc = team -> t.t_nproc;
2490  master_th -> th.th_team_master = master_th;
2491  master_th -> th.th_team_serialized = FALSE;
2492  master_th -> th.th_dispatch = & team -> t.t_dispatch[ 0 ];
2493 
2494  /* make sure we are not the optimized hot team */
2495  if ( team != root->r.r_hot_team ) {
2496 
2497  /* install the master thread */
2498  team -> t.t_threads[ 0 ] = master_th;
2499  __kmp_initialize_info( master_th, team, 0, master_gtid );
2500 
2501  /* now, install the worker threads */
2502  for ( i=1 ; i < team->t.t_nproc ; i++ ) {
2503 
2504  /* fork or reallocate a new thread and install it in team */
2505  team -> t.t_threads[ i ] = __kmp_allocate_thread( root, team, i );
2506  KMP_DEBUG_ASSERT( team->t.t_threads[i] );
2507  KMP_DEBUG_ASSERT( team->t.t_threads[i]->th.th_team == team );
2508  /* align team and thread arrived states */
2509  KA_TRACE( 20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived T#%d(%d:%d) join =%u, plain=%u\n",
2510  __kmp_gtid_from_tid( 0, team ), team->t.t_id, 0,
2511  __kmp_gtid_from_tid( i, team ), team->t.t_id, i,
2512  team->t.t_bar[ bs_forkjoin_barrier ].b_arrived,
2513  team->t.t_bar[ bs_plain_barrier ].b_arrived ) );
2514 
2515  { // Initialize threads' barrier data.
2516  int b;
2517  kmp_balign_t * balign = team->t.t_threads[ i ]->th.th_bar;
2518  for ( b = 0; b < bs_last_barrier; ++ b ) {
2519  balign[ b ].bb.b_arrived = team->t.t_bar[ b ].b_arrived;
2520  }; // for b
2521  }
2522  }
2523 
2524 #if OMP_40_ENABLED && (KMP_OS_WINDOWS || KMP_OS_LINUX)
2525  __kmp_partition_places( team );
2526 #endif
2527 
2528  }
2529 
2530  KMP_MB();
2531 }
2532 
2533 static void
2534 __kmp_alloc_argv_entries( int argc, kmp_team_t *team, int realloc ); // forward declaration
2535 
2536 /* most of the work for a fork */
2537 /* return true if we really went parallel, false if serialized */
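/* Rough control flow, as implemented below: reserve threads while holding the
 * forkjoin lock; if only one thread is available, serialize and invoke the
 * microtask directly; otherwise allocate a team, set up ICVs and arguments,
 * fork the worker threads, and (unless called from GNU native code with
 * exec_master == 0) have the master invoke the microtask via t_invoke. */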
2538 int
2539 __kmp_fork_call(
2540  ident_t * loc,
2541  int gtid,
2542  int exec_master, // 0 - GNU native code, master doesn't invoke microtask
2543  // 1 - Intel code, master invokes microtask
2544  // 2 - MS native code, use special invoker
2545  kmp_int32 argc,
2546  microtask_t microtask,
2547  launch_t invoker,
2548 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
2549 #if KMP_ARCH_X86_64 && KMP_OS_LINUX
2550  va_list * ap
2551 #else
2552  va_list ap
2553 #endif
2554  )
2555 {
2556  void **argv;
2557  int i;
2558  int master_tid;
2559  int master_this_cons;
2560  int master_last_cons;
2561  kmp_team_t *team;
2562  kmp_team_t *parent_team;
2563  kmp_info_t *master_th;
2564  kmp_root_t *root;
2565  int nthreads;
2566  int master_active;
2567  int master_set_numthreads;
2568  int level;
2569 #if OMP_40_ENABLED
2570  int teams_level;
2571 #endif
2572 
2573  KA_TRACE( 20, ("__kmp_fork_call: enter T#%d\n", gtid ));
2574 
2575  /* initialize if needed */
2576  KMP_DEBUG_ASSERT( __kmp_init_serial );
2577  if( ! TCR_4(__kmp_init_parallel) )
2578  __kmp_parallel_initialize();
2579 
2580  /* setup current data */
2581  master_th = __kmp_threads[ gtid ];
2582  parent_team = master_th -> th.th_team;
2583  master_tid = master_th -> th.th_info.ds.ds_tid;
2584  master_this_cons = master_th -> th.th_local.this_construct;
2585  master_last_cons = master_th -> th.th_local.last_construct;
2586  root = master_th -> th.th_root;
2587  master_active = root -> r.r_active;
2588  master_set_numthreads = master_th -> th.th_set_nproc;
2589 #if OMP_30_ENABLED
2590  // Nested level will be an index in the nested nthreads array
2591  level = parent_team->t.t_level;
2592 #endif // OMP_30_ENABLED
2593 #if OMP_40_ENABLED
2594  teams_level = master_th->th.th_teams_level; // needed to check nesting inside the teams
2595 #endif
2596 
2597 
2598 
2599  master_th->th.th_ident = loc;
2600 
2601 #if OMP_40_ENABLED
2602  if ( master_th->th.th_team_microtask &&
2603  ap && microtask != (microtask_t)__kmp_teams_master && level == teams_level ) {
2604  // AC: This is the start of a parallel region nested inside a teams construct.
2605  // The team is the actual (hot) team; all workers are ready at the fork barrier.
2606  // No lock is needed to do a bit of team initialization and then release the workers.
2607  parent_team->t.t_ident = loc;
2608  parent_team->t.t_argc = argc;
2609  argv = (void**)parent_team->t.t_argv;
2610  for( i=argc-1; i >= 0; --i )
2611 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
2612 #if KMP_ARCH_X86_64 && KMP_OS_LINUX
2613  *argv++ = va_arg( *ap, void * );
2614 #else
2615  *argv++ = va_arg( ap, void * );
2616 #endif
2617  /* Increment our nested depth levels, but do not increase the serialization */
2618  if ( parent_team == master_th->th.th_serial_team ) {
2619  // AC: we are in serialized parallel
2620  __kmpc_serialized_parallel(loc, gtid);
2621  KMP_DEBUG_ASSERT( parent_team->t.t_serialized > 1 );
2622  parent_team->t.t_serialized--; // AC: need this so that enquiry functions
2623  // work correctly; will restore at join time
2624  __kmp_invoke_microtask( microtask, gtid, 0, argc, parent_team->t.t_argv );
2625  return TRUE;
2626  }
2627  parent_team->t.t_pkfn = microtask;
2628  parent_team->t.t_invoke = invoker;
2629  KMP_TEST_THEN_INC32( (kmp_int32*) &root->r.r_in_parallel );
2630  parent_team->t.t_active_level ++;
2631  parent_team->t.t_level ++;
2632 
2633  /* Change number of threads in the team if requested */
2634  if ( master_set_numthreads ) { // The parallel has num_threads clause
2635  if ( master_set_numthreads < master_th->th.th_set_nth_teams ) {
2636  // AC: we can only reduce the number of threads dynamically, not increase it
2637  kmp_info_t **other_threads = parent_team->t.t_threads;
2638  parent_team->t.t_nproc = master_set_numthreads;
2639  for ( i = 0; i < master_set_numthreads; ++i ) {
2640  other_threads[i]->th.th_team_nproc = master_set_numthreads;
2641  }
2642  // Keep extra threads hot in the team for possible next parallels
2643  }
2644  master_th->th.th_set_nproc = 0;
2645  }
2646 
2647 
2648  KF_TRACE( 10, ( "__kmp_fork_call: before internal fork: root=%p, team=%p, master_th=%p, gtid=%d\n", root, parent_team, master_th, gtid ) );
2649  __kmp_internal_fork( loc, gtid, parent_team );
2650  KF_TRACE( 10, ( "__kmp_fork_call: after internal fork: root=%p, team=%p, master_th=%p, gtid=%d\n", root, parent_team, master_th, gtid ) );
2651 
2652  /* Invoke microtask for MASTER thread */
2653  KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n",
2654  gtid, parent_team->t.t_id, parent_team->t.t_pkfn ) );
2655 
2656  if (! parent_team->t.t_invoke( gtid )) {
2657  KMP_ASSERT2( 0, "cannot invoke microtask for MASTER thread" );
2658  }
2659  KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n",
2660  gtid, parent_team->t.t_id, parent_team->t.t_pkfn ) );
2661  KMP_MB(); /* Flush all pending memory write invalidates. */
2662 
2663  KA_TRACE( 20, ("__kmp_fork_call: parallel exit T#%d\n", gtid ));
2664 
2665  return TRUE;
2666  }
2667 #endif /* OMP_40_ENABLED */
2668 
2669 #if OMP_30_ENABLED && KMP_DEBUG
2670  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
2671  KMP_DEBUG_ASSERT( master_th->th.th_task_team == parent_team->t.t_task_team );
2672  }
2673 #endif // OMP_30_ENABLED
2674 
2675  /* determine how many new threads we can use */
2676  __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
2677 
2678 #if OMP_30_ENABLED
2679  if ( parent_team->t.t_active_level >= master_th->th.th_current_task->td_icvs.max_active_levels ) {
2680  nthreads = 1;
2681  }
2682  else
2683 #endif // OMP_30_ENABLED
2684 
2685  {
2686  nthreads = master_set_numthreads ?
2687  master_set_numthreads : get__nproc_2( parent_team, master_tid );
2688  nthreads = __kmp_reserve_threads( root, parent_team, master_tid, nthreads
2689 #if OMP_40_ENABLED
2690  // AC: If we execute teams from a parallel region (on the host), then the teams
2691  // should be created, but each can have only 1 thread if nesting is disabled.
2692  // If teams are called from a serial region, then the teams and their threads
2693  // should be created regardless of the nesting setting.
2694  ,( ( ap == NULL && teams_level == 0 ) ||
2695  ( ap && teams_level > 0 && teams_level == level ) )
2696 #endif /* OMP_40_ENABLED */
2697  );
2698  }
2699  KMP_DEBUG_ASSERT( nthreads > 0 );
2700 
2701  /* If we temporarily changed the set number of threads then restore it now */
2702  master_th -> th.th_set_nproc = 0;
2703 
2704 
2705  /* create a serialized parallel region? */
2706  if ( nthreads == 1 ) {
2707  /* josh todo: hypothetical question: what do we do for OS X*? */
2708 #if KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 )
2709  void * args[ argc ];
2710 #else
2711  void * * args = (void**) alloca( argc * sizeof( void * ) );
2712 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 ) */
2713 
2714  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
2715  KA_TRACE( 20, ("__kmp_fork_call: T#%d serializing parallel region\n", gtid ));
2716 
2717  __kmpc_serialized_parallel(loc, gtid);
2718 
2719  if ( exec_master == 0 ) {
2720  // we were called from GNU native code
2721  KA_TRACE( 20, ("__kmp_fork_call: T#%d serial exit\n", gtid ));
2722  return FALSE;
2723  } else if ( exec_master == 1 ) {
2724  /* TODO this sucks, use the compiler itself to pass args! :) */
2725  master_th -> th.th_serial_team -> t.t_ident = loc;
2726 #if OMP_40_ENABLED
2727  if ( !ap ) {
2728  // revert change made in __kmpc_serialized_parallel()
2729  master_th -> th.th_serial_team -> t.t_level--;
2730  // Get args from parent team for teams construct
2731  __kmp_invoke_microtask( microtask, gtid, 0, argc, parent_team->t.t_argv );
2732  } else if ( microtask == (microtask_t)__kmp_teams_master ) {
2733  KMP_DEBUG_ASSERT( master_th->th.th_team == master_th->th.th_serial_team );
2734  team = master_th->th.th_team;
2735  //team->t.t_pkfn = microtask;
2736  team->t.t_invoke = invoker;
2737  __kmp_alloc_argv_entries( argc, team, TRUE );
2738  team->t.t_argc = argc;
2739  argv = (void**) team->t.t_argv;
2740  if ( ap ) {
2741  for( i=argc-1; i >= 0; --i )
2742  /* TODO: revert workaround for Intel(R) 64 tracker #96 */
2743  #if KMP_ARCH_X86_64 && KMP_OS_LINUX
2744  *argv++ = va_arg( *ap, void * );
2745  #else
2746  *argv++ = va_arg( ap, void * );
2747  #endif
2748  } else {
2749  for( i=0; i < argc; ++i )
2750  // Get args from parent team for teams construct
2751  argv[i] = parent_team->t.t_argv[i];
2752  }
2753  // AC: revert change made in __kmpc_serialized_parallel()
2754  // because initial code in teams should have level=0
2755  team->t.t_level--;
2756  // AC: call special invoker for outer "parallel" of the teams construct
2757  invoker(gtid);
2758  } else {
2759 #endif /* OMP_40_ENABLED */
2760  argv = args;
2761  for( i=argc-1; i >= 0; --i )
2762  /* TODO: revert workaround for Intel(R) 64 tracker #96 */
2763  #if KMP_ARCH_X86_64 && KMP_OS_LINUX
2764  *argv++ = va_arg( *ap, void * );
2765  #else
2766  *argv++ = va_arg( ap, void * );
2767  #endif
2768  KMP_MB();
2769  __kmp_invoke_microtask( microtask, gtid, 0, argc, args );
2770 #if OMP_40_ENABLED
2771  }
2772 #endif /* OMP_40_ENABLED */
2773  }
2774  else {
2775  KMP_ASSERT2( exec_master <= 1, "__kmp_fork_call: unknown parameter exec_master" );
2776  }
2777 
2778  KA_TRACE( 20, ("__kmp_fork_call: T#%d serial exit\n", gtid ));
2779 
2780  KMP_MB();
2781  return FALSE;
2782  }
2783 
2784 #if OMP_30_ENABLED
2785  // GEH: only modify the executing flag in the case when not serialized
2786  // serialized case is handled in kmpc_serialized_parallel
2787  KF_TRACE( 10, ( "__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, curtask=%p, curtask_max_aclevel=%d\n",
2788  parent_team->t.t_active_level, master_th, master_th->th.th_current_task,
2789  master_th->th.th_current_task->td_icvs.max_active_levels ) );
2790  // TODO: GEH - cannot do this assertion because root thread not set up as executing
2791  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
2792  master_th->th.th_current_task->td_flags.executing = 0;
2793 #endif
2794 
2795 #if OMP_40_ENABLED
2796  if ( !master_th->th.th_team_microtask || level > teams_level )
2797 #endif /* OMP_40_ENABLED */
2798  {
2799  /* Increment our nested depth level */
2800  KMP_TEST_THEN_INC32( (kmp_int32*) &root->r.r_in_parallel );
2801  }
2802 
2803 #if OMP_30_ENABLED
2804  //
2805  // See if we need to make a copy of the ICVs.
2806  //
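 //
 // For example, assuming OMP_NUM_THREADS was given as the list "8,4": the
 // outer region forks with 8 threads, while __kmp_nested_nth.nth[level + 1]
 // == 4 overrides the new team's nproc ICV, so a further nested region
 // defaults to 4 threads.
 //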
2807  int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
2808  if ( ( level + 1 < __kmp_nested_nth.used ) &&
2809  ( __kmp_nested_nth.nth[level + 1] != nthreads_icv ) ) {
2810  nthreads_icv = __kmp_nested_nth.nth[level + 1];
2811  }
2812  else {
2813  nthreads_icv = 0; // don't update
2814  }
2815 
2816 #if OMP_40_ENABLED
2817  //
2818  // Figure out the proc_bind_policy for the new team.
2819  //
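 //
 // Illustrative example: with proc-bind-var set to "spread", a parallel
 // directive carrying proc_bind(close) uses close for this region only;
 // proc-bind-var itself is left unchanged for subsequent regions.
 //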
2820  kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
2821  kmp_proc_bind_t proc_bind_icv; // proc_bind_default means don't update
2822 
2823  if ( master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false ) {
2824  proc_bind = proc_bind_false;
2825  proc_bind_icv = proc_bind_default;
2826  }
2827  else {
2828  proc_bind_icv = master_th->th.th_current_task->td_icvs.proc_bind;
2829  if ( proc_bind == proc_bind_default ) {
2830  //
2831  // No proc_bind clause was specified, so use the current value
2832  // of proc-bind-var for this parallel region.
2833  //
2834  proc_bind = proc_bind_icv;
2835  }
2836  else {
2837  //
2838  // The proc_bind policy was specified explicitly on the parallel
2839  // clause. This overrides the proc-bind-var for this parallel
2840  // region, but does not change proc-bind-var.
2841  //
2842  }
2843 
2844  //
2845  // Figure the value of proc-bind-var for the child threads.
2846  //
2847  if ( ( level + 1 < __kmp_nested_proc_bind.used )
2848  && ( __kmp_nested_proc_bind.bind_types[level + 1] != proc_bind_icv ) ) {
2849  proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
2850  }
2851  else {
2852  proc_bind_icv = proc_bind_default;
2853  }
2854  }
2855 
2856  //
2857  // Reset for next parallel region
2858  //
2859  master_th->th.th_set_proc_bind = proc_bind_default;
2860 #endif /* OMP_40_ENABLED */
2861 
2862  if ( ( nthreads_icv > 0 )
2863 #if OMP_40_ENABLED
2864  || ( proc_bind_icv != proc_bind_default )
2865 #endif /* OMP_40_ENABLED */
2866  )
2867  {
2868  kmp_internal_control_t new_icvs;
2869  copy_icvs( & new_icvs, & master_th->th.th_current_task->td_icvs );
2870  new_icvs.next = NULL;
2871 
2872  if ( nthreads_icv > 0 ) {
2873  new_icvs.nproc = nthreads_icv;
2874  }
2875 
2876 #if OMP_40_ENABLED
2877  if ( proc_bind_icv != proc_bind_default ) {
2878  new_icvs.proc_bind = proc_bind_icv;
2879  }
2880 #endif /* OMP_40_ENABLED */
2881 
2882  /* allocate a new parallel team */
2883  KF_TRACE( 10, ( "__kmp_fork_call: before __kmp_allocate_team\n" ) );
2884  team = __kmp_allocate_team(root, nthreads, nthreads,
2885 #if OMP_40_ENABLED
2886  proc_bind,
2887 #endif
2888  &new_icvs, argc );
2889  } else
2890 #endif /* OMP_30_ENABLED */
2891  {
2892  /* allocate a new parallel team */
2893  KF_TRACE( 10, ( "__kmp_fork_call: before __kmp_allocate_team\n" ) );
2894  team = __kmp_allocate_team(root, nthreads, nthreads,
2895 #if OMP_40_ENABLED
2896  proc_bind,
2897 #endif
2898 #if OMP_30_ENABLED
2899  &master_th->th.th_current_task->td_icvs,
2900 #else
2901  parent_team->t.t_set_nproc[master_tid],
2902  parent_team->t.t_set_dynamic[master_tid],
2903  parent_team->t.t_set_nested[master_tid],
2904  parent_team->t.t_set_blocktime[master_tid],
2905  parent_team->t.t_set_bt_intervals[master_tid],
2906  parent_team->t.t_set_bt_set[master_tid],
2907 #endif // OMP_30_ENABLED
2908  argc );
2909  }
2910 
2911  KF_TRACE( 10, ( "__kmp_fork_call: after __kmp_allocate_team - team = %p\n",
2912  team ) );
2913 
2914  /* setup the new team */
2915  team->t.t_master_tid = master_tid;
2916  team->t.t_master_this_cons = master_this_cons;
2917  team->t.t_master_last_cons = master_last_cons;
2918 
2919  team->t.t_parent = parent_team;
2920  TCW_SYNC_PTR(team->t.t_pkfn, microtask);
2921  team->t.t_invoke = invoker; /* TODO move this to root, maybe */
2922  team->t.t_ident = loc;
2923 #if OMP_30_ENABLED
2924  // TODO: parent_team->t.t_level == INT_MAX ???
2925 #if OMP_40_ENABLED
2926  if ( !master_th->th.th_team_microtask || level > teams_level ) {
2927 #endif /* OMP_40_ENABLED */
2928  team->t.t_level = parent_team->t.t_level + 1;
2929  team->t.t_active_level = parent_team->t.t_active_level + 1;
2930 #if OMP_40_ENABLED
2931  } else {
2932  // AC: Do not increase parallel level at start of the teams construct
2933  team->t.t_level = parent_team->t.t_level;
2934  team->t.t_active_level = parent_team->t.t_active_level;
2935  }
2936 #endif /* OMP_40_ENABLED */
2937  team->t.t_sched = get__sched_2( parent_team, master_tid ); // set master's schedule as new run-time schedule
2938 
2939 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
2940  if ( __kmp_inherit_fp_control ) {
2941  __kmp_store_x87_fpu_control_word( &team->t.t_x87_fpu_control_word );
2942  __kmp_store_mxcsr( &team->t.t_mxcsr );
2943  team->t.t_mxcsr &= KMP_X86_MXCSR_MASK;
2944  team->t.t_fp_control_saved = TRUE;
2945  }
2946  else {
2947  team->t.t_fp_control_saved = FALSE;
2948  }
2949 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
2950 
2951  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
2952  //
2953  // Set the master thread's task team to the team's task team.
2954  // Unless this is the hot team, it should be NULL.
2955  //
2956  KMP_DEBUG_ASSERT( master_th->th.th_task_team == parent_team->t.t_task_team );
2957  KA_TRACE( 20, ( "__kmp_fork_call: Master T#%d pushing task_team %p / team %p, new task_team %p / team %p\n",
2958  __kmp_gtid_from_thread( master_th ), master_th->th.th_task_team,
2959  parent_team, team->t.t_task_team, team ) );
2960  master_th->th.th_task_team = team->t.t_task_team;
2961  KMP_DEBUG_ASSERT( ( master_th->th.th_task_team == NULL ) || ( team == root->r.r_hot_team ) ) ;
2962  }
2963 #endif // OMP_30_ENABLED
2964 
2965  KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2966  gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id, team->t.t_nproc ));
2967  KMP_DEBUG_ASSERT( team != root->r.r_hot_team ||
2968  ( team->t.t_master_tid == 0 &&
2969  ( team->t.t_parent == root->r.r_root_team || team->t.t_parent->t.t_serialized ) ));
2970  KMP_MB();
2971 
2972  /* now, setup the arguments */
2973  argv = (void**) team -> t.t_argv;
2974 #if OMP_40_ENABLED
2975  if ( ap ) {
2976 #endif /* OMP_40_ENABLED */
2977  for( i=argc-1; i >= 0; --i )
2978 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
2979 #if KMP_ARCH_X86_64 && KMP_OS_LINUX
2980  *argv++ = va_arg( *ap, void * );
2981 #else
2982  *argv++ = va_arg( ap, void * );
2983 #endif
2984 #if OMP_40_ENABLED
2985  } else {
2986  for( i=0; i < argc; ++i )
2987  // Get args from parent team for teams construct
2988  argv[i] = team->t.t_parent->t.t_argv[i];
2989  }
2990 #endif /* OMP_40_ENABLED */
2991 
2992  /* now actually fork the threads */
2993 
2994  team->t.t_master_active = master_active;
2995  if (!root -> r.r_active) /* Only do the assignment if it makes a difference to prevent cache ping-pong */
2996  root -> r.r_active = TRUE;
2997 
2998  __kmp_fork_team_threads( root, team, master_th, gtid );
2999 
3000 
3001  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
3002 
3003 
3004 #if USE_ITT_BUILD
3005  // Mark start of "parallel" region for VTune. Only one frame notification scheme is used at the moment.
3006  if ( ( __itt_frame_begin_v3_ptr && __kmp_forkjoin_frames && ! __kmp_forkjoin_frames_mode ) || KMP_ITT_DEBUG )
3007 # if OMP_40_ENABLED
3008  if ( !master_th->th.th_team_microtask || microtask == (microtask_t)__kmp_teams_master )
3009  // Either not in teams or the outer fork of the teams construct
3010 # endif /* OMP_40_ENABLED */
3011  __kmp_itt_region_forking( gtid );
3012 #endif /* USE_ITT_BUILD */
3013 
3014  // Internal fork - report frame begin
3015 #if USE_ITT_BUILD
3016  // Collect information only if the file was opened successfully.
3017  if( __kmp_forkjoin_frames_mode == 1 && __kmp_itt_csv_file )
3018  {
3019  kmp_uint64 fr_begin;
3020 #if defined( __GNUC__ )
3021 # if !defined( __INTEL_COMPILER )
3022  fr_begin = __kmp_hardware_timestamp();
3023 # else
3024  fr_begin = __rdtsc();
3025 # endif
3026 #else
3027  fr_begin = __rdtsc();
3028 #endif
3029  if( ! ( team->t.t_active_level > 1 ) ) {
3030  master_th->th.th_frame_time = fr_begin;
3031  }
3032  }
3033 #endif /* USE_ITT_BUILD */
3034 
3035  /* now go on and do the work */
3036  KMP_DEBUG_ASSERT( team == __kmp_threads[gtid]->th.th_team );
3037  KMP_MB();
3038 
3039  KF_TRACE( 10, ( "__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n", root, team, master_th, gtid ) );
3040 
3041 #if USE_ITT_BUILD
3042  if ( __itt_stack_caller_create_ptr ) {
3043  team->t.t_stack_id = __kmp_itt_stack_caller_create(); // create new stack stitching id before entering fork barrier
3044  }
3045 #endif /* USE_ITT_BUILD */
3046 
3047 #if OMP_40_ENABLED
3048  if ( ap ) // AC: skip __kmp_internal_fork at teams construct, let only master threads execute
3049 #endif /* OMP_40_ENABLED */
3050  {
3051  __kmp_internal_fork( loc, gtid, team );
3052  KF_TRACE( 10, ( "__kmp_internal_fork : after : root=%p, team=%p, master_th=%p, gtid=%d\n", root, team, master_th, gtid ) );
3053  }
3054 
3055  if (! exec_master) {
3056  KA_TRACE( 20, ("__kmp_fork_call: parallel exit T#%d\n", gtid ));
3057  return TRUE;
3058  }
3059 
3060  /* Invoke microtask for MASTER thread */
3061  KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n",
3062  gtid, team->t.t_id, team->t.t_pkfn ) );
3063 
3064  if (! team->t.t_invoke( gtid )) {
3065  KMP_ASSERT2( 0, "cannot invoke microtask for MASTER thread" );
3066  }
3067  KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n",
3068  gtid, team->t.t_id, team->t.t_pkfn ) );
3069  KMP_MB(); /* Flush all pending memory write invalidates. */
3070 
3071  KA_TRACE( 20, ("__kmp_fork_call: parallel exit T#%d\n", gtid ));
3072 
3073  return TRUE;
3074 }
3075 
3076 
3077 void
3078 __kmp_join_call(ident_t *loc, int gtid
3079 #if OMP_40_ENABLED
3080  , int exit_teams
3081 #endif /* OMP_40_ENABLED */
3082 )
3083 {
3084  kmp_team_t *team;
3085  kmp_team_t *parent_team;
3086  kmp_info_t *master_th;
3087  kmp_root_t *root;
3088  int master_active;
3089  int i;
3090 
3091  KA_TRACE( 20, ("__kmp_join_call: enter T#%d\n", gtid ));
3092 
3093  /* setup current data */
3094  master_th = __kmp_threads[ gtid ];
3095  root = master_th -> th.th_root;
3096  team = master_th -> th.th_team;
3097  parent_team = team->t.t_parent;
3098 
3099  master_th->th.th_ident = loc;
3100 
3101 #if OMP_30_ENABLED && KMP_DEBUG
3102  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
3103  KA_TRACE( 20, ( "__kmp_join_call: T#%d, old team = %p old task_team = %p, th_task_team = %p\n",
3104  __kmp_gtid_from_thread( master_th ), team,
3105  team -> t.t_task_team, master_th->th.th_task_team) );
3106  KMP_DEBUG_ASSERT( master_th->th.th_task_team == team->t.t_task_team );
3107  }
3108 #endif // OMP_30_ENABLED
3109 
3110  if( team->t.t_serialized ) {
3111 #if OMP_40_ENABLED
3112  if ( master_th->th.th_team_microtask ) {
3113  // We are in teams construct
3114  int level = team->t.t_level;
3115  int tlevel = master_th->th.th_teams_level;
3116  if ( level == tlevel ) {
3117  // AC: we haven't incremented it earlier at start of teams construct,
3118  // so do it here - at the end of teams construct
3119  team->t.t_level++;
3120  } else if ( level == tlevel + 1 ) {
3121  // AC: we are exiting parallel inside teams, need to increment serialization
3122  // in order to restore it in the next call to __kmpc_end_serialized_parallel
3123  team->t.t_serialized++;
3124  }
3125  }
3126 #endif /* OMP_40_ENABLED */
3127  __kmpc_end_serialized_parallel( loc, gtid );
3128  return;
3129  }
3130 
3131  master_active = team->t.t_master_active;
3132 
3133 #if OMP_40_ENABLED
3134  if (!exit_teams)
3135 #endif /* OMP_40_ENABLED */
3136  {
3137  // AC: No barrier for internal teams at exit from teams construct.
3138  // But there is a barrier for the external team (league).
3139  __kmp_internal_join( loc, gtid, team );
3140  }
3141  KMP_MB();
3142 
3143 #if USE_ITT_BUILD
3144  if ( __itt_stack_caller_create_ptr ) {
3145  __kmp_itt_stack_caller_destroy( (__itt_caller)team->t.t_stack_id ); // destroy the stack stitching id after join barrier
3146  }
3147 
3148  // Mark end of "parallel" region for VTune. Only use one of the frame notification schemes at the moment.
3149  if ( ( __itt_frame_end_v3_ptr && __kmp_forkjoin_frames && ! __kmp_forkjoin_frames_mode ) || KMP_ITT_DEBUG )
3150 # if OMP_40_ENABLED
3151  if ( !master_th->th.th_team_microtask /* not in teams */ ||
3152  ( !exit_teams && team->t.t_level == master_th->th.th_teams_level ) )
3153  // Either not in teams or exiting teams region
3154  // (teams is a frame and no other frames inside the teams)
3155 # endif /* OMP_40_ENABLED */
3156  __kmp_itt_region_joined( gtid );
3157 #endif /* USE_ITT_BUILD */
3158 
3159 #if OMP_40_ENABLED
3160  if ( master_th->th.th_team_microtask &&
3161  !exit_teams &&
3162  team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
3163  team->t.t_level == master_th->th.th_teams_level + 1 ) {
3164  // AC: We need to leave the team structure intact at the end
3165  // of parallel inside the teams construct, so that at the next
3166  // parallel same (hot) team works, only adjust nesting levels
3167 
3168  /* Decrement our nested depth level */
3169  team->t.t_level --;
3170  team->t.t_active_level --;
3171  KMP_TEST_THEN_DEC32( (kmp_int32*) &root->r.r_in_parallel );
3172 
3173  /* Restore number of threads in the team if needed */
3174  if ( master_th->th.th_team_nproc < master_th->th.th_set_nth_teams ) {
3175  int old_num = master_th->th.th_team_nproc;
3176  int new_num = master_th->th.th_set_nth_teams;
3177  kmp_info_t **other_threads = team->t.t_threads;
3178  team->t.t_nproc = new_num;
3179  for ( i = 0; i < old_num; ++i ) {
3180  other_threads[i]->th.th_team_nproc = new_num;
3181  }
3182  // Adjust the states of the unused threads of the team
3183  for ( i = old_num; i < new_num; ++i ) {
3184  // Re-initialize thread's barrier data.
3185  int b;
3186  kmp_balign_t * balign = other_threads[i]->th.th_bar;
3187  for ( b = 0; b < bp_last_bar; ++ b ) {
3188  balign[ b ].bb.b_arrived = team->t.t_bar[ b ].b_arrived;
3189  }
3190  // Synchronize thread's task state
3191  other_threads[i]->th.th_task_state = master_th->th.th_task_state;
3192  }
3193  }
3194  return;
3195  }
3196 #endif /* OMP_40_ENABLED */
3197  /* do cleanup and restore the parent team */
3198  master_th -> th.th_info .ds.ds_tid = team -> t.t_master_tid;
3199  master_th -> th.th_local.this_construct = team -> t.t_master_this_cons;
3200  master_th -> th.th_local.last_construct = team -> t.t_master_last_cons;
3201 
3202  master_th -> th.th_dispatch =
3203  & parent_team -> t.t_dispatch[ team -> t.t_master_tid ];
3204 
3205  /* jc: The following lock has instructions with REL and ACQ semantics,
3206  separating the parallel user code called in this parallel region
3207  from the serial user code called after this function returns.
3208  */
3209  __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
3210 
3211 #if OMP_40_ENABLED
3212  if ( !master_th->th.th_team_microtask || team->t.t_level > master_th->th.th_teams_level )
3213 #endif /* OMP_40_ENABLED */
3214  {
3215  /* Decrement our nested depth level */
3216  KMP_TEST_THEN_DEC32( (kmp_int32*) &root->r.r_in_parallel );
3217  }
3218  KMP_DEBUG_ASSERT( root->r.r_in_parallel >= 0 );
3219 
3220  #if OMP_30_ENABLED
3221  KF_TRACE( 10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n",
3222  0, master_th, team ) );
3223  __kmp_pop_current_task_from_thread( master_th );
3224  #endif // OMP_30_ENABLED
3225 
3226 #if OMP_40_ENABLED && (KMP_OS_WINDOWS || KMP_OS_LINUX)
3227  //
3228  // Restore master thread's partition.
3229  //
3230  master_th -> th.th_first_place = team -> t.t_first_place;
3231  master_th -> th.th_last_place = team -> t.t_last_place;
3232 #endif /* OMP_40_ENABLED */
3233 
3234 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
3235  if ( __kmp_inherit_fp_control && team->t.t_fp_control_saved ) {
3236  __kmp_clear_x87_fpu_status_word();
3237  __kmp_load_x87_fpu_control_word( &team->t.t_x87_fpu_control_word );
3238  __kmp_load_mxcsr( &team->t.t_mxcsr );
3239  }
3240 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
3241 
3242  if ( root -> r.r_active != master_active )
3243  root -> r.r_active = master_active;
3244 
3245  __kmp_free_team( root, team ); /* this will free worker threads */
3246 
3247  /* This race was fun to find. Make sure the following is in the critical
3248  * region, otherwise assertions may fail occasionally since the old team
3249  * may be reallocated and the hierarchy appears inconsistent. It is
3250  * actually safe to run and won't cause any bugs, but it will cause those
3251  * assertion failures. It's only one dereference and assignment, so we might
3252  * as well put it in the critical region. */
3253  master_th -> th.th_team = parent_team;
3254  master_th -> th.th_team_nproc = parent_team -> t.t_nproc;
3255  master_th -> th.th_team_master = parent_team -> t.t_threads[0];
3256  master_th -> th.th_team_serialized = parent_team -> t.t_serialized;
3257 
3258  /* restore serialized team, if need be */
3259  if( parent_team -> t.t_serialized &&
3260  parent_team != master_th->th.th_serial_team &&
3261  parent_team != root->r.r_root_team ) {
3262  __kmp_free_team( root, master_th -> th.th_serial_team );
3263  master_th -> th.th_serial_team = parent_team;
3264  }
3265 
3266 #if OMP_30_ENABLED
3267  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
3268  //
3269  // Copy the task team from the new child / old parent team
3270  // to the thread. If non-NULL, copy the state flag also.
3271  //
3272  if ( ( master_th -> th.th_task_team = parent_team -> t.t_task_team ) != NULL ) {
3273  master_th -> th.th_task_state = master_th -> th.th_task_team -> tt.tt_state;
3274  }
3275  KA_TRACE( 20, ( "__kmp_join_call: Master T#%d restoring task_team %p / team %p\n",
3276  __kmp_gtid_from_thread( master_th ), master_th->th.th_task_team,
3277  parent_team ) );
3278  }
3279 #endif /* OMP_30_ENABLED */
3280 
3281  #if OMP_30_ENABLED
3282  // TODO: GEH - cannot do this assertion because root thread not set up as executing
3283  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
3284  master_th->th.th_current_task->td_flags.executing = 1;
3285  #endif // OMP_30_ENABLED
3286 
3287  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
3288 
3289  KMP_MB();
3290  KA_TRACE( 20, ("__kmp_join_call: exit T#%d\n", gtid ));
3291 }
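
/*
 * A minimal user-level sketch of the fork/join pairing implemented above.
 * For a plain "#pragma omp parallel" the compiler outlines the body into a
 * microtask; in this library the generated runtime call ends up in
 * __kmp_fork_call(), and the master thread reaches __kmp_join_call() once the
 * region (and its implicit barrier) is done.  The example below is
 * hypothetical and not part of this file:
 *
 *     #include <omp.h>
 *     #include <stdio.h>
 *
 *     int main( void )
 *     {
 *         #pragma omp parallel num_threads( 4 )    // fork: a team of up to 4 threads
 *         printf( "hello from thread %d of %d\n",
 *                 omp_get_thread_num(), omp_get_num_threads() );
 *         return 0;                                // join already happened at the
 *     }                                            // end of the parallel region
 */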
3292 
3293 /* ------------------------------------------------------------------------ */
3294 /* ------------------------------------------------------------------------ */
3295 
3296 /* Check whether we should push an internal control record onto the
3297  serial team stack. If so, do it. */
3298 void
3299 __kmp_save_internal_controls ( kmp_info_t * thread )
3300 {
3301 
3302  if ( thread -> th.th_team != thread -> th.th_serial_team ) {
3303  return;
3304  }
3305  if (thread -> th.th_team -> t.t_serialized > 1) {
3306  int push = 0;
3307 
3308  if (thread -> th.th_team -> t.t_control_stack_top == NULL) {
3309  push = 1;
3310  } else {
3311  if ( thread -> th.th_team -> t.t_control_stack_top -> serial_nesting_level !=
3312  thread -> th.th_team -> t.t_serialized ) {
3313  push = 1;
3314  }
3315  }
3316  if (push) { /* push a record on the serial team's stack */
3317  kmp_internal_control_t * control = (kmp_internal_control_t *) __kmp_allocate(sizeof(kmp_internal_control_t));
3318 
3319 #if OMP_30_ENABLED
3320  copy_icvs( control, & thread->th.th_current_task->td_icvs );
3321 #else
3322  control->nproc = thread->th.th_team->t.t_set_nproc[0];
3323  control->dynamic = thread->th.th_team->t.t_set_dynamic[0];
3324  control->nested = thread->th.th_team->t.t_set_nested[0];
3325  control->blocktime = thread->th.th_team->t.t_set_blocktime[0];
3326  control->bt_intervals = thread->th.th_team->t.t_set_bt_intervals[0];
3327  control->bt_set = thread->th.th_team->t.t_set_bt_set[0];
3328 #endif // OMP_30_ENABLED
3329 
3330  control->serial_nesting_level = thread->th.th_team->t.t_serialized;
3331 
3332  control->next = thread -> th.th_team -> t.t_control_stack_top;
3333  thread -> th.th_team -> t.t_control_stack_top = control;
3334  }
3335  }
3336 }
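
/*
 * A standalone sketch (assumed names, assuming <stdlib.h>; not part of the
 * library) of the push pattern used above for t_control_stack_top: one record
 * per serialized nesting level, linked through "next", and pushed only when
 * the current top does not already belong to this nesting level.
 *
 *     typedef struct icv_record {
 *         int serial_nesting_level;
 *         struct icv_record *next;
 *     } icv_record_t;
 *
 *     static void push_icvs( icv_record_t **top, int nesting_level )
 *     {
 *         if ( *top == NULL || (*top)->serial_nesting_level != nesting_level ) {
 *             icv_record_t *rec = (icv_record_t *) malloc( sizeof( *rec ) );
 *             rec->serial_nesting_level = nesting_level;  // tag the record with its level
 *             rec->next = *top;                           // link the old top underneath
 *             *top = rec;                                 // new top of the stack
 *         }
 *     }
 */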
3337 
3338 /* Changes set_nproc */
3339 void
3340 __kmp_set_num_threads( int new_nth, int gtid )
3341 {
3342  kmp_info_t *thread;
3343  kmp_root_t *root;
3344 
3345  KF_TRACE( 10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth ));
3346  KMP_DEBUG_ASSERT( __kmp_init_serial );
3347 
3348  if (new_nth < 1)
3349  new_nth = 1;
3350  else if (new_nth > __kmp_max_nth)
3351  new_nth = __kmp_max_nth;
3352 
3353  thread = __kmp_threads[gtid];
3354 
3355  __kmp_save_internal_controls( thread );
3356 
3357  set__nproc( thread, new_nth );
3358 
3359  //
3360  // If this omp_set_num_threads() call will cause the hot team size to be
3361  // reduced (in the absence of a num_threads clause), then reduce it now,
3362  // rather than waiting for the next parallel region.
3363  //
3364  root = thread->th.th_root;
3365  if ( __kmp_init_parallel && ( ! root->r.r_active )
3366  && ( root->r.r_hot_team->t.t_nproc > new_nth ) ) {
3367  kmp_team_t *hot_team = root->r.r_hot_team;
3368  int f;
3369 
3370  __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
3371 
3372 
3373 #if OMP_30_ENABLED
3374  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
3375  kmp_task_team_t *task_team = hot_team->t.t_task_team;
3376  if ( ( task_team != NULL ) && TCR_SYNC_4(task_team->tt.tt_active) ) {
3377  //
3378  // Signal the worker threads (esp. the extra ones) to stop
3379  // looking for tasks while spin waiting. The task teams
3380  // are reference counted and will be deallocated by the
3381  // last worker thread.
3382  //
3383  KMP_DEBUG_ASSERT( hot_team->t.t_nproc > 1 );
3384  TCW_SYNC_4( task_team->tt.tt_active, FALSE );
3385  KMP_MB();
3386 
3387  KA_TRACE( 20, ( "__kmp_set_num_threads: setting task_team %p to NULL\n",
3388  &hot_team->t.t_task_team ) );
3389  hot_team->t.t_task_team = NULL;
3390  }
3391  else {
3392  KMP_DEBUG_ASSERT( task_team == NULL );
3393  }
3394  }
3395 #endif // OMP_30_ENABLED
3396 
3397  //
3398  // Release the extra threads we don't need any more.
3399  //
3400  for ( f = new_nth; f < hot_team->t.t_nproc; f++ ) {
3401  KMP_DEBUG_ASSERT( hot_team->t.t_threads[f] != NULL );
3402  __kmp_free_thread( hot_team->t.t_threads[f] );
3403  hot_team->t.t_threads[f] = NULL;
3404  }
3405  hot_team->t.t_nproc = new_nth;
3406 
3407 
3408  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
3409 
3410  //
3411  // Update the t_nproc field in the threads that are still active.
3412  //
3413  for( f=0 ; f < new_nth; f++ ) {
3414  KMP_DEBUG_ASSERT( hot_team->t.t_threads[f] != NULL );
3415  hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
3416  }
3417 #if KMP_MIC
3418  // Special flag to mark an omp_set_num_threads() call
3419  hot_team -> t.t_size_changed = -1;
3420 #endif
3421  }
3422 
3423 }
3424 
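/*
 * A hypothetical user-level example (not part of this file): in this library
 * the standard omp_set_num_threads() entry point eventually reaches
 * __kmp_set_num_threads() above, which clamps the request to [1, __kmp_max_nth]
 * and may shrink the hot team right away instead of waiting for the next
 * parallel region.
 *
 *     #include <omp.h>
 *
 *     void configure( void )
 *     {
 *         omp_set_num_threads( 8 );   // subsequent parallel regions use at most 8 threads
 *     }
 */
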
3425 #if OMP_30_ENABLED
3426 /* Changes max_active_levels */
3427 void
3428 __kmp_set_max_active_levels( int gtid, int max_active_levels )
3429 {
3430  kmp_info_t *thread;
3431 
3432  KF_TRACE( 10, ( "__kmp_set_max_active_levels: new max_active_levels for thread %d = (%d)\n", gtid, max_active_levels ) );
3433  KMP_DEBUG_ASSERT( __kmp_init_serial );
3434 
3435  // validate max_active_levels
3436  if( max_active_levels < 0 ) {
3437  KMP_WARNING( ActiveLevelsNegative, max_active_levels );
3438  // We ignore this call if the user has specified a negative value.
3439  // The current setting won't be changed. The last valid setting will be used.
3440  // A warning will be issued (if warnings are allowed as controlled by the KMP_WARNINGS env var).
3441  KF_TRACE( 10, ( "__kmp_set_max_active_levels: the call is ignored: new max_active_levels for thread %d = (%d)\n", gtid, max_active_levels ) );
3442  return;
3443  }
3444  if( max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT ) {
3445  // it's OK, the max_active_levels is within the valid range: [ 0; KMP_MAX_ACTIVE_LEVELS_LIMIT ]
3446  // We allow a zero value. (implementation defined behavior)
3447  } else {
3448  KMP_WARNING( ActiveLevelsExceedLimit, max_active_levels, KMP_MAX_ACTIVE_LEVELS_LIMIT );
3449  max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
3450  // Current upper limit is MAX_INT. (implementation defined behavior)
3451  // If the input exceeds the upper limit, we correct the input to be the upper limit. (implementation defined behavior)
3452  // Actually, the flow should never get here as long as the upper limit is MAX_INT.
3453  }
3454  KF_TRACE( 10, ( "__kmp_set_max_active_levels: after validation: new max_active_levels for thread %d = (%d)\n", gtid, max_active_levels ) );
3455 
3456  thread = __kmp_threads[ gtid ];
3457 
3458  __kmp_save_internal_controls( thread );
3459 
3460  set__max_active_levels( thread, max_active_levels );
3461 
3462 }
3463 
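/*
 * A hypothetical user-level example (not part of this file): the standard
 * omp_set_max_active_levels() / omp_get_max_active_levels() calls map onto the
 * routine above and the one that follows.  Negative values are ignored with a
 * warning, and values above KMP_MAX_ACTIVE_LEVELS_LIMIT are clipped.
 *
 *     #include <omp.h>
 *
 *     void limit_nesting( void )
 *     {
 *         omp_set_max_active_levels( 2 );            // allow two nested active levels
 *         int lvl = omp_get_max_active_levels();     // expected to read back 2
 *         (void) lvl;
 *     }
 */
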
3464 /* Gets max_active_levels */
3465 int
3466 __kmp_get_max_active_levels( int gtid )
3467 {
3468  kmp_info_t *thread;
3469 
3470  KF_TRACE( 10, ( "__kmp_get_max_active_levels: thread %d\n", gtid ) );
3471  KMP_DEBUG_ASSERT( __kmp_init_serial );
3472 
3473  thread = __kmp_threads[ gtid ];
3474  KMP_DEBUG_ASSERT( thread -> th.th_current_task );
3475  KF_TRACE( 10, ( "__kmp_get_max_active_levels: thread %d, curtask=%p, curtask_maxaclevel=%d\n",
3476  gtid, thread -> th.th_current_task, thread -> th.th_current_task -> td_icvs.max_active_levels ) );
3477  return thread -> th.th_current_task -> td_icvs.max_active_levels;
3478 }
3479 
3480 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
3481 void
3482 __kmp_set_schedule( int gtid, kmp_sched_t kind, int chunk )
3483 {
3484  kmp_info_t *thread;
3485 // kmp_team_t *team;
3486 
3487  KF_TRACE( 10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n", gtid, (int)kind, chunk ));
3488  KMP_DEBUG_ASSERT( __kmp_init_serial );
3489 
3490  // Check if the kind parameter is valid, correct if needed.
3491  // Valid parameters should fit in one of two intervals - standard or extended:
3492  // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
3493  // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103
3494  if ( kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
3495  ( kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std ) )
3496  {
3497  // TODO: Hint needs attention in case we change the default schedule.
3498  __kmp_msg(
3499  kmp_ms_warning,
3500  KMP_MSG( ScheduleKindOutOfRange, kind ),
3501  KMP_HNT( DefaultScheduleKindUsed, "static, no chunk" ),
3502  __kmp_msg_null
3503  );
3504  kind = kmp_sched_default;
3505  chunk = 0; // ignore chunk value in case of bad kind
3506  }
3507 
3508  thread = __kmp_threads[ gtid ];
3509 
3510  __kmp_save_internal_controls( thread );
3511 
3512  if ( kind < kmp_sched_upper_std ) {
3513  if ( kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK ) {
3514  // differentiate static chunked vs. unchunked:
3515  // chunk should be invalid to indicate unchunked schedule (which is the default)
3516  thread -> th.th_current_task -> td_icvs.sched.r_sched_type = kmp_sch_static;
3517  } else {
3518  thread -> th.th_current_task -> td_icvs.sched.r_sched_type = __kmp_sch_map[ kind - kmp_sched_lower - 1 ];
3519  }
3520  } else {
3521  // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std - kmp_sched_lower - 2 ];
3522  thread -> th.th_current_task -> td_icvs.sched.r_sched_type =
3523  __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std - kmp_sched_lower - 2 ];
3524  }
3525  if ( kind == kmp_sched_auto ) {
3526  // ignore parameter chunk for schedule auto
3527  thread -> th.th_current_task -> td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
3528  } else {
3529  thread -> th.th_current_task -> td_icvs.sched.chunk = chunk;
3530  }
3531 }
3532 
3533 /* Gets def_sched_var ICV values */
3534 void
3535 __kmp_get_schedule( int gtid, kmp_sched_t * kind, int * chunk )
3536 {
3537  kmp_info_t *thread;
3538  enum sched_type th_type;
3539  int i;
3540 
3541  KF_TRACE( 10, ("__kmp_get_schedule: thread %d\n", gtid ));
3542  KMP_DEBUG_ASSERT( __kmp_init_serial );
3543 
3544  thread = __kmp_threads[ gtid ];
3545 
3546  //th_type = thread -> th.th_team -> t.t_set_sched[ thread->th.th_info.ds.ds_tid ].r_sched_type;
3547  th_type = thread -> th.th_current_task -> td_icvs.sched.r_sched_type;
3548 
3549  switch ( th_type ) {
3550  case kmp_sch_static:
3551  case kmp_sch_static_greedy:
3552  case kmp_sch_static_balanced:
3553  *kind = kmp_sched_static;
3554  *chunk = 0; // chunk was not set, try to show this fact via zero value
3555  return;
3556  case kmp_sch_static_chunked:
3557  *kind = kmp_sched_static;
3558  break;
3559  case kmp_sch_dynamic_chunked:
3560  *kind = kmp_sched_dynamic;
3561  break;
3562  case kmp_sch_guided_chunked:
3563  case kmp_sch_guided_iterative_chunked:
3564  case kmp_sch_guided_analytical_chunked:
3565  *kind = kmp_sched_guided;
3566  break;
3567  case kmp_sch_auto:
3568  *kind = kmp_sched_auto;
3569  break;
3570  case kmp_sch_trapezoidal:
3571  *kind = kmp_sched_trapezoidal;
3572  break;
3573 /*
3574  case kmp_sch_static_steal:
3575  *kind = kmp_sched_static_steal;
3576  break;
3577 */
3578  default:
3579  KMP_FATAL( UnknownSchedulingType, th_type );
3580  }
3581 
3582  //*chunk = thread -> th.th_team -> t.t_set_sched[ thread->th.th_info.ds.ds_tid ].chunk;
3583  *chunk = thread -> th.th_current_task -> td_icvs.sched.chunk;
3584 }
3585 
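/*
 * A hypothetical user-level example (not part of this file): in this library
 * the standard omp_set_schedule()/omp_get_schedule() pair ends up in
 * __kmp_set_schedule() and __kmp_get_schedule() above, which translate between
 * the public schedule kinds and the internal kmp_sch_* enumeration.
 *
 *     #include <omp.h>
 *
 *     void pick_schedule( void )
 *     {
 *         omp_sched_t kind;
 *         int chunk;
 *         omp_set_schedule( omp_sched_dynamic, 4 );  // run-time schedule = dynamic,4
 *         omp_get_schedule( &kind, &chunk );         // kind == omp_sched_dynamic, chunk == 4
 *     }
 */
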
3586 int
3587 __kmp_get_ancestor_thread_num( int gtid, int level ) {
3588 
3589  int ii, dd;
3590  kmp_team_t *team;
3591  kmp_info_t *thr;
3592 
3593  KF_TRACE( 10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level ));
3594  KMP_DEBUG_ASSERT( __kmp_init_serial );
3595 
3596  // validate level
3597  if( level == 0 ) return 0;
3598  if( level < 0 ) return -1;
3599  thr = __kmp_threads[ gtid ];
3600  team = thr->th.th_team;
3601  ii = team -> t.t_level;
3602  if( level > ii ) return -1;
3603 
3604 #if OMP_40_ENABLED
3605  if( thr->th.th_team_microtask ) {
3606  // AC: we are in teams region where multiple nested teams have same level
3607  int tlevel = thr->th.th_teams_level; // the level of the teams construct
3608  if( level <= tlevel ) { // otherwise usual algorithm works (will not touch the teams)
3609  KMP_DEBUG_ASSERT( ii >= tlevel );
3610  // AC: As we need to pass by the teams league, we need to artificially increase ii
3611  if ( ii == tlevel ) {
3612  ii += 2; // three teams have same level
3613  } else {
3614  ii ++; // two teams have same level
3615  }
3616  }
3617  }
3618 #endif
3619 
3620  if( ii == level ) return __kmp_tid_from_gtid( gtid );
3621 
3622  dd = team -> t.t_serialized;
3623  level++;
3624  while( ii > level )
3625  {
3626  for( dd = team -> t.t_serialized; ( dd > 0 ) && ( ii > level ); dd--, ii-- )
3627  {
3628  }
3629  if( ( team -> t.t_serialized ) && ( !dd ) ) {
3630  team = team->t.t_parent;
3631  continue;
3632  }
3633  if( ii > level ) {
3634  team = team->t.t_parent;
3635  dd = team -> t.t_serialized;
3636  ii--;
3637  }
3638  }
3639 
3640  return ( dd > 1 ) ? ( 0 ) : ( team -> t.t_master_tid );
3641 }
3642 
3643 int
3644 __kmp_get_team_size( int gtid, int level ) {
3645 
3646  int ii, dd;
3647  kmp_team_t *team;
3648  kmp_info_t *thr;
3649 
3650  KF_TRACE( 10, ("__kmp_get_team_size: thread %d %d\n", gtid, level ));
3651  KMP_DEBUG_ASSERT( __kmp_init_serial );
3652 
3653  // validate level
3654  if( level == 0 ) return 1;
3655  if( level < 0 ) return -1;
3656  thr = __kmp_threads[ gtid ];
3657  team = thr->th.th_team;
3658  ii = team -> t.t_level;
3659  if( level > ii ) return -1;
3660 
3661 #if OMP_40_ENABLED
3662  if( thr->th.th_team_microtask ) {
3663  // AC: we are in teams region where multiple nested teams have same level
3664  int tlevel = thr->th.th_teams_level; // the level of the teams construct
3665  if( level <= tlevel ) { // otherwise usual algorithm works (will not touch the teams)
3666  KMP_DEBUG_ASSERT( ii >= tlevel );
3667  // AC: As we need to pass by the teams league, we need to artificially increase ii
3668  if ( ii == tlevel ) {
3669  ii += 2; // three teams have same level
3670  } else {
3671  ii ++; // two teams have same level
3672  }
3673  }
3674  }
3675 #endif
3676 
3677  while( ii > level )
3678  {
3679  for( dd = team -> t.t_serialized; ( dd > 0 ) && ( ii > level ); dd--, ii-- )
3680  {
3681  }
3682  if( team -> t.t_serialized && ( !dd ) ) {
3683  team = team->t.t_parent;
3684  continue;
3685  }
3686  if( ii > level ) {
3687  team = team->t.t_parent;
3688  ii--;
3689  }
3690  }
3691 
3692  return team -> t.t_nproc;
3693 }
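
/*
 * A hypothetical user-level example (not part of this file) of the two walks
 * above: level 0 is the implicit outermost team of one thread, level 1 the
 * outer parallel region, level 2 the nested one.
 *
 *     #include <omp.h>
 *     #include <stdio.h>
 *
 *     void show_ancestry( void )
 *     {
 *         omp_set_nested( 1 );
 *         #pragma omp parallel num_threads( 2 )
 *         #pragma omp parallel num_threads( 3 )
 *         printf( "L1 ancestor=%d (team of %d), L2 tid=%d (team of %d)\n",
 *                 omp_get_ancestor_thread_num( 1 ), omp_get_team_size( 1 ),
 *                 omp_get_ancestor_thread_num( 2 ), omp_get_team_size( 2 ) );
 *     }
 */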
3694 
3695 #endif // OMP_30_ENABLED
3696 
3697 kmp_r_sched_t
3698 __kmp_get_schedule_global() {
3699 // This routine was created because the pairs (__kmp_sched, __kmp_chunk) and (__kmp_static, __kmp_guided)
3700 // may be changed by kmp_set_defaults independently, so one can get the updated schedule here.
3701 
3702  kmp_r_sched_t r_sched;
3703 
3704  // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static, __kmp_guided
3705  // __kmp_sched should keep original value, so that user can set KMP_SCHEDULE multiple times,
3706  // and thus have different run-time schedules in different roots (even in OMP 2.5)
3707  if ( __kmp_sched == kmp_sch_static ) {
3708  r_sched.r_sched_type = __kmp_static; // replace STATIC with more detailed schedule (balanced or greedy)
3709  } else if ( __kmp_sched == kmp_sch_guided_chunked ) {
3710  r_sched.r_sched_type = __kmp_guided; // replace GUIDED with more detailed schedule (iterative or analytical)
3711  } else {
3712  r_sched.r_sched_type = __kmp_sched; // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
3713  }
3714 
3715  if ( __kmp_chunk < KMP_DEFAULT_CHUNK ) { // __kmp_chunk may be wrong here (if it was not ever set)
3716  r_sched.chunk = KMP_DEFAULT_CHUNK;
3717  } else {
3718  r_sched.chunk = __kmp_chunk;
3719  }
3720 
3721  return r_sched;
3722 }
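
/*
 * A hypothetical user-level illustration (not part of this file): the globals
 * combined here provide the default for the schedule(runtime) clause, so the
 * OMP_SCHEDULE environment variable (or a later omp_set_schedule() call)
 * decides what a runtime-scheduled loop actually does.
 *
 *     // run as, e.g.:  OMP_SCHEDULE="guided,8" ./a.out
 *     #include <omp.h>
 *
 *     void scale( float *x, int n, float a )
 *     {
 *         int i;
 *         #pragma omp parallel for schedule(runtime)   // kind and chunk come from the ICVs
 *         for ( i = 0; i < n; i++ )
 *             x[i] *= a;
 *     }
 */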
3723 
3724 /* ------------------------------------------------------------------------ */
3725 /* ------------------------------------------------------------------------ */
3726 
3727 
3728 /*
3729  * Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
3730  * at least argc number of *t_argv entries for the requested team.
3731  */
3732 static void
3733 __kmp_alloc_argv_entries( int argc, kmp_team_t *team, int realloc )
3734 {
3735 
3736  KMP_DEBUG_ASSERT( team );
3737  if( !realloc || argc > team -> t.t_max_argc ) {
3738 
3739  KA_TRACE( 100, ( "__kmp_alloc_argv_entries: team %d: needed entries=%d, current entries=%d\n",
3740  team->t.t_id, argc, ( realloc ) ? team->t.t_max_argc : 0 ));
3741 #if (KMP_PERF_V106 == KMP_ON)
3742  /* if previously allocated heap space for args, free them */
3743  if ( realloc && team -> t.t_argv != &team -> t.t_inline_argv[0] )
3744  __kmp_free( (void *) team -> t.t_argv );
3745 
3746  if ( argc <= KMP_INLINE_ARGV_ENTRIES ) {
3747  /* use unused space in the cache line for arguments */
3748  team -> t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3749  KA_TRACE( 100, ( "__kmp_alloc_argv_entries: team %d: inline allocate %d argv entries\n",
3750  team->t.t_id, team->t.t_max_argc ));
3751  team -> t.t_argv = &team -> t.t_inline_argv[0];
3752  if ( __kmp_storage_map ) {
3753  __kmp_print_storage_map_gtid( -1, &team->t.t_inline_argv[0],
3754  &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3755  (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES),
3756  "team_%d.t_inline_argv",
3757  team->t.t_id );
3758  }
3759  } else {
3760  /* allocate space for arguments in the heap */
3761  team -> t.t_max_argc = ( argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1 )) ?
3762  KMP_MIN_MALLOC_ARGV_ENTRIES : 2 * argc;
3763  KA_TRACE( 100, ( "__kmp_alloc_argv_entries: team %d: dynamic allocate %d argv entries\n",
3764  team->t.t_id, team->t.t_max_argc ));
3765  team -> t.t_argv = (void**) __kmp_page_allocate( sizeof(void*) * team->t.t_max_argc );
3766  if ( __kmp_storage_map ) {
3767  __kmp_print_storage_map_gtid( -1, &team->t.t_argv[0], &team->t.t_argv[team->t.t_max_argc],
3768  sizeof(void *) * team->t.t_max_argc, "team_%d.t_argv",
3769  team->t.t_id );
3770  }
3771  }
3772 #else /* KMP_PERF_V106 == KMP_OFF */
3773  if ( realloc )
3774  __kmp_free( (void*) team -> t.t_argv );
3775  team -> t.t_max_argc = ( argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1 )) ?
3776  KMP_MIN_MALLOC_ARGV_ENTRIES : 2 * argc;
3777  KA_TRACE( 100, ( "__kmp_alloc_argv_entries: team %d: dynamic allocate %d argv entries\n",
3778  team->t.t_id, team->t.t_max_argc ));
3779  team -> t.t_argv = __kmp_page_allocate( sizeof(void*) * team->t.t_max_argc );
3780  if ( __kmp_storage_map ) {
3781  __kmp_print_storage_map_gtid( -1, &team->t.t_argv[0], &team->t.t_argv[team->t.t_max_argc],
3782  sizeof(void *) * team->t.t_max_argc, "team_%d.t_argv", team->t.t_id );
3783  }
3784 #endif /* KMP_PERF_V106 */
3785 
3786  }
3787 }
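/*
 * The routine above is a small-buffer optimization: argument vectors that fit
 * in KMP_INLINE_ARGV_ENTRIES live inside the team structure itself, and only
 * larger ones go to the heap.  A standalone sketch of the same pattern
 * (assumed names, assuming <stdlib.h>; not part of the library):
 *
 *     #define INLINE_ENTRIES 8
 *
 *     typedef struct argv_buf {
 *         void  *inline_argv[ INLINE_ENTRIES ];
 *         void **argv;           // points at inline_argv or at a heap block
 *         int    max_argc;
 *     } argv_buf_t;
 *
 *     static void argv_reserve( argv_buf_t *b, int argc )
 *     {
 *         if ( argc <= INLINE_ENTRIES ) {
 *             b->argv = b->inline_argv;                        // reuse the in-place storage
 *             b->max_argc = INLINE_ENTRIES;
 *         } else {
 *             b->argv = (void **) malloc( sizeof( void * ) * 2 * argc );
 *             b->max_argc = 2 * argc;                          // grow with some slack
 *         }
 *     }
 */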
3788 
3789 static void
3790 __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth)
3791 {
3792  int i;
3793  int num_disp_buff = max_nth > 1 ? KMP_MAX_DISP_BUF : 2;
3794 #if KMP_USE_POOLED_ALLOC
3795  // AC: TODO: fix bug here: size of t_disp_buffer should not be multiplied by max_nth!
3796  char *ptr = __kmp_allocate(max_nth *
3797  ( sizeof(kmp_info_t*) + sizeof(dispatch_shared_info_t)*num_disp_buff
3798  + sizeof(kmp_disp_t) + sizeof(int)*6
3799 # if OMP_30_ENABLED
3800  //+ sizeof(int)
3801  + sizeof(kmp_r_sched_t)
3802  + sizeof(kmp_taskdata_t)
3803 # endif // OMP_30_ENABLED
3804  ) );
3805 
3806  team -> t.t_threads = (kmp_info_t**) ptr; ptr += sizeof(kmp_info_t*) * max_nth;
3807  team -> t.t_disp_buffer = (dispatch_shared_info_t*) ptr;
3808  ptr += sizeof(dispatch_shared_info_t) * num_disp_buff;
3809  team -> t.t_dispatch = (kmp_disp_t*) ptr; ptr += sizeof(kmp_disp_t) * max_nth;
3810  team -> t.t_set_nproc = (int*) ptr; ptr += sizeof(int) * max_nth;
3811  team -> t.t_set_dynamic = (int*) ptr; ptr += sizeof(int) * max_nth;
3812  team -> t.t_set_nested = (int*) ptr; ptr += sizeof(int) * max_nth;
3813  team -> t.t_set_blocktime = (int*) ptr; ptr += sizeof(int) * max_nth;
3814  team -> t.t_set_bt_intervals = (int*) ptr; ptr += sizeof(int) * max_nth;
3815  team -> t.t_set_bt_set = (int*) ptr;
3816 # if OMP_30_ENABLED
3817  ptr += sizeof(int) * max_nth;
3818  //team -> t.t_set_max_active_levels = (int*) ptr; ptr += sizeof(int) * max_nth;
3819  team -> t.t_set_sched = (kmp_r_sched_t*) ptr;
3820  ptr += sizeof(kmp_r_sched_t) * max_nth;
3821  team -> t.t_implicit_task_taskdata = (kmp_taskdata_t*) ptr;
3822  ptr += sizeof(kmp_taskdata_t) * max_nth;
3823 # endif // OMP_30_ENABLED
3824 #else
3825 
3826  team -> t.t_threads = (kmp_info_t**) __kmp_allocate( sizeof(kmp_info_t*) * max_nth );
3827  team -> t.t_disp_buffer = (dispatch_shared_info_t*)
3828  __kmp_allocate( sizeof(dispatch_shared_info_t) * num_disp_buff );
3829  team -> t.t_dispatch = (kmp_disp_t*) __kmp_allocate( sizeof(kmp_disp_t) * max_nth );
3830  #if OMP_30_ENABLED
3831  //team -> t.t_set_max_active_levels = (int*) __kmp_allocate( sizeof(int) * max_nth );
3832  //team -> t.t_set_sched = (kmp_r_sched_t*) __kmp_allocate( sizeof(kmp_r_sched_t) * max_nth );
3833  team -> t.t_implicit_task_taskdata = (kmp_taskdata_t*) __kmp_allocate( sizeof(kmp_taskdata_t) * max_nth );
3834  #else
3835  team -> t.t_set_nproc = (int*) __kmp_allocate( sizeof(int) * max_nth );
3836  team -> t.t_set_dynamic = (int*) __kmp_allocate( sizeof(int) * max_nth );
3837  team -> t.t_set_nested = (int*) __kmp_allocate( sizeof(int) * max_nth );
3838  team -> t.t_set_blocktime = (int*) __kmp_allocate( sizeof(int) * max_nth );
3839  team -> t.t_set_bt_intervals = (int*) __kmp_allocate( sizeof(int) * max_nth );
3840  team -> t.t_set_bt_set = (int*) __kmp_allocate( sizeof(int) * max_nth );
3841 # endif // OMP_30_ENABLED
3842 #endif
3843  team->t.t_max_nproc = max_nth;
3844 
3845  /* setup dispatch buffers */
3846  for(i = 0 ; i < num_disp_buff; ++i)
3847  team -> t.t_disp_buffer[i].buffer_index = i;
3848 }
3849 
3850 static void
3851 __kmp_free_team_arrays(kmp_team_t *team) {
3852  /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3853  int i;
3854  for ( i = 0; i < team->t.t_max_nproc; ++ i ) {
3855  if ( team->t.t_dispatch[ i ].th_disp_buffer != NULL ) {
3856  __kmp_free( team->t.t_dispatch[ i ].th_disp_buffer );
3857  team->t.t_dispatch[ i ].th_disp_buffer = NULL;
3858  }; // if
3859  }; // for
3860  __kmp_free(team->t.t_threads);
3861  #if !KMP_USE_POOLED_ALLOC
3862  __kmp_free(team->t.t_disp_buffer);
3863  __kmp_free(team->t.t_dispatch);
3864  #if OMP_30_ENABLED
3865  //__kmp_free(team->t.t_set_max_active_levels);
3866  //__kmp_free(team->t.t_set_sched);
3867  __kmp_free(team->t.t_implicit_task_taskdata);
3868  #else
3869  __kmp_free(team->t.t_set_nproc);
3870  __kmp_free(team->t.t_set_dynamic);
3871  __kmp_free(team->t.t_set_nested);
3872  __kmp_free(team->t.t_set_blocktime);
3873  __kmp_free(team->t.t_set_bt_intervals);
3874  __kmp_free(team->t.t_set_bt_set);
3875  # endif // OMP_30_ENABLED
3876  #endif
3877  team->t.t_threads = NULL;
3878  team->t.t_disp_buffer = NULL;
3879  team->t.t_dispatch = NULL;
3880 #if OMP_30_ENABLED
3881  //team->t.t_set_sched = 0;
3882  //team->t.t_set_max_active_levels = 0;
3883  team->t.t_implicit_task_taskdata = 0;
3884 #else
3885  team->t.t_set_nproc = 0;
3886  team->t.t_set_dynamic = 0;
3887  team->t.t_set_nested = 0;
3888  team->t.t_set_blocktime = 0;
3889  team->t.t_set_bt_intervals = 0;
3890  team->t.t_set_bt_set = 0;
3891 #endif // OMP_30_ENABLED
3892 }
3893 
3894 static void
3895 __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3896  kmp_info_t **oldThreads = team->t.t_threads;
3897 
3898  #if !KMP_USE_POOLED_ALLOC
3899  __kmp_free(team->t.t_disp_buffer);
3900  __kmp_free(team->t.t_dispatch);
3901  #if OMP_30_ENABLED
3902  //__kmp_free(team->t.t_set_max_active_levels);
3903  //__kmp_free(team->t.t_set_sched);
3904  __kmp_free(team->t.t_implicit_task_taskdata);
3905  #else
3906  __kmp_free(team->t.t_set_nproc);
3907  __kmp_free(team->t.t_set_dynamic);
3908  __kmp_free(team->t.t_set_nested);
3909  __kmp_free(team->t.t_set_blocktime);
3910  __kmp_free(team->t.t_set_bt_intervals);
3911  __kmp_free(team->t.t_set_bt_set);
3912  # endif // OMP_30_ENABLED
3913  #endif
3914  __kmp_allocate_team_arrays(team, max_nth);
3915 
3916  memcpy(team->t.t_threads, oldThreads, team->t.t_nproc * sizeof (kmp_info_t*));
3917 
3918  __kmp_free(oldThreads);
3919 }
3920 
3921 static kmp_internal_control_t
3922 __kmp_get_global_icvs( void ) {
3923 
3924 #if OMP_30_ENABLED
3925  kmp_r_sched_t r_sched = __kmp_get_schedule_global(); // get current state of scheduling globals
3926 #endif /* OMP_30_ENABLED */
3927 
3928 #if OMP_40_ENABLED
3929  KMP_DEBUG_ASSERT( __kmp_nested_proc_bind.used > 0 );
3930 #endif /* OMP_40_ENABLED */
3931 
3932  kmp_internal_control_t g_icvs = {
3933  0, //int serial_nesting_level; //corresponds to the value of the th_team_serialized field
3934  __kmp_dflt_nested, //int nested; //internal control for nested parallelism (per thread)
3935  __kmp_global.g.g_dynamic, //internal control for dynamic adjustment of threads (per thread)
3936  __kmp_dflt_team_nth,
3937  //int nproc; //internal control for # of threads for next parallel region (per thread)
3938  // (use a max ub on value if __kmp_parallel_initialize not called yet)
3939  __kmp_dflt_blocktime, //int blocktime; //internal control for blocktime
3940  __kmp_bt_intervals, //int bt_intervals; //internal control for blocktime intervals
3941  __kmp_env_blocktime, //int bt_set; //internal control for whether blocktime is explicitly set
3942 #if OMP_30_ENABLED
3943  __kmp_dflt_max_active_levels, //int max_active_levels; //internal control for max_active_levels
3944  r_sched, //kmp_r_sched_t sched; //internal control for runtime schedule {sched,chunk} pair
3945 #endif /* OMP_30_ENABLED */
3946 #if OMP_40_ENABLED
3947  __kmp_nested_proc_bind.bind_types[0],
3948 #endif /* OMP_40_ENABLED */
3949  NULL //struct kmp_internal_control *next;
3950  };
3951 
3952  return g_icvs;
3953 }
3954 
3955 static kmp_internal_control_t
3956 __kmp_get_x_global_icvs( const kmp_team_t *team ) {
3957 
3958  #if OMP_30_ENABLED
3959  kmp_internal_control_t gx_icvs;
3960  gx_icvs.serial_nesting_level = 0; // probably =team->t.t_serial like in save_inter_controls
3961  copy_icvs( & gx_icvs, & team->t.t_threads[0]->th.th_current_task->td_icvs );
3962  gx_icvs.next = NULL;
3963  #else
3964  kmp_internal_control_t gx_icvs =
3965  {
3966  0,
3967  team->t.t_set_nested[0],
3968  team->t.t_set_dynamic[0],
3969  team->t.t_set_nproc[0],
3970  team->t.t_set_blocktime[0],
3971  team->t.t_set_bt_intervals[0],
3972  team->t.t_set_bt_set[0],
3973  NULL //struct kmp_internal_control *next;
3974  };
3975  #endif // OMP_30_ENABLED
3976 
3977  return gx_icvs;
3978 }
3979 
3980 static void
3981 __kmp_initialize_root( kmp_root_t *root )
3982 {
3983  int f;
3984  kmp_team_t *root_team;
3985  kmp_team_t *hot_team;
3986  size_t disp_size, dispatch_size, bar_size;
3987  int hot_team_max_nth;
3988 #if OMP_30_ENABLED
3989  kmp_r_sched_t r_sched = __kmp_get_schedule_global(); // get current state of scheduling globals
3990  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3991 #endif // OMP_30_ENABLED
3992  KMP_DEBUG_ASSERT( root );
3993  KMP_ASSERT( ! root->r.r_begin );
3994 
3995  /* setup the root state structure */
3996  __kmp_init_lock( &root->r.r_begin_lock );
3997  root -> r.r_begin = FALSE;
3998  root -> r.r_active = FALSE;
3999  root -> r.r_in_parallel = 0;
4000  root -> r.r_blocktime = __kmp_dflt_blocktime;
4001  root -> r.r_nested = __kmp_dflt_nested;
4002 
4003  /* setup the root team for this task */
4004  /* allocate the root team structure */
4005  KF_TRACE( 10, ( "__kmp_initialize_root: before root_team\n" ) );
4006  root_team =
4007  __kmp_allocate_team(
4008  root,
4009  1, // new_nproc
4010  1, // max_nproc
4011 #if OMP_40_ENABLED
4012  __kmp_nested_proc_bind.bind_types[0],
4013 #endif
4014 #if OMP_30_ENABLED
4015  &r_icvs,
4016 #else
4017  __kmp_dflt_team_nth_ub, // num_threads
4018  __kmp_global.g.g_dynamic, // dynamic
4019  __kmp_dflt_nested, // nested
4020  __kmp_dflt_blocktime, // blocktime
4021  __kmp_bt_intervals, // bt_intervals
4022  __kmp_env_blocktime, // bt_set
4023 #endif // OMP_30_ENABLED
4024  0 // argc
4025  );
4026 
4027  KF_TRACE( 10, ( "__kmp_initialize_root: after root_team = %p\n", root_team ) );
4028 
4029  root -> r.r_root_team = root_team;
4030  root_team -> t.t_control_stack_top = NULL;
4031 
4032  /* initialize root team */
4033  root_team -> t.t_threads[0] = NULL;
4034  root_team -> t.t_nproc = 1;
4035  root_team -> t.t_serialized = 1;
4036 #if OMP_30_ENABLED
4037  // TODO???: root_team -> t.t_max_active_levels = __kmp_dflt_max_active_levels;
4038  root_team -> t.t_sched.r_sched_type = r_sched.r_sched_type;
4039  root_team -> t.t_sched.chunk = r_sched.chunk;
4040 #endif // OMP_30_ENABLED
4041  KA_TRACE( 20, ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
4042  root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE ));
4043 
4044  /* setup the hot team for this task */
4045  /* allocate the hot team structure */
4046  KF_TRACE( 10, ( "__kmp_initialize_root: before hot_team\n" ) );
4047  hot_team =
4048  __kmp_allocate_team(
4049  root,
4050  1, // new_nproc
4051  __kmp_dflt_team_nth_ub * 2, // max_nproc
4052 #if OMP_40_ENABLED
4053  __kmp_nested_proc_bind.bind_types[0],
4054 #endif
4055 #if OMP_30_ENABLED
4056  &r_icvs,
4057 #else
4058  __kmp_dflt_team_nth_ub, // num_threads
4059  __kmp_global.g.g_dynamic, // dynamic
4060  __kmp_dflt_nested, // nested
4061  __kmp_dflt_blocktime, // blocktime
4062  __kmp_bt_intervals, // bt_intervals
4063  __kmp_env_blocktime, // bt_set
4064 #endif // OMP_30_ENABLED
4065  0 // argc
4066  );
4067  KF_TRACE( 10, ( "__kmp_initialize_root: after hot_team = %p\n", hot_team ) );
4068 
4069  root -> r.r_hot_team = hot_team;
4070  root_team -> t.t_control_stack_top = NULL;
4071 
4072  /* first-time initialization */
4073  hot_team -> t.t_parent = root_team;
4074 
4075  /* initialize hot team */
4076  hot_team_max_nth = hot_team->t.t_max_nproc;
4077  for ( f = 0; f < hot_team_max_nth; ++ f ) {
4078  hot_team -> t.t_threads[ f ] = NULL;
4079  }; // for
4080  hot_team -> t.t_nproc = 1;
4081 #if OMP_30_ENABLED
4082  // TODO???: hot_team -> t.t_max_active_levels = __kmp_dflt_max_active_levels;
4083  hot_team -> t.t_sched.r_sched_type = r_sched.r_sched_type;
4084  hot_team -> t.t_sched.chunk = r_sched.chunk;
4085 #endif // OMP_30_ENABLED
4086 #if KMP_MIC
4087  hot_team -> t.t_size_changed = 0;
4088 #endif
4089 
4090 }
4091 
4092 #ifdef KMP_DEBUG
4093 
4094 
4095 typedef struct kmp_team_list_item {
4096  kmp_team_p const * entry;
4097  struct kmp_team_list_item * next;
4098 } kmp_team_list_item_t;
4099 typedef kmp_team_list_item_t * kmp_team_list_t;
4100 
4101 
4102 static void
4103 __kmp_print_structure_team_accum( // Add team to list of teams.
4104  kmp_team_list_t list, // List of teams.
4105  kmp_team_p const * team // Team to add.
4106 ) {
4107 
4108  // List must terminate with item where both entry and next are NULL.
4109  // Team is added to the list only once.
4110  // List is sorted in ascending order by team id.
4111  // Team id is *not* a key.
4112 
4113  kmp_team_list_t l;
4114 
4115  KMP_DEBUG_ASSERT( list != NULL );
4116  if ( team == NULL ) {
4117  return;
4118  }; // if
4119 
4120  __kmp_print_structure_team_accum( list, team->t.t_parent );
4121  __kmp_print_structure_team_accum( list, team->t.t_next_pool );
4122 
4123  // Search list for the team.
4124  l = list;
4125  while ( l->next != NULL && l->entry != team ) {
4126  l = l->next;
4127  }; // while
4128  if ( l->next != NULL ) {
4129  return; // Team has been added before, exit.
4130  }; // if
4131 
4132  // Team is not found. Search list again for insertion point.
4133  l = list;
4134  while ( l->next != NULL && l->entry->t.t_id <= team->t.t_id ) {
4135  l = l->next;
4136  }; // while
4137 
4138  // Insert team.
4139  {
4140  kmp_team_list_item_t * item =
4141  (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC( sizeof( kmp_team_list_item_t ) );
4142  * item = * l;
4143  l->entry = team;
4144  l->next = item;
4145  }
4146 
4147 }
4148 
4149 static void
4150 __kmp_print_structure_team(
4151  char const * title,
4152  kmp_team_p const * team
4153 
4154 ) {
4155  __kmp_printf( "%s", title );
4156  if ( team != NULL ) {
4157  __kmp_printf( "%2x %p\n", team->t.t_id, team );
4158  } else {
4159  __kmp_printf( " - (nil)\n" );
4160  }; // if
4161 }
4162 
4163 static void
4164 __kmp_print_structure_thread(
4165  char const * title,
4166  kmp_info_p const * thread
4167 
4168 ) {
4169  __kmp_printf( "%s", title );
4170  if ( thread != NULL ) {
4171  __kmp_printf( "%2d %p\n", thread->th.th_info.ds.ds_gtid, thread );
4172  } else {
4173  __kmp_printf( " - (nil)\n" );
4174  }; // if
4175 }
4176 
4177 static void
4178 __kmp_print_structure(
4179  void
4180 ) {
4181 
4182  kmp_team_list_t list;
4183 
4184  // Initialize list of teams.
4185  list = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC( sizeof( kmp_team_list_item_t ) );
4186  list->entry = NULL;
4187  list->next = NULL;
4188 
4189  __kmp_printf( "\n------------------------------\nGlobal Thread Table\n------------------------------\n" );
4190  {
4191  int gtid;
4192  for ( gtid = 0; gtid < __kmp_threads_capacity; ++ gtid ) {
4193  __kmp_printf( "%2d", gtid );
4194  if ( __kmp_threads != NULL ) {
4195  __kmp_printf( " %p", __kmp_threads[ gtid ] );
4196  }; // if
4197  if ( __kmp_root != NULL ) {
4198  __kmp_printf( " %p", __kmp_root[ gtid ] );
4199  }; // if
4200  __kmp_printf( "\n" );
4201  }; // for gtid
4202  }
4203 
4204  // Print out __kmp_threads array.
4205  __kmp_printf( "\n------------------------------\nThreads\n------------------------------\n" );
4206  if ( __kmp_threads != NULL ) {
4207  int gtid;
4208  for ( gtid = 0; gtid < __kmp_threads_capacity; ++ gtid ) {
4209  kmp_info_t const * thread = __kmp_threads[ gtid ];
4210  if ( thread != NULL ) {
4211  __kmp_printf( "GTID %2d %p:\n", gtid, thread );
4212  __kmp_printf( " Our Root: %p\n", thread->th.th_root );
4213  __kmp_print_structure_team( " Our Team: ", thread->th.th_team );
4214  __kmp_print_structure_team( " Serial Team: ", thread->th.th_serial_team );
4215  __kmp_printf( " Threads: %2d\n", thread->th.th_team_nproc );
4216  __kmp_print_structure_thread( " Master: ", thread->th.th_team_master );
4217  __kmp_printf( " Serialized?: %2d\n", thread->th.th_team_serialized );
4218  __kmp_printf( " Set NProc: %2d\n", thread->th.th_set_nproc );
4219 #if OMP_40_ENABLED
4220  __kmp_printf( " Set Proc Bind: %2d\n", thread->th.th_set_proc_bind );
4221 #endif
4222  __kmp_print_structure_thread( " Next in pool: ", thread->th.th_next_pool );
4223  __kmp_printf( "\n" );
4224  __kmp_print_structure_team_accum( list, thread->th.th_team );
4225  __kmp_print_structure_team_accum( list, thread->th.th_serial_team );
4226  }; // if
4227  }; // for gtid
4228  } else {
4229  __kmp_printf( "Threads array is not allocated.\n" );
4230  }; // if
4231 
4232  // Print out __kmp_root array.
4233  __kmp_printf( "\n------------------------------\nUbers\n------------------------------\n" );
4234  if ( __kmp_root != NULL ) {
4235  int gtid;
4236  for ( gtid = 0; gtid < __kmp_threads_capacity; ++ gtid ) {
4237  kmp_root_t const * root = __kmp_root[ gtid ];
4238  if ( root != NULL ) {
4239  __kmp_printf( "GTID %2d %p:\n", gtid, root );
4240  __kmp_print_structure_team( " Root Team: ", root->r.r_root_team );
4241  __kmp_print_structure_team( " Hot Team: ", root->r.r_hot_team );
4242  __kmp_print_structure_thread( " Uber Thread: ", root->r.r_uber_thread );
4243  __kmp_printf( " Active?: %2d\n", root->r.r_active );
4244  __kmp_printf( " Nested?: %2d\n", root->r.r_nested );
4245  __kmp_printf( " In Parallel: %2d\n", root->r.r_in_parallel );
4246  __kmp_printf( "\n" );
4247  __kmp_print_structure_team_accum( list, root->r.r_root_team );
4248  __kmp_print_structure_team_accum( list, root->r.r_hot_team );
4249  }; // if
4250  }; // for gtid
4251  } else {
4252  __kmp_printf( "Ubers array is not allocated.\n" );
4253  }; // if
4254 
4255  __kmp_printf( "\n------------------------------\nTeams\n------------------------------\n" );
4256  while ( list->next != NULL ) {
4257  kmp_team_p const * team = list->entry;
4258  int i;
4259  __kmp_printf( "Team %2x %p:\n", team->t.t_id, team );
4260  __kmp_print_structure_team( " Parent Team: ", team->t.t_parent );
4261  __kmp_printf( " Master TID: %2d\n", team->t.t_master_tid );
4262  __kmp_printf( " Max threads: %2d\n", team->t.t_max_nproc );
4263  __kmp_printf( " Levels of serial: %2d\n", team->t.t_serialized );
4264  __kmp_printf( " Number threads: %2d\n", team->t.t_nproc );
4265  for ( i = 0; i < team->t.t_nproc; ++ i ) {
4266  __kmp_printf( " Thread %2d: ", i );
4267  __kmp_print_structure_thread( "", team->t.t_threads[ i ] );
4268  }; // for i
4269  __kmp_print_structure_team( " Next in pool: ", team->t.t_next_pool );
4270  __kmp_printf( "\n" );
4271  list = list->next;
4272  }; // while
4273 
4274  // Print out __kmp_thread_pool and __kmp_team_pool.
4275  __kmp_printf( "\n------------------------------\nPools\n------------------------------\n" );
4276  __kmp_print_structure_thread( "Thread pool: ", (kmp_info_t *)__kmp_thread_pool );
4277  __kmp_print_structure_team( "Team pool: ", (kmp_team_t *)__kmp_team_pool );
4278  __kmp_printf( "\n" );
4279 
4280  // Free team list.
4281  while ( list != NULL ) {
4282  kmp_team_list_item_t * item = list;
4283  list = list->next;
4284  KMP_INTERNAL_FREE( item );
4285  }; // while
4286 
4287 }
4288 
4289 #endif
4290 
4291 
4292 //---------------------------------------------------------------------------
4293 // Stuff for per-thread fast random number generator
4294 // Table of primes
4295 
4296 static const unsigned __kmp_primes[] = {
4297  0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5,
4298  0xba5703f5, 0xb495a877, 0xe1626741, 0x79695e6b,
4299  0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
4300  0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b,
4301  0xbe4d6fe9, 0x5f15e201, 0x99afc3fd, 0xf3f16801,
4302  0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
4303  0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed,
4304  0x085a3d61, 0x46eb5ea7, 0x3d9910ed, 0x2e687b5b,
4305  0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
4306  0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7,
4307  0x54581edb, 0xf2480f45, 0x0bb9288f, 0xef1affc7,
4308  0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
4309  0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b,
4310  0xfc411073, 0xc3749363, 0xb892d829, 0x3549366b,
4311  0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
4312  0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f
4313 };
4314 
4315 //---------------------------------------------------------------------------
4316 // __kmp_get_random: Get a random number using a linear congruential method.
4317 
4318 unsigned short
4319 __kmp_get_random( kmp_info_t * thread )
4320 {
4321  unsigned x = thread -> th.th_x;
4322  unsigned short r = x>>16;
4323 
4324  thread -> th.th_x = x*thread->th.th_a+1;
4325 
4326  KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
4327  thread->th.th_info.ds.ds_tid, r) );
4328 
4329  return r;
4330 }
4331 //--------------------------------------------------------
4332 // __kmp_init_random: Initialize a random number generator
4333 
4334 void
4335 __kmp_init_random( kmp_info_t * thread )
4336 {
4337  unsigned seed = thread->th.th_info.ds.ds_tid;
4338 
4339  thread -> th.th_a = __kmp_primes[seed%(sizeof(__kmp_primes)/sizeof(__kmp_primes[0]))];
4340  thread -> th.th_x = (seed+1)*thread->th.th_a+1;
4341  KA_TRACE(30, ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread -> th.th_a) );
4342 }
4343 
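/*
 * A standalone sketch (hypothetical helper name, not part of the library) of
 * the generator above: a 32-bit linear congruential step x = a*x + 1 whose
 * high 16 bits are the returned sample, with the multiplier "a" taken from the
 * prime table so that different seeds give independent-looking streams.
 */
static unsigned short
example_lcg_next( unsigned *state, unsigned a )
{
    unsigned short r = (unsigned short)( *state >> 16 );  // the high 16 bits are the sample
    *state = *state * a + 1;                              // LCG step; wraps mod 2^32
    return r;
}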
4344 
4345 #if KMP_OS_WINDOWS
4346 /* reclaim array entries for root threads that are already dead, returns number reclaimed */
4347 static int
4348 __kmp_reclaim_dead_roots(void) {
4349  int i, r = 0;
4350 
4351  for(i = 0; i < __kmp_threads_capacity; ++i) {
4352  if( KMP_UBER_GTID( i ) &&
4353  !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
4354  !__kmp_root[i]->r.r_active ) { // AC: reclaim only roots died in non-active state
4355  r += __kmp_unregister_root_other_thread(i);
4356  }
4357  }
4358  return r;
4359 }
4360 #endif
4361 
4362 /*
4363  This function attempts to create free entries in __kmp_threads and __kmp_root, and returns the number of
4364  free entries generated.
4365 
4366  For Windows* OS static library, the first mechanism used is to reclaim array entries for root threads that are
4367  already dead.
4368 
4369  On all platforms, expansion is attempted on the arrays __kmp_threads and __kmp_root, with appropriate
4370  update to __kmp_threads_capacity. Array capacity is increased by doubling with clipping to
4371  __kmp_tp_capacity, if threadprivate cache array has been created.
4372  Synchronization with __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
4373 
4374  After any dead root reclamation, if the clipping value allows array expansion to result in the generation
4375  of a total of nWish free slots, the function does that expansion. If not, but the clipping value allows
4376  array expansion to result in the generation of a total of nNeed free slots, the function does that expansion.
4377  Otherwise, nothing is done beyond the possible initial root thread reclamation. However, if nNeed is zero,
4378  a best-effort attempt is made to fulfil nWish as far as possible, i.e. the function will attempt to create
4379  as many free slots as possible up to nWish.
4380 
4381  If any argument is negative, the behavior is undefined.
4382 */
4383 static int
4384 __kmp_expand_threads(int nWish, int nNeed) {
4385  int added = 0;
4386  int old_tp_cached;
4387  int __kmp_actual_max_nth;
4388 
4389  if(nNeed > nWish) /* normalize the arguments */
4390  nWish = nNeed;
4391 #if KMP_OS_WINDOWS && !defined GUIDEDLL_EXPORTS
4392 /* only for Windows static library */
4393  /* reclaim array entries for root threads that are already dead */
4394  added = __kmp_reclaim_dead_roots();
4395 
4396  if(nNeed) {
4397  nNeed -= added;
4398  if(nNeed < 0)
4399  nNeed = 0;
4400  }
4401  if(nWish) {
4402  nWish -= added;
4403  if(nWish < 0)
4404  nWish = 0;
4405  }
4406 #endif
4407  if(nWish <= 0)
4408  return added;
4409 
4410  while(1) {
4411  int nTarget;
4412  int minimumRequiredCapacity;
4413  int newCapacity;
4414  kmp_info_t **newThreads;
4415  kmp_root_t **newRoot;
4416 
4417  //
4418  // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth.
4419  // If __kmp_max_nth is set to some value less than __kmp_sys_max_nth
4420  // by the user via OMP_THREAD_LIMIT, then __kmp_threads_capacity may
4421  // become > __kmp_max_nth in one of two ways:
4422  //
4423  // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0]
4424  // may not be reused by another thread, so we may need to increase
4425  // __kmp_threads_capacity to __kmp_max_threads + 1.
4426  //
4427  // 2) New foreign root(s) are encountered. We always register new
4428  // foreign roots. This may cause a smaller # of threads to be
4429  // allocated at subsequent parallel regions, but the worker threads
4430  // hang around (and eventually go to sleep) and need slots in the
4431  // __kmp_threads[] array.
4432  //
4433  // Anyway, that is the reason for moving the check to see if
4434  // __kmp_max_threads was exceeded into __kmp_reserve_threads()
4435  // instead of having it performed here. -BB
4436  //
4437  old_tp_cached = __kmp_tp_cached;
4438  __kmp_actual_max_nth = old_tp_cached ? __kmp_tp_capacity : __kmp_sys_max_nth;
4439  KMP_DEBUG_ASSERT(__kmp_actual_max_nth >= __kmp_threads_capacity);
4440 
4441  /* compute expansion headroom to check if we can expand and whether to aim for nWish or nNeed */
4442  nTarget = nWish;
4443  if(__kmp_actual_max_nth - __kmp_threads_capacity < nTarget) {
4444  /* can't fulfil nWish, so try nNeed */
4445  if(nNeed) {
4446  nTarget = nNeed;
4447  if(__kmp_actual_max_nth - __kmp_threads_capacity < nTarget) {
4448  /* possible expansion too small -- give up */
4449  break;
4450  }
4451  } else {
4452  /* best-effort */
4453  nTarget = __kmp_actual_max_nth - __kmp_threads_capacity;
4454  if(!nTarget) {
4455  /* cannot expand at all -- give up */
4456  break;
4457  }
4458  }
4459  }
4460  minimumRequiredCapacity = __kmp_threads_capacity + nTarget;
4461 
4462  newCapacity = __kmp_threads_capacity;
4463  do{
4464  newCapacity =
4465  newCapacity <= (__kmp_actual_max_nth >> 1) ?
4466  (newCapacity << 1) :
4467  __kmp_actual_max_nth;
4468  } while(newCapacity < minimumRequiredCapacity);
4469  newThreads = (kmp_info_t**) __kmp_allocate((sizeof(kmp_info_t*) + sizeof(kmp_root_t*)) * newCapacity + CACHE_LINE);
4470  newRoot = (kmp_root_t**) ((char*)newThreads + sizeof(kmp_info_t*) * newCapacity );
4471  memcpy(newThreads, __kmp_threads, __kmp_threads_capacity * sizeof(kmp_info_t*));
4472  memcpy(newRoot, __kmp_root, __kmp_threads_capacity * sizeof(kmp_root_t*));
4473  memset(newThreads + __kmp_threads_capacity, 0,
4474  (newCapacity - __kmp_threads_capacity) * sizeof(kmp_info_t*));
4475  memset(newRoot + __kmp_threads_capacity, 0,
4476  (newCapacity - __kmp_threads_capacity) * sizeof(kmp_root_t*));
4477 
4478  if(!old_tp_cached && __kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
4479  /* __kmp_tp_cached has changed, i.e. __kmpc_threadprivate_cached has allocated a threadprivate cache
4480  while we were allocating the expanded array, and our new capacity is larger than the threadprivate
4481  cache capacity, so we should deallocate the expanded arrays and try again. This is the first check
4482  of a double-check pair.
4483  */
4484  __kmp_free(newThreads);
4485  continue; /* start over and try again */
4486  }
4487  __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
4488  if(!old_tp_cached && __kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
4489  /* Same check as above, but this time with the lock so we can be sure if we can succeed. */
4490  __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
4491  __kmp_free(newThreads);
4492  continue; /* start over and try again */
4493  } else {
4494  /* success */
4495  // __kmp_free( __kmp_threads ); // ATT: It leads to crash. Need to be investigated.
4496  //
4497  *(kmp_info_t**volatile*)&__kmp_threads = newThreads;
4498  *(kmp_root_t**volatile*)&__kmp_root = newRoot;
4499  added += newCapacity - __kmp_threads_capacity;
4500  *(volatile int*)&__kmp_threads_capacity = newCapacity;
4501  __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
4502  break; /* succeeded, so we can exit the loop */
4503  }
4504  }
4505  return added;
4506 }
4507 
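/*
 * A standalone sketch (hypothetical helper name, not part of the library) of
 * the growth policy used by __kmp_expand_threads() above: double the current
 * capacity until it covers the requested minimum, clipping at a fixed upper
 * bound.  The caller still has to check whether the clipped result is enough.
 */
static int
example_grow_capacity( int current, int required_min, int upper_bound )
{
    int new_capacity = ( current > 0 ) ? current : 1;
    while ( new_capacity < required_min ) {
        if ( new_capacity <= ( upper_bound >> 1 ) ) {
            new_capacity <<= 1;          // double while doubling stays under the bound
        } else {
            new_capacity = upper_bound;  // otherwise clip at the bound ...
            break;                       // ... and stop; cannot grow any further
        }
    }
    return new_capacity;
}
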
4508 /* register the current thread as a root thread and obtain our gtid */
4509 /* we must have the __kmp_initz_lock held at this point */
4510 /* Argument TRUE only if are the thread that calls from __kmp_do_serial_initialize() */
4511 int
4512 __kmp_register_root( int initial_thread )
4513 {
4514  kmp_info_t *root_thread;
4515  kmp_root_t *root;
4516  int gtid;
4517  int capacity;
4518  __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
4519  KA_TRACE( 20, ("__kmp_register_root: entered\n"));
4520  KMP_MB();
4521 
4522 
4523  /*
4524  2007-03-02:
4525 
4526  If initial thread did not invoke OpenMP RTL yet, and this thread is not an initial one,
4527  "__kmp_all_nth >= __kmp_threads_capacity" condition does not work as expected -- it may
4528  return false (that means there is at least one empty slot in __kmp_threads array), but it
4529  is possible the only free slot is #0, which is reserved for initial thread and so cannot be
4530  used for this one. The following code works around this bug.
4531 
4532  However, the right solution seems to be not to reserve slot #0 for the initial thread, because:
4533  (1) there is no magic in slot #0,
4534  (2) we cannot detect initial thread reliably (the first thread which does serial
4535  initialization may not be a real initial thread).
4536  */
4537  capacity = __kmp_threads_capacity;
4538  if ( ! initial_thread && TCR_PTR(__kmp_threads[0]) == NULL ) {
4539  -- capacity;
4540  }; // if
4541 
4542  /* see if there are too many threads */
4543  if ( __kmp_all_nth >= capacity && !__kmp_expand_threads( 1, 1 ) ) {
4544  if ( __kmp_tp_cached ) {
4545  __kmp_msg(
4546  kmp_ms_fatal,
4547  KMP_MSG( CantRegisterNewThread ),
4548  KMP_HNT( Set_ALL_THREADPRIVATE, __kmp_tp_capacity ),
4549  KMP_HNT( PossibleSystemLimitOnThreads ),
4550  __kmp_msg_null
4551  );
4552  }
4553  else {
4554  __kmp_msg(
4555  kmp_ms_fatal,
4556  KMP_MSG( CantRegisterNewThread ),
4557  KMP_HNT( SystemLimitOnThreads ),
4558  __kmp_msg_null
4559  );
4560  }
4561  }; // if
4562 
4563  /* find an available thread slot */
4564  /* Don't reassign the zero slot, since we need that to be used only by the initial
4565  thread */
4566  for( gtid=(initial_thread ? 0 : 1) ; TCR_PTR(__kmp_threads[gtid]) != NULL ; gtid++ );
4567  KA_TRACE( 1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid ));
4568  KMP_ASSERT( gtid < __kmp_threads_capacity );
4569 
4570  /* update global accounting */
4571  __kmp_all_nth ++;
4572  TCW_4(__kmp_nth, __kmp_nth + 1);
4573 
4574  //
4575  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search)
4576  // for low numbers of procs, and method #2 (keyed API call) for higher
4577  // numbers of procs.
4578  //
4579  if ( __kmp_adjust_gtid_mode ) {
4580  if ( __kmp_all_nth >= __kmp_tls_gtid_min ) {
4581  if ( TCR_4(__kmp_gtid_mode) != 2) {
4582  TCW_4(__kmp_gtid_mode, 2);
4583  }
4584  }
4585  else {
4586  if (TCR_4(__kmp_gtid_mode) != 1 ) {
4587  TCW_4(__kmp_gtid_mode, 1);
4588  }
4589  }
4590  }
4591 
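/*
   The two gtid lookup methods referred to above, in rough outline (illustrative
   sketch only; the real implementations live elsewhere in the runtime): method
   #1 identifies the caller by finding which registered thread's stack contains
   the current stack address, while method #2 reads a per-thread key.  Method #1
   avoids a TLS access, but its cost grows with the number of threads, which is
   why the mode switches once __kmp_all_nth reaches __kmp_tls_gtid_min.
*/
#if 0
#include <pthread.h>
#include <stddef.h>

typedef struct {
    char  *stack_base;              /* lowest address of this thread's stack   */
    size_t stack_size;
} thr_desc_t;

static thr_desc_t    thr_desc[ 64 ];
static int           thr_count;
static pthread_key_t gtid_key;      /* created once with pthread_key_create()  */

/* method #1: stack-pointer search, cost grows with the thread count */
static int gtid_by_stack_search( void )
{
    char local;                     /* its address lies somewhere in our stack */
    int  i;
    for ( i = 0; i < thr_count; ++i ) {
        ptrdiff_t offs = &local - thr_desc[ i ].stack_base;
        if ( offs >= 0 && (size_t) offs < thr_desc[ i ].stack_size )
            return i;
    }
    return -1;
}

/* method #2: keyed lookup, constant cost */
static int gtid_by_key( void )
{
    void *v = pthread_getspecific( gtid_key );
    return v ? (int)(ptrdiff_t) v - 1 : -1;   /* stored as gtid+1 so 0 means "unset" */
}
#endif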
4592 #ifdef KMP_ADJUST_BLOCKTIME
4593  /* Adjust blocktime to zero if necessary */
4594  /* Middle initialization might not have occurred yet */
4595  if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
4596  if ( __kmp_nth > __kmp_avail_proc ) {
4597  __kmp_zero_bt = TRUE;
4598  }
4599  }
4600 #endif /* KMP_ADJUST_BLOCKTIME */
4601 
4602  /* setup this new hierarchy */
4603  if( ! ( root = __kmp_root[gtid] )) {
4604  root = __kmp_root[gtid] = (kmp_root_t*) __kmp_allocate( sizeof(kmp_root_t) );
4605  KMP_DEBUG_ASSERT( ! root->r.r_root_team );
4606  }
4607 
4608  __kmp_initialize_root( root );
4609 
4610  /* setup new root thread structure */
4611  if( root -> r.r_uber_thread ) {
4612  root_thread = root -> r.r_uber_thread;
4613  } else {
4614  root_thread = (kmp_info_t*) __kmp_allocate( sizeof(kmp_info_t) );
4615  if ( __kmp_storage_map ) {
4616  __kmp_print_thread_storage_map( root_thread, gtid );
4617  }
4618  root_thread -> th.th_info .ds.ds_gtid = gtid;
4619  root_thread -> th.th_root = root;
4620  if( __kmp_env_consistency_check ) {
4621  root_thread -> th.th_cons = __kmp_allocate_cons_stack( gtid );
4622  }
4623  #if USE_FAST_MEMORY
4624  __kmp_initialize_fast_memory( root_thread );
4625  #endif /* USE_FAST_MEMORY */
4626 
4627  #if KMP_USE_BGET
4628  KMP_DEBUG_ASSERT( root_thread -> th.th_local.bget_data == NULL );
4629  __kmp_initialize_bget( root_thread );
4630  #endif
4631  __kmp_init_random( root_thread ); // Initialize random number generator
4632  }
4633 
4634  /* setup the serial team held in reserve by the root thread */
4635  if( ! root_thread -> th.th_serial_team ) {
4636  #if OMP_30_ENABLED
4637  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
4638  #endif // OMP_30_ENABLED
4639  KF_TRACE( 10, ( "__kmp_register_root: before serial_team\n" ) );
4640  root_thread -> th.th_serial_team = __kmp_allocate_team( root, 1, 1,
4641 #if OMP_40_ENABLED
4642  proc_bind_default,
4643 #endif
4644 #if OMP_30_ENABLED
4645  &r_icvs,
4646 #else
4647  __kmp_dflt_team_nth_ub,
4648  __kmp_global.g.g_dynamic,
4649  __kmp_dflt_nested,
4650  __kmp_dflt_blocktime,
4651  __kmp_bt_intervals,
4652  __kmp_env_blocktime,
4653 #endif // OMP_30_ENABLED
4654  0 );
4655  }
4656  KMP_ASSERT( root_thread -> th.th_serial_team );
4657  KF_TRACE( 10, ( "__kmp_register_root: after serial_team = %p\n",
4658  root_thread -> th.th_serial_team ) );
4659 
4660  /* drop root_thread into place */
4661  TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
4662 
4663  root -> r.r_root_team -> t.t_threads[0] = root_thread;
4664  root -> r.r_hot_team -> t.t_threads[0] = root_thread;
4665  root_thread -> th.th_serial_team -> t.t_threads[0] = root_thread;
4666  root -> r.r_uber_thread = root_thread;
4667 
4668  /* initialize the thread, get it ready to go */
4669  __kmp_initialize_info( root_thread, root->r.r_root_team, 0, gtid );
4670 
4671  /* prepare the master thread for get_gtid() */
4672  __kmp_gtid_set_specific( gtid );
4673  #ifdef KMP_TDATA_GTID
4674  __kmp_gtid = gtid;
4675  #endif
4676  __kmp_create_worker( gtid, root_thread, __kmp_stksize );
4677  KMP_DEBUG_ASSERT( __kmp_gtid_get_specific() == gtid );
4678  TCW_4(__kmp_init_gtid, TRUE);
4679 
4680  KA_TRACE( 20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, plain=%u\n",
4681  gtid, __kmp_gtid_from_tid( 0, root->r.r_hot_team ),
4682  root -> r.r_hot_team -> t.t_id, 0, KMP_INIT_BARRIER_STATE,
4683  KMP_INIT_BARRIER_STATE ) );
4684  { // Initialize barrier data.
4685  int b;
4686  for ( b = 0; b < bs_last_barrier; ++ b ) {
4687  root_thread->th.th_bar[ b ].bb.b_arrived = KMP_INIT_BARRIER_STATE;
4688  }; // for
4689  }
4690  KMP_DEBUG_ASSERT( root->r.r_hot_team->t.t_bar[ bs_forkjoin_barrier ].b_arrived == KMP_INIT_BARRIER_STATE );
4691 
4692 
4693 #if KMP_OS_WINDOWS || KMP_OS_LINUX
4694  if ( TCR_4(__kmp_init_middle) ) {
4695  __kmp_affinity_set_init_mask( gtid, TRUE );
4696  }
4697 #endif /* KMP_OS_WINDOWS || KMP_OS_LINUX */
4698 
4699  __kmp_root_counter ++;
4700 
4701  KMP_MB();
4702  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
4703 
4704  return gtid;
4705 }
4706 
4707 /* Resets a root thread and clears its root and hot teams.
4708  Returns the number of __kmp_threads entries directly and indirectly freed.
4709 */
4710 static int
4711 __kmp_reset_root(int gtid, kmp_root_t *root)
4712 {
4713  kmp_team_t * root_team = root->r.r_root_team;
4714  kmp_team_t * hot_team = root->r.r_hot_team;
4715  int n = hot_team->t.t_nproc;
4716  int i;
4717 
4718  KMP_DEBUG_ASSERT( ! root->r.r_active );
4719 
4720  root->r.r_root_team = NULL;
4721  root->r.r_hot_team = NULL;
4722  // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team before call
4723  // to __kmp_free_team().
4724  __kmp_free_team( root, root_team );
4725  __kmp_free_team( root, hot_team );
4726 
4727 #if OMP_30_ENABLED
4728  //
4729  // Before we can reap the thread, we need to make certain that all
4730  // other threads in the teams that had this root as ancestor have stopped trying to steal tasks.
4731  //
4732  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
4733  __kmp_wait_to_unref_task_teams();
4734  }
4735 #endif /* OMP_30_ENABLED */
4736 
4737  #if KMP_OS_WINDOWS
4738  /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
4739  KA_TRACE( 10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC "\n",
4740  (LPVOID)&(root->r.r_uber_thread->th),
4741  root->r.r_uber_thread->th.th_info.ds.ds_thread ) );
4742  __kmp_free_handle( root->r.r_uber_thread->th.th_info.ds.ds_thread );
4743  #endif /* KMP_OS_WINDOWS */
4744 
4745  TCW_4(__kmp_nth, __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
4746  __kmp_reap_thread( root->r.r_uber_thread, 1 );
4747 
4748  // We cannot put the root thread into __kmp_thread_pool, so we have to reap it instead of freeing it.
4749  root->r.r_uber_thread = NULL;
4750  /* mark root as no longer in use */
4751  root -> r.r_begin = FALSE;
4752 
4753  return n;
4754 }
4755 
4756 void
4757 __kmp_unregister_root_current_thread( int gtid )
4758 {
4759  kmp_root_t *root = __kmp_root[gtid];
4760 
4761  KA_TRACE( 1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid ));
4762  KMP_DEBUG_ASSERT( __kmp_threads && __kmp_threads[gtid] );
4763  KMP_ASSERT( KMP_UBER_GTID( gtid ));
4764  KMP_ASSERT( root == __kmp_threads[gtid]->th.th_root );
4765  KMP_ASSERT( root->r.r_active == FALSE );
4766 
4767  /* this lock should be ok, since unregister_root_current_thread is never called during
4768  * an abort, only during a normal close. Furthermore, if you hold the
4769  * forkjoin lock, you should never try to get the initz lock */
4770 
4771  __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
4772 
4773  KMP_MB();
4774 
4775  __kmp_reset_root(gtid, root);
4776 
4777  /* free up this thread slot */
4778  __kmp_gtid_set_specific( KMP_GTID_DNE );
4779 #ifdef KMP_TDATA_GTID
4780  __kmp_gtid = KMP_GTID_DNE;
4781 #endif
4782 
4783  KMP_MB();
4784  KC_TRACE( 10, ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid ));
4785 
4786  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
4787 }
4788 
4789 /* __kmp_forkjoin_lock must be already held
4790  Unregisters a root thread that is not the current thread. Returns the number of
4791  __kmp_threads entries freed as a result.
4792  */
4793 static int
4794 __kmp_unregister_root_other_thread( int gtid )
4795 {
4796  kmp_root_t *root = __kmp_root[gtid];
4797  int r;
4798 
4799  KA_TRACE( 1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid ));
4800  KMP_DEBUG_ASSERT( __kmp_threads && __kmp_threads[gtid] );
4801  KMP_ASSERT( KMP_UBER_GTID( gtid ));
4802  KMP_ASSERT( root == __kmp_threads[gtid]->th.th_root );
4803  KMP_ASSERT( root->r.r_active == FALSE );
4804 
4805  r = __kmp_reset_root(gtid, root);
4806  KC_TRACE( 10, ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid ));
4807  return r;
4808 }
4809 
4810 #if OMP_30_ENABLED
4811 
4812 #if KMP_DEBUG
4813 void __kmp_task_info() {
4814 
4815  kmp_int32 gtid = __kmp_entry_gtid();
4816  kmp_int32 tid = __kmp_tid_from_gtid( gtid );
4817  kmp_info_t *this_thr = __kmp_threads[ gtid ];
4818  kmp_team_t *steam = this_thr -> th.th_serial_team;
4819  kmp_team_t *team = this_thr -> th.th_team;
4820 
4821  __kmp_printf( "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p curtask=%p ptask=%p\n",
4822  gtid, tid, this_thr, team, this_thr->th.th_current_task, team->t.t_implicit_task_taskdata[tid].td_parent );
4823 }
4824 #endif // KMP_DEBUG
4825 
4826 #endif // OMP_30_ENABLED
4827 
4828 /* TODO optimize with one big memclr, take out what isn't needed,
4829  * split responsibility among workers as much as possible, and delay
4830  * initialization of features as much as possible */
4831 static void
4832 __kmp_initialize_info( kmp_info_t *this_thr, kmp_team_t *team, int tid, int gtid )
4833 {
4834  /* this_thr->th.th_info.ds.ds_gtid is set up in kmp_allocate_thread/create_worker
4835  * this_thr->th.th_serial_team is set up in __kmp_allocate_thread */
4836 
4837  KMP_DEBUG_ASSERT( this_thr != NULL );
4838  KMP_DEBUG_ASSERT( this_thr -> th.th_serial_team );
4839  KMP_DEBUG_ASSERT( team );
4840  KMP_DEBUG_ASSERT( team -> t.t_threads );
4841  KMP_DEBUG_ASSERT( team -> t.t_dispatch );
4842  KMP_DEBUG_ASSERT( team -> t.t_threads[0] );
4843  KMP_DEBUG_ASSERT( team -> t.t_threads[0] -> th.th_root );
4844 
4845  KMP_MB();
4846 
4847  TCW_SYNC_PTR(this_thr->th.th_team, team);
4848 
4849  this_thr->th.th_info.ds.ds_tid = tid;
4850  this_thr->th.th_set_nproc = 0;
4851 #if OMP_40_ENABLED
4852  this_thr->th.th_set_proc_bind = proc_bind_default;
4853 # if (KMP_OS_WINDOWS || KMP_OS_LINUX)
4854  this_thr->th.th_new_place = this_thr->th.th_current_place;
4855 # endif
4856 #endif
4857  this_thr->th.th_root = team -> t.t_threads[0] -> th.th_root;
4858 
4859  /* setup the thread's cache of the team structure */
4860  this_thr->th.th_team_nproc = team -> t.t_nproc;
4861  this_thr->th.th_team_master = team -> t.t_threads[0];
4862  this_thr->th.th_team_serialized = team -> t.t_serialized;
4863 #if OMP_40_ENABLED
4864  this_thr->th.th_team_microtask = team -> t.t_threads[0] -> th.th_team_microtask;
4865  this_thr->th.th_teams_level = team -> t.t_threads[0] -> th.th_teams_level;
4866  this_thr->th.th_set_nth_teams = team -> t.t_threads[0] -> th.th_set_nth_teams;
4867 #endif /* OMP_40_ENABLED */
4868  TCW_PTR(this_thr->th.th_sleep_loc, NULL);
4869 
4870 #if OMP_30_ENABLED
4871  KMP_DEBUG_ASSERT( team -> t.t_implicit_task_taskdata );
4872  this_thr->th.th_task_state = 0;
4873 
4874  KF_TRACE( 10, ( "__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4875  tid, gtid, this_thr, this_thr->th.th_current_task ) );
4876 
4877  __kmp_init_implicit_task( this_thr->th.th_team_master->th.th_ident, this_thr, team, tid, TRUE );
4878 
4879  KF_TRACE( 10, ( "__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4880  tid, gtid, this_thr, this_thr->th.th_current_task ) );
4881  // TODO: Initialize ICVs from parent; GEH - isn't that already done in __kmp_initialize_team()?
4882 #endif // OMP_30_ENABLED
4883 
4884  /* TODO no worksharing in speculative threads */
4885  this_thr -> th.th_dispatch = &team -> t.t_dispatch[ tid ];
4886 
4887  this_thr->th.th_local.this_construct = 0;
4888  this_thr->th.th_local.last_construct = 0;
4889 
4890 #ifdef BUILD_TV
4891  this_thr->th.th_local.tv_data = 0;
4892 #endif
4893 
4894  if ( ! this_thr->th.th_pri_common ) {
4895  this_thr->th.th_pri_common = (struct common_table *) __kmp_allocate( sizeof(struct common_table) );
4896  if ( __kmp_storage_map ) {
4897  __kmp_print_storage_map_gtid(
4898  gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4899  sizeof( struct common_table ), "th_%d.th_pri_common\n", gtid
4900  );
4901  }; // if
4902  this_thr->th.th_pri_head = NULL;
4903  }; // if
4904 
4905  /* Initialize dynamic dispatch */
4906  {
4907  volatile kmp_disp_t *dispatch = this_thr -> th.th_dispatch;
4908  /*
4909  * Use team max_nproc since this will never change for the team.
4910  */
4911  size_t disp_size = sizeof( dispatch_private_info_t ) *
4912  ( team->t.t_max_nproc == 1 ? 1 : KMP_MAX_DISP_BUF );
4913  KD_TRACE( 10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid, team->t.t_max_nproc ) );
4914  KMP_ASSERT( dispatch );
4915  KMP_DEBUG_ASSERT( team -> t.t_dispatch );
4916  KMP_DEBUG_ASSERT( dispatch == &team->t.t_dispatch[ tid ] );
4917 
4918  dispatch->th_disp_index = 0;
4919 
4920  if( ! dispatch -> th_disp_buffer ) {
4921  dispatch -> th_disp_buffer = (dispatch_private_info_t *) __kmp_allocate( disp_size );
4922 
4923  if ( __kmp_storage_map ) {
4924  __kmp_print_storage_map_gtid( gtid, &dispatch->th_disp_buffer[ 0 ],
4925  &dispatch->th_disp_buffer[ team->t.t_max_nproc == 1 ? 1 : KMP_MAX_DISP_BUF ],
4926  disp_size, "th_%d.th_dispatch.th_disp_buffer "
4927  "(team_%d.t_dispatch[%d].th_disp_buffer)",
4928  gtid, team->t.t_id, gtid );
4929  }
4930  } else {
4931  memset( & dispatch -> th_disp_buffer[0], '\0', disp_size );
4932  }
4933 
4934  dispatch -> th_dispatch_pr_current = 0;
4935  dispatch -> th_dispatch_sh_current = 0;
4936 
4937  dispatch -> th_deo_fcn = 0; /* ORDERED */
4938  dispatch -> th_dxo_fcn = 0; /* END ORDERED */
4939  }
4940 
4941  this_thr->th.th_next_pool = NULL;
4942 
4943  KMP_DEBUG_ASSERT( !this_thr->th.th_spin_here );
4944  KMP_DEBUG_ASSERT( this_thr->th.th_next_waiting == 0 );
4945 
4946  KMP_MB();
4947 }
4948 
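/*
   A minimal sketch of the allocate-once / reuse pattern applied above to
   th_pri_common and th_disp_buffer: the buffer is allocated the first time a
   thread lands on a team and is only cleared on later reuse, so hot-team
   reassignment does not pay for repeated allocation.  Illustrative only;
   lazy_buf_t and lazy_get() are stand-ins, not runtime names.
*/
#if 0
#include <stdlib.h>
#include <string.h>

typedef struct {
    void  *buf;
    size_t size;
} lazy_buf_t;

static void *lazy_get( lazy_buf_t *lb, size_t size )
{
    if ( lb->buf == NULL ) {
        lb->buf  = calloc( 1, size );    /* first use: allocate zeroed         */
        lb->size = size;
    } else {
        memset( lb->buf, 0, lb->size );  /* reuse: just reset the existing one */
    }
    return lb->buf;
}
#endif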
4949 
4950 /* allocate a new thread for the requesting team. this is only called from within a
4951  * forkjoin critical section. we will first try to get an available thread from the
4952  * thread pool. if none is available, we will fork a new one, assuming we are able
4953  * to create one; this should be assured, as the caller should have checked this
4954  * first.
4955  */
4956 kmp_info_t *
4957 __kmp_allocate_thread( kmp_root_t *root, kmp_team_t *team, int new_tid )
4958 {
4959  kmp_team_t *serial_team;
4960  kmp_info_t *new_thr;
4961  int new_gtid;
4962 
4963  KA_TRACE( 20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid() ));
4964  KMP_DEBUG_ASSERT( root && team );
4965  KMP_DEBUG_ASSERT( KMP_MASTER_GTID( __kmp_get_gtid() ));
4966  KMP_MB();
4967 
4968  /* first, try to get one from the thread pool */
4969  if ( __kmp_thread_pool ) {
4970 
4971  new_thr = (kmp_info_t*)__kmp_thread_pool;
4972  __kmp_thread_pool = (volatile kmp_info_t *) new_thr->th.th_next_pool;
4973  if ( new_thr == __kmp_thread_pool_insert_pt ) {
4974  __kmp_thread_pool_insert_pt = NULL;
4975  }
4976  TCW_4(new_thr->th.th_in_pool, FALSE);
4977  //
4978  // Don't touch th_active_in_pool or th_active.
4979  // The worker thread adjusts those flags as it sleeps/awakens.
4980  //
4981 
4982  __kmp_thread_pool_nth--;
4983 
4984  KA_TRACE( 20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4985  __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid ));
4986  KMP_ASSERT( ! new_thr -> th.th_team );
4987  KMP_DEBUG_ASSERT( __kmp_nth < __kmp_threads_capacity );
4988  KMP_DEBUG_ASSERT( __kmp_thread_pool_nth >= 0 );
4989 
4990  /* setup the thread structure */
4991  __kmp_initialize_info( new_thr, team, new_tid, new_thr->th.th_info.ds.ds_gtid );
4992  KMP_DEBUG_ASSERT( new_thr->th.th_serial_team );
4993 
4994  TCW_4(__kmp_nth, __kmp_nth + 1);
4995 
4996 #ifdef KMP_ADJUST_BLOCKTIME
4997  /* Adjust blocktime back to zero if necessary */
4998  /* Middle initialization might not have occurred yet */
4999  if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
5000  if ( __kmp_nth > __kmp_avail_proc ) {
5001  __kmp_zero_bt = TRUE;
5002  }
5003  }
5004 #endif /* KMP_ADJUST_BLOCKTIME */
5005 
5006  KF_TRACE( 10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
5007  __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid ));
5008 
5009  KMP_MB();
5010  return new_thr;
5011  }
5012 
5013 
5014  /* no, we'll fork a new one */
5015  KMP_ASSERT( __kmp_nth == __kmp_all_nth );
5016  KMP_ASSERT( __kmp_all_nth < __kmp_threads_capacity );
5017 
5018  //
5019  // If this is the first worker thread the RTL is creating, then also
5020  // launch the monitor thread. We try to do this as early as possible.
5021  //
5022  if ( ! TCR_4( __kmp_init_monitor ) ) {
5023  __kmp_acquire_bootstrap_lock( & __kmp_monitor_lock );
5024  if ( ! TCR_4( __kmp_init_monitor ) ) {
5025  KF_TRACE( 10, ( "before __kmp_create_monitor\n" ) );
5026  TCW_4( __kmp_init_monitor, 1 );
5027  __kmp_create_monitor( & __kmp_monitor );
5028  KF_TRACE( 10, ( "after __kmp_create_monitor\n" ) );
5029  }
5030  __kmp_release_bootstrap_lock( & __kmp_monitor_lock );
5031  }
5032 
5033  KMP_MB();
5034  for( new_gtid=1 ; TCR_PTR(__kmp_threads[new_gtid]) != NULL; ++new_gtid ) {
5035  KMP_DEBUG_ASSERT( new_gtid < __kmp_threads_capacity );
5036  }
5037 
5038  /* allocate space for it. */
5039  new_thr = (kmp_info_t*) __kmp_allocate( sizeof(kmp_info_t) );
5040 
5041  TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
5042 
5043  if ( __kmp_storage_map ) {
5044  __kmp_print_thread_storage_map( new_thr, new_gtid );
5045  }
5046 
5047  /* add the reserve serialized team, initialized from the team's master thread */
5048  {
5049  #if OMP_30_ENABLED
5050  kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs( team );
5051  #endif // OMP_30_ENABLED
5052  KF_TRACE( 10, ( "__kmp_allocate_thread: before th_serial/serial_team\n" ) );
5053  new_thr -> th.th_serial_team = serial_team =
5054  (kmp_team_t*) __kmp_allocate_team( root, 1, 1,
5055 #if OMP_40_ENABLED
5056  proc_bind_default,
5057 #endif
5058 #if OMP_30_ENABLED
5059  &r_icvs,
5060 #else
5061  team->t.t_set_nproc[0],
5062  team->t.t_set_dynamic[0],
5063  team->t.t_set_nested[0],
5064  team->t.t_set_blocktime[0],
5065  team->t.t_set_bt_intervals[0],
5066  team->t.t_set_bt_set[0],
5067 #endif // OMP_30_ENABLED
5068  0 );
5069  }
5070  KMP_ASSERT ( serial_team );
5071  serial_team -> t.t_threads[0] = new_thr;
5072  KF_TRACE( 10, ( "__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
5073  new_thr ) );
5074 
5075  /* setup the thread structures */
5076  __kmp_initialize_info( new_thr, team, new_tid, new_gtid );
5077 
5078  #if USE_FAST_MEMORY
5079  __kmp_initialize_fast_memory( new_thr );
5080  #endif /* USE_FAST_MEMORY */
5081 
5082  #if KMP_USE_BGET
5083  KMP_DEBUG_ASSERT( new_thr -> th.th_local.bget_data == NULL );
5084  __kmp_initialize_bget( new_thr );
5085  #endif
5086 
5087  __kmp_init_random( new_thr ); // Initialize random number generator
5088 
5089  /* Initialize these only once when thread is grabbed for a team allocation */
5090  KA_TRACE( 20, ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
5091  __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE ));
5092 
5093  new_thr->th.th_bar[ bs_forkjoin_barrier ].bb.b_go = KMP_INIT_BARRIER_STATE;
5094  new_thr->th.th_bar[ bs_plain_barrier ].bb.b_go = KMP_INIT_BARRIER_STATE;
5095  #if KMP_FAST_REDUCTION_BARRIER
5096  new_thr->th.th_bar[ bs_reduction_barrier ].bb.b_go = KMP_INIT_BARRIER_STATE;
5097  #endif // KMP_FAST_REDUCTION_BARRIER
5098 
5099  new_thr->th.th_spin_here = FALSE;
5100  new_thr->th.th_next_waiting = 0;
5101 
5102 #if OMP_40_ENABLED && (KMP_OS_WINDOWS || KMP_OS_LINUX)
5103  new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
5104  new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
5105  new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
5106  new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
5107 #endif
5108 
5109  TCW_4(new_thr->th.th_in_pool, FALSE);
5110  new_thr->th.th_active_in_pool = FALSE;
5111  TCW_4(new_thr->th.th_active, TRUE);
5112 
5113  /* adjust the global counters */
5114  __kmp_all_nth ++;
5115  __kmp_nth ++;
5116 
5117  //
5118  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search)
5119  // for low numbers of procs, and method #2 (keyed API call) for higher
5120  // numbers of procs.
5121  //
5122  if ( __kmp_adjust_gtid_mode ) {
5123  if ( __kmp_all_nth >= __kmp_tls_gtid_min ) {
5124  if ( TCR_4(__kmp_gtid_mode) != 2) {
5125  TCW_4(__kmp_gtid_mode, 2);
5126  }
5127  }
5128  else {
5129  if (TCR_4(__kmp_gtid_mode) != 1 ) {
5130  TCW_4(__kmp_gtid_mode, 1);
5131  }
5132  }
5133  }
5134 
5135 #ifdef KMP_ADJUST_BLOCKTIME
5136  /* Adjust blocktime back to zero if necessary */
5137  /* Middle initialization might not have occurred yet */
5138  if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
5139  if ( __kmp_nth > __kmp_avail_proc ) {
5140  __kmp_zero_bt = TRUE;
5141  }
5142  }
5143 #endif /* KMP_ADJUST_BLOCKTIME */
5144 
5145  /* actually fork it and create the new worker thread */
5146  KF_TRACE( 10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr ));
5147  __kmp_create_worker( new_gtid, new_thr, __kmp_stksize );
5148  KF_TRACE( 10, ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr ));
5149 
5150 
5151  KA_TRACE( 20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(), new_gtid ));
5152  KMP_MB();
5153  return new_thr;
5154 }
5155 
5156 /*
5157  * reinitialize team for reuse.
5158  *
5159  * The hot team code calls this case at every fork barrier, so EPCC barrier
5160  * tests are extremely sensitive to changes in it, especially writes to the team
5161  * struct, which cause a cache invalidation in all threads.
5162  *
5163  * IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!!
5164  */
5165 static void
5166 __kmp_reinitialize_team(
5167  kmp_team_t * team,
5168  int new_nproc,
5169  #if OMP_30_ENABLED
5170  kmp_internal_control_t * new_icvs,
5171  ident_t * loc
5172  #else
5173  int new_set_nproc, int new_set_dynamic, int new_set_nested,
5174  int new_set_blocktime, int new_bt_intervals, int new_bt_set
5175  #endif // OMP_30_ENABLED
5176 ) {
5177  int f;
5178  #if OMP_30_ENABLED
5179  KMP_DEBUG_ASSERT( team && new_nproc && new_icvs );
5180  KMP_DEBUG_ASSERT( ( ! TCR_4(__kmp_init_parallel) ) || new_icvs->nproc );
5181  team->t.t_ident = loc;
5182  #else
5183  KMP_DEBUG_ASSERT( team && new_nproc && new_set_nproc );
5184  #endif // OMP_30_ENABLED
5185 
5186  team->t.t_id = KMP_GEN_TEAM_ID();
5187 
5188 #if KMP_BARRIER_ICV_PULL
5189  //
5190  // Copy the ICV's to the team structure, where all of the worker threads
5191  // can access them and make their own copies after the barrier.
5192  //
5193  load_icvs(new_icvs);
5194  store_icvs(&team->t.t_initial_icvs, new_icvs);
5195 
5196  //
5197  // Set up the master thread's copy of the ICV's. __kmp_fork_call()
5198  // assumes they are already set in the master thread.
5199  // FIXME - change that code to use the team->t.t_initial_icvs copy
5200  // and eliminate this copy.
5201  //
5202  __kmp_init_implicit_task( loc, team->t.t_threads[0], team, 0, FALSE );
5203  store_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
5204  sync_icvs();
5205  KF_TRACE( 10, ( "__kmp_reinitialize_team2: T#%d this_thread=%p team=%p\n",
5206  0, team->t.t_threads[0], team ) );
5207 
5208 #elif KMP_BARRIER_ICV_PUSH
5209  //
5210  // Set the ICV's in the master thread only.
5211  // They will be propagated by the fork barrier.
5212  //
5213  __kmp_init_implicit_task( loc, team->t.t_threads[0], team, 0, FALSE );
5214  load_icvs(new_icvs);
5215  store_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
5216  sync_icvs();
5217 
5218  KF_TRACE( 10, ( "__kmp_reinitialize_team2: T#%d this_thread=%p team=%p\n",
5219  0, team->t.t_threads[0], team ) );
5220 
5221 #else
5222  //
5223  // Copy the icvs to each of the threads. This takes O(nthreads) time.
5224  //
5225 #if OMP_30_ENABLED
5226  load_icvs(new_icvs);
5227 #endif
5228  for( f=0 ; f<new_nproc ; f++) {
5229 # if OMP_30_ENABLED
5230  // TODO: GEH - pass in better source location info since usually NULL here
5231  KF_TRACE( 10, ( "__kmp_reinitialize_team1: T#%d this_thread=%p team=%p\n",
5232  f, team->t.t_threads[f], team ) );
5233  __kmp_init_implicit_task( loc, team->t.t_threads[f], team, f, FALSE );
5234  store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
5235  KF_TRACE( 10, ( "__kmp_reinitialize_team2: T#%d this_thread=%p team=%p\n",
5236  f, team->t.t_threads[f], team ) );
5237 # else
5238  team -> t.t_set_nproc[f] = new_set_nproc;
5239  team -> t.t_set_dynamic[f] = new_set_dynamic;
5240  team -> t.t_set_nested[f] = new_set_nested;
5241  team -> t.t_set_blocktime[f] = new_set_blocktime;
5242  team -> t.t_set_bt_intervals[f] = new_bt_intervals;
5243  team -> t.t_set_bt_set[f] = new_bt_set;
5244 # endif // OMP_30_ENABLED
5245  }
5246 # if OMP_30_ENABLED
5247  sync_icvs();
5248 # endif
5249 #endif // KMP_BARRIER_ICV_PUSH || KMP_BARRIER_ICV_PULL
5250 
5251 }
5252 
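/*
   The ICV propagation strategies selected above at compile time, in miniature
   (illustrative sketch; icvs_t, mini_team_t and the helpers are stand-ins, not
   runtime types): with the pull flavor the master stores the values once in
   the team object and each worker copies them for itself after the fork
   barrier; with the push flavor the values travel with the barrier release;
   otherwise the master writes every worker's copy itself, which is O(nthreads)
   on the master.
*/
#if 0
typedef struct { int nproc; int dynamic; int blocktime; } icvs_t;

typedef struct {
    icvs_t team_icvs;              /* single shared copy (pull flavor)          */
    icvs_t thread_icvs[ 64 ];      /* per-thread copies                         */
    int    nproc;
} mini_team_t;

/* pull flavor: master stores once, each worker copies after the barrier */
static void master_publish_icvs( mini_team_t *t, const icvs_t *v )
{
    t->team_icvs = *v;
}
static void worker_pull_icvs( mini_team_t *t, int tid )
{
    t->thread_icvs[ tid ] = t->team_icvs;
}

/* fallback flavor: master writes every copy, O(nthreads) */
static void master_copy_icvs_to_all( mini_team_t *t, const icvs_t *v )
{
    int f;
    for ( f = 0; f < t->nproc; ++f )
        t->thread_icvs[ f ] = *v;
}
#endif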
5253 /* initialize the team data structure
5254  * this assumes the t_threads and t_max_nproc are already set
5255  * also, we don't touch the arguments */
5256 static void
5257 __kmp_initialize_team(
5258  kmp_team_t * team,
5259  int new_nproc,
5260  #if OMP_30_ENABLED
5261  kmp_internal_control_t * new_icvs,
5262  ident_t * loc
5263  #else
5264  int new_set_nproc, int new_set_dynamic, int new_set_nested,
5265  int new_set_blocktime, int new_bt_intervals, int new_bt_set
5266  #endif // OMP_30_ENABLED
5267 ) {
5268  /* verify */
5269  KMP_DEBUG_ASSERT( team );
5270  KMP_DEBUG_ASSERT( new_nproc <= team->t.t_max_nproc );
5271  KMP_DEBUG_ASSERT( team->t.t_threads );
5272  KMP_MB();
5273 
5274  team -> t.t_master_tid = 0; /* not needed */
5275  /* team -> t.t_master_bar; not needed */
5276  team -> t.t_serialized = new_nproc > 1 ? 0 : 1;
5277  team -> t.t_nproc = new_nproc;
5278 
5279  /* team -> t.t_parent = NULL; TODO not needed & would mess up hot team */
5280  team -> t.t_next_pool = NULL;
5281  /* memset( team -> t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess up hot team */
5282 
5283  TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
5284  team -> t.t_invoke = NULL; /* not needed */
5285 
5286 #if OMP_30_ENABLED
5287  // TODO???: team -> t.t_max_active_levels = new_max_active_levels;
5288  team -> t.t_sched = new_icvs->sched;
5289 #endif // OMP_30_ENABLED
5290 
5291 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
5292  team -> t.t_fp_control_saved = FALSE; /* not needed */
5293  team -> t.t_x87_fpu_control_word = 0; /* not needed */
5294  team -> t.t_mxcsr = 0; /* not needed */
5295 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
5296 
5297  team -> t.t_construct = 0;
5298  __kmp_init_lock( & team -> t.t_single_lock );
5299 
5300  team -> t.t_ordered .dt.t_value = 0;
5301  team -> t.t_master_active = FALSE;
5302 
5303  memset( & team -> t.t_taskq, '\0', sizeof( kmp_taskq_t ));
5304 
5305 #ifdef KMP_DEBUG
5306  team -> t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
5307 #endif
5308  team -> t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
5309 
5310  team -> t.t_control_stack_top = NULL;
5311 
5312  __kmp_reinitialize_team(
5313  team, new_nproc,
5314  #if OMP_30_ENABLED
5315  new_icvs,
5316  loc
5317  #else
5318  new_set_nproc, new_set_dynamic, new_set_nested,
5319  new_set_blocktime, new_bt_intervals, new_bt_set
5320  #endif // OMP_30_ENABLED
5321  );
5322 
5323  KMP_MB();
5324 }
5325 
5326 #if KMP_OS_LINUX
5327 /* Sets full mask for thread and returns old mask, no changes to structures. */
5328 static void
5329 __kmp_set_thread_affinity_mask_full_tmp( kmp_affin_mask_t *old_mask )
5330 {
5331  if ( KMP_AFFINITY_CAPABLE() ) {
5332  int status;
5333  if ( old_mask != NULL ) {
5334  status = __kmp_get_system_affinity( old_mask, TRUE );
5335  int error = errno;
5336  if ( status != 0 ) {
5337  __kmp_msg(
5338  kmp_ms_fatal,
5339  KMP_MSG( ChangeThreadAffMaskError ),
5340  KMP_ERR( error ),
5341  __kmp_msg_null
5342  );
5343  }
5344  }
5345  __kmp_set_system_affinity( __kmp_affinity_get_fullMask(), TRUE );
5346  }
5347 }
5348 #endif
5349 
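/*
   A self-contained sketch of the save / widen / restore idiom that
   __kmp_set_thread_affinity_mask_full_tmp supports: the master saves its
   current mask, widens to the full mask while creating workers (so they do not
   inherit a one-core mask), then restores the saved mask afterwards.
   Illustrative only; it uses the GNU pthread affinity calls directly rather
   than the KMP wrappers, and so is Linux-specific.
*/
#if 0
#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>

static int widen_affinity_tmp( cpu_set_t *saved )
{
    cpu_set_t full;
    int i;

    if ( pthread_getaffinity_np( pthread_self(), sizeof( *saved ), saved ) != 0 )
        return -1;                    /* could not save the current mask        */

    CPU_ZERO( &full );
    for ( i = 0; i < CPU_SETSIZE; ++i )
        CPU_SET( i, &full );          /* "full" mask: every CPU the set can hold */
    return pthread_setaffinity_np( pthread_self(), sizeof( full ), &full );
}

static int restore_affinity( const cpu_set_t *saved )
{
    return pthread_setaffinity_np( pthread_self(), sizeof( *saved ), saved );
}
#endif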
5350 #if OMP_40_ENABLED && (KMP_OS_WINDOWS || KMP_OS_LINUX)
5351 
5352 //
5353 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
5354 // It calculates the worker + master threads' partition based upon the parent
5355 // thread's partition, and binds each worker to a thread in their partition.
5356 // The master thread's partition should already include its current binding.
5357 //
5358 static void
5359 __kmp_partition_places( kmp_team_t *team )
5360 {
5361  //
5362  // Copy the master thread's place partition to the team struct
5363  //
5364  kmp_info_t *master_th = team->t.t_threads[0];
5365  KMP_DEBUG_ASSERT( master_th != NULL );
5366  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
5367  int first_place = master_th->th.th_first_place;
5368  int last_place = master_th->th.th_last_place;
5369  int masters_place = master_th->th.th_current_place;
5370  team->t.t_first_place = first_place;
5371  team->t.t_last_place = last_place;
5372 
5373  KA_TRACE( 20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) bound to place %d partition = [%d,%d]\n",
5374  proc_bind, __kmp_gtid_from_thread( team->t.t_threads[0] ), team->t.t_id,
5375  masters_place, first_place, last_place ) );
5376 
5377  switch ( proc_bind ) {
5378 
5379  case proc_bind_default:
5380  //
5381  // serial teams might have the proc_bind policy set to
5382  // proc_bind_default. It doesn't matter, as we don't
5383  // rebind the master thread for any proc_bind policy.
5384  //
5385  KMP_DEBUG_ASSERT( team->t.t_nproc == 1 );
5386  break;
5387 
5388  case proc_bind_master:
5389  {
5390  int f;
5391  int n_th = team->t.t_nproc;
5392  for ( f = 1; f < n_th; f++ ) {
5393  kmp_info_t *th = team->t.t_threads[f];
5394  KMP_DEBUG_ASSERT( th != NULL );
5395  th->th.th_first_place = first_place;
5396  th->th.th_last_place = last_place;
5397  th->th.th_new_place = masters_place;
5398 
5399  KA_TRACE( 100, ("__kmp_partition_places: master: T#%d(%d:%d) place %d partition = [%d,%d]\n",
5400  __kmp_gtid_from_thread( team->t.t_threads[f] ),
5401  team->t.t_id, f, masters_place, first_place, last_place ) );
5402  }
5403  }
5404  break;
5405 
5406  case proc_bind_close:
5407  {
5408  int f;
5409  int n_th = team->t.t_nproc;
5410  int n_places;
5411  if ( first_place <= last_place ) {
5412  n_places = last_place - first_place + 1;
5413  }
5414  else {
5415  n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
5416  }
5417  if ( n_th <= n_places ) {
5418  int place = masters_place;
5419  for ( f = 1; f < n_th; f++ ) {
5420  kmp_info_t *th = team->t.t_threads[f];
5421  KMP_DEBUG_ASSERT( th != NULL );
5422 
5423  if ( place == last_place ) {
5424  place = first_place;
5425  }
5426  else if ( place == __kmp_affinity_num_masks - 1) {
5427  place = 0;
5428  }
5429  else {
5430  place++;
5431  }
5432  th->th.th_first_place = first_place;
5433  th->th.th_last_place = last_place;
5434  th->th.th_new_place = place;
5435 
5436  KA_TRACE( 100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d partition = [%d,%d]\n",
5437  __kmp_gtid_from_thread( team->t.t_threads[f] ),
5438  team->t.t_id, f, place, first_place, last_place ) );
5439  }
5440  }
5441  else {
5442  int S, rem, gap, s_count;
5443  S = n_th / n_places;
5444  s_count = 0;
5445  rem = n_th - ( S * n_places );
5446  gap = rem > 0 ? n_places/rem : n_places;
5447  int place = masters_place;
5448  int gap_ct = gap;
5449  for ( f = 0; f < n_th; f++ ) {
5450  kmp_info_t *th = team->t.t_threads[f];
5451  KMP_DEBUG_ASSERT( th != NULL );
5452 
5453  th->th.th_first_place = first_place;
5454  th->th.th_last_place = last_place;
5455  th->th.th_new_place = place;
5456  s_count++;
5457 
5458  if ( (s_count == S) && rem && (gap_ct == gap) ) {
5459  // do nothing, add an extra thread to place on next iteration
5460  }
5461  else if ( (s_count == S+1) && rem && (gap_ct == gap) ) {
5462  // we added an extra thread to this place; move to next place
5463  if ( place == last_place ) {
5464  place = first_place;
5465  }
5466  else if ( place == __kmp_affinity_num_masks - 1) {
5467  place = 0;
5468  }
5469  else {
5470  place++;
5471  }
5472  s_count = 0;
5473  gap_ct = 1;
5474  rem--;
5475  }
5476  else if (s_count == S) { // place full; don't add extra
5477  if ( place == last_place ) {
5478  place = first_place;
5479  }
5480  else if ( place == __kmp_affinity_num_masks - 1) {
5481  place = 0;
5482  }
5483  else {
5484  place++;
5485  }
5486  gap_ct++;
5487  s_count = 0;
5488  }
5489 
5490  KA_TRACE( 100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d partition = [%d,%d]\n",
5491  __kmp_gtid_from_thread( team->t.t_threads[f] ),
5492  team->t.t_id, f, th->th.th_new_place, first_place,
5493  last_place ) );
5494  }
5495  KMP_DEBUG_ASSERT( place == masters_place );
5496  }
5497  }
5498  break;
5499 
5500  case proc_bind_spread:
5501  {
5502  int f;
5503  int n_th = team->t.t_nproc;
5504  int n_places;
5505  if ( first_place <= last_place ) {
5506  n_places = last_place - first_place + 1;
5507  }
5508  else {
5509  n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
5510  }
5511  if ( n_th <= n_places ) {
5512  int place = masters_place;
5513  int S = n_places/n_th;
5514  int s_count, rem, gap, gap_ct;
5515  rem = n_places - n_th*S;
5516  gap = rem ? n_th/rem : 1;
5517  gap_ct = gap;
5518  for ( f = 0; f < n_th; f++ ) {
5519  kmp_info_t *th = team->t.t_threads[f];
5520  KMP_DEBUG_ASSERT( th != NULL );
5521 
5522  th->th.th_first_place = place;
5523  th->th.th_new_place = place;
5524  s_count = 1;
5525  while (s_count < S) {
5526  if ( place == last_place ) {
5527  place = first_place;
5528  }
5529  else if ( place == __kmp_affinity_num_masks - 1) {
5530  place = 0;
5531  }
5532  else {
5533  place++;
5534  }
5535  s_count++;
5536  }
5537  if (rem && (gap_ct == gap)) {
5538  if ( place == last_place ) {
5539  place = first_place;
5540  }
5541  else if ( place == __kmp_affinity_num_masks - 1) {
5542  place = 0;
5543  }
5544  else {
5545  place++;
5546  }
5547  rem--;
5548  gap_ct = 0;
5549  }
5550  th->th.th_last_place = place;
5551  gap_ct++;
5552 
5553  if ( place == last_place ) {
5554  place = first_place;
5555  }
5556  else if ( place == __kmp_affinity_num_masks - 1) {
5557  place = 0;
5558  }
5559  else {
5560  place++;
5561  }
5562 
5563  KA_TRACE( 100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d partition = [%d,%d]\n",
5564  __kmp_gtid_from_thread( team->t.t_threads[f] ),
5565  team->t.t_id, f, th->th.th_new_place,
5566  th->th.th_first_place, th->th.th_last_place ) );
5567  }
5568  KMP_DEBUG_ASSERT( place == masters_place );
5569  }
5570  else {
5571  int S, rem, gap, s_count;
5572  S = n_th / n_places;
5573  s_count = 0;
5574  rem = n_th - ( S * n_places );
5575  gap = rem > 0 ? n_places/rem : n_places;
5576  int place = masters_place;
5577  int gap_ct = gap;
5578  for ( f = 0; f < n_th; f++ ) {
5579  kmp_info_t *th = team->t.t_threads[f];
5580  KMP_DEBUG_ASSERT( th != NULL );
5581 
5582  th->th.th_first_place = place;
5583  th->th.th_last_place = place;
5584  th->th.th_new_place = place;
5585  s_count++;
5586 
5587  if ( (s_count == S) && rem && (gap_ct == gap) ) {
5588  // do nothing, add an extra thread to place on next iteration
5589  }
5590  else if ( (s_count == S+1) && rem && (gap_ct == gap) ) {
5591  // we added an extra thread to this place; move on to next place
5592  if ( place == last_place ) {
5593  place = first_place;
5594  }
5595  else if ( place == __kmp_affinity_num_masks - 1) {
5596  place = 0;
5597  }
5598  else {
5599  place++;
5600  }
5601  s_count = 0;
5602  gap_ct = 1;
5603  rem--;
5604  }
5605  else if (s_count == S) { // place is full; don't add extra thread
5606  if ( place == last_place ) {
5607  place = first_place;
5608  }
5609  else if ( place == __kmp_affinity_num_masks - 1) {
5610  place = 0;
5611  }
5612  else {
5613  place++;
5614  }
5615  gap_ct++;
5616  s_count = 0;
5617  }
5618 
5619  KA_TRACE( 100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d partition = [%d,%d]\n",
5620  __kmp_gtid_from_thread( team->t.t_threads[f] ),
5621  team->t.t_id, f, th->th.th_new_place,
5622  th->th.th_first_place, th->th.th_last_place) );
5623  }
5624  KMP_DEBUG_ASSERT( place == masters_place );
5625  }
5626  }
5627  break;
5628 
5629  default:
5630  break;
5631  }
5632 
5633  KA_TRACE( 20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id ) );
5634 }
5635 
5636 #endif /* OMP_40_ENABLED && (KMP_OS_WINDOWS || KMP_OS_LINUX) */
5637 
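/*
   The thread-to-place distribution used by __kmp_partition_places above for
   proc_bind_close and proc_bind_spread when there are more threads than
   places, reduced to its arithmetic: each place gets S = n_th / n_places
   threads, and the rem = n_th % n_places leftover threads are handed out one
   extra at a time, every "gap" places.  Illustrative sketch only; counts[] is
   a stand-in for the per-thread place assignments the runtime records.
*/
#if 0
/* intended for the oversubscribed case, n_th >= n_places */
static void distribute_threads( int n_th, int n_places, int *counts /* [n_places] */ )
{
    int S      = n_th / n_places;
    int rem    = n_th - S * n_places;
    int gap    = rem > 0 ? n_places / rem : n_places;
    int gap_ct = gap;
    int place  = 0;
    int f;

    for ( f = 0; f < n_places; ++f )
        counts[ f ] = 0;

    for ( f = 0; f < n_th; ) {
        int take = S;
        if ( rem && gap_ct == gap ) {   /* this place absorbs one extra thread */
            take  += 1;
            rem   -= 1;
            gap_ct = 0;
        }
        counts[ place ] += take;
        f     += take;
        place  = ( place + 1 ) % n_places;
        gap_ct++;
    }
}
#endif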
5638 /* allocate a new team data structure to use. take one off of the free pool if available */
5639 kmp_team_t *
5640 __kmp_allocate_team( kmp_root_t *root, int new_nproc, int max_nproc,
5641 #if OMP_40_ENABLED
5642  kmp_proc_bind_t new_proc_bind,
5643 #endif
5644 #if OMP_30_ENABLED
5645  kmp_internal_control_t *new_icvs,
5646 #else
5647  int new_set_nproc, int new_set_dynamic, int new_set_nested,
5648  int new_set_blocktime, int new_bt_intervals, int new_bt_set,
5649 #endif
5650  int argc )
5651 {
5652  int f;
5653  kmp_team_t *team;
5654  char *ptr;
5655  size_t size;
5656 
5657  KA_TRACE( 20, ("__kmp_allocate_team: called\n"));
5658  KMP_DEBUG_ASSERT( new_nproc >=1 && argc >=0 );
5659  KMP_DEBUG_ASSERT( max_nproc >= new_nproc );
5660  KMP_MB();
5661 
5662  //
5663  // optimization to use a "hot" team for the top level,
5664  // as it is usually the same
5665  //
5666  if ( ! root->r.r_active && new_nproc > 1 ) {
5667 
5668  KMP_DEBUG_ASSERT( new_nproc == max_nproc );
5669 
5670  team = root -> r.r_hot_team;
5671 
5672 #if OMP_30_ENABLED && KMP_DEBUG
5673  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
5674  KA_TRACE( 20, ("__kmp_allocate_team: hot team task_team = %p before reinit\n",
5675  team -> t.t_task_team ));
5676  }
5677 #endif
5678 
5679  /* has the number of threads changed? */
5680  if( team -> t.t_nproc > new_nproc ) {
5681  KA_TRACE( 20, ("__kmp_allocate_team: decreasing hot team thread count to %d\n", new_nproc ));
5682 
5683 #if KMP_MIC
5684  team -> t.t_size_changed = 1;
5685 #endif
5686 #if OMP_30_ENABLED
5687  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
5688  kmp_task_team_t *task_team = team->t.t_task_team;
5689  if ( ( task_team != NULL ) && TCR_SYNC_4(task_team->tt.tt_active) ) {
5690  //
5691  // Signal the worker threads (esp. the extra ones) to stop
5692  // looking for tasks while spin waiting. The task teams
5693  // are reference counted and will be deallocated by the
5694  // last worker thread.
5695  //
5696  KMP_DEBUG_ASSERT( team->t.t_nproc > 1 );
5697  TCW_SYNC_4( task_team->tt.tt_active, FALSE );
5698  KMP_MB();
5699 
5700  KA_TRACE( 20, ( "__kmp_allocate_team: setting task_team %p to NULL\n",
5701  &team->t.t_task_team ) );
5702  team->t.t_task_team = NULL;
5703  }
5704  else {
5705  KMP_DEBUG_ASSERT( task_team == NULL );
5706  }
5707  }
5708 #endif // OMP_30_ENABLED
5709 
5710  /* release the extra threads we don't need any more */
5711  for( f = new_nproc ; f < team->t.t_nproc ; f++ ) {
5712  KMP_DEBUG_ASSERT( team->t.t_threads[ f ] );
5713  __kmp_free_thread( team->t.t_threads[ f ] );
5714  team -> t.t_threads[ f ] = NULL;
5715  }
5716 
5717  team -> t.t_nproc = new_nproc;
5718 #if OMP_30_ENABLED
5719  // TODO???: team -> t.t_max_active_levels = new_max_active_levels;
5720  team -> t.t_sched = new_icvs->sched;
5721 #endif
5722  __kmp_reinitialize_team( team, new_nproc,
5723 #if OMP_30_ENABLED
5724  new_icvs,
5725  root->r.r_uber_thread->th.th_ident
5726 #else
5727  new_set_nproc, new_set_dynamic, new_set_nested,
5728  new_set_blocktime, new_bt_intervals, new_bt_set
5729 #endif
5730  );
5731 
5732 #if OMP_30_ENABLED
5733  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
5734  kmp_task_team_t *task_team = team->t.t_task_team;
5735  if ( task_team != NULL ) {
5736  KMP_DEBUG_ASSERT( ! TCR_4(task_team->tt.tt_found_tasks) );
5737  task_team->tt.tt_nproc = new_nproc;
5738  task_team->tt.tt_unfinished_threads = new_nproc;
5739  task_team->tt.tt_ref_ct = new_nproc - 1;
5740  }
5741  }
5742 #endif
5743 
5744  /* update the remaining threads */
5745  for( f = 0 ; f < new_nproc ; f++ ) {
5746  team -> t.t_threads[ f ] -> th.th_team_nproc = team->t.t_nproc;
5747  }
5748 
5749 #if OMP_30_ENABLED
5750  // restore the current task state of the master thread: should be the implicit task
5751  KF_TRACE( 10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n",
5752  0, team->t.t_threads[0], team ) );
5753 
5754  __kmp_push_current_task_to_thread( team -> t.t_threads[ 0 ], team, 0 );
5755 #endif
5756 
5757 #ifdef KMP_DEBUG
5758  for ( f = 0; f < team->t.t_nproc; f++ ) {
5759  KMP_DEBUG_ASSERT( team->t.t_threads[f] &&
5760  team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc );
5761  }
5762 #endif
5763 
5764 #if OMP_40_ENABLED
5765  team->t.t_proc_bind = new_proc_bind;
5766 # if KMP_OS_WINDOWS || KMP_OS_LINUX
5767  __kmp_partition_places( team );
5768 # endif
5769 #endif
5770 
5771  }
5772  else if ( team -> t.t_nproc < new_nproc ) {
5773 #if KMP_OS_LINUX
5774  kmp_affin_mask_t *old_mask;
5775  if ( KMP_AFFINITY_CAPABLE() ) {
5776  KMP_CPU_ALLOC(old_mask);
5777  }
5778 #endif
5779 
5780  KA_TRACE( 20, ("__kmp_allocate_team: increasing hot team thread count to %d\n", new_nproc ));
5781 
5782 #if KMP_MIC
5783  team -> t.t_size_changed = 1;
5784 #endif
5785 
5786 
5787  if(team -> t.t_max_nproc < new_nproc) {
5788  /* reallocate larger arrays */
5789  __kmp_reallocate_team_arrays(team, new_nproc);
5790  __kmp_reinitialize_team( team, new_nproc,
5791 #if OMP_30_ENABLED
5792  new_icvs,
5793  NULL // TODO: !!!
5794 #else
5795  new_set_nproc, new_set_dynamic, new_set_nested,
5796  new_set_blocktime, new_bt_intervals, new_bt_set
5797 #endif
5798  );
5799  }
5800 
5801 #if KMP_OS_LINUX
5802  /* Temporarily set full mask for master thread before
5803  creation of workers. The reason is that workers inherit
5804  the affinity from the master, so if a lot of workers are
5805  created on a single core quickly, they don't get
5806  a chance to set their own affinity for a long time.
5807  */
5808  __kmp_set_thread_affinity_mask_full_tmp( old_mask );
5809 #endif
5810 
5811  /* allocate new threads for the hot team */
5812  for( f = team->t.t_nproc ; f < new_nproc ; f++ ) {
5813  kmp_info_t * new_worker = __kmp_allocate_thread( root, team, f );
5814  KMP_DEBUG_ASSERT( new_worker );
5815  team->t.t_threads[ f ] = new_worker;
5816  new_worker->th.th_team_nproc = team->t.t_nproc;
5817 
5818  KA_TRACE( 20, ("__kmp_allocate_team: team %d init T#%d arrived: join=%u, plain=%u\n",
5819  team->t.t_id, __kmp_gtid_from_tid( f, team ), team->t.t_id, f,
5820  team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5821  team->t.t_bar[bs_plain_barrier].b_arrived ) );
5822 
5823  { // Initialize barrier data for new threads.
5824  int b;
5825  kmp_balign_t * balign = new_worker->th.th_bar;
5826  for ( b = 0; b < bs_last_barrier; ++ b ) {
5827  balign[ b ].bb.b_arrived = team->t.t_bar[ b ].b_arrived;
5828  }
5829  }
5830  }
5831 
5832 #if KMP_OS_LINUX
5833  if ( KMP_AFFINITY_CAPABLE() ) {
5834  /* Restore initial master thread's affinity mask */
5835  __kmp_set_system_affinity( old_mask, TRUE );
5836  KMP_CPU_FREE(old_mask);
5837  }
5838 #endif
5839 
5840  /* make sure everyone is synchronized */
5841  __kmp_initialize_team( team, new_nproc,
5842 #if OMP_30_ENABLED
5843  new_icvs,
5844  root->r.r_uber_thread->th.th_ident
5845 #else
5846  new_set_nproc, new_set_dynamic, new_set_nested,
5847  new_set_blocktime, new_bt_intervals, new_bt_set
5848 #endif
5849  );
5850 
5851 #if OMP_30_ENABLED
5852  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
5853  kmp_task_team_t *task_team = team->t.t_task_team;
5854  if ( task_team != NULL ) {
5855  KMP_DEBUG_ASSERT( ! TCR_4(task_team->tt.tt_found_tasks) );
5856  task_team->tt.tt_nproc = new_nproc;
5857  task_team->tt.tt_unfinished_threads = new_nproc;
5858  task_team->tt.tt_ref_ct = new_nproc - 1;
5859  }
5860  }
5861 #endif
5862 
5863  /* reinitialize the old threads */
5864  for( f = 0 ; f < team->t.t_nproc ; f++ )
5865  __kmp_initialize_info( team->t.t_threads[ f ], team, f,
5866  __kmp_gtid_from_tid( f, team ) );
5867 #ifdef KMP_DEBUG
5868  for ( f = 0; f < team->t.t_nproc; ++ f ) {
5869  KMP_DEBUG_ASSERT( team->t.t_threads[f] &&
5870  team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc );
5871  }
5872 #endif
5873 
5874 #if OMP_40_ENABLED
5875  team->t.t_proc_bind = new_proc_bind;
5876 # if KMP_OS_WINDOWS || KMP_OS_LINUX
5877  __kmp_partition_places( team );
5878 # endif
5879 #endif
5880 
5881  }
5882  else {
5883  KA_TRACE( 20, ("__kmp_allocate_team: reusing hot team\n" ));
5884 #if KMP_MIC
5885  // This case can mean that omp_set_num_threads() was called and the hot team size
5886  // was already reduced, so we check the special flag
5887  if ( team -> t.t_size_changed == -1 ) {
5888  team -> t.t_size_changed = 1;
5889  } else {
5890  team -> t.t_size_changed = 0;
5891  }
5892 #endif
5893 
5894 #if OMP_30_ENABLED
5895  // TODO???: team -> t.t_max_active_levels = new_max_active_levels;
5896  team -> t.t_sched = new_icvs->sched;
5897 #endif
5898 
5899  __kmp_reinitialize_team( team, new_nproc,
5900 #if OMP_30_ENABLED
5901  new_icvs,
5902  root->r.r_uber_thread->th.th_ident
5903 #else
5904  new_set_nproc, new_set_dynamic, new_set_nested,
5905  new_set_blocktime, new_bt_intervals, new_bt_set
5906 #endif
5907  );
5908 
5909 #if OMP_30_ENABLED
5910  KF_TRACE( 10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n",
5911  0, team->t.t_threads[0], team ) );
5912  __kmp_push_current_task_to_thread( team -> t.t_threads[ 0 ], team, 0 );
5913 #endif
5914 
5915 #if OMP_40_ENABLED
5916 # if (KMP_OS_WINDOWS || KMP_OS_LINUX)
5917  if ( team->t.t_proc_bind == new_proc_bind ) {
5918  KA_TRACE( 200, ("__kmp_allocate_team: reusing hot team #%d bindings: proc_bind = %d, partition = [%d,%d]\n",
5919  team->t.t_id, new_proc_bind, team->t.t_first_place,
5920  team->t.t_last_place ) );
5921  }
5922  else {
5923  team->t.t_proc_bind = new_proc_bind;
5924  __kmp_partition_places( team );
5925  }
5926 # else
5927  if ( team->t.t_proc_bind != new_proc_bind ) {
5928  team->t.t_proc_bind = new_proc_bind;
5929  }
5930 # endif /* (KMP_OS_WINDOWS || KMP_OS_LINUX) */
5931 #endif /* OMP_40_ENABLED */
5932  }
5933 
5934  /* reallocate space for arguments if necessary */
5935  __kmp_alloc_argv_entries( argc, team, TRUE );
5936  team -> t.t_argc = argc;
5937  //
5938  // The hot team re-uses the previous task team,
5939  // if untouched during the previous release->gather phase.
5940  //
5941 
5942  KF_TRACE( 10, ( " hot_team = %p\n", team ) );
5943 
5944 #if OMP_30_ENABLED && KMP_DEBUG
5945  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
5946  KA_TRACE( 20, ("__kmp_allocate_team: hot team task_team = %p after reinit\n",
5947  team -> t.t_task_team ));
5948  }
5949 #endif
5950 
5951  KMP_MB();
5952 
5953  return team;
5954  }
5955 
5956  /* next, let's try to take one from the team pool */
5957  KMP_MB();
5958  for( team = (kmp_team_t*) __kmp_team_pool ; (team) ; )
5959  {
5960  /* TODO: consider resizing undersized teams instead of reaping them, now that we have a resizing mechanism */
5961  if ( team->t.t_max_nproc >= max_nproc ) {
5962  /* take this team from the team pool */
5963  __kmp_team_pool = team->t.t_next_pool;
5964 
5965  /* setup the team for fresh use */
5966  __kmp_initialize_team( team, new_nproc,
5967 #if OMP_30_ENABLED
5968  new_icvs,
5969  NULL // TODO: !!!
5970 #else
5971  new_set_nproc, new_set_dynamic, new_set_nested,
5972  new_set_blocktime, new_bt_intervals, new_bt_set
5973 #endif
5974  );
5975 
5976 #if OMP_30_ENABLED
5977  KA_TRACE( 20, ( "__kmp_allocate_team: setting task_team %p to NULL\n",
5978  &team->t.t_task_team ) );
5979  team -> t.t_task_team = NULL;
5980 #endif
5981 
5982  /* reallocate space for arguments if necessary */
5983  __kmp_alloc_argv_entries( argc, team, TRUE );
5984  team -> t.t_argc = argc;
5985 
5986  KA_TRACE( 20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5987  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE ));
5988  { // Initialize barrier data.
5989  int b;
5990  for ( b = 0; b < bs_last_barrier; ++ b) {
5991  team->t.t_bar[ b ].b_arrived = KMP_INIT_BARRIER_STATE;
5992  }
5993  }
5994 
5995 #if OMP_40_ENABLED
5996  team->t.t_proc_bind = new_proc_bind;
5997 #endif
5998 
5999  KA_TRACE( 20, ("__kmp_allocate_team: using team from pool %d.\n", team->t.t_id ));
6000  KMP_MB();
6001 
6002  return team;
6003  }
6004 
6005  /* reap team if it is too small, then loop back and check the next one */
6006  /* not sure if this is wise, but it will be redone during the hot-teams rewrite. */
6007  /* TODO: Use technique to find the right size hot-team, don't reap them */
6008  team = __kmp_reap_team( team );
6009  __kmp_team_pool = team;
6010  }
6011 
6012  /* nothing available in the pool, no matter, make a new team! */
6013  KMP_MB();
6014  team = (kmp_team_t*) __kmp_allocate( sizeof( kmp_team_t ) );
6015 
6016  /* and set it up */
6017  team -> t.t_max_nproc = max_nproc;
6018  /* NOTE well: for some reason, allocating one big buffer and dividing it
6019  * up seems to really hurt performance on the P4, so let's not use
6020  * this... */
6021  __kmp_allocate_team_arrays( team, max_nproc );
6022  __kmp_initialize_team( team, new_nproc,
6023 #if OMP_30_ENABLED
6024  new_icvs,
6025  NULL // TODO: !!!
6026 #else
6027  new_set_nproc, new_set_dynamic, new_set_nested,
6028  new_set_blocktime, new_bt_intervals, new_bt_set
6029 #endif
6030  );
6031 
6032 #if OMP_30_ENABLED
6033  KA_TRACE( 20, ( "__kmp_allocate_team: setting task_team %p to NULL\n",
6034  &team->t.t_task_team ) );
6035  team -> t.t_task_team = NULL; // to be removed, as __kmp_allocate zeroes memory, no need to duplicate
6036 #endif
6037 
6038  if ( __kmp_storage_map ) {
6039  __kmp_print_team_storage_map( "team", team, team->t.t_id, new_nproc );
6040  }
6041 
6042  /* allocate space for arguments */
6043  __kmp_alloc_argv_entries( argc, team, FALSE );
6044  team -> t.t_argc = argc;
6045 
6046  KA_TRACE( 20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
6047  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE ));
6048  { // Initialize barrier data.
6049  int b;
6050  for ( b = 0; b < bs_last_barrier; ++ b ) {
6051  team->t.t_bar[ b ].b_arrived = KMP_INIT_BARRIER_STATE;
6052  }
6053  }
6054 
6055 #if OMP_40_ENABLED
6056  team->t.t_proc_bind = new_proc_bind;
6057 #endif
6058 
6059  KMP_MB();
6060 
6061  KA_TRACE( 20, ("__kmp_allocate_team: done creating a new team %d.\n", team->t.t_id ));
6062 
6063  return team;
6064 }
6065 
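/*
   The team-pool scan in __kmp_allocate_team above, reduced to its list
   manipulation: walk the free pool, take the first entry big enough for the
   request, and reap (free) entries that are too small as they are encountered.
   Illustrative sketch only; pool_team_t and reap_one() are stand-ins for
   kmp_team_t and __kmp_reap_team().
*/
#if 0
#include <stdlib.h>

typedef struct pool_team {
    struct pool_team *next;
    int               max_nproc;
} pool_team_t;

static pool_team_t *team_pool;                   /* head of the free pool      */

static pool_team_t *reap_one( pool_team_t *t )   /* frees t, returns successor */
{
    pool_team_t *next = t->next;
    free( t );
    return next;
}

static pool_team_t *take_team_from_pool( int max_nproc )
{
    pool_team_t *t = team_pool;
    while ( t != NULL ) {
        if ( t->max_nproc >= max_nproc ) {
            team_pool = t->next;          /* unlink the head and hand it out   */
            return t;
        }
        t = team_pool = reap_one( t );    /* too small: reap and keep looking  */
    }
    return NULL;                          /* caller allocates a brand new team */
}
#endif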
6066 /* TODO implement hot-teams at all levels */
6067 /* TODO implement lazy thread release on demand (disband request) */
6068 
6069 /* free the team. return it to the team pool. release all the threads
6070  * associated with it */
6071 void
6072 __kmp_free_team( kmp_root_t *root, kmp_team_t *team )
6073 {
6074  int f;
6075  KA_TRACE( 20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(), team->t.t_id ));
6076 
6077  /* verify state */
6078  KMP_DEBUG_ASSERT( root );
6079  KMP_DEBUG_ASSERT( team );
6080  KMP_DEBUG_ASSERT( team->t.t_nproc <= team->t.t_max_nproc );
6081  KMP_DEBUG_ASSERT( team->t.t_threads );
6082 
6083  /* team is done working */
6084  TCW_SYNC_PTR(team->t.t_pkfn, NULL); // Important for Debugging Support Library.
6085  team -> t.t_copyin_counter = 0; // init counter for possible reuse
6086  // Do not reset pointer to parent team to NULL for hot teams.
6087 
6088  /* if we are a nested team, release our threads */
6089  if( team != root->r.r_hot_team ) {
6090 
6091 #if OMP_30_ENABLED
6092  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
6093  kmp_task_team_t *task_team = team->t.t_task_team;
6094  if ( task_team != NULL ) {
6095  //
6096  // Signal the worker threads to stop looking for tasks while
6097  // spin waiting. The task teams are reference counted and will
6098  // be deallocated by the last worker thread via the thread's
6099  // pointer to the task team.
6100  //
6101  KA_TRACE( 20, ( "__kmp_free_team: deactivating task_team %p\n",
6102  task_team ) );
6103  KMP_DEBUG_ASSERT( team->t.t_nproc > 1 );
6104  TCW_SYNC_4( task_team->tt.tt_active, FALSE );
6105  KMP_MB();
6106  team->t.t_task_team = NULL;
6107  }
6108  }
6109 #endif /* OMP_30_ENABLED */
6110 
6111  // Reset pointer to parent team only for non-hot teams.
6112  team -> t.t_parent = NULL;
6113 
6114 
6115  /* free the worker threads */
6116  for ( f = 1; f < team->t.t_nproc; ++ f ) {
6117  KMP_DEBUG_ASSERT( team->t.t_threads[ f ] );
6118  __kmp_free_thread( team->t.t_threads[ f ] );
6119  team->t.t_threads[ f ] = NULL;
6120  }
6121 
6122 
6123  /* put the team back in the team pool */
6124  /* TODO limit size of team pool, call reap_team if pool too large */
6125  team -> t.t_next_pool = (kmp_team_t*) __kmp_team_pool;
6126  __kmp_team_pool = (volatile kmp_team_t*) team;
6127  }
6128 
6129  KMP_MB();
6130 }
6131 
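/*
   A minimal sketch of the deactivate-and-let-the-last-user-free protocol used
   for task teams in __kmp_free_team / __kmp_allocate_team above: the master
   only clears the "active" flag, each worker drops its reference when it
   notices, and whichever thread drops the last reference frees the object.
   Illustrative only; C11 atomics stand in for the TCW_SYNC_4 and
   reference-count machinery in the runtime.
*/
#if 0
#include <stdatomic.h>
#include <stdlib.h>

typedef struct {
    atomic_int active;       /* workers poll this while spin-waiting           */
    atomic_int ref_ct;       /* one reference per worker that may still use it */
} mini_task_team_t;

static void master_deactivate( mini_task_team_t *tt )
{
    atomic_store( &tt->active, 0 );     /* signal the workers; do NOT free here */
}

static void worker_release( mini_task_team_t *tt )
{
    /* fetch_sub returns the previous value; whoever takes it to zero frees */
    if ( atomic_fetch_sub( &tt->ref_ct, 1 ) == 1 ) {
        free( tt );
    }
}
#endif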
6132 
6133 /* reap the team. destroy it, reclaim all its resources and free its memory */
6134 kmp_team_t *
6135 __kmp_reap_team( kmp_team_t *team )
6136 {
6137  kmp_team_t *next_pool = team -> t.t_next_pool;
6138 
6139  KMP_DEBUG_ASSERT( team );
6140  KMP_DEBUG_ASSERT( team -> t.t_dispatch );
6141  KMP_DEBUG_ASSERT( team -> t.t_disp_buffer );
6142  KMP_DEBUG_ASSERT( team -> t.t_threads );
6143  #if OMP_30_ENABLED
6144  #else
6145  KMP_DEBUG_ASSERT( team -> t.t_set_nproc );
6146  #endif
6147  KMP_DEBUG_ASSERT( team -> t.t_argv );
6148 
6149  /* TODO clean the threads that are a part of this? */
6150 
6151  /* free stuff */
6152 
6153  __kmp_free_team_arrays( team );
6154 #if (KMP_PERF_V106 == KMP_ON)
6155  if ( team -> t.t_argv != &team -> t.t_inline_argv[0] )
6156  __kmp_free( (void*) team -> t.t_argv );
6157 #else
6158  __kmp_free( (void*) team -> t.t_argv );
6159 #endif
6160  __kmp_free( team );
6161 
6162  KMP_MB();
6163  return next_pool;
6164 }
6165 
6166 //
6167 // Free the thread. Don't reap it, just place it on the pool of available
6168 // threads.
6169 //
6170 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
6171 // binding for the affinity mechanism to be useful.
6172 //
6173 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
6174 // However, we want to avoid a potential performance problem by always
6175 // scanning through the list to find the correct point at which to insert
6176 // the thread (potential N**2 behavior). To do this we keep track of the
6177 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
6178 // With single-level parallelism, threads will always be added to the tail
6179 // of the list, kept track of by __kmp_thread_pool_insert_pt. With nested
6180 // parallelism, all bets are off and we may need to scan through the entire
6181 // free list.
6182 //
6183 // This change also has a potentially large performance benefit, for some
6184 // applications. Previously, as threads were freed from the hot team, they
6185 // would be placed back on the free list in inverse order. If the hot team
6186 // grew back to its original size, then the freed thread would be placed
6187 // back on the hot team in reverse order. This could cause bad cache
6188 // locality problems on programs where the size of the hot team regularly
6189 // grew and shrunk.
6190 //
6191 // Now, for single-level parallelism, the OMP tid is always == gtid.
6192 //
6193 void
6194 __kmp_free_thread( kmp_info_t *this_th )
6195 {
6196  int gtid;
6197  kmp_info_t **scan;
6198 
6199  KA_TRACE( 20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
6200  __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid ));
6201 
6202  KMP_DEBUG_ASSERT( this_th );
6203 
6204 
6205  /* put thread back on the free pool */
6206  TCW_PTR(this_th->th.th_team, NULL);
6207  TCW_PTR(this_th->th.th_root, NULL);
6208  TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
6209 
6210  //
6211  // If the __kmp_thread_pool_insert_pt is already past the new insert
6212  // point, then we need to re-scan the entire list.
6213  //
6214  gtid = this_th->th.th_info.ds.ds_gtid;
6215  if ( __kmp_thread_pool_insert_pt != NULL ) {
6216  KMP_DEBUG_ASSERT( __kmp_thread_pool != NULL );
6217  if ( __kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid ) {
6218  __kmp_thread_pool_insert_pt = NULL;
6219  }
6220  }
6221 
6222  //
6223  // Scan down the list to find the place to insert the thread.
6224  // scan is the address of a link in the list, possibly the address of
6225  // __kmp_thread_pool itself.
6226  //
6227  // In the absence of nested parallelism, the for loop will have 0 iterations.
6228  //
6229  if ( __kmp_thread_pool_insert_pt != NULL ) {
6230  scan = &( __kmp_thread_pool_insert_pt->th.th_next_pool );
6231  }
6232  else {
6233  scan = (kmp_info_t **)&__kmp_thread_pool;
6234  }
6235  for (; ( *scan != NULL ) && ( (*scan)->th.th_info.ds.ds_gtid < gtid );
6236  scan = &( (*scan)->th.th_next_pool ) );
6237 
6238  //
6239  // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
6240  // to its address.
6241  //
6242  TCW_PTR(this_th->th.th_next_pool, *scan);
6243  __kmp_thread_pool_insert_pt = *scan = this_th;
6244  KMP_DEBUG_ASSERT( ( this_th->th.th_next_pool == NULL )
6245  || ( this_th->th.th_info.ds.ds_gtid
6246  < this_th->th.th_next_pool->th.th_info.ds.ds_gtid ) );
6247  TCW_4(this_th->th.th_in_pool, TRUE);
6248  __kmp_thread_pool_nth++;
6249 
6250  TCW_4(__kmp_nth, __kmp_nth - 1);
6251 
6252 #ifdef KMP_ADJUST_BLOCKTIME
6253  /* Adjust blocktime back to user setting or default if necessary */
6254  /* Middle initialization might never have occurred */
6255  if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
6256  KMP_DEBUG_ASSERT( __kmp_avail_proc > 0 );
6257  if ( __kmp_nth <= __kmp_avail_proc ) {
6258  __kmp_zero_bt = FALSE;
6259  }
6260  }
6261 #endif /* KMP_ADJUST_BLOCKTIME */
6262 
6263  KMP_MB();
6264 }
6265 
6266 void
6267 __kmp_join_barrier( int gtid )
6268 {
6269  register kmp_info_t *this_thr = __kmp_threads[ gtid ];
6270  register kmp_team_t *team;
6271  register kmp_uint nproc;
6272  kmp_info_t *master_thread;
6273  int tid;
6274  #ifdef KMP_DEBUG
6275  int team_id;
6276  #endif /* KMP_DEBUG */
6277 #if USE_ITT_BUILD
6278  void * itt_sync_obj = NULL;
6279  #if USE_ITT_NOTIFY
6280  if ( __itt_sync_create_ptr || KMP_ITT_DEBUG ) // don't call routine without need
6281  itt_sync_obj = __kmp_itt_barrier_object( gtid, bs_forkjoin_barrier ); // get object created at fork_barrier
6282  #endif
6283 #endif /* USE_ITT_BUILD */
6284 
6285  KMP_MB();
6286 
6287  /* get current info */
6288  team = this_thr -> th.th_team;
6289  /* nproc = team -> t.t_nproc;*/
6290  nproc = this_thr -> th.th_team_nproc;
6291  KMP_DEBUG_ASSERT( nproc == team->t.t_nproc );
6292  tid = __kmp_tid_from_gtid(gtid);
6293  #ifdef KMP_DEBUG
6294  team_id = team -> t.t_id;
6295  #endif /* KMP_DEBUG */
6296  /* master_thread = team -> t.t_threads[0];*/
6297  master_thread = this_thr -> th.th_team_master;
6298  #ifdef KMP_DEBUG
6299  if ( master_thread != team->t.t_threads[0] ) {
6300  __kmp_print_structure();
6301  }
6302  #endif /* KMP_DEBUG */
6303  KMP_DEBUG_ASSERT( master_thread == team->t.t_threads[0] );
6304  KMP_MB();
6305 
6306  /* verify state */
6307  KMP_DEBUG_ASSERT( __kmp_threads && __kmp_threads[gtid] );
6308  KMP_DEBUG_ASSERT( TCR_PTR(this_thr->th.th_team) );
6309  KMP_DEBUG_ASSERT( TCR_PTR(this_thr->th.th_root) );
6310  KMP_DEBUG_ASSERT( this_thr == team -> t.t_threads[tid] );
6311 
6312  KA_TRACE( 10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n",
6313  gtid, team_id, tid ));
6314 
6315 
6316  #if OMP_30_ENABLED
6317  if ( __kmp_tasking_mode == tskm_extra_barrier ) {
6318  __kmp_tasking_barrier( team, this_thr, gtid );
6319 
6320  KA_TRACE( 10, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n",
6321  gtid, team_id, tid ));
6322  }
6323  #ifdef KMP_DEBUG
6324  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
6325  KA_TRACE( 20, ( "__kmp_join_barrier: T#%d, old team = %d, old task_team = %p, th_task_team = %p\n",
6326  __kmp_gtid_from_thread( this_thr ), team_id, team -> t.t_task_team,
6327  this_thr->th.th_task_team ) );
6328  KMP_DEBUG_ASSERT( this_thr->th.th_task_team == team->t.t_task_team );
6329  }
6330  #endif /* KMP_DEBUG */
6331  #endif /* OMP_30_ENABLED */
6332 
6333  //
6334  // Copy the blocktime info to the thread, where __kmp_wait_sleep()
6335  // can access it when the team struct is not guaranteed to exist.
6336  //
6337  // Doing these loads causes a cache miss that slows down EPCC parallel by 2x.
6338  // As a workaround, we do not perform the copy if blocktime=infinite,
6339  // since the values are not used by __kmp_wait_sleep() in that case.
6340  //
6341  if ( __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ) {
6342  #if OMP_30_ENABLED
6343  this_thr -> th.th_team_bt_intervals = team -> t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
6344  this_thr -> th.th_team_bt_set = team -> t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
6345  #else
6346  this_thr -> th.th_team_bt_intervals = team -> t.t_set_bt_intervals[tid];
6347  this_thr -> th.th_team_bt_set= team -> t.t_set_bt_set[tid];
6348  #endif // OMP_30_ENABLED
6349  }
6350 
6351  #if KMP_OS_WINDOWS
6352  // AC: wait here until the monitor has started. This is a fix for CQ232808.
6353  // The reason is that if the library is loaded/unloaded in a loop with small (parallel)
6354  // work in between, then there is a high probability that the monitor thread starts after
6355  // the library shutdown. At shutdown it is too late to cope with the problem, because
6356  // when the master is in DllMain (process detach) the monitor has no chance to start
6357  // (it is blocked), and the master has no means to inform the monitor that the library has gone,
6358  // because all the memory which the monitor can access is going to be released/reset.
6359  //
6360  // The moment before barrier_gather is appropriate, because the master needs to
6361  // wait for all workers anyway, and we want this to happen as late as possible,
6362  // but before the shutdown, which may happen after the barrier.
6363  if( KMP_MASTER_TID( tid ) && TCR_4(__kmp_init_monitor) < 2 ) {
6364  __kmp_wait_sleep( this_thr, (volatile kmp_uint32*)&__kmp_init_monitor, 2, 0
6365  USE_ITT_BUILD_ARG( itt_sync_obj )
6366  );
6367  }
6368  #endif
6369 
6370 #if USE_ITT_BUILD
6371  if ( __itt_sync_create_ptr || KMP_ITT_DEBUG )
6372  __kmp_itt_barrier_starting( gtid, itt_sync_obj );
6373 #endif /* USE_ITT_BUILD */
6374 
6375  if ( __kmp_barrier_gather_pattern[ bs_forkjoin_barrier ] == bp_linear_bar || __kmp_barrier_gather_branch_bits[ bs_forkjoin_barrier ] == 0 ) {
6376  __kmp_linear_barrier_gather( bs_forkjoin_barrier, this_thr, gtid, tid, NULL
6377  USE_ITT_BUILD_ARG( itt_sync_obj )
6378  );
6379  } else if ( __kmp_barrier_gather_pattern[ bs_forkjoin_barrier ] == bp_tree_bar ) {
6380  __kmp_tree_barrier_gather( bs_forkjoin_barrier, this_thr, gtid, tid, NULL
6381  USE_ITT_BUILD_ARG( itt_sync_obj )
6382  );
6383  } else {
6384  __kmp_hyper_barrier_gather( bs_forkjoin_barrier, this_thr, gtid, tid, NULL
6385  USE_ITT_BUILD_ARG( itt_sync_obj )
6386  );
6387  }; // if
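  //
  // Editorial note (an assumption, not stated in this file): the pattern and
  // branch-bit arrays consulted above are filled in during initialization
  // (see the defaults set in __kmp_do_serial_initialize() later in this file
  // and __kmp_env_initialize()); a branch-bit value of 0 forces the linear
  // gather, otherwise the bits presumably select the branching factor of the
  // tree/hyper gather algorithms.
  //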
6388 
6389 #if USE_ITT_BUILD
6390  if ( __itt_sync_create_ptr || KMP_ITT_DEBUG )
6391  __kmp_itt_barrier_middle( gtid, itt_sync_obj );
6392 #endif /* USE_ITT_BUILD */
6393 
6394  //
6395  // From this point on, the team data structure may be deallocated
6396  // at any time by the master thread - it is unsafe to reference it
6397  // in any of the worker threads.
6398  //
6399  // Any per-team data items that need to be referenced before the end
6400  // of the barrier should be moved to the kmp_task_team_t structs.
6401  //
6402 
6403  #if OMP_30_ENABLED
6404  if ( KMP_MASTER_TID( tid ) ) {
6405  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
6406  // Master shouldn't call decrease_load(). // TODO: enable master threads.
6407  // Master should have th_may_decrease_load == 0. // TODO: enable master threads.
6408  __kmp_task_team_wait( this_thr, team
6409  USE_ITT_BUILD_ARG( itt_sync_obj )
6410  );
6411  }
6412  // Join barrier - report frame end
6413 #if USE_ITT_BUILD
6414  // Collect information only if the file was opened successfully.
6415  if( __kmp_forkjoin_frames_mode == 1 && __kmp_itt_csv_file )
6416  {
6417  ident_t * loc = this_thr->th.th_ident;
6418  if (loc) {
6419  // Use compiler-generated location to mark the frame:
6420  // "<func>$omp$frame@[file:]<line>[:<col>]"
6421  kmp_str_loc_t str_loc = __kmp_str_loc_init( loc->psource, 1 );
6422 
6423  kmp_uint64 fr_end;
6424 #if defined( __GNUC__ )
6425 # if !defined( __INTEL_COMPILER )
6426  fr_end = __kmp_hardware_timestamp();
6427 # else
6428  fr_end = __rdtsc();
6429 # endif
6430 #else
6431  fr_end = __rdtsc();
6432 #endif
6433  K_DIAG( 3, ( "__kmp_join_barrier: T#%d(%d:%d) frame_begin = %llu, frame_end = %llu\n",
6434  gtid, ( team != NULL ) ? team->t.t_id : -1, tid, this_thr->th.th_frame_time, fr_end ) );
6435 
6436  __kmp_str_buf_print( &__kmp_itt_frame_buffer, "%s$omp$frame@%s:%d:%d,%llu,%llu,,\n",
6437  str_loc.func, str_loc.file, str_loc.line, str_loc.col, this_thr->th.th_frame_time, fr_end );
6438 
6439  __kmp_str_loc_free( &str_loc );
6440  }
6441  }
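  // The line appended to __kmp_itt_frame_buffer above follows the format
  //   <func>$omp$frame@<file>:<line>:<col>,<frame_begin>,<frame_end>,,
  // e.g. (made-up values) "foo$omp$frame@bar.c:42:9,1234567,1240000,,".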
6442 #endif /* USE_ITT_BUILD */
6443  }
6444  #endif /* OMP_30_ENABLED */
6445 
6446  #if KMP_DEBUG
6447  if( KMP_MASTER_TID( tid )) {
6448  KA_TRACE( 15, ( "__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
6449  gtid, team_id, tid, nproc ));
6450  }
6451  #endif /* KMP_DEBUG */
6452 
6453  /* TODO now, mark worker threads as done so they may be disbanded */
6454 
6455  KMP_MB(); /* Flush all pending memory write invalidates. */
6456  KA_TRACE( 10, ("__kmp_join_barrier: T#%d(%d:%d) leaving\n",
6457  gtid, team_id, tid ));
6458 }
6459 
6460 
6461 /* TODO release worker threads' fork barriers as we are ready instead of all at once */
6462 
6463 void
6464 __kmp_fork_barrier( int gtid, int tid )
6465 {
6466  kmp_info_t *this_thr = __kmp_threads[ gtid ];
6467  kmp_team_t *team = ( tid == 0 ) ? this_thr -> th.th_team : NULL;
6468 #if USE_ITT_BUILD
6469  void * itt_sync_obj = NULL;
6470 #endif /* USE_ITT_BUILD */
6471 
6472  KA_TRACE( 10, ( "__kmp_fork_barrier: T#%d(%d:%d) has arrived\n",
6473  gtid, ( team != NULL ) ? team->t.t_id : -1, tid ));
6474 
6475  /* th_team pointer only valid for master thread here */
6476  if ( KMP_MASTER_TID( tid ) ) {
6477 
6478 #if USE_ITT_BUILD && USE_ITT_NOTIFY
6479  if ( __itt_sync_create_ptr || KMP_ITT_DEBUG ) {
6480  itt_sync_obj = __kmp_itt_barrier_object( gtid, bs_forkjoin_barrier, 1 ); // create itt barrier object
6481  //__kmp_itt_barrier_starting( gtid, itt_sync_obj ); // AC: no need to call prepare right before acquired
6482  __kmp_itt_barrier_middle( gtid, itt_sync_obj ); // call acquired / releasing
6483  }
6484 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
6485 
6486 #ifdef KMP_DEBUG
6487 
6488  register kmp_info_t **other_threads = team -> t.t_threads;
6489  register int i;
6490 
6491  /* verify state */
6492  KMP_MB();
6493 
6494  for( i = 1; i < team -> t.t_nproc ; i++ ) {
6495  KA_TRACE( 500, ( "__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork "
6496  "go == %u.\n",
6497  gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
6498  team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
6499  other_threads[i]->th.th_bar[ bs_forkjoin_barrier ].bb.b_go ) );
6500 
6501  KMP_DEBUG_ASSERT( ( TCR_4( other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go )
6502  & ~(KMP_BARRIER_SLEEP_STATE) )
6503  == KMP_INIT_BARRIER_STATE );
6504  KMP_DEBUG_ASSERT( other_threads[i]->th.th_team == team );
6505 
6506  }
6507 #endif
6508 
6509 #if OMP_30_ENABLED
6510  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
6511  __kmp_task_team_setup( this_thr, team );
6512  }
6513 #endif /* OMP_30_ENABLED */
6514 
6515  //
6516  // The master thread may have changed its blocktime between the
6517  // join barrier and the fork barrier.
6518  //
6519  // Copy the blocktime info to the thread, where __kmp_wait_sleep()
6520  // can access it when the team struct is not guaranteed to exist.
6521  //
6522  // See the note about the corresponding code in __kmp_join_barrier()
6523  // being performance-critical.
6524  //
6525  if ( __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ) {
6526 #if OMP_30_ENABLED
6527  this_thr -> th.th_team_bt_intervals = team -> t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
6528  this_thr -> th.th_team_bt_set = team -> t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
6529 #else
6530  this_thr -> th.th_team_bt_intervals = team -> t.t_set_bt_intervals[tid];
6531  this_thr -> th.th_team_bt_set= team -> t.t_set_bt_set[tid];
6532 #endif // OMP_30_ENABLED
6533  }
6534  } // master
6535 
6536  if ( __kmp_barrier_release_pattern[ bs_forkjoin_barrier ] == bp_linear_bar || __kmp_barrier_release_branch_bits[ bs_forkjoin_barrier ] == 0 ) {
6537  __kmp_linear_barrier_release( bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
6538  USE_ITT_BUILD_ARG( itt_sync_obj )
6539  );
6540  } else if ( __kmp_barrier_release_pattern[ bs_forkjoin_barrier ] == bp_tree_bar ) {
6541  __kmp_tree_barrier_release( bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
6542  USE_ITT_BUILD_ARG( itt_sync_obj )
6543  );
6544  } else {
6545  __kmp_hyper_barrier_release( bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
6546  USE_ITT_BUILD_ARG( itt_sync_obj )
6547  );
6548  }; // if
6549 
6550  //
6551  // early exit for reaping threads releasing forkjoin barrier
6552  //
6553  if ( TCR_4(__kmp_global.g.g_done) ) {
6554 
6555 #if OMP_30_ENABLED
6556  if ( this_thr->th.th_task_team != NULL ) {
6557  if ( KMP_MASTER_TID( tid ) ) {
6558  TCW_PTR(this_thr->th.th_task_team, NULL);
6559  }
6560  else {
6561  __kmp_unref_task_team( this_thr->th.th_task_team, this_thr );
6562  }
6563  }
6564 #endif /* OMP_30_ENABLED */
6565 
6566 #if USE_ITT_BUILD && USE_ITT_NOTIFY
6567  if ( __itt_sync_create_ptr || KMP_ITT_DEBUG ) {
6568  if ( !KMP_MASTER_TID( tid ) ) {
6569  itt_sync_obj = __kmp_itt_barrier_object( gtid, bs_forkjoin_barrier );
6570  if ( itt_sync_obj )
6571  __kmp_itt_barrier_finished( gtid, itt_sync_obj );
6572  }
6573  }
6574 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
6575  KA_TRACE( 10, ( "__kmp_fork_barrier: T#%d is leaving early\n", gtid ));
6576  return;
6577  }
6578 
6579  //
6580  // We can now assume that a valid team structure has been allocated
6581  // by the master and propagated to all worker threads.
6582  //
6583  // The current thread, however, may not be part of the team, so we can't
6584  // blindly assume that the team pointer is non-null.
6585  //
6586  team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
6587  KMP_DEBUG_ASSERT( team != NULL );
6588  tid = __kmp_tid_from_gtid( gtid );
6589 
6590 #if OMP_30_ENABLED
6591 
6592 # if KMP_BARRIER_ICV_PULL
6593  //
6594  // FIXME - after __kmp_fork_call() is modified to not look at the
6595  // master thread's implicit task ICV's, remove the ! KMP_MASTER_TID
6596  // restriction from this if condition.
6597  //
6598  if (! KMP_MASTER_TID( tid ) ) {
6599  //
6600  // Copy the initial ICV's from the team struct to the implicit task
6601  // for this tid.
6602  //
6603  __kmp_init_implicit_task( team->t.t_ident, team->t.t_threads[tid],
6604  team, tid, FALSE );
6605  load_icvs(&team->t.t_initial_icvs);
6606  store_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, &team->t.t_initial_icvs);
6607  sync_icvs();
6608  }
6609 # endif // KMP_BARRIER_ICV_PULL
6610 
6611  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
6612  __kmp_task_team_sync( this_thr, team );
6613  }
6614 
6615 #endif /* OMP_30_ENABLED */
6616 
6617 #if OMP_40_ENABLED && (KMP_OS_WINDOWS || KMP_OS_LINUX)
6618  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
6619  if ( proc_bind == proc_bind_intel ) {
6620 #endif
6621 #if KMP_MIC
6622  //
6623  // Call dynamic affinity settings
6624  //
6625  if( __kmp_affinity_type == affinity_balanced && team->t.t_size_changed ) {
6626  __kmp_balanced_affinity( tid, team->t.t_nproc );
6627  }
6628 #endif
6629 #if OMP_40_ENABLED && (KMP_OS_WINDOWS || KMP_OS_LINUX)
6630  }
6631  else if ( ( proc_bind != proc_bind_false )
6632  && ( proc_bind != proc_bind_disabled )) {
6633  if ( this_thr->th.th_new_place == this_thr->th.th_current_place ) {
6634  KA_TRACE( 100, ( "__kmp_fork_barrier: T#%d already in correct place %d\n",
6635  __kmp_gtid_from_thread( this_thr ), this_thr->th.th_current_place ) );
6636  }
6637  else {
6638  __kmp_affinity_set_place( gtid );
6639  }
6640  }
6641 #endif
6642 
6643 #if USE_ITT_BUILD && USE_ITT_NOTIFY
6644  if ( __itt_sync_create_ptr || KMP_ITT_DEBUG ) {
6645  if ( !KMP_MASTER_TID( tid ) ) {
6646  itt_sync_obj = __kmp_itt_barrier_object( gtid, bs_forkjoin_barrier ); // get correct barrier object
6647  __kmp_itt_barrier_finished( gtid, itt_sync_obj ); // workers call acquired
6648  } // (prepare called inside barrier_release)
6649  }
6650 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
6651  KA_TRACE( 10, ( "__kmp_fork_barrier: T#%d(%d:%d) is leaving\n",
6652  gtid, team->t.t_id, tid ));
6653 }
6654 
6655 
6656 /* ------------------------------------------------------------------------ */
6657 /* ------------------------------------------------------------------------ */
6658 
6659 void *
6660 __kmp_launch_thread( kmp_info_t *this_thr )
6661 {
6662  int gtid = this_thr->th.th_info.ds.ds_gtid;
6663 /* void *stack_data;*/
6664  kmp_team_t *(*volatile pteam);
6665 
6666  KMP_MB();
6667  KA_TRACE( 10, ("__kmp_launch_thread: T#%d start\n", gtid ) );
6668 
6669  if( __kmp_env_consistency_check ) {
6670  this_thr -> th.th_cons = __kmp_allocate_cons_stack( gtid ); // ATT: Memory leak?
6671  }
6672 
6673  /* This is the place where threads wait for work */
6674  while( ! TCR_4(__kmp_global.g.g_done) ) {
6675  KMP_DEBUG_ASSERT( this_thr == __kmp_threads[ gtid ] );
6676  KMP_MB();
6677 
6678  /* wait for work to do */
6679  KA_TRACE( 20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid ));
6680 
6681  /* No tid yet since not part of a team */
6682  __kmp_fork_barrier( gtid, KMP_GTID_DNE );
6683 
6684  pteam = (kmp_team_t *(*))(& this_thr->th.th_team);
6685 
6686  /* have we been allocated? */
6687  if ( TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done) ) {
6688  /* we were just woken up, so run our new task */
6689  if ( TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL ) {
6690  int rc;
6691  KA_TRACE( 20, ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
6692  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid), (*pteam)->t.t_pkfn ));
6693 
6694 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6695  if ( __kmp_inherit_fp_control && (*pteam)->t.t_fp_control_saved ) {
6696  __kmp_clear_x87_fpu_status_word();
6697  __kmp_load_x87_fpu_control_word( &(*pteam)->t.t_x87_fpu_control_word );
6698  __kmp_load_mxcsr( &(*pteam)->t.t_mxcsr );
6699  }
6700 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
6701 
6702  rc = (*pteam) -> t.t_invoke( gtid );
6703  KMP_ASSERT( rc );
6704 
6705  KMP_MB();
6706  KA_TRACE( 20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
6707  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid), (*pteam)->t.t_pkfn ));
6708  }
6709 
6710  /* join barrier after parallel region */
6711  __kmp_join_barrier( gtid );
6712  }
6713  }
6714  TCR_SYNC_PTR(__kmp_global.g.g_done);
6715 
6716 #if OMP_30_ENABLED
6717  if ( TCR_PTR( this_thr->th.th_task_team ) != NULL ) {
6718  __kmp_unref_task_team( this_thr->th.th_task_team, this_thr );
6719  }
6720 #endif /* OMP_30_ENABLED */
6721 
6722  /* run the destructors for the threadprivate data for this thread */
6723  __kmp_common_destroy_gtid( gtid );
6724 
6725  KA_TRACE( 10, ("__kmp_launch_thread: T#%d done\n", gtid ) );
6726  KMP_MB();
6727  return this_thr;
6728 }
6729 
6730 /* ------------------------------------------------------------------------ */
6731 /* ------------------------------------------------------------------------ */
6732 
6733 
6734 
6735 void
6736 __kmp_internal_end_dest( void *specific_gtid )
6737 {
6738  #ifdef __INTEL_COMPILER
6739  #pragma warning( push )
6740  #pragma warning( disable: 810 ) // conversion from "void *" to "int" may lose significant bits
6741  #endif
6742  // Make sure no significant bits are lost
6743  int gtid = (kmp_intptr_t)specific_gtid - 1;
6744  #ifdef __INTEL_COMPILER
6745  #pragma warning( pop )
6746  #endif
6747 
6748  KA_TRACE( 30, ("__kmp_internal_end_dest: T#%d\n", gtid));
6749  /* NOTE: the gtid is stored as gtid+1 in the thread-local-storage
6750  * this is because 0 is reserved for the nothing-stored case */
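 /* Illustrative sketch of that convention: a stored value of 0 means
  * "nothing was stored", so the value is keyed as gtid + 1 and decoded with
  *   int gtid = (int)(kmp_intptr_t)stored_value - 1;
  * exactly as done a few lines above (stored_value is a placeholder name). */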
6751 
6752  /* josh: One reason for setting the gtid specific data even when it is being
6753  destroyed by pthread is to allow gtid lookup through thread specific data
6754  (__kmp_gtid_get_specific). Some of the code, especially stat code,
6755  that gets executed in the call to __kmp_internal_end_thread, actually
6756  gets the gtid through the thread specific data. Setting it here seems
6757  rather inelegant and perhaps wrong, but allows __kmp_internal_end_thread
6758  to run smoothly.
6759  todo: get rid of this after we remove the dependence on
6760  __kmp_gtid_get_specific
6761  */
6762  if(gtid >= 0 && KMP_UBER_GTID(gtid))
6763  __kmp_gtid_set_specific( gtid );
6764  #ifdef KMP_TDATA_GTID
6765  __kmp_gtid = gtid;
6766  #endif
6767  __kmp_internal_end_thread( gtid );
6768 }
6769 
6770 #if KMP_OS_UNIX && GUIDEDLL_EXPORTS
6771 
6772 // 2009-09-08 (lev): It looks like the destructor does not work. In simple test cases destructors work
6773 // perfectly, but in the real libiomp5.so I have no evidence it is ever called. However, the -fini linker
6774 // option in makefile.mk works fine.
6775 
6776 __attribute__(( destructor ))
6777 void
6778 __kmp_internal_end_dtor( void )
6779 {
6780  __kmp_internal_end_atexit();
6781 }
6782 
6783 void
6784 __kmp_internal_end_fini( void )
6785 {
6786  __kmp_internal_end_atexit();
6787 }
6788 
6789 #endif
6790 
6791 /* [Windows] josh: when the atexit handler is called, there may still be more than one thread alive */
6792 void
6793 __kmp_internal_end_atexit( void )
6794 {
6795  KA_TRACE( 30, ( "__kmp_internal_end_atexit\n" ) );
6796  /* [Windows]
6797  josh: ideally, we want to completely shutdown the library in this atexit handler, but
6798  stat code that depends on thread specific data for gtid fails because that data becomes
6799  unavailable at some point during the shutdown, so we call __kmp_internal_end_thread
6800  instead. We should eventually remove the dependency on __kmp_get_specific_gtid in the
6801  stat code and use __kmp_internal_end_library to cleanly shutdown the library.
6802 
6803 // TODO: Can some of this comment about GVS be removed?
6804  I suspect that the offending stat code is executed when the calling thread tries to
6805  clean up a dead root thread's data structures, resulting in GVS code trying to close
6806  the GVS structures for that thread, but since the stat code uses
6807  __kmp_get_specific_gtid to get the gtid with the assumption that the calling thread is
6808  cleaning up itself instead of another thread, it gets confused. This happens because
6809  allowing a thread to unregister and cleanup another thread is a recent modification for
6810  addressing an issue with Maxon Cinema4D. Based on the current design (20050722), a
6811  thread may end up trying to unregister another thread only if thread death does not
6812  trigger the calling of __kmp_internal_end_thread. For Linux* OS, there is the thread
6813  specific data destructor function to detect thread death. For Windows dynamic, there
6814  is DllMain(THREAD_DETACH). For Windows static, there is nothing. Thus, the
6815  workaround is applicable only for Windows static stat library.
6816  */
6817  __kmp_internal_end_library( -1 );
6818  #if KMP_OS_WINDOWS
6819  __kmp_close_console();
6820  #endif
6821 }
6822 
6823 static void
6824 __kmp_reap_thread(
6825  kmp_info_t * thread,
6826  int is_root
6827 ) {
6828 
6829  // It is assumed __kmp_forkjoin_lock is acquired.
6830 
6831  int gtid;
6832 
6833  KMP_DEBUG_ASSERT( thread != NULL );
6834 
6835  gtid = thread->th.th_info.ds.ds_gtid;
6836 
6837  if ( ! is_root ) {
6838 
6839  if ( __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ) {
6840  /* Assume the threads are at the fork barrier here */
6841  KA_TRACE( 20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n", gtid ) );
6842  /* Need release fence here to prevent seg faults for tree forkjoin barrier (GEH) */
6843  __kmp_release(
6844  thread,
6845  &thread->th.th_bar[ bs_forkjoin_barrier ].bb.b_go,
6846  kmp_release_fence
6847  );
6848  }; // if
6849 
6850 
6851  // Terminate OS thread.
6852  __kmp_reap_worker( thread );
6853 
6854  //
6855  // The thread was killed asynchronously. If it was actively
6856  // spinning in the thread pool, decrement the global count.
6857  //
6858  // There is a small timing hole here - if the worker thread was
6859  // just waking up after sleeping in the pool and had reset its
6860  // th_active_in_pool flag but not decremented the global counter
6861  // __kmp_thread_pool_active_nth yet, then the global counter
6862  // might not get updated.
6863  //
6864  // Currently, this can only happen as the library is unloaded,
6865  // so there are no harmful side effects.
6866  //
6867  if ( thread->th.th_active_in_pool ) {
6868  thread->th.th_active_in_pool = FALSE;
6869  KMP_TEST_THEN_DEC32(
6870  (kmp_int32 *) &__kmp_thread_pool_active_nth );
6871  KMP_DEBUG_ASSERT( TCR_4(__kmp_thread_pool_active_nth) >= 0 );
6872  }
6873 
6874  // Decrement # of [worker] threads in the pool.
6875  KMP_DEBUG_ASSERT( __kmp_thread_pool_nth > 0 );
6876  --__kmp_thread_pool_nth;
6877  }; // if
6878 
6879  // Free the fast memory for tasking
6880  #if USE_FAST_MEMORY
6881  __kmp_free_fast_memory( thread );
6882  #endif /* USE_FAST_MEMORY */
6883 
6884  __kmp_suspend_uninitialize_thread( thread );
6885 
6886  KMP_DEBUG_ASSERT( __kmp_threads[ gtid ] == thread );
6887  TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
6888 
6889  -- __kmp_all_nth;
6890  // __kmp_nth was decremented when the thread was added to the pool.
6891 
6892 #ifdef KMP_ADJUST_BLOCKTIME
6893  /* Adjust blocktime back to user setting or default if necessary */
6894  /* Middle initialization might never have occurred */
6895  if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
6896  KMP_DEBUG_ASSERT( __kmp_avail_proc > 0 );
6897  if ( __kmp_nth <= __kmp_avail_proc ) {
6898  __kmp_zero_bt = FALSE;
6899  }
6900  }
6901 #endif /* KMP_ADJUST_BLOCKTIME */
6902 
6903  /* free the memory being used */
6904  if( __kmp_env_consistency_check ) {
6905  if ( thread->th.th_cons ) {
6906  __kmp_free_cons_stack( thread->th.th_cons );
6907  thread->th.th_cons = NULL;
6908  }; // if
6909  }
6910 
6911  if ( thread->th.th_pri_common != NULL ) {
6912  __kmp_free( thread->th.th_pri_common );
6913  thread->th.th_pri_common = NULL;
6914  }; // if
6915 
6916  #if KMP_USE_BGET
6917  if ( thread->th.th_local.bget_data != NULL ) {
6918  __kmp_finalize_bget( thread );
6919  }; // if
6920  #endif
6921 
6922 #if (KMP_OS_WINDOWS || KMP_OS_LINUX)
6923  if ( thread->th.th_affin_mask != NULL ) {
6924  KMP_CPU_FREE( thread->th.th_affin_mask );
6925  thread->th.th_affin_mask = NULL;
6926  }; // if
6927 #endif /* (KMP_OS_WINDOWS || KMP_OS_LINUX) */
6928 
6929  __kmp_reap_team( thread->th.th_serial_team );
6930  thread->th.th_serial_team = NULL;
6931  __kmp_free( thread );
6932 
6933  KMP_MB();
6934 
6935 } // __kmp_reap_thread
6936 
6937 static void
6938 __kmp_internal_end(void)
6939 {
6940  int i;
6941 
6942  /* First, unregister the library */
6943  __kmp_unregister_library();
6944 
6945  #if KMP_OS_WINDOWS
6946  /* In Win static library, we can't tell when a root actually dies, so we
6947  reclaim the data structures for any root threads that have died but not
6948  unregistered themselves, in order to shut down cleanly.
6949  In Win dynamic library we also can't tell when a thread dies.
6950  */
6951  __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of dead roots
6952  #endif
6953 
6954  for( i=0 ; i<__kmp_threads_capacity ; i++ )
6955  if( __kmp_root[i] )
6956  if( __kmp_root[i] -> r.r_active )
6957  break;
6958  KMP_MB(); /* Flush all pending memory write invalidates. */
6959  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6960 
6961  if ( i < __kmp_threads_capacity ) {
6962  // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
6963  KMP_MB(); /* Flush all pending memory write invalidates. */
6964 
6965  //
6966  // Need to check that monitor was initialized before reaping it.
6967  // If we are called from __kmp_atfork_child (which sets
6968  // __kmp_init_parallel = 0), then __kmp_monitor will appear to
6969  // contain valid data, but it is only valid in the parent process,
6970  // not the child.
6971  //
6972  // One of the possible fixes for CQ138434 / CQ140126
6973  // (used in 20091103_dreamworks patch)
6974  //
6975  // New behavior (201008): instead of keying off of the flag
6976  // __kmp_init_parallel, the monitor thread creation is keyed off
6977  // of the new flag __kmp_init_monitor.
6978  //
6979  __kmp_acquire_bootstrap_lock( & __kmp_monitor_lock );
6980  if ( TCR_4( __kmp_init_monitor ) ) {
6981  __kmp_reap_monitor( & __kmp_monitor );
6982  TCW_4( __kmp_init_monitor, 0 );
6983  }
6984  __kmp_release_bootstrap_lock( & __kmp_monitor_lock );
6985  KA_TRACE( 10, ("__kmp_internal_end: monitor reaped\n" ) );
6986  } else {
6987  /* TODO move this to cleanup code */
6988  #ifdef KMP_DEBUG
6989  /* make sure that everything has properly ended */
6990  for ( i = 0; i < __kmp_threads_capacity; i++ ) {
6991  if( __kmp_root[i] ) {
6992  KMP_ASSERT( ! KMP_UBER_GTID( i ) );
6993  KMP_ASSERT( ! __kmp_root[i] -> r.r_active );
6994  }
6995  }
6996  #endif
6997 
6998  KMP_MB();
6999 
7000  // Reap the worker threads.
7001  // This is valid for now, but be careful if threads are reaped sooner.
7002  while ( __kmp_thread_pool != NULL ) { // Loop through all the threads in the pool.
7003  // Get the next thread from the pool.
7004  kmp_info_t * thread = (kmp_info_t *) __kmp_thread_pool;
7005  __kmp_thread_pool = thread->th.th_next_pool;
7006  // Reap it.
7007  thread->th.th_next_pool = NULL;
7008  thread->th.th_in_pool = FALSE;
7009  __kmp_reap_thread( thread, 0 );
7010  }; // while
7011  __kmp_thread_pool_insert_pt = NULL;
7012 
7013  // Reap teams.
7014  while ( __kmp_team_pool != NULL ) { // Loop through all the teams in the pool.
7015  // Get the next team from the pool.
7016  kmp_team_t * team = (kmp_team_t *) __kmp_team_pool;
7017  __kmp_team_pool = team->t.t_next_pool;
7018  // Reap it.
7019  team->t.t_next_pool = NULL;
7020  __kmp_reap_team( team );
7021  }; // while
7022 
7023  #if OMP_30_ENABLED
7024  __kmp_reap_task_teams( );
7025  #endif /* OMP_30_ENABLED */
7026 
7027  for ( i = 0; i < __kmp_threads_capacity; ++ i ) {
7028  // TBD: Add some checking...
7029  // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
7030  }
7031 
7032  /* Make sure all threadprivate destructors get run by joining with all worker
7033  threads before resetting this flag */
7034  TCW_SYNC_4(__kmp_init_common, FALSE);
7035 
7036  KA_TRACE( 10, ("__kmp_internal_end: all workers reaped\n" ) );
7037  KMP_MB();
7038 
7039  //
7040  // See note above: One of the possible fixes for CQ138434 / CQ140126
7041  //
7042  // FIXME: push both code fragments down and CSE them?
7043  // push them into __kmp_cleanup() ?
7044  //
7045  __kmp_acquire_bootstrap_lock( & __kmp_monitor_lock );
7046  if ( TCR_4( __kmp_init_monitor ) ) {
7047  __kmp_reap_monitor( & __kmp_monitor );
7048  TCW_4( __kmp_init_monitor, 0 );
7049  }
7050  __kmp_release_bootstrap_lock( & __kmp_monitor_lock );
7051  KA_TRACE( 10, ("__kmp_internal_end: monitor reaped\n" ) );
7052 
7053  } /* else !__kmp_global.t_active */
7054  TCW_4(__kmp_init_gtid, FALSE);
7055  KMP_MB(); /* Flush all pending memory write invalidates. */
7056 
7057 
7058  __kmp_cleanup();
7059 }
7060 
7061 void
7062 __kmp_internal_end_library( int gtid_req )
7063 {
7064  int i;
7065 
7066  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
7067  /* this shouldn't be a race condition because __kmp_internal_end() is the
7068  * only place to clear __kmp_serial_init */
7069  /* we'll check this later too, after we get the lock */
7070  // 2009-09-06: We do not set g_abort without setting g_done. This check looks redundant,
7071  // because the next check will work in any case.
7072  if( __kmp_global.g.g_abort ) {
7073  KA_TRACE( 11, ("__kmp_internal_end_library: abort, exiting\n" ));
7074  /* TODO abort? */
7075  return;
7076  }
7077  if( TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial ) {
7078  KA_TRACE( 10, ("__kmp_internal_end_library: already finished\n" ));
7079  return;
7080  }
7081 
7082 
7083  KMP_MB(); /* Flush all pending memory write invalidates. */
7084 
7085  /* find out who we are and what we should do */
7086  {
7087  int gtid = (gtid_req>=0) ? gtid_req : __kmp_gtid_get_specific();
7088  KA_TRACE( 10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req ));
7089  if( gtid == KMP_GTID_SHUTDOWN ) {
7090  KA_TRACE( 10, ("__kmp_internal_end_library: !__kmp_init_runtime, system already shutdown\n" ));
7091  return;
7092  } else if( gtid == KMP_GTID_MONITOR ) {
7093  KA_TRACE( 10, ("__kmp_internal_end_library: monitor thread, gtid not registered, or system shutdown\n" ));
7094  return;
7095  } else if( gtid == KMP_GTID_DNE ) {
7096  KA_TRACE( 10, ("__kmp_internal_end_library: gtid not registered or system shutdown\n" ));
7097  /* we don't know who we are, but we may still shutdown the library */
7098  } else if( KMP_UBER_GTID( gtid )) {
7099  /* unregister ourselves as an uber thread. gtid is no longer valid */
7100  if( __kmp_root[gtid] -> r.r_active ) {
7101  __kmp_global.g.g_abort = -1;
7102  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
7103  KA_TRACE( 10, ("__kmp_internal_end_library: root still active, abort T#%d\n", gtid ));
7104  return;
7105  } else {
7106  KA_TRACE( 10, ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid ));
7107  __kmp_unregister_root_current_thread( gtid );
7108  }
7109  } else {
7110  /* worker threads may call this function through the atexit handler, if they call exit() */
7111  /* For now, skip the usual subsequent processing and just dump the debug buffer.
7112  TODO: do a thorough shutdown instead
7113  */
7114  #ifdef DUMP_DEBUG_ON_EXIT
7115  if ( __kmp_debug_buf )
7116  __kmp_dump_debug_buffer( );
7117  #endif
7118  return;
7119  }
7120  }
7121  /* synchronize the termination process */
7122  __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
7123 
7124  /* have we already finished */
7125  if( __kmp_global.g.g_abort ) {
7126  KA_TRACE( 10, ("__kmp_internal_end_library: abort, exiting\n" ));
7127  /* TODO abort? */
7128  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
7129  return;
7130  }
7131  if( TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial ) {
7132  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
7133  return;
7134  }
7135 
7136  /* We need this lock to enforce mutex between this reading of
7137  __kmp_threads_capacity and the writing by __kmp_register_root.
7138  Alternatively, we can use a counter of roots that is
7139  atomically updated by __kmp_get_global_thread_id_reg,
7140  __kmp_do_serial_initialize and __kmp_internal_end_*.
7141  */
7142  __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
7143 
7144  /* now we can safely conduct the actual termination */
7145  __kmp_internal_end();
7146 
7147  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
7148  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
7149 
7150  KA_TRACE( 10, ("__kmp_internal_end_library: exit\n" ) );
7151 
7152  #ifdef DUMP_DEBUG_ON_EXIT
7153  if ( __kmp_debug_buf )
7154  __kmp_dump_debug_buffer();
7155  #endif
7156 
7157  #if KMP_OS_WINDOWS
7158  __kmp_close_console();
7159  #endif
7160 
7161  __kmp_fini_allocator();
7162 
7163 } // __kmp_internal_end_library
7164 
7165 void
7166 __kmp_internal_end_thread( int gtid_req )
7167 {
7168  int i;
7169 
7170  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
7171  /* this shouldn't be a race condition because __kmp_internal_end() is the
7172  * only place to clear __kmp_serial_init */
7173  /* we'll check this later too, after we get the lock */
7174  // 2009-09-06: We do not set g_abort without setting g_done. This check looks redundant,
7175  // because the next check will work in any case.
7176  if( __kmp_global.g.g_abort ) {
7177  KA_TRACE( 11, ("__kmp_internal_end_thread: abort, exiting\n" ));
7178  /* TODO abort? */
7179  return;
7180  }
7181  if( TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial ) {
7182  KA_TRACE( 10, ("__kmp_internal_end_thread: already finished\n" ));
7183  return;
7184  }
7185 
7186  KMP_MB(); /* Flush all pending memory write invalidates. */
7187 
7188  /* find out who we are and what we should do */
7189  {
7190  int gtid = (gtid_req>=0) ? gtid_req : __kmp_gtid_get_specific();
7191  KA_TRACE( 10, ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req ));
7192  if( gtid == KMP_GTID_SHUTDOWN ) {
7193  KA_TRACE( 10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system already shutdown\n" ));
7194  return;
7195  } else if( gtid == KMP_GTID_MONITOR ) {
7196  KA_TRACE( 10, ("__kmp_internal_end_thread: monitor thread, gtid not registered, or system shutdown\n" ));
7197  return;
7198  } else if( gtid == KMP_GTID_DNE ) {
7199  KA_TRACE( 10, ("__kmp_internal_end_thread: gtid not registered or system shutdown\n" ));
7200  return;
7201  /* we don't know who we are */
7202  } else if( KMP_UBER_GTID( gtid )) {
7203  /* unregister ourselves as an uber thread. gtid is no longer valid */
7204  if( __kmp_root[gtid] -> r.r_active ) {
7205  __kmp_global.g.g_abort = -1;
7206  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
7207  KA_TRACE( 10, ("__kmp_internal_end_thread: root still active, abort T#%d\n", gtid ));
7208  return;
7209  } else {
7210  KA_TRACE( 10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n", gtid ));
7211  __kmp_unregister_root_current_thread( gtid );
7212  }
7213  } else {
7214  /* just a worker thread, let's leave */
7215  KA_TRACE( 10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid ));
7216 
7217  #if OMP_30_ENABLED
7218  if ( gtid >= 0 ) {
7219  kmp_info_t *this_thr = __kmp_threads[ gtid ];
7220  if (TCR_PTR(this_thr->th.th_task_team) != NULL) {
7221  __kmp_unref_task_team(this_thr->th.th_task_team, this_thr);
7222  }
7223  }
7224  #endif /* OMP_30_ENABLED */
7225 
7226  KA_TRACE( 10, ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n", gtid ));
7227  return;
7228  }
7229  }
7230  #if defined GUIDEDLL_EXPORTS
7231  // AC: let's not shut down the Linux* OS dynamic library at the exit of an uber thread,
7232  // because it is better to shut down later in the library destructor.
7233  // The reason for this change is a performance problem when a non-OpenMP thread
7234  // in a loop forks and joins many OpenMP threads. We can save a lot of time by
7235  // keeping worker threads alive until the program shutdown.
7236  // OM: Removed Linux* OS restriction to fix the crash on OS X* (DPD200239966) and
7237  // Windows(DPD200287443) that occurs when using critical sections from foreign threads.
7238  KA_TRACE( 10, ("__kmp_internal_end_thread: exiting\n") );
7239  return;
7240  #endif
7241  /* synchronize the termination process */
7242  __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
7243 
7244  /* have we already finished */
7245  if( __kmp_global.g.g_abort ) {
7246  KA_TRACE( 10, ("__kmp_internal_end_thread: abort, exiting\n" ));
7247  /* TODO abort? */
7248  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
7249  return;
7250  }
7251  if( TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial ) {
7252  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
7253  return;
7254  }
7255 
7256  /* We need this lock to enforce mutex between this reading of
7257  __kmp_threads_capacity and the writing by __kmp_register_root.
7258  Alternatively, we can use a counter of roots that is
7259  atomically updated by __kmp_get_global_thread_id_reg,
7260  __kmp_do_serial_initialize and __kmp_internal_end_*.
7261  */
7262 
7263  /* should we finish the run-time? are all siblings done? */
7264  __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
7265 
7266  for ( i = 0; i < __kmp_threads_capacity; ++ i ) {
7267  if ( KMP_UBER_GTID( i ) ) {
7268  KA_TRACE( 10, ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i ));
7269  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
7270  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
7271  return;
7272  };
7273  }
7274 
7275  /* now we can safely conduct the actual termination */
7276 
7277  __kmp_internal_end();
7278 
7279  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
7280  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
7281 
7282  KA_TRACE( 10, ("__kmp_internal_end_thread: exit\n" ) );
7283 
7284  #ifdef DUMP_DEBUG_ON_EXIT
7285  if ( __kmp_debug_buf )
7286  __kmp_dump_debug_buffer();
7287  #endif
7288 } // __kmp_internal_end_thread
7289 
7290 // -------------------------------------------------------------------------------------------------
7291 // Library registration stuff.
7292 
7293 static long __kmp_registration_flag = 0;
7294  // Random value used to indicate library initialization.
7295 static char * __kmp_registration_str = NULL;
7296  // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
7297 
7298 
7299 static inline
7300 char *
7301 __kmp_reg_status_name() {
7302  /*
7303  On RHEL 3u5 if linked statically, getpid() returns different values in each thread.
7304  If registration and unregistration go in different threads (omp_misc_other_root_exit.cpp test case),
7305  the name of the registered_lib_env env var cannot be found, because the name will contain a different pid.
7306  */
7307  return __kmp_str_format( "__KMP_REGISTERED_LIB_%d", (int) getpid() );
7308 } // __kmp_reg_status_name
7309 
7310 
7311 void
7312 __kmp_register_library_startup(
7313  void
7314 ) {
7315 
7316  char * name = __kmp_reg_status_name(); // Name of the environment variable.
7317  int done = 0;
7318  union {
7319  double dtime;
7320  long ltime;
7321  } time;
7322  #if KMP_OS_WINDOWS
7323  __kmp_initialize_system_tick();
7324  #endif
7325  __kmp_read_system_time( & time.dtime );
7326  __kmp_registration_flag = 0xCAFE0000L | ( time.ltime & 0x0000FFFFL );
7327  __kmp_registration_str =
7328  __kmp_str_format(
7329  "%p-%lx-%s",
7330  & __kmp_registration_flag,
7331  __kmp_registration_flag,
7332  KMP_LIBRARY_FILE
7333  );
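  // For illustration only (made-up values): with KMP_LIBRARY_FILE being, say,
  // "libiomp5.so", a flag address of 0x7f12a4c03040 and a flag value of
  // 0xcafe1234, the variable would read
  //   __KMP_REGISTERED_LIB_<pid>=0x7f12a4c03040-cafe1234-libiomp5.so
  // i.e. "<flag address>-<flag value in hex>-<library file>".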
7334 
7335  KA_TRACE( 50, ( "__kmp_register_library_startup: %s=\"%s\"\n", name, __kmp_registration_str ) );
7336 
7337  while ( ! done ) {
7338 
7339  char * value = NULL; // Actual value of the environment variable.
7340 
7341  // Set the environment variable, but do not overwrite it if it already exists.
7342  __kmp_env_set( name, __kmp_registration_str, 0 );
7343  // Check that the variable was written.
7344  value = __kmp_env_get( name );
7345  if ( value != NULL && strcmp( value, __kmp_registration_str ) == 0 ) {
7346 
7347  done = 1; // Ok, environment variable set successfully, exit the loop.
7348 
7349  } else {
7350 
7351  // Oops. Write failed. Another copy of OpenMP RTL is in memory.
7352  // Check whether it is alive or dead.
7353  int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
7354  char * tail = value;
7355  char * flag_addr_str = NULL;
7356  char * flag_val_str = NULL;
7357  char const * file_name = NULL;
7358  __kmp_str_split( tail, '-', & flag_addr_str, & tail );
7359  __kmp_str_split( tail, '-', & flag_val_str, & tail );
7360  file_name = tail;
7361  if ( tail != NULL ) {
7362  long * flag_addr = 0;
7363  long flag_val = 0;
7364  sscanf( flag_addr_str, "%p", & flag_addr );
7365  sscanf( flag_val_str, "%lx", & flag_val );
7366  if ( flag_addr != 0 && flag_val != 0 && strcmp( file_name, "" ) != 0 ) {
7367  // First, check whether environment-encoded address is mapped into addr space.
7368  // If so, dereference it to see if it still has the right value.
7369 
7370  if ( __kmp_is_address_mapped( flag_addr ) && * flag_addr == flag_val ) {
7371  neighbor = 1;
7372  } else {
7373  // If not, then we know the other copy of the library is no longer running.
7374  neighbor = 2;
7375  }; // if
7376  }; // if
7377  }; // if
7378  switch ( neighbor ) {
7379  case 0 : // Cannot parse environment variable -- neighbor status unknown.
7380  // Assume it is the incompatible format of a future version of the library.
7381  // Assume the other library is alive.
7382  // WARN( ... ); // TODO: Issue a warning.
7383  file_name = "unknown library";
7384  // Attention! Falling through to the next case. That's intentional.
7385  case 1 : { // Neighbor is alive.
7386  // Check it is allowed.
7387  char * duplicate_ok = __kmp_env_get( "KMP_DUPLICATE_LIB_OK" );
7388  if ( ! __kmp_str_match_true( duplicate_ok ) ) {
7389  // That's not allowed. Issue fatal error.
7390  __kmp_msg(
7391  kmp_ms_fatal,
7392  KMP_MSG( DuplicateLibrary, KMP_LIBRARY_FILE, file_name ),
7393  KMP_HNT( DuplicateLibrary ),
7394  __kmp_msg_null
7395  );
7396  }; // if
7397  KMP_INTERNAL_FREE( duplicate_ok );
7398  __kmp_duplicate_library_ok = 1;
7399  done = 1; // Exit the loop.
7400  } break;
7401  case 2 : { // Neighbor is dead.
7402  // Clear the variable and try to register library again.
7403  __kmp_env_unset( name );
7404  } break;
7405  default : {
7406  KMP_DEBUG_ASSERT( 0 );
7407  } break;
7408  }; // switch
7409 
7410  }; // if
7411  KMP_INTERNAL_FREE( (void *) value );
7412 
7413  }; // while
7414  KMP_INTERNAL_FREE( (void *) name );
7415 
7416 } // func __kmp_register_library_startup
7417 
7418 
7419 void
7420 __kmp_unregister_library( void ) {
7421 
7422  char * name = __kmp_reg_status_name();
7423  char * value = __kmp_env_get( name );
7424 
7425  KMP_DEBUG_ASSERT( __kmp_registration_flag != 0 );
7426  KMP_DEBUG_ASSERT( __kmp_registration_str != NULL );
7427  if ( value != NULL && strcmp( value, __kmp_registration_str ) == 0 ) {
7428  // Ok, this is our variable. Delete it.
7429  __kmp_env_unset( name );
7430  }; // if
7431 
7432  KMP_INTERNAL_FREE( __kmp_registration_str );
7433  KMP_INTERNAL_FREE( value );
7434  KMP_INTERNAL_FREE( name );
7435 
7436  __kmp_registration_flag = 0;
7437  __kmp_registration_str = NULL;
7438 
7439 } // __kmp_unregister_library
7440 
7441 
7442 // End of Library registration stuff.
7443 // -------------------------------------------------------------------------------------------------
7444 
7445 static void
7446 __kmp_do_serial_initialize( void )
7447 {
7448  int i, gtid;
7449  int size;
7450 
7451  KA_TRACE( 10, ("__kmp_serial_initialize: enter\n" ) );
7452 
7453  KMP_DEBUG_ASSERT( sizeof( kmp_int32 ) == 4 );
7454  KMP_DEBUG_ASSERT( sizeof( kmp_uint32 ) == 4 );
7455  KMP_DEBUG_ASSERT( sizeof( kmp_int64 ) == 8 );
7456  KMP_DEBUG_ASSERT( sizeof( kmp_uint64 ) == 8 );
7457  KMP_DEBUG_ASSERT( sizeof( kmp_intptr_t ) == sizeof( void * ) );
7458 
7459  __kmp_validate_locks();
7460 
7461  /* Initialize internal memory allocator */
7462  __kmp_init_allocator();
7463 
7464  /* Register the library startup via an environment variable
7465  and check to see whether another copy of the library is already
7466  registered. */
7467 
7468  __kmp_register_library_startup( );
7469 
7470  /* TODO reinitialization of library */
7471  if( TCR_4(__kmp_global.g.g_done) ) {
7472  KA_TRACE( 10, ("__kmp_do_serial_initialize: reinitialization of library\n" ) );
7473  }
7474 
7475  __kmp_global.g.g_abort = 0;
7476  TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
7477 
7478  /* initialize the locks */
7479 #if KMP_USE_ADAPTIVE_LOCKS
7480 #if KMP_DEBUG_ADAPTIVE_LOCKS
7481  __kmp_init_speculative_stats();
7482 #endif
7483 #endif
7484  __kmp_init_lock( & __kmp_global_lock );
7485  __kmp_init_queuing_lock( & __kmp_dispatch_lock );
7486  __kmp_init_lock( & __kmp_debug_lock );
7487  __kmp_init_atomic_lock( & __kmp_atomic_lock );
7488  __kmp_init_atomic_lock( & __kmp_atomic_lock_1i );
7489  __kmp_init_atomic_lock( & __kmp_atomic_lock_2i );
7490  __kmp_init_atomic_lock( & __kmp_atomic_lock_4i );
7491  __kmp_init_atomic_lock( & __kmp_atomic_lock_4r );
7492  __kmp_init_atomic_lock( & __kmp_atomic_lock_8i );
7493  __kmp_init_atomic_lock( & __kmp_atomic_lock_8r );
7494  __kmp_init_atomic_lock( & __kmp_atomic_lock_8c );
7495  __kmp_init_atomic_lock( & __kmp_atomic_lock_10r );
7496  __kmp_init_atomic_lock( & __kmp_atomic_lock_16r );
7497  __kmp_init_atomic_lock( & __kmp_atomic_lock_16c );
7498  __kmp_init_atomic_lock( & __kmp_atomic_lock_20c );
7499  __kmp_init_atomic_lock( & __kmp_atomic_lock_32c );
7500  __kmp_init_bootstrap_lock( & __kmp_forkjoin_lock );
7501  __kmp_init_bootstrap_lock( & __kmp_exit_lock );
7502  __kmp_init_bootstrap_lock( & __kmp_monitor_lock );
7503  __kmp_init_bootstrap_lock( & __kmp_tp_cached_lock );
7504 
7505  /* conduct initialization and initial setup of configuration */
7506 
7507  __kmp_runtime_initialize();
7508 
7509  // Some global variable initialization moved here from kmp_env_initialize()
7510 #ifdef KMP_DEBUG
7511  kmp_diag = 0;
7512 #endif
7513  __kmp_abort_delay = 0;
7514 
7515  // From __kmp_init_dflt_team_nth()
7516  /* assume the entire machine will be used */
7517  __kmp_dflt_team_nth_ub = __kmp_xproc;
7518  if( __kmp_dflt_team_nth_ub < KMP_MIN_NTH ) {
7519  __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
7520  }
7521  if( __kmp_dflt_team_nth_ub > __kmp_sys_max_nth ) {
7522  __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
7523  }
7524  __kmp_max_nth = __kmp_sys_max_nth;
7525  __kmp_threads_capacity = __kmp_initial_threads_capacity( __kmp_dflt_team_nth_ub );
7526 
7527  // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME" part
7528  __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
7529  __kmp_monitor_wakeups = KMP_WAKEUPS_FROM_BLOCKTIME( __kmp_dflt_blocktime, __kmp_monitor_wakeups );
7530  __kmp_bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME( __kmp_dflt_blocktime, __kmp_monitor_wakeups );
7531  // From "KMP_LIBRARY" part of __kmp_env_initialize()
7532  __kmp_library = library_throughput;
7533  // From KMP_SCHEDULE initialization
7534  __kmp_static = kmp_sch_static_balanced;
7535  // AC: do not use analytical here, because it is non-monotonic
7536  //__kmp_guided = kmp_sch_guided_iterative_chunked;
7537  #if OMP_30_ENABLED
7538  //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no need to repeat the assignment
7539  #endif // OMP_30_ENABLED
7540  // Barrier initialization. Moved here from the __kmp_env_initialize() barrier branch bit control and barrier
7541  // method control parts
7542  #if KMP_FAST_REDUCTION_BARRIER
7543  #define kmp_reduction_barrier_gather_bb ((int)1)
7544  #define kmp_reduction_barrier_release_bb ((int)1)
7545  #define kmp_reduction_barrier_gather_pat bp_hyper_bar
7546  #define kmp_reduction_barrier_release_pat bp_hyper_bar
7547  #endif // KMP_FAST_REDUCTION_BARRIER
7548  for ( i=bs_plain_barrier; i<bs_last_barrier; i++ ) {
7549  __kmp_barrier_gather_branch_bits [ i ] = __kmp_barrier_gather_bb_dflt;
7550  __kmp_barrier_release_branch_bits[ i ] = __kmp_barrier_release_bb_dflt;
7551  __kmp_barrier_gather_pattern [ i ] = __kmp_barrier_gather_pat_dflt;
7552  __kmp_barrier_release_pattern[ i ] = __kmp_barrier_release_pat_dflt;
7553  #if KMP_FAST_REDUCTION_BARRIER
7554  if( i == bs_reduction_barrier ) { // tested and confirmed on ALTIX only ( lin_64 ): hyper,1
7555  __kmp_barrier_gather_branch_bits [ i ] = kmp_reduction_barrier_gather_bb;
7556  __kmp_barrier_release_branch_bits[ i ] = kmp_reduction_barrier_release_bb;
7557  __kmp_barrier_gather_pattern [ i ] = kmp_reduction_barrier_gather_pat;
7558  __kmp_barrier_release_pattern[ i ] = kmp_reduction_barrier_release_pat;
7559  }
7560  #endif // KMP_FAST_REDUCTION_BARRIER
7561  }
7562  #if KMP_FAST_REDUCTION_BARRIER
7563  #undef kmp_reduction_barrier_release_pat
7564  #undef kmp_reduction_barrier_gather_pat
7565  #undef kmp_reduction_barrier_release_bb
7566  #undef kmp_reduction_barrier_gather_bb
7567  #endif // KMP_FAST_REDUCTION_BARRIER
7568  #if KMP_MIC
7569  // AC: plain=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
7570  __kmp_barrier_gather_branch_bits [ 0 ] = 3; // plain gather
7571  __kmp_barrier_release_branch_bits[ 1 ] = 1; // forkjoin release
7572  #endif
7573 
7574  // From KMP_CHECKS initialization
7575 #ifdef KMP_DEBUG
7576  __kmp_env_checks = TRUE; /* development versions have the extra checks */
7577 #else
7578  __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
7579 #endif
7580 
7581  // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
7582  __kmp_foreign_tp = TRUE;
7583 
7584  __kmp_global.g.g_dynamic = FALSE;
7585  __kmp_global.g.g_dynamic_mode = dynamic_default;
7586 
7587  __kmp_env_initialize( NULL );
7588  // Print all messages in message catalog for testing purposes.
7589  #ifdef KMP_DEBUG
7590  char const * val = __kmp_env_get( "KMP_DUMP_CATALOG" );
7591  if ( __kmp_str_match_true( val ) ) {
7592  kmp_str_buf_t buffer;
7593  __kmp_str_buf_init( & buffer );
7594  __kmp_i18n_dump_catalog( buffer );
7595  __kmp_printf( "%s", buffer.str );
7596  __kmp_str_buf_free( & buffer );
7597  }; // if
7598  __kmp_env_free( & val );
7599  #endif
7600 
7601  // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
7602  __kmp_tp_capacity = __kmp_default_tp_capacity(__kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
7603 
7604  // omalyshe: This initialization would override the env var setting.
7605  //__kmp_load_balance_interval = 1.0;
7606 
7607  // If the library is shut down properly, both pools must be NULL. Just in case, set them
7608  // to NULL -- some memory may leak, but subsequent code will work even if pools are not freed.
7609  KMP_DEBUG_ASSERT( __kmp_thread_pool == NULL );
7610  KMP_DEBUG_ASSERT( __kmp_thread_pool_insert_pt == NULL );
7611  KMP_DEBUG_ASSERT( __kmp_team_pool == NULL );
7612  __kmp_thread_pool = NULL;
7613  __kmp_thread_pool_insert_pt = NULL;
7614  __kmp_team_pool = NULL;
7615 
7616  /* Allocate all of the variable sized records */
7617  /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are expandable */
7618  /* Since allocation is cache-aligned, just add extra padding at the end */
7619  size = (sizeof(kmp_info_t*) + sizeof(kmp_root_t*))*__kmp_threads_capacity + CACHE_LINE;
7620  __kmp_threads = (kmp_info_t**) __kmp_allocate( size );
7621  __kmp_root = (kmp_root_t**) ((char*)__kmp_threads + sizeof(kmp_info_t*) * __kmp_threads_capacity );
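  /* Illustrative layout of the single allocation above (a sketch; actual sizes
   depend on the build):

       __kmp_threads                              __kmp_root
       |                                          |
       v                                          v
       [ capacity * sizeof(kmp_info_t*) | capacity * sizeof(kmp_root_t*) | CACHE_LINE pad ]
  */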
7622 
7623  /* init thread counts */
7624  KMP_DEBUG_ASSERT( __kmp_all_nth == 0 ); // Asserts fail if the library is reinitializing and
7625  KMP_DEBUG_ASSERT( __kmp_nth == 0 ); // something was wrong in termination.
7626  __kmp_all_nth = 0;
7627  __kmp_nth = 0;
7628 
7629  /* setup the uber master thread and hierarchy */
7630  gtid = __kmp_register_root( TRUE );
7631  KA_TRACE( 10, ("__kmp_do_serial_initialize T#%d\n", gtid ));
7632  KMP_ASSERT( KMP_UBER_GTID( gtid ) );
7633  KMP_ASSERT( KMP_INITIAL_GTID( gtid ) );
7634 
7635  KMP_MB(); /* Flush all pending memory write invalidates. */
7636 
7637  __kmp_common_initialize();
7638 
7639  #if KMP_OS_UNIX
7640  /* invoke the child fork handler */
7641  __kmp_register_atfork();
7642  #endif
7643 
7644  #if ! defined GUIDEDLL_EXPORTS
7645  {
7646  /* Invoke the exit handler when the program finishes, only for static library.
7647  For dynamic library, we already have _fini and DllMain.
7648  */
7649  int rc = atexit( __kmp_internal_end_atexit );
7650  if ( rc != 0 ) {
7651  __kmp_msg( kmp_ms_fatal, KMP_MSG( FunctionError, "atexit()" ), KMP_ERR( rc ), __kmp_msg_null );
7652  }; // if
7653  }
7654  #endif
7655 
7656  #if KMP_HANDLE_SIGNALS
7657  #if KMP_OS_UNIX
7658  /* NOTE: make sure that this is called before the user installs
7659  * their own signal handlers so that the user handlers
7660  * are called first. this way they can return false,
7661  * not call our handler, avoid terminating the library,
7662  * and continue execution where they left off. */
7663  __kmp_install_signals( FALSE );
7664  #endif /* KMP_OS_UNIX */
7665  #if KMP_OS_WINDOWS
7666  __kmp_install_signals( TRUE );
7667  #endif /* KMP_OS_WINDOWS */
7668  #endif
7669 
7670  /* we have finished the serial initialization */
7671  __kmp_init_counter ++;
7672 
7673  __kmp_init_serial = TRUE;
7674 
7675  if (__kmp_settings) {
7676  __kmp_env_print();
7677  }
7678 
7679 #if OMP_40_ENABLED
7680  if (__kmp_display_env || __kmp_display_env_verbose) {
7681  __kmp_env_print_2();
7682  }
7683 #endif // OMP_40_ENABLED
7684 
7685  KMP_MB();
7686 
7687  KA_TRACE( 10, ("__kmp_do_serial_initialize: exit\n" ) );
7688 }
7689 
7690 void
7691 __kmp_serial_initialize( void )
7692 {
7693  if ( __kmp_init_serial ) {
7694  return;
7695  }
7696  __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
7697  if ( __kmp_init_serial ) {
7698  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
7699  return;
7700  }
7701  __kmp_do_serial_initialize();
7702  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
7703 }
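/*
 * The wrapper above shows the runtime's double-checked initialization idiom:
 * an unsynchronized fast-path test of __kmp_init_serial, then the same test
 * repeated under __kmp_initz_lock before the expensive work is done.  A
 * minimal, self-contained sketch of the same idiom follows; it uses a plain
 * pthread mutex instead of the runtime's bootstrap lock (the real code also
 * uses TCR_4 / TCW_SYNC-style accessors for the unsynchronized reads), and
 * every name in it (example_initialized, example_init_lock, ...) is
 * illustrative only.
 */
#if 0   /* illustrative sketch, not part of the runtime */
#include <pthread.h>

static int example_initialized = 0;                     /* plays the role of __kmp_init_serial */
static pthread_mutex_t example_init_lock = PTHREAD_MUTEX_INITIALIZER;

static void example_do_initialize( void )
{
    /* the expensive one-time setup would go here */
}

void example_ensure_initialized( void )
{
    if ( example_initialized ) {                        /* fast path: no lock taken */
        return;
    }
    pthread_mutex_lock( &example_init_lock );
    if ( ! example_initialized ) {                      /* re-check under the lock */
        example_do_initialize();
        example_initialized = 1;
    }
    pthread_mutex_unlock( &example_init_lock );
}
#endif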
7704 
7705 static void
7706 __kmp_do_middle_initialize( void )
7707 {
7708  int i, j;
7709  int prev_dflt_team_nth;
7710 
7711  if( !__kmp_init_serial ) {
7712  __kmp_do_serial_initialize();
7713  }
7714 
7715  KA_TRACE( 10, ("__kmp_do_middle_initialize: enter\n" ) );
7716 
7717  //
7718  // Save the previous value for the __kmp_dflt_team_nth so that
7719  // we can avoid some reinitialization if it hasn't changed.
7720  //
7721  prev_dflt_team_nth = __kmp_dflt_team_nth;
7722 
7723 #if KMP_OS_WINDOWS || KMP_OS_LINUX
7724  //
7725  // __kmp_affinity_initialize() will try to set __kmp_ncores to the
7726  // number of cores on the machine.
7727  //
7728  __kmp_affinity_initialize();
7729 
7730  //
7731  // Run through the __kmp_threads array and set the affinity mask
7732  // for each root thread that is currently registered with the RTL.
7733  //
7734  for ( i = 0; i < __kmp_threads_capacity; i++ ) {
7735  if ( TCR_PTR( __kmp_threads[ i ] ) != NULL ) {
7736  __kmp_affinity_set_init_mask( i, TRUE );
7737  }
7738  }
7739 #endif /* KMP_OS_WINDOWS || KMP_OS_LINUX */
7740 
7741  KMP_ASSERT( __kmp_xproc > 0 );
7742  if ( __kmp_avail_proc == 0 ) {
7743  __kmp_avail_proc = __kmp_xproc;
7744  }
7745 
7746  // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3), correct them now
7747  j = 0;
7748  while ( __kmp_nested_nth.used && ! __kmp_nested_nth.nth[ j ] ) {
7749  __kmp_nested_nth.nth[ j ] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub = __kmp_avail_proc;
7750  j++;
7751  }
7752 
7753  if ( __kmp_dflt_team_nth == 0 ) {
7754 #ifdef KMP_DFLT_NTH_CORES
7755  //
7756  // Default #threads = #cores
7757  //
7758  __kmp_dflt_team_nth = __kmp_ncores;
7759  KA_TRACE( 20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = __kmp_ncores (%d)\n",
7760  __kmp_dflt_team_nth ) );
7761 #else
7762  //
7763  // Default #threads = #available OS procs
7764  //
7765  __kmp_dflt_team_nth = __kmp_avail_proc;
7766  KA_TRACE( 20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = __kmp_avail_proc(%d)\n",
7767  __kmp_dflt_team_nth ) );
7768 #endif /* KMP_DFLT_NTH_CORES */
7769  }
7770 
7771  if ( __kmp_dflt_team_nth < KMP_MIN_NTH ) {
7772  __kmp_dflt_team_nth = KMP_MIN_NTH;
7773  }
7774  if( __kmp_dflt_team_nth > __kmp_sys_max_nth ) {
7775  __kmp_dflt_team_nth = __kmp_sys_max_nth;
7776  }
7777 
7778  //
7779  // There's no harm in continuing if the following check fails,
7780  // but it indicates an error in the previous logic.
7781  //
7782  KMP_DEBUG_ASSERT( __kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub );
7783 
7784  if ( __kmp_dflt_team_nth != prev_dflt_team_nth ) {
7785  //
7786  // Run through the __kmp_threads array and set the num threads icv
7787  // for each root thread that is currently registered with the RTL
7788  // (which has not already explicitly set its nthreads-var with a
7789  // call to omp_set_num_threads()).
7790  //
7791  for ( i = 0; i < __kmp_threads_capacity; i++ ) {
7792  kmp_info_t *thread = __kmp_threads[ i ];
7793  if ( thread == NULL ) continue;
7794 #if OMP_30_ENABLED
7795  if ( thread->th.th_current_task->td_icvs.nproc != 0 ) continue;
7796 #else
7797  if ( thread->th.th_team->t.t_set_nproc[ thread->th.th_info.ds.ds_tid ] != 0 ) continue;
7798 #endif /* OMP_30_ENABLED */
7799 
7800  set__nproc_p( __kmp_threads[ i ], __kmp_dflt_team_nth );
7801  }
7802  }
7803  KA_TRACE( 20, ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
7804  __kmp_dflt_team_nth) );
7805 
7806 #ifdef KMP_ADJUST_BLOCKTIME
7807  /* Adjust blocktime to zero if necessary */
7808  /* now that __kmp_avail_proc is set */
7809  if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
7810  KMP_DEBUG_ASSERT( __kmp_avail_proc > 0 );
7811  if ( __kmp_nth > __kmp_avail_proc ) {
7812  __kmp_zero_bt = TRUE;
7813  }
7814  }
7815 #endif /* KMP_ADJUST_BLOCKTIME */
7816 
7817  /* we have finished middle initialization */
7818  TCW_SYNC_4(__kmp_init_middle, TRUE);
7819 
7820  KA_TRACE( 10, ("__kmp_do_middle_initialize: exit\n" ) );
7821 }
7822 
7823 void
7824 __kmp_middle_initialize( void )
7825 {
7826  if ( __kmp_init_middle ) {
7827  return;
7828  }
7829  __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
7830  if ( __kmp_init_middle ) {
7831  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
7832  return;
7833  }
7834  __kmp_do_middle_initialize();
7835  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
7836 }
7837 
7838 void
7839 __kmp_parallel_initialize( void )
7840 {
7841  int gtid = __kmp_entry_gtid(); // this might be a new root
7842 
7843  /* synchronize parallel initialization (for sibling threads) */
7844  if( TCR_4(__kmp_init_parallel) ) return;
7845  __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
7846  if( TCR_4(__kmp_init_parallel) ) { __kmp_release_bootstrap_lock( &__kmp_initz_lock ); return; }
7847 
7848  /* TODO reinitialization after we have already shut down */
7849  if( TCR_4(__kmp_global.g.g_done) ) {
7850  KA_TRACE( 10, ("__kmp_parallel_initialize: attempt to init while shutting down\n" ) );
7851  __kmp_infinite_loop();
7852  }
7853 
7854  /* jc: The lock __kmp_initz_lock is already held, so calling __kmp_serial_initialize
7855  or __kmp_middle_initialize would cause a deadlock. So we call the __kmp_do_* routines directly.
7856  */
7857  if( !__kmp_init_middle ) {
7858  __kmp_do_middle_initialize();
7859  }
7860 
7861  /* begin initialization */
7862  KA_TRACE( 10, ("__kmp_parallel_initialize: enter\n" ) );
7863  KMP_ASSERT( KMP_UBER_GTID( gtid ) );
7864 
7865 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
7866  //
7867  // Save the FP control regs.
7868  // Worker threads will set theirs to these values at thread startup.
7869  //
7870  __kmp_store_x87_fpu_control_word( &__kmp_init_x87_fpu_control_word );
7871  __kmp_store_mxcsr( &__kmp_init_mxcsr );
7872  __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
7873 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
7874 
7875 #if KMP_OS_UNIX
7876 # if KMP_HANDLE_SIGNALS
7877  /* must be after __kmp_serial_initialize */
7878  __kmp_install_signals( TRUE );
7879 # endif
7880 #endif
7881 
7882  __kmp_suspend_initialize();
7883 
7884 # if defined(USE_LOAD_BALANCE)
7885  if ( __kmp_global.g.g_dynamic_mode == dynamic_default ) {
7886  __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
7887  }
7888 #else
7889  if ( __kmp_global.g.g_dynamic_mode == dynamic_default ) {
7890  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7891  }
7892 #endif
7893 
7894  if ( __kmp_version ) {
7895  __kmp_print_version_2();
7896  }
7897 
7898 #if USE_ITT_BUILD
7899  // Create CSV file to report frames
7900  if( __kmp_forkjoin_frames_mode == 1 )
7901  {
7902  // Open CSV file to write itt frame information
7903  const char * csv_file;
7904 /* Internal AXE variables
7905  char * host_name = __kmp_env_get("INTEL_MRTE_HOST_NAME");
7906  char * out_dir = __kmp_env_get("INTEL_MRTE_DATA_DIR");*/
7907  char * host_name = __kmp_env_get("AMPLXE_HOSTNAME");
7908  char * out_dir = __kmp_env_get("AMPLXE_DATA_DIR");
7909 
7910  if( out_dir && host_name ) {
7911  csv_file = __kmp_str_format( "%s/omp-frames-hostname-%s.csv", out_dir, host_name );
7912  __kmp_itt_csv_file = fopen( csv_file, "w" );
7913  __kmp_str_free( &csv_file );
7914  } else {
7915 #ifdef KMP_DEBUG
7916  // Create CSV file in the current dir
7917  csv_file = __kmp_str_format( "./omp-frames-hostname-xxx.csv" );
7918  __kmp_itt_csv_file = fopen( csv_file, "w" );
7919  __kmp_str_free( &csv_file );
7920 #endif
7921  }
7922  if( __kmp_itt_csv_file ) {
7923  __kmp_str_buf_init( & __kmp_itt_frame_buffer );
7924  __kmp_str_buf_print( & __kmp_itt_frame_buffer, "name,start_tsc.TSC,end_tsc,pid,tid\n" );
7925  }
7926  }
7927 
7928 #endif /* USE_ITT_BUILD */
7929 
7930  /* we have finished parallel initialization */
7931  TCW_SYNC_4(__kmp_init_parallel, TRUE);
7932 
7933  KMP_MB();
7934  KA_TRACE( 10, ("__kmp_parallel_initialize: exit\n" ) );
7935 
7936  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
7937 }
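/*
 * Parallel initialization is lazy: it runs the first time a thread reaches a
 * fork, and -- as shown above -- it pulls in the middle (and, through it, the
 * serial) stage if those have not run yet.  A small user-level program that
 * exercises this path (illustrative only):
 */
#if 0   /* illustrative sketch, not part of the runtime */
#include <omp.h>
#include <stdio.h>

int main( void )
{
    /* The first parallel region reached by the program drives
       __kmp_parallel_initialize before any worker runs the region body. */
    #pragma omp parallel
    {
        printf( "hello from thread %d of %d\n",
                omp_get_thread_num(), omp_get_num_threads() );
    }
    return 0;
}
#endif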
7938 
7939 
7940 /* ------------------------------------------------------------------------ */
7941 
7942 void
7943 __kmp_run_before_invoked_task( int gtid, int tid, kmp_info_t *this_thr,
7944  kmp_team_t *team )
7945 {
7946  kmp_disp_t *dispatch;
7947 
7948  KMP_MB();
7949 
7950  /* none of the threads have encountered any constructs, yet. */
7951  this_thr->th.th_local.this_construct = 0;
7952  this_thr->th.th_local.last_construct = 0;
7953 #if KMP_CACHE_MANAGE
7954  KMP_CACHE_PREFETCH( &this_thr -> th.th_bar[ bs_forkjoin_barrier ].bb.b_arrived );
7955 #endif /* KMP_CACHE_MANAGE */
7956  dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
7957  KMP_DEBUG_ASSERT( dispatch );
7958  KMP_DEBUG_ASSERT( team -> t.t_dispatch );
7959  //KMP_DEBUG_ASSERT( this_thr -> th.th_dispatch == &team -> t.t_dispatch[ this_thr->th.th_info.ds.ds_tid ] );
7960 
7961  dispatch -> th_disp_index = 0; /* reset the dispatch buffer counter */
7962 
7963  if( __kmp_env_consistency_check )
7964  __kmp_push_parallel( gtid, team->t.t_ident );
7965 
7966  KMP_MB(); /* Flush all pending memory write invalidates. */
7967 }
7968 
7969 void
7970 __kmp_run_after_invoked_task( int gtid, int tid, kmp_info_t *this_thr,
7971  kmp_team_t *team )
7972 {
7973  if( __kmp_env_consistency_check )
7974  __kmp_pop_parallel( gtid, team->t.t_ident );
7975 }
7976 
7977 int
7978 __kmp_invoke_task_func( int gtid )
7979 {
7980  int rc;
7981  int tid = __kmp_tid_from_gtid( gtid );
7982  kmp_info_t *this_thr = __kmp_threads[ gtid ];
7983  kmp_team_t *team = this_thr -> th.th_team;
7984 
7985  __kmp_run_before_invoked_task( gtid, tid, this_thr, team );
7986 #if USE_ITT_BUILD
7987  if ( __itt_stack_caller_create_ptr ) {
7988  __kmp_itt_stack_callee_enter( (__itt_caller)team->t.t_stack_id ); // inform ittnotify about entering user's code
7989  }
7990 #endif /* USE_ITT_BUILD */
7991  rc = __kmp_invoke_microtask( (microtask_t) TCR_SYNC_PTR(team->t.t_pkfn),
7992  gtid, tid, (int) team->t.t_argc, (void **) team->t.t_argv );
7993 
7994 #if USE_ITT_BUILD
7995  if ( __itt_stack_caller_create_ptr ) {
7996  __kmp_itt_stack_callee_leave( (__itt_caller)team->t.t_stack_id ); // inform ittnotify about leaving user's code
7997  }
7998 #endif /* USE_ITT_BUILD */
7999  __kmp_run_after_invoked_task( gtid, tid, this_thr, team );
8000 
8001  return rc;
8002 }
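/*
 * __kmp_invoke_microtask above calls the compiler-outlined parallel-region
 * body (team->t.t_pkfn), passing the global and local thread ids by address
 * followed by the entries of team->t.t_argv.  The exact outlined signature is
 * compiler-generated; the sketch below is an assumption for illustration only
 * (example_microtask and shared_counter are invented names).
 */
#if 0   /* illustrative sketch of a compiler-outlined microtask */
/* Roughly the shape of the body emitted for a parallel region with one
   shared variable; the runtime supplies &gtid, &tid and then the argv
   entries as the trailing arguments. */
static void example_microtask( int *gtid, int *tid, int *shared_counter )
{
    (void)gtid; (void)tid;
    /* parallel-region body, executed once per team member */
    __sync_fetch_and_add( shared_counter, 1 );
}
#endif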
8003 
8004 #if OMP_40_ENABLED
8005 void
8006 __kmp_teams_master( microtask_t microtask, int gtid )
8007 {
8008  // This routine is called by all master threads in the teams construct
8009  kmp_info_t *this_thr = __kmp_threads[ gtid ];
8010  kmp_team_t *team = this_thr -> th.th_team;
8011  ident_t *loc = team->t.t_ident;
8012 
8013 #if KMP_DEBUG
8014  int tid = __kmp_tid_from_gtid( gtid );
8015  KA_TRACE( 20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n",
8016  gtid, tid, microtask) );
8017 #endif
8018 
8019  // Launch the league of teams now, but do not let the workers execute
8020  // (they wait on the fork barrier until the next parallel region)
8021  this_thr->th.th_set_nproc = this_thr->th.th_set_nth_teams;
8022  __kmp_fork_call( loc, gtid, TRUE,
8023  team->t.t_argc,
8024  microtask,
8025  VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
8026  NULL );
8027  __kmp_join_call( loc, gtid, 1 ); // AC: last parameter "1" eliminates the join barrier, which would not work here because
8028  // the worker threads are still in the fork barrier waiting for more parallel regions
8029 }
8030 
8031 int
8032 __kmp_invoke_teams_master( int gtid )
8033 {
8034  #if KMP_DEBUG
8035  if ( !__kmp_threads[gtid]-> th.th_team->t.t_serialized )
8036  KMP_DEBUG_ASSERT( (void*)__kmp_threads[gtid]-> th.th_team->t.t_pkfn == (void*)__kmp_teams_master );
8037  #endif
8038 
8039  __kmp_teams_master( (microtask_t)__kmp_threads[gtid]->th.th_team_microtask, gtid );
8040 
8041  return 1;
8042 }
8043 #endif /* OMP_40_ENABLED */
8044 
8045 /* this sets the requested number of threads for the next parallel region
8046  * encountered by this team */
8047 /* since this should be enclosed in the forkjoin critical section, it
8048  * should avoid race conditions with asymmetrical nested parallelism */
8049 
8050 void
8051 __kmp_push_num_threads( ident_t *id, int gtid, int num_threads )
8052 {
8053  kmp_info_t *thr = __kmp_threads[gtid];
8054 
8055  if( num_threads > 0 )
8056  thr -> th.th_set_nproc = num_threads;
8057 }
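/*
 * At the user level this routine backs the num_threads() clause: the compiler
 * records the request on the encountering thread just before the fork, and it
 * applies only to the next parallel region.  Illustrative OpenMP source:
 */
#if 0   /* illustrative sketch, not part of the runtime */
#include <omp.h>
#include <stdio.h>

void num_threads_example( void )
{
    /* The value 4 travels through the push-num-threads path above and
       affects only the parallel region that immediately follows. */
    #pragma omp parallel num_threads( 4 )
    {
        printf( "T#%d of %d\n", omp_get_thread_num(), omp_get_num_threads() );
    }
}
#endif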
8058 
8059 #if OMP_40_ENABLED
8060 
8061 /* this sets the requested number of teams for the teams region and/or
8062  * the number of threads for the next parallel region encountered */
8063 void
8064 __kmp_push_num_teams( ident_t *id, int gtid, int num_teams, int num_threads )
8065 {
8066  kmp_info_t *thr = __kmp_threads[gtid];
8067  // The number of teams is the number of threads in the outer "parallel"
8068  if( num_teams > 0 ) {
8069  thr -> th.th_set_nproc = num_teams;
8070  } else {
8071  thr -> th.th_set_nproc = 1; // AC: default number of teams is 1;
8072  // TODO: should it be __kmp_ncores ?
8073  }
8074  // The number of threads is for inner parallel regions
8075  if( num_threads > 0 ) {
8076  thr -> th.th_set_nth_teams = num_threads;
8077  } else {
8078  if( !TCR_4(__kmp_init_middle) )
8079  __kmp_middle_initialize();
8080  thr -> th.th_set_nth_teams = __kmp_avail_proc / thr -> th.th_set_nproc;
8081  }
8082 }
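/*
 * __kmp_push_num_teams plays the same role for the OpenMP 4.0 teams
 * construct: num_teams() becomes th_set_nproc for the outer fork, and
 * thread_limit() becomes th_set_nth_teams for the parallel regions run inside
 * each team.  A sketch of the corresponding source (OpenMP 4.0 places teams
 * inside a target region; the clause values are arbitrary):
 */
#if 0   /* illustrative sketch, not part of the runtime */
void teams_example( void )
{
    #pragma omp target teams num_teams( 2 ) thread_limit( 8 )
    {
        /* each team master may later fork parallel regions of up to 8 threads */
    }
}
#endif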
8083 
8084 
8085 //
8086 // Set the proc_bind var to use in the following parallel region.
8087 //
8088 void
8089 __kmp_push_proc_bind( ident_t *id, int gtid, kmp_proc_bind_t proc_bind )
8090 {
8091  kmp_info_t *thr = __kmp_threads[gtid];
8092  thr -> th.th_set_proc_bind = proc_bind;
8093 }
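/*
 * Similarly, __kmp_push_proc_bind records the proc_bind() clause of the
 * upcoming parallel region on the encountering thread.  Illustrative source:
 */
#if 0   /* illustrative sketch, not part of the runtime */
void proc_bind_example( void )
{
    /* Request that the threads of this region be placed close to the master. */
    #pragma omp parallel proc_bind( close ) num_threads( 4 )
    {
        /* ... */
    }
}
#endif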
8094 
8095 #endif /* OMP_40_ENABLED */
8096 
8097 /* Launch the worker threads into the microtask. */
8098 
8099 void
8100 __kmp_internal_fork( ident_t *id, int gtid, kmp_team_t *team )
8101 {
8102  kmp_info_t *this_thr = __kmp_threads[gtid];
8103 
8104 #ifdef KMP_DEBUG
8105  int f;
8106 #endif /* KMP_DEBUG */
8107 
8108  KMP_DEBUG_ASSERT( team );
8109  KMP_DEBUG_ASSERT( this_thr -> th.th_team == team );
8110  KMP_ASSERT( KMP_MASTER_GTID(gtid) );
8111  KMP_MB(); /* Flush all pending memory write invalidates. */
8112 
8113  team -> t.t_construct = 0; /* no single directives seen yet */
8114  team -> t.t_ordered.dt.t_value = 0; /* thread 0 enters the ordered section first */
8115 
8116  /* Reset the identifiers on the dispatch buffer */
8117  KMP_DEBUG_ASSERT( team -> t.t_disp_buffer );
8118  if ( team->t.t_max_nproc > 1 ) {
8119  int i;
8120  for (i = 0; i < KMP_MAX_DISP_BUF; ++i)
8121  team -> t.t_disp_buffer[ i ].buffer_index = i;
8122  } else {
8123  team -> t.t_disp_buffer[ 0 ].buffer_index = 0;
8124  }
8125 
8126  KMP_MB(); /* Flush all pending memory write invalidates. */
8127  KMP_ASSERT( this_thr -> th.th_team == team );
8128 
8129 #ifdef KMP_DEBUG
8130  for( f=0 ; f<team->t.t_nproc ; f++ ) {
8131  KMP_DEBUG_ASSERT( team->t.t_threads[f] &&
8132  team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc );
8133  }
8134 #endif /* KMP_DEBUG */
8135 
8136  /* release the worker threads so they may begin working */
8137  __kmp_fork_barrier( gtid, 0 );
8138 }
8139 
8140 
8141 void
8142 __kmp_internal_join( ident_t *id, int gtid, kmp_team_t *team )
8143 {
8144  kmp_info_t *this_thr = __kmp_threads[gtid];
8145 
8146  KMP_DEBUG_ASSERT( team );
8147  KMP_DEBUG_ASSERT( this_thr -> th.th_team == team );
8148  KMP_ASSERT( KMP_MASTER_GTID(gtid) );
8149  KMP_MB(); /* Flush all pending memory write invalidates. */
8150 
8151  /* Join barrier after fork */
8152 
8153 #ifdef KMP_DEBUG
8154  if (__kmp_threads[gtid] && __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc ) {
8155  __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n",gtid, gtid, __kmp_threads[gtid]);
8156  __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, team->t.t_nproc=%d\n",
8157  gtid, __kmp_threads[gtid]->th.th_team_nproc, team, team->t.t_nproc);
8158  __kmp_print_structure();
8159  }
8160  KMP_DEBUG_ASSERT( __kmp_threads[gtid] &&
8161  __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc );
8162 #endif /* KMP_DEBUG */
8163 
8164  __kmp_join_barrier( gtid ); /* wait for everyone */
8165 
8166  KMP_MB(); /* Flush all pending memory write invalidates. */
8167  KMP_ASSERT( this_thr -> th.th_team == team );
8168 }
8169 
8170 
8171 /* ------------------------------------------------------------------------ */
8172 /* ------------------------------------------------------------------------ */
8173 
8174 #ifdef USE_LOAD_BALANCE
8175 
8176 //
8177 // Return the number of worker threads actively spinning in the hot team, if we
8178 // are at the outermost level of parallelism. Otherwise, return 0.
8179 //
8180 static int
8181 __kmp_active_hot_team_nproc( kmp_root_t *root )
8182 {
8183  int i;
8184  int retval;
8185  kmp_team_t *hot_team;
8186 
8187  if ( root->r.r_active ) {
8188  return 0;
8189  }
8190  hot_team = root->r.r_hot_team;
8191  if ( __kmp_dflt_blocktime == KMP_MAX_BLOCKTIME ) {
8192  return hot_team->t.t_nproc - 1; // Don't count master thread
8193  }
8194 
8195  //
8196  // Skip the master thread - it is accounted for elsewhere.
8197  //
8198  retval = 0;
8199  for ( i = 1; i < hot_team->t.t_nproc; i++ ) {
8200  if ( hot_team->t.t_threads[i]->th.th_active ) {
8201  retval++;
8202  }
8203  }
8204  return retval;
8205 }
8206 
8207 //
8208 // Perform an automatic adjustment to the number of
8209 // threads used by the next parallel region.
8210 //
8211 static int
8212 __kmp_load_balance_nproc( kmp_root_t *root, int set_nproc )
8213 {
8214  int retval;
8215  int pool_active;
8216  int hot_team_active;
8217  int team_curr_active;
8218  int system_active;
8219 
8220  KB_TRACE( 20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n",
8221  root, set_nproc ) );
8222  KMP_DEBUG_ASSERT( root );
8223  #if OMP_30_ENABLED
8224  KMP_DEBUG_ASSERT( root->r.r_root_team->t.t_threads[0]->th.th_current_task->td_icvs.dynamic == TRUE );
8225  #else
8226  KMP_DEBUG_ASSERT( root->r.r_root_team->t.t_set_dynamic[0] == TRUE );
8227  #endif
8228  KMP_DEBUG_ASSERT( set_nproc > 1 );
8229 
8230  if ( set_nproc == 1) {
8231  KB_TRACE( 20, ("__kmp_load_balance_nproc: serial execution.\n" ) );
8232  return 1;
8233  }
8234 
8235  //
8236  // Threads that are active in the thread pool, active in the hot team
8237  // for this particular root (if we are at the outer par level), and
8238  // the currently executing thread (to become the master) are available
8239  // to add to the new team, but are currently contributing to the system
8240  // load, and must be accounted for.
8241  //
8242  pool_active = TCR_4(__kmp_thread_pool_active_nth);
8243  hot_team_active = __kmp_active_hot_team_nproc( root );
8244  team_curr_active = pool_active + hot_team_active + 1;
8245 
8246  //
8247  // Check the system load.
8248  //
8249  system_active = __kmp_get_load_balance( __kmp_avail_proc + team_curr_active );
8250  KB_TRACE( 30, ("__kmp_load_balance_nproc: system active = %d pool active = %d hot team active = %d\n",
8251  system_active, pool_active, hot_team_active ) );
8252 
8253  if ( system_active < 0 ) {
8254  //
8255  // There was an error reading the necessary info from /proc,
8256  // so use the thread limit algorithm instead. Once we set
8257  // __kmp_global.g.g_dynamic_mode = dynamic_thread_limit,
8258  // we shouldn't wind up getting back here.
8259  //
8260  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
8261  KMP_WARNING( CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit" );
8262 
8263  //
8264  // Make this call behave like the thread limit algorithm.
8265  //
8266  retval = __kmp_avail_proc - __kmp_nth + (root->r.r_active ? 1
8267  : root->r.r_hot_team->t.t_nproc);
8268  if ( retval > set_nproc ) {
8269  retval = set_nproc;
8270  }
8271  if ( retval < KMP_MIN_NTH ) {
8272  retval = KMP_MIN_NTH;
8273  }
8274 
8275  KB_TRACE( 20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n", retval ) );
8276  return retval;
8277  }
8278 
8279  //
8280  // There is a slight delay in the load balance algorithm in detecting
8281  // new running procs. The real system load at this instant should be
8282  // at least as large as the number of active OMP threads that are available to
8283  // add to the team.
8284  //
8285  if ( system_active < team_curr_active ) {
8286  system_active = team_curr_active;
8287  }
8288  retval = __kmp_avail_proc - system_active + team_curr_active;
8289  if ( retval > set_nproc ) {
8290  retval = set_nproc;
8291  }
8292  if ( retval < KMP_MIN_NTH ) {
8293  retval = KMP_MIN_NTH;
8294  }
8295 
8296  KB_TRACE( 20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval ) );
8297  return retval;
8298 } // __kmp_load_balance_nproc()
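/*
 * Worked example of the formula above (all numbers are illustrative):
 * __kmp_avail_proc = 16 and __kmp_get_load_balance() reports
 * system_active = 14; pool_active = 2 and hot_team_active = 5 give
 * team_curr_active = 2 + 5 + 1 = 8.  Then
 * retval = 16 - 14 + 8 = 10, clamped to the range [KMP_MIN_NTH, set_nproc],
 * so a request for 16 threads is trimmed to 10 while the machine is loaded.
 */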
8299 
8300 #endif /* USE_LOAD_BALANCE */
8301 
8302 
8303 /* ------------------------------------------------------------------------ */
8304 /* ------------------------------------------------------------------------ */
8305 
8306 /* NOTE: this is called with the __kmp_init_lock held */
8307 void
8308 __kmp_cleanup( void )
8309 {
8310  int f;
8311 
8312  KA_TRACE( 10, ("__kmp_cleanup: enter\n" ) );
8313 
8314  if (TCR_4(__kmp_init_parallel)) {
8315 #if KMP_HANDLE_SIGNALS
8316  __kmp_remove_signals();
8317 #endif
8318  TCW_4(__kmp_init_parallel, FALSE);
8319  }
8320 
8321  if (TCR_4(__kmp_init_middle)) {
8322 #if KMP_OS_WINDOWS || KMP_OS_LINUX
8323  __kmp_affinity_uninitialize();
8324 #endif /* KMP_OS_WINDOWS || KMP_OS_LINUX */
8325  TCW_4(__kmp_init_middle, FALSE);
8326  }
8327 
8328  KA_TRACE( 10, ("__kmp_cleanup: go serial cleanup\n" ) );
8329 
8330  if (__kmp_init_serial) {
8331 
8332  __kmp_runtime_destroy();
8333 
8334  __kmp_init_serial = FALSE;
8335  }
8336 
8337  for ( f = 0; f < __kmp_threads_capacity; f++ ) {
8338  if ( __kmp_root[ f ] != NULL ) {
8339  __kmp_free( __kmp_root[ f ] );
8340  __kmp_root[ f ] = NULL;
8341  }
8342  }
8343  __kmp_free( __kmp_threads );
8344  // __kmp_threads and __kmp_root were allocated at once, as a single block, so there is no need
8345  // to free __kmp_root separately.
8346  __kmp_threads = NULL;
8347  __kmp_root = NULL;
8348  __kmp_threads_capacity = 0;
8349 
8350  __kmp_cleanup_user_locks();
8351 
8352  #if KMP_OS_LINUX || KMP_OS_WINDOWS
8353  KMP_INTERNAL_FREE( (void *) __kmp_cpuinfo_file );
8354  __kmp_cpuinfo_file = NULL;
8355  #endif /* KMP_OS_LINUX || KMP_OS_WINDOWS */
8356 
8357  #if KMP_USE_ADAPTIVE_LOCKS
8358  #if KMP_DEBUG_ADAPTIVE_LOCKS
8359  __kmp_print_speculative_stats();
8360  #endif
8361  #endif
8362  KMP_INTERNAL_FREE( __kmp_nested_nth.nth );
8363  __kmp_nested_nth.nth = NULL;
8364  __kmp_nested_nth.size = 0;
8365  __kmp_nested_nth.used = 0;
8366 
8367  __kmp_i18n_catclose();
8368 
8369 #if USE_ITT_BUILD
8370  // Close CSV file for frames
8371  if( __kmp_forkjoin_frames_mode && __kmp_itt_csv_file ) {
8372  fprintf( __kmp_itt_csv_file, "%s", __kmp_itt_frame_buffer.str ); // print via "%s" so a '%' in the buffered data is not treated as a format specifier
8373 
8374  __kmp_str_buf_free( & __kmp_itt_frame_buffer );
8375  fclose( __kmp_itt_csv_file );
8376  }
8377 #endif /* USE_ITT_BUILD */
8378 
8379  KA_TRACE( 10, ("__kmp_cleanup: exit\n" ) );
8380 }
8381 
8382 /* ------------------------------------------------------------------------ */
8383 /* ------------------------------------------------------------------------ */
8384 
8385 int
8386 __kmp_ignore_mppbeg( void )
8387 {
8388  char *env;
8389 
8390  if ((env = getenv( "KMP_IGNORE_MPPBEG" )) != NULL) {
8391  if (__kmp_str_match_false( env ))
8392  return FALSE;
8393  }
8394  // By default __kmpc_begin() is no-op.
8395  return TRUE;
8396 }
8397 
8398 int
8399 __kmp_ignore_mppend( void )
8400 {
8401  char *env;
8402 
8403  if ((env = getenv( "KMP_IGNORE_MPPEND" )) != NULL) {
8404  if (__kmp_str_match_false( env ))
8405  return FALSE;
8406  }
8407  // By default __kmpc_end() is no-op.
8408  return TRUE;
8409 }
8410 
8411 void
8412 __kmp_internal_begin( void )
8413 {
8414  int gtid;
8415  kmp_root_t *root;
8416 
8417  /* this is a very important step as it will register new sibling threads
8418  * and assign these new uber threads a new gtid */
8419  gtid = __kmp_entry_gtid();
8420  root = __kmp_threads[ gtid ] -> th.th_root;
8421  KMP_ASSERT( KMP_UBER_GTID( gtid ));
8422 
8423  if( root->r.r_begin ) return;
8424  __kmp_acquire_lock( &root->r.r_begin_lock, gtid );
8425  if( root->r.r_begin ) {
8426  __kmp_release_lock( & root->r.r_begin_lock, gtid );
8427  return;
8428  }
8429 
8430  root -> r.r_begin = TRUE;
8431 
8432  __kmp_release_lock( & root->r.r_begin_lock, gtid );
8433 }
8434 
8435 
8436 /* ------------------------------------------------------------------------ */
8437 /* ------------------------------------------------------------------------ */
8438 
8439 void
8440 __kmp_user_set_library (enum library_type arg)
8441 {
8442  int gtid;
8443  kmp_root_t *root;
8444  kmp_info_t *thread;
8445 
8446  /* first, make sure we are initialized so we can get our gtid */
8447 
8448  gtid = __kmp_entry_gtid();
8449  thread = __kmp_threads[ gtid ];
8450 
8451  root = thread -> th.th_root;
8452 
8453  KA_TRACE( 20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg, library_serial ));
8454  if (root->r.r_in_parallel) { /* Must be called in serial section of top-level thread */
8455  KMP_WARNING( SetLibraryIncorrectCall );
8456  return;
8457  }
8458 
8459  switch ( arg ) {
8460  case library_serial :
8461  thread -> th.th_set_nproc = 0;
8462  set__nproc_p( thread, 1 );
8463  break;
8464  case library_turnaround :
8465  thread -> th.th_set_nproc = 0;
8466  set__nproc_p( thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth : __kmp_dflt_team_nth_ub );
8467  break;
8468  case library_throughput :
8469  thread -> th.th_set_nproc = 0;
8470  set__nproc_p( thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth : __kmp_dflt_team_nth_ub );
8471  break;
8472  default:
8473  KMP_FATAL( UnknownLibraryType, arg );
8474  }
8475 
8476  __kmp_aux_set_library ( arg );
8477 }
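/*
 * __kmp_user_set_library sits behind the kmp_set_library*() entry points and
 * the KMP_LIBRARY environment variable.  A sketch of the user-level call,
 * assuming the Intel extension kmp_set_library_turnaround() declared in the
 * Intel omp.h (throughput is the default mode):
 */
#if 0   /* illustrative sketch, not part of the runtime */
#include <omp.h>

void library_mode_example( void )
{
    /* Turnaround mode is intended for machines dedicated to this application;
       same effect as KMP_LIBRARY=turnaround in the environment. */
    kmp_set_library_turnaround();
}
#endif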
8478 
8479 void
8480 __kmp_aux_set_stacksize( size_t arg )
8481 {
8482  if (! __kmp_init_serial)
8483  __kmp_serial_initialize();
8484 
8485 #if KMP_OS_DARWIN
8486  if (arg & (0x1000 - 1)) {
8487  arg &= ~(0x1000 - 1);
8488  if(arg + 0x1000) /* check for overflow if we round up */
8489  arg += 0x1000;
8490  }
8491 #endif
8492  __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
8493 
8494  /* only change the default stacksize before the first parallel region */
8495  if (! TCR_4(__kmp_init_parallel)) {
8496  size_t value = arg; /* argument is in bytes */
8497 
8498  if (value < __kmp_sys_min_stksize )
8499  value = __kmp_sys_min_stksize ;
8500  else if (value > KMP_MAX_STKSIZE)
8501  value = KMP_MAX_STKSIZE;
8502 
8503  __kmp_stksize = value;
8504 
8505  __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
8506  }
8507 
8508  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
8509 }
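/*
 * Example of the KMP_OS_DARWIN rounding above (numbers are illustrative): a
 * request of 0x12345 bytes has low bits set, so it is masked down to 0x12000
 * and then bumped by 0x1000 to 0x13000 -- i.e. the stack size is rounded up
 * to the next 4 KiB boundary before the range check against
 * __kmp_sys_min_stksize and KMP_MAX_STKSIZE.
 */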
8510 
8511 /* set the behaviour of the runtime library */
8512 /* TODO this can cause some odd behaviour with sibling parallelism... */
8513 void
8514 __kmp_aux_set_library (enum library_type arg)
8515 {
8516  __kmp_library = arg;
8517 
8518  switch ( __kmp_library ) {
8519  case library_serial :
8520  {
8521  KMP_INFORM( LibraryIsSerial );
8522  (void) __kmp_change_library( TRUE );
8523  }
8524  break;
8525  case library_turnaround :
8526  (void) __kmp_change_library( TRUE );
8527  break;
8528  case library_throughput :
8529  (void) __kmp_change_library( FALSE );
8530  break;
8531  default:
8532  KMP_FATAL( UnknownLibraryType, arg );
8533  }
8534 }
8535 
8536 /* ------------------------------------------------------------------------ */
8537 /* ------------------------------------------------------------------------ */
8538 
8539 void
8540 __kmp_aux_set_blocktime (int arg, kmp_info_t *thread, int tid)
8541 {
8542  int blocktime = arg; /* argument is in milliseconds */
8543  int bt_intervals;
8544  int bt_set;
8545 
8546  __kmp_save_internal_controls( thread );
8547 
8548  /* Normalize and set blocktime for the teams */
8549  if (blocktime < KMP_MIN_BLOCKTIME)
8550  blocktime = KMP_MIN_BLOCKTIME;
8551  else if (blocktime > KMP_MAX_BLOCKTIME)
8552  blocktime = KMP_MAX_BLOCKTIME;
8553 
8554  set__blocktime_team( thread -> th.th_team, tid, blocktime );
8555  set__blocktime_team( thread -> th.th_serial_team, 0, blocktime );
8556 
8557  /* Calculate and set blocktime intervals for the teams */
8558  bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
8559 
8560  set__bt_intervals_team( thread -> th.th_team, tid, bt_intervals );
8561  set__bt_intervals_team( thread -> th.th_serial_team, 0, bt_intervals );
8562 
8563  /* Record that blocktime was explicitly set */
8564  bt_set = TRUE;
8565 
8566  set__bt_set_team( thread -> th.th_team, tid, bt_set );
8567  set__bt_set_team( thread -> th.th_serial_team, 0, bt_set );
8568  KF_TRACE(10, ( "kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, bt_intervals=%d, monitor_updates=%d\n",
8569  __kmp_gtid_from_tid(tid, thread->th.th_team),
8570  thread->th.th_team->t.t_id, tid, blocktime, bt_intervals, __kmp_monitor_wakeups ) );
8571 }
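/*
 * The usual path into __kmp_aux_set_blocktime is the kmp_set_blocktime()
 * extension (or the KMP_BLOCKTIME environment variable).  A sketch, assuming
 * the extension prototype from the Intel omp.h:
 */
#if 0   /* illustrative sketch, not part of the runtime */
#include <omp.h>

void blocktime_example( void )
{
    /* Have idle workers go to sleep right after a parallel region instead of
       spin-waiting; equivalent to KMP_BLOCKTIME=0.  The value is normalized
       to [KMP_MIN_BLOCKTIME, KMP_MAX_BLOCKTIME] by the routine above. */
    kmp_set_blocktime( 0 );

    #pragma omp parallel
    {
        /* ... */
    }
}
#endif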
8572 
8573 void
8574 __kmp_aux_set_defaults(
8575  char const * str,
8576  int len
8577 ) {
8578  if ( ! __kmp_init_serial ) {
8579  __kmp_serial_initialize();
8580  };
8581  __kmp_env_initialize( str );
8582 
8583  if (__kmp_settings
8584 #if OMP_40_ENABLED
8585  || __kmp_display_env || __kmp_display_env_verbose
8586 #endif // OMP_40_ENABLED
8587  ) {
8588  __kmp_env_print();
8589  }
8590 } // __kmp_aux_set_defaults
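/*
 * __kmp_aux_set_defaults backs the kmp_set_defaults() extension, which lets a
 * program supply a "NAME=value" string with the same syntax as the
 * corresponding environment variable.  A sketch (the setting shown is
 * arbitrary, and the call should be made before the first parallel region):
 */
#if 0   /* illustrative sketch, not part of the runtime */
#include <omp.h>

void defaults_example( void )
{
    /* Same effect as exporting KMP_BLOCKTIME=0 before launching the program. */
    kmp_set_defaults( "KMP_BLOCKTIME=0" );
}
#endif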
8591 
8592 /* ------------------------------------------------------------------------ */
8593 
8594 /*
8595  * internal fast reduction routines
8596  */
8597 
8598 // implementation rev. 0.4
8599 // AT: determine CPU, and always use 'critical method' if non-Intel
8600 // AT: test loc != NULL
8601 // AT: what to return if lck == NULL
8602 // AT: tune the cut-off point for atomic reduce method
8603 // AT: tune what to return depending on the CPU and platform configuration
8604 // AT: tune what to return depending on team size
8605 // AT: move this function out to kmp_csupport.c
8606 PACKED_REDUCTION_METHOD_T
8607 __kmp_determine_reduction_method( ident_t *loc, kmp_int32 global_tid,
8608  kmp_int32 num_vars, size_t reduce_size, void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
8609  kmp_critical_name *lck )
8610 {
8611 
8612  // Default reduction method: critical construct ( lck != NULL, like in current PAROPT )
8613  // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method can be selected by RTL
8614  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method can be selected by RTL
8615  // Finally, it is up to the OpenMP RTL to decide which method to select among those generated by PAROPT.
8616 
8617  PACKED_REDUCTION_METHOD_T retval;
8618 
8619  int team_size;
8620 
8621  KMP_DEBUG_ASSERT( loc ); // it would be nice to test ( loc != 0 )
8622  KMP_DEBUG_ASSERT( lck ); // it would be nice to test ( lck != 0 )
8623 
8624  #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED ( ( loc->flags & ( KMP_IDENT_ATOMIC_REDUCE ) ) == ( KMP_IDENT_ATOMIC_REDUCE ) )
8625  #define FAST_REDUCTION_TREE_METHOD_GENERATED ( ( reduce_data ) && ( reduce_func ) )
8626 
8627  retval = critical_reduce_block;
8628 
8629  team_size = __kmp_get_team_num_threads( global_tid ); // another way of getting the team size ( with 1 dynamic dereference ) is slower
8630 
8631  if( team_size == 1 ) {
8632 
8633  retval = empty_reduce_block;
8634 
8635  } else {
8636 
8637  int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8638  int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8639 
8640  #if KMP_ARCH_X86_64
8641 
8642  #if KMP_OS_LINUX || KMP_OS_WINDOWS || KMP_OS_DARWIN
8643  #if KMP_MIC
8644  #define REDUCTION_TEAMSIZE_CUTOFF 8
8645  #else // KMP_MIC
8646  #define REDUCTION_TEAMSIZE_CUTOFF 4
8647  #endif // KMP_MIC
8648  if( tree_available ) {
8649  if( team_size <= REDUCTION_TEAMSIZE_CUTOFF ) {
8650  if ( atomic_available ) {
8651  retval = atomic_reduce_block;
8652  }
8653  } else {
8654  retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8655  }
8656  } else if ( atomic_available ) {
8657  retval = atomic_reduce_block;
8658  }
8659  #else
8660  #error "Unknown or unsupported OS"
8661  #endif // KMP_OS_LINUX || KMP_OS_WINDOWS || KMP_OS_DARWIN
8662 
8663  #elif KMP_ARCH_X86
8664 
8665  #if KMP_OS_LINUX || KMP_OS_WINDOWS
8666 
8667  // similar to win_32
8668  // 4x1x2 fxqlin04, the 'linear,linear' barrier
8669 
8670  // similar to lin_32
8671  // 4x1x2 fxqwin04, the 'linear,linear' barrier
8672 
8673  // actual measurement shows that the critical section method is better if team_size <= 8;
8674  // what happens when team_size > 8 ? ( no machine to test )
8675 
8676  // TO DO: need to run 32-bit code on Intel(R) 64
8677  // TO DO: test the 'hyper,hyper,1,1' barrier
8678 
8679  // basic tuning
8680 
8681  if( atomic_available ) {
8682  if( num_vars <= 2 ) { // && ( team_size <= 8 ) due to false-sharing ???
8683  retval = atomic_reduce_block;
8684  }
8685  } // otherwise: use critical section
8686 
8687  #elif KMP_OS_DARWIN
8688 
8689 
8690  if( atomic_available && ( num_vars <= 3 ) ) {
8691  retval = atomic_reduce_block;
8692  } else if( tree_available ) {
8693  if( ( reduce_size > ( 9 * sizeof( kmp_real64 ) ) ) && ( reduce_size < ( 2000 * sizeof( kmp_real64 ) ) ) ) {
8694  retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
8695  }
8696  } // otherwise: use critical section
8697 
8698  #else
8699  #error "Unknown or unsupported OS"
8700  #endif
8701 
8702  #else
8703  #error "Unknown or unsupported architecture"
8704  #endif
8705 
8706  }
8707 
8708  //AT: TO DO: critical block method not implemented by PAROPT
8709  //if( retval == __kmp_critical_reduce_block ) {
8710  // if( lck == NULL ) { // critical block method not implemented by PAROPT
8711  // }
8712  //}
8713 
8714  // tune what to return depending on the CPU and platform configuration
8715  // (sometimes tree method is slower than critical)
8716 
8717  // probably tune what to return depending on team size
8718 
8719 
8720  // KMP_FORCE_REDUCTION
8721 
8722  if( __kmp_force_reduction_method != reduction_method_not_defined ) {
8723 
8724  PACKED_REDUCTION_METHOD_T forced_retval;
8725 
8726  int atomic_available, tree_available;
8727 
8728  switch( ( forced_retval = __kmp_force_reduction_method ) )
8729  {
8730  case critical_reduce_block:
8731  KMP_ASSERT( lck ); // lck should be != 0
8732  if( team_size <= 1 ) {
8733  forced_retval = empty_reduce_block;
8734  }
8735  break;
8736 
8737  case atomic_reduce_block:
8738  atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8739  KMP_ASSERT( atomic_available ); // atomic_available should be != 0
8740  break;
8741 
8742  case tree_reduce_block:
8743  tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8744  KMP_ASSERT( tree_available ); // tree_available should be != 0
8745  #if KMP_FAST_REDUCTION_BARRIER
8746  forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8747  #endif
8748  break;
8749 
8750  default:
8751  KMP_ASSERT( 0 ); // "unsupported method specified"
8752  }
8753 
8754  retval = forced_retval;
8755  }
8756 
8757  KA_TRACE(10, ( "reduction method selected=%08x\n", retval ) );
8758 
8759  #undef FAST_REDUCTION_TREE_METHOD_GENERATED
8760  #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
8761 
8762  return ( retval );
8763 }
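/*
 * Example of the selection above on Intel(R) 64 Linux* OS (non-MIC, so the
 * cutoff is 4), assuming the compiler generated both fast paths
 * (atomic_available and tree_available are both non-zero):
 *   - team_size == 1   ->  empty_reduce_block
 *   - team_size == 4   ->  atomic_reduce_block               (<= cutoff)
 *   - team_size == 16  ->  TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER
 *   - neither fast path generated -> the default critical_reduce_block,
 * unless KMP_FORCE_REDUCTION overrides the choice, as handled at the end of
 * the routine.
 */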
8764 
8765 // this function is for testing set/get/determine reduce method
8766 kmp_int32
8767 __kmp_get_reduce_method( void ) {
8768  return ( ( __kmp_entry_thread() -> th.th_local.packed_reduction_method ) >> 8 );
8769 }
8770 
8771 /* ------------------------------------------------------------------------ */