Intel® OpenMP* Runtime Library
kmp_csupport.c
1 /*
2  * kmp_csupport.c -- kfront linkage support for OpenMP.
3  * $Revision: 42642 $
4  * $Date: 2013-09-06 01:57:24 -0500 (Fri, 06 Sep 2013) $
5  */
6 
7 /* <copyright>
8  Copyright (c) 1997-2013 Intel Corporation. All Rights Reserved.
9 
10  Redistribution and use in source and binary forms, with or without
11  modification, are permitted provided that the following conditions
12  are met:
13 
14  * Redistributions of source code must retain the above copyright
15  notice, this list of conditions and the following disclaimer.
16  * Redistributions in binary form must reproduce the above copyright
17  notice, this list of conditions and the following disclaimer in the
18  documentation and/or other materials provided with the distribution.
19  * Neither the name of Intel Corporation nor the names of its
20  contributors may be used to endorse or promote products derived
21  from this software without specific prior written permission.
22 
23  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
27  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
28  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
29  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
30  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
31  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
32  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
33  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34 
35 </copyright> */
36 
37 #include "omp.h" /* extern "C" declarations of user-visible routines */
38 #include "kmp.h"
39 #include "kmp_i18n.h"
40 #include "kmp_itt.h"
41 #include "kmp_error.h"
42 
43 #define MAX_MESSAGE 512
44 
45 /* ------------------------------------------------------------------------ */
46 /* ------------------------------------------------------------------------ */
47 
48 /* flags will be used in future, e.g., to implement */
49 /* openmp_strict library restrictions */
50 
60 void
61 __kmpc_begin(ident_t *loc, kmp_int32 flags)
62 {
63  // By default __kmp_ignore_mppbeg() returns TRUE.
64  if (__kmp_ignore_mppbeg() == FALSE) {
65  __kmp_internal_begin();
66 
67  KC_TRACE( 10, ("__kmpc_begin: called\n" ) );
68  }
69 }
70 
78 void
79 __kmpc_end(ident_t *loc)
80 {
81  // By default, __kmp_ignore_mppend() returns TRUE which makes __kmpc_end() call no-op.
82  // However, this can be overridden with KMP_IGNORE_MPPEND environment variable.
83  // If KMP_IGNORE_MPPEND is 0, __kmp_ignore_mppend() returns FALSE and __kmpc_end()
84  // will unregister this root (it can cause library shut down).
85  if (__kmp_ignore_mppend() == FALSE) {
86  KC_TRACE( 10, ("__kmpc_end: called\n" ) );
87  KA_TRACE( 30, ("__kmpc_end\n" ));
88 
89  __kmp_internal_end_thread( -1 );
90  }
91 }
92 
112 kmp_int32
113 __kmpc_global_thread_num(ident_t *loc)
114 {
115  kmp_int32 gtid = __kmp_entry_gtid();
116 
117  KC_TRACE( 10, ("__kmpc_global_thread_num: T#%d\n", gtid ) );
118 
119  return gtid;
120 }
121 
135 kmp_int32
136 __kmpc_global_num_threads(ident_t *loc)
137 {
138  KC_TRACE( 10, ("__kmpc_global_num_threads: num_threads = %d\n", __kmp_nth ) );
139 
140  return TCR_4(__kmp_nth);
141 }
142 
149 kmp_int32
150 __kmpc_bound_thread_num(ident_t *loc)
151 {
152  KC_TRACE( 10, ("__kmpc_bound_thread_num: called\n" ) );
153  return __kmp_tid_from_gtid( __kmp_entry_gtid() );
154 }
155 
161 kmp_int32
162 __kmpc_bound_num_threads(ident_t *loc)
163 {
164  KC_TRACE( 10, ("__kmpc_bound_num_threads: called\n" ) );
165 
166  return __kmp_entry_thread() -> th.th_team -> t.t_nproc;
167 }
168 
175 kmp_int32
176 __kmpc_ok_to_fork(ident_t *loc)
177 {
178 #ifndef KMP_DEBUG
179 
180  return TRUE;
181 
182 #else
183 
184  const char *semi2;
185  const char *semi3;
186  int line_no;
187 
188  if (__kmp_par_range == 0) {
189  return TRUE;
190  }
191  semi2 = loc->psource;
192  if (semi2 == NULL) {
193  return TRUE;
194  }
195  semi2 = strchr(semi2, ';');
196  if (semi2 == NULL) {
197  return TRUE;
198  }
199  semi2 = strchr(semi2 + 1, ';');
200  if (semi2 == NULL) {
201  return TRUE;
202  }
203  if (__kmp_par_range_filename[0]) {
204  const char *name = semi2 - 1;
205  while ((name > loc->psource) && (*name != '/') && (*name != ';')) {
206  name--;
207  }
208  if ((*name == '/') || (*name == ';')) {
209  name++;
210  }
211  if (strncmp(__kmp_par_range_filename, name, semi2 - name)) {
212  return __kmp_par_range < 0;
213  }
214  }
215  semi3 = strchr(semi2 + 1, ';');
216  if (__kmp_par_range_routine[0]) {
217  if ((semi3 != NULL) && (semi3 > semi2)
218  && (strncmp(__kmp_par_range_routine, semi2 + 1, semi3 - semi2 - 1))) {
219  return __kmp_par_range < 0;
220  }
221  }
222  if (sscanf(semi3 + 1, "%d", &line_no) == 1) {
223  if ((line_no >= __kmp_par_range_lb) && (line_no <= __kmp_par_range_ub)) {
224  return __kmp_par_range > 0;
225  }
226  return __kmp_par_range < 0;
227  }
228  return TRUE;
229 
230 #endif /* KMP_DEBUG */
231 
232 }
233 
239 kmp_int32
240 __kmpc_in_parallel(ident_t *loc)
241 {
242  return __kmp_entry_thread() -> th.th_root -> r.r_active;
243 }
244 
254 void
255 __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_threads )
256 {
257  KA_TRACE( 20, ("__kmpc_push_num_threads: enter T#%d num_threads=%d\n",
258  global_tid, num_threads ) );
259 
260  __kmp_push_num_threads( loc, global_tid, num_threads );
261 }
262 
263 void
264 __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid )
265 {
266  KA_TRACE( 20, ("__kmpc_pop_num_threads: enter\n" ) );
267 
268  /* the num_threads are automatically popped */
269 }
270 
271 
272 #if OMP_40_ENABLED
273 
274 void
275 __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid, kmp_int32 proc_bind )
276 {
277  KA_TRACE( 20, ("__kmpc_push_proc_bind: enter T#%d proc_bind=%d\n",
278  global_tid, proc_bind ) );
279 
280  __kmp_push_proc_bind( loc, global_tid, (kmp_proc_bind_t)proc_bind );
281 }
282 
283 #endif /* OMP_40_ENABLED */
284 
285 
295 void
296 __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...)
297 {
298  int gtid = __kmp_entry_gtid();
299  // maybe saving thr_state is enough here
300  {
301  va_list ap;
302  va_start( ap, microtask );
303 
304  __kmp_fork_call( loc, gtid, TRUE,
305  argc,
306  VOLATILE_CAST(microtask_t) microtask,
307  VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
308 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
309 #if KMP_ARCH_X86_64 && KMP_OS_LINUX
310  &ap
311 #else
312  ap
313 #endif
314  );
315  __kmp_join_call( loc, gtid );
316 
317  va_end( ap );
318  }
319 }
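
The following is an illustrative, hand-written sketch (it is not part of kmp_csupport.c) of how a compiler might lower "#pragma omp parallel num_threads(4)" onto the entry points above. The outlined function name, the ident_t initializer and its psource string are assumptions made for the example; only the entry-point names and parameter lists come from this file and kmp.h.

    #include "kmp.h"   /* ident_t, kmp_int32, kmpc_micro, __kmpc_* prototypes */

    /* Outlined body of the parallel region. The runtime passes the global and
       bound thread ids; the remaining parameters are the shared variables
       forwarded through __kmpc_fork_call. */
    static void outlined_parallel_body( kmp_int32 *gtid, kmp_int32 *btid, int *shared_a )
    {
        /* ... user code of the parallel region, operating on *shared_a ... */
    }

    void lowered_parallel_region( int a )
    {
        /* illustrative source-location record; compilers emit one per construct */
        static ident_t loc = { 0, KMP_IDENT_KMPC, 0, 0, ";file.c;lowered_parallel_region;10;1;;" };
        kmp_int32 gtid = __kmpc_global_thread_num( &loc );

        __kmpc_push_num_threads( &loc, gtid, 4 );      /* num_threads(4) clause */
        __kmpc_fork_call( &loc, 1,                     /* argc = one shared argument */
                          (kmpc_micro) outlined_parallel_body, &a );
    }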
320 
321 #if OMP_40_ENABLED
322 
332 void
333 __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_teams, kmp_int32 num_threads )
334 {
335  KA_TRACE( 20, ("__kmpc_push_num_teams: enter T#%d num_teams=%d num_threads=%d\n",
336  global_tid, num_teams, num_threads ) );
337 
338  __kmp_push_num_teams( loc, global_tid, num_teams, num_threads );
339 }
340 
350 void
351 __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...)
352 {
353  int gtid = __kmp_entry_gtid();
354  kmp_info_t *this_thr = __kmp_threads[ gtid ];
355  va_list ap;
356  va_start( ap, microtask );
357 
358  // remember teams entry point and nesting level
359  this_thr->th.th_team_microtask = microtask;
360  this_thr->th.th_teams_level = this_thr->th.th_team->t.t_level; // AC: can be >0 on host
361 
362  // check whether __kmpc_push_num_teams was called; set the default number of teams otherwise
363  if ( this_thr->th.th_set_nth_teams == 0 ) {
364  __kmp_push_num_teams( loc, gtid, 0, 0 );
365  }
366  KMP_DEBUG_ASSERT(this_thr->th.th_set_nproc >= 1);
367  KMP_DEBUG_ASSERT(this_thr->th.th_set_nth_teams >= 1);
368 
369  __kmp_fork_call( loc, gtid, TRUE,
370  argc,
371  VOLATILE_CAST(microtask_t) __kmp_teams_master,
372  VOLATILE_CAST(launch_t) __kmp_invoke_teams_master,
373 #if KMP_ARCH_X86_64 && KMP_OS_LINUX
374  &ap
375 #else
376  ap
377 #endif
378  );
379  __kmp_join_call( loc, gtid );
380  this_thr->th.th_team_microtask = NULL;
381  this_thr->th.th_teams_level = 0;
382 
383  va_end( ap );
384 }
385 #endif /* OMP_40_ENABLED */
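
As a companion to the sketch after __kmpc_fork_call, here is a hedged sketch of how "#pragma omp teams num_teams(8) thread_limit(4)" might be lowered onto __kmpc_push_num_teams and __kmpc_fork_teams (again not part of this file; the function names and the loc record are illustrative):

    static void outlined_teams_body( kmp_int32 *gtid, kmp_int32 *btid, int *shared_a )
    {
        /* ... user code of the teams region ... */
    }

    void lowered_teams_region( int a )
    {
        static ident_t loc = { 0, KMP_IDENT_KMPC, 0, 0, ";file.c;lowered_teams_region;20;1;;" };
        kmp_int32 gtid = __kmpc_global_thread_num( &loc );

        __kmpc_push_num_teams( &loc, gtid, 8, 4 );     /* num_teams(8) thread_limit(4) */
        __kmpc_fork_teams( &loc, 1, (kmpc_micro) outlined_teams_body, &a );
    }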
386 
387 
388 //
389 // I don't think this function should ever have been exported.
390 // The __kmpc_ prefix was misapplied. I'm fairly certain that no generated
391 // openmp code ever called it, but it's been exported from the RTL for so
392 // long that I'm afraid to remove the definition.
393 //
394 int
395 __kmpc_invoke_task_func( int gtid )
396 {
397  return __kmp_invoke_task_func( gtid );
398 }
399 
412 void
413 __kmpc_serialized_parallel(ident_t *loc, kmp_int32 global_tid)
414 {
415  kmp_info_t *this_thr;
416  kmp_team_t *serial_team;
417 
418  KC_TRACE( 10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid ) );
419 
420  /* Skip all this code for autopar serialized loops since it results in
421  unacceptable overhead */
422  if( loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR ) )
423  return;
424 
425  if( ! TCR_4( __kmp_init_parallel ) )
426  __kmp_parallel_initialize();
427 
428  this_thr = __kmp_threads[ global_tid ];
429  serial_team = this_thr -> th.th_serial_team;
430 
431  /* utilize the serialized team held by this thread */
432  KMP_DEBUG_ASSERT( serial_team );
433  KMP_MB();
434 
435 #if OMP_30_ENABLED
436  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
437  KMP_DEBUG_ASSERT( this_thr -> th.th_task_team == this_thr -> th.th_team -> t.t_task_team );
438  KMP_DEBUG_ASSERT( serial_team -> t.t_task_team == NULL );
439  KA_TRACE( 20, ( "__kmpc_serialized_parallel: T#%d pushing task_team %p / team %p, new task_team = NULL\n",
440  global_tid, this_thr -> th.th_task_team, this_thr -> th.th_team ) );
441  this_thr -> th.th_task_team = NULL;
442  }
443 #endif // OMP_30_ENABLED
444 
445 #if OMP_40_ENABLED
446  kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
447  if ( this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false ) {
448  proc_bind = proc_bind_false;
449  }
450  else if ( proc_bind == proc_bind_default ) {
451  //
452  // No proc_bind clause was specified, so use the current value
453  // of proc-bind-var for this parallel region.
454  //
455  proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
456  }
457  //
458  // Reset for next parallel region
459  //
460  this_thr->th.th_set_proc_bind = proc_bind_default;
461 #endif /* OMP_40_ENABLED */
462 
463  if( this_thr -> th.th_team != serial_team ) {
464 #if OMP_30_ENABLED
465  // Nested level will be an index in the nested nthreads array
466  int level = this_thr->th.th_team->t.t_level;
467 #endif
468  if( serial_team -> t.t_serialized ) {
469  /* this serial team was already used
470  * TODO increase performance by making these locks more specific */
471  kmp_team_t *new_team;
472  int tid = this_thr->th.th_info.ds.ds_tid;
473 
474  __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
475 
476  new_team = __kmp_allocate_team(this_thr->th.th_root, 1, 1,
477 #if OMP_40_ENABLED
478  proc_bind,
479 #endif
480 #if OMP_30_ENABLED
481  & this_thr->th.th_current_task->td_icvs,
482 #else
483  this_thr->th.th_team->t.t_set_nproc[tid],
484  this_thr->th.th_team->t.t_set_dynamic[tid],
485  this_thr->th.th_team->t.t_set_nested[tid],
486  this_thr->th.th_team->t.t_set_blocktime[tid],
487  this_thr->th.th_team->t.t_set_bt_intervals[tid],
488  this_thr->th.th_team->t.t_set_bt_set[tid],
489 #endif // OMP_30_ENABLED
490  0);
491  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
492  KMP_ASSERT( new_team );
493 
494  /* setup new serialized team and install it */
495  new_team -> t.t_threads[0] = this_thr;
496  new_team -> t.t_parent = this_thr -> th.th_team;
497  serial_team = new_team;
498  this_thr -> th.th_serial_team = serial_team;
499 
500  KF_TRACE( 10, ( "__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
501  global_tid, serial_team ) );
502 
503 
504  /* TODO the above breaks the requirement that if we run out of
505  * resources, then we can still guarantee that serialized teams
506  * are ok, since we may need to allocate a new one */
507  } else {
508  KF_TRACE( 10, ( "__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
509  global_tid, serial_team ) );
510  }
511 
512  /* we have to initialize this serial team */
513  KMP_DEBUG_ASSERT( serial_team->t.t_threads );
514  KMP_DEBUG_ASSERT( serial_team->t.t_threads[0] == this_thr );
515  KMP_DEBUG_ASSERT( this_thr->th.th_team != serial_team );
516  serial_team -> t.t_ident = loc;
517  serial_team -> t.t_serialized = 1;
518  serial_team -> t.t_nproc = 1;
519  serial_team -> t.t_parent = this_thr->th.th_team;
520 #if OMP_30_ENABLED
521  serial_team -> t.t_sched = this_thr->th.th_team->t.t_sched;
522 #endif // OMP_30_ENABLED
523  this_thr -> th.th_team = serial_team;
524  serial_team -> t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
525 
526 #if OMP_30_ENABLED
527  KF_TRACE( 10, ( "__kmpc_serialized_parallel: T#%d curtask=%p\n",
528  global_tid, this_thr->th.th_current_task ) );
529  KMP_ASSERT( this_thr->th.th_current_task->td_flags.executing == 1 );
530  this_thr->th.th_current_task->td_flags.executing = 0;
531 
532  __kmp_push_current_task_to_thread( this_thr, serial_team, 0 );
533 
534  /* TODO: GEH: do the ICVs work for nested serialized teams? Don't we need an implicit task for
535  each serialized task represented by team->t.t_serialized? */
536  copy_icvs(
537  & this_thr->th.th_current_task->td_icvs,
538  & this_thr->th.th_current_task->td_parent->td_icvs );
539 
540  // Thread value exists in the nested nthreads array for the next nested level
541  if ( __kmp_nested_nth.used && ( level + 1 < __kmp_nested_nth.used ) ) {
542  this_thr->th.th_current_task->td_icvs.nproc = __kmp_nested_nth.nth[ level + 1 ];
543  }
544 
545 #if OMP_40_ENABLED
546  if ( __kmp_nested_proc_bind.used && ( level + 1 < __kmp_nested_proc_bind.used ) ) {
547  this_thr->th.th_current_task->td_icvs.proc_bind
548  = __kmp_nested_proc_bind.bind_types[ level + 1 ];
549  }
550 #endif /* OMP_40_ENABLED */
551 
552 #else /* pre-3.0 icv's */
553  serial_team -> t.t_set_nproc[0] = serial_team->t.t_parent->
554  t.t_set_nproc[serial_team->
555  t.t_master_tid];
556  serial_team -> t.t_set_dynamic[0] = serial_team->t.t_parent->
557  t.t_set_dynamic[serial_team->
558  t.t_master_tid];
559  serial_team -> t.t_set_nested[0] = serial_team->t.t_parent->
560  t.t_set_nested[serial_team->
561  t.t_master_tid];
562  serial_team -> t.t_set_blocktime[0] = serial_team->t.t_parent->
563  t.t_set_blocktime[serial_team->
564  t.t_master_tid];
565  serial_team -> t.t_set_bt_intervals[0] = serial_team->t.t_parent->
566  t.t_set_bt_intervals[serial_team->
567  t.t_master_tid];
568  serial_team -> t.t_set_bt_set[0] = serial_team->t.t_parent->
569  t.t_set_bt_set[serial_team->
570  t.t_master_tid];
571 #endif // OMP_30_ENABLED
572  this_thr -> th.th_info.ds.ds_tid = 0;
573 
574  /* set thread cache values */
575  this_thr -> th.th_team_nproc = 1;
576  this_thr -> th.th_team_master = this_thr;
577  this_thr -> th.th_team_serialized = 1;
578 
579 #if OMP_30_ENABLED
580  serial_team -> t.t_level = serial_team -> t.t_parent -> t.t_level + 1;
581  serial_team -> t.t_active_level = serial_team -> t.t_parent -> t.t_active_level;
582 #endif // OMP_30_ENABLED
583 
584 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
585  if ( __kmp_inherit_fp_control ) {
586  __kmp_store_x87_fpu_control_word( &serial_team->t.t_x87_fpu_control_word );
587  __kmp_store_mxcsr( &serial_team->t.t_mxcsr );
588  serial_team->t.t_mxcsr &= KMP_X86_MXCSR_MASK;
589  serial_team->t.t_fp_control_saved = TRUE;
590  } else {
591  serial_team->t.t_fp_control_saved = FALSE;
592  }
593 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
594  /* check if we need to allocate dispatch buffers stack */
595  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
596  if ( !serial_team->t.t_dispatch->th_disp_buffer ) {
597  serial_team->t.t_dispatch->th_disp_buffer = (dispatch_private_info_t *)
598  __kmp_allocate( sizeof( dispatch_private_info_t ) );
599  }
600  this_thr -> th.th_dispatch = serial_team->t.t_dispatch;
601 
602  KMP_MB();
603 
604  } else {
605  /* this serialized team is already being used,
606  * that's fine, just add another nested level */
607  KMP_DEBUG_ASSERT( this_thr->th.th_team == serial_team );
608  KMP_DEBUG_ASSERT( serial_team -> t.t_threads );
609  KMP_DEBUG_ASSERT( serial_team -> t.t_threads[0] == this_thr );
610  ++ serial_team -> t.t_serialized;
611  this_thr -> th.th_team_serialized = serial_team -> t.t_serialized;
612 
613 #if OMP_30_ENABLED
614  // Nested level will be an index in the nested nthreads array
615  int level = this_thr->th.th_team->t.t_level;
616  // Thread value exists in the nested nthreads array for the next nested level
617  if ( __kmp_nested_nth.used && ( level + 1 < __kmp_nested_nth.used ) ) {
618  this_thr->th.th_current_task->td_icvs.nproc = __kmp_nested_nth.nth[ level + 1 ];
619  }
620  serial_team -> t.t_level++;
621  KF_TRACE( 10, ( "__kmpc_serialized_parallel: T#%d increasing nesting level of serial team %p to %d\n",
622  global_tid, serial_team, serial_team -> t.t_level ) );
623 #else
624  KF_TRACE( 10, ( "__kmpc_serialized_parallel: T#%d reusing team %p for nested serialized parallel region\n",
625  global_tid, serial_team ) );
626 #endif // OMP_30_ENABLED
627 
628  /* allocate/push dispatch buffers stack */
629  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
630  {
631  dispatch_private_info_t * disp_buffer = (dispatch_private_info_t *)
632  __kmp_allocate( sizeof( dispatch_private_info_t ) );
633  disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
634  serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
635  }
636  this_thr -> th.th_dispatch = serial_team->t.t_dispatch;
637 
638  KMP_MB();
639  }
640 
641  if ( __kmp_env_consistency_check )
642  __kmp_push_parallel( global_tid, NULL );
643 
644 #if USE_ITT_BUILD
645  // Mark the start of the "parallel" region for VTune. Only one of the frame notification schemes is used at the moment.
646  if ( ( __itt_frame_begin_v3_ptr && __kmp_forkjoin_frames && ! __kmp_forkjoin_frames_mode ) || KMP_ITT_DEBUG )
647  {
648  __kmp_itt_region_forking( global_tid, 1 );
649  }
650  // Collect information only if the file was opened successfully.
651  if( __kmp_forkjoin_frames_mode == 1 && __kmp_itt_csv_file )
652  {
653  if( this_thr->th.th_team->t.t_level == 1 ) {
654  kmp_uint64 fr_begin;
655 #if defined( __GNUC__ )
656 # if !defined( __INTEL_COMPILER )
657  fr_begin = __kmp_hardware_timestamp();
658 # else
659  fr_begin = __rdtsc();
660 # endif
661 #else
662  fr_begin = __rdtsc();
663 #endif
664  this_thr->th.th_frame_time_serialized = fr_begin;
665  }
666  }
667 #endif /* USE_ITT_BUILD */
668 
669 }
670 
678 void
679 __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid)
680 {
681  kmp_internal_control_t *top;
682  kmp_info_t *this_thr;
683  kmp_team_t *serial_team;
684 
685  KC_TRACE( 10, ("__kmpc_end_serialized_parallel: called by T#%d\n", global_tid ) );
686 
687  /* skip all this code for autopar serialized loops since it results in
688  unacceptable overhead */
689  if( loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR ) )
690  return;
691 
692  // Not autopar code
693  if( ! TCR_4( __kmp_init_parallel ) )
694  __kmp_parallel_initialize();
695 
696  this_thr = __kmp_threads[ global_tid ];
697  serial_team = this_thr->th.th_serial_team;
698 
699  KMP_MB();
700  KMP_DEBUG_ASSERT( serial_team );
701  KMP_ASSERT( serial_team -> t.t_serialized );
702  KMP_DEBUG_ASSERT( this_thr -> th.th_team == serial_team );
703  KMP_DEBUG_ASSERT( serial_team != this_thr->th.th_root->r.r_root_team );
704  KMP_DEBUG_ASSERT( serial_team -> t.t_threads );
705  KMP_DEBUG_ASSERT( serial_team -> t.t_threads[0] == this_thr );
706 
707  /* If necessary, pop the internal control stack values and replace the team values */
708  top = serial_team -> t.t_control_stack_top;
709  if ( top && top -> serial_nesting_level == serial_team -> t.t_serialized ) {
710 #if OMP_30_ENABLED
711  copy_icvs(
712  &serial_team -> t.t_threads[0] -> th.th_current_task -> td_icvs,
713  top );
714 #else
715  serial_team -> t.t_set_nproc[0] = top -> nproc;
716  serial_team -> t.t_set_dynamic[0] = top -> dynamic;
717  serial_team -> t.t_set_nested[0] = top -> nested;
718  serial_team -> t.t_set_blocktime[0] = top -> blocktime;
719  serial_team -> t.t_set_bt_intervals[0] = top -> bt_intervals;
720  serial_team -> t.t_set_bt_set[0] = top -> bt_set;
721 #endif // OMP_30_ENABLED
722  serial_team -> t.t_control_stack_top = top -> next;
723  __kmp_free(top);
724  }
725 
726 #if OMP_30_ENABLED
727  //if( serial_team -> t.t_serialized > 1 )
728  serial_team -> t.t_level--;
729 #endif // OMP_30_ENABLED
730 
731  /* pop dispatch buffers stack */
732  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch->th_disp_buffer);
733  {
734  dispatch_private_info_t * disp_buffer = serial_team->t.t_dispatch->th_disp_buffer;
735  serial_team->t.t_dispatch->th_disp_buffer =
736  serial_team->t.t_dispatch->th_disp_buffer->next;
737  __kmp_free( disp_buffer );
738  }
739 
740  -- serial_team -> t.t_serialized;
741  if ( serial_team -> t.t_serialized == 0 ) {
742 
743  /* return to the parallel section */
744 
745 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
746  if ( __kmp_inherit_fp_control && serial_team->t.t_fp_control_saved ) {
747  __kmp_clear_x87_fpu_status_word();
748  __kmp_load_x87_fpu_control_word( &serial_team->t.t_x87_fpu_control_word );
749  __kmp_load_mxcsr( &serial_team->t.t_mxcsr );
750  }
751 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
752 
753  this_thr -> th.th_team = serial_team -> t.t_parent;
754  this_thr -> th.th_info.ds.ds_tid = serial_team -> t.t_master_tid;
755 
756  /* restore values cached in the thread */
757  this_thr -> th.th_team_nproc = serial_team -> t.t_parent -> t.t_nproc; /* JPH */
758  this_thr -> th.th_team_master = serial_team -> t.t_parent -> t.t_threads[0]; /* JPH */
759  this_thr -> th.th_team_serialized = this_thr -> th.th_team -> t.t_serialized;
760 
761  /* TODO the below shouldn't need to be adjusted for serialized teams */
762  this_thr -> th.th_dispatch = & this_thr -> th.th_team ->
763  t.t_dispatch[ serial_team -> t.t_master_tid ];
764 
765 #if OMP_30_ENABLED
766  __kmp_pop_current_task_from_thread( this_thr );
767 
768  KMP_ASSERT( this_thr -> th.th_current_task -> td_flags.executing == 0 );
769  this_thr -> th.th_current_task -> td_flags.executing = 1;
770 
771  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
772  //
773  // Copy the task team from the new child / old parent team
774  // to the thread. If non-NULL, copy the state flag also.
775  //
776  if ( ( this_thr -> th.th_task_team = this_thr -> th.th_team -> t.t_task_team ) != NULL ) {
777  this_thr -> th.th_task_state = this_thr -> th.th_task_team -> tt.tt_state;
778  }
779  KA_TRACE( 20, ( "__kmpc_end_serialized_parallel: T#%d restoring task_team %p / team %p\n",
780  global_tid, this_thr -> th.th_task_team, this_thr -> th.th_team ) );
781  }
782 #endif // OMP_30_ENABLED
783 
784  }
785  else {
786 
787 #if OMP_30_ENABLED
788  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
789  KA_TRACE( 20, ( "__kmpc_end_serialized_parallel: T#%d decreasing nesting depth of serial team %p to %d\n",
790  global_tid, serial_team, serial_team -> t.t_serialized ) );
791  }
792 #endif // OMP_30_ENABLED
793 
794  }
795 
796 #if USE_ITT_BUILD
797  // Mark the end of the "parallel" region for VTune. Only one of the frame notification schemes is used at the moment.
798  if ( ( __itt_frame_end_v3_ptr && __kmp_forkjoin_frames && ! __kmp_forkjoin_frames_mode ) || KMP_ITT_DEBUG )
799  {
800  __kmp_itt_region_joined( global_tid, 1 );
801  }
802  // Collect information only if the file was opened successfully.
803  if( __kmp_forkjoin_frames_mode == 1 && __kmp_itt_csv_file )
804  {
805  if( this_thr->th.th_team->t.t_level == 0 ) {
806  ident_t * loc = this_thr->th.th_ident;
807  if (loc) {
808  // Use compiler-generated location to mark the frame:
809  // "<func>$omp$frame@[file:]<line>[:<col>]"
810  kmp_str_loc_t str_loc = __kmp_str_loc_init( loc->psource, 1 );
811 
812  kmp_uint64 fr_end;
813 #if defined( __GNUC__ )
814 # if !defined( __INTEL_COMPILER )
815  fr_end = __kmp_hardware_timestamp();
816 # else
817  fr_end = __rdtsc();
818 # endif
819 #else
820  fr_end = __rdtsc();
821 #endif
822  K_DIAG( 3, ( "__kmpc_end_serialized_parallel: T#%d frame_begin = %llu, frame_end = %llu\n",
823  global_tid, this_thr->th.th_frame_time, fr_end ) );
824 
825  __kmp_str_buf_print( &__kmp_itt_frame_buffer, "%s$omp$frame@%s:%d:%d,%llu,%llu,,\n",
826  str_loc.func, str_loc.file, str_loc.line, str_loc.col, this_thr->th.th_frame_time_serialized, fr_end );
827  __kmp_str_loc_free( &str_loc );
828  }
829  }
830  }
831 #endif /* USE_ITT_BUILD */
832 
833  if ( __kmp_env_consistency_check )
834  __kmp_pop_parallel( global_tid, NULL );
835 }
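
For reference, a sketch (not part of this file) of the calling pattern a compiler might use when a parallel region's if clause evaluates to false, so the region is executed serially by the encountering thread. The outlined function has the same shape as in the __kmpc_fork_call sketch earlier; names are illustrative.

    void lowered_serialized_region( ident_t *loc, kmp_int32 gtid, int *a )
    {
        kmp_int32 bound_tid = 0;

        __kmpc_serialized_parallel( loc, gtid );
        /* the encountering thread runs the outlined body itself */
        outlined_parallel_body( &gtid, &bound_tid, a );
        __kmpc_end_serialized_parallel( loc, gtid );
    }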
836 
849 void
850 __kmpc_flush(ident_t *loc, ...)
851 {
852  KC_TRACE( 10, ("__kmpc_flush: called\n" ) );
853 
854  /* need explicit __mf() here since we use volatile instead in the library */
855  KMP_MB(); /* Flush all pending memory write invalidates. */
856 
857  // This is not an OMP 3.0 feature.
858  // This macro is used here just to keep the change out of 10.1.
859  // This change will go to the mainline first.
860  #if OMP_30_ENABLED
861  #if ( KMP_ARCH_X86 || KMP_ARCH_X86_64 )
862  #if KMP_MIC
863  // fence-style instructions do not exist, but lock; xaddl $0,(%rsp) can be used.
864  // We shouldn't need it, though, since the ABI rules require that
865  // * If the compiler generates NGO stores it also generates the fence
866  // * If users hand-code NGO stores they should insert the fence
867  // therefore no incomplete unordered stores should be visible.
868  #else
869  // C74404
870  // This is to address non-temporal store instructions (sfence needed).
871  // The clflush instruction is also addressed (mfence needed).
872  // Probably the non-temporal load movntdqa instruction should also be addressed.
873  // mfence is a SSE2 instruction. Do not execute it if CPU is not SSE2.
874  if ( ! __kmp_cpuinfo.initialized ) {
875  __kmp_query_cpuid( & __kmp_cpuinfo );
876  }; // if
877  if ( ! __kmp_cpuinfo.sse2 ) {
878  // CPU cannot execute SSE2 instructions.
879  } else {
880  #if defined( __GNUC__ ) && !defined( __INTEL_COMPILER )
881  __sync_synchronize();
882  #else
883  _mm_mfence();
884  #endif // __GNUC__
885  }; // if
886  #endif // KMP_MIC
887  #else
888  #error Unknown or unsupported architecture
889  #endif
890  #endif // OMP_30_ENABLED
891 
892 }
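
Usage is a single call per flush construct. A minimal sketch (not part of this file), assuming loc is the construct's ident_t record as in the earlier sketches; as the code above shows, the runtime fences all memory regardless of any flush list written in the pragma:

    void lowered_flush( ident_t *loc )
    {
        /* #pragma omp flush(x) */
        __kmpc_flush( loc );
    }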
893 
894 /* -------------------------------------------------------------------------- */
895 
896 /* -------------------------------------------------------------------------- */
897 
905 void
906 __kmpc_barrier(ident_t *loc, kmp_int32 global_tid)
907 {
908  int explicit_barrier_flag;
909  KC_TRACE( 10, ("__kmpc_barrier: called T#%d\n", global_tid ) );
910 
911  if (! TCR_4(__kmp_init_parallel))
912  __kmp_parallel_initialize();
913 
914  if ( __kmp_env_consistency_check ) {
915  if ( loc == 0 ) {
916  KMP_WARNING( ConstructIdentInvalid ); // ??? What does it mean for the user?
917  }; // if
918 
919  __kmp_check_barrier( global_tid, ct_barrier, loc );
920  }
921 
922  __kmp_threads[ global_tid ]->th.th_ident = loc;
923  // TODO: explicit barrier_wait_id:
924  // this function is called when 'barrier' directive is present or
925  // implicit barrier at the end of a worksharing construct.
926  // 1) better to add a per-thread barrier counter to a thread data structure
927  // 2) set to 0 when a new team is created
928  // 4) no sync is required
929 
930  __kmp_barrier( bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL );
931 }
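
A minimal usage sketch (not part of this file), with loc an ident_t record as in the earlier sketches:

    void lowered_barrier( ident_t *loc )
    {
        kmp_int32 gtid = __kmpc_global_thread_num( loc );
        __kmpc_barrier( loc, gtid );       /* #pragma omp barrier */
    }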
932 
933 /* The BARRIER for a MASTER section is always explicit */
940 kmp_int32
941 __kmpc_master(ident_t *loc, kmp_int32 global_tid)
942 {
943  int status = 0;
944 
945  KC_TRACE( 10, ("__kmpc_master: called T#%d\n", global_tid ) );
946 
947  if( ! TCR_4( __kmp_init_parallel ) )
948  __kmp_parallel_initialize();
949 
950  if( KMP_MASTER_GTID( global_tid ))
951  status = 1;
952 
953  if ( __kmp_env_consistency_check ) {
954  if (status)
955  __kmp_push_sync( global_tid, ct_master, loc, NULL );
956  else
957  __kmp_check_sync( global_tid, ct_master, loc, NULL );
958  }
959 
960  return status;
961 }
962 
971 void
972 __kmpc_end_master(ident_t *loc, kmp_int32 global_tid)
973 {
974  KC_TRACE( 10, ("__kmpc_end_master: called T#%d\n", global_tid ) );
975 
976  KMP_DEBUG_ASSERT( KMP_MASTER_GTID( global_tid ));
977 
978  if ( __kmp_env_consistency_check ) {
979  if( global_tid < 0 )
980  KMP_WARNING( ThreadIdentInvalid );
981 
982  if( KMP_MASTER_GTID( global_tid ))
983  __kmp_pop_sync( global_tid, ct_master, loc );
984  }
985 }
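
The two calls bracket the construct body. A sketch of the usual lowering of "#pragma omp master" (illustrative, not part of this file):

    void lowered_master( ident_t *loc, kmp_int32 gtid )
    {
        if ( __kmpc_master( loc, gtid ) ) {
            /* ... body, executed by the master thread only ... */
            __kmpc_end_master( loc, gtid );
        }
        /* no implied barrier at the end of a master construct */
    }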
986 
994 void
995 __kmpc_ordered( ident_t * loc, kmp_int32 gtid )
996 {
997  int cid = 0;
998  kmp_info_t *th;
999  KMP_DEBUG_ASSERT( __kmp_init_serial );
1000 
1001  KC_TRACE( 10, ("__kmpc_ordered: called T#%d\n", gtid ));
1002 
1003  if (! TCR_4(__kmp_init_parallel))
1004  __kmp_parallel_initialize();
1005 
1006 #if USE_ITT_BUILD
1007  __kmp_itt_ordered_prep( gtid );
1008  // TODO: ordered_wait_id
1009 #endif /* USE_ITT_BUILD */
1010 
1011  th = __kmp_threads[ gtid ];
1012 
1013  if ( th -> th.th_dispatch -> th_deo_fcn != 0 )
1014  (*th->th.th_dispatch->th_deo_fcn)( & gtid, & cid, loc );
1015  else
1016  __kmp_parallel_deo( & gtid, & cid, loc );
1017 
1018 #if USE_ITT_BUILD
1019  __kmp_itt_ordered_start( gtid );
1020 #endif /* USE_ITT_BUILD */
1021 }
1022 
1030 void
1031 __kmpc_end_ordered( ident_t * loc, kmp_int32 gtid )
1032 {
1033  int cid = 0;
1034  kmp_info_t *th;
1035 
1036  KC_TRACE( 10, ("__kmpc_end_ordered: called T#%d\n", gtid ) );
1037 
1038 #if USE_ITT_BUILD
1039  __kmp_itt_ordered_end( gtid );
1040  // TODO: ordered_wait_id
1041 #endif /* USE_ITT_BUILD */
1042 
1043  th = __kmp_threads[ gtid ];
1044 
1045  if ( th -> th.th_dispatch -> th_dxo_fcn != 0 )
1046  (*th->th.th_dispatch->th_dxo_fcn)( & gtid, & cid, loc );
1047  else
1048  __kmp_parallel_dxo( & gtid, & cid, loc );
1049 }
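
A sketch of how the pair is used around an ordered body inside a loop chunk handed out by the dispatcher (illustrative, not part of this file):

    void lowered_ordered_body( ident_t *loc, kmp_int32 gtid )
    {
        __kmpc_ordered( loc, gtid );
        /* ... ordered body, executed in loop-iteration order ... */
        __kmpc_end_ordered( loc, gtid );
    }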
1050 
1051 inline void
1052 __kmp_static_yield( int arg ) { // AC: needed in macro __kmp_acquire_user_lock_with_checks
1053  __kmp_yield( arg );
1054 }
1055 
1056 static kmp_user_lock_p
1057 __kmp_get_critical_section_ptr( kmp_critical_name * crit, ident_t const * loc, kmp_int32 gtid )
1058 {
1059  kmp_user_lock_p *lck_pp = (kmp_user_lock_p *)crit;
1060 
1061  //
1062  // Because of the double-check, the following load
1063  // doesn't need to be volatile.
1064  //
1065  kmp_user_lock_p lck = (kmp_user_lock_p)TCR_PTR( *lck_pp );
1066 
1067  if ( lck == NULL ) {
1068  void * idx;
1069 
1070  // Allocate & initialize the lock.
1071  // Remember allocated locks in table in order to free them in __kmp_cleanup()
1072  lck = __kmp_user_lock_allocate( &idx, gtid, kmp_lf_critical_section );
1073  __kmp_init_user_lock_with_checks( lck );
1074  __kmp_set_user_lock_location( lck, loc );
1075 #if USE_ITT_BUILD
1076  __kmp_itt_critical_creating( lck );
1077  // __kmp_itt_critical_creating() should be called *before* the first usage of underlying
1078  // lock. It is the only place where we can guarantee it. There is a chance the lock will be
1079  // destroyed without any usage, but that is not a problem, because this is not a real event seen
1080  // by the user but rather the setting of a name for an object (the lock). See more details in kmp_itt.h.
1081 #endif /* USE_ITT_BUILD */
1082 
1083  //
1084  // Use a cmpxchg instruction to slam the start of the critical
1085  // section with the lock pointer. If another thread beat us
1086  // to it, deallocate the lock, and use the lock that the other
1087  // thread allocated.
1088  //
1089  int status = KMP_COMPARE_AND_STORE_PTR( lck_pp, 0, lck );
1090 
1091  if ( status == 0 ) {
1092  // Deallocate the lock and reload the value.
1093 #if USE_ITT_BUILD
1094  __kmp_itt_critical_destroyed( lck );
1095  // Let ITT know the lock is destroyed and the same memory location may be reused for
1096  // another purpose.
1097 #endif /* USE_ITT_BUILD */
1098  __kmp_destroy_user_lock_with_checks( lck );
1099  __kmp_user_lock_free( &idx, gtid, lck );
1100  lck = (kmp_user_lock_p)TCR_PTR( *lck_pp );
1101  KMP_DEBUG_ASSERT( lck != NULL );
1102  }
1103  }
1104  return lck;
1105 }
1106 
1117 void
1118 __kmpc_critical( ident_t * loc, kmp_int32 global_tid, kmp_critical_name * crit ) {
1119 
1120  kmp_user_lock_p lck;
1121 
1122  KC_TRACE( 10, ("__kmpc_critical: called T#%d\n", global_tid ) );
1123 
1124  //TODO: add THR_OVHD_STATE
1125 
1126  KMP_CHECK_USER_LOCK_INIT();
1127 
1128  if ( ( __kmp_user_lock_kind == lk_tas )
1129  && ( sizeof( lck->tas.lk.poll ) <= OMP_CRITICAL_SIZE ) ) {
1130  lck = (kmp_user_lock_p)crit;
1131  }
1132 #if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64)
1133  else if ( ( __kmp_user_lock_kind == lk_futex )
1134  && ( sizeof( lck->futex.lk.poll ) <= OMP_CRITICAL_SIZE ) ) {
1135  lck = (kmp_user_lock_p)crit;
1136  }
1137 #endif
1138  else { // ticket, queuing or drdpa
1139  lck = __kmp_get_critical_section_ptr( crit, loc, global_tid );
1140  }
1141 
1142  if ( __kmp_env_consistency_check )
1143  __kmp_push_sync( global_tid, ct_critical, loc, lck );
1144 
1145  /* since the critical directive binds to all threads, not just
1146  * the current team we have to check this even if we are in a
1147  * serialized team */
1148  /* also, even if we are the uber thread, we still have to conduct the lock,
1149  * as we have to contend with sibling threads */
1150 
1151 #if USE_ITT_BUILD
1152  __kmp_itt_critical_acquiring( lck );
1153 #endif /* USE_ITT_BUILD */
1154  // Value of 'crit' should be good for using as a critical_id of the critical section directive.
1155 
1156  __kmp_acquire_user_lock_with_checks( lck, global_tid );
1157 
1158 #if USE_ITT_BUILD
1159  __kmp_itt_critical_acquired( lck );
1160 #endif /* USE_ITT_BUILD */
1161 
1162  KA_TRACE( 15, ("__kmpc_critical: done T#%d\n", global_tid ));
1163 } // __kmpc_critical
1164 
1174 void
1175 __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid, kmp_critical_name *crit)
1176 {
1177  kmp_user_lock_p lck;
1178 
1179  KC_TRACE( 10, ("__kmpc_end_critical: called T#%d\n", global_tid ));
1180 
1181  if ( ( __kmp_user_lock_kind == lk_tas )
1182  && ( sizeof( lck->tas.lk.poll ) <= OMP_CRITICAL_SIZE ) ) {
1183  lck = (kmp_user_lock_p)crit;
1184  }
1185 #if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64)
1186  else if ( ( __kmp_user_lock_kind == lk_futex )
1187  && ( sizeof( lck->futex.lk.poll ) <= OMP_CRITICAL_SIZE ) ) {
1188  lck = (kmp_user_lock_p)crit;
1189  }
1190 #endif
1191  else { // ticket, queuing or drdpa
1192  lck = (kmp_user_lock_p) TCR_PTR(*((kmp_user_lock_p *)crit));
1193  }
1194 
1195  KMP_ASSERT(lck != NULL);
1196 
1197  if ( __kmp_env_consistency_check )
1198  __kmp_pop_sync( global_tid, ct_critical, loc );
1199 
1200 #if USE_ITT_BUILD
1201  __kmp_itt_critical_releasing( lck );
1202 #endif /* USE_ITT_BUILD */
1203  // Value of 'crit' should be good for using as a critical_id of the critical section directive.
1204 
1205  __kmp_release_user_lock_with_checks( lck, global_tid );
1206 
1207  KA_TRACE( 15, ("__kmpc_end_critical: done T#%d\n", global_tid ));
1208 }
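
A sketch of the usual lowering of a named critical construct (illustrative, not part of this file). The kmp_critical_name object must start out zero-filled, which a static definition guarantees; as the code above shows, it is then either used directly as a small inlined lock or as a slot holding a pointer to an allocated lock.

    static kmp_critical_name crit_name_for_foo;        /* one per critical name, zero-initialized */

    void lowered_critical( ident_t *loc, kmp_int32 gtid )
    {
        __kmpc_critical( loc, gtid, &crit_name_for_foo );
        /* ... body of  #pragma omp critical (foo)  ... */
        __kmpc_end_critical( loc, gtid, &crit_name_for_foo );
    }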
1209 
1218 kmp_int32
1219 __kmpc_barrier_master(ident_t *loc, kmp_int32 global_tid)
1220 {
1221  int status;
1222 
1223  KC_TRACE( 10, ("__kmpc_barrier_master: called T#%d\n", global_tid ) );
1224 
1225  if (! TCR_4(__kmp_init_parallel))
1226  __kmp_parallel_initialize();
1227 
1228  if ( __kmp_env_consistency_check )
1229  __kmp_check_barrier( global_tid, ct_barrier, loc );
1230 
1231  status = __kmp_barrier( bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL );
1232 
1233  return (status != 0) ? 0 : 1;
1234 }
1235 
1245 void
1246 __kmpc_end_barrier_master(ident_t *loc, kmp_int32 global_tid)
1247 {
1248  KC_TRACE( 10, ("__kmpc_end_barrier_master: called T#%d\n", global_tid ));
1249 
1250  __kmp_end_split_barrier ( bs_plain_barrier, global_tid );
1251 }
1252 
1263 kmp_int32
1264 __kmpc_barrier_master_nowait( ident_t * loc, kmp_int32 global_tid )
1265 {
1266  kmp_int32 ret;
1267 
1268  KC_TRACE( 10, ("__kmpc_barrier_master_nowait: called T#%d\n", global_tid ));
1269 
1270  if (! TCR_4(__kmp_init_parallel))
1271  __kmp_parallel_initialize();
1272 
1273  if ( __kmp_env_consistency_check ) {
1274  if ( loc == 0 ) {
1275  KMP_WARNING( ConstructIdentInvalid ); // ??? What does it mean for the user?
1276  }
1277  __kmp_check_barrier( global_tid, ct_barrier, loc );
1278  }
1279 
1280  __kmp_barrier( bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL );
1281 
1282  ret = __kmpc_master (loc, global_tid);
1283 
1284  if ( __kmp_env_consistency_check ) {
1285  /* there's no __kmpc_end_master called; so the (stats) */
1286  /* actions of __kmpc_end_master are done here */
1287 
1288  if ( global_tid < 0 ) {
1289  KMP_WARNING( ThreadIdentInvalid );
1290  }
1291  if (ret) {
1292  /* only one thread should do the pop since only */
1293  /* one did the push (see __kmpc_master()) */
1294 
1295  __kmp_pop_sync( global_tid, ct_master, loc );
1296  }
1297  }
1298 
1299  return (ret);
1300 }
1301 
1302 /* The BARRIER for a SINGLE process section is always explicit */
1314 kmp_int32
1315 __kmpc_single(ident_t *loc, kmp_int32 global_tid)
1316 {
1317  kmp_int32 rc = __kmp_enter_single( global_tid, loc, TRUE );
1318  return rc;
1319 }
1320 
1330 void
1331 __kmpc_end_single(ident_t *loc, kmp_int32 global_tid)
1332 {
1333  __kmp_exit_single( global_tid );
1334 }
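
A sketch of the usual lowering of "#pragma omp single" (illustrative, not part of this file):

    void lowered_single( ident_t *loc, kmp_int32 gtid )
    {
        if ( __kmpc_single( loc, gtid ) ) {
            /* ... body, executed by exactly one thread of the team ... */
            __kmpc_end_single( loc, gtid );
        }
        __kmpc_barrier( loc, gtid );    /* implied barrier; omitted when nowait is given */
    }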
1335 
1343 void
1344 __kmpc_for_static_fini( ident_t *loc, kmp_int32 global_tid )
1345 {
1346  KE_TRACE( 10, ("__kmpc_for_static_fini called T#%d\n", global_tid));
1347 
1348  if ( __kmp_env_consistency_check )
1349  __kmp_pop_workshare( global_tid, ct_pdo, loc );
1350 }
1351 
1352 /*
1353  * User routines which take C-style arguments (call by value)
1354  * different from the Fortran equivalent routines
1355  */
1356 
1357 void
1358 ompc_set_num_threads( int arg )
1359 {
1360 // !!!!! TODO: check the per-task binding
1361  __kmp_set_num_threads( arg, __kmp_entry_gtid() );
1362 }
1363 
1364 void
1365 ompc_set_dynamic( int flag )
1366 {
1367  kmp_info_t *thread;
1368 
1369  /* For the thread-private implementation of the internal controls */
1370  thread = __kmp_entry_thread();
1371 
1372  __kmp_save_internal_controls( thread );
1373 
1374  set__dynamic( thread, flag ? TRUE : FALSE );
1375 }
1376 
1377 void
1378 ompc_set_nested( int flag )
1379 {
1380  kmp_info_t *thread;
1381 
1382  /* For the thread-private internal controls implementation */
1383  thread = __kmp_entry_thread();
1384 
1385  __kmp_save_internal_controls( thread );
1386 
1387  set__nested( thread, flag ? TRUE : FALSE );
1388 }
1389 
1390 #if OMP_30_ENABLED
1391 
1392 void
1393 ompc_set_max_active_levels( int max_active_levels )
1394 {
1395  /* TO DO */
1396  /* we want per-task implementation of this internal control */
1397 
1398  /* For the per-thread internal controls implementation */
1399  __kmp_set_max_active_levels( __kmp_entry_gtid(), max_active_levels );
1400 }
1401 
1402 void
1403 ompc_set_schedule( omp_sched_t kind, int modifier )
1404 {
1405 // !!!!! TODO: check the per-task binding
1406  __kmp_set_schedule( __kmp_entry_gtid(), ( kmp_sched_t ) kind, modifier );
1407 }
1408 
1409 int
1410 ompc_get_ancestor_thread_num( int level )
1411 {
1412  return __kmp_get_ancestor_thread_num( __kmp_entry_gtid(), level );
1413 }
1414 
1415 int
1416 ompc_get_team_size( int level )
1417 {
1418  return __kmp_get_team_size( __kmp_entry_gtid(), level );
1419 }
1420 
1421 #endif // OMP_30_ENABLED
1422 
1423 void
1424 kmpc_set_stacksize( int arg )
1425 {
1426  // __kmp_aux_set_stacksize initializes the library if needed
1427  __kmp_aux_set_stacksize( arg );
1428 }
1429 
1430 void
1431 kmpc_set_stacksize_s( size_t arg )
1432 {
1433  // __kmp_aux_set_stacksize initializes the library if needed
1434  __kmp_aux_set_stacksize( arg );
1435 }
1436 
1437 void
1438 kmpc_set_blocktime( int arg )
1439 {
1440  int gtid, tid;
1441  kmp_info_t *thread;
1442 
1443  gtid = __kmp_entry_gtid();
1444  tid = __kmp_tid_from_gtid(gtid);
1445  thread = __kmp_thread_from_gtid(gtid);
1446 
1447  __kmp_aux_set_blocktime( arg, thread, tid );
1448 }
1449 
1450 void
1451 kmpc_set_library( int arg )
1452 {
1453  // __kmp_user_set_library initializes the library if needed
1454  __kmp_user_set_library( (enum library_type)arg );
1455 }
1456 
1457 void
1458 kmpc_set_defaults( char const * str )
1459 {
1460  // __kmp_aux_set_defaults initializes the library if needed
1461  __kmp_aux_set_defaults( str, strlen( str ) );
1462 }
1463 
1464 #ifdef OMP_30_ENABLED
1465 
1466 int
1467 kmpc_set_affinity_mask_proc( int proc, void **mask )
1468 {
1469 #if defined(KMP_STUB) || !(KMP_OS_WINDOWS || KMP_OS_LINUX)
1470  return -1;
1471 #else
1472  if ( ! TCR_4(__kmp_init_middle) ) {
1473  __kmp_middle_initialize();
1474  }
1475  return __kmp_aux_set_affinity_mask_proc( proc, mask );
1476 #endif
1477 }
1478 
1479 int
1480 kmpc_unset_affinity_mask_proc( int proc, void **mask )
1481 {
1482 #if defined(KMP_STUB) || !(KMP_OS_WINDOWS || KMP_OS_LINUX)
1483  return -1;
1484 #else
1485  if ( ! TCR_4(__kmp_init_middle) ) {
1486  __kmp_middle_initialize();
1487  }
1488  return __kmp_aux_unset_affinity_mask_proc( proc, mask );
1489 #endif
1490 }
1491 
1492 int
1493 kmpc_get_affinity_mask_proc( int proc, void **mask )
1494 {
1495 #if defined(KMP_STUB) || !(KMP_OS_WINDOWS || KMP_OS_LINUX)
1496  return -1;
1497 #else
1498  if ( ! TCR_4(__kmp_init_middle) ) {
1499  __kmp_middle_initialize();
1500  }
1501  return __kmp_aux_get_affinity_mask_proc( proc, mask );
1502 #endif
1503 }
1504 
1505 #endif /* OMP_30_ENABLED */
1506 
1507 /* -------------------------------------------------------------------------- */
1548 void
1549 __kmpc_copyprivate( ident_t *loc, kmp_int32 gtid, size_t cpy_size, void *cpy_data, void(*cpy_func)(void*,void*), kmp_int32 didit )
1550 {
1551  void **data_ptr;
1552 
1553  KC_TRACE( 10, ("__kmpc_copyprivate: called T#%d\n", gtid ));
1554 
1555  KMP_MB();
1556 
1557  data_ptr = & __kmp_team_from_gtid( gtid )->t.t_copypriv_data;
1558 
1559  if ( __kmp_env_consistency_check ) {
1560  if ( loc == 0 ) {
1561  KMP_WARNING( ConstructIdentInvalid );
1562  }
1563  }
1564 
1565  /* ToDo: Optimize the following two barriers into some kind of split barrier */
1566 
1567  if (didit) *data_ptr = cpy_data;
1568 
1569  /* This barrier is not a barrier region boundary */
1570  __kmp_barrier( bs_plain_barrier, gtid, FALSE , 0, NULL, NULL );
1571 
1572  if (! didit) (*cpy_func)( cpy_data, *data_ptr );
1573 
1574  /* Consider next barrier the user-visible barrier for barrier region boundaries */
1575  /* Nesting checks are already handled by the single construct checks */
1576 
1577  __kmp_barrier( bs_plain_barrier, gtid, FALSE , 0, NULL, NULL );
1578 }
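
A sketch of how a compiler might combine __kmpc_single, __kmpc_end_single and __kmpc_copyprivate to implement "single copyprivate(x)" (illustrative, not part of this file; the copy helper and the did_it flag are the assumed compiler-generated pieces):

    static void copy_int( void *dst, void *src ) { *(int *)dst = *(int *)src; }

    void lowered_single_copyprivate( ident_t *loc, kmp_int32 gtid, int *x )
    {
        kmp_int32 did_it = 0;
        if ( __kmpc_single( loc, gtid ) ) {
            *x = 42;                               /* value produced inside the single region */
            did_it = 1;
            __kmpc_end_single( loc, gtid );
        }
        /* broadcast x from the executing thread to every other thread in the team */
        __kmpc_copyprivate( loc, gtid, sizeof( int ), x, copy_int, did_it );
    }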
1579 
1580 /* -------------------------------------------------------------------------- */
1581 
1582 #define INIT_LOCK __kmp_init_user_lock_with_checks
1583 #define INIT_NESTED_LOCK __kmp_init_nested_user_lock_with_checks
1584 #define ACQUIRE_LOCK __kmp_acquire_user_lock_with_checks
1585 #define ACQUIRE_LOCK_TIMED __kmp_acquire_user_lock_with_checks_timed
1586 #define ACQUIRE_NESTED_LOCK __kmp_acquire_nested_user_lock_with_checks
1587 #define ACQUIRE_NESTED_LOCK_TIMED __kmp_acquire_nested_user_lock_with_checks_timed
1588 #define RELEASE_LOCK __kmp_release_user_lock_with_checks
1589 #define RELEASE_NESTED_LOCK __kmp_release_nested_user_lock_with_checks
1590 #define TEST_LOCK __kmp_test_user_lock_with_checks
1591 #define TEST_NESTED_LOCK __kmp_test_nested_user_lock_with_checks
1592 #define DESTROY_LOCK __kmp_destroy_user_lock_with_checks
1593 #define DESTROY_NESTED_LOCK __kmp_destroy_nested_user_lock_with_checks
1594 
1595 
1596 /*
1597  * TODO: Make check abort messages use location info & pass it
1598  * into with_checks routines
1599  */
1600 
1601 /* initialize the lock */
1602 void
1603 __kmpc_init_lock( ident_t * loc, kmp_int32 gtid, void ** user_lock ) {
1604  static char const * const func = "omp_init_lock";
1605  kmp_user_lock_p lck;
1606  KMP_DEBUG_ASSERT( __kmp_init_serial );
1607 
1608  if ( __kmp_env_consistency_check ) {
1609  if ( user_lock == NULL ) {
1610  KMP_FATAL( LockIsUninitialized, func );
1611  }
1612  }
1613 
1614  KMP_CHECK_USER_LOCK_INIT();
1615 
1616  if ( ( __kmp_user_lock_kind == lk_tas )
1617  && ( sizeof( lck->tas.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
1618  lck = (kmp_user_lock_p)user_lock;
1619  }
1620 #if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64)
1621  else if ( ( __kmp_user_lock_kind == lk_futex )
1622  && ( sizeof( lck->futex.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
1623  lck = (kmp_user_lock_p)user_lock;
1624  }
1625 #endif
1626  else {
1627  lck = __kmp_user_lock_allocate( user_lock, gtid );
1628  }
1629  INIT_LOCK( lck );
1630  __kmp_set_user_lock_location( lck, loc );
1631 
1632 #if USE_ITT_BUILD
1633  __kmp_itt_lock_creating( lck );
1634 #endif /* USE_ITT_BUILD */
1635 } // __kmpc_init_lock
1636 
1637 /* initialize the lock */
1638 void
1639 __kmpc_init_nest_lock( ident_t * loc, kmp_int32 gtid, void ** user_lock ) {
1640  static char const * const func = "omp_init_nest_lock";
1641  kmp_user_lock_p lck;
1642  KMP_DEBUG_ASSERT( __kmp_init_serial );
1643 
1644  if ( __kmp_env_consistency_check ) {
1645  if ( user_lock == NULL ) {
1646  KMP_FATAL( LockIsUninitialized, func );
1647  }
1648  }
1649 
1650  KMP_CHECK_USER_LOCK_INIT();
1651 
1652  if ( ( __kmp_user_lock_kind == lk_tas ) && ( sizeof( lck->tas.lk.poll )
1653  + sizeof( lck->tas.lk.depth_locked ) <= OMP_NEST_LOCK_T_SIZE ) ) {
1654  lck = (kmp_user_lock_p)user_lock;
1655  }
1656 #if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64)
1657  else if ( ( __kmp_user_lock_kind == lk_futex )
1658  && ( sizeof( lck->futex.lk.poll ) + sizeof( lck->futex.lk.depth_locked )
1659  <= OMP_NEST_LOCK_T_SIZE ) ) {
1660  lck = (kmp_user_lock_p)user_lock;
1661  }
1662 #endif
1663  else {
1664  lck = __kmp_user_lock_allocate( user_lock, gtid );
1665  }
1666 
1667  INIT_NESTED_LOCK( lck );
1668  __kmp_set_user_lock_location( lck, loc );
1669 
1670 #if USE_ITT_BUILD
1671  __kmp_itt_lock_creating( lck );
1672 #endif /* USE_ITT_BUILD */
1673 } // __kmpc_init_nest_lock
1674 
1675 void
1676 __kmpc_destroy_lock( ident_t * loc, kmp_int32 gtid, void ** user_lock ) {
1677 
1678  kmp_user_lock_p lck;
1679 
1680  if ( ( __kmp_user_lock_kind == lk_tas )
1681  && ( sizeof( lck->tas.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
1682  lck = (kmp_user_lock_p)user_lock;
1683  }
1684 #if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64)
1685  else if ( ( __kmp_user_lock_kind == lk_futex )
1686  && ( sizeof( lck->futex.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
1687  lck = (kmp_user_lock_p)user_lock;
1688  }
1689 #endif
1690  else {
1691  lck = __kmp_lookup_user_lock( user_lock, "omp_destroy_lock" );
1692  }
1693 
1694 #if USE_ITT_BUILD
1695  __kmp_itt_lock_destroyed( lck );
1696 #endif /* USE_ITT_BUILD */
1697  DESTROY_LOCK( lck );
1698 
1699  if ( ( __kmp_user_lock_kind == lk_tas )
1700  && ( sizeof( lck->tas.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
1701  ;
1702  }
1703 #if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64)
1704  else if ( ( __kmp_user_lock_kind == lk_futex )
1705  && ( sizeof( lck->futex.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
1706  ;
1707  }
1708 #endif
1709  else {
1710  __kmp_user_lock_free( user_lock, gtid, lck );
1711  }
1712 } // __kmpc_destroy_lock
1713 
1714 /* destroy the lock */
1715 void
1716 __kmpc_destroy_nest_lock( ident_t * loc, kmp_int32 gtid, void ** user_lock ) {
1717 
1718  kmp_user_lock_p lck;
1719 
1720  if ( ( __kmp_user_lock_kind == lk_tas ) && ( sizeof( lck->tas.lk.poll )
1721  + sizeof( lck->tas.lk.depth_locked ) <= OMP_NEST_LOCK_T_SIZE ) ) {
1722  lck = (kmp_user_lock_p)user_lock;
1723  }
1724 #if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64)
1725  else if ( ( __kmp_user_lock_kind == lk_futex )
1726  && ( sizeof( lck->futex.lk.poll ) + sizeof( lck->futex.lk.depth_locked )
1727  <= OMP_NEST_LOCK_T_SIZE ) ) {
1728  lck = (kmp_user_lock_p)user_lock;
1729  }
1730 #endif
1731  else {
1732  lck = __kmp_lookup_user_lock( user_lock, "omp_destroy_nest_lock" );
1733  }
1734 
1735 #if USE_ITT_BUILD
1736  __kmp_itt_lock_destroyed( lck );
1737 #endif /* USE_ITT_BUILD */
1738 
1739  DESTROY_NESTED_LOCK( lck );
1740 
1741  if ( ( __kmp_user_lock_kind == lk_tas ) && ( sizeof( lck->tas.lk.poll )
1742  + sizeof( lck->tas.lk.depth_locked ) <= OMP_NEST_LOCK_T_SIZE ) ) {
1743  ;
1744  }
1745 #if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64)
1746  else if ( ( __kmp_user_lock_kind == lk_futex )
1747  && ( sizeof( lck->futex.lk.poll ) + sizeof( lck->futex.lk.depth_locked )
1748  <= OMP_NEST_LOCK_T_SIZE ) ) {
1749  ;
1750  }
1751 #endif
1752  else {
1753  __kmp_user_lock_free( user_lock, gtid, lck );
1754  }
1755 } // __kmpc_destroy_nest_lock
1756 
1757 void
1758 __kmpc_set_lock( ident_t * loc, kmp_int32 gtid, void ** user_lock ) {
1759  kmp_user_lock_p lck;
1760 
1761  if ( ( __kmp_user_lock_kind == lk_tas )
1762  && ( sizeof( lck->tas.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
1763  lck = (kmp_user_lock_p)user_lock;
1764  }
1765 #if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64)
1766  else if ( ( __kmp_user_lock_kind == lk_futex )
1767  && ( sizeof( lck->futex.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
1768  lck = (kmp_user_lock_p)user_lock;
1769  }
1770 #endif
1771  else {
1772  lck = __kmp_lookup_user_lock( user_lock, "omp_set_lock" );
1773  }
1774 
1775 #if USE_ITT_BUILD
1776  __kmp_itt_lock_acquiring( lck );
1777 #endif /* USE_ITT_BUILD */
1778 
1779  ACQUIRE_LOCK( lck, gtid );
1780 
1781 #if USE_ITT_BUILD
1782  __kmp_itt_lock_acquired( lck );
1783 #endif /* USE_ITT_BUILD */
1784 }
1785 
1786 
1787 void
1788 __kmpc_set_nest_lock( ident_t * loc, kmp_int32 gtid, void ** user_lock ) {
1789  kmp_user_lock_p lck;
1790 
1791  if ( ( __kmp_user_lock_kind == lk_tas ) && ( sizeof( lck->tas.lk.poll )
1792  + sizeof( lck->tas.lk.depth_locked ) <= OMP_NEST_LOCK_T_SIZE ) ) {
1793  lck = (kmp_user_lock_p)user_lock;
1794  }
1795 #if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64)
1796  else if ( ( __kmp_user_lock_kind == lk_futex )
1797  && ( sizeof( lck->futex.lk.poll ) + sizeof( lck->futex.lk.depth_locked )
1798  <= OMP_NEST_LOCK_T_SIZE ) ) {
1799  lck = (kmp_user_lock_p)user_lock;
1800  }
1801 #endif
1802  else {
1803  lck = __kmp_lookup_user_lock( user_lock, "omp_set_nest_lock" );
1804  }
1805 
1806 #if USE_ITT_BUILD
1807  __kmp_itt_lock_acquiring( lck );
1808 #endif /* USE_ITT_BUILD */
1809 
1810  ACQUIRE_NESTED_LOCK( lck, gtid );
1811 
1812 #if USE_ITT_BUILD
1813  __kmp_itt_lock_acquired( lck );
1814 #endif /* USE_ITT_BUILD */
1815 }
1816 
1817 void
1818 __kmpc_unset_lock( ident_t *loc, kmp_int32 gtid, void **user_lock )
1819 {
1820  kmp_user_lock_p lck;
1821 
1822  /* Can't use serial interval since not block structured */
1823  /* release the lock */
1824 
1825  if ( ( __kmp_user_lock_kind == lk_tas )
1826  && ( sizeof( lck->tas.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
1827 #if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64)
1828  // "fast" path implemented to fix customer performance issue
1829 #if USE_ITT_BUILD
1830  __kmp_itt_lock_releasing( (kmp_user_lock_p)user_lock );
1831 #endif /* USE_ITT_BUILD */
1832  TCW_4(((kmp_user_lock_p)user_lock)->tas.lk.poll, 0);
1833  KMP_MB();
1834  return;
1835 #else
1836  lck = (kmp_user_lock_p)user_lock;
1837 #endif
1838  }
1839 #if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64)
1840  else if ( ( __kmp_user_lock_kind == lk_futex )
1841  && ( sizeof( lck->futex.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
1842  lck = (kmp_user_lock_p)user_lock;
1843  }
1844 #endif
1845  else {
1846  lck = __kmp_lookup_user_lock( user_lock, "omp_unset_lock" );
1847  }
1848 
1849 #if USE_ITT_BUILD
1850  __kmp_itt_lock_releasing( lck );
1851 #endif /* USE_ITT_BUILD */
1852 
1853  RELEASE_LOCK( lck, gtid );
1854 }
1855 
1856 /* release the lock */
1857 void
1858 __kmpc_unset_nest_lock( ident_t *loc, kmp_int32 gtid, void **user_lock )
1859 {
1860  kmp_user_lock_p lck;
1861 
1862  /* Can't use serial interval since not block structured */
1863 
1864  if ( ( __kmp_user_lock_kind == lk_tas ) && ( sizeof( lck->tas.lk.poll )
1865  + sizeof( lck->tas.lk.depth_locked ) <= OMP_NEST_LOCK_T_SIZE ) ) {
1866 #if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64)
1867  // "fast" path implemented to fix customer performance issue
1868  kmp_tas_lock_t *tl = (kmp_tas_lock_t*)user_lock;
1869 #if USE_ITT_BUILD
1870  __kmp_itt_lock_releasing( (kmp_user_lock_p)user_lock );
1871 #endif /* USE_ITT_BUILD */
1872  if ( --(tl->lk.depth_locked) == 0 ) {
1873  TCW_4(tl->lk.poll, 0);
1874  }
1875  KMP_MB();
1876  return;
1877 #else
1878  lck = (kmp_user_lock_p)user_lock;
1879 #endif
1880  }
1881 #if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64)
1882  else if ( ( __kmp_user_lock_kind == lk_futex )
1883  && ( sizeof( lck->futex.lk.poll ) + sizeof( lck->futex.lk.depth_locked )
1884  <= OMP_NEST_LOCK_T_SIZE ) ) {
1885  lck = (kmp_user_lock_p)user_lock;
1886  }
1887 #endif
1888  else {
1889  lck = __kmp_lookup_user_lock( user_lock, "omp_unset_nest_lock" );
1890  }
1891 
1892 #if USE_ITT_BUILD
1893  __kmp_itt_lock_releasing( lck );
1894 #endif /* USE_ITT_BUILD */
1895 
1896  RELEASE_NESTED_LOCK( lck, gtid );
1897 }
1898 
1899 /* try to acquire the lock */
1900 int
1901 __kmpc_test_lock( ident_t *loc, kmp_int32 gtid, void **user_lock )
1902 {
1903  kmp_user_lock_p lck;
1904  int rc;
1905 
1906  if ( ( __kmp_user_lock_kind == lk_tas )
1907  && ( sizeof( lck->tas.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
1908  lck = (kmp_user_lock_p)user_lock;
1909  }
1910 #if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64)
1911  else if ( ( __kmp_user_lock_kind == lk_futex )
1912  && ( sizeof( lck->futex.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
1913  lck = (kmp_user_lock_p)user_lock;
1914  }
1915 #endif
1916  else {
1917  lck = __kmp_lookup_user_lock( user_lock, "omp_test_lock" );
1918  }
1919 
1920 #if USE_ITT_BUILD
1921  __kmp_itt_lock_acquiring( lck );
1922 #endif /* USE_ITT_BUILD */
1923 
1924  rc = TEST_LOCK( lck, gtid );
1925 #if USE_ITT_BUILD
1926  if ( rc ) {
1927  __kmp_itt_lock_acquired( lck );
1928  } else {
1929  __kmp_itt_lock_cancelled( lck );
1930  }
1931 #endif /* USE_ITT_BUILD */
1932  return ( rc ? FTN_TRUE : FTN_FALSE );
1933 
1934  /* Can't use serial interval since not block structured */
1935 }
1936 
1937 /* try to acquire the lock */
1938 int
1939 __kmpc_test_nest_lock( ident_t *loc, kmp_int32 gtid, void **user_lock )
1940 {
1941  kmp_user_lock_p lck;
1942  int rc;
1943 
1944  if ( ( __kmp_user_lock_kind == lk_tas ) && ( sizeof( lck->tas.lk.poll )
1945  + sizeof( lck->tas.lk.depth_locked ) <= OMP_NEST_LOCK_T_SIZE ) ) {
1946  lck = (kmp_user_lock_p)user_lock;
1947  }
1948 #if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64)
1949  else if ( ( __kmp_user_lock_kind == lk_futex )
1950  && ( sizeof( lck->futex.lk.poll ) + sizeof( lck->futex.lk.depth_locked )
1951  <= OMP_NEST_LOCK_T_SIZE ) ) {
1952  lck = (kmp_user_lock_p)user_lock;
1953  }
1954 #endif
1955  else {
1956  lck = __kmp_lookup_user_lock( user_lock, "omp_test_nest_lock" );
1957  }
1958 
1959 #if USE_ITT_BUILD
1960  __kmp_itt_lock_acquiring( lck );
1961 #endif /* USE_ITT_BUILD */
1962 
1963  rc = TEST_NESTED_LOCK( lck, gtid );
1964 #if USE_ITT_BUILD
1965  if ( rc ) {
1966  __kmp_itt_lock_acquired( lck );
1967  } else {
1968  __kmp_itt_lock_cancelled( lck );
1969  }
1970 #endif /* USE_ITT_BUILD */
1971  return rc;
1972 
1973  /* Can't use serial interval since not block structured */
1974 }
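
For clarity, a sketch of driving the plain lock entry points directly (illustrative, not part of this file). The void ** argument is the address of the user's omp_lock_t-sized storage: as the branches above show, the lock either lives inline in that storage (test-and-set / futex kinds) or the storage holds a reference to a separately allocated lock object.

    void lock_roundtrip( ident_t *loc, kmp_int32 gtid )
    {
        void *lock_storage = NULL;                    /* stands in for an omp_lock_t */

        __kmpc_init_lock( loc, gtid, &lock_storage );
        __kmpc_set_lock( loc, gtid, &lock_storage );
        /* ... exclusive work ... */
        __kmpc_unset_lock( loc, gtid, &lock_storage );
        if ( __kmpc_test_lock( loc, gtid, &lock_storage ) ) {   /* non-blocking attempt */
            __kmpc_unset_lock( loc, gtid, &lock_storage );
        }
        __kmpc_destroy_lock( loc, gtid, &lock_storage );
    }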
1975 
1976 
1977 /*--------------------------------------------------------------------------------------------------------------------*/
1978 
1979 /*
1980  * Interface to fast scalable reduce methods routines
1981  */
1982 
1983 // keep the selected method in a thread local structure for cross-function usage: will be used in __kmpc_end_reduce* functions;
1984 // another solution: to re-determine the method one more time in __kmpc_end_reduce* functions (new prototype required then)
1985 // AT: which solution is better?
1986 #define __KMP_SET_REDUCTION_METHOD(gtid,rmethod) \
1987  ( ( __kmp_threads[ ( gtid ) ] -> th.th_local.packed_reduction_method ) = ( rmethod ) )
1988 
1989 #define __KMP_GET_REDUCTION_METHOD(gtid) \
1990  ( __kmp_threads[ ( gtid ) ] -> th.th_local.packed_reduction_method )
1991 
1992 // description of the packed_reduction_method variable: look at the macros in kmp.h
1993 
1994 
1995 // used in a critical section reduce block
1996 static __forceinline void
1997 __kmp_enter_critical_section_reduce_block( ident_t * loc, kmp_int32 global_tid, kmp_critical_name * crit ) {
1998 
1999  // this lock was visible to a customer and to the thread profiler as a serial overhead span
2000  // (although it's used for an internal purpose only)
2001  // why was it visible in previous implementation?
2002  // should we keep it visible in new reduce block?
2003  kmp_user_lock_p lck;
2004 
2005  // We know that the fast reduction code is only emitted by Intel compilers
2006  // with 32 byte critical sections. If there isn't enough space, then we
2007  // have to use a pointer.
2008  if ( __kmp_base_user_lock_size <= INTEL_CRITICAL_SIZE ) {
2009  lck = (kmp_user_lock_p)crit;
2010  }
2011  else {
2012  lck = __kmp_get_critical_section_ptr( crit, loc, global_tid );
2013  }
2014  KMP_DEBUG_ASSERT( lck != NULL );
2015 
2016  if ( __kmp_env_consistency_check )
2017  __kmp_push_sync( global_tid, ct_critical, loc, lck );
2018 
2019  __kmp_acquire_user_lock_with_checks( lck, global_tid );
2020 }
2021 
2022 // used in a critical section reduce block
2023 static __forceinline void
2024 __kmp_end_critical_section_reduce_block( ident_t * loc, kmp_int32 global_tid, kmp_critical_name * crit ) {
2025 
2026  kmp_user_lock_p lck;
2027 
2028  // We know that the fast reduction code is only emitted by Intel compilers with 32 byte critical
2029  // sections. If there isn't enough space, then we have to use a pointer.
2030  if ( __kmp_base_user_lock_size > 32 ) {
2031  lck = *( (kmp_user_lock_p *) crit );
2032  KMP_ASSERT( lck != NULL );
2033  } else {
2034  lck = (kmp_user_lock_p) crit;
2035  }
2036 
2037  if ( __kmp_env_consistency_check )
2038  __kmp_pop_sync( global_tid, ct_critical, loc );
2039 
2040  __kmp_release_user_lock_with_checks( lck, global_tid );
2041 
2042 } // __kmp_end_critical_section_reduce_block
2043 
2044 
2045 /* 2.a.i. Reduce Block without a terminating barrier */
2059 kmp_int32
2060 __kmpc_reduce_nowait(
2061  ident_t *loc, kmp_int32 global_tid,
2062  kmp_int32 num_vars, size_t reduce_size, void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
2063  kmp_critical_name *lck ) {
2064 
2065  int retval;
2066  PACKED_REDUCTION_METHOD_T packed_reduction_method;
2067 
2068  KA_TRACE( 10, ( "__kmpc_reduce_nowait() enter: called T#%d\n", global_tid ) );
2069 
2070  // why do we need this initialization here at all?
2071  // The reduction clause cannot be used as a stand-alone directive.
2072 
2073  // do not call __kmp_serial_initialize(), it will be called by __kmp_parallel_initialize() if needed
2074  // possible detection of false-positive race by the threadchecker ???
2075  if( ! TCR_4( __kmp_init_parallel ) )
2076  __kmp_parallel_initialize();
2077 
2078  // check correctness of reduce block nesting
2079  if ( __kmp_env_consistency_check )
2080  __kmp_push_sync( global_tid, ct_reduce, loc, NULL );
2081 
2082  // it's better to check an assertion ASSERT( thr_state == THR_WORK_STATE )
2083 
2084  // The packed_reduction_method value will be reused by the __kmpc_end_reduce* functions, so it must be kept in a variable.
2085  // The variable should be a construct-specific or thread-specific property, not a team-specific one
2086  // (a thread can reach the next reduce block on the next construct, and the reduce method may differ on that construct).
2087  // The ident_t "loc" parameter could be used as a construct-specific property (but what if loc == 0?).
2088  // (If a construct-specific or team-specific variable were shared, unnecessary extra syncs would be needed.)
2089  // A thread-specific variable is better with respect to the two issues above (next construct and extra syncs);
2090  // a thread-specific "th_local.reduction_method" variable is used currently.
2091  // Each thread executes the 'determine' and 'set' lines (there is no need to restrict this to one thread; doing so would only add unnecessary syncs).
2092 
2093  packed_reduction_method = __kmp_determine_reduction_method( loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck );
2094  __KMP_SET_REDUCTION_METHOD( global_tid, packed_reduction_method );
2095 
2096  if( packed_reduction_method == critical_reduce_block ) {
2097 
2098  __kmp_enter_critical_section_reduce_block( loc, global_tid, lck );
2099  retval = 1;
2100 
2101  } else if( packed_reduction_method == empty_reduce_block ) {
2102 
2103  // usage: if team size == 1, no synchronization is required ( Intel platforms only )
2104  retval = 1;
2105 
2106  } else if( packed_reduction_method == atomic_reduce_block ) {
2107 
2108  retval = 2;
2109 
2110  // all threads should do this pop here (because __kmpc_end_reduce_nowait() won't be called by the code gen)
2111  // (this is not ideal, because the checking block is closed by this 'pop'
2112  // before the atomic operation has executed; it executes slightly later, literally on the next instruction)
2113  if ( __kmp_env_consistency_check )
2114  __kmp_pop_sync( global_tid, ct_reduce, loc );
2115 
2116  } else if( TEST_REDUCTION_METHOD( packed_reduction_method, tree_reduce_block ) ) {
2117 
2118  //AT: performance issue: there is a real barrier here
2119  //AT: (if the master runs slowly, the other threads are blocked here waiting for the master to arrive and release them)
2120  //AT: (this is not what a customer would expect when specifying the NOWAIT clause)
2121  //AT: (specifying NOWAIT does not improve performance here, which will be confusing to a customer)
2122  //AT: another implementation of *barrier_gather*nowait() (or some other design) might be faster
2123  // and more in line with the sense of NOWAIT
2124  //AT: TO DO: run the EPCC benchmark and compare times
2125 
2126  // this barrier should be invisible to a customer and to the thread profiler
2127  // (it is neither a terminating barrier nor customer code; it is used for an internal purpose)
2128  retval = __kmp_barrier( UNPACK_REDUCTION_BARRIER( packed_reduction_method ), global_tid, FALSE, reduce_size, reduce_data, reduce_func );
2129  retval = ( retval != 0 ) ? ( 0 ) : ( 1 );
2130 
2131  // all workers except the master should do this pop here
2132  // ( none of the other workers will get to __kmpc_end_reduce_nowait() )
2133  if ( __kmp_env_consistency_check ) {
2134  if( retval == 0 ) {
2135  __kmp_pop_sync( global_tid, ct_reduce, loc );
2136  }
2137  }
2138 
2139  } else {
2140 
2141  // should never reach this block
2142  KMP_ASSERT( 0 ); // "unexpected method"
2143 
2144  }
2145 
2146  KA_TRACE( 10, ( "__kmpc_reduce_nowait() exit: called T#%d: method %08x, returns %08x\n", global_tid, packed_reduction_method, retval ) );
2147 
2148  return retval;
2149 }
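/*
 * Sketch of the calling pattern a compiler is expected to emit for
 *     #pragma omp for reduction(+:sum) nowait
 * based on the return codes above. All names (sum, sum_local, reduce_func,
 * crit) are placeholders; reduce_func would be a compiler-generated routine
 * that combines *lhs_data with *rhs_data. Illustration only.
 *
 *     switch ( __kmpc_reduce_nowait( loc, gtid, 1, sizeof( sum_local ),
 *                                    &sum_local, reduce_func, &crit ) ) {
 *         case 1:                       // critical / empty / tree (master) path
 *             sum += sum_local;         // combine into the shared variable
 *             __kmpc_end_reduce_nowait( loc, gtid, &crit );
 *             break;
 *         case 2:                       // atomic path
 *             // combine with an atomic update; __kmpc_end_reduce_nowait()
 *             // is not called in this case
 *             __sync_fetch_and_add( &sum, sum_local );   // placeholder atomic
 *             break;
 *         default:                      // 0: tree-method workers do nothing here
 *             break;
 *     }
 */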
2150 
2159 void
2160 __kmpc_end_reduce_nowait( ident_t *loc, kmp_int32 global_tid, kmp_critical_name *lck ) {
2161 
2162  PACKED_REDUCTION_METHOD_T packed_reduction_method;
2163 
2164  KA_TRACE( 10, ( "__kmpc_end_reduce_nowait() enter: called T#%d\n", global_tid ) );
2165 
2166  packed_reduction_method = __KMP_GET_REDUCTION_METHOD( global_tid );
2167 
2168  if( packed_reduction_method == critical_reduce_block ) {
2169 
2170  __kmp_end_critical_section_reduce_block( loc, global_tid, lck );
2171 
2172  } else if( packed_reduction_method == empty_reduce_block ) {
2173 
2174  // usage: if team size == 1, no synchronization is required ( on Intel platforms only )
2175 
2176  } else if( packed_reduction_method == atomic_reduce_block ) {
2177 
2178  // neither the master nor the other workers should get here
2179  // (code gen does not generate this call in case 2: atomic reduce block)
2180  // actually, it would be better to remove this else-if entirely;
2181  // after removal this value would be caught by the 'else' branch and would assert
2182 
2183  } else if( TEST_REDUCTION_METHOD( packed_reduction_method, tree_reduce_block ) ) {
2184 
2185  // only master gets here
2186 
2187  } else {
2188 
2189  // should never reach this block
2190  KMP_ASSERT( 0 ); // "unexpected method"
2191 
2192  }
2193 
2194  if ( __kmp_env_consistency_check )
2195  __kmp_pop_sync( global_tid, ct_reduce, loc );
2196 
2197  KA_TRACE( 10, ( "__kmpc_end_reduce_nowait() exit: called T#%d: method %08x\n", global_tid, packed_reduction_method ) );
2198 
2199  return;
2200 }
2201 
2202 /* 2.a.ii. Reduce Block with a terminating barrier */
2203 
2217 kmp_int32
2218 __kmpc_reduce(
2219  ident_t *loc, kmp_int32 global_tid,
2220  kmp_int32 num_vars, size_t reduce_size, void *reduce_data,
2221  void (*reduce_func)(void *lhs_data, void *rhs_data),
2222  kmp_critical_name *lck )
2223 {
2224  int retval;
2225  PACKED_REDUCTION_METHOD_T packed_reduction_method;
2226 
2227  KA_TRACE( 10, ( "__kmpc_reduce() enter: called T#%d\n", global_tid ) );
2228 
2229  // why do we need this initialization here at all?
2230  // The reduction clause cannot be used as a stand-alone directive.
2231 
2232  // do not call __kmp_serial_initialize(), it will be called by __kmp_parallel_initialize() if needed
2233  // possible detection of false-positive race by the threadchecker ???
2234  if( ! TCR_4( __kmp_init_parallel ) )
2235  __kmp_parallel_initialize();
2236 
2237  // check correctness of reduce block nesting
2238  if ( __kmp_env_consistency_check )
2239  __kmp_push_sync( global_tid, ct_reduce, loc, NULL );
2240 
2241  // it's better to check an assertion ASSERT( thr_state == THR_WORK_STATE )
2242 
2243  packed_reduction_method = __kmp_determine_reduction_method( loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck );
2244  __KMP_SET_REDUCTION_METHOD( global_tid, packed_reduction_method );
2245 
2246  if( packed_reduction_method == critical_reduce_block ) {
2247 
2248  __kmp_enter_critical_section_reduce_block( loc, global_tid, lck );
2249  retval = 1;
2250 
2251  } else if( packed_reduction_method == empty_reduce_block ) {
2252 
2253  // usage: if team size == 1, no synchronization is required ( Intel platforms only )
2254  retval = 1;
2255 
2256  } else if( packed_reduction_method == atomic_reduce_block ) {
2257 
2258  retval = 2;
2259 
2260  } else if( TEST_REDUCTION_METHOD( packed_reduction_method, tree_reduce_block ) ) {
2261 
2262  //case tree_reduce_block:
2263  // this barrier should be visible to a customer and to the thread profiler
2264  // (it's a terminating barrier on constructs if NOWAIT is not specified)
2265  retval = __kmp_barrier( UNPACK_REDUCTION_BARRIER( packed_reduction_method ), global_tid, TRUE, reduce_size, reduce_data, reduce_func );
2266  retval = ( retval != 0 ) ? ( 0 ) : ( 1 );
2267 
2268  // all workers except the master should do this pop here
2269  // ( none of the workers except the master will enter __kmpc_end_reduce() )
2270  if ( __kmp_env_consistency_check ) {
2271  if( retval == 0 ) { // 0: all other workers; 1: master
2272  __kmp_pop_sync( global_tid, ct_reduce, loc );
2273  }
2274  }
2275 
2276  } else {
2277 
2278  // should never reach this block
2279  KMP_ASSERT( 0 ); // "unexpected method"
2280 
2281  }
2282 
2283  KA_TRACE( 10, ( "__kmpc_reduce() exit: called T#%d: method %08x, returns %08x\n", global_tid, packed_reduction_method, retval ) );
2284 
2285  return retval;
2286 }
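/*
 * Illustration only: a user-level reduction without NOWAIT. The lowering
 * follows the same pattern as sketched for __kmpc_reduce_nowait(), with
 * __kmpc_end_reduce() supplying the terminating barrier.
 *
 *     #include <omp.h>
 *     #include <stdio.h>
 *
 *     int main(void) {
 *         int sum = 0;
 *         int i;
 *     #pragma omp parallel for reduction(+:sum)
 *         for (i = 1; i <= 100; ++i) {
 *             sum += i;
 *         }
 *         printf("sum = %d\n", sum);    // 5050; all threads have synchronized
 *         return 0;
 *     }
 */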
2287 
2297 void
2298 __kmpc_end_reduce( ident_t *loc, kmp_int32 global_tid, kmp_critical_name *lck ) {
2299 
2300  PACKED_REDUCTION_METHOD_T packed_reduction_method;
2301 
2302  KA_TRACE( 10, ( "__kmpc_end_reduce() enter: called T#%d\n", global_tid ) );
2303 
2304  packed_reduction_method = __KMP_GET_REDUCTION_METHOD( global_tid );
2305 
2306  // this barrier should be visible to a customer and to the thread profiler
2307  // (it's a terminating barrier on constructs if NOWAIT is not specified)
2308 
2309  if( packed_reduction_method == critical_reduce_block ) {
2310 
2311  __kmp_end_critical_section_reduce_block( loc, global_tid, lck );
2312 
2313  // TODO: implicit barrier: should be exposed
2314  __kmp_barrier( bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL );
2315 
2316  } else if( packed_reduction_method == empty_reduce_block ) {
2317 
2318  // usage: if team size == 1, no synchronization is required ( Intel platforms only )
2319 
2320  // TODO: implicit barrier: should be exposed
2321  __kmp_barrier( bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL );
2322 
2323  } else if( packed_reduction_method == atomic_reduce_block ) {
2324 
2325  // TODO: implicit barrier: should be exposed
2326  __kmp_barrier( bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL );
2327 
2328  } else if( TEST_REDUCTION_METHOD( packed_reduction_method, tree_reduce_block ) ) {
2329 
2330  // only master executes here (master releases all other workers)
2331  __kmp_end_split_barrier( UNPACK_REDUCTION_BARRIER( packed_reduction_method ), global_tid );
2332 
2333  } else {
2334 
2335  // should never reach this block
2336  KMP_ASSERT( 0 ); // "unexpected method"
2337 
2338  }
2339 
2340  if ( __kmp_env_consistency_check )
2341  __kmp_pop_sync( global_tid, ct_reduce, loc );
2342 
2343  KA_TRACE( 10, ( "__kmpc_end_reduce() exit: called T#%d: method %08x\n", global_tid, packed_reduction_method ) );
2344 
2345  return;
2346 }
2347 
2348 #undef __KMP_GET_REDUCTION_METHOD
2349 #undef __KMP_SET_REDUCTION_METHOD
2350 
2351 /*-- end of interface to fast scalable reduce routines ---------------------------------------------------------------*/
2352 
2353 kmp_uint64
2354 __kmpc_get_taskid() {
2355 
2356  #if OMP_30_ENABLED
2357 
2358  kmp_int32 gtid;
2359  kmp_info_t * thread;
2360 
2361  gtid = __kmp_get_gtid();
2362  if ( gtid < 0 ) {
2363  return 0;
2364  }; // if
2365  thread = __kmp_thread_from_gtid( gtid );
2366  return thread->th.th_current_task->td_task_id;
2367 
2368  #else
2369 
2370  return 0;
2371 
2372  #endif
2373 
2374 } // __kmpc_get_taskid
2375 
2376 
2377 kmp_uint64
2378 __kmpc_get_parent_taskid() {
2379 
2380  #if OMP_30_ENABLED
2381 
2382  kmp_int32 gtid;
2383  kmp_info_t * thread;
2384  kmp_taskdata_t * parent_task;
2385 
2386  gtid = __kmp_get_gtid();
2387  if ( gtid < 0 ) {
2388  return 0;
2389  }; // if
2390  thread = __kmp_thread_from_gtid( gtid );
2391  parent_task = thread->th.th_current_task->td_parent;
2392  return ( parent_task == NULL ? 0 : parent_task->td_task_id );
2393 
2394  #else
2395 
2396  return 0;
2397 
2398  #endif
2399 
2400 } // __kmpc_get_parent_taskid
2401 
2402 void __kmpc_place_threads(int nC, int nT, int nO)
2403 {
2404 #if KMP_MIC
2405  if ( ! __kmp_init_serial ) {
2406  __kmp_serial_initialize();
2407  }
2408  __kmp_place_num_cores = nC;
2409  __kmp_place_num_threads_per_core = nT;
2410  __kmp_place_core_offset = nO;
2411 #endif
2412 }
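/*
 * Illustration only (assumption: called once before the runtime creates its
 * worker threads, e.g. from startup code that parses a placement request).
 * On KMP_MIC builds the call below restricts placement to 60 cores with
 * 4 hardware threads per core, starting at core offset 0; on other targets
 * the routine is a no-op.
 *
 *     __kmpc_place_threads( 60, 4, 0 );
 */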
2413 
2414 // end of file //
2415