LLVM OpenMP* Runtime Library
kmp_csupport.c
1 /*
2  * kmp_csupport.c -- kfront linkage support for OpenMP.
3  */
4 
5 
6 //===----------------------------------------------------------------------===//
7 //
8 // The LLVM Compiler Infrastructure
9 //
10 // This file is dual licensed under the MIT and the University of Illinois Open
11 // Source Licenses. See LICENSE.txt for details.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 
16 #include "omp.h" /* extern "C" declarations of user-visible routines */
17 #include "kmp.h"
18 #include "kmp_i18n.h"
19 #include "kmp_itt.h"
20 #include "kmp_lock.h"
21 #include "kmp_error.h"
22 #include "kmp_stats.h"
23 
24 #if OMPT_SUPPORT
25 #include "ompt-internal.h"
26 #include "ompt-specific.h"
27 #endif
28 
29 #define MAX_MESSAGE 512
30 
31 /* ------------------------------------------------------------------------ */
32 /* ------------------------------------------------------------------------ */
33 
34 /* flags will be used in future, e.g., to implement */
35 /* openmp_strict library restrictions */
36 
46 void
47 __kmpc_begin(ident_t *loc, kmp_int32 flags)
48 {
49  // By default __kmp_ignore_mppbeg() returns TRUE.
50  if (__kmp_ignore_mppbeg() == FALSE) {
51  __kmp_internal_begin();
52 
53  KC_TRACE( 10, ("__kmpc_begin: called\n" ) );
54  }
55 }
56 
64 void
65 __kmpc_end(ident_t *loc)
66 {
67  // By default, __kmp_ignore_mppend() returns TRUE, which makes the __kmpc_end() call a no-op.
68  // However, this can be overridden with the KMP_IGNORE_MPPEND environment variable.
69  // If KMP_IGNORE_MPPEND is 0, __kmp_ignore_mppend() returns FALSE and __kmpc_end()
70  // will unregister this root (which can cause library shutdown).
71  if (__kmp_ignore_mppend() == FALSE) {
72  KC_TRACE( 10, ("__kmpc_end: called\n" ) );
73  KA_TRACE( 30, ("__kmpc_end\n" ));
74 
75  __kmp_internal_end_thread( -1 );
76  }
77 }
78 
98 kmp_int32
99 __kmpc_global_thread_num(ident_t *loc)
100 {
101  kmp_int32 gtid = __kmp_entry_gtid();
102 
103  KC_TRACE( 10, ("__kmpc_global_thread_num: T#%d\n", gtid ) );
104 
105  return gtid;
106 }
107 
121 kmp_int32
122 __kmpc_global_num_threads(ident_t *loc)
123 {
124  KC_TRACE( 10, ("__kmpc_global_num_threads: num_threads = %d\n", __kmp_nth ) );
125 
126  return TCR_4(__kmp_nth);
127 }
128 
135 kmp_int32
136 __kmpc_bound_thread_num(ident_t *loc)
137 {
138  KC_TRACE( 10, ("__kmpc_bound_thread_num: called\n" ) );
139  return __kmp_tid_from_gtid( __kmp_entry_gtid() );
140 }
141 
147 kmp_int32
148 __kmpc_bound_num_threads(ident_t *loc)
149 {
150  KC_TRACE( 10, ("__kmpc_bound_num_threads: called\n" ) );
151 
152  return __kmp_entry_thread() -> th.th_team -> t.t_nproc;
153 }
154 
161 kmp_int32
162 __kmpc_ok_to_fork(ident_t *loc)
163 {
164 #ifndef KMP_DEBUG
165 
166  return TRUE;
167 
168 #else
169 
170  const char *semi2;
171  const char *semi3;
172  int line_no;
173 
174  if (__kmp_par_range == 0) {
175  return TRUE;
176  }
177  semi2 = loc->psource;
178  if (semi2 == NULL) {
179  return TRUE;
180  }
181  semi2 = strchr(semi2, ';');
182  if (semi2 == NULL) {
183  return TRUE;
184  }
185  semi2 = strchr(semi2 + 1, ';');
186  if (semi2 == NULL) {
187  return TRUE;
188  }
189  if (__kmp_par_range_filename[0]) {
190  const char *name = semi2 - 1;
191  while ((name > loc->psource) && (*name != '/') && (*name != ';')) {
192  name--;
193  }
194  if ((*name == '/') || (*name == ';')) {
195  name++;
196  }
197  if (strncmp(__kmp_par_range_filename, name, semi2 - name)) {
198  return __kmp_par_range < 0;
199  }
200  }
201  semi3 = strchr(semi2 + 1, ';');
202  if (__kmp_par_range_routine[0]) {
203  if ((semi3 != NULL) && (semi3 > semi2)
204  && (strncmp(__kmp_par_range_routine, semi2 + 1, semi3 - semi2 - 1))) {
205  return __kmp_par_range < 0;
206  }
207  }
208  if (KMP_SSCANF(semi3 + 1, "%d", &line_no) == 1) {
209  if ((line_no >= __kmp_par_range_lb) && (line_no <= __kmp_par_range_ub)) {
210  return __kmp_par_range > 0;
211  }
212  return __kmp_par_range < 0;
213  }
214  return TRUE;
215 
216 #endif /* KMP_DEBUG */
217 
218 }
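/*
 * Sketch of the location string parsed above.  The compiler emits loc->psource
 * as semicolon-separated fields, roughly of the form
 * ";file;routine;first_line;last_line;;" (the string below is a hypothetical
 * example).  When KMP_PAR_RANGE is set, the file name, routine name and first
 * line extracted from such a string are matched against
 * __kmp_par_range_filename, __kmp_par_range_routine and the
 * [__kmp_par_range_lb, __kmp_par_range_ub] range to decide whether the
 * parallel region at that site may fork.
 */
static char const example_psource[] = ";foo.c;bar;12;20;;";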
219 
225 kmp_int32
226 __kmpc_in_parallel( ident_t *loc )
227 {
228  return __kmp_entry_thread() -> th.th_root -> r.r_active;
229 }
230 
240 void
241 __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_threads )
242 {
243  KA_TRACE( 20, ("__kmpc_push_num_threads: enter T#%d num_threads=%d\n",
244  global_tid, num_threads ) );
245 
246  __kmp_push_num_threads( loc, global_tid, num_threads );
247 }
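/*
 * Sketch of how a compiler might use this entry point: for
 *   #pragma omp parallel num_threads(n)
 * the region body is outlined and the fork is preceded by a push of the
 * requested thread count (names below are hypothetical; the exact lowering is
 * compiler-dependent).
 */
static void example_numthreads_body(kmp_int32 *gtid, kmp_int32 *btid)
{
    (void)gtid; (void)btid;
    /* ... body of the parallel region ... */
}

void example_parallel_num_threads(ident_t *loc, kmp_int32 n)
{
    kmp_int32 gtid = __kmpc_global_thread_num(loc);
    __kmpc_push_num_threads(loc, gtid, n);   /* applies only to the next fork */
    __kmpc_fork_call(loc, 0, (kmpc_micro)example_numthreads_body);
}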
248 
249 void
250 __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid )
251 {
252  KA_TRACE( 20, ("__kmpc_pop_num_threads: enter\n" ) );
253 
254  /* the num_threads are automatically popped */
255 }
256 
257 
258 #if OMP_40_ENABLED
259 
260 void
261 __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid, kmp_int32 proc_bind )
262 {
263  KA_TRACE( 20, ("__kmpc_push_proc_bind: enter T#%d proc_bind=%d\n",
264  global_tid, proc_bind ) );
265 
266  __kmp_push_proc_bind( loc, global_tid, (kmp_proc_bind_t)proc_bind );
267 }
268 
269 #endif /* OMP_40_ENABLED */
270 
271 
281 void
282 __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...)
283 {
284  int gtid = __kmp_entry_gtid();
285 
286 #if (KMP_STATS_ENABLED)
287  int inParallel = __kmpc_in_parallel(loc);
288  if (inParallel)
289  {
290  KMP_COUNT_BLOCK(OMP_NESTED_PARALLEL);
291  }
292  else
293  {
294  KMP_COUNT_BLOCK(OMP_PARALLEL);
295  }
296 #endif
297 
298  // maybe saving thr_state is enough here
299  {
300  va_list ap;
301  va_start( ap, microtask );
302 
303 #if OMPT_SUPPORT
304  int tid = __kmp_tid_from_gtid( gtid );
305  kmp_info_t *master_th = __kmp_threads[ gtid ];
306  kmp_team_t *parent_team = master_th->th.th_team;
307  if (ompt_enabled) {
308  parent_team->t.t_implicit_task_taskdata[tid].
309  ompt_task_info.frame.reenter_runtime_frame = __builtin_frame_address(0);
310  }
311 #endif
312 
313 #if INCLUDE_SSC_MARKS
314  SSC_MARK_FORKING();
315 #endif
316  __kmp_fork_call( loc, gtid, fork_context_intel,
317  argc,
318 #if OMPT_SUPPORT
319  VOLATILE_CAST(void *) microtask, // "unwrapped" task
320 #endif
321  VOLATILE_CAST(microtask_t) microtask, // "wrapped" task
322  VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
323 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
324 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
325  &ap
326 #else
327  ap
328 #endif
329  );
330 #if INCLUDE_SSC_MARKS
331  SSC_MARK_JOINING();
332 #endif
333  __kmp_join_call( loc, gtid
334 #if OMPT_SUPPORT
335  , fork_context_intel
336 #endif
337  );
338 
339  va_end( ap );
340 
341 #if OMPT_SUPPORT
342  if (ompt_enabled) {
343  parent_team->t.t_implicit_task_taskdata[tid].
344  ompt_task_info.frame.reenter_runtime_frame = 0;
345  }
346 #endif
347  }
348 }
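/*
 * Sketch of the typical lowering that reaches __kmpc_fork_call: the body of
 *   #pragma omp parallel
 * is outlined into a microtask whose first two parameters receive the global
 * and bound (in-team) thread ids, followed by one pointer per captured
 * variable (hypothetical names; the exact argument packing is
 * compiler-dependent).
 */
static void example_parallel_body(kmp_int32 *gtid, kmp_int32 *btid, int *a)
{
    (void)gtid;
    a[*btid] = 1;   /* region body: *btid is the thread number within the team */
}

void example_parallel_region(ident_t *loc, int *a)
{
    /* argc == 1: one captured argument is forwarded through the varargs */
    __kmpc_fork_call(loc, 1, (kmpc_micro)example_parallel_body, a);
}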
349 
350 #if OMP_40_ENABLED
351 
362 void
363 __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_teams, kmp_int32 num_threads )
364 {
365  KA_TRACE( 20, ("__kmpc_push_num_teams: enter T#%d num_teams=%d num_threads=%d\n",
366  global_tid, num_teams, num_threads ) );
367 
368  __kmp_push_num_teams( loc, global_tid, num_teams, num_threads );
369 }
370 
380 void
381 __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...)
382 {
383  int gtid = __kmp_entry_gtid();
384  kmp_info_t *this_thr = __kmp_threads[ gtid ];
385  va_list ap;
386  va_start( ap, microtask );
387 
388  KMP_COUNT_BLOCK(OMP_TEAMS);
389 
390  // remember teams entry point and nesting level
391  this_thr->th.th_teams_microtask = microtask;
392  this_thr->th.th_teams_level = this_thr->th.th_team->t.t_level; // AC: can be >0 on host
393 
394 #if OMPT_SUPPORT
395  kmp_team_t *parent_team = this_thr->th.th_team;
396  int tid = __kmp_tid_from_gtid( gtid );
397  if (ompt_enabled) {
398  parent_team->t.t_implicit_task_taskdata[tid].
399  ompt_task_info.frame.reenter_runtime_frame = __builtin_frame_address(0);
400  }
401 #endif
402 
403  // check if __kmpc_push_num_teams was called; if not, set the default number of teams
404  if ( this_thr->th.th_teams_size.nteams == 0 ) {
405  __kmp_push_num_teams( loc, gtid, 0, 0 );
406  }
407  KMP_DEBUG_ASSERT(this_thr->th.th_set_nproc >= 1);
408  KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nteams >= 1);
409  KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nth >= 1);
410 
411  __kmp_fork_call( loc, gtid, fork_context_intel,
412  argc,
413 #if OMPT_SUPPORT
414  VOLATILE_CAST(void *) microtask, // "unwrapped" task
415 #endif
416  VOLATILE_CAST(microtask_t) __kmp_teams_master, // "wrapped" task
417  VOLATILE_CAST(launch_t) __kmp_invoke_teams_master,
418 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
419  &ap
420 #else
421  ap
422 #endif
423  );
424  __kmp_join_call( loc, gtid
425 #if OMPT_SUPPORT
426  , fork_context_intel
427 #endif
428  );
429 
430 #if OMPT_SUPPORT
431  if (ompt_enabled) {
432  parent_team->t.t_implicit_task_taskdata[tid].
433  ompt_task_info.frame.reenter_runtime_frame = NULL;
434  }
435 #endif
436 
437  this_thr->th.th_teams_microtask = NULL;
438  this_thr->th.th_teams_level = 0;
439  *(kmp_int64*)(&this_thr->th.th_teams_size) = 0L;
440  va_end( ap );
441 }
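/*
 * Sketch of the lowering of a teams construct such as
 *   #pragma omp teams num_teams(t) thread_limit(n)
 * (hypothetical names; argument forwarding simplified).
 */
static void example_teams_body(kmp_int32 *gtid, kmp_int32 *btid)
{
    (void)gtid; (void)btid;
    /* ... body of the teams region, run by the initial thread of each team ... */
}

void example_teams_region(ident_t *loc, kmp_int32 t, kmp_int32 n)
{
    kmp_int32 gtid = __kmpc_global_thread_num(loc);
    __kmpc_push_num_teams(loc, gtid, t, n);
    __kmpc_fork_teams(loc, 0, (kmpc_micro)example_teams_body);
}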
442 #endif /* OMP_40_ENABLED */
443 
444 
445 //
446 // I don't think this function should ever have been exported.
447 // The __kmpc_ prefix was misapplied. I'm fairly certain that no generated
448 // openmp code ever called it, but it's been exported from the RTL for so
449 // long that I'm afraid to remove the definition.
450 //
451 int
452 __kmpc_invoke_task_func( int gtid )
453 {
454  return __kmp_invoke_task_func( gtid );
455 }
456 
469 void
470 __kmpc_serialized_parallel(ident_t *loc, kmp_int32 global_tid)
471 {
472  __kmp_serialized_parallel(loc, global_tid); /* The implementation is now in kmp_runtime.c so that it can share static functions with
473  * kmp_fork_call since the tasks to be done are similar in each case.
474  */
475 }
476 
484 void
485 __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid)
486 {
487  kmp_internal_control_t *top;
488  kmp_info_t *this_thr;
489  kmp_team_t *serial_team;
490 
491  KC_TRACE( 10, ("__kmpc_end_serialized_parallel: called by T#%d\n", global_tid ) );
492 
493  /* skip all this code for autopar serialized loops since it results in
494  unacceptable overhead */
495  if( loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR ) )
496  return;
497 
498  // Not autopar code
499  if( ! TCR_4( __kmp_init_parallel ) )
500  __kmp_parallel_initialize();
501 
502  this_thr = __kmp_threads[ global_tid ];
503  serial_team = this_thr->th.th_serial_team;
504 
505  #if OMP_45_ENABLED
506  kmp_task_team_t * task_team = this_thr->th.th_task_team;
507 
508  // we need to wait for the proxy tasks before finishing the thread
509  if ( task_team != NULL && task_team->tt.tt_found_proxy_tasks )
510  __kmp_task_team_wait(this_thr, serial_team USE_ITT_BUILD_ARG(NULL) ); // is an ITT object needed here?
511  #endif
512 
513  KMP_MB();
514  KMP_DEBUG_ASSERT( serial_team );
515  KMP_ASSERT( serial_team -> t.t_serialized );
516  KMP_DEBUG_ASSERT( this_thr -> th.th_team == serial_team );
517  KMP_DEBUG_ASSERT( serial_team != this_thr->th.th_root->r.r_root_team );
518  KMP_DEBUG_ASSERT( serial_team -> t.t_threads );
519  KMP_DEBUG_ASSERT( serial_team -> t.t_threads[0] == this_thr );
520 
521  /* If necessary, pop the internal control stack values and replace the team values */
522  top = serial_team -> t.t_control_stack_top;
523  if ( top && top -> serial_nesting_level == serial_team -> t.t_serialized ) {
524  copy_icvs( &serial_team -> t.t_threads[0] -> th.th_current_task -> td_icvs, top );
525  serial_team -> t.t_control_stack_top = top -> next;
526  __kmp_free(top);
527  }
528 
529  //if( serial_team -> t.t_serialized > 1 )
530  serial_team -> t.t_level--;
531 
532  /* pop dispatch buffers stack */
533  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch->th_disp_buffer);
534  {
535  dispatch_private_info_t * disp_buffer = serial_team->t.t_dispatch->th_disp_buffer;
536  serial_team->t.t_dispatch->th_disp_buffer =
537  serial_team->t.t_dispatch->th_disp_buffer->next;
538  __kmp_free( disp_buffer );
539  }
540 
541  -- serial_team -> t.t_serialized;
542  if ( serial_team -> t.t_serialized == 0 ) {
543 
544  /* return to the parallel section */
545 
546 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
547  if ( __kmp_inherit_fp_control && serial_team->t.t_fp_control_saved ) {
548  __kmp_clear_x87_fpu_status_word();
549  __kmp_load_x87_fpu_control_word( &serial_team->t.t_x87_fpu_control_word );
550  __kmp_load_mxcsr( &serial_team->t.t_mxcsr );
551  }
552 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
553 
554  this_thr -> th.th_team = serial_team -> t.t_parent;
555  this_thr -> th.th_info.ds.ds_tid = serial_team -> t.t_master_tid;
556 
557  /* restore values cached in the thread */
558  this_thr -> th.th_team_nproc = serial_team -> t.t_parent -> t.t_nproc; /* JPH */
559  this_thr -> th.th_team_master = serial_team -> t.t_parent -> t.t_threads[0]; /* JPH */
560  this_thr -> th.th_team_serialized = this_thr -> th.th_team -> t.t_serialized;
561 
562  /* TODO the below shouldn't need to be adjusted for serialized teams */
563  this_thr -> th.th_dispatch = & this_thr -> th.th_team ->
564  t.t_dispatch[ serial_team -> t.t_master_tid ];
565 
566  __kmp_pop_current_task_from_thread( this_thr );
567 
568  KMP_ASSERT( this_thr -> th.th_current_task -> td_flags.executing == 0 );
569  this_thr -> th.th_current_task -> td_flags.executing = 1;
570 
571  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
572  // Copy the task team from the new child / old parent team to the thread.
573  this_thr->th.th_task_team = this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state];
574  KA_TRACE( 20, ( "__kmpc_end_serialized_parallel: T#%d restoring task_team %p / team %p\n",
575  global_tid, this_thr -> th.th_task_team, this_thr -> th.th_team ) );
576  }
577  } else {
578  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
579  KA_TRACE( 20, ( "__kmpc_end_serialized_parallel: T#%d decreasing nesting depth of serial team %p to %d\n",
580  global_tid, serial_team, serial_team -> t.t_serialized ) );
581  }
582  }
583 
584  if ( __kmp_env_consistency_check )
585  __kmp_pop_parallel( global_tid, NULL );
586 }
587 
596 void
597 __kmpc_flush(ident_t *loc)
598 {
599  KC_TRACE( 10, ("__kmpc_flush: called\n" ) );
600 
601  /* need an explicit __mf() here since the library uses volatile instead */
602  KMP_MB(); /* Flush all pending memory write invalidates. */
603 
604  #if ( KMP_ARCH_X86 || KMP_ARCH_X86_64 )
605  #if KMP_MIC
606  // fence-style instructions do not exist, but lock; xaddl $0,(%rsp) can be used.
607  // We shouldn't need it, though, since the ABI rules require that
608  // * If the compiler generates NGO stores it also generates the fence
609  // * If users hand-code NGO stores they should insert the fence
610  // therefore no incomplete unordered stores should be visible.
611  #else
612  // C74404
613  // This is to address non-temporal store instructions (sfence needed).
614  // The clflush instruction is also addressed (mfence needed).
615  // Probably the non-temporal load movntdqa instruction should also be addressed.
616  // mfence is an SSE2 instruction. Do not execute it if the CPU does not support SSE2.
617  if ( ! __kmp_cpuinfo.initialized ) {
618  __kmp_query_cpuid( & __kmp_cpuinfo );
619  }; // if
620  if ( ! __kmp_cpuinfo.sse2 ) {
621  // CPU cannot execute SSE2 instructions.
622  } else {
623  #if KMP_COMPILER_ICC
624  _mm_mfence();
625  #elif KMP_COMPILER_MSVC
626  MemoryBarrier();
627  #else
628  __sync_synchronize();
629  #endif // KMP_COMPILER_ICC
630  }; // if
631  #endif // KMP_MIC
632  #elif (KMP_ARCH_ARM || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS || KMP_ARCH_MIPS64)
633  // Nothing to see here, move along.
634  #elif KMP_ARCH_PPC64
635  // Nothing needed here (we have a real MB above).
636  #if KMP_OS_CNK
637  // The flushing thread needs to yield here; this prevents a
638  // busy-waiting thread from saturating the pipeline. flush is
639  // often used in loops like this:
640  // while (!flag) {
641  // #pragma omp flush(flag)
642  // }
643  // and adding the yield here is good for at least a 10x speedup
644  // when running >2 threads per core (on the NAS LU benchmark).
645  __kmp_yield(TRUE);
646  #endif
647  #else
648  #error Unknown or unsupported architecture
649  #endif
650 
651 }
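/*
 * Sketch of the common flush idiom mentioned above: a spin-wait on a shared
 * flag where each iteration performs
 *   #pragma omp flush(flag)
 * which the compiler lowers to a call to __kmpc_flush (hypothetical example).
 */
void example_wait_for_flag(ident_t *loc, volatile int *flag)
{
    while (!*flag) {
        __kmpc_flush(loc);   /* memory fence (plus a yield on some targets) */
    }
}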
652 
653 /* -------------------------------------------------------------------------- */
654 
655 /* -------------------------------------------------------------------------- */
656 
664 void
665 __kmpc_barrier(ident_t *loc, kmp_int32 global_tid)
666 {
667  KMP_COUNT_BLOCK(OMP_BARRIER);
668  KC_TRACE( 10, ("__kmpc_barrier: called T#%d\n", global_tid ) );
669 
670  if (! TCR_4(__kmp_init_parallel))
671  __kmp_parallel_initialize();
672 
673  if ( __kmp_env_consistency_check ) {
674  if ( loc == 0 ) {
675  KMP_WARNING( ConstructIdentInvalid ); // ??? What does it mean for the user?
676  }; // if
677 
678  __kmp_check_barrier( global_tid, ct_barrier, loc );
679  }
680 
681  __kmp_threads[ global_tid ]->th.th_ident = loc;
682  // TODO: explicit barrier_wait_id:
683  // this function is called when 'barrier' directive is present or
684  // implicit barrier at the end of a worksharing construct.
685  // 1) better to add a per-thread barrier counter to a thread data structure
686  // 2) set to 0 when a new team is created
687  // 3) no sync is required
688 
689  __kmp_barrier( bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL );
690 }
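/*
 * Sketch of a lowered explicit barrier: inside an outlined parallel region,
 *   #pragma omp barrier
 * becomes a single runtime call (hypothetical example; 'loc' is the
 * compiler-emitted source-location record for the directive).
 */
void example_two_phase(ident_t *loc, kmp_int32 gtid)
{
    /* ... phase 1, executed by every thread of the team ... */
    __kmpc_barrier(loc, gtid);   /* #pragma omp barrier */
    /* ... phase 2, which sees all of phase 1's results ... */
}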
691 
692 /* The BARRIER for a MASTER section is always explicit */
699 kmp_int32
700 __kmpc_master(ident_t *loc, kmp_int32 global_tid)
701 {
702  int status = 0;
703 
704  KC_TRACE( 10, ("__kmpc_master: called T#%d\n", global_tid ) );
705 
706  if( ! TCR_4( __kmp_init_parallel ) )
707  __kmp_parallel_initialize();
708 
709  if( KMP_MASTER_GTID( global_tid )) {
710  KMP_COUNT_BLOCK(OMP_MASTER);
711  KMP_PUSH_PARTITIONED_TIMER(OMP_master);
712  status = 1;
713  }
714 
715 #if OMPT_SUPPORT && OMPT_TRACE
716  if (status) {
717  if (ompt_enabled &&
718  ompt_callbacks.ompt_callback(ompt_event_master_begin)) {
719  kmp_info_t *this_thr = __kmp_threads[ global_tid ];
720  kmp_team_t *team = this_thr -> th.th_team;
721 
722  int tid = __kmp_tid_from_gtid( global_tid );
723  ompt_callbacks.ompt_callback(ompt_event_master_begin)(
724  team->t.ompt_team_info.parallel_id,
725  team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
726  }
727  }
728 #endif
729 
730  if ( __kmp_env_consistency_check ) {
731 #if KMP_USE_DYNAMIC_LOCK
732  if (status)
733  __kmp_push_sync( global_tid, ct_master, loc, NULL, 0 );
734  else
735  __kmp_check_sync( global_tid, ct_master, loc, NULL, 0 );
736 #else
737  if (status)
738  __kmp_push_sync( global_tid, ct_master, loc, NULL );
739  else
740  __kmp_check_sync( global_tid, ct_master, loc, NULL );
741 #endif
742  }
743 
744  return status;
745 }
746 
755 void
756 __kmpc_end_master(ident_t *loc, kmp_int32 global_tid)
757 {
758  KC_TRACE( 10, ("__kmpc_end_master: called T#%d\n", global_tid ) );
759 
760  KMP_DEBUG_ASSERT( KMP_MASTER_GTID( global_tid ));
761  KMP_POP_PARTITIONED_TIMER();
762 
763 #if OMPT_SUPPORT && OMPT_TRACE
764  kmp_info_t *this_thr = __kmp_threads[ global_tid ];
765  kmp_team_t *team = this_thr -> th.th_team;
766  if (ompt_enabled &&
767  ompt_callbacks.ompt_callback(ompt_event_master_end)) {
768  int tid = __kmp_tid_from_gtid( global_tid );
769  ompt_callbacks.ompt_callback(ompt_event_master_end)(
770  team->t.ompt_team_info.parallel_id,
771  team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
772  }
773 #endif
774 
775  if ( __kmp_env_consistency_check ) {
776  if( global_tid < 0 )
777  KMP_WARNING( ThreadIdentInvalid );
778 
779  if( KMP_MASTER_GTID( global_tid ))
780  __kmp_pop_sync( global_tid, ct_master, loc );
781  }
782 }
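/*
 * Sketch of the lowering of a master construct: only the thread for which
 * __kmpc_master returns 1 executes the body and closes it with
 * __kmpc_end_master; as noted above, any barrier is explicit.
 */
void example_master(ident_t *loc, kmp_int32 gtid)
{
    if (__kmpc_master(loc, gtid)) {
        /* ... master-only work ... */
        __kmpc_end_master(loc, gtid);
    }
    /* no implied barrier at the end of a master construct */
}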
783 
791 void
792 __kmpc_ordered( ident_t * loc, kmp_int32 gtid )
793 {
794  int cid = 0;
795  kmp_info_t *th;
796  KMP_DEBUG_ASSERT( __kmp_init_serial );
797 
798  KC_TRACE( 10, ("__kmpc_ordered: called T#%d\n", gtid ));
799 
800  if (! TCR_4(__kmp_init_parallel))
801  __kmp_parallel_initialize();
802 
803 #if USE_ITT_BUILD
804  __kmp_itt_ordered_prep( gtid );
805  // TODO: ordered_wait_id
806 #endif /* USE_ITT_BUILD */
807 
808  th = __kmp_threads[ gtid ];
809 
810 #if OMPT_SUPPORT && OMPT_TRACE
811  if (ompt_enabled) {
812  /* OMPT state update */
813  th->th.ompt_thread_info.wait_id = (uint64_t) loc;
814  th->th.ompt_thread_info.state = ompt_state_wait_ordered;
815 
816  /* OMPT event callback */
817  if (ompt_callbacks.ompt_callback(ompt_event_wait_ordered)) {
818  ompt_callbacks.ompt_callback(ompt_event_wait_ordered)(
819  th->th.ompt_thread_info.wait_id);
820  }
821  }
822 #endif
823 
824  if ( th -> th.th_dispatch -> th_deo_fcn != 0 )
825  (*th->th.th_dispatch->th_deo_fcn)( & gtid, & cid, loc );
826  else
827  __kmp_parallel_deo( & gtid, & cid, loc );
828 
829 #if OMPT_SUPPORT && OMPT_TRACE
830  if (ompt_enabled) {
831  /* OMPT state update */
832  th->th.ompt_thread_info.state = ompt_state_work_parallel;
833  th->th.ompt_thread_info.wait_id = 0;
834 
835  /* OMPT event callback */
836  if (ompt_callbacks.ompt_callback(ompt_event_acquired_ordered)) {
837  ompt_callbacks.ompt_callback(ompt_event_acquired_ordered)(
838  th->th.ompt_thread_info.wait_id);
839  }
840  }
841 #endif
842 
843 #if USE_ITT_BUILD
844  __kmp_itt_ordered_start( gtid );
845 #endif /* USE_ITT_BUILD */
846 }
847 
855 void
856 __kmpc_end_ordered( ident_t * loc, kmp_int32 gtid )
857 {
858  int cid = 0;
859  kmp_info_t *th;
860 
861  KC_TRACE( 10, ("__kmpc_end_ordered: called T#%d\n", gtid ) );
862 
863 #if USE_ITT_BUILD
864  __kmp_itt_ordered_end( gtid );
865  // TODO: ordered_wait_id
866 #endif /* USE_ITT_BUILD */
867 
868  th = __kmp_threads[ gtid ];
869 
870  if ( th -> th.th_dispatch -> th_dxo_fcn != 0 )
871  (*th->th.th_dispatch->th_dxo_fcn)( & gtid, & cid, loc );
872  else
873  __kmp_parallel_dxo( & gtid, & cid, loc );
874 
875 #if OMPT_SUPPORT && OMPT_BLAME
876  if (ompt_enabled &&
877  ompt_callbacks.ompt_callback(ompt_event_release_ordered)) {
878  ompt_callbacks.ompt_callback(ompt_event_release_ordered)(
879  th->th.ompt_thread_info.wait_id);
880  }
881 #endif
882 }
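/*
 * Sketch of the lowering of an ordered construct inside a loop chunk: the
 * body is bracketed by the two entry points above (hypothetical example).
 */
void example_ordered_body(ident_t *loc, kmp_int32 gtid)
{
    __kmpc_ordered(loc, gtid);
    /* ... work that must execute in iteration order ... */
    __kmpc_end_ordered(loc, gtid);
}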
883 
884 #if KMP_USE_DYNAMIC_LOCK
885 
886 static __forceinline void
887 __kmp_init_indirect_csptr(kmp_critical_name * crit, ident_t const * loc, kmp_int32 gtid, kmp_indirect_locktag_t tag)
888 {
889  // Pointer to the allocated indirect lock is written to crit, while indexing is ignored.
890  void *idx;
891  kmp_indirect_lock_t **lck;
892  lck = (kmp_indirect_lock_t **)crit;
893  kmp_indirect_lock_t *ilk = __kmp_allocate_indirect_lock(&idx, gtid, tag);
894  KMP_I_LOCK_FUNC(ilk, init)(ilk->lock);
895  KMP_SET_I_LOCK_LOCATION(ilk, loc);
896  KMP_SET_I_LOCK_FLAGS(ilk, kmp_lf_critical_section);
897  KA_TRACE(20, ("__kmp_init_indirect_csptr: initialized indirect lock #%d\n", tag));
898 #if USE_ITT_BUILD
899  __kmp_itt_critical_creating(ilk->lock, loc);
900 #endif
901  int status = KMP_COMPARE_AND_STORE_PTR(lck, 0, ilk);
902  if (status == 0) {
903 #if USE_ITT_BUILD
904  __kmp_itt_critical_destroyed(ilk->lock);
905 #endif
906  // We don't really need to destroy the unclaimed lock here since it will be cleaned up at program exit.
907  //KMP_D_LOCK_FUNC(&idx, destroy)((kmp_dyna_lock_t *)&idx);
908  }
909  KMP_DEBUG_ASSERT(*lck != NULL);
910 }
911 
912 // Fast-path acquire tas lock
913 #define KMP_ACQUIRE_TAS_LOCK(lock, gtid) { \
914  kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock; \
915  if (l->lk.poll != KMP_LOCK_FREE(tas) || \
916  ! KMP_COMPARE_AND_STORE_ACQ32(&(l->lk.poll), KMP_LOCK_FREE(tas), KMP_LOCK_BUSY(gtid+1, tas))) { \
917  kmp_uint32 spins; \
918  KMP_FSYNC_PREPARE(l); \
919  KMP_INIT_YIELD(spins); \
920  if (TCR_4(__kmp_nth) > (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) { \
921  KMP_YIELD(TRUE); \
922  } else { \
923  KMP_YIELD_SPIN(spins); \
924  } \
925  kmp_backoff_t backoff = __kmp_spin_backoff_params; \
926  while (l->lk.poll != KMP_LOCK_FREE(tas) || \
927  ! KMP_COMPARE_AND_STORE_ACQ32(&(l->lk.poll), KMP_LOCK_FREE(tas), KMP_LOCK_BUSY(gtid+1, tas))) { \
928  __kmp_spin_backoff(&backoff); \
929  if (TCR_4(__kmp_nth) > (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) { \
930  KMP_YIELD(TRUE); \
931  } else { \
932  KMP_YIELD_SPIN(spins); \
933  } \
934  } \
935  } \
936  KMP_FSYNC_ACQUIRED(l); \
937 }
938 
939 // Fast-path test tas lock
940 #define KMP_TEST_TAS_LOCK(lock, gtid, rc) { \
941  kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock; \
942  rc = l->lk.poll == KMP_LOCK_FREE(tas) && \
943  KMP_COMPARE_AND_STORE_ACQ32(&(l->lk.poll), KMP_LOCK_FREE(tas), KMP_LOCK_BUSY(gtid+1, tas)); \
944 }
945 
946 // Fast-path release tas lock
947 #define KMP_RELEASE_TAS_LOCK(lock, gtid) { \
948  TCW_4(((kmp_tas_lock_t *)lock)->lk.poll, KMP_LOCK_FREE(tas)); \
949  KMP_MB(); \
950 }
951 
952 #if KMP_USE_FUTEX
953 
954 # include <unistd.h>
955 # include <sys/syscall.h>
956 # ifndef FUTEX_WAIT
957 # define FUTEX_WAIT 0
958 # endif
959 # ifndef FUTEX_WAKE
960 # define FUTEX_WAKE 1
961 # endif
962 
963 // Fast-path acquire futex lock
964 #define KMP_ACQUIRE_FUTEX_LOCK(lock, gtid) { \
965  kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock; \
966  kmp_int32 gtid_code = (gtid+1) << 1; \
967  KMP_MB(); \
968  KMP_FSYNC_PREPARE(ftx); \
969  kmp_int32 poll_val; \
970  while ((poll_val = KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), KMP_LOCK_FREE(futex), \
971  KMP_LOCK_BUSY(gtid_code, futex))) != KMP_LOCK_FREE(futex)) { \
972  kmp_int32 cond = KMP_LOCK_STRIP(poll_val) & 1; \
973  if (!cond) { \
974  if (!KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), poll_val, poll_val | KMP_LOCK_BUSY(1, futex))) { \
975  continue; \
976  } \
977  poll_val |= KMP_LOCK_BUSY(1, futex); \
978  } \
979  kmp_int32 rc; \
980  if ((rc = syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAIT, poll_val, NULL, NULL, 0)) != 0) { \
981  continue; \
982  } \
983  gtid_code |= 1; \
984  } \
985  KMP_FSYNC_ACQUIRED(ftx); \
986 }
987 
988 // Fast-path test futex lock
989 #define KMP_TEST_FUTEX_LOCK(lock, gtid, rc) { \
990  kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock; \
991  if (KMP_COMPARE_AND_STORE_ACQ32(&(ftx->lk.poll), KMP_LOCK_FREE(futex), KMP_LOCK_BUSY(gtid+1 << 1, futex))) { \
992  KMP_FSYNC_ACQUIRED(ftx); \
993  rc = TRUE; \
994  } else { \
995  rc = FALSE; \
996  } \
997 }
998 
999 // Fast-path release futex lock
1000 #define KMP_RELEASE_FUTEX_LOCK(lock, gtid) { \
1001  kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock; \
1002  KMP_MB(); \
1003  KMP_FSYNC_RELEASING(ftx); \
1004  kmp_int32 poll_val = KMP_XCHG_FIXED32(&(ftx->lk.poll), KMP_LOCK_FREE(futex)); \
1005  if (KMP_LOCK_STRIP(poll_val) & 1) { \
1006  syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAKE, KMP_LOCK_BUSY(1, futex), NULL, NULL, 0); \
1007  } \
1008  KMP_MB(); \
1009  KMP_YIELD(TCR_4(__kmp_nth) > (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)); \
1010 }
1011 
1012 #endif // KMP_USE_FUTEX
1013 
1014 #else // KMP_USE_DYNAMIC_LOCK
1015 
1016 static kmp_user_lock_p
1017 __kmp_get_critical_section_ptr( kmp_critical_name * crit, ident_t const * loc, kmp_int32 gtid )
1018 {
1019  kmp_user_lock_p *lck_pp = (kmp_user_lock_p *)crit;
1020 
1021  //
1022  // Because of the double-check, the following load
1023  // doesn't need to be volatile.
1024  //
1025  kmp_user_lock_p lck = (kmp_user_lock_p)TCR_PTR( *lck_pp );
1026 
1027  if ( lck == NULL ) {
1028  void * idx;
1029 
1030  // Allocate & initialize the lock.
1031  // Remember allocated locks in table in order to free them in __kmp_cleanup()
1032  lck = __kmp_user_lock_allocate( &idx, gtid, kmp_lf_critical_section );
1033  __kmp_init_user_lock_with_checks( lck );
1034  __kmp_set_user_lock_location( lck, loc );
1035 #if USE_ITT_BUILD
1036  __kmp_itt_critical_creating( lck );
1037  // __kmp_itt_critical_creating() should be called *before* the first usage of the underlying
1038  // lock. It is the only place where we can guarantee it. There is a chance the lock will be
1039  // destroyed with no usage, but that is not a problem, because this is not a real event seen
1040  // by the user but rather a way of setting a name for the object (lock). See kmp_itt.h for details.
1041 #endif /* USE_ITT_BUILD */
1042 
1043  //
1044  // Use a cmpxchg instruction to slam the start of the critical
1045  // section with the lock pointer. If another thread beat us
1046  // to it, deallocate the lock, and use the lock that the other
1047  // thread allocated.
1048  //
1049  int status = KMP_COMPARE_AND_STORE_PTR( lck_pp, 0, lck );
1050 
1051  if ( status == 0 ) {
1052  // Deallocate the lock and reload the value.
1053 #if USE_ITT_BUILD
1054  __kmp_itt_critical_destroyed( lck );
1055  // Let ITT know the lock is destroyed and the same memory location may be reused for
1056  // another purpose.
1057 #endif /* USE_ITT_BUILD */
1058  __kmp_destroy_user_lock_with_checks( lck );
1059  __kmp_user_lock_free( &idx, gtid, lck );
1060  lck = (kmp_user_lock_p)TCR_PTR( *lck_pp );
1061  KMP_DEBUG_ASSERT( lck != NULL );
1062  }
1063  }
1064  return lck;
1065 }
1066 
1067 #endif // KMP_USE_DYNAMIC_LOCK
1068 
1079 void
1080 __kmpc_critical( ident_t * loc, kmp_int32 global_tid, kmp_critical_name * crit )
1081 {
1082 #if KMP_USE_DYNAMIC_LOCK
1083  __kmpc_critical_with_hint(loc, global_tid, crit, omp_lock_hint_none);
1084 #else
1085  KMP_COUNT_BLOCK(OMP_CRITICAL);
1086  KMP_TIME_PARTITIONED_BLOCK(OMP_critical_wait); /* Time spent waiting to enter the critical section */
1087  kmp_user_lock_p lck;
1088 
1089  KC_TRACE( 10, ("__kmpc_critical: called T#%d\n", global_tid ) );
1090 
1091  //TODO: add THR_OVHD_STATE
1092 
1093  KMP_CHECK_USER_LOCK_INIT();
1094 
1095  if ( ( __kmp_user_lock_kind == lk_tas )
1096  && ( sizeof( lck->tas.lk.poll ) <= OMP_CRITICAL_SIZE ) ) {
1097  lck = (kmp_user_lock_p)crit;
1098  }
1099 #if KMP_USE_FUTEX
1100  else if ( ( __kmp_user_lock_kind == lk_futex )
1101  && ( sizeof( lck->futex.lk.poll ) <= OMP_CRITICAL_SIZE ) ) {
1102  lck = (kmp_user_lock_p)crit;
1103  }
1104 #endif
1105  else { // ticket, queuing or drdpa
1106  lck = __kmp_get_critical_section_ptr( crit, loc, global_tid );
1107  }
1108 
1109  if ( __kmp_env_consistency_check )
1110  __kmp_push_sync( global_tid, ct_critical, loc, lck );
1111 
1112  /* since the critical directive binds to all threads, not just
1113  * the current team we have to check this even if we are in a
1114  * serialized team */
1115  /* also, even if we are the uber thread, we still have to conduct the lock,
1116  * as we have to contend with sibling threads */
1117 
1118 #if USE_ITT_BUILD
1119  __kmp_itt_critical_acquiring( lck );
1120 #endif /* USE_ITT_BUILD */
1121  // Value of 'crit' should be good for using as a critical_id of the critical section directive.
1122  __kmp_acquire_user_lock_with_checks( lck, global_tid );
1123 
1124 #if USE_ITT_BUILD
1125  __kmp_itt_critical_acquired( lck );
1126 #endif /* USE_ITT_BUILD */
1127 
1128  KMP_START_EXPLICIT_TIMER(OMP_critical);
1129  KA_TRACE( 15, ("__kmpc_critical: done T#%d\n", global_tid ));
1130 #endif // KMP_USE_DYNAMIC_LOCK
1131 }
1132 
1133 #if KMP_USE_DYNAMIC_LOCK
1134 
1135 // Converts the given hint to an internal lock implementation
1136 static __forceinline kmp_dyna_lockseq_t
1137 __kmp_map_hint_to_lock(uintptr_t hint)
1138 {
1139 #if KMP_USE_TSX
1140 # define KMP_TSX_LOCK(seq) lockseq_##seq
1141 #else
1142 # define KMP_TSX_LOCK(seq) __kmp_user_lock_seq
1143 #endif
1144 
1145 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1146 # define KMP_CPUINFO_RTM (__kmp_cpuinfo.rtm)
1147 #else
1148 # define KMP_CPUINFO_RTM 0
1149 #endif
1150 
1151  // Hints that do not require further logic
1152  if (hint & kmp_lock_hint_hle)
1153  return KMP_TSX_LOCK(hle);
1154  if (hint & kmp_lock_hint_rtm)
1155  return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(rtm): __kmp_user_lock_seq;
1156  if (hint & kmp_lock_hint_adaptive)
1157  return KMP_CPUINFO_RTM ? KMP_TSX_LOCK(adaptive): __kmp_user_lock_seq;
1158 
1159  // Rule out conflicting hints first by returning the default lock
1160  if ((hint & omp_lock_hint_contended) && (hint & omp_lock_hint_uncontended))
1161  return __kmp_user_lock_seq;
1162  if ((hint & omp_lock_hint_speculative) && (hint & omp_lock_hint_nonspeculative))
1163  return __kmp_user_lock_seq;
1164 
1165  // Do not even consider speculation when it appears to be contended
1166  if (hint & omp_lock_hint_contended)
1167  return lockseq_queuing;
1168 
1169  // Uncontended lock without speculation
1170  if ((hint & omp_lock_hint_uncontended) && !(hint & omp_lock_hint_speculative))
1171  return lockseq_tas;
1172 
1173  // HLE lock for speculation
1174  if (hint & omp_lock_hint_speculative)
1175  return KMP_TSX_LOCK(hle);
1176 
1177  return __kmp_user_lock_seq;
1178 }
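/*
 * Sketch of the user-level view of the mapping above (OpenMP 4.5 hinted
 * locks): for example, a speculative hint is routed to an HLE lock when TSX
 * is available and to the default lock otherwise (hypothetical example; omp.h
 * is already included at the top of this file).
 */
void example_hinted_lock(void)
{
    omp_lock_t l;
    omp_init_lock_with_hint(&l, omp_lock_hint_speculative);
    omp_set_lock(&l);
    /* ... short critical work ... */
    omp_unset_lock(&l);
    omp_destroy_lock(&l);
}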
1179 
1192 void
1193 __kmpc_critical_with_hint( ident_t * loc, kmp_int32 global_tid, kmp_critical_name * crit, uintptr_t hint )
1194 {
1195  KMP_COUNT_BLOCK(OMP_CRITICAL);
1196  kmp_user_lock_p lck;
1197 
1198  KC_TRACE( 10, ("__kmpc_critical: called T#%d\n", global_tid ) );
1199 
1200  kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
1201  // Check if it is initialized.
1202  if (*lk == 0) {
1203  kmp_dyna_lockseq_t lckseq = __kmp_map_hint_to_lock(hint);
1204  if (KMP_IS_D_LOCK(lckseq)) {
1205  KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0, KMP_GET_D_TAG(lckseq));
1206  } else {
1207  __kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(lckseq));
1208  }
1209  }
1210  // Branch for accessing the actual lock object and set operation. This branching is inevitable since
1211  // this lock initialization does not follow the normal dispatch path (lock table is not used).
1212  if (KMP_EXTRACT_D_TAG(lk) != 0) {
1213  lck = (kmp_user_lock_p)lk;
1214  if (__kmp_env_consistency_check) {
1215  __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_map_hint_to_lock(hint));
1216  }
1217 # if USE_ITT_BUILD
1218  __kmp_itt_critical_acquiring(lck);
1219 # endif
1220 # if KMP_USE_INLINED_TAS
1221  if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
1222  KMP_ACQUIRE_TAS_LOCK(lck, global_tid);
1223  } else
1224 # elif KMP_USE_INLINED_FUTEX
1225  if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
1226  KMP_ACQUIRE_FUTEX_LOCK(lck, global_tid);
1227  } else
1228 # endif
1229  {
1230  KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
1231  }
1232  } else {
1233  kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
1234  lck = ilk->lock;
1235  if (__kmp_env_consistency_check) {
1236  __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_map_hint_to_lock(hint));
1237  }
1238 # if USE_ITT_BUILD
1239  __kmp_itt_critical_acquiring(lck);
1240 # endif
1241  KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
1242  }
1243 
1244 #if USE_ITT_BUILD
1245  __kmp_itt_critical_acquired( lck );
1246 #endif /* USE_ITT_BUILD */
1247 
1248  KMP_PUSH_PARTITIONED_TIMER(OMP_critical);
1249  KA_TRACE( 15, ("__kmpc_critical: done T#%d\n", global_tid ));
1250 } // __kmpc_critical_with_hint
1251 
1252 #endif // KMP_USE_DYNAMIC_LOCK
1253 
1263 void
1264 __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid, kmp_critical_name *crit)
1265 {
1266  kmp_user_lock_p lck;
1267 
1268  KC_TRACE( 10, ("__kmpc_end_critical: called T#%d\n", global_tid ));
1269 
1270 #if KMP_USE_DYNAMIC_LOCK
1271  if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
1272  lck = (kmp_user_lock_p)crit;
1273  KMP_ASSERT(lck != NULL);
1274  if (__kmp_env_consistency_check) {
1275  __kmp_pop_sync(global_tid, ct_critical, loc);
1276  }
1277 # if USE_ITT_BUILD
1278  __kmp_itt_critical_releasing( lck );
1279 # endif
1280 # if KMP_USE_INLINED_TAS
1281  if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
1282  KMP_RELEASE_TAS_LOCK(lck, global_tid);
1283  } else
1284 # elif KMP_USE_INLINED_FUTEX
1285  if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
1286  KMP_RELEASE_FUTEX_LOCK(lck, global_tid);
1287  } else
1288 # endif
1289  {
1290  KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
1291  }
1292  } else {
1293  kmp_indirect_lock_t *ilk = (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
1294  KMP_ASSERT(ilk != NULL);
1295  lck = ilk->lock;
1296  if (__kmp_env_consistency_check) {
1297  __kmp_pop_sync(global_tid, ct_critical, loc);
1298  }
1299 # if USE_ITT_BUILD
1300  __kmp_itt_critical_releasing( lck );
1301 # endif
1302  KMP_I_LOCK_FUNC(ilk, unset)(lck, global_tid);
1303  }
1304 
1305 #else // KMP_USE_DYNAMIC_LOCK
1306 
1307  if ( ( __kmp_user_lock_kind == lk_tas )
1308  && ( sizeof( lck->tas.lk.poll ) <= OMP_CRITICAL_SIZE ) ) {
1309  lck = (kmp_user_lock_p)crit;
1310  }
1311 #if KMP_USE_FUTEX
1312  else if ( ( __kmp_user_lock_kind == lk_futex )
1313  && ( sizeof( lck->futex.lk.poll ) <= OMP_CRITICAL_SIZE ) ) {
1314  lck = (kmp_user_lock_p)crit;
1315  }
1316 #endif
1317  else { // ticket, queuing or drdpa
1318  lck = (kmp_user_lock_p) TCR_PTR(*((kmp_user_lock_p *)crit));
1319  }
1320 
1321  KMP_ASSERT(lck != NULL);
1322 
1323  if ( __kmp_env_consistency_check )
1324  __kmp_pop_sync( global_tid, ct_critical, loc );
1325 
1326 #if USE_ITT_BUILD
1327  __kmp_itt_critical_releasing( lck );
1328 #endif /* USE_ITT_BUILD */
1329  // Value of 'crit' should be good for using as a critical_id of the critical section directive.
1330  __kmp_release_user_lock_with_checks( lck, global_tid );
1331 
1332 #if OMPT_SUPPORT && OMPT_BLAME
1333  if (ompt_enabled &&
1334  ompt_callbacks.ompt_callback(ompt_event_release_critical)) {
1335  ompt_callbacks.ompt_callback(ompt_event_release_critical)(
1336  (uint64_t) lck);
1337  }
1338 #endif
1339 
1340 #endif // KMP_USE_DYNAMIC_LOCK
1341  KMP_POP_PARTITIONED_TIMER();
1342  KA_TRACE( 15, ("__kmpc_end_critical: done T#%d\n", global_tid ));
1343 }
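/*
 * Sketch of the lowering of a named critical section, e.g.
 *   #pragma omp critical(update)
 * Each critical name gets a statically allocated kmp_critical_name cache that
 * is passed to both entry points (hypothetical names).
 */
static kmp_critical_name example_crit_update;   /* one cache per critical name */

void example_critical_update(ident_t *loc, kmp_int32 gtid, int *counter)
{
    __kmpc_critical(loc, gtid, &example_crit_update);
    ++*counter;                                  /* body of the critical section */
    __kmpc_end_critical(loc, gtid, &example_crit_update);
}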
1344 
1353 kmp_int32
1354 __kmpc_barrier_master(ident_t *loc, kmp_int32 global_tid)
1355 {
1356  int status;
1357 
1358  KC_TRACE( 10, ("__kmpc_barrier_master: called T#%d\n", global_tid ) );
1359 
1360  if (! TCR_4(__kmp_init_parallel))
1361  __kmp_parallel_initialize();
1362 
1363  if ( __kmp_env_consistency_check )
1364  __kmp_check_barrier( global_tid, ct_barrier, loc );
1365 
1366 #if USE_ITT_NOTIFY
1367  __kmp_threads[global_tid]->th.th_ident = loc;
1368 #endif
1369  status = __kmp_barrier( bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL );
1370 
1371  return (status != 0) ? 0 : 1;
1372 }
1373 
1383 void
1384 __kmpc_end_barrier_master(ident_t *loc, kmp_int32 global_tid)
1385 {
1386  KC_TRACE( 10, ("__kmpc_end_barrier_master: called T#%d\n", global_tid ));
1387 
1388  __kmp_end_split_barrier ( bs_plain_barrier, global_tid );
1389 }
1390 
1401 kmp_int32
1402 __kmpc_barrier_master_nowait( ident_t * loc, kmp_int32 global_tid )
1403 {
1404  kmp_int32 ret;
1405 
1406  KC_TRACE( 10, ("__kmpc_barrier_master_nowait: called T#%d\n", global_tid ));
1407 
1408  if (! TCR_4(__kmp_init_parallel))
1409  __kmp_parallel_initialize();
1410 
1411  if ( __kmp_env_consistency_check ) {
1412  if ( loc == 0 ) {
1413  KMP_WARNING( ConstructIdentInvalid ); // ??? What does it mean for the user?
1414  }
1415  __kmp_check_barrier( global_tid, ct_barrier, loc );
1416  }
1417 
1418 #if USE_ITT_NOTIFY
1419  __kmp_threads[global_tid]->th.th_ident = loc;
1420 #endif
1421  __kmp_barrier( bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL );
1422 
1423  ret = __kmpc_master (loc, global_tid);
1424 
1425  if ( __kmp_env_consistency_check ) {
1426  /* there's no __kmpc_end_master called; so the (stats) */
1427  /* actions of __kmpc_end_master are done here */
1428 
1429  if ( global_tid < 0 ) {
1430  KMP_WARNING( ThreadIdentInvalid );
1431  }
1432  if (ret) {
1433  /* only one thread should do the pop since only */
1434  /* one did the push (see __kmpc_master()) */
1435 
1436  __kmp_pop_sync( global_tid, ct_master, loc );
1437  }
1438  }
1439 
1440  return (ret);
1441 }
1442 
1443 /* The BARRIER for a SINGLE process section is always explicit */
1455 kmp_int32
1456 __kmpc_single(ident_t *loc, kmp_int32 global_tid)
1457 {
1458  kmp_int32 rc = __kmp_enter_single( global_tid, loc, TRUE );
1459 
1460  if (rc) {
1461  // We are going to execute the single statement, so we should count it.
1462  KMP_COUNT_BLOCK(OMP_SINGLE);
1463  KMP_PUSH_PARTITIONED_TIMER(OMP_single);
1464  }
1465 
1466 #if OMPT_SUPPORT && OMPT_TRACE
1467  kmp_info_t *this_thr = __kmp_threads[ global_tid ];
1468  kmp_team_t *team = this_thr -> th.th_team;
1469  int tid = __kmp_tid_from_gtid( global_tid );
1470 
1471  if (ompt_enabled) {
1472  if (rc) {
1473  if (ompt_callbacks.ompt_callback(ompt_event_single_in_block_begin)) {
1474  ompt_callbacks.ompt_callback(ompt_event_single_in_block_begin)(
1475  team->t.ompt_team_info.parallel_id,
1476  team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id,
1477  team->t.ompt_team_info.microtask);
1478  }
1479  } else {
1480  if (ompt_callbacks.ompt_callback(ompt_event_single_others_begin)) {
1481  ompt_callbacks.ompt_callback(ompt_event_single_others_begin)(
1482  team->t.ompt_team_info.parallel_id,
1483  team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
1484  }
1485  this_thr->th.ompt_thread_info.state = ompt_state_wait_single;
1486  }
1487  }
1488 #endif
1489 
1490  return rc;
1491 }
1492 
1502 void
1503 __kmpc_end_single(ident_t *loc, kmp_int32 global_tid)
1504 {
1505  __kmp_exit_single( global_tid );
1506  KMP_POP_PARTITIONED_TIMER();
1507 
1508 #if OMPT_SUPPORT && OMPT_TRACE
1509  kmp_info_t *this_thr = __kmp_threads[ global_tid ];
1510  kmp_team_t *team = this_thr -> th.th_team;
1511  int tid = __kmp_tid_from_gtid( global_tid );
1512 
1513  if (ompt_enabled &&
1514  ompt_callbacks.ompt_callback(ompt_event_single_in_block_end)) {
1515  ompt_callbacks.ompt_callback(ompt_event_single_in_block_end)(
1516  team->t.ompt_team_info.parallel_id,
1517  team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
1518  }
1519 #endif
1520 }
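/*
 * Sketch of the lowering of a single construct: the body runs only in the
 * thread for which __kmpc_single returns 1, and the implicit barrier at the
 * end is emitted separately (hypothetical example).
 */
void example_single(ident_t *loc, kmp_int32 gtid)
{
    if (__kmpc_single(loc, gtid)) {
        /* ... executed by exactly one thread of the team ... */
        __kmpc_end_single(loc, gtid);
    }
    __kmpc_barrier(loc, gtid);   /* implicit barrier; dropped for 'nowait' */
}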
1521 
1529 void
1530 __kmpc_for_static_fini( ident_t *loc, kmp_int32 global_tid )
1531 {
1532  KE_TRACE( 10, ("__kmpc_for_static_fini called T#%d\n", global_tid));
1533 
1534 #if OMPT_SUPPORT && OMPT_TRACE
1535  if (ompt_enabled &&
1536  ompt_callbacks.ompt_callback(ompt_event_loop_end)) {
1537  ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1538  ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
1539  ompt_callbacks.ompt_callback(ompt_event_loop_end)(
1540  team_info->parallel_id, task_info->task_id);
1541  }
1542 #endif
1543 
1544  if ( __kmp_env_consistency_check )
1545  __kmp_pop_workshare( global_tid, ct_pdo, loc );
1546 }
1547 
1548 /*
1549  * User routines which take C-style arguments (call by value)
1550  * different from the Fortran equivalent routines
1551  */
1552 
1553 void
1554 ompc_set_num_threads( int arg )
1555 {
1556 // !!!!! TODO: check the per-task binding
1557  __kmp_set_num_threads( arg, __kmp_entry_gtid() );
1558 }
1559 
1560 void
1561 ompc_set_dynamic( int flag )
1562 {
1563  kmp_info_t *thread;
1564 
1565  /* For the thread-private implementation of the internal controls */
1566  thread = __kmp_entry_thread();
1567 
1568  __kmp_save_internal_controls( thread );
1569 
1570  set__dynamic( thread, flag ? TRUE : FALSE );
1571 }
1572 
1573 void
1574 ompc_set_nested( int flag )
1575 {
1576  kmp_info_t *thread;
1577 
1578  /* For the thread-private internal controls implementation */
1579  thread = __kmp_entry_thread();
1580 
1581  __kmp_save_internal_controls( thread );
1582 
1583  set__nested( thread, flag ? TRUE : FALSE );
1584 }
1585 
1586 void
1587 ompc_set_max_active_levels( int max_active_levels )
1588 {
1589  /* TO DO */
1590  /* we want per-task implementation of this internal control */
1591 
1592  /* For the per-thread internal controls implementation */
1593  __kmp_set_max_active_levels( __kmp_entry_gtid(), max_active_levels );
1594 }
1595 
1596 void
1597 ompc_set_schedule( omp_sched_t kind, int modifier )
1598 {
1599 // !!!!! TODO: check the per-task binding
1600  __kmp_set_schedule( __kmp_entry_gtid(), ( kmp_sched_t ) kind, modifier );
1601 }
1602 
1603 int
1604 ompc_get_ancestor_thread_num( int level )
1605 {
1606  return __kmp_get_ancestor_thread_num( __kmp_entry_gtid(), level );
1607 }
1608 
1609 int
1610 ompc_get_team_size( int level )
1611 {
1612  return __kmp_get_team_size( __kmp_entry_gtid(), level );
1613 }
1614 
1615 void
1616 kmpc_set_stacksize( int arg )
1617 {
1618  // __kmp_aux_set_stacksize initializes the library if needed
1619  __kmp_aux_set_stacksize( arg );
1620 }
1621 
1622 void
1623 kmpc_set_stacksize_s( size_t arg )
1624 {
1625  // __kmp_aux_set_stacksize initializes the library if needed
1626  __kmp_aux_set_stacksize( arg );
1627 }
1628 
1629 void
1630 kmpc_set_blocktime( int arg )
1631 {
1632  int gtid, tid;
1633  kmp_info_t *thread;
1634 
1635  gtid = __kmp_entry_gtid();
1636  tid = __kmp_tid_from_gtid(gtid);
1637  thread = __kmp_thread_from_gtid(gtid);
1638 
1639  __kmp_aux_set_blocktime( arg, thread, tid );
1640 }
1641 
1642 void
1643 kmpc_set_library( int arg )
1644 {
1645  // __kmp_user_set_library initializes the library if needed
1646  __kmp_user_set_library( (enum library_type)arg );
1647 }
1648 
1649 void
1650 kmpc_set_defaults( char const * str )
1651 {
1652  // __kmp_aux_set_defaults initializes the library if needed
1653  __kmp_aux_set_defaults( str, KMP_STRLEN( str ) );
1654 }
1655 
1656 void
1657 kmpc_set_disp_num_buffers( int arg )
1658 {
1659  // ignore after initialization because some teams have already
1660  // allocated dispatch buffers
1661  if( __kmp_init_serial == 0 && arg > 0 )
1662  __kmp_dispatch_num_buffers = arg;
1663 }
1664 
1665 int
1666 kmpc_set_affinity_mask_proc( int proc, void **mask )
1667 {
1668 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
1669  return -1;
1670 #else
1671  if ( ! TCR_4(__kmp_init_middle) ) {
1672  __kmp_middle_initialize();
1673  }
1674  return __kmp_aux_set_affinity_mask_proc( proc, mask );
1675 #endif
1676 }
1677 
1678 int
1679 kmpc_unset_affinity_mask_proc( int proc, void **mask )
1680 {
1681 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
1682  return -1;
1683 #else
1684  if ( ! TCR_4(__kmp_init_middle) ) {
1685  __kmp_middle_initialize();
1686  }
1687  return __kmp_aux_unset_affinity_mask_proc( proc, mask );
1688 #endif
1689 }
1690 
1691 int
1692 kmpc_get_affinity_mask_proc( int proc, void **mask )
1693 {
1694 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
1695  return -1;
1696 #else
1697  if ( ! TCR_4(__kmp_init_middle) ) {
1698  __kmp_middle_initialize();
1699  }
1700  return __kmp_aux_get_affinity_mask_proc( proc, mask );
1701 #endif
1702 }
1703 
1704 
1705 /* -------------------------------------------------------------------------- */
1746 void
1747 __kmpc_copyprivate( ident_t *loc, kmp_int32 gtid, size_t cpy_size, void *cpy_data, void(*cpy_func)(void*,void*), kmp_int32 didit )
1748 {
1749  void **data_ptr;
1750 
1751  KC_TRACE( 10, ("__kmpc_copyprivate: called T#%d\n", gtid ));
1752 
1753  KMP_MB();
1754 
1755  data_ptr = & __kmp_team_from_gtid( gtid )->t.t_copypriv_data;
1756 
1757  if ( __kmp_env_consistency_check ) {
1758  if ( loc == 0 ) {
1759  KMP_WARNING( ConstructIdentInvalid );
1760  }
1761  }
1762 
1763  /* ToDo: Optimize the following two barriers into some kind of split barrier */
1764 
1765  if (didit) *data_ptr = cpy_data;
1766 
1767  /* This barrier is not a barrier region boundary */
1768 #if USE_ITT_NOTIFY
1769  __kmp_threads[gtid]->th.th_ident = loc;
1770 #endif
1771  __kmp_barrier( bs_plain_barrier, gtid, FALSE , 0, NULL, NULL );
1772 
1773  if (! didit) (*cpy_func)( cpy_data, *data_ptr );
1774 
1775  /* Consider next barrier the user-visible barrier for barrier region boundaries */
1776  /* Nesting checks are already handled by the single construct checks */
1777 
1778 #if USE_ITT_NOTIFY
1779  __kmp_threads[gtid]->th.th_ident = loc; // TODO: check if it is needed (e.g. tasks can overwrite the location)
1780 #endif
1781  __kmp_barrier( bs_plain_barrier, gtid, FALSE , 0, NULL, NULL );
1782 }
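/*
 * Sketch of the lowering of
 *   #pragma omp single copyprivate(x)
 * The single thread publishes the address of its private x; the other threads
 * copy it back through the callback (hypothetical names).
 */
static void example_copy_int(void *dst, void *src)
{
    *(int *)dst = *(int *)src;
}

void example_single_copyprivate(ident_t *loc, kmp_int32 gtid, int *x)
{
    kmp_int32 didit = __kmpc_single(loc, gtid);
    if (didit) {
        *x = 42;                     /* value produced by the single thread */
        __kmpc_end_single(loc, gtid);
    }
    __kmpc_copyprivate(loc, gtid, sizeof(*x), x, example_copy_int, didit);
}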
1783 
1784 /* -------------------------------------------------------------------------- */
1785 
1786 #define INIT_LOCK __kmp_init_user_lock_with_checks
1787 #define INIT_NESTED_LOCK __kmp_init_nested_user_lock_with_checks
1788 #define ACQUIRE_LOCK __kmp_acquire_user_lock_with_checks
1789 #define ACQUIRE_LOCK_TIMED __kmp_acquire_user_lock_with_checks_timed
1790 #define ACQUIRE_NESTED_LOCK __kmp_acquire_nested_user_lock_with_checks
1791 #define ACQUIRE_NESTED_LOCK_TIMED __kmp_acquire_nested_user_lock_with_checks_timed
1792 #define RELEASE_LOCK __kmp_release_user_lock_with_checks
1793 #define RELEASE_NESTED_LOCK __kmp_release_nested_user_lock_with_checks
1794 #define TEST_LOCK __kmp_test_user_lock_with_checks
1795 #define TEST_NESTED_LOCK __kmp_test_nested_user_lock_with_checks
1796 #define DESTROY_LOCK __kmp_destroy_user_lock_with_checks
1797 #define DESTROY_NESTED_LOCK __kmp_destroy_nested_user_lock_with_checks
1798 
1799 
1800 /*
1801  * TODO: Make check abort messages use location info & pass it
1802  * into with_checks routines
1803  */
1804 
1805 #if KMP_USE_DYNAMIC_LOCK
1806 
1807 // internal lock initializer
1808 static __forceinline void
1809 __kmp_init_lock_with_hint(ident_t *loc, void **lock, kmp_dyna_lockseq_t seq)
1810 {
1811  if (KMP_IS_D_LOCK(seq)) {
1812  KMP_INIT_D_LOCK(lock, seq);
1813 #if USE_ITT_BUILD
1814  __kmp_itt_lock_creating((kmp_user_lock_p)lock, NULL);
1815 #endif
1816  } else {
1817  KMP_INIT_I_LOCK(lock, seq);
1818 #if USE_ITT_BUILD
1819  kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
1820  __kmp_itt_lock_creating(ilk->lock, loc);
1821 #endif
1822  }
1823 }
1824 
1825 // internal nest lock initializer
1826 static __forceinline void
1827 __kmp_init_nest_lock_with_hint(ident_t *loc, void **lock, kmp_dyna_lockseq_t seq)
1828 {
1829 #if KMP_USE_TSX
1830  // Don't have nested lock implementation for speculative locks
1831  if (seq == lockseq_hle || seq == lockseq_rtm || seq == lockseq_adaptive)
1832  seq = __kmp_user_lock_seq;
1833 #endif
1834  switch (seq) {
1835  case lockseq_tas:
1836  seq = lockseq_nested_tas;
1837  break;
1838 #if KMP_USE_FUTEX
1839  case lockseq_futex:
1840  seq = lockseq_nested_futex;
1841  break;
1842 #endif
1843  case lockseq_ticket:
1844  seq = lockseq_nested_ticket;
1845  break;
1846  case lockseq_queuing:
1847  seq = lockseq_nested_queuing;
1848  break;
1849  case lockseq_drdpa:
1850  seq = lockseq_nested_drdpa;
1851  break;
1852  default:
1853  seq = lockseq_nested_queuing;
1854  }
1855  KMP_INIT_I_LOCK(lock, seq);
1856 #if USE_ITT_BUILD
1857  kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(lock);
1858  __kmp_itt_lock_creating(ilk->lock, loc);
1859 #endif
1860 }
1861 
1862 /* initialize the lock with a hint */
1863 void
1864 __kmpc_init_lock_with_hint(ident_t *loc, kmp_int32 gtid, void **user_lock, uintptr_t hint)
1865 {
1866  KMP_DEBUG_ASSERT(__kmp_init_serial);
1867  if (__kmp_env_consistency_check && user_lock == NULL) {
1868  KMP_FATAL(LockIsUninitialized, "omp_init_lock_with_hint");
1869  }
1870 
1871  __kmp_init_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
1872 }
1873 
1874 /* initialize the lock with a hint */
1875 void
1876 __kmpc_init_nest_lock_with_hint(ident_t *loc, kmp_int32 gtid, void **user_lock, uintptr_t hint)
1877 {
1878  KMP_DEBUG_ASSERT(__kmp_init_serial);
1879  if (__kmp_env_consistency_check && user_lock == NULL) {
1880  KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock_with_hint");
1881  }
1882 
1883  __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_map_hint_to_lock(hint));
1884 }
1885 
1886 #endif // KMP_USE_DYNAMIC_LOCK
1887 
1888 /* initialize the lock */
1889 void
1890 __kmpc_init_lock( ident_t * loc, kmp_int32 gtid, void ** user_lock ) {
1891 #if KMP_USE_DYNAMIC_LOCK
1892  KMP_DEBUG_ASSERT(__kmp_init_serial);
1893  if (__kmp_env_consistency_check && user_lock == NULL) {
1894  KMP_FATAL(LockIsUninitialized, "omp_init_lock");
1895  }
1896  __kmp_init_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
1897 
1898 #else // KMP_USE_DYNAMIC_LOCK
1899 
1900  static char const * const func = "omp_init_lock";
1901  kmp_user_lock_p lck;
1902  KMP_DEBUG_ASSERT( __kmp_init_serial );
1903 
1904  if ( __kmp_env_consistency_check ) {
1905  if ( user_lock == NULL ) {
1906  KMP_FATAL( LockIsUninitialized, func );
1907  }
1908  }
1909 
1910  KMP_CHECK_USER_LOCK_INIT();
1911 
1912  if ( ( __kmp_user_lock_kind == lk_tas )
1913  && ( sizeof( lck->tas.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
1914  lck = (kmp_user_lock_p)user_lock;
1915  }
1916 #if KMP_USE_FUTEX
1917  else if ( ( __kmp_user_lock_kind == lk_futex )
1918  && ( sizeof( lck->futex.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
1919  lck = (kmp_user_lock_p)user_lock;
1920  }
1921 #endif
1922  else {
1923  lck = __kmp_user_lock_allocate( user_lock, gtid, 0 );
1924  }
1925  INIT_LOCK( lck );
1926  __kmp_set_user_lock_location( lck, loc );
1927 
1928 #if OMPT_SUPPORT && OMPT_TRACE
1929  if (ompt_enabled &&
1930  ompt_callbacks.ompt_callback(ompt_event_init_lock)) {
1931  ompt_callbacks.ompt_callback(ompt_event_init_lock)((uint64_t) lck);
1932  }
1933 #endif
1934 
1935 #if USE_ITT_BUILD
1936  __kmp_itt_lock_creating( lck );
1937 #endif /* USE_ITT_BUILD */
1938 
1939 #endif // KMP_USE_DYNAMIC_LOCK
1940 } // __kmpc_init_lock
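/*
 * Sketch of how the user-level lock API reaches these entry points: in this
 * runtime the omp_init_lock()/omp_destroy_lock() wrappers forward the address
 * of the user's lock object as the void** argument seen here (hypothetical
 * example; gtid and loc are supplied by the wrapper).
 */
void example_lock_lifecycle(ident_t *loc, kmp_int32 gtid, void **user_lock)
{
    __kmpc_init_lock(loc, gtid, user_lock);
    /* ... the lock is acquired and released between these two calls via
       __kmpc_set_lock (defined later in this file) and the matching
       release entry point ... */
    __kmpc_destroy_lock(loc, gtid, user_lock);
}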
1941 
1942 /* initialize the lock */
1943 void
1944 __kmpc_init_nest_lock( ident_t * loc, kmp_int32 gtid, void ** user_lock ) {
1945 #if KMP_USE_DYNAMIC_LOCK
1946 
1947  KMP_DEBUG_ASSERT(__kmp_init_serial);
1948  if (__kmp_env_consistency_check && user_lock == NULL) {
1949  KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock");
1950  }
1951  __kmp_init_nest_lock_with_hint(loc, user_lock, __kmp_user_lock_seq);
1952 
1953 #else // KMP_USE_DYNAMIC_LOCK
1954 
1955  static char const * const func = "omp_init_nest_lock";
1956  kmp_user_lock_p lck;
1957  KMP_DEBUG_ASSERT( __kmp_init_serial );
1958 
1959  if ( __kmp_env_consistency_check ) {
1960  if ( user_lock == NULL ) {
1961  KMP_FATAL( LockIsUninitialized, func );
1962  }
1963  }
1964 
1965  KMP_CHECK_USER_LOCK_INIT();
1966 
1967  if ( ( __kmp_user_lock_kind == lk_tas ) && ( sizeof( lck->tas.lk.poll )
1968  + sizeof( lck->tas.lk.depth_locked ) <= OMP_NEST_LOCK_T_SIZE ) ) {
1969  lck = (kmp_user_lock_p)user_lock;
1970  }
1971 #if KMP_USE_FUTEX
1972  else if ( ( __kmp_user_lock_kind == lk_futex )
1973  && ( sizeof( lck->futex.lk.poll ) + sizeof( lck->futex.lk.depth_locked )
1974  <= OMP_NEST_LOCK_T_SIZE ) ) {
1975  lck = (kmp_user_lock_p)user_lock;
1976  }
1977 #endif
1978  else {
1979  lck = __kmp_user_lock_allocate( user_lock, gtid, 0 );
1980  }
1981 
1982  INIT_NESTED_LOCK( lck );
1983  __kmp_set_user_lock_location( lck, loc );
1984 
1985 #if OMPT_SUPPORT && OMPT_TRACE
1986  if (ompt_enabled &&
1987  ompt_callbacks.ompt_callback(ompt_event_init_nest_lock)) {
1988  ompt_callbacks.ompt_callback(ompt_event_init_nest_lock)((uint64_t) lck);
1989  }
1990 #endif
1991 
1992 #if USE_ITT_BUILD
1993  __kmp_itt_lock_creating( lck );
1994 #endif /* USE_ITT_BUILD */
1995 
1996 #endif // KMP_USE_DYNAMIC_LOCK
1997 } // __kmpc_init_nest_lock
1998 
1999 void
2000 __kmpc_destroy_lock( ident_t * loc, kmp_int32 gtid, void ** user_lock ) {
2001 #if KMP_USE_DYNAMIC_LOCK
2002 
2003 # if USE_ITT_BUILD
2004  kmp_user_lock_p lck;
2005  if (KMP_EXTRACT_D_TAG(user_lock) == 0) {
2006  lck = ((kmp_indirect_lock_t *)KMP_LOOKUP_I_LOCK(user_lock))->lock;
2007  } else {
2008  lck = (kmp_user_lock_p)user_lock;
2009  }
2010  __kmp_itt_lock_destroyed(lck);
2011 # endif
2012  KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2013 #else
2014  kmp_user_lock_p lck;
2015 
2016  if ( ( __kmp_user_lock_kind == lk_tas )
2017  && ( sizeof( lck->tas.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
2018  lck = (kmp_user_lock_p)user_lock;
2019  }
2020 #if KMP_USE_FUTEX
2021  else if ( ( __kmp_user_lock_kind == lk_futex )
2022  && ( sizeof( lck->futex.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
2023  lck = (kmp_user_lock_p)user_lock;
2024  }
2025 #endif
2026  else {
2027  lck = __kmp_lookup_user_lock( user_lock, "omp_destroy_lock" );
2028  }
2029 
2030 #if OMPT_SUPPORT && OMPT_TRACE
2031  if (ompt_enabled &&
2032  ompt_callbacks.ompt_callback(ompt_event_destroy_lock)) {
2033  ompt_callbacks.ompt_callback(ompt_event_destroy_lock)((uint64_t) lck);
2034  }
2035 #endif
2036 
2037 #if USE_ITT_BUILD
2038  __kmp_itt_lock_destroyed( lck );
2039 #endif /* USE_ITT_BUILD */
2040  DESTROY_LOCK( lck );
2041 
2042  if ( ( __kmp_user_lock_kind == lk_tas )
2043  && ( sizeof( lck->tas.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
2044  ;
2045  }
2046 #if KMP_USE_FUTEX
2047  else if ( ( __kmp_user_lock_kind == lk_futex )
2048  && ( sizeof( lck->futex.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
2049  ;
2050  }
2051 #endif
2052  else {
2053  __kmp_user_lock_free( user_lock, gtid, lck );
2054  }
2055 #endif // KMP_USE_DYNAMIC_LOCK
2056 } // __kmpc_destroy_lock
2057 
2058 /* destroy the lock */
2059 void
2060 __kmpc_destroy_nest_lock( ident_t * loc, kmp_int32 gtid, void ** user_lock ) {
2061 #if KMP_USE_DYNAMIC_LOCK
2062 
2063 # if USE_ITT_BUILD
2064  kmp_indirect_lock_t *ilk = KMP_LOOKUP_I_LOCK(user_lock);
2065  __kmp_itt_lock_destroyed(ilk->lock);
2066 # endif
2067  KMP_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
2068 
2069 #else // KMP_USE_DYNAMIC_LOCK
2070 
2071  kmp_user_lock_p lck;
2072 
2073  if ( ( __kmp_user_lock_kind == lk_tas ) && ( sizeof( lck->tas.lk.poll )
2074  + sizeof( lck->tas.lk.depth_locked ) <= OMP_NEST_LOCK_T_SIZE ) ) {
2075  lck = (kmp_user_lock_p)user_lock;
2076  }
2077 #if KMP_USE_FUTEX
2078  else if ( ( __kmp_user_lock_kind == lk_futex )
2079  && ( sizeof( lck->futex.lk.poll ) + sizeof( lck->futex.lk.depth_locked )
2080  <= OMP_NEST_LOCK_T_SIZE ) ) {
2081  lck = (kmp_user_lock_p)user_lock;
2082  }
2083 #endif
2084  else {
2085  lck = __kmp_lookup_user_lock( user_lock, "omp_destroy_nest_lock" );
2086  }
2087 
2088 #if OMPT_SUPPORT && OMPT_TRACE
2089  if (ompt_enabled &&
2090  ompt_callbacks.ompt_callback(ompt_event_destroy_nest_lock)) {
2091  ompt_callbacks.ompt_callback(ompt_event_destroy_nest_lock)((uint64_t) lck);
2092  }
2093 #endif
2094 
2095 #if USE_ITT_BUILD
2096  __kmp_itt_lock_destroyed( lck );
2097 #endif /* USE_ITT_BUILD */
2098 
2099  DESTROY_NESTED_LOCK( lck );
2100 
2101  if ( ( __kmp_user_lock_kind == lk_tas ) && ( sizeof( lck->tas.lk.poll )
2102  + sizeof( lck->tas.lk.depth_locked ) <= OMP_NEST_LOCK_T_SIZE ) ) {
2103  ;
2104  }
2105 #if KMP_USE_FUTEX
2106  else if ( ( __kmp_user_lock_kind == lk_futex )
2107  && ( sizeof( lck->futex.lk.poll ) + sizeof( lck->futex.lk.depth_locked )
2108  <= OMP_NEST_LOCK_T_SIZE ) ) {
2109  ;
2110  }
2111 #endif
2112  else {
2113  __kmp_user_lock_free( user_lock, gtid, lck );
2114  }
2115 #endif // KMP_USE_DYNAMIC_LOCK
2116 } // __kmpc_destroy_nest_lock
2117 
2118 void
2119 __kmpc_set_lock( ident_t * loc, kmp_int32 gtid, void ** user_lock ) {
2120  KMP_COUNT_BLOCK(OMP_set_lock);
2121 #if KMP_USE_DYNAMIC_LOCK
2122  int tag = KMP_EXTRACT_D_TAG(user_lock);
2123 # if USE_ITT_BUILD
2124  __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock); // itt function will get to the right lock object.
2125 # endif
2126 # if KMP_USE_INLINED_TAS
2127  if (tag == locktag_tas && !__kmp_env_consistency_check) {
2128  KMP_ACQUIRE_TAS_LOCK(user_lock, gtid);
2129  } else
2130 # elif KMP_USE_INLINED_FUTEX
2131  if (tag == locktag_futex && !__kmp_env_consistency_check) {
2132  KMP_ACQUIRE_FUTEX_LOCK(user_lock, gtid);
2133  } else
2134 # endif
2135  {
2136  __kmp_direct_set[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2137  }
2138 # if USE_ITT_BUILD
2139  __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2140 # endif
2141 
2142 #else // KMP_USE_DYNAMIC_LOCK
2143 
2144  kmp_user_lock_p lck;
2145 
2146  if ( ( __kmp_user_lock_kind == lk_tas )
2147  && ( sizeof( lck->tas.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
2148  lck = (kmp_user_lock_p)user_lock;
2149  }
2150 #if KMP_USE_FUTEX
2151  else if ( ( __kmp_user_lock_kind == lk_futex )
2152  && ( sizeof( lck->futex.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
2153  lck = (kmp_user_lock_p)user_lock;
2154  }
2155 #endif
2156  else {
2157  lck = __kmp_lookup_user_lock( user_lock, "omp_set_lock" );
2158  }
2159 
2160 #if USE_ITT_BUILD
2161  __kmp_itt_lock_acquiring( lck );
2162 #endif /* USE_ITT_BUILD */
2163 
2164  ACQUIRE_LOCK( lck, gtid );
2165 
2166 #if USE_ITT_BUILD
2167  __kmp_itt_lock_acquired( lck );
2168 #endif /* USE_ITT_BUILD */
2169 
2170 #if OMPT_SUPPORT && OMPT_TRACE
2171  if (ompt_enabled &&
2172  ompt_callbacks.ompt_callback(ompt_event_acquired_lock)) {
2173  ompt_callbacks.ompt_callback(ompt_event_acquired_lock)((uint64_t) lck);
2174  }
2175 #endif
2176 
2177 #endif // KMP_USE_DYNAMIC_LOCK
2178 }
2179 
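/* Illustrative sketch (not part of kmp_csupport.c): the idea behind the inlined
 * TAS fast path used by __kmpc_set_lock() above when KMP_USE_INLINED_TAS is
 * enabled. A test-and-set lock spins on a compare-and-swap of a single "poll"
 * word: 0 means free, a non-zero owner token means held. The sketch uses C11
 * atomics and a hypothetical sketch_tas_lock_t type; the real KMP_ACQUIRE_TAS_LOCK
 * macro uses the runtime's own atomic primitives, an owner-derived busy value,
 * and yield/backoff while spinning. */
#include <stdatomic.h>

typedef struct { _Atomic int poll; } sketch_tas_lock_t;   /* hypothetical type */

static void sketch_tas_acquire(sketch_tas_lock_t *l, int owner_token)
{
    int expected = 0;                          /* 0 == unlocked */
    while (!atomic_compare_exchange_weak_explicit(
               &l->poll, &expected, owner_token,
               memory_order_acquire, memory_order_relaxed)) {
        expected = 0;                          /* CAS rewrote 'expected'; reset it */
        /* a real implementation would yield / back off here */
    }
}

static void sketch_tas_release(sketch_tas_lock_t *l)
{
    /* mirrors the fast release path in __kmpc_unset_lock() below, which simply
     * stores 0 to the poll word */
    atomic_store_explicit(&l->poll, 0, memory_order_release);
}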
2180 void
2181 __kmpc_set_nest_lock( ident_t * loc, kmp_int32 gtid, void ** user_lock ) {
2182 #if KMP_USE_DYNAMIC_LOCK
2183 
2184 # if USE_ITT_BUILD
2185  __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2186 # endif
2187  KMP_D_LOCK_FUNC(user_lock, set)((kmp_dyna_lock_t *)user_lock, gtid);
2188 # if USE_ITT_BUILD
2189  __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2190 #endif
2191 
2192 #if OMPT_SUPPORT && OMPT_TRACE
2193  if (ompt_enabled) {
2194  // missing support here: need to know whether acquired first or not
2195  }
2196 #endif
2197 
2198 #else // KMP_USE_DYNAMIC_LOCK
2199  int acquire_status;
2200  kmp_user_lock_p lck;
2201 
2202  if ( ( __kmp_user_lock_kind == lk_tas ) && ( sizeof( lck->tas.lk.poll )
2203  + sizeof( lck->tas.lk.depth_locked ) <= OMP_NEST_LOCK_T_SIZE ) ) {
2204  lck = (kmp_user_lock_p)user_lock;
2205  }
2206 #if KMP_USE_FUTEX
2207  else if ( ( __kmp_user_lock_kind == lk_futex )
2208  && ( sizeof( lck->futex.lk.poll ) + sizeof( lck->futex.lk.depth_locked )
2209  <= OMP_NEST_LOCK_T_SIZE ) ) {
2210  lck = (kmp_user_lock_p)user_lock;
2211  }
2212 #endif
2213  else {
2214  lck = __kmp_lookup_user_lock( user_lock, "omp_set_nest_lock" );
2215  }
2216 
2217 #if USE_ITT_BUILD
2218  __kmp_itt_lock_acquiring( lck );
2219 #endif /* USE_ITT_BUILD */
2220 
2221  ACQUIRE_NESTED_LOCK( lck, gtid, &acquire_status );
2222 
2223 #if USE_ITT_BUILD
2224  __kmp_itt_lock_acquired( lck );
2225 #endif /* USE_ITT_BUILD */
2226 
2227 #if OMPT_SUPPORT && OMPT_TRACE
2228  if (ompt_enabled) {
2229  if (acquire_status == KMP_LOCK_ACQUIRED_FIRST) {
2230  if(ompt_callbacks.ompt_callback(ompt_event_acquired_nest_lock_first))
2231  ompt_callbacks.ompt_callback(ompt_event_acquired_nest_lock_first)((uint64_t) lck);
2232  } else {
2233  if(ompt_callbacks.ompt_callback(ompt_event_acquired_nest_lock_next))
2234  ompt_callbacks.ompt_callback(ompt_event_acquired_nest_lock_next)((uint64_t) lck);
2235  }
2236  }
2237 #endif
2238 
2239 #endif // KMP_USE_DYNAMIC_LOCK
2240 }
2241 
2242 void
2243 __kmpc_unset_lock( ident_t *loc, kmp_int32 gtid, void **user_lock )
2244 {
2245 #if KMP_USE_DYNAMIC_LOCK
2246 
2247  int tag = KMP_EXTRACT_D_TAG(user_lock);
2248 # if USE_ITT_BUILD
2249  __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2250 # endif
2251 # if KMP_USE_INLINED_TAS
2252  if (tag == locktag_tas && !__kmp_env_consistency_check) {
2253  KMP_RELEASE_TAS_LOCK(user_lock, gtid);
2254  } else
2255 # elif KMP_USE_INLINED_FUTEX
2256  if (tag == locktag_futex && !__kmp_env_consistency_check) {
2257  KMP_RELEASE_FUTEX_LOCK(user_lock, gtid);
2258  } else
2259 # endif
2260  {
2261  __kmp_direct_unset[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2262  }
2263 
2264 #else // KMP_USE_DYNAMIC_LOCK
2265 
2266  kmp_user_lock_p lck;
2267 
2268  /* Can't use serial interval since not block structured */
2269  /* release the lock */
2270 
2271  if ( ( __kmp_user_lock_kind == lk_tas )
2272  && ( sizeof( lck->tas.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
2273 #if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2274  // "fast" path implemented to fix customer performance issue
2275 #if USE_ITT_BUILD
2276  __kmp_itt_lock_releasing( (kmp_user_lock_p)user_lock );
2277 #endif /* USE_ITT_BUILD */
2278  TCW_4(((kmp_user_lock_p)user_lock)->tas.lk.poll, 0);
2279  KMP_MB();
2280  return;
2281 #else
2282  lck = (kmp_user_lock_p)user_lock;
2283 #endif
2284  }
2285 #if KMP_USE_FUTEX
2286  else if ( ( __kmp_user_lock_kind == lk_futex )
2287  && ( sizeof( lck->futex.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
2288  lck = (kmp_user_lock_p)user_lock;
2289  }
2290 #endif
2291  else {
2292  lck = __kmp_lookup_user_lock( user_lock, "omp_unset_lock" );
2293  }
2294 
2295 #if USE_ITT_BUILD
2296  __kmp_itt_lock_releasing( lck );
2297 #endif /* USE_ITT_BUILD */
2298 
2299  RELEASE_LOCK( lck, gtid );
2300 
2301 #if OMPT_SUPPORT && OMPT_BLAME
2302  if (ompt_enabled &&
2303  ompt_callbacks.ompt_callback(ompt_event_release_lock)) {
2304  ompt_callbacks.ompt_callback(ompt_event_release_lock)((uint64_t) lck);
2305  }
2306 #endif
2307 
2308 #endif // KMP_USE_DYNAMIC_LOCK
2309 }
2310 
2311 /* release the lock */
2312 void
2313 __kmpc_unset_nest_lock( ident_t *loc, kmp_int32 gtid, void **user_lock )
2314 {
2315 #if KMP_USE_DYNAMIC_LOCK
2316 
2317 # if USE_ITT_BUILD
2318  __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2319 # endif
2320  KMP_D_LOCK_FUNC(user_lock, unset)((kmp_dyna_lock_t *)user_lock, gtid);
2321 
2322 #else // KMP_USE_DYNAMIC_LOCK
2323 
2324  kmp_user_lock_p lck;
2325 
2326  /* Can't use serial interval since not block structured */
2327 
2328  if ( ( __kmp_user_lock_kind == lk_tas ) && ( sizeof( lck->tas.lk.poll )
2329  + sizeof( lck->tas.lk.depth_locked ) <= OMP_NEST_LOCK_T_SIZE ) ) {
2330 #if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2331  // "fast" path implemented to fix customer performance issue
2332  kmp_tas_lock_t *tl = (kmp_tas_lock_t*)user_lock;
2333 #if USE_ITT_BUILD
2334  __kmp_itt_lock_releasing( (kmp_user_lock_p)user_lock );
2335 #endif /* USE_ITT_BUILD */
2336  if ( --(tl->lk.depth_locked) == 0 ) {
2337  TCW_4(tl->lk.poll, 0);
2338  }
2339  KMP_MB();
2340  return;
2341 #else
2342  lck = (kmp_user_lock_p)user_lock;
2343 #endif
2344  }
2345 #if KMP_USE_FUTEX
2346  else if ( ( __kmp_user_lock_kind == lk_futex )
2347  && ( sizeof( lck->futex.lk.poll ) + sizeof( lck->futex.lk.depth_locked )
2348  <= OMP_NEST_LOCK_T_SIZE ) ) {
2349  lck = (kmp_user_lock_p)user_lock;
2350  }
2351 #endif
2352  else {
2353  lck = __kmp_lookup_user_lock( user_lock, "omp_unset_nest_lock" );
2354  }
2355 
2356 #if USE_ITT_BUILD
2357  __kmp_itt_lock_releasing( lck );
2358 #endif /* USE_ITT_BUILD */
2359 
2360  int release_status;
2361  release_status = RELEASE_NESTED_LOCK( lck, gtid );
2362 #if OMPT_SUPPORT && OMPT_BLAME
2363  if (ompt_enabled) {
2364  if (release_status == KMP_LOCK_RELEASED) {
2365  if (ompt_callbacks.ompt_callback(ompt_event_release_nest_lock_last)) {
2366  ompt_callbacks.ompt_callback(ompt_event_release_nest_lock_last)(
2367  (uint64_t) lck);
2368  }
2369  } else if (ompt_callbacks.ompt_callback(ompt_event_release_nest_lock_prev)) {
2370  ompt_callbacks.ompt_callback(ompt_event_release_nest_lock_prev)(
2371  (uint64_t) lck);
2372  }
2373  }
2374 #endif
2375 
2376 #endif // KMP_USE_DYNAMIC_LOCK
2377 }
2378 
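/* Illustrative sketch (not part of kmp_csupport.c): the nesting protocol behind
 * the fast release path in __kmpc_unset_nest_lock() above. A nestable lock keeps
 * an owner word plus a depth counter; re-acquisition by the owner only bumps the
 * depth, and a release frees the lock only when the depth returns to zero. Types
 * and names are hypothetical, and the real code uses an atomic store and a memory
 * fence (TCW_4 / KMP_MB) rather than plain assignments. */
typedef struct {
    int poll;          /* 0 == free, otherwise an owner token            */
    int depth_locked;  /* how many times the owner has acquired the lock */
} sketch_nest_lock_t;

static void sketch_nest_release(sketch_nest_lock_t *l)
{
    if (--l->depth_locked == 0) {
        l->poll = 0;   /* outermost release: actually free the lock */
    }
    /* inner releases only decrement the depth, as in the TAS fast path above */
}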
2379 /* try to acquire the lock */
2380 int
2381 __kmpc_test_lock( ident_t *loc, kmp_int32 gtid, void **user_lock )
2382 {
2383  KMP_COUNT_BLOCK(OMP_test_lock);
2384 
2385 #if KMP_USE_DYNAMIC_LOCK
2386  int rc;
2387  int tag = KMP_EXTRACT_D_TAG(user_lock);
2388 # if USE_ITT_BUILD
2389  __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2390 # endif
2391 # if KMP_USE_INLINED_TAS
2392  if (tag == locktag_tas && !__kmp_env_consistency_check) {
2393  KMP_TEST_TAS_LOCK(user_lock, gtid, rc);
2394  } else
2395 # elif KMP_USE_INLINED_FUTEX
2396  if (tag == locktag_futex && !__kmp_env_consistency_check) {
2397  KMP_TEST_FUTEX_LOCK(user_lock, gtid, rc);
2398  } else
2399 # endif
2400  {
2401  rc = __kmp_direct_test[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2402  }
2403  if (rc) {
2404 # if USE_ITT_BUILD
2405  __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2406 # endif
2407  return FTN_TRUE;
2408  } else {
2409 # if USE_ITT_BUILD
2410  __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
2411 # endif
2412  return FTN_FALSE;
2413  }
2414 
2415 #else // KMP_USE_DYNAMIC_LOCK
2416 
2417  kmp_user_lock_p lck;
2418  int rc;
2419 
2420  if ( ( __kmp_user_lock_kind == lk_tas )
2421  && ( sizeof( lck->tas.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
2422  lck = (kmp_user_lock_p)user_lock;
2423  }
2424 #if KMP_USE_FUTEX
2425  else if ( ( __kmp_user_lock_kind == lk_futex )
2426  && ( sizeof( lck->futex.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
2427  lck = (kmp_user_lock_p)user_lock;
2428  }
2429 #endif
2430  else {
2431  lck = __kmp_lookup_user_lock( user_lock, "omp_test_lock" );
2432  }
2433 
2434 #if USE_ITT_BUILD
2435  __kmp_itt_lock_acquiring( lck );
2436 #endif /* USE_ITT_BUILD */
2437 
2438  rc = TEST_LOCK( lck, gtid );
2439 #if USE_ITT_BUILD
2440  if ( rc ) {
2441  __kmp_itt_lock_acquired( lck );
2442  } else {
2443  __kmp_itt_lock_cancelled( lck );
2444  }
2445 #endif /* USE_ITT_BUILD */
2446  return ( rc ? FTN_TRUE : FTN_FALSE );
2447 
2448  /* Can't use serial interval since not block structured */
2449 
2450 #endif // KMP_USE_DYNAMIC_LOCK
2451 }
2452 
2453 /* try to acquire the lock */
2454 int
2455 __kmpc_test_nest_lock( ident_t *loc, kmp_int32 gtid, void **user_lock )
2456 {
2457 #if KMP_USE_DYNAMIC_LOCK
2458  int rc;
2459 # if USE_ITT_BUILD
2460  __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2461 # endif
2462  rc = KMP_D_LOCK_FUNC(user_lock, test)((kmp_dyna_lock_t *)user_lock, gtid);
2463 # if USE_ITT_BUILD
2464  if (rc) {
2465  __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2466  } else {
2467  __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
2468  }
2469 # endif
2470  return rc;
2471 
2472 #else // KMP_USE_DYNAMIC_LOCK
2473 
2474  kmp_user_lock_p lck;
2475  int rc;
2476 
2477  if ( ( __kmp_user_lock_kind == lk_tas ) && ( sizeof( lck->tas.lk.poll )
2478  + sizeof( lck->tas.lk.depth_locked ) <= OMP_NEST_LOCK_T_SIZE ) ) {
2479  lck = (kmp_user_lock_p)user_lock;
2480  }
2481 #if KMP_USE_FUTEX
2482  else if ( ( __kmp_user_lock_kind == lk_futex )
2483  && ( sizeof( lck->futex.lk.poll ) + sizeof( lck->futex.lk.depth_locked )
2484  <= OMP_NEST_LOCK_T_SIZE ) ) {
2485  lck = (kmp_user_lock_p)user_lock;
2486  }
2487 #endif
2488  else {
2489  lck = __kmp_lookup_user_lock( user_lock, "omp_test_nest_lock" );
2490  }
2491 
2492 #if USE_ITT_BUILD
2493  __kmp_itt_lock_acquiring( lck );
2494 #endif /* USE_ITT_BUILD */
2495 
2496  rc = TEST_NESTED_LOCK( lck, gtid );
2497 #if USE_ITT_BUILD
2498  if ( rc ) {
2499  __kmp_itt_lock_acquired( lck );
2500  } else {
2501  __kmp_itt_lock_cancelled( lck );
2502  }
2503 #endif /* USE_ITT_BUILD */
2504  return rc;
2505 
2506  /* Can't use serial interval since not block structured */
2507 
2508 #endif // KMP_USE_DYNAMIC_LOCK
2509 }
2510 
2511 
2512 /*--------------------------------------------------------------------------------------------------------------------*/
2513 
2514 /*
2515  * Interface to fast scalable reduce methods routines
2516  */
2517 
2518 // keep the selected method in a thread-local structure for cross-function usage: it will be used in the __kmpc_end_reduce* functions;
2519 // an alternative would be to re-determine the method again in the __kmpc_end_reduce* functions (a new prototype would be required then)
2520 // AT: which solution is better?
2521 #define __KMP_SET_REDUCTION_METHOD(gtid,rmethod) \
2522  ( ( __kmp_threads[ ( gtid ) ] -> th.th_local.packed_reduction_method ) = ( rmethod ) )
2523 
2524 #define __KMP_GET_REDUCTION_METHOD(gtid) \
2525  ( __kmp_threads[ ( gtid ) ] -> th.th_local.packed_reduction_method )
2526 
2527 // description of the packed_reduction_method variable: look at the macros in kmp.h
2528 
2529 
2530 // used in a critical section reduce block
2531 static __forceinline void
2532 __kmp_enter_critical_section_reduce_block( ident_t * loc, kmp_int32 global_tid, kmp_critical_name * crit ) {
2533 
2534  // this lock was visible to a customer and to the threading profile tool as a serial overhead span
2535  // (although it's used for an internal purpose only)
2536  // why was it visible in previous implementation?
2537  // should we keep it visible in new reduce block?
2538  kmp_user_lock_p lck;
2539 
2540 #if KMP_USE_DYNAMIC_LOCK
2541 
2542  kmp_dyna_lock_t *lk = (kmp_dyna_lock_t *)crit;
2543  // Check if it is initialized.
2544  if (*lk == 0) {
2545  if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
2546  KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)crit, 0, KMP_GET_D_TAG(__kmp_user_lock_seq));
2547  } else {
2548  __kmp_init_indirect_csptr(crit, loc, global_tid, KMP_GET_I_TAG(__kmp_user_lock_seq));
2549  }
2550  }
2551  // Branch for accessing the actual lock object and set operation. This branching is inevitable since
2552  // this lock initialization does not follow the normal dispatch path (lock table is not used).
2553  if (KMP_EXTRACT_D_TAG(lk) != 0) {
2554  lck = (kmp_user_lock_p)lk;
2555  KMP_DEBUG_ASSERT(lck != NULL);
2556  if (__kmp_env_consistency_check) {
2557  __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
2558  }
2559  KMP_D_LOCK_FUNC(lk, set)(lk, global_tid);
2560  } else {
2561  kmp_indirect_lock_t *ilk = *((kmp_indirect_lock_t **)lk);
2562  lck = ilk->lock;
2563  KMP_DEBUG_ASSERT(lck != NULL);
2564  if (__kmp_env_consistency_check) {
2565  __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
2566  }
2567  KMP_I_LOCK_FUNC(ilk, set)(lck, global_tid);
2568  }
2569 
2570 #else // KMP_USE_DYNAMIC_LOCK
2571 
2572  // We know that the fast reduction code is only emitted by Intel compilers
2573  // with 32 byte critical sections. If there isn't enough space, then we
2574  // have to use a pointer.
2575  if ( __kmp_base_user_lock_size <= INTEL_CRITICAL_SIZE ) {
2576  lck = (kmp_user_lock_p)crit;
2577  }
2578  else {
2579  lck = __kmp_get_critical_section_ptr( crit, loc, global_tid );
2580  }
2581  KMP_DEBUG_ASSERT( lck != NULL );
2582 
2583  if ( __kmp_env_consistency_check )
2584  __kmp_push_sync( global_tid, ct_critical, loc, lck );
2585 
2586  __kmp_acquire_user_lock_with_checks( lck, global_tid );
2587 
2588 #endif // KMP_USE_DYNAMIC_LOCK
2589 }
2590 
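/* Illustrative sketch (not part of kmp_csupport.c): how the dynamic-lock branch
 * above distinguishes the two layouts. With KMP_USE_DYNAMIC_LOCK the critical-
 * section word either *is* the lock (a "direct" lock, whose low-order tag bits
 * are non-zero) or it stores a pointer to a separately allocated indirect lock
 * (tag bits zero), which is why KMP_EXTRACT_D_TAG() is consulted before choosing
 * KMP_D_LOCK_FUNC() or KMP_I_LOCK_FUNC(). The helper below is hypothetical. */
#include <stdint.h>

static int sketch_is_direct_lock(uintptr_t lock_word, uintptr_t tag_mask)
{
    /* non-zero tag bits => the word itself encodes a direct (e.g. TAS) lock;
     * zero tag bits     => the word holds a pointer to an indirect lock      */
    return (lock_word & tag_mask) != 0;
}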
2591 // used in a critical section reduce block
2592 static __forceinline void
2593 __kmp_end_critical_section_reduce_block( ident_t * loc, kmp_int32 global_tid, kmp_critical_name * crit ) {
2594 
2595  kmp_user_lock_p lck;
2596 
2597 #if KMP_USE_DYNAMIC_LOCK
2598 
2599  if (KMP_IS_D_LOCK(__kmp_user_lock_seq)) {
2600  lck = (kmp_user_lock_p)crit;
2601  if (__kmp_env_consistency_check)
2602  __kmp_pop_sync(global_tid, ct_critical, loc);
2603  KMP_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
2604  } else {
2605  kmp_indirect_lock_t *ilk = (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
2606  if (__kmp_env_consistency_check)
2607  __kmp_pop_sync(global_tid, ct_critical, loc);
2608  KMP_I_LOCK_FUNC(ilk, unset)(ilk->lock, global_tid);
2609  }
2610 
2611 #else // KMP_USE_DYNAMIC_LOCK
2612 
2613  // We know that the fast reduction code is only emitted by Intel compilers with 32 byte critical
2614  // sections. If there isn't enough space, then we have to use a pointer.
2615  if ( __kmp_base_user_lock_size > 32 ) {
2616  lck = *( (kmp_user_lock_p *) crit );
2617  KMP_ASSERT( lck != NULL );
2618  } else {
2619  lck = (kmp_user_lock_p) crit;
2620  }
2621 
2622  if ( __kmp_env_consistency_check )
2623  __kmp_pop_sync( global_tid, ct_critical, loc );
2624 
2625  __kmp_release_user_lock_with_checks( lck, global_tid );
2626 
2627 #endif // KMP_USE_DYNAMIC_LOCK
2628 } // __kmp_end_critical_section_reduce_block
2629 
2630 
2631 /* 2.a.i. Reduce Block without a terminating barrier */
2645 kmp_int32
2646 __kmpc_reduce_nowait(
2647  ident_t *loc, kmp_int32 global_tid,
2648  kmp_int32 num_vars, size_t reduce_size, void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
2649  kmp_critical_name *lck ) {
2650 
2651  KMP_COUNT_BLOCK(REDUCE_nowait);
2652  int retval = 0;
2653  PACKED_REDUCTION_METHOD_T packed_reduction_method;
2654 #if OMP_40_ENABLED
2655  kmp_team_t *team;
2656  kmp_info_t *th;
2657  int teams_swapped = 0, task_state;
2658 #endif
2659  KA_TRACE( 10, ( "__kmpc_reduce_nowait() enter: called T#%d\n", global_tid ) );
2660 
2661  // why do we need this initialization here at all?
2662  // Reduction clause can not be used as a stand-alone directive.
2663 
2664  // do not call __kmp_serial_initialize(), it will be called by __kmp_parallel_initialize() if needed
2665  // possible detection of false-positive race by the threadchecker ???
2666  if( ! TCR_4( __kmp_init_parallel ) )
2667  __kmp_parallel_initialize();
2668 
2669  // check correctness of reduce block nesting
2670 #if KMP_USE_DYNAMIC_LOCK
2671  if ( __kmp_env_consistency_check )
2672  __kmp_push_sync( global_tid, ct_reduce, loc, NULL, 0 );
2673 #else
2674  if ( __kmp_env_consistency_check )
2675  __kmp_push_sync( global_tid, ct_reduce, loc, NULL );
2676 #endif
2677 
2678 #if OMP_40_ENABLED
2679  th = __kmp_thread_from_gtid(global_tid);
2680  if( th->th.th_teams_microtask ) { // AC: check if we are inside the teams construct?
2681  team = th->th.th_team;
2682  if( team->t.t_level == th->th.th_teams_level ) {
2683  // this is reduction at teams construct
2684  KMP_DEBUG_ASSERT(!th->th.th_info.ds.ds_tid); // AC: check that tid == 0
2685  // Let's swap teams temporarily for the reduction barrier
2686  teams_swapped = 1;
2687  th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2688  th->th.th_team = team->t.t_parent;
2689  th->th.th_team_nproc = th->th.th_team->t.t_nproc;
2690  th->th.th_task_team = th->th.th_team->t.t_task_team[0];
2691  task_state = th->th.th_task_state;
2692  th->th.th_task_state = 0;
2693  }
2694  }
2695 #endif // OMP_40_ENABLED
2696 
2697  // packed_reduction_method value will be reused by __kmp_end_reduce* function, the value should be kept in a variable
2698  // the variable should be either a construct-specific or thread-specific property, not a team specific property
2699  // (a thread can reach the next reduce block on the next construct, reduce method may differ on the next construct)
2700  // an ident_t "loc" parameter could be used as a construct-specific property (what if loc == 0?)
2701  // (if both construct-specific and team-specific variables were shared, then unnecessary extra syncs would be needed)
2702  // a thread-specific variable is better regarding two issues above (next construct and extra syncs)
2703  // a thread-specific "th_local.reduction_method" variable is used currently
2704  // each thread executes the 'determine' and 'set' lines (no need to restrict this to one thread; doing it in every thread avoids unnecessary extra syncs)
2705 
2706  packed_reduction_method = __kmp_determine_reduction_method( loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck );
2707  __KMP_SET_REDUCTION_METHOD( global_tid, packed_reduction_method );
2708 
2709  if( packed_reduction_method == critical_reduce_block ) {
2710 
2711  __kmp_enter_critical_section_reduce_block( loc, global_tid, lck );
2712  retval = 1;
2713 
2714  } else if( packed_reduction_method == empty_reduce_block ) {
2715 
2716  // usage: if team size == 1, no synchronization is required ( Intel platforms only )
2717  retval = 1;
2718 
2719  } else if( packed_reduction_method == atomic_reduce_block ) {
2720 
2721  retval = 2;
2722 
2723  // all threads should do this pop here (because __kmpc_end_reduce_nowait() won't be called by the code gen)
2724  // (this is not ideal: the checking block is closed by this 'pop' before the atomic operation
2725  // has actually executed; the atomic runs slightly later, literally on the next instruction)
2726  if ( __kmp_env_consistency_check )
2727  __kmp_pop_sync( global_tid, ct_reduce, loc );
2728 
2729  } else if( TEST_REDUCTION_METHOD( packed_reduction_method, tree_reduce_block ) ) {
2730 
2731  //AT: performance issue: a real barrier here
2732  //AT: (if master goes slow, other threads are blocked here waiting for the master to come and release them)
2733  //AT: (it's not what a customer might expect when specifying the NOWAIT clause)
2734  //AT: (specifying NOWAIT won't improve performance here, which will be confusing to a customer)
2735  //AT: another implementation of *barrier_gather*nowait() (or some other design) might go faster
2736  // and be more in line with the sense of NOWAIT
2737  //AT: TODO: run the EPCC benchmark and compare times
2738 
2739  // this barrier should be invisible to a customer and to the threading profile tool
2740  // (it's neither a terminating barrier nor customer's code, it's used for an internal purpose)
2741 #if USE_ITT_NOTIFY
2742  __kmp_threads[global_tid]->th.th_ident = loc;
2743 #endif
2744  retval = __kmp_barrier( UNPACK_REDUCTION_BARRIER( packed_reduction_method ), global_tid, FALSE, reduce_size, reduce_data, reduce_func );
2745  retval = ( retval != 0 ) ? ( 0 ) : ( 1 );
2746 
2747  // all other workers except master should do this pop here
2748  // ( none of other workers will get to __kmpc_end_reduce_nowait() )
2749  if ( __kmp_env_consistency_check ) {
2750  if( retval == 0 ) {
2751  __kmp_pop_sync( global_tid, ct_reduce, loc );
2752  }
2753  }
2754 
2755  } else {
2756 
2757  // should never reach this block
2758  KMP_ASSERT( 0 ); // "unexpected method"
2759 
2760  }
2761 #if OMP_40_ENABLED
2762  if( teams_swapped ) {
2763  // Restore thread structure
2764  th->th.th_info.ds.ds_tid = 0;
2765  th->th.th_team = team;
2766  th->th.th_team_nproc = team->t.t_nproc;
2767  th->th.th_task_team = team->t.t_task_team[task_state];
2768  th->th.th_task_state = task_state;
2769  }
2770 #endif
2771  KA_TRACE( 10, ( "__kmpc_reduce_nowait() exit: called T#%d: method %08x, returns %08x\n", global_tid, packed_reduction_method, retval ) );
2772 
2773  return retval;
2774 }
2775 
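/* Illustrative sketch (not part of kmp_csupport.c): the calling pattern that the
 * comments above describe for compiler-generated reduction code. The accumulator,
 * combine logic and function name are hypothetical, actual code generation
 * differs in detail between compilers, and the sketch assumes the kmp.h types
 * already included by this file. */
static void sketch_reduce_nowait_pattern(ident_t *loc, kmp_int32 gtid,
                                         double *global_sum, double local,
                                         void (*reduce_func)(void *, void *),
                                         kmp_critical_name *crit_name)
{
    kmp_int32 res = __kmpc_reduce_nowait(loc, gtid, 1, sizeof(double),
                                         &local, reduce_func, crit_name);
    if (res == 1) {          /* critical / empty / tree(master): combine, then end */
        *global_sum += local;
        __kmpc_end_reduce_nowait(loc, gtid, crit_name);
    } else if (res == 2) {   /* atomic method: combine atomically; end is NOT called */
        #pragma omp atomic
        *global_sum += local;
    }                        /* res == 0: tree method, non-master worker; nothing to do */
}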
2784 void
2785 __kmpc_end_reduce_nowait( ident_t *loc, kmp_int32 global_tid, kmp_critical_name *lck ) {
2786 
2787  PACKED_REDUCTION_METHOD_T packed_reduction_method;
2788 
2789  KA_TRACE( 10, ( "__kmpc_end_reduce_nowait() enter: called T#%d\n", global_tid ) );
2790 
2791  packed_reduction_method = __KMP_GET_REDUCTION_METHOD( global_tid );
2792 
2793  if( packed_reduction_method == critical_reduce_block ) {
2794 
2795  __kmp_end_critical_section_reduce_block( loc, global_tid, lck );
2796 
2797  } else if( packed_reduction_method == empty_reduce_block ) {
2798 
2799  // usage: if team size == 1, no synchronization is required ( on Intel platforms only )
2800 
2801  } else if( packed_reduction_method == atomic_reduce_block ) {
2802 
2803  // neither master nor other workers should get here
2804  // (code gen does not generate this call in case 2: atomic reduce block)
2805  // actually it would be better to remove this else-if entirely;
2806  // after removal this value would be caught by the 'else' branch and trigger the assert
2807 
2808  } else if( TEST_REDUCTION_METHOD( packed_reduction_method, tree_reduce_block ) ) {
2809 
2810  // only master gets here
2811 
2812  } else {
2813 
2814  // should never reach this block
2815  KMP_ASSERT( 0 ); // "unexpected method"
2816 
2817  }
2818 
2819  if ( __kmp_env_consistency_check )
2820  __kmp_pop_sync( global_tid, ct_reduce, loc );
2821 
2822  KA_TRACE( 10, ( "__kmpc_end_reduce_nowait() exit: called T#%d: method %08x\n", global_tid, packed_reduction_method ) );
2823 
2824  return;
2825 }
2826 
2827 /* 2.a.ii. Reduce Block with a terminating barrier */
2828 
2842 kmp_int32
2843 __kmpc_reduce(
2844  ident_t *loc, kmp_int32 global_tid,
2845  kmp_int32 num_vars, size_t reduce_size, void *reduce_data,
2846  void (*reduce_func)(void *lhs_data, void *rhs_data),
2847  kmp_critical_name *lck )
2848 {
2849  KMP_COUNT_BLOCK(REDUCE_wait);
2850  int retval = 0;
2851  PACKED_REDUCTION_METHOD_T packed_reduction_method;
2852 
2853  KA_TRACE( 10, ( "__kmpc_reduce() enter: called T#%d\n", global_tid ) );
2854 
2855  // why do we need this initialization here at all?
2856  // Reduction clause can not be a stand-alone directive.
2857 
2858  // do not call __kmp_serial_initialize(), it will be called by __kmp_parallel_initialize() if needed
2859  // possible detection of false-positive race by the threadchecker ???
2860  if( ! TCR_4( __kmp_init_parallel ) )
2861  __kmp_parallel_initialize();
2862 
2863  // check correctness of reduce block nesting
2864 #if KMP_USE_DYNAMIC_LOCK
2865  if ( __kmp_env_consistency_check )
2866  __kmp_push_sync( global_tid, ct_reduce, loc, NULL, 0 );
2867 #else
2868  if ( __kmp_env_consistency_check )
2869  __kmp_push_sync( global_tid, ct_reduce, loc, NULL );
2870 #endif
2871 
2872  packed_reduction_method = __kmp_determine_reduction_method( loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck );
2873  __KMP_SET_REDUCTION_METHOD( global_tid, packed_reduction_method );
2874 
2875  if( packed_reduction_method == critical_reduce_block ) {
2876 
2877  __kmp_enter_critical_section_reduce_block( loc, global_tid, lck );
2878  retval = 1;
2879 
2880  } else if( packed_reduction_method == empty_reduce_block ) {
2881 
2882  // usage: if team size == 1, no synchronization is required ( Intel platforms only )
2883  retval = 1;
2884 
2885  } else if( packed_reduction_method == atomic_reduce_block ) {
2886 
2887  retval = 2;
2888 
2889  } else if( TEST_REDUCTION_METHOD( packed_reduction_method, tree_reduce_block ) ) {
2890 
2891  //case tree_reduce_block:
2892  // this barrier should be visible to a customer and to the threading profile tool
2893  // (it's a terminating barrier on constructs if NOWAIT not specified)
2894 #if USE_ITT_NOTIFY
2895  __kmp_threads[global_tid]->th.th_ident = loc; // needed for correct notification of frames
2896 #endif
2897  retval = __kmp_barrier( UNPACK_REDUCTION_BARRIER( packed_reduction_method ), global_tid, TRUE, reduce_size, reduce_data, reduce_func );
2898  retval = ( retval != 0 ) ? ( 0 ) : ( 1 );
2899 
2900  // all other workers except master should do this pop here
2901  // ( none of other workers except master will enter __kmpc_end_reduce() )
2902  if ( __kmp_env_consistency_check ) {
2903  if( retval == 0 ) { // 0: all other workers; 1: master
2904  __kmp_pop_sync( global_tid, ct_reduce, loc );
2905  }
2906  }
2907 
2908  } else {
2909 
2910  // should never reach this block
2911  KMP_ASSERT( 0 ); // "unexpected method"
2912 
2913  }
2914 
2915  KA_TRACE( 10, ( "__kmpc_reduce() exit: called T#%d: method %08x, returns %08x\n", global_tid, packed_reduction_method, retval ) );
2916 
2917  return retval;
2918 }
2919 
2929 void
2930 __kmpc_end_reduce( ident_t *loc, kmp_int32 global_tid, kmp_critical_name *lck ) {
2931 
2932  PACKED_REDUCTION_METHOD_T packed_reduction_method;
2933 
2934  KA_TRACE( 10, ( "__kmpc_end_reduce() enter: called T#%d\n", global_tid ) );
2935 
2936  packed_reduction_method = __KMP_GET_REDUCTION_METHOD( global_tid );
2937 
2938  // this barrier should be visible to a customer and to the threading profile tool
2939  // (it's a terminating barrier on constructs if NOWAIT not specified)
2940 
2941  if( packed_reduction_method == critical_reduce_block ) {
2942 
2943  __kmp_end_critical_section_reduce_block( loc, global_tid, lck );
2944 
2945  // TODO: implicit barrier: should be exposed
2946 #if USE_ITT_NOTIFY
2947  __kmp_threads[global_tid]->th.th_ident = loc;
2948 #endif
2949  __kmp_barrier( bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL );
2950 
2951  } else if( packed_reduction_method == empty_reduce_block ) {
2952 
2953  // usage: if team size == 1, no synchronization is required ( Intel platforms only )
2954 
2955  // TODO: implicit barrier: should be exposed
2956 #if USE_ITT_NOTIFY
2957  __kmp_threads[global_tid]->th.th_ident = loc;
2958 #endif
2959  __kmp_barrier( bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL );
2960 
2961  } else if( packed_reduction_method == atomic_reduce_block ) {
2962 
2963  // TODO: implicit barrier: should be exposed
2964 #if USE_ITT_NOTIFY
2965  __kmp_threads[global_tid]->th.th_ident = loc;
2966 #endif
2967  __kmp_barrier( bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL );
2968 
2969  } else if( TEST_REDUCTION_METHOD( packed_reduction_method, tree_reduce_block ) ) {
2970 
2971  // only master executes here (master releases all other workers)
2972  __kmp_end_split_barrier( UNPACK_REDUCTION_BARRIER( packed_reduction_method ), global_tid );
2973 
2974  } else {
2975 
2976  // should never reach this block
2977  KMP_ASSERT( 0 ); // "unexpected method"
2978 
2979  }
2980 
2981  if ( __kmp_env_consistency_check )
2982  __kmp_pop_sync( global_tid, ct_reduce, loc );
2983 
2984  KA_TRACE( 10, ( "__kmpc_end_reduce() exit: called T#%d: method %08x\n", global_tid, packed_reduction_method ) );
2985 
2986  return;
2987 }
2988 
2989 #undef __KMP_GET_REDUCTION_METHOD
2990 #undef __KMP_SET_REDUCTION_METHOD
2991 
2992 /*-- end of interface to fast scalable reduce routines ---------------------------------------------------------------*/
2993 
2994 kmp_uint64
2995 __kmpc_get_taskid() {
2996 
2997  kmp_int32 gtid;
2998  kmp_info_t * thread;
2999 
3000  gtid = __kmp_get_gtid();
3001  if ( gtid < 0 ) {
3002  return 0;
3003  }; // if
3004  thread = __kmp_thread_from_gtid( gtid );
3005  return thread->th.th_current_task->td_task_id;
3006 
3007 } // __kmpc_get_taskid
3008 
3009 
3010 kmp_uint64
3011 __kmpc_get_parent_taskid() {
3012 
3013  kmp_int32 gtid;
3014  kmp_info_t * thread;
3015  kmp_taskdata_t * parent_task;
3016 
3017  gtid = __kmp_get_gtid();
3018  if ( gtid < 0 ) {
3019  return 0;
3020  }; // if
3021  thread = __kmp_thread_from_gtid( gtid );
3022  parent_task = thread->th.th_current_task->td_parent;
3023  return ( parent_task == NULL ? 0 : parent_task->td_task_id );
3024 
3025 } // __kmpc_get_parent_taskid
3026 
3027 void __kmpc_place_threads(int nS, int sO, int nC, int cO, int nT)
3028 {
3029  if ( ! __kmp_init_serial ) {
3030  __kmp_serial_initialize();
3031  }
3032  __kmp_place_num_sockets = nS;
3033  __kmp_place_socket_offset = sO;
3034  __kmp_place_num_cores = nC;
3035  __kmp_place_core_offset = cO;
3036  __kmp_place_num_threads_per_core = nT;
3037 }
3038 
3039 #if OMP_45_ENABLED
3040 
3051 void
3052 __kmpc_doacross_init(ident_t *loc, int gtid, int num_dims, struct kmp_dim * dims)
3053 {
3054  int j, idx;
3055  kmp_int64 last, trace_count;
3056  kmp_info_t *th = __kmp_threads[gtid];
3057  kmp_team_t *team = th->th.th_team;
3058  kmp_uint32 *flags;
3059  kmp_disp_t *pr_buf = th->th.th_dispatch;
3060  dispatch_shared_info_t *sh_buf;
3061 
3062  KA_TRACE(20,("__kmpc_doacross_init() enter: called T#%d, num dims %d, active %d\n",
3063  gtid, num_dims, !team->t.t_serialized));
3064  KMP_DEBUG_ASSERT(dims != NULL);
3065  KMP_DEBUG_ASSERT(num_dims > 0);
3066 
3067  if( team->t.t_serialized ) {
3068  KA_TRACE(20,("__kmpc_doacross_init() exit: serialized team\n"));
3069  return; // no dependencies if team is serialized
3070  }
3071  KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
3072  idx = pr_buf->th_doacross_buf_idx++; // Increment index of shared buffer for the next loop
3073  sh_buf = &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
3074 
3075  // Save bounds info into allocated private buffer
3076  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info == NULL);
3077  pr_buf->th_doacross_info =
3078  (kmp_int64*)__kmp_thread_malloc(th, sizeof(kmp_int64)*(4 * num_dims + 1));
3079  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
3080  pr_buf->th_doacross_info[0] = (kmp_int64)num_dims; // first element is number of dimensions
3081  // Also save the address of num_done so it can be accessed later without knowing the buffer index
3082  pr_buf->th_doacross_info[1] = (kmp_int64)&sh_buf->doacross_num_done;
3083  pr_buf->th_doacross_info[2] = dims[0].lo;
3084  pr_buf->th_doacross_info[3] = dims[0].up;
3085  pr_buf->th_doacross_info[4] = dims[0].st;
3086  last = 5;
3087  for( j = 1; j < num_dims; ++j ) {
3088  kmp_int64 range_length; // To keep ranges of all dimensions but the first dims[0]
3089  if( dims[j].st == 1 ) { // most common case
3090  // AC: should we care of ranges bigger than LLONG_MAX? (not for now)
3091  range_length = dims[j].up - dims[j].lo + 1;
3092  } else {
3093  if( dims[j].st > 0 ) {
3094  KMP_DEBUG_ASSERT(dims[j].up > dims[j].lo);
3095  range_length = (kmp_uint64)(dims[j].up - dims[j].lo) / dims[j].st + 1;
3096  } else { // negative increment
3097  KMP_DEBUG_ASSERT(dims[j].lo > dims[j].up);
3098  range_length = (kmp_uint64)(dims[j].lo - dims[j].up) / (-dims[j].st) + 1;
3099  }
3100  }
3101  pr_buf->th_doacross_info[last++] = range_length;
3102  pr_buf->th_doacross_info[last++] = dims[j].lo;
3103  pr_buf->th_doacross_info[last++] = dims[j].up;
3104  pr_buf->th_doacross_info[last++] = dims[j].st;
3105  }
3106 
3107  // Compute total trip count.
3108  // Start with range of dims[0] which we don't need to keep in the buffer.
3109  if( dims[0].st == 1 ) { // most common case
3110  trace_count = dims[0].up - dims[0].lo + 1;
3111  } else if( dims[0].st > 0 ) {
3112  KMP_DEBUG_ASSERT(dims[0].up > dims[0].lo);
3113  trace_count = (kmp_uint64)(dims[0].up - dims[0].lo) / dims[0].st + 1;
3114  } else { // negative increment
3115  KMP_DEBUG_ASSERT(dims[0].lo > dims[0].up);
3116  trace_count = (kmp_uint64)(dims[0].lo - dims[0].up) / (-dims[0].st) + 1;
3117  }
3118  for( j = 1; j < num_dims; ++j ) {
3119  trace_count *= pr_buf->th_doacross_info[4 * j + 1]; // use kept ranges
3120  }
3121  KMP_DEBUG_ASSERT(trace_count > 0);
3122 
3123  // Check if shared buffer is not occupied by other loop (idx - __kmp_dispatch_num_buffers)
3124  if( idx != sh_buf->doacross_buf_idx ) {
3125  // Shared buffer is occupied, wait for it to be free
3126  __kmp_wait_yield_4( (kmp_uint32*)&sh_buf->doacross_buf_idx, idx, __kmp_eq_4, NULL );
3127  }
3128  // Check if we are the first thread. After the CAS the first thread gets 0 (NULL),
3129  // others get 1 if initialization is still in progress, or the allocated pointer otherwise.
3130  flags = (kmp_uint32*)KMP_COMPARE_AND_STORE_RET64(
3131  (kmp_int64*)&sh_buf->doacross_flags,NULL,(kmp_int64)1);
3132  if( flags == NULL ) {
3133  // we are the first thread, allocate the array of flags
3134  kmp_int64 size = trace_count / 8 + 8; // in bytes, use single bit per iteration
3135  sh_buf->doacross_flags = (kmp_uint32*)__kmp_thread_calloc(th, size, 1);
3136  } else if( (kmp_int64)flags == 1 ) {
3137  // initialization is still in progress, need to wait
3138  while( (volatile kmp_int64)sh_buf->doacross_flags == 1 ) {
3139  KMP_YIELD(TRUE);
3140  }
3141  }
3142  KMP_DEBUG_ASSERT((kmp_int64)sh_buf->doacross_flags > 1); // check value of pointer
3143  pr_buf->th_doacross_flags = sh_buf->doacross_flags; // save private copy in order to not
3144  // touch shared buffer on each iteration
3145  KA_TRACE(20,("__kmpc_doacross_init() exit: T#%d\n", gtid));
3146 }
3147 
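/* Illustrative sketch (not part of kmp_csupport.c): layout of the private
 * th_doacross_info buffer that __kmpc_doacross_init() fills in above, for a loop
 * nest of num_dims dimensions:
 *
 *   [0]                 num_dims
 *   [1]                 address of sh_buf->doacross_num_done (stored as kmp_int64)
 *   [2], [3], [4]       lo, up, st of dims[0]
 *   [4*j+1 .. 4*j+4]    range_length, lo, up, st of dims[j]   for j >= 1
 *
 * For a 2-D nest the buffer therefore holds 4*2+1 == 9 kmp_int64 values, matching
 * the sizeof(kmp_int64)*(4 * num_dims + 1) allocation above. */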
3148 void
3149 __kmpc_doacross_wait(ident_t *loc, int gtid, long long *vec)
3150 {
3151  kmp_int32 shft, num_dims, i;
3152  kmp_uint32 flag;
3153  kmp_int64 iter_number; // iteration number of "collapsed" loop nest
3154  kmp_info_t *th = __kmp_threads[gtid];
3155  kmp_team_t *team = th->th.th_team;
3156  kmp_disp_t *pr_buf;
3157  kmp_int64 lo, up, st;
3158 
3159  KA_TRACE(20,("__kmpc_doacross_wait() enter: called T#%d\n", gtid));
3160  if( team->t.t_serialized ) {
3161  KA_TRACE(20,("__kmpc_doacross_wait() exit: serialized team\n"));
3162  return; // no dependencies if team is serialized
3163  }
3164 
3165  // calculate sequential iteration number and check out-of-bounds condition
3166  pr_buf = th->th.th_dispatch;
3167  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
3168  num_dims = pr_buf->th_doacross_info[0];
3169  lo = pr_buf->th_doacross_info[2];
3170  up = pr_buf->th_doacross_info[3];
3171  st = pr_buf->th_doacross_info[4];
3172  if( st == 1 ) { // most common case
3173  if( vec[0] < lo || vec[0] > up ) {
3174  KA_TRACE(20,(
3175  "__kmpc_doacross_wait() exit: T#%d iter %lld is out of bounds [%lld,%lld]\n",
3176  gtid, vec[0], lo, up));
3177  return;
3178  }
3179  iter_number = vec[0] - lo;
3180  } else if( st > 0 ) {
3181  if( vec[0] < lo || vec[0] > up ) {
3182  KA_TRACE(20,(
3183  "__kmpc_doacross_wait() exit: T#%d iter %lld is out of bounds [%lld,%lld]\n",
3184  gtid, vec[0], lo, up));
3185  return;
3186  }
3187  iter_number = (kmp_uint64)(vec[0] - lo) / st;
3188  } else { // negative increment
3189  if( vec[0] > lo || vec[0] < up ) {
3190  KA_TRACE(20,(
3191  "__kmpc_doacross_wait() exit: T#%d iter %lld is out of bounds [%lld,%lld]\n",
3192  gtid, vec[0], lo, up));
3193  return;
3194  }
3195  iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
3196  }
3197  for( i = 1; i < num_dims; ++i ) {
3198  kmp_int64 iter, ln;
3199  kmp_int32 j = i * 4;
3200  ln = pr_buf->th_doacross_info[j + 1];
3201  lo = pr_buf->th_doacross_info[j + 2];
3202  up = pr_buf->th_doacross_info[j + 3];
3203  st = pr_buf->th_doacross_info[j + 4];
3204  if( st == 1 ) {
3205  if( vec[i] < lo || vec[i] > up ) {
3206  KA_TRACE(20,(
3207  "__kmpc_doacross_wait() exit: T#%d iter %lld is out of bounds [%lld,%lld]\n",
3208  gtid, vec[i], lo, up));
3209  return;
3210  }
3211  iter = vec[i] - lo;
3212  } else if( st > 0 ) {
3213  if( vec[i] < lo || vec[i] > up ) {
3214  KA_TRACE(20,(
3215  "__kmpc_doacross_wait() exit: T#%d iter %lld is out of bounds [%lld,%lld]\n",
3216  gtid, vec[i], lo, up));
3217  return;
3218  }
3219  iter = (kmp_uint64)(vec[i] - lo) / st;
3220  } else { // st < 0
3221  if( vec[i] > lo || vec[i] < up ) {
3222  KA_TRACE(20,(
3223  "__kmpc_doacross_wait() exit: T#%d iter %lld is out of bounds [%lld,%lld]\n",
3224  gtid, vec[i], lo, up));
3225  return;
3226  }
3227  iter = (kmp_uint64)(lo - vec[i]) / (-st);
3228  }
3229  iter_number = iter + ln * iter_number;
3230  }
3231  shft = iter_number % 32; // use 32-bit granularity
3232  iter_number >>= 5; // divided by 32
3233  flag = 1 << shft;
3234  while( (flag & pr_buf->th_doacross_flags[iter_number]) == 0 ) {
3235  KMP_YIELD(TRUE);
3236  }
3237  KA_TRACE(20,("__kmpc_doacross_wait() exit: T#%d wait for iter %lld completed\n",
3238  gtid, (iter_number<<5)+shft));
3239 }
3240 
3241 void
3242 __kmpc_doacross_post(ident_t *loc, int gtid, long long *vec)
3243 {
3244  kmp_int32 shft, num_dims, i;
3245  kmp_uint32 flag;
3246  kmp_int64 iter_number; // iteration number of "collapsed" loop nest
3247  kmp_info_t *th = __kmp_threads[gtid];
3248  kmp_team_t *team = th->th.th_team;
3249  kmp_disp_t *pr_buf;
3250  kmp_int64 lo, st;
3251 
3252  KA_TRACE(20,("__kmpc_doacross_post() enter: called T#%d\n", gtid));
3253  if( team->t.t_serialized ) {
3254  KA_TRACE(20,("__kmpc_doacross_post() exit: serialized team\n"));
3255  return; // no dependencies if team is serialized
3256  }
3257 
3258  // calculate sequential iteration number (same as in "wait" but no out-of-bounds checks)
3259  pr_buf = th->th.th_dispatch;
3260  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
3261  num_dims = pr_buf->th_doacross_info[0];
3262  lo = pr_buf->th_doacross_info[2];
3263  st = pr_buf->th_doacross_info[4];
3264  if( st == 1 ) { // most common case
3265  iter_number = vec[0] - lo;
3266  } else if( st > 0 ) {
3267  iter_number = (kmp_uint64)(vec[0] - lo) / st;
3268  } else { // negative increment
3269  iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
3270  }
3271  for( i = 1; i < num_dims; ++i ) {
3272  kmp_int64 iter, ln;
3273  kmp_int32 j = i * 4;
3274  ln = pr_buf->th_doacross_info[j + 1];
3275  lo = pr_buf->th_doacross_info[j + 2];
3276  st = pr_buf->th_doacross_info[j + 4];
3277  if( st == 1 ) {
3278  iter = vec[i] - lo;
3279  } else if( st > 0 ) {
3280  iter = (kmp_uint64)(vec[i] - lo) / st;
3281  } else { // st < 0
3282  iter = (kmp_uint64)(lo - vec[i]) / (-st);
3283  }
3284  iter_number = iter + ln * iter_number;
3285  }
3286  shft = iter_number % 32; // use 32-bit granularity
3287  iter_number >>= 5; // divided by 32
3288  flag = 1 << shft;
3289  if( (flag & pr_buf->th_doacross_flags[iter_number]) == 0 )
3290  KMP_TEST_THEN_OR32( (kmp_int32*)&pr_buf->th_doacross_flags[iter_number], (kmp_int32)flag );
3291  KA_TRACE(20,("__kmpc_doacross_post() exit: T#%d iter %lld posted\n",
3292  gtid, (iter_number<<5)+shft));
3293 }
3294 
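/* Illustrative sketch (not part of kmp_csupport.c): a worked example of the flag
 * indexing shared by __kmpc_doacross_post() and __kmpc_doacross_wait() above.
 * Completed iterations are tracked one bit per linearized iteration in 32-bit
 * words of pr_buf->th_doacross_flags:
 *
 *   linearized iteration 70:  shft = 70 % 32 = 6,  word index = 70 >> 5 = 2
 *   post: th_doacross_flags[2] |= (1u << 6)        (via KMP_TEST_THEN_OR32)
 *   wait: spin (KMP_YIELD) until (th_doacross_flags[2] & (1u << 6)) != 0
 */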
3295 void
3296 __kmpc_doacross_fini(ident_t *loc, int gtid)
3297 {
3298  kmp_int64 num_done;
3299  kmp_info_t *th = __kmp_threads[gtid];
3300  kmp_team_t *team = th->th.th_team;
3301  kmp_disp_t *pr_buf = th->th.th_dispatch;
3302 
3303  KA_TRACE(20,("__kmpc_doacross_fini() enter: called T#%d\n", gtid));
3304  if( team->t.t_serialized ) {
3305  KA_TRACE(20,("__kmpc_doacross_fini() exit: serialized team %p\n", team));
3306  return; // nothing to do
3307  }
3308  num_done = KMP_TEST_THEN_INC64((kmp_int64*)pr_buf->th_doacross_info[1]) + 1;
3309  if( num_done == th->th.th_team_nproc ) {
3310  // we are the last thread, need to free shared resources
3311  int idx = pr_buf->th_doacross_buf_idx - 1;
3312  dispatch_shared_info_t *sh_buf = &team->t.t_disp_buffer[idx % __kmp_dispatch_num_buffers];
3313  KMP_DEBUG_ASSERT(pr_buf->th_doacross_info[1] == (kmp_int64)&sh_buf->doacross_num_done);
3314  KMP_DEBUG_ASSERT(num_done == (kmp_int64)sh_buf->doacross_num_done);
3315  KMP_DEBUG_ASSERT(idx == sh_buf->doacross_buf_idx);
3316  __kmp_thread_free(th, (void*)sh_buf->doacross_flags);
3317  sh_buf->doacross_flags = NULL;
3318  sh_buf->doacross_num_done = 0;
3319  sh_buf->doacross_buf_idx += __kmp_dispatch_num_buffers; // free buffer for future re-use
3320  }
3321  // free private resources (need to keep buffer index forever)
3322  __kmp_thread_free(th, (void*)pr_buf->th_doacross_info);
3323  pr_buf->th_doacross_info = NULL;
3324  KA_TRACE(20,("__kmpc_doacross_fini() exit: T#%d\n", gtid));
3325 }
3326 #endif
3327 
3328 // end of file //
3329 