LLVM OpenMP* Runtime Library
kmp_runtime.c
1 /*
2  * kmp_runtime.c -- KPTS runtime support library
3  */
4 
5 
6 //===----------------------------------------------------------------------===//
7 //
8 // The LLVM Compiler Infrastructure
9 //
10 // This file is dual licensed under the MIT and the University of Illinois Open
11 // Source Licenses. See LICENSE.txt for details.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 
16 #include "kmp.h"
17 #include "kmp_atomic.h"
18 #include "kmp_wrapper_getpid.h"
19 #include "kmp_environment.h"
20 #include "kmp_itt.h"
21 #include "kmp_str.h"
22 #include "kmp_settings.h"
23 #include "kmp_i18n.h"
24 #include "kmp_io.h"
25 #include "kmp_error.h"
26 #include "kmp_stats.h"
27 #include "kmp_wait_release.h"
28 
29 #if OMPT_SUPPORT
30 #include "ompt-specific.h"
31 #endif
32 
33 /* these are temporary issues to be dealt with */
34 #define KMP_USE_PRCTL 0
35 
36 #if KMP_OS_WINDOWS
37 #include <process.h>
38 #endif
39 
40 
41 #if defined(KMP_GOMP_COMPAT)
42 char const __kmp_version_alt_comp[] = KMP_VERSION_PREFIX "alternative compiler support: yes";
43 #endif /* defined(KMP_GOMP_COMPAT) */
44 
45 char const __kmp_version_omp_api[] = KMP_VERSION_PREFIX "API version: "
46 #if OMP_40_ENABLED
47  "4.0 (201307)";
48 #else
49  "3.1 (201107)";
50 #endif
51 
52 #ifdef KMP_DEBUG
53 char const __kmp_version_lock[] = KMP_VERSION_PREFIX "lock type: run time selectable";
54 #endif /* KMP_DEBUG */
55 
56 #define KMP_MIN( x, y ) ( (x) < (y) ? (x) : (y) )
57 
58 /* ------------------------------------------------------------------------ */
59 /* ------------------------------------------------------------------------ */
60 
61 kmp_info_t __kmp_monitor;
62 
63 /* ------------------------------------------------------------------------ */
64 /* ------------------------------------------------------------------------ */
65 
66 /* Forward declarations */
67 
68 void __kmp_cleanup( void );
69 
70 static void __kmp_initialize_info( kmp_info_t *, kmp_team_t *, int tid, int gtid );
71 static void __kmp_initialize_team( kmp_team_t * team, int new_nproc, kmp_internal_control_t * new_icvs, ident_t * loc );
72 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
73 static void __kmp_partition_places( kmp_team_t *team, int update_master_only=0 );
74 #endif
75 static void __kmp_do_serial_initialize( void );
76 void __kmp_fork_barrier( int gtid, int tid );
77 void __kmp_join_barrier( int gtid );
78 void __kmp_setup_icv_copy( kmp_team_t *team, int new_nproc, kmp_internal_control_t * new_icvs, ident_t *loc );
79 
80 #ifdef USE_LOAD_BALANCE
81 static int __kmp_load_balance_nproc( kmp_root_t * root, int set_nproc );
82 #endif
83 
84 static int __kmp_expand_threads(int nWish, int nNeed);
85 #if KMP_OS_WINDOWS
86 static int __kmp_unregister_root_other_thread( int gtid );
87 #endif
88 static void __kmp_unregister_library( void ); // called by __kmp_internal_end()
89 static void __kmp_reap_thread( kmp_info_t * thread, int is_root );
90 static kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
91 
92 /* ------------------------------------------------------------------------ */
93 /* ------------------------------------------------------------------------ */
94 
95 /* Calculate the identifier of the current thread: a fast
96  (and somewhat portable) way to get a unique identifier
97  for the executing thread.
98  Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
99 
100 int
101 __kmp_get_global_thread_id( )
102 {
103  int i;
104  kmp_info_t **other_threads;
105  size_t stack_data;
106  char *stack_addr;
107  size_t stack_size;
108  char *stack_base;
109 
110  KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n",
111  __kmp_nth, __kmp_all_nth ));
112 
113  /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to a
114  parallel region, this returns KMP_GTID_DNE to force serial_initialize by the
115  caller. KMP_GTID_DNE had to be handled at all call sites, or else __kmp_init_gtid
116  must be guaranteed for this to work. */
117 
118  if ( !TCR_4(__kmp_init_gtid) ) return KMP_GTID_DNE;
119 
120 #ifdef KMP_TDATA_GTID
121  if ( TCR_4(__kmp_gtid_mode) >= 3) {
122  KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id: using TDATA\n" ));
123  return __kmp_gtid;
124  }
125 #endif
126  if ( TCR_4(__kmp_gtid_mode) >= 2) {
127  KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id: using keyed TLS\n" ));
128  return __kmp_gtid_get_specific();
129  }
130  KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id: using internal alg.\n" ));
131 
132  stack_addr = (char*) & stack_data;
133  other_threads = __kmp_threads;
134 
135  /*
136  ATT: The code below is a source of potential bugs due to unsynchronized access to
137  __kmp_threads array. For example:
138  1. Current thread loads other_threads[i] to thr and checks it, it is non-NULL.
139  2. Current thread is suspended by OS.
140  3. Another thread unregisters and finishes (debug versions of free() may fill memory
141  with something like 0xEF).
142  4. Current thread is resumed.
143  5. Current thread reads junk from *thr.
144  TODO: Fix it.
145  --ln
146  */
147 
148  for( i = 0 ; i < __kmp_threads_capacity ; i++ ) {
149 
150  kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
151  if( !thr ) continue;
152 
153  stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
154  stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
155 
156  /* stack grows down -- search through all of the active threads */
157 
158  if( stack_addr <= stack_base ) {
159  size_t stack_diff = stack_base - stack_addr;
160 
161  if( stack_diff <= stack_size ) {
162  /* The only way we can be closer than the allocated
163  stack size is if we are running on this thread. */
164  KMP_DEBUG_ASSERT( __kmp_gtid_get_specific() == i );
165  return i;
166  }
167  }
168  }
169 
170  /* get specific to try and determine our gtid */
171  KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id: internal alg. failed to find "
172  "thread, using TLS\n" ));
173  i = __kmp_gtid_get_specific();
174 
175  /*fprintf( stderr, "=== %d\n", i ); */ /* GROO */
176 
177  /* if we haven't been assigned a gtid, then return that code */
178  if( i<0 ) return i;
179 
180  /* dynamically updated stack window for uber threads to avoid get_specific call */
181  if( ! TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow) ) {
182  KMP_FATAL( StackOverflow, i );
183  }
184 
185  stack_base = (char *) other_threads[i]->th.th_info.ds.ds_stackbase;
186  if( stack_addr > stack_base ) {
187  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
188  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
189  other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr - stack_base);
190  } else {
191  TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize, stack_base - stack_addr);
192  }
193 
194  /* Reprint stack bounds for ubermaster since they have been refined */
195  if ( __kmp_storage_map ) {
196  char *stack_end = (char *) other_threads[i]->th.th_info.ds.ds_stackbase;
197  char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
198  __kmp_print_storage_map_gtid( i, stack_beg, stack_end,
199  other_threads[i]->th.th_info.ds.ds_stacksize,
200  "th_%d stack (refinement)", i );
201  }
202  return i;
203 }
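/* Editorial sketch (illustrative, not part of the original source): the loop
   above treats each registered thread's stack as the byte range
   [stackbase - stacksize, stackbase], because the stack grows down.  A
   standalone form of that membership test, assuming only the recorded base
   pointer and size, could look like:

       static int addr_is_on_stack( char *addr, char *base, size_t size )
       {
           // true iff addr lies no more than 'size' bytes below (or at) base
           return addr <= base && (size_t)( base - addr ) <= size;
       }
*/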
204 
205 int
206 __kmp_get_global_thread_id_reg( )
207 {
208  int gtid;
209 
210  if ( !__kmp_init_serial ) {
211  gtid = KMP_GTID_DNE;
212  } else
213 #ifdef KMP_TDATA_GTID
214  if ( TCR_4(__kmp_gtid_mode) >= 3 ) {
215  KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id_reg: using TDATA\n" ));
216  gtid = __kmp_gtid;
217  } else
218 #endif
219  if ( TCR_4(__kmp_gtid_mode) >= 2 ) {
220  KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id_reg: using keyed TLS\n" ));
221  gtid = __kmp_gtid_get_specific();
222  } else {
223  KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id_reg: using internal alg.\n" ));
224  gtid = __kmp_get_global_thread_id();
225  }
226 
227  /* we must be a new uber master sibling thread */
228  if( gtid == KMP_GTID_DNE ) {
229  KA_TRACE( 10, ( "__kmp_get_global_thread_id_reg: Encountered new root thread. "
230  "Registering a new gtid.\n" ));
231  __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
232  if( !__kmp_init_serial ) {
233  __kmp_do_serial_initialize();
234  gtid = __kmp_gtid_get_specific();
235  } else {
236  gtid = __kmp_register_root(FALSE);
237  }
238  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
239  /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
240  }
241 
242  KMP_DEBUG_ASSERT( gtid >=0 );
243 
244  return gtid;
245 }
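/* Editorial sketch (illustrative, not part of the original source): the
   KMP_GTID_DNE path above is a double-checked "initialize or register"
   pattern: the cheap gtid lookup runs without any lock, and only the miss
   path takes __kmp_initz_lock and re-tests the state before initializing.
   The same shape expressed with plain pthreads (hypothetical names):

       #include <pthread.h>

       static pthread_mutex_t init_lock = PTHREAD_MUTEX_INITIALIZER;
       static int initialized = 0;

       static void ensure_initialized( void (*do_init)( void ) )
       {
           if ( !initialized ) {                    // fast path, no lock taken
               pthread_mutex_lock( &init_lock );
               if ( !initialized ) {                // re-check under the lock
                   do_init();
                   initialized = 1;
               }
               pthread_mutex_unlock( &init_lock );
           }
       }
*/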
246 
247 /* caller must hold forkjoin_lock */
248 void
249 __kmp_check_stack_overlap( kmp_info_t *th )
250 {
251  int f;
252  char *stack_beg = NULL;
253  char *stack_end = NULL;
254  int gtid;
255 
256  KA_TRACE(10,("__kmp_check_stack_overlap: called\n"));
257  if ( __kmp_storage_map ) {
258  stack_end = (char *) th->th.th_info.ds.ds_stackbase;
259  stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
260 
261  gtid = __kmp_gtid_from_thread( th );
262 
263  if (gtid == KMP_GTID_MONITOR) {
264  __kmp_print_storage_map_gtid( gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
265  "th_%s stack (%s)", "mon",
266  ( th->th.th_info.ds.ds_stackgrow ) ? "initial" : "actual" );
267  } else {
268  __kmp_print_storage_map_gtid( gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
269  "th_%d stack (%s)", gtid,
270  ( th->th.th_info.ds.ds_stackgrow ) ? "initial" : "actual" );
271  }
272  }
273 
274  /* No point in checking ubermaster threads since they use refinement and cannot overlap */
275  gtid = __kmp_gtid_from_thread( th );
276  if ( __kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid))
277  {
278  KA_TRACE(10,("__kmp_check_stack_overlap: performing extensive checking\n"));
279  if ( stack_beg == NULL ) {
280  stack_end = (char *) th->th.th_info.ds.ds_stackbase;
281  stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
282  }
283 
284  for( f=0 ; f < __kmp_threads_capacity ; f++ ) {
285  kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
286 
287  if( f_th && f_th != th ) {
288  char *other_stack_end = (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
289  char *other_stack_beg = other_stack_end -
290  (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
291  if((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
292  (stack_end > other_stack_beg && stack_end < other_stack_end)) {
293 
294  /* Print the other stack values before the abort */
295  if ( __kmp_storage_map )
296  __kmp_print_storage_map_gtid( -1, other_stack_beg, other_stack_end,
297  (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
298  "th_%d stack (overlapped)",
299  __kmp_gtid_from_thread( f_th ) );
300 
301  __kmp_msg( kmp_ms_fatal, KMP_MSG( StackOverlap ), KMP_HNT( ChangeStackLimit ), __kmp_msg_null );
302  }
303  }
304  }
305  }
306  KA_TRACE(10,("__kmp_check_stack_overlap: returning\n"));
307 }
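/* Editorial sketch (illustrative, not part of the original source): the check
   above reports an overlap when either endpoint of this thread's stack falls
   strictly inside another thread's [beg, end) range.  As a standalone
   predicate:

       static int stacks_overlap( char *beg, char *end,
                                  char *other_beg, char *other_end )
       {
           return ( beg > other_beg && beg < other_end ) ||
                  ( end > other_beg && end < other_end );
       }
*/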
308 
309 
310 /* ------------------------------------------------------------------------ */
311 
312 /* ------------------------------------------------------------------------ */
313 
314 void
315 __kmp_infinite_loop( void )
316 {
317  static int done = FALSE;
318 
319  while (! done) {
320  KMP_YIELD( 1 );
321  }
322 }
323 
324 #define MAX_MESSAGE 512
325 
326 void
327 __kmp_print_storage_map_gtid( int gtid, void *p1, void *p2, size_t size, char const *format, ...) {
328  char buffer[MAX_MESSAGE];
329  va_list ap;
330 
331  va_start( ap, format);
332  KMP_SNPRINTF( buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1, p2, (unsigned long) size, format );
333  __kmp_acquire_bootstrap_lock( & __kmp_stdio_lock );
334  __kmp_vprintf( kmp_err, buffer, ap );
335 #if KMP_PRINT_DATA_PLACEMENT
336  int node;
337  if(gtid >= 0) {
338  if(p1 <= p2 && (char*)p2 - (char*)p1 == size) {
339  if( __kmp_storage_map_verbose ) {
340  node = __kmp_get_host_node(p1);
341  if(node < 0) /* doesn't work, so don't try this next time */
342  __kmp_storage_map_verbose = FALSE;
343  else {
344  char *last;
345  int lastNode;
346  int localProc = __kmp_get_cpu_from_gtid(gtid);
347 
348  p1 = (void *)( (size_t)p1 & ~((size_t)PAGE_SIZE - 1) );
349  p2 = (void *)( ((size_t) p2 - 1) & ~((size_t)PAGE_SIZE - 1) );
350  if(localProc >= 0)
351  __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid, localProc>>1);
352  else
353  __kmp_printf_no_lock(" GTID %d\n", gtid);
354 # if KMP_USE_PRCTL
355 /* The more elaborate format is disabled for now because of the prctl hanging bug. */
356  do {
357  last = p1;
358  lastNode = node;
359  /* This loop collates adjacent pages with the same host node. */
360  do {
361  p1 = (char*)p1 + PAGE_SIZE;
362  } while(p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
363  __kmp_printf_no_lock(" %p-%p memNode %d\n", last,
364  (char*)p1 - 1, lastNode);
365  } while(p1 <= p2);
366 # else
367  __kmp_printf_no_lock(" %p-%p memNode %d\n", p1,
368  (char*)p1 + (PAGE_SIZE - 1), __kmp_get_host_node(p1));
369  if(p1 < p2) {
370  __kmp_printf_no_lock(" %p-%p memNode %d\n", p2,
371  (char*)p2 + (PAGE_SIZE - 1), __kmp_get_host_node(p2));
372  }
373 # endif
374  }
375  }
376  } else
377  __kmp_printf_no_lock(" %s\n", KMP_I18N_STR( StorageMapWarning ) );
378  }
379 #endif /* KMP_PRINT_DATA_PLACEMENT */
380  __kmp_release_bootstrap_lock( & __kmp_stdio_lock );
381 }
382 
383 void
384 __kmp_warn( char const * format, ... )
385 {
386  char buffer[MAX_MESSAGE];
387  va_list ap;
388 
389  if ( __kmp_generate_warnings == kmp_warnings_off ) {
390  return;
391  }
392 
393  va_start( ap, format );
394 
395  KMP_SNPRINTF( buffer, sizeof(buffer) , "OMP warning: %s\n", format );
396  __kmp_acquire_bootstrap_lock( & __kmp_stdio_lock );
397  __kmp_vprintf( kmp_err, buffer, ap );
398  __kmp_release_bootstrap_lock( & __kmp_stdio_lock );
399 
400  va_end( ap );
401 }
402 
403 void
404 __kmp_abort_process()
405 {
406 
407  // Later threads may stall here, but that's ok because abort() will kill them.
408  __kmp_acquire_bootstrap_lock( & __kmp_exit_lock );
409 
410  if ( __kmp_debug_buf ) {
411  __kmp_dump_debug_buffer();
412  }; // if
413 
414  if ( KMP_OS_WINDOWS ) {
415  // Let other threads know of abnormal termination and prevent deadlock
416  // if abort happened during library initialization or shutdown
417  __kmp_global.g.g_abort = SIGABRT;
418 
419  /*
420  On Windows* OS, abort() by default causes a pop-up error box, which stalls nightly testing.
421  Unfortunately, we cannot reliably suppress pop-up error boxes. _set_abort_behavior()
422  works well, but this function is not available in VS7 (this is not a problem for the DLL,
423  but it is a problem for the static OpenMP RTL). SetErrorMode (and so, the timelimit utility)
424  does not help, at least in some versions of the MS C RTL.
425 
426  It seems the following sequence is the only way to simulate abort() and avoid the
427  pop-up error box.
428  */
429  raise( SIGABRT );
430  _exit( 3 ); // Just in case, if signal ignored, exit anyway.
431  } else {
432  abort();
433  }; // if
434 
435  __kmp_infinite_loop();
436  __kmp_release_bootstrap_lock( & __kmp_exit_lock );
437 
438 } // __kmp_abort_process
439 
440 void
441 __kmp_abort_thread( void )
442 {
443  // TODO: Eliminate g_abort global variable and this function.
444  // In case of abort just call abort(), it will kill all the threads.
445  __kmp_infinite_loop();
446 } // __kmp_abort_thread
447 
448 /* ------------------------------------------------------------------------ */
449 
450 /*
451  * Print out the storage map for the major kmp_info_t thread data structures
452  * that are allocated together.
453  */
454 
455 static void
456 __kmp_print_thread_storage_map( kmp_info_t *thr, int gtid )
457 {
458  __kmp_print_storage_map_gtid( gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d", gtid );
459 
460  __kmp_print_storage_map_gtid( gtid, &thr->th.th_info, &thr->th.th_team, sizeof(kmp_desc_t),
461  "th_%d.th_info", gtid );
462 
463  __kmp_print_storage_map_gtid( gtid, &thr->th.th_local, &thr->th.th_pri_head, sizeof(kmp_local_t),
464  "th_%d.th_local", gtid );
465 
466  __kmp_print_storage_map_gtid( gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
467  sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid );
468 
469  __kmp_print_storage_map_gtid( gtid, &thr->th.th_bar[bs_plain_barrier],
470  &thr->th.th_bar[bs_plain_barrier+1],
471  sizeof(kmp_balign_t), "th_%d.th_bar[plain]", gtid);
472 
473  __kmp_print_storage_map_gtid( gtid, &thr->th.th_bar[bs_forkjoin_barrier],
474  &thr->th.th_bar[bs_forkjoin_barrier+1],
475  sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]", gtid);
476 
477  #if KMP_FAST_REDUCTION_BARRIER
478  __kmp_print_storage_map_gtid( gtid, &thr->th.th_bar[bs_reduction_barrier],
479  &thr->th.th_bar[bs_reduction_barrier+1],
480  sizeof(kmp_balign_t), "th_%d.th_bar[reduction]", gtid);
481  #endif // KMP_FAST_REDUCTION_BARRIER
482 }
483 
484 /*
485  * Print out the storage map for the major kmp_team_t team data structures
486  * that are allocated together.
487  */
488 
489 static void
490 __kmp_print_team_storage_map( const char *header, kmp_team_t *team, int team_id, int num_thr )
491 {
492  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
493  __kmp_print_storage_map_gtid( -1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
494  header, team_id );
495 
496  __kmp_print_storage_map_gtid( -1, &team->t.t_bar[0], &team->t.t_bar[bs_last_barrier],
497  sizeof(kmp_balign_team_t) * bs_last_barrier, "%s_%d.t_bar", header, team_id );
498 
499 
500  __kmp_print_storage_map_gtid( -1, &team->t.t_bar[bs_plain_barrier], &team->t.t_bar[bs_plain_barrier+1],
501  sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]", header, team_id );
502 
503  __kmp_print_storage_map_gtid( -1, &team->t.t_bar[bs_forkjoin_barrier], &team->t.t_bar[bs_forkjoin_barrier+1],
504  sizeof(kmp_balign_team_t), "%s_%d.t_bar[forkjoin]", header, team_id );
505 
506  #if KMP_FAST_REDUCTION_BARRIER
507  __kmp_print_storage_map_gtid( -1, &team->t.t_bar[bs_reduction_barrier], &team->t.t_bar[bs_reduction_barrier+1],
508  sizeof(kmp_balign_team_t), "%s_%d.t_bar[reduction]", header, team_id );
509  #endif // KMP_FAST_REDUCTION_BARRIER
510 
511  __kmp_print_storage_map_gtid( -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
512  sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id );
513 
514  __kmp_print_storage_map_gtid( -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
515  sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id );
516 
517  __kmp_print_storage_map_gtid( -1, &team->t.t_disp_buffer[0], &team->t.t_disp_buffer[num_disp_buff],
518  sizeof(dispatch_shared_info_t) * num_disp_buff, "%s_%d.t_disp_buffer",
519  header, team_id );
520 
521 
522  __kmp_print_storage_map_gtid( -1, &team->t.t_taskq, &team->t.t_copypriv_data,
523  sizeof(kmp_taskq_t), "%s_%d.t_taskq", header, team_id );
524 }
525 
526 static void __kmp_init_allocator() {}
527 static void __kmp_fini_allocator() {}
528 
529 /* ------------------------------------------------------------------------ */
530 
531 #ifdef KMP_DYNAMIC_LIB
532 # if KMP_OS_WINDOWS
533 
534 static void
535 __kmp_reset_lock( kmp_bootstrap_lock_t* lck ) {
536  // TODO: Change to __kmp_break_bootstrap_lock().
537  __kmp_init_bootstrap_lock( lck ); // make the lock released
538 }
539 
540 static void
541 __kmp_reset_locks_on_process_detach( int gtid_req ) {
542  int i;
543  int thread_count;
544 
545  // PROCESS_DETACH is expected to be called by a thread
546  // that executes ProcessExit() or FreeLibrary().
547  // The OS terminates the other threads (except the one calling ProcessExit or FreeLibrary).
548  // So it might be safe to access __kmp_threads[] without taking the forkjoin_lock.
549  // However, in practice some threads may still be alive here, although they are about to be terminated.
550  // The threads in the array with ds_thread==0 are the most suspicious.
551  // So it may actually not be safe to access __kmp_threads[].
552 
553  // TODO: does it make sense to check __kmp_roots[] ?
554 
555  // Let's check that there are no other live threads registered with the OMP library.
556  while( 1 ) {
557  thread_count = 0;
558  for( i = 0; i < __kmp_threads_capacity; ++i ) {
559  if( !__kmp_threads ) continue;
560  kmp_info_t* th = __kmp_threads[ i ];
561  if( th == NULL ) continue;
562  int gtid = th->th.th_info.ds.ds_gtid;
563  if( gtid == gtid_req ) continue;
564  if( gtid < 0 ) continue;
565  DWORD exit_val;
566  int alive = __kmp_is_thread_alive( th, &exit_val );
567  if( alive ) {
568  ++thread_count;
569  }
570  }
571  if( thread_count == 0 ) break; // success
572  }
573 
574  // Assume that I'm alone.
575 
576  // Now it is probably safe to check and reset the locks.
577  // __kmp_forkjoin_lock and __kmp_stdio_lock are expected to be reset.
578  __kmp_reset_lock( &__kmp_forkjoin_lock );
579  #ifdef KMP_DEBUG
580  __kmp_reset_lock( &__kmp_stdio_lock );
581  #endif // KMP_DEBUG
582 }
583 
584 BOOL WINAPI
585 DllMain( HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved ) {
586  //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
587 
588  switch( fdwReason ) {
589 
590  case DLL_PROCESS_ATTACH:
591  KA_TRACE( 10, ("DllMain: PROCESS_ATTACH\n" ));
592 
593  return TRUE;
594 
595  case DLL_PROCESS_DETACH:
596  KA_TRACE( 10, ("DllMain: PROCESS_DETACH T#%d\n",
597  __kmp_gtid_get_specific() ));
598 
599  if( lpReserved != NULL )
600  {
601  // lpReserved is used for telling the difference:
602  // lpReserved == NULL when FreeLibrary() was called,
603  // lpReserved != NULL when the process terminates.
604  // When FreeLibrary() is called, worker threads remain alive.
605  // So they will release the forkjoin lock by themselves.
606  // When the process terminates, worker threads disappear, triggering
607  // the problem of an unreleased forkjoin lock as described below.
608 
609  // A worker thread can take the forkjoin lock.
610  // The problem comes up if that worker thread becomes dead
611  // before it releases the forkjoin lock.
612  // The forkjoin lock remains taken, while the thread
613  // executing DllMain()->PROCESS_DETACH->__kmp_internal_end_library() below
614  // will try to take the forkjoin lock and will always fail,
615  // so that the application will never finish [normally].
616  // This scenario is possible if __kmpc_end() has not been executed.
617  // This does not look like a corner case; common scenarios include:
618  // - the main function was compiled by an alternative compiler;
619  // - the main function was compiled by icl but without /Qopenmp (application with plugins);
620  // - application terminates by calling C exit(), Fortran CALL EXIT() or Fortran STOP.
621  // - alive foreign thread prevented __kmpc_end from doing cleanup.
622 
623  // This is a hack to work around the problem.
624  // TODO: !!! figure out something better.
625  __kmp_reset_locks_on_process_detach( __kmp_gtid_get_specific() );
626  }
627 
628  __kmp_internal_end_library( __kmp_gtid_get_specific() );
629 
630  return TRUE;
631 
632  case DLL_THREAD_ATTACH:
633  KA_TRACE( 10, ("DllMain: THREAD_ATTACH\n" ));
634 
635  /* if we wanted to register new siblings all the time here call
636  * __kmp_get_gtid(); */
637  return TRUE;
638 
639  case DLL_THREAD_DETACH:
640  KA_TRACE( 10, ("DllMain: THREAD_DETACH T#%d\n",
641  __kmp_gtid_get_specific() ));
642 
643  __kmp_internal_end_thread( __kmp_gtid_get_specific() );
644  return TRUE;
645  }
646 
647  return TRUE;
648 }
649 
650 # endif /* KMP_OS_WINDOWS */
651 #endif /* KMP_DYNAMIC_LIB */
652 
653 
654 /* ------------------------------------------------------------------------ */
655 
656 /* Change the library type to "status" and return the old type */
657 /* called from within initialization routines where __kmp_initz_lock is held */
658 int
659 __kmp_change_library( int status )
660 {
661  int old_status;
662 
663  old_status = __kmp_yield_init & 1; // check whether KMP_LIBRARY=throughput (even init count)
664 
665  if (status) {
666  __kmp_yield_init |= 1; // throughput => turnaround (odd init count)
667  }
668  else {
669  __kmp_yield_init &= ~1; // turnaround => throughput (even init count)
670  }
671 
672  return old_status; // return previous setting of whether KMP_LIBRARY=throughput
673 }
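/* Editorial note (illustrative, not part of the original source): per the
   comments above, the library mode is encoded in the low bit of
   __kmp_yield_init (odd init count = turnaround, even = throughput), so
   switching modes is a single bit operation:

       counter | 1     // select turnaround (odd count)
       counter & ~1    // select throughput (even count)
       counter & 1     // read back the current mode bit
*/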
674 
675 /* ------------------------------------------------------------------------ */
676 /* ------------------------------------------------------------------------ */
677 
678 /* __kmp_parallel_deo --
679  * Wait until it's our turn.
680  */
681 void
682 __kmp_parallel_deo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
683 {
684  int gtid = *gtid_ref;
685 #ifdef BUILD_PARALLEL_ORDERED
686  kmp_team_t *team = __kmp_team_from_gtid( gtid );
687 #endif /* BUILD_PARALLEL_ORDERED */
688 
689  if( __kmp_env_consistency_check ) {
690  if( __kmp_threads[gtid]->th.th_root->r.r_active )
691 #if KMP_USE_DYNAMIC_LOCK
692  __kmp_push_sync( gtid, ct_ordered_in_parallel, loc_ref, NULL, 0 );
693 #else
694  __kmp_push_sync( gtid, ct_ordered_in_parallel, loc_ref, NULL );
695 #endif
696  }
697 #ifdef BUILD_PARALLEL_ORDERED
698  if( !team->t.t_serialized ) {
699  KMP_MB();
700  KMP_WAIT_YIELD(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid( gtid ), KMP_EQ, NULL);
701  KMP_MB();
702  }
703 #endif /* BUILD_PARALLEL_ORDERED */
704 }
705 
706 /* __kmp_parallel_dxo --
707  * Signal the next task.
708  */
709 
710 void
711 __kmp_parallel_dxo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
712 {
713  int gtid = *gtid_ref;
714 #ifdef BUILD_PARALLEL_ORDERED
715  int tid = __kmp_tid_from_gtid( gtid );
716  kmp_team_t *team = __kmp_team_from_gtid( gtid );
717 #endif /* BUILD_PARALLEL_ORDERED */
718 
719  if( __kmp_env_consistency_check ) {
720  if( __kmp_threads[gtid]->th.th_root->r.r_active )
721  __kmp_pop_sync( gtid, ct_ordered_in_parallel, loc_ref );
722  }
723 #ifdef BUILD_PARALLEL_ORDERED
724  if ( ! team->t.t_serialized ) {
725  KMP_MB(); /* Flush all pending memory write invalidates. */
726 
727  /* use the tid of the next thread in this team */
728  /* TODO: replace with a general release procedure */
729  team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc );
730 
731 #if OMPT_SUPPORT && OMPT_BLAME
732  if (ompt_enabled &&
733  ompt_callbacks.ompt_callback(ompt_event_release_ordered)) {
734  /* accept blame for "ordered" waiting */
735  kmp_info_t *this_thread = __kmp_threads[gtid];
736  ompt_callbacks.ompt_callback(ompt_event_release_ordered)(
737  this_thread->th.ompt_thread_info.wait_id);
738  }
739 #endif
740 
741  KMP_MB(); /* Flush all pending memory write invalidates. */
742  }
743 #endif /* BUILD_PARALLEL_ORDERED */
744 }
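/* Editorial sketch (illustrative, not part of the original source): taken
   together, __kmp_parallel_deo/__kmp_parallel_dxo pass a baton through
   team->t.t_ordered.dt.t_value: each thread waits until the shared value
   equals its own tid, executes its ordered chunk, then publishes the next
   tid.  Reduced to C11 atomics with hypothetical names (the runtime uses
   KMP_WAIT_YIELD rather than a raw spin):

       #include <stdatomic.h>

       static atomic_int turn;                      // shared, one per team

       static void ordered_enter( int tid )
       {
           while ( atomic_load( &turn ) != tid )
               ;                                    // spin until it is our turn
       }

       static void ordered_exit( int tid, int nproc )
       {
           atomic_store( &turn, ( tid + 1 ) % nproc );   // hand off to the next thread
       }
*/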
745 
746 /* ------------------------------------------------------------------------ */
747 /* ------------------------------------------------------------------------ */
748 
749 /* ------------------------------------------------------------------------ */
750 /* ------------------------------------------------------------------------ */
751 
752 /* The BARRIER for a SINGLE process section is always explicit */
753 
754 int
755 __kmp_enter_single( int gtid, ident_t *id_ref, int push_ws )
756 {
757  int status;
758  kmp_info_t *th;
759  kmp_team_t *team;
760 
761  if( ! TCR_4(__kmp_init_parallel) )
762  __kmp_parallel_initialize();
763 
764  th = __kmp_threads[ gtid ];
765  team = th->th.th_team;
766  status = 0;
767 
768  th->th.th_ident = id_ref;
769 
770  if ( team->t.t_serialized ) {
771  status = 1;
772  } else {
773  kmp_int32 old_this = th->th.th_local.this_construct;
774 
775  ++th->th.th_local.this_construct;
776  /* try to set team count to thread count--success means thread got the
777  single block
778  */
779  /* TODO: Should this be acquire or release? */
780  if (team->t.t_construct == old_this) {
781  status = KMP_COMPARE_AND_STORE_ACQ32(&team->t.t_construct, old_this,
782  th->th.th_local.this_construct);
783  }
784 #if USE_ITT_BUILD
785  if ( __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 && KMP_MASTER_GTID(gtid) &&
786 #if OMP_40_ENABLED
787  th->th.th_teams_microtask == NULL &&
788 #endif
789  team->t.t_active_level == 1 )
790  { // Only report metadata by master of active team at level 1
791  __kmp_itt_metadata_single( id_ref );
792  }
793 #endif /* USE_ITT_BUILD */
794  }
795 
796  if( __kmp_env_consistency_check ) {
797  if (status && push_ws) {
798  __kmp_push_workshare( gtid, ct_psingle, id_ref );
799  } else {
800  __kmp_check_workshare( gtid, ct_psingle, id_ref );
801  }
802  }
803 #if USE_ITT_BUILD
804  if ( status ) {
805  __kmp_itt_single_start( gtid );
806  }
807 #endif /* USE_ITT_BUILD */
808  return status;
809 }
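/* Editorial sketch (illustrative, not part of the original source): the single
   construct above is decided by one compare-and-swap per encounter.  Every
   thread advances its private this_construct counter; the thread whose CAS
   moves the shared team counter from the old value to the new one owns the
   single block.  The same election with C11 atomics (hypothetical names):

       #include <stdatomic.h>
       #include <stdbool.h>

       static bool try_enter_single( atomic_int *team_construct, int *my_construct )
       {
           int old = ( *my_construct )++;           // value this construct starts from
           int expected = old;
           // exactly one thread succeeds in moving the team counter to old + 1
           return atomic_compare_exchange_strong( team_construct, &expected, old + 1 );
       }
*/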
810 
811 void
812 __kmp_exit_single( int gtid )
813 {
814 #if USE_ITT_BUILD
815  __kmp_itt_single_end( gtid );
816 #endif /* USE_ITT_BUILD */
817  if( __kmp_env_consistency_check )
818  __kmp_pop_workshare( gtid, ct_psingle, NULL );
819 }
820 
821 
822 /*
823  * Determine whether we can go parallel or must use a serialized parallel region, and
824  * how many threads we can use.
825  * set_nthreads is the number of threads requested for the team.
826  * Returns 1 if we should serialize or only use one thread,
827  * otherwise the number of threads to use.
828  * The forkjoin lock is held by the caller.
829  */
830 static int
831 __kmp_reserve_threads( kmp_root_t *root, kmp_team_t *parent_team,
832  int master_tid, int set_nthreads
833 #if OMP_40_ENABLED
834  , int enter_teams
835 #endif /* OMP_40_ENABLED */
836 )
837 {
838  int capacity;
839  int new_nthreads;
840  KMP_DEBUG_ASSERT( __kmp_init_serial );
841  KMP_DEBUG_ASSERT( root && parent_team );
842 
843  //
844  // If dyn-var is set, dynamically adjust the number of desired threads,
845  // according to the method specified by dynamic_mode.
846  //
847  new_nthreads = set_nthreads;
848  if ( ! get__dynamic_2( parent_team, master_tid ) ) {
849  ;
850  }
851 #ifdef USE_LOAD_BALANCE
852  else if ( __kmp_global.g.g_dynamic_mode == dynamic_load_balance ) {
853  new_nthreads = __kmp_load_balance_nproc( root, set_nthreads );
854  if ( new_nthreads == 1 ) {
855  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d load balance reduced reservation to 1 thread\n",
856  master_tid ));
857  return 1;
858  }
859  if ( new_nthreads < set_nthreads ) {
860  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d load balance reduced reservation to %d threads\n",
861  master_tid, new_nthreads ));
862  }
863  }
864 #endif /* USE_LOAD_BALANCE */
865  else if ( __kmp_global.g.g_dynamic_mode == dynamic_thread_limit ) {
866  new_nthreads = __kmp_avail_proc - __kmp_nth + (root->r.r_active ? 1
867  : root->r.r_hot_team->t.t_nproc);
868  if ( new_nthreads <= 1 ) {
869  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d thread limit reduced reservation to 1 thread\n",
870  master_tid ));
871  return 1;
872  }
873  if ( new_nthreads < set_nthreads ) {
874  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d thread limit reduced reservation to %d threads\n",
875  master_tid, new_nthreads ));
876  }
877  else {
878  new_nthreads = set_nthreads;
879  }
880  }
881  else if ( __kmp_global.g.g_dynamic_mode == dynamic_random ) {
882  if ( set_nthreads > 2 ) {
883  new_nthreads = __kmp_get_random( parent_team->t.t_threads[master_tid] );
884  new_nthreads = ( new_nthreads % set_nthreads ) + 1;
885  if ( new_nthreads == 1 ) {
886  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d dynamic random reduced reservation to 1 thread\n",
887  master_tid ));
888  return 1;
889  }
890  if ( new_nthreads < set_nthreads ) {
891  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d dynamic random reduced reservation to %d threads\n",
892  master_tid, new_nthreads ));
893  }
894  }
895  }
896  else {
897  KMP_ASSERT( 0 );
898  }
899 
900  //
901  // Respect KMP_ALL_THREADS, KMP_MAX_THREADS, OMP_THREAD_LIMIT.
902  //
903  if ( __kmp_nth + new_nthreads - ( root->r.r_active ? 1 :
904  root->r.r_hot_team->t.t_nproc ) > __kmp_max_nth ) {
905  int tl_nthreads = __kmp_max_nth - __kmp_nth + ( root->r.r_active ? 1 :
906  root->r.r_hot_team->t.t_nproc );
907  if ( tl_nthreads <= 0 ) {
908  tl_nthreads = 1;
909  }
910 
911  //
912  // If dyn-var is false, emit a 1-time warning.
913  //
914  if ( ! get__dynamic_2( parent_team, master_tid )
915  && ( ! __kmp_reserve_warn ) ) {
916  __kmp_reserve_warn = 1;
917  __kmp_msg(
918  kmp_ms_warning,
919  KMP_MSG( CantFormThrTeam, set_nthreads, tl_nthreads ),
920  KMP_HNT( Unset_ALL_THREADS ),
921  __kmp_msg_null
922  );
923  }
924  if ( tl_nthreads == 1 ) {
925  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d KMP_ALL_THREADS reduced reservation to 1 thread\n",
926  master_tid ));
927  return 1;
928  }
929  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d KMP_ALL_THREADS reduced reservation to %d threads\n",
930  master_tid, tl_nthreads ));
931  new_nthreads = tl_nthreads;
932  }
933 
934  //
935  // Check if the threads array is large enough, or needs expanding.
936  //
937  // See comment in __kmp_register_root() about the adjustment if
938  // __kmp_threads[0] == NULL.
939  //
940  capacity = __kmp_threads_capacity;
941  if ( TCR_PTR(__kmp_threads[0]) == NULL ) {
942  --capacity;
943  }
944  if ( __kmp_nth + new_nthreads - ( root->r.r_active ? 1 :
945  root->r.r_hot_team->t.t_nproc ) > capacity ) {
946  //
947  // Expand the threads array.
948  //
949  int slotsRequired = __kmp_nth + new_nthreads - ( root->r.r_active ? 1 :
950  root->r.r_hot_team->t.t_nproc ) - capacity;
951  int slotsAdded = __kmp_expand_threads(slotsRequired, slotsRequired);
952  if ( slotsAdded < slotsRequired ) {
953  //
954  // The threads array was not expanded enough.
955  //
956  new_nthreads -= ( slotsRequired - slotsAdded );
957  KMP_ASSERT( new_nthreads >= 1 );
958 
959  //
960  // If dyn-var is false, emit a 1-time warning.
961  //
962  if ( ! get__dynamic_2( parent_team, master_tid )
963  && ( ! __kmp_reserve_warn ) ) {
964  __kmp_reserve_warn = 1;
965  if ( __kmp_tp_cached ) {
966  __kmp_msg(
967  kmp_ms_warning,
968  KMP_MSG( CantFormThrTeam, set_nthreads, new_nthreads ),
969  KMP_HNT( Set_ALL_THREADPRIVATE, __kmp_tp_capacity ),
970  KMP_HNT( PossibleSystemLimitOnThreads ),
971  __kmp_msg_null
972  );
973  }
974  else {
975  __kmp_msg(
976  kmp_ms_warning,
977  KMP_MSG( CantFormThrTeam, set_nthreads, new_nthreads ),
978  KMP_HNT( SystemLimitOnThreads ),
979  __kmp_msg_null
980  );
981  }
982  }
983  }
984  }
985 
986  if ( new_nthreads == 1 ) {
987  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d serializing team after reclaiming dead roots and rechecking; requested %d threads\n",
988  __kmp_get_gtid(), set_nthreads ) );
989  return 1;
990  }
991 
992  KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d allocating %d threads; requested %d threads\n",
993  __kmp_get_gtid(), new_nthreads, set_nthreads ));
994  return new_nthreads;
995 }
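/* Editorial note (illustrative, not part of the original source): under
   dynamic_thread_limit the reservation above is clamped to the processors
   that are still free,
       new_nthreads = __kmp_avail_proc - __kmp_nth
                      + ( root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc ),
   so with 8 available processors, 5 registered threads and an active root, a
   request for 6 threads would be trimmed to 8 - 5 + 1 = 4. */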
996 
997 /* ------------------------------------------------------------------------ */
998 /* ------------------------------------------------------------------------ */
999 
1000 /* allocate threads from the thread pool and assign them to the new team */
1001 /* we are assured that there are enough threads available, because we
1002  * checked that earlier while holding the forkjoin critical section */
1003 
1004 static void
1005 __kmp_fork_team_threads( kmp_root_t *root, kmp_team_t *team,
1006  kmp_info_t *master_th, int master_gtid )
1007 {
1008  int i;
1009  int use_hot_team;
1010 
1011  KA_TRACE( 10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc ) );
1012  KMP_DEBUG_ASSERT( master_gtid == __kmp_get_gtid() );
1013  KMP_MB();
1014 
1015  /* first, let's setup the master thread */
1016  master_th->th.th_info.ds.ds_tid = 0;
1017  master_th->th.th_team = team;
1018  master_th->th.th_team_nproc = team->t.t_nproc;
1019  master_th->th.th_team_master = master_th;
1020  master_th->th.th_team_serialized = FALSE;
1021  master_th->th.th_dispatch = & team->t.t_dispatch[ 0 ];
1022 
1023  /* make sure we are not the optimized hot team */
1024 #if KMP_NESTED_HOT_TEAMS
1025  use_hot_team = 0;
1026  kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
1027  if( hot_teams ) { // hot teams array is not allocated if KMP_HOT_TEAMS_MAX_LEVEL=0
1028  int level = team->t.t_active_level - 1; // index in array of hot teams
1029  if( master_th->th.th_teams_microtask ) { // are we inside the teams?
1030  if( master_th->th.th_teams_size.nteams > 1 ) {
1031  ++level; // level was not increased in teams construct for team_of_masters
1032  }
1033  if( team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1034  master_th->th.th_teams_level == team->t.t_level ) {
1035  ++level; // level was not increased in teams construct for team_of_workers before the parallel
1036  } // team->t.t_level will be increased inside parallel
1037  }
1038  if( level < __kmp_hot_teams_max_level ) {
1039  if( hot_teams[level].hot_team ) {
1040  // hot team has already been allocated for given level
1041  KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
1042  use_hot_team = 1; // the team is ready to use
1043  } else {
1044  use_hot_team = 0; // AC: threads are not allocated yet
1045  hot_teams[level].hot_team = team; // remember new hot team
1046  hot_teams[level].hot_team_nth = team->t.t_nproc;
1047  }
1048  } else {
1049  use_hot_team = 0;
1050  }
1051  }
1052 #else
1053  use_hot_team = team == root->r.r_hot_team;
1054 #endif
1055  if ( !use_hot_team ) {
1056 
1057  /* install the master thread */
1058  team->t.t_threads[ 0 ] = master_th;
1059  __kmp_initialize_info( master_th, team, 0, master_gtid );
1060 
1061  /* now, install the worker threads */
1062  for ( i=1 ; i < team->t.t_nproc ; i++ ) {
1063 
1064  /* fork or reallocate a new thread and install it in team */
1065  kmp_info_t *thr = __kmp_allocate_thread( root, team, i );
1066  team->t.t_threads[ i ] = thr;
1067  KMP_DEBUG_ASSERT( thr );
1068  KMP_DEBUG_ASSERT( thr->th.th_team == team );
1069  /* align team and thread arrived states */
1070  KA_TRACE( 20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived T#%d(%d:%d) join =%llu, plain=%llu\n",
1071  __kmp_gtid_from_tid( 0, team ), team->t.t_id, 0,
1072  __kmp_gtid_from_tid( i, team ), team->t.t_id, i,
1073  team->t.t_bar[ bs_forkjoin_barrier ].b_arrived,
1074  team->t.t_bar[ bs_plain_barrier ].b_arrived ) );
1075 #if OMP_40_ENABLED
1076  thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
1077  thr->th.th_teams_level = master_th->th.th_teams_level;
1078  thr->th.th_teams_size = master_th->th.th_teams_size;
1079 #endif
1080  { // Initialize threads' barrier data.
1081  int b;
1082  kmp_balign_t * balign = team->t.t_threads[ i ]->th.th_bar;
1083  for ( b = 0; b < bs_last_barrier; ++ b ) {
1084  balign[ b ].bb.b_arrived = team->t.t_bar[ b ].b_arrived;
1085  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1086 #if USE_DEBUGGER
1087  balign[ b ].bb.b_worker_arrived = team->t.t_bar[ b ].b_team_arrived;
1088 #endif
1089  }; // for b
1090  }
1091  }
1092 
1093 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1094  __kmp_partition_places( team );
1095 #endif
1096 
1097  }
1098 
1099  KMP_MB();
1100 }
1101 
1102 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1103 //
1104 // Propagate any changes to the floating point control registers out to the team
1105 // We try to avoid unnecessary writes to the relevant cache line in the team structure,
1106 // so we don't make changes unless they are needed.
1107 //
1108 inline static void
1109 propagateFPControl(kmp_team_t * team)
1110 {
1111  if ( __kmp_inherit_fp_control ) {
1112  kmp_int16 x87_fpu_control_word;
1113  kmp_uint32 mxcsr;
1114 
1115  // Get master values of FPU control flags (both X87 and vector)
1116  __kmp_store_x87_fpu_control_word( &x87_fpu_control_word );
1117  __kmp_store_mxcsr( &mxcsr );
1118  mxcsr &= KMP_X86_MXCSR_MASK;
1119 
1120  // There is no point looking at t_fp_control_saved here.
1121  // If it is TRUE, we still have to update the values if they are different from those we now have.
1122  // If it is FALSE we didn't save anything yet, but our objective is the same. We have to ensure
1123  // that the values in the team are the same as those we have.
1124  // So, this code achieves what we need whether or not t_fp_control_saved is true.
1125  // By checking whether the value needs updating we avoid unnecessary writes that would put the
1126  // cache-line into a written state, causing all threads in the team to have to read it again.
1127  KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1128  KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1129  // Although we don't use this value, other code in the runtime wants to know whether it should restore them.
1130  // So we must ensure it is correct.
1131  KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1132  }
1133  else {
1134  // Similarly here. Don't write to this cache-line in the team structure unless we have to.
1135  KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1136  }
1137 }
1138 
1139 // Do the opposite, setting the hardware registers to the updated values from the team.
1140 inline static void
1141 updateHWFPControl(kmp_team_t * team)
1142 {
1143  if ( __kmp_inherit_fp_control && team->t.t_fp_control_saved ) {
1144  //
1145  // Only reset the fp control regs if they have been changed in the team during
1146  // the parallel region that we are exiting.
1147  //
1148  kmp_int16 x87_fpu_control_word;
1149  kmp_uint32 mxcsr;
1150  __kmp_store_x87_fpu_control_word( &x87_fpu_control_word );
1151  __kmp_store_mxcsr( &mxcsr );
1152  mxcsr &= KMP_X86_MXCSR_MASK;
1153 
1154  if ( team->t.t_x87_fpu_control_word != x87_fpu_control_word ) {
1155  __kmp_clear_x87_fpu_status_word();
1156  __kmp_load_x87_fpu_control_word( &team->t.t_x87_fpu_control_word );
1157  }
1158 
1159  if ( team->t.t_mxcsr != mxcsr ) {
1160  __kmp_load_mxcsr( &team->t.t_mxcsr );
1161  }
1162  }
1163 }
1164 #else
1165 # define propagateFPControl(x) ((void)0)
1166 # define updateHWFPControl(x) ((void)0)
1167 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
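/* Editorial sketch (illustrative, not part of the original source): the
   propagate/update pair above compares the x87 control word and MXCSR against
   the values cached in the team and rewrites the hardware registers only when
   they differ.  The MXCSR half of that idea, using the standard SSE
   intrinsics (assumes <xmmintrin.h> and an x86 target):

       #include <xmmintrin.h>

       static void restore_mxcsr_if_changed( unsigned int saved_mxcsr )
       {
           unsigned int current = _mm_getcsr();     // read the live control/status register
           if ( current != saved_mxcsr ) {
               _mm_setcsr( saved_mxcsr );           // restore only when it actually changed
           }
       }
*/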
1168 
1169 static void
1170 __kmp_alloc_argv_entries( int argc, kmp_team_t *team, int realloc ); // forward declaration
1171 
1172 /*
1173  * Run a parallel region that has been serialized, so it runs only in a team consisting of the single master thread.
1174  */
1175 void
1176 __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid)
1177 {
1178  kmp_info_t *this_thr;
1179  kmp_team_t *serial_team;
1180 
1181  KC_TRACE( 10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid ) );
1182 
1183  /* Skip all this code for autopar serialized loops since it results in
1184  unacceptable overhead */
1185  if( loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR ) )
1186  return;
1187 
1188  if( ! TCR_4( __kmp_init_parallel ) )
1189  __kmp_parallel_initialize();
1190 
1191  this_thr = __kmp_threads[ global_tid ];
1192  serial_team = this_thr->th.th_serial_team;
1193 
1194  /* utilize the serialized team held by this thread */
1195  KMP_DEBUG_ASSERT( serial_team );
1196  KMP_MB();
1197 
1198  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
1199  KMP_DEBUG_ASSERT(this_thr->th.th_task_team == this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1200  KMP_DEBUG_ASSERT( serial_team->t.t_task_team[this_thr->th.th_task_state] == NULL );
1201  KA_TRACE( 20, ( "__kmpc_serialized_parallel: T#%d pushing task_team %p / team %p, new task_team = NULL\n",
1202  global_tid, this_thr->th.th_task_team, this_thr->th.th_team ) );
1203  this_thr->th.th_task_team = NULL;
1204  }
1205 
1206 #if OMP_40_ENABLED
1207  kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1208  if ( this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false ) {
1209  proc_bind = proc_bind_false;
1210  }
1211  else if ( proc_bind == proc_bind_default ) {
1212  //
1213  // No proc_bind clause was specified, so use the current value
1214  // of proc-bind-var for this parallel region.
1215  //
1216  proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1217  }
1218  //
1219  // Reset for next parallel region
1220  //
1221  this_thr->th.th_set_proc_bind = proc_bind_default;
1222 #endif /* OMP_40_ENABLED */
1223 
1224  if( this_thr->th.th_team != serial_team ) {
1225  // Nested level will be an index in the nested nthreads array
1226  int level = this_thr->th.th_team->t.t_level;
1227 
1228  if( serial_team->t.t_serialized ) {
1229  /* this serial team was already used
1230  * TODO: increase performance by making these locks more specific */
1231  kmp_team_t *new_team;
1232 
1233  __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
1234 
1235 #if OMPT_SUPPORT
1236  ompt_parallel_id_t ompt_parallel_id = __ompt_parallel_id_new(global_tid);
1237 #endif
1238 
1239  new_team = __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1240 #if OMPT_SUPPORT
1241  ompt_parallel_id,
1242 #endif
1243 #if OMP_40_ENABLED
1244  proc_bind,
1245 #endif
1246  & this_thr->th.th_current_task->td_icvs,
1247  0 USE_NESTED_HOT_ARG(NULL) );
1248  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
1249  KMP_ASSERT( new_team );
1250 
1251  /* setup new serialized team and install it */
1252  new_team->t.t_threads[0] = this_thr;
1253  new_team->t.t_parent = this_thr->th.th_team;
1254  serial_team = new_team;
1255  this_thr->th.th_serial_team = serial_team;
1256 
1257  KF_TRACE( 10, ( "__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1258  global_tid, serial_team ) );
1259 
1260 
1261  /* TODO the above breaks the requirement that if we run out of
1262  * resources, then we can still guarantee that serialized teams
1263  * are ok, since we may need to allocate a new one */
1264  } else {
1265  KF_TRACE( 10, ( "__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1266  global_tid, serial_team ) );
1267  }
1268 
1269  /* we have to initialize this serial team */
1270  KMP_DEBUG_ASSERT( serial_team->t.t_threads );
1271  KMP_DEBUG_ASSERT( serial_team->t.t_threads[0] == this_thr );
1272  KMP_DEBUG_ASSERT( this_thr->th.th_team != serial_team );
1273  serial_team->t.t_ident = loc;
1274  serial_team->t.t_serialized = 1;
1275  serial_team->t.t_nproc = 1;
1276  serial_team->t.t_parent = this_thr->th.th_team;
1277  serial_team->t.t_sched = this_thr->th.th_team->t.t_sched;
1278  this_thr->th.th_team = serial_team;
1279  serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1280 
1281  KF_TRACE( 10, ( "__kmpc_serialized_parallel: T#%d curtask=%p\n",
1282  global_tid, this_thr->th.th_current_task ) );
1283  KMP_ASSERT( this_thr->th.th_current_task->td_flags.executing == 1 );
1284  this_thr->th.th_current_task->td_flags.executing = 0;
1285 
1286  __kmp_push_current_task_to_thread( this_thr, serial_team, 0 );
1287 
1288  /* TODO: GEH: do the ICVs work for nested serialized teams? Don't we need an implicit task for
1289  each serialized task represented by team->t.t_serialized? */
1290  copy_icvs(
1291  & this_thr->th.th_current_task->td_icvs,
1292  & this_thr->th.th_current_task->td_parent->td_icvs );
1293 
1294  // Thread value exists in the nested nthreads array for the next nested level
1295  if ( __kmp_nested_nth.used && ( level + 1 < __kmp_nested_nth.used ) ) {
1296  this_thr->th.th_current_task->td_icvs.nproc = __kmp_nested_nth.nth[ level + 1 ];
1297  }
1298 
1299 #if OMP_40_ENABLED
1300  if ( __kmp_nested_proc_bind.used && ( level + 1 < __kmp_nested_proc_bind.used ) ) {
1301  this_thr->th.th_current_task->td_icvs.proc_bind
1302  = __kmp_nested_proc_bind.bind_types[ level + 1 ];
1303  }
1304 #endif /* OMP_40_ENABLED */
1305 
1306 #if USE_DEBUGGER
1307  serial_team->t.t_pkfn = (microtask_t)( ~0 ); // For the debugger.
1308 #endif
1309  this_thr->th.th_info.ds.ds_tid = 0;
1310 
1311  /* set thread cache values */
1312  this_thr->th.th_team_nproc = 1;
1313  this_thr->th.th_team_master = this_thr;
1314  this_thr->th.th_team_serialized = 1;
1315 
1316  serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1317  serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1318 
1319  propagateFPControl (serial_team);
1320 
1321  /* check if we need to allocate dispatch buffers stack */
1322  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1323  if ( !serial_team->t.t_dispatch->th_disp_buffer ) {
1324  serial_team->t.t_dispatch->th_disp_buffer = (dispatch_private_info_t *)
1325  __kmp_allocate( sizeof( dispatch_private_info_t ) );
1326  }
1327  this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1328 
1329 #if OMPT_SUPPORT
1330  ompt_parallel_id_t ompt_parallel_id = __ompt_parallel_id_new(global_tid);
1331  __ompt_team_assign_id(serial_team, ompt_parallel_id);
1332 #endif
1333 
1334  KMP_MB();
1335 
1336  } else {
1337  /* this serialized team is already being used,
1338  * that's fine, just add another nested level */
1339  KMP_DEBUG_ASSERT( this_thr->th.th_team == serial_team );
1340  KMP_DEBUG_ASSERT( serial_team->t.t_threads );
1341  KMP_DEBUG_ASSERT( serial_team->t.t_threads[0] == this_thr );
1342  ++ serial_team->t.t_serialized;
1343  this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1344 
1345  // Nested level will be an index in the nested nthreads array
1346  int level = this_thr->th.th_team->t.t_level;
1347  // Thread value exists in the nested nthreads array for the next nested level
1348  if ( __kmp_nested_nth.used && ( level + 1 < __kmp_nested_nth.used ) ) {
1349  this_thr->th.th_current_task->td_icvs.nproc = __kmp_nested_nth.nth[ level + 1 ];
1350  }
1351  serial_team->t.t_level++;
1352  KF_TRACE( 10, ( "__kmpc_serialized_parallel: T#%d increasing nesting level of serial team %p to %d\n",
1353  global_tid, serial_team, serial_team->t.t_level ) );
1354 
1355  /* allocate/push dispatch buffers stack */
1356  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1357  {
1358  dispatch_private_info_t * disp_buffer = (dispatch_private_info_t *)
1359  __kmp_allocate( sizeof( dispatch_private_info_t ) );
1360  disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1361  serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1362  }
1363  this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1364 
1365  KMP_MB();
1366  }
1367 
1368  if ( __kmp_env_consistency_check )
1369  __kmp_push_parallel( global_tid, NULL );
1370 
1371 }
1372 
1373 /* most of the work for a fork */
1374 /* return true if we really went parallel, false if serialized */
1375 int
1376 __kmp_fork_call(
1377  ident_t * loc,
1378  int gtid,
1379  enum fork_context_e call_context, // Intel, GNU, ...
1380  kmp_int32 argc,
1381 #if OMPT_SUPPORT
1382  void *unwrapped_task,
1383 #endif
1384  microtask_t microtask,
1385  launch_t invoker,
1386 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
1387 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1388  va_list * ap
1389 #else
1390  va_list ap
1391 #endif
1392  )
1393 {
1394  void **argv;
1395  int i;
1396  int master_tid;
1397  int master_this_cons;
1398  kmp_team_t *team;
1399  kmp_team_t *parent_team;
1400  kmp_info_t *master_th;
1401  kmp_root_t *root;
1402  int nthreads;
1403  int master_active;
1404  int master_set_numthreads;
1405  int level;
1406 #if OMP_40_ENABLED
1407  int active_level;
1408  int teams_level;
1409 #endif
1410 #if KMP_NESTED_HOT_TEAMS
1411  kmp_hot_team_ptr_t **p_hot_teams;
1412 #endif
1413  { // KMP_TIME_BLOCK
1414  KMP_TIME_DEVELOPER_BLOCK(KMP_fork_call);
1415  KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1416 
1417  KA_TRACE( 20, ("__kmp_fork_call: enter T#%d\n", gtid ));
1418  if ( __kmp_stkpadding > 0 && __kmp_root[gtid] != NULL ) {
1419  /* Some systems prefer the stack for the root thread(s) to start with */
1420  /* some gap from the parent stack to prevent false sharing. */
1421  void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1422  /* These 2 lines below are so this does not get optimized out */
1423  if ( __kmp_stkpadding > KMP_MAX_STKPADDING )
1424  __kmp_stkpadding += (short)((kmp_int64)dummy);
1425  }
1426 
1427  /* initialize if needed */
1428  KMP_DEBUG_ASSERT( __kmp_init_serial ); // AC: potentially unsafe, not in sync with shutdown
1429  if( ! TCR_4(__kmp_init_parallel) )
1430  __kmp_parallel_initialize();
1431 
1432  /* setup current data */
1433  master_th = __kmp_threads[ gtid ]; // AC: potentially unsafe, not in sync with shutdown
1434  parent_team = master_th->th.th_team;
1435  master_tid = master_th->th.th_info.ds.ds_tid;
1436  master_this_cons = master_th->th.th_local.this_construct;
1437  root = master_th->th.th_root;
1438  master_active = root->r.r_active;
1439  master_set_numthreads = master_th->th.th_set_nproc;
1440 
1441 #if OMPT_SUPPORT
1442  ompt_parallel_id_t ompt_parallel_id;
1443  ompt_task_id_t ompt_task_id;
1444  ompt_frame_t *ompt_frame;
1445  ompt_task_id_t my_task_id;
1446  ompt_parallel_id_t my_parallel_id;
1447 
1448  if (ompt_enabled) {
1449  ompt_parallel_id = __ompt_parallel_id_new(gtid);
1450  ompt_task_id = __ompt_get_task_id_internal(0);
1451  ompt_frame = __ompt_get_task_frame_internal(0);
1452  }
1453 #endif
1454 
1455  // Nested level will be an index in the nested nthreads array
1456  level = parent_team->t.t_level;
1457  active_level = parent_team->t.t_active_level; // used to launch non-serial teams even if nested parallelism is not allowed
1458 #if OMP_40_ENABLED
1459  teams_level = master_th->th.th_teams_level; // needed to check nesting inside the teams
1460 #endif
1461 #if KMP_NESTED_HOT_TEAMS
1462  p_hot_teams = &master_th->th.th_hot_teams;
1463  if( *p_hot_teams == NULL && __kmp_hot_teams_max_level > 0 ) {
1464  *p_hot_teams = (kmp_hot_team_ptr_t*)__kmp_allocate(
1465  sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1466  (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1467  (*p_hot_teams)[0].hot_team_nth = 1; // it is either actual or not needed (when active_level > 0)
1468  }
1469 #endif
1470 
1471 #if OMPT_SUPPORT
1472  if (ompt_enabled &&
1473  ompt_callbacks.ompt_callback(ompt_event_parallel_begin)) {
1474  int team_size = master_set_numthreads;
1475 
1476  ompt_callbacks.ompt_callback(ompt_event_parallel_begin)(
1477  ompt_task_id, ompt_frame, ompt_parallel_id,
1478  team_size, unwrapped_task, OMPT_INVOKER(call_context));
1479  }
1480 #endif
1481 
1482  master_th->th.th_ident = loc;
1483 
1484 #if OMP_40_ENABLED
1485  if ( master_th->th.th_teams_microtask &&
1486  ap && microtask != (microtask_t)__kmp_teams_master && level == teams_level ) {
1487  // AC: This is the start of a parallel region nested inside a teams construct.
1488  // The team is actual (hot), all workers are ready at the fork barrier.
1489  // No lock needed to initialize the team a bit, then free workers.
1490  parent_team->t.t_ident = loc;
1491  __kmp_alloc_argv_entries( argc, parent_team, TRUE );
1492  parent_team->t.t_argc = argc;
1493  argv = (void**)parent_team->t.t_argv;
1494  for( i=argc-1; i >= 0; --i )
1495 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
1496 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1497  *argv++ = va_arg( *ap, void * );
1498 #else
1499  *argv++ = va_arg( ap, void * );
1500 #endif
1501  /* Increment our nested depth levels, but do not increase the serialization */
1502  if ( parent_team == master_th->th.th_serial_team ) {
1503  // AC: we are in serialized parallel
1504  __kmpc_serialized_parallel(loc, gtid);
1505  KMP_DEBUG_ASSERT( parent_team->t.t_serialized > 1 );
1506  parent_team->t.t_serialized--; // AC: need this so that enquiry functions
1507  // work correctly; will restore at join time
1508 
1509 #if OMPT_SUPPORT
1510  void *dummy;
1511  void **exit_runtime_p;
1512 
1513  ompt_lw_taskteam_t lw_taskteam;
1514 
1515  if (ompt_enabled) {
1516  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1517  unwrapped_task, ompt_parallel_id);
1518  lw_taskteam.ompt_task_info.task_id = __ompt_task_id_new(gtid);
1519  exit_runtime_p = &(lw_taskteam.ompt_task_info.frame.exit_runtime_frame);
1520 
1521  __ompt_lw_taskteam_link(&lw_taskteam, master_th);
1522 
1523 #if OMPT_TRACE
1524  /* OMPT implicit task begin */
1525  my_task_id = lw_taskteam.ompt_task_info.task_id;
1526  my_parallel_id = parent_team->t.ompt_team_info.parallel_id;
1527  if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)) {
1528  ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)(
1529  my_parallel_id, my_task_id);
1530  }
1531 #endif
1532 
1533  /* OMPT state */
1534  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1535  } else {
1536  exit_runtime_p = &dummy;
1537  }
1538 #endif
1539 
1540  {
1541  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1542  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1543  __kmp_invoke_microtask( microtask, gtid, 0, argc, parent_team->t.t_argv
1544 #if OMPT_SUPPORT
1545  , exit_runtime_p
1546 #endif
1547  );
1548  }
1549 
1550 #if OMPT_SUPPORT
1551  if (ompt_enabled) {
1552 #if OMPT_TRACE
1553  lw_taskteam.ompt_task_info.frame.exit_runtime_frame = 0;
1554 
1555  if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
1556  ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
1557  ompt_parallel_id, ompt_task_id);
1558  }
1559 
1560  __ompt_lw_taskteam_unlink(master_th);
1561  // reset/clear the task id only after unlinking the task
1562  lw_taskteam.ompt_task_info.task_id = ompt_task_id_none;
1563 #endif
1564 
1565  if (ompt_callbacks.ompt_callback(ompt_event_parallel_end)) {
1566  ompt_callbacks.ompt_callback(ompt_event_parallel_end)(
1567  ompt_parallel_id, ompt_task_id,
1568  OMPT_INVOKER(call_context));
1569  }
1570  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1571  }
1572 #endif
1573  return TRUE;
1574  }
1575 
1576  parent_team->t.t_pkfn = microtask;
1577 #if OMPT_SUPPORT
1578  parent_team->t.ompt_team_info.microtask = unwrapped_task;
1579 #endif
1580  parent_team->t.t_invoke = invoker;
1581  KMP_TEST_THEN_INC32( (kmp_int32*) &root->r.r_in_parallel );
1582  parent_team->t.t_active_level ++;
1583  parent_team->t.t_level ++;
1584 
1585  /* Change number of threads in the team if requested */
1586  if ( master_set_numthreads ) { // The parallel has num_threads clause
1587  if ( master_set_numthreads < master_th->th.th_teams_size.nth ) {
1588  // AC: we can only reduce the number of threads dynamically; we cannot increase it
1589  kmp_info_t **other_threads = parent_team->t.t_threads;
1590  parent_team->t.t_nproc = master_set_numthreads;
1591  for ( i = 0; i < master_set_numthreads; ++i ) {
1592  other_threads[i]->th.th_team_nproc = master_set_numthreads;
1593  }
1594  // Keep extra threads hot in the team for possible next parallels
1595  }
1596  master_th->th.th_set_nproc = 0;
1597  }
1598 
1599 #if USE_DEBUGGER
1600  if ( __kmp_debugging ) { // Let debugger override number of threads.
1601  int nth = __kmp_omp_num_threads( loc );
1602  if ( nth > 0 ) { // 0 means debugger does not want to change number of threads.
1603  master_set_numthreads = nth;
1604  }; // if
1605  }; // if
1606 #endif
1607 
1608  KF_TRACE( 10, ( "__kmp_fork_call: before internal fork: root=%p, team=%p, master_th=%p, gtid=%d\n", root, parent_team, master_th, gtid ) );
1609  __kmp_internal_fork( loc, gtid, parent_team );
1610  KF_TRACE( 10, ( "__kmp_fork_call: after internal fork: root=%p, team=%p, master_th=%p, gtid=%d\n", root, parent_team, master_th, gtid ) );
1611 
1612  /* Invoke microtask for MASTER thread */
1613  KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n",
1614  gtid, parent_team->t.t_id, parent_team->t.t_pkfn ) );
1615 
1616  {
1617  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1618  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1619  if (! parent_team->t.t_invoke( gtid )) {
1620  KMP_ASSERT2( 0, "cannot invoke microtask for MASTER thread" );
1621  }
1622  }
1623  KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n",
1624  gtid, parent_team->t.t_id, parent_team->t.t_pkfn ) );
1625  KMP_MB(); /* Flush all pending memory write invalidates. */
1626 
1627  KA_TRACE( 20, ("__kmp_fork_call: parallel exit T#%d\n", gtid ));
1628 
1629  return TRUE;
1630  } // Parallel closely nested in teams construct
1631 #endif /* OMP_40_ENABLED */
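 /* Summary of the branch above: a parallel nested directly inside a teams
    construct re-uses the existing (hot) parent team. If that team is currently
    serialized the microtask is invoked inline; otherwise the argument vector
    and level counters are refreshed, the workers already waiting at the fork
    barrier are released by __kmp_internal_fork(), and the master invokes the
    microtask itself before returning TRUE. */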
1632 
1633 #if KMP_DEBUG
1634  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
1635  KMP_DEBUG_ASSERT(master_th->th.th_task_team == parent_team->t.t_task_team[master_th->th.th_task_state]);
1636  }
1637 #endif
1638 
1639  if ( parent_team->t.t_active_level >= master_th->th.th_current_task->td_icvs.max_active_levels ) {
1640  nthreads = 1;
1641  } else {
1642 #if OMP_40_ENABLED
1643  int enter_teams = ((ap==NULL && active_level==0)||(ap && teams_level>0 && teams_level==level));
1644 #endif
1645  nthreads = master_set_numthreads ?
1646  master_set_numthreads : get__nproc_2( parent_team, master_tid ); // TODO: get nproc directly from current task
1647 
1648  // Check if we need to take the forkjoin lock (no need for a serialized parallel outside of a teams construct).
1649  // This code was moved here from __kmp_reserve_threads() to speed up nested serialized parallels.
1650  if (nthreads > 1) {
1651  if ( ( !get__nested(master_th) && (root->r.r_in_parallel
1652 #if OMP_40_ENABLED
1653  && !enter_teams
1654 #endif /* OMP_40_ENABLED */
1655  ) ) || ( __kmp_library == library_serial ) ) {
1656  KC_TRACE( 10, ( "__kmp_fork_call: T#%d serializing team; requested %d threads\n",
1657  gtid, nthreads ));
1658  nthreads = 1;
1659  }
1660  }
1661  if ( nthreads > 1 ) {
1662  /* determine how many new threads we can use */
1663  __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
1664 
1665  nthreads = __kmp_reserve_threads(root, parent_team, master_tid, nthreads
1666 #if OMP_40_ENABLED
1667 /* AC: If we execute teams from a parallel region (on the host), then the teams should be created,
1668  but each can have only 1 thread if nesting is disabled. If teams is called from a serial region,
1669  then the teams and their threads should be created regardless of the nesting setting. */
1670  , enter_teams
1671 #endif /* OMP_40_ENABLED */
1672  );
1673  if ( nthreads == 1 ) {
1674  // Free the lock for single-thread execution here;
1675  // for multi-thread execution it will be freed later,
1676  // after the team of threads has been created and initialized
1677  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
1678  }
1679  }
1680  }
1681  KMP_DEBUG_ASSERT( nthreads > 0 );
1682 
1683  /* If we temporarily changed the set number of threads then restore it now */
1684  master_th->th.th_set_nproc = 0;
1685 
1686  /* create a serialized parallel region? */
1687  if ( nthreads == 1 ) {
1688  /* josh todo: hypothetical question: what do we do for OS X*? */
1689 #if KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1690  void * args[ argc ];
1691 #else
1692  void * * args = (void**) KMP_ALLOCA( argc * sizeof( void * ) );
1693 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) */
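 /* For the serialized case the argument vector lives on the stack: a C99
    variable-length array where the build supports it, KMP_ALLOCA otherwise.
    The storage only needs to outlive the inline microtask invocation below. */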
1694 
1695  KA_TRACE( 20, ("__kmp_fork_call: T#%d serializing parallel region\n", gtid ));
1696 
1697  __kmpc_serialized_parallel(loc, gtid);
1698 
1699  if ( call_context == fork_context_intel ) {
1700  /* TODO this sucks, use the compiler itself to pass args! :) */
1701  master_th->th.th_serial_team->t.t_ident = loc;
1702 #if OMP_40_ENABLED
1703  if ( !ap ) {
1704  // revert change made in __kmpc_serialized_parallel()
1705  master_th->th.th_serial_team->t.t_level--;
1706  // Get args from parent team for teams construct
1707 
1708 #if OMPT_SUPPORT
1709  void *dummy;
1710  void **exit_runtime_p;
1711 
1712  ompt_lw_taskteam_t lw_taskteam;
1713 
1714  if (ompt_enabled) {
1715  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1716  unwrapped_task, ompt_parallel_id);
1717  lw_taskteam.ompt_task_info.task_id = __ompt_task_id_new(gtid);
1718  exit_runtime_p = &(lw_taskteam.ompt_task_info.frame.exit_runtime_frame);
1719 
1720  __ompt_lw_taskteam_link(&lw_taskteam, master_th);
1721 
1722 #if OMPT_TRACE
1723  my_task_id = lw_taskteam.ompt_task_info.task_id;
1724  if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)) {
1725  ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)(
1726  ompt_parallel_id, my_task_id);
1727  }
1728 #endif
1729 
1730  /* OMPT state */
1731  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1732  } else {
1733  exit_runtime_p = &dummy;
1734  }
1735 #endif
1736 
1737  {
1738  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1739  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1740  __kmp_invoke_microtask( microtask, gtid, 0, argc, parent_team->t.t_argv
1741 #if OMPT_SUPPORT
1742  , exit_runtime_p
1743 #endif
1744  );
1745  }
1746 
1747 #if OMPT_SUPPORT
1748  if (ompt_enabled) {
1749  lw_taskteam.ompt_task_info.frame.exit_runtime_frame = 0;
1750 
1751 #if OMPT_TRACE
1752  if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
1753  ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
1754  ompt_parallel_id, ompt_task_id);
1755  }
1756 #endif
1757 
1758  __ompt_lw_taskteam_unlink(master_th);
1759  // reset/clear the task id only after unlinking the task
1760  lw_taskteam.ompt_task_info.task_id = ompt_task_id_none;
1761 
1762  if (ompt_callbacks.ompt_callback(ompt_event_parallel_end)) {
1763  ompt_callbacks.ompt_callback(ompt_event_parallel_end)(
1764  ompt_parallel_id, ompt_task_id,
1765  OMPT_INVOKER(call_context));
1766  }
1767  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1768  }
1769 #endif
1770  } else if ( microtask == (microtask_t)__kmp_teams_master ) {
1771  KMP_DEBUG_ASSERT( master_th->th.th_team == master_th->th.th_serial_team );
1772  team = master_th->th.th_team;
1773  //team->t.t_pkfn = microtask;
1774  team->t.t_invoke = invoker;
1775  __kmp_alloc_argv_entries( argc, team, TRUE );
1776  team->t.t_argc = argc;
1777  argv = (void**) team->t.t_argv;
1778  if ( ap ) {
1779  for( i=argc-1; i >= 0; --i )
1780 // TODO: revert workaround for Intel(R) 64 tracker #96
1781 # if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1782  *argv++ = va_arg( *ap, void * );
1783 # else
1784  *argv++ = va_arg( ap, void * );
1785 # endif
1786  } else {
1787  for( i=0; i < argc; ++i )
1788  // Get args from parent team for teams construct
1789  argv[i] = parent_team->t.t_argv[i];
1790  }
1791  // AC: revert change made in __kmpc_serialized_parallel()
1792  // because initial code in teams should have level=0
1793  team->t.t_level--;
1794  // AC: call special invoker for outer "parallel" of the teams construct
1795  {
1796  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1797  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1798  invoker(gtid);
1799  }
1800  } else {
1801 #endif /* OMP_40_ENABLED */
1802  argv = args;
1803  for( i=argc-1; i >= 0; --i )
1804 // TODO: revert workaround for Intel(R) 64 tracker #96
1805 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1806  *argv++ = va_arg( *ap, void * );
1807 #else
1808  *argv++ = va_arg( ap, void * );
1809 #endif
1810  KMP_MB();
1811 
1812 #if OMPT_SUPPORT
1813  void *dummy;
1814  void **exit_runtime_p;
1815 
1816  ompt_lw_taskteam_t lw_taskteam;
1817 
1818  if (ompt_enabled) {
1819  __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1820  unwrapped_task, ompt_parallel_id);
1821  lw_taskteam.ompt_task_info.task_id = __ompt_task_id_new(gtid);
1822  exit_runtime_p = &(lw_taskteam.ompt_task_info.frame.exit_runtime_frame);
1823 
1824  __ompt_lw_taskteam_link(&lw_taskteam, master_th);
1825 
1826 #if OMPT_TRACE
1827  /* OMPT implicit task begin */
1828  my_task_id = lw_taskteam.ompt_task_info.task_id;
1829  my_parallel_id = ompt_parallel_id;
1830  if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)) {
1831  ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)(
1832  my_parallel_id, my_task_id);
1833  }
1834 #endif
1835 
1836  /* OMPT state */
1837  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1838  } else {
1839  exit_runtime_p = &dummy;
1840  }
1841 #endif
1842 
1843  {
1844  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1845  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1846  __kmp_invoke_microtask( microtask, gtid, 0, argc, args
1847 #if OMPT_SUPPORT
1848  , exit_runtime_p
1849 #endif
1850  );
1851  }
1852 
1853 #if OMPT_SUPPORT
1854  if (ompt_enabled) {
1855 #if OMPT_TRACE
1856  lw_taskteam.ompt_task_info.frame.exit_runtime_frame = 0;
1857 
1858  if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
1859  ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
1860  my_parallel_id, my_task_id);
1861  }
1862 #endif
1863 
1864  __ompt_lw_taskteam_unlink(master_th);
1865  // reset/clear the task id only after unlinking the task
1866  lw_taskteam.ompt_task_info.task_id = ompt_task_id_none;
1867 
1868  if (ompt_callbacks.ompt_callback(ompt_event_parallel_end)) {
1869  ompt_callbacks.ompt_callback(ompt_event_parallel_end)(
1870  ompt_parallel_id, ompt_task_id,
1871  OMPT_INVOKER(call_context));
1872  }
1873  master_th->th.ompt_thread_info.state = ompt_state_overhead;
1874  }
1875 #endif
1876 #if OMP_40_ENABLED
1877  }
1878 #endif /* OMP_40_ENABLED */
1879  }
1880  else if ( call_context == fork_context_gnu ) {
1881 #if OMPT_SUPPORT
1882  ompt_lw_taskteam_t *lwt = (ompt_lw_taskteam_t *)
1883  __kmp_allocate(sizeof(ompt_lw_taskteam_t));
1884  __ompt_lw_taskteam_init(lwt, master_th, gtid,
1885  unwrapped_task, ompt_parallel_id);
1886 
1887  lwt->ompt_task_info.task_id = __ompt_task_id_new(gtid);
1888  lwt->ompt_task_info.frame.exit_runtime_frame = 0;
1889  __ompt_lw_taskteam_link(lwt, master_th);
1890 #endif
1891 
1892  // we were called from GNU native code
1893  KA_TRACE( 20, ("__kmp_fork_call: T#%d serial exit\n", gtid ));
1894  return FALSE;
1895  }
1896  else {
1897  KMP_ASSERT2( call_context < fork_context_last, "__kmp_fork_call: unknown fork_context parameter" );
1898  }
1899 
1900 
1901  KA_TRACE( 20, ("__kmp_fork_call: T#%d serial exit\n", gtid ));
1902  KMP_MB();
1903  return FALSE;
1904  }
1905 
1906  // GEH: only modify the executing flag when not serialized;
1907  // the serialized case is handled in __kmpc_serialized_parallel
1908  KF_TRACE( 10, ( "__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, curtask=%p, curtask_max_aclevel=%d\n",
1909  parent_team->t.t_active_level, master_th, master_th->th.th_current_task,
1910  master_th->th.th_current_task->td_icvs.max_active_levels ) );
1911  // TODO: GEH - cannot do this assertion because root thread not set up as executing
1912  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
1913  master_th->th.th_current_task->td_flags.executing = 0;
1914 
1915 #if OMP_40_ENABLED
1916  if ( !master_th->th.th_teams_microtask || level > teams_level )
1917 #endif /* OMP_40_ENABLED */
1918  {
1919  /* Increment our nested depth level */
1920  KMP_TEST_THEN_INC32( (kmp_int32*) &root->r.r_in_parallel );
1921  }
1922 
1923  // See if we need to make a copy of the ICVs.
1924  int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
1925  if ((level+1 < __kmp_nested_nth.used) && (__kmp_nested_nth.nth[level+1] != nthreads_icv)) {
1926  nthreads_icv = __kmp_nested_nth.nth[level+1];
1927  }
1928  else {
1929  nthreads_icv = 0; // don't update
1930  }
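 // For illustration (assuming the settings layer parses a nesting list such as
 // OMP_NUM_THREADS=8,2 into __kmp_nested_nth.nth = {8, 2}): a parallel forked
 // at level 0 would pick up nthreads_icv = 2 here for its level-1 children,
 // while a flat setting leaves nthreads_icv = 0 and the ICV is not updated.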
1931 
1932 #if OMP_40_ENABLED
1933  // Figure out the proc_bind_policy for the new team.
1934  kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1935  kmp_proc_bind_t proc_bind_icv = proc_bind_default; // proc_bind_default means don't update
1936  if ( master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false ) {
1937  proc_bind = proc_bind_false;
1938  }
1939  else {
1940  if (proc_bind == proc_bind_default) {
1941  // No proc_bind clause specified; use current proc-bind-var for this parallel region
1942  proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1943  }
1944  /* else: The proc_bind policy was specified explicitly on parallel clause. This
1945  overrides proc-bind-var for this parallel region, but does not change proc-bind-var. */
1946  // Figure the value of proc-bind-var for the child threads.
1947  if ((level+1 < __kmp_nested_proc_bind.used)
1948  && (__kmp_nested_proc_bind.bind_types[level+1] != master_th->th.th_current_task->td_icvs.proc_bind)) {
1949  proc_bind_icv = __kmp_nested_proc_bind.bind_types[level+1];
1950  }
1951  }
1952 
1953  // Reset for next parallel region
1954  master_th->th.th_set_proc_bind = proc_bind_default;
1955 #endif /* OMP_40_ENABLED */
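 // For illustration (assuming OMP_PROC_BIND=spread,close is parsed into
 // __kmp_nested_proc_bind.bind_types = {spread, close}): a top-level parallel
 // without a proc_bind clause binds with "spread" while proc_bind_icv hands
 // "close" down to its children; an explicit proc_bind clause overrides the
 // policy for this region only, as noted above.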
1956 
1957  if ((nthreads_icv > 0)
1958 #if OMP_40_ENABLED
1959  || (proc_bind_icv != proc_bind_default)
1960 #endif /* OMP_40_ENABLED */
1961  ) {
1962  kmp_internal_control_t new_icvs;
1963  copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
1964  new_icvs.next = NULL;
1965  if (nthreads_icv > 0) {
1966  new_icvs.nproc = nthreads_icv;
1967  }
1968 
1969 #if OMP_40_ENABLED
1970  if (proc_bind_icv != proc_bind_default) {
1971  new_icvs.proc_bind = proc_bind_icv;
1972  }
1973 #endif /* OMP_40_ENABLED */
1974 
1975  /* allocate a new parallel team */
1976  KF_TRACE( 10, ( "__kmp_fork_call: before __kmp_allocate_team\n" ) );
1977  team = __kmp_allocate_team(root, nthreads, nthreads,
1978 #if OMPT_SUPPORT
1979  ompt_parallel_id,
1980 #endif
1981 #if OMP_40_ENABLED
1982  proc_bind,
1983 #endif
1984  &new_icvs, argc USE_NESTED_HOT_ARG(master_th) );
1985  } else {
1986  /* allocate a new parallel team */
1987  KF_TRACE( 10, ( "__kmp_fork_call: before __kmp_allocate_team\n" ) );
1988  team = __kmp_allocate_team(root, nthreads, nthreads,
1989 #if OMPT_SUPPORT
1990  ompt_parallel_id,
1991 #endif
1992 #if OMP_40_ENABLED
1993  proc_bind,
1994 #endif
1995  &master_th->th.th_current_task->td_icvs, argc
1996  USE_NESTED_HOT_ARG(master_th) );
1997  }
1998  KF_TRACE( 10, ( "__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team ) );
1999 
2000  /* setup the new team */
2001  KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2002  KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2003  KMP_CHECK_UPDATE(team->t.t_ident, loc);
2004  KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2005  KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2006 #if OMPT_SUPPORT
2007  KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.microtask, unwrapped_task);
2008 #endif
2009  KMP_CHECK_UPDATE(team->t.t_invoke, invoker); /* TODO move this to root, maybe */
2010  // TODO: parent_team->t.t_level == INT_MAX ???
2011 #if OMP_40_ENABLED
2012  if ( !master_th->th.th_teams_microtask || level > teams_level ) {
2013 #endif /* OMP_40_ENABLED */
2014  int new_level = parent_team->t.t_level + 1;
2015  KMP_CHECK_UPDATE(team->t.t_level, new_level);
2016  new_level = parent_team->t.t_active_level + 1;
2017  KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2018 #if OMP_40_ENABLED
2019  } else {
2020  // AC: Do not increase parallel level at start of the teams construct
2021  int new_level = parent_team->t.t_level;
2022  KMP_CHECK_UPDATE(team->t.t_level, new_level);
2023  new_level = parent_team->t.t_active_level;
2024  KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2025  }
2026 #endif /* OMP_40_ENABLED */
2027  kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2028  if (team->t.t_sched.r_sched_type != new_sched.r_sched_type || team->t.t_sched.chunk != new_sched.chunk)
2029  team->t.t_sched = new_sched; // set master's schedule as new run-time schedule
2030 
2031 #if OMP_40_ENABLED
2032  KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2033 #endif
2034 
2035  // Update the floating point rounding in the team if required.
2036  propagateFPControl(team);
2037 
2038  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
2039  // Set master's task team to the team's task team. Unless this is a hot team, it should be NULL.
2040 #if 0
2041  // Patch out an assertion that trips while the runtime seems to operate correctly.
2042  // Avoiding the preconditions that cause the assertion to trip has been promised as a forthcoming patch.
2043  KMP_DEBUG_ASSERT(master_th->th.th_task_team == parent_team->t.t_task_team[master_th->th.th_task_state]);
2044 #endif
2045  KA_TRACE( 20, ( "__kmp_fork_call: Master T#%d pushing task_team %p / team %p, new task_team %p / team %p\n",
2046  __kmp_gtid_from_thread( master_th ), master_th->th.th_task_team,
2047  parent_team, team->t.t_task_team[master_th->th.th_task_state], team ) );
2048 
2049  if ( active_level || master_th->th.th_task_team ) {
2050  // Take a memo of master's task_state
2051  KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2052  if (master_th->th.th_task_state_top >= master_th->th.th_task_state_stack_sz) { // increase size
2053  kmp_uint32 new_size = 2*master_th->th.th_task_state_stack_sz;
2054  kmp_uint8 *old_stack, *new_stack;
2055  kmp_uint32 i;
2056  new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2057  for (i=0; i<master_th->th.th_task_state_stack_sz; ++i) {
2058  new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2059  }
2060  for (i=master_th->th.th_task_state_stack_sz; i<new_size; ++i) { // zero-init rest of stack
2061  new_stack[i] = 0;
2062  }
2063  old_stack = master_th->th.th_task_state_memo_stack;
2064  master_th->th.th_task_state_memo_stack = new_stack;
2065  master_th->th.th_task_state_stack_sz = new_size;
2066  __kmp_free(old_stack);
2067  }
2068  // Store master's task_state on stack
2069  master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] = master_th->th.th_task_state;
2070  master_th->th.th_task_state_top++;
2071 #if KMP_NESTED_HOT_TEAMS
2072  if (team == master_th->th.th_hot_teams[active_level].hot_team) { // Restore master's nested state if this is a nested hot team
2073  master_th->th.th_task_state = master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top];
2074  }
2075  else {
2076 #endif
2077  master_th->th.th_task_state = 0;
2078 #if KMP_NESTED_HOT_TEAMS
2079  }
2080 #endif
2081  }
2082 #if !KMP_NESTED_HOT_TEAMS
2083  KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) || (team == root->r.r_hot_team));
2084 #endif
2085  }
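 /* The block above preserves the master's task_state across the fork: the
    current value is pushed onto th_task_state_memo_stack (grown by doubling
    when full) and then either reset to 0 for a fresh team or, for a nested hot
    team, replaced by the state memoized for that level. The matching pop and
    restore happen on the way back in __kmp_join_call(). */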
2086 
2087  KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2088  gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id, team->t.t_nproc ));
2089  KMP_DEBUG_ASSERT( team != root->r.r_hot_team ||
2090  ( team->t.t_master_tid == 0 &&
2091  ( team->t.t_parent == root->r.r_root_team || team->t.t_parent->t.t_serialized ) ));
2092  KMP_MB();
2093 
2094  /* now, setup the arguments */
2095  argv = (void**)team->t.t_argv;
2096 #if OMP_40_ENABLED
2097  if ( ap ) {
2098 #endif /* OMP_40_ENABLED */
2099  for ( i=argc-1; i >= 0; --i ) {
2100 // TODO: revert workaround for Intel(R) 64 tracker #96
2101 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
2102  void *new_argv = va_arg(*ap, void *);
2103 #else
2104  void *new_argv = va_arg(ap, void *);
2105 #endif
2106  KMP_CHECK_UPDATE(*argv, new_argv);
2107  argv++;
2108  }
2109 #if OMP_40_ENABLED
2110  } else {
2111  for ( i=0; i < argc; ++i ) {
2112  // Get args from parent team for teams construct
2113  KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2114  }
2115  }
2116 #endif /* OMP_40_ENABLED */
2117 
2118  /* now actually fork the threads */
2119  KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2120  if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2121  root->r.r_active = TRUE;
2122 
2123  __kmp_fork_team_threads( root, team, master_th, gtid );
2124  __kmp_setup_icv_copy( team, nthreads, &master_th->th.th_current_task->td_icvs, loc );
2125 
2126 #if OMPT_SUPPORT
2127  master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2128 #endif
2129 
2130  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
2131 
2132 #if USE_ITT_BUILD
2133  if ( team->t.t_active_level == 1 // only report frames at level 1
2134 # if OMP_40_ENABLED
2135  && !master_th->th.th_teams_microtask // not in teams construct
2136 # endif /* OMP_40_ENABLED */
2137  ) {
2138 #if USE_ITT_NOTIFY
2139  if ( ( __itt_frame_submit_v3_ptr || KMP_ITT_DEBUG ) &&
2140  ( __kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 1 ) )
2141  {
2142  kmp_uint64 tmp_time = 0;
2143  if ( __itt_get_timestamp_ptr )
2144  tmp_time = __itt_get_timestamp();
2145  // Internal fork - report frame begin
2146  master_th->th.th_frame_time = tmp_time;
2147  if ( __kmp_forkjoin_frames_mode == 3 )
2148  team->t.t_region_time = tmp_time;
2149  } else // only one notification scheme (either "submit" or "forking/joined", not both)
2150 #endif /* USE_ITT_NOTIFY */
2151  if ( ( __itt_frame_begin_v3_ptr || KMP_ITT_DEBUG ) &&
2152  __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode )
2153  { // Mark start of "parallel" region for VTune.
2154  __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2155  }
2156  }
2157 #endif /* USE_ITT_BUILD */
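 /* ITT frame reporting in brief: in modes 1 and 3 the region-begin timestamp is
    captured here and submitted as a frame at join (mode 3 also records the
    team's region time); in mode 0 with __kmp_forkjoin_frames set, the region is
    instead bracketed with explicit forking/joined marks for VTune. */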
2158 
2159  /* now go on and do the work */
2160  KMP_DEBUG_ASSERT( team == __kmp_threads[gtid]->th.th_team );
2161  KMP_MB();
2162  KF_TRACE(10, ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2163  root, team, master_th, gtid));
2164 
2165 #if USE_ITT_BUILD
2166  if ( __itt_stack_caller_create_ptr ) {
2167  team->t.t_stack_id = __kmp_itt_stack_caller_create(); // create new stack stitching id before entering fork barrier
2168  }
2169 #endif /* USE_ITT_BUILD */
2170 
2171 #if OMP_40_ENABLED
2172  if ( ap ) // AC: skip __kmp_internal_fork at teams construct, let only master threads execute
2173 #endif /* OMP_40_ENABLED */
2174  {
2175  __kmp_internal_fork( loc, gtid, team );
2176  KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, master_th=%p, gtid=%d\n",
2177  root, team, master_th, gtid));
2178  }
2179 
2180  if (call_context == fork_context_gnu) {
2181  KA_TRACE( 20, ("__kmp_fork_call: parallel exit T#%d\n", gtid ));
2182  return TRUE;
2183  }
2184 
2185  /* Invoke microtask for MASTER thread */
2186  KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n",
2187  gtid, team->t.t_id, team->t.t_pkfn ) );
2188  } // END of timer KMP_fork_call block
2189 
2190  {
2191  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
2192  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
2193  // KMP_TIME_DEVELOPER_BLOCK(USER_master_invoke);
2194  if (! team->t.t_invoke( gtid )) {
2195  KMP_ASSERT2( 0, "cannot invoke microtask for MASTER thread" );
2196  }
2197  }
2198  KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n",
2199  gtid, team->t.t_id, team->t.t_pkfn ) );
2200  KMP_MB(); /* Flush all pending memory write invalidates. */
2201 
2202  KA_TRACE( 20, ("__kmp_fork_call: parallel exit T#%d\n", gtid ));
2203 
2204 #if OMPT_SUPPORT
2205  if (ompt_enabled) {
2206  master_th->th.ompt_thread_info.state = ompt_state_overhead;
2207  }
2208 #endif
2209 
2210  return TRUE;
2211 }
2212 
2213 #if OMPT_SUPPORT
2214 static inline void
2215 __kmp_join_restore_state(
2216  kmp_info_t *thread,
2217  kmp_team_t *team)
2218 {
2219  // restore state outside the region
2220  thread->th.ompt_thread_info.state = ((team->t.t_serialized) ?
2221  ompt_state_work_serial : ompt_state_work_parallel);
2222 }
2223 
2224 static inline void
2225 __kmp_join_ompt(
2226  kmp_info_t *thread,
2227  kmp_team_t *team,
2228  ompt_parallel_id_t parallel_id,
2229  fork_context_e fork_context)
2230 {
2231  if (ompt_callbacks.ompt_callback(ompt_event_parallel_end)) {
2232  ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
2233  ompt_callbacks.ompt_callback(ompt_event_parallel_end)(
2234  parallel_id, task_info->task_id, OMPT_INVOKER(fork_context));
2235  }
2236 
2237  __kmp_join_restore_state(thread,team);
2238 }
2239 #endif
2240 
2241 void
2242 __kmp_join_call(ident_t *loc, int gtid
2243 #if OMPT_SUPPORT
2244  , enum fork_context_e fork_context
2245 #endif
2246 #if OMP_40_ENABLED
2247  , int exit_teams
2248 #endif /* OMP_40_ENABLED */
2249 )
2250 {
2251  KMP_TIME_DEVELOPER_BLOCK(KMP_join_call);
2252  kmp_team_t *team;
2253  kmp_team_t *parent_team;
2254  kmp_info_t *master_th;
2255  kmp_root_t *root;
2256  int master_active;
2257  int i;
2258 
2259  KA_TRACE( 20, ("__kmp_join_call: enter T#%d\n", gtid ));
2260 
2261  /* setup current data */
2262  master_th = __kmp_threads[ gtid ];
2263  root = master_th->th.th_root;
2264  team = master_th->th.th_team;
2265  parent_team = team->t.t_parent;
2266 
2267  master_th->th.th_ident = loc;
2268 
2269 #if OMPT_SUPPORT
2270  if (ompt_enabled) {
2271  master_th->th.ompt_thread_info.state = ompt_state_overhead;
2272  }
2273 #endif
2274 
2275 #if KMP_DEBUG
2276  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
2277  KA_TRACE( 20, ( "__kmp_join_call: T#%d, old team = %p old task_team = %p, th_task_team = %p\n",
2278  __kmp_gtid_from_thread( master_th ), team,
2279  team->t.t_task_team[master_th->th.th_task_state], master_th->th.th_task_team) );
2280  KMP_DEBUG_ASSERT( master_th->th.th_task_team == team->t.t_task_team[master_th->th.th_task_state] );
2281  }
2282 #endif
2283 
2284  if( team->t.t_serialized ) {
2285 #if OMP_40_ENABLED
2286  if ( master_th->th.th_teams_microtask ) {
2287  // We are in teams construct
2288  int level = team->t.t_level;
2289  int tlevel = master_th->th.th_teams_level;
2290  if ( level == tlevel ) {
2291  // AC: we haven't incremented it earlier at the start of the teams construct,
2292  // so do it here, at the end of the teams construct
2293  team->t.t_level++;
2294  } else if ( level == tlevel + 1 ) {
2295  // AC: we are exiting a parallel inside teams; we need to increment serialization
2296  // in order to restore it in the next call to __kmpc_end_serialized_parallel
2297  team->t.t_serialized++;
2298  }
2299  }
2300 #endif /* OMP_40_ENABLED */
2301  __kmpc_end_serialized_parallel( loc, gtid );
2302 
2303 #if OMPT_SUPPORT
2304  if (ompt_enabled) {
2305  __kmp_join_restore_state(master_th, parent_team);
2306  }
2307 #endif
2308 
2309  return;
2310  }
2311 
2312  master_active = team->t.t_master_active;
2313 
2314 #if OMP_40_ENABLED
2315  if (!exit_teams)
2316 #endif /* OMP_40_ENABLED */
2317  {
2318  // AC: No barrier for internal teams at exit from the teams construct,
2319  // but there is a barrier for the external team (league).
2320  __kmp_internal_join( loc, gtid, team );
2321  }
2322 #if OMP_40_ENABLED
2323  else {
2324  master_th->th.th_task_state = 0; // AC: no tasking in teams (out of any parallel)
2325  }
2326 #endif /* OMP_40_ENABLED */
2327 
2328  KMP_MB();
2329 
2330 #if OMPT_SUPPORT
2331  ompt_parallel_id_t parallel_id = team->t.ompt_team_info.parallel_id;
2332 #endif
2333 
2334 #if USE_ITT_BUILD
2335  if ( __itt_stack_caller_create_ptr ) {
2336  __kmp_itt_stack_caller_destroy( (__itt_caller)team->t.t_stack_id ); // destroy the stack stitching id after join barrier
2337  }
2338 
2339  // Mark end of "parallel" region for VTune.
2340  if ( team->t.t_active_level == 1
2341 # if OMP_40_ENABLED
2342  && !master_th->th.th_teams_microtask /* not in teams construct */
2343 # endif /* OMP_40_ENABLED */
2344  ) {
2345  master_th->th.th_ident = loc;
2346  // only one notification scheme (either "submit" or "forking/joined", not both)
2347  if ( ( __itt_frame_submit_v3_ptr || KMP_ITT_DEBUG ) && __kmp_forkjoin_frames_mode == 3 )
2348  __kmp_itt_frame_submit( gtid, team->t.t_region_time, master_th->th.th_frame_time,
2349  0, loc, master_th->th.th_team_nproc, 1 );
2350  else if ( ( __itt_frame_end_v3_ptr || KMP_ITT_DEBUG ) &&
2351  ! __kmp_forkjoin_frames_mode && __kmp_forkjoin_frames )
2352  __kmp_itt_region_joined( gtid );
2353  } // active_level == 1
2354 #endif /* USE_ITT_BUILD */
2355 
2356 #if OMP_40_ENABLED
2357  if ( master_th->th.th_teams_microtask &&
2358  !exit_teams &&
2359  team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2360  team->t.t_level == master_th->th.th_teams_level + 1 ) {
2361  // AC: We need to leave the team structure intact at the end
2362  // of a parallel inside the teams construct, so that the same
2363  // (hot) team works at the next parallel; only adjust nesting levels
2364 
2365  /* Decrement our nested depth level */
2366  team->t.t_level --;
2367  team->t.t_active_level --;
2368  KMP_TEST_THEN_DEC32( (kmp_int32*) &root->r.r_in_parallel );
2369 
2370  /* Restore number of threads in the team if needed */
2371  if ( master_th->th.th_team_nproc < master_th->th.th_teams_size.nth ) {
2372  int old_num = master_th->th.th_team_nproc;
2373  int new_num = master_th->th.th_teams_size.nth;
2374  kmp_info_t **other_threads = team->t.t_threads;
2375  team->t.t_nproc = new_num;
2376  for ( i = 0; i < old_num; ++i ) {
2377  other_threads[i]->th.th_team_nproc = new_num;
2378  }
2379  // Adjust the states of the team's unused threads
2380  for ( i = old_num; i < new_num; ++i ) {
2381  // Re-initialize thread's barrier data.
2382  int b;
2383  kmp_balign_t * balign = other_threads[i]->th.th_bar;
2384  for ( b = 0; b < bs_last_barrier; ++ b ) {
2385  balign[ b ].bb.b_arrived = team->t.t_bar[ b ].b_arrived;
2386  KMP_DEBUG_ASSERT(balign[ b ].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2387 #if USE_DEBUGGER
2388  balign[ b ].bb.b_worker_arrived = team->t.t_bar[ b ].b_team_arrived;
2389 #endif
2390  }
2391  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
2392  // Synchronize thread's task state
2393  other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2394  }
2395  }
2396  }
2397 
2398 #if OMPT_SUPPORT
2399  if (ompt_enabled) {
2400  __kmp_join_ompt(master_th, parent_team, parallel_id, fork_context);
2401  }
2402 #endif
2403 
2404  return;
2405  }
2406 #endif /* OMP_40_ENABLED */
2407 
2408  /* do cleanup and restore the parent team */
2409  master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2410  master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2411 
2412  master_th->th.th_dispatch =
2413  & parent_team->t.t_dispatch[ team->t.t_master_tid ];
2414 
2415  /* jc: The following lock has instructions with REL and ACQ semantics,
2416  separating the parallel user code called in this parallel region
2417  from the serial user code called after this function returns.
2418  */
2419  __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
2420 
2421 #if OMP_40_ENABLED
2422  if ( !master_th->th.th_teams_microtask || team->t.t_level > master_th->th.th_teams_level )
2423 #endif /* OMP_40_ENABLED */
2424  {
2425  /* Decrement our nested depth level */
2426  KMP_TEST_THEN_DEC32( (kmp_int32*) &root->r.r_in_parallel );
2427  }
2428  KMP_DEBUG_ASSERT( root->r.r_in_parallel >= 0 );
2429 
2430 #if OMPT_SUPPORT && OMPT_TRACE
2431  if(ompt_enabled){
2432  ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
2433  if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
2434  ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
2435  parallel_id, task_info->task_id);
2436  }
2437  task_info->frame.exit_runtime_frame = 0;
2438  task_info->task_id = 0;
2439  }
2440 #endif
2441 
2442  KF_TRACE( 10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n",
2443  0, master_th, team ) );
2444  __kmp_pop_current_task_from_thread( master_th );
2445 
2446 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
2447  //
2448  // Restore master thread's partition.
2449  //
2450  master_th->th.th_first_place = team->t.t_first_place;
2451  master_th->th.th_last_place = team->t.t_last_place;
2452 #endif /* OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED */
2453 
2454  updateHWFPControl (team);
2455 
2456  if ( root->r.r_active != master_active )
2457  root->r.r_active = master_active;
2458 
2459  __kmp_free_team( root, team USE_NESTED_HOT_ARG(master_th) ); // this will free worker threads
2460 
2461  /* This race was fun to find. Make sure the following is in the critical
2462  * region; otherwise assertions may fail occasionally since the old team
2463  * may be reallocated and the hierarchy appears inconsistent. It is
2464  * actually safe to run and won't cause any bugs, but will cause those
2465  * assertion failures. It's only one deref & assign, so we might as well put this
2466  * in the critical region. */
2467  master_th->th.th_team = parent_team;
2468  master_th->th.th_team_nproc = parent_team->t.t_nproc;
2469  master_th->th.th_team_master = parent_team->t.t_threads[0];
2470  master_th->th.th_team_serialized = parent_team->t.t_serialized;
2471 
2472  /* restore serialized team, if need be */
2473  if( parent_team->t.t_serialized &&
2474  parent_team != master_th->th.th_serial_team &&
2475  parent_team != root->r.r_root_team ) {
2476  __kmp_free_team( root, master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL) );
2477  master_th->th.th_serial_team = parent_team;
2478  }
2479 
2480  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
2481  if (master_th->th.th_task_state_top > 0) { // Restore task state from memo stack
2482  KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2483  // Remember master's state if we re-use this nested hot team
2484  master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] = master_th->th.th_task_state;
2485  --master_th->th.th_task_state_top; // pop
2486  // Now restore state at this level
2487  master_th->th.th_task_state = master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top];
2488  }
2489  // Copy the task team from the parent team to the master thread
2490  master_th->th.th_task_team = parent_team->t.t_task_team[master_th->th.th_task_state];
2491  KA_TRACE( 20, ( "__kmp_join_call: Master T#%d restoring task_team %p / team %p\n",
2492  __kmp_gtid_from_thread( master_th ), master_th->th.th_task_team, parent_team ) );
2493  }
2494 
2495  // TODO: GEH - cannot do this assertion because root thread not set up as executing
2496  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2497  master_th->th.th_current_task->td_flags.executing = 1;
2498 
2499  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
2500 
2501 #if OMPT_SUPPORT
2502  if (ompt_enabled) {
2503  __kmp_join_ompt(master_th, parent_team, parallel_id, fork_context);
2504  }
2505 #endif
2506 
2507  KMP_MB();
2508  KA_TRACE( 20, ("__kmp_join_call: exit T#%d\n", gtid ));
2509 }
2510 
2511 /* ------------------------------------------------------------------------ */
2512 /* ------------------------------------------------------------------------ */
2513 
2514 /* Check whether we should push an internal control record onto the
2515  serial team stack. If so, do it. */
2516 void
2517 __kmp_save_internal_controls ( kmp_info_t * thread )
2518 {
2519 
2520  if ( thread->th.th_team != thread->th.th_serial_team ) {
2521  return;
2522  }
2523  if (thread->th.th_team->t.t_serialized > 1) {
2524  int push = 0;
2525 
2526  if (thread->th.th_team->t.t_control_stack_top == NULL) {
2527  push = 1;
2528  } else {
2529  if ( thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2530  thread->th.th_team->t.t_serialized ) {
2531  push = 1;
2532  }
2533  }
2534  if (push) { /* push a record on the serial team's stack */
2535  kmp_internal_control_t * control = (kmp_internal_control_t *) __kmp_allocate(sizeof(kmp_internal_control_t));
2536 
2537  copy_icvs( control, & thread->th.th_current_task->td_icvs );
2538 
2539  control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2540 
2541  control->next = thread->th.th_team->t.t_control_stack_top;
2542  thread->th.th_team->t.t_control_stack_top = control;
2543  }
2544  }
2545 }
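/* For illustration only (hypothetical user code, with nesting disabled so the
   inner region is serialized): the first ICV change at a given serialization
   depth pushes one record, keyed by serial_nesting_level, which is used to
   restore the outer values when that serialized region unwinds:

     #pragma omp parallel num_threads(1)    // serialized
     {
       #pragma omp parallel num_threads(1)  // nested, also serialized
       {
         omp_set_num_threads(8);            // reaches __kmp_save_internal_controls()
       }                                    // outer ICVs restored at region end
     }
*/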
2546 
2547 /* Changes set_nproc */
2548 void
2549 __kmp_set_num_threads( int new_nth, int gtid )
2550 {
2551  kmp_info_t *thread;
2552  kmp_root_t *root;
2553 
2554  KF_TRACE( 10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth ));
2555  KMP_DEBUG_ASSERT( __kmp_init_serial );
2556 
2557  if (new_nth < 1)
2558  new_nth = 1;
2559  else if (new_nth > __kmp_max_nth)
2560  new_nth = __kmp_max_nth;
2561 
2562  KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2563  thread = __kmp_threads[gtid];
2564 
2565  __kmp_save_internal_controls( thread );
2566 
2567  set__nproc( thread, new_nth );
2568 
2569  //
2570  // If this omp_set_num_threads() call will cause the hot team size to be
2571  // reduced (in the absence of a num_threads clause), then reduce it now,
2572  // rather than waiting for the next parallel region.
2573  //
2574  root = thread->th.th_root;
2575  if ( __kmp_init_parallel && ( ! root->r.r_active )
2576  && ( root->r.r_hot_team->t.t_nproc > new_nth )
2577 #if KMP_NESTED_HOT_TEAMS
2578  && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2579 #endif
2580  ) {
2581  kmp_team_t *hot_team = root->r.r_hot_team;
2582  int f;
2583 
2584  __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
2585 
2586  // Release the extra threads we don't need any more.
2587  for ( f = new_nth; f < hot_team->t.t_nproc; f++ ) {
2588  KMP_DEBUG_ASSERT( hot_team->t.t_threads[f] != NULL );
2589  if ( __kmp_tasking_mode != tskm_immediate_exec) {
2590  // When decreasing team size, threads no longer in the team should unref task team.
2591  hot_team->t.t_threads[f]->th.th_task_team = NULL;
2592  }
2593  __kmp_free_thread( hot_team->t.t_threads[f] );
2594  hot_team->t.t_threads[f] = NULL;
2595  }
2596  hot_team->t.t_nproc = new_nth;
2597 #if KMP_NESTED_HOT_TEAMS
2598  if( thread->th.th_hot_teams ) {
2599  KMP_DEBUG_ASSERT( hot_team == thread->th.th_hot_teams[0].hot_team );
2600  thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2601  }
2602 #endif
2603 
2604  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
2605 
2606  //
2607  // Update the t_nproc field in the threads that are still active.
2608  //
2609  for( f=0 ; f < new_nth; f++ ) {
2610  KMP_DEBUG_ASSERT( hot_team->t.t_threads[f] != NULL );
2611  hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2612  }
2613  // Special flag set in case of an omp_set_num_threads() call
2614  hot_team->t.t_size_changed = -1;
2615  }
2616 }
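/* For context: this routine is the internal worker behind the library's
   omp_set_num_threads() entry point (the exact entry-point plumbing lives in
   the Fortran/compatibility layers, not shown here). Besides updating the
   nproc ICV it eagerly trims an idle hot team so the next parallel region does
   not have to, with t_size_changed = -1 flagging that out-of-band resize. */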
2617 
2618 /* Changes max_active_levels */
2619 void
2620 __kmp_set_max_active_levels( int gtid, int max_active_levels )
2621 {
2622  kmp_info_t *thread;
2623 
2624  KF_TRACE( 10, ( "__kmp_set_max_active_levels: new max_active_levels for thread %d = (%d)\n", gtid, max_active_levels ) );
2625  KMP_DEBUG_ASSERT( __kmp_init_serial );
2626 
2627  // validate max_active_levels
2628  if( max_active_levels < 0 ) {
2629  KMP_WARNING( ActiveLevelsNegative, max_active_levels );
2630  // We ignore this call if the user has specified a negative value.
2631  // The current setting won't be changed. The last valid setting will be used.
2632  // A warning will be issued (if warnings are allowed as controlled by the KMP_WARNINGS env var).
2633  KF_TRACE( 10, ( "__kmp_set_max_active_levels: the call is ignored: new max_active_levels for thread %d = (%d)\n", gtid, max_active_levels ) );
2634  return;
2635  }
2636  if( max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT ) {
2637  // it's OK, the max_active_levels is within the valid range: [ 0; KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2638  // We allow a zero value. (implementation defined behavior)
2639  } else {
2640  KMP_WARNING( ActiveLevelsExceedLimit, max_active_levels, KMP_MAX_ACTIVE_LEVELS_LIMIT );
2641  max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2642  // Current upper limit is MAX_INT. (implementation-defined behavior)
2643  // If the input exceeds the upper limit, we correct the input to be the upper limit. (implementation-defined behavior)
2644  // Actually, the flow should never get here as long as we use the MAX_INT limit.
2645  }
2646  KF_TRACE( 10, ( "__kmp_set_max_active_levels: after validation: new max_active_levels for thread %d = (%d)\n", gtid, max_active_levels ) );
2647 
2648  thread = __kmp_threads[ gtid ];
2649 
2650  __kmp_save_internal_controls( thread );
2651 
2652  set__max_active_levels( thread, max_active_levels );
2653 
2654 }
2655 
2656 /* Gets max_active_levels */
2657 int
2658 __kmp_get_max_active_levels( int gtid )
2659 {
2660  kmp_info_t *thread;
2661 
2662  KF_TRACE( 10, ( "__kmp_get_max_active_levels: thread %d\n", gtid ) );
2663  KMP_DEBUG_ASSERT( __kmp_init_serial );
2664 
2665  thread = __kmp_threads[ gtid ];
2666  KMP_DEBUG_ASSERT( thread->th.th_current_task );
2667  KF_TRACE( 10, ( "__kmp_get_max_active_levels: thread %d, curtask=%p, curtask_maxaclevel=%d\n",
2668  gtid, thread->th.th_current_task, thread->th.th_current_task->td_icvs.max_active_levels ) );
2669  return thread->th.th_current_task->td_icvs.max_active_levels;
2670 }
2671 
2672 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2673 void
2674 __kmp_set_schedule( int gtid, kmp_sched_t kind, int chunk )
2675 {
2676  kmp_info_t *thread;
2677 // kmp_team_t *team;
2678 
2679  KF_TRACE( 10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n", gtid, (int)kind, chunk ));
2680  KMP_DEBUG_ASSERT( __kmp_init_serial );
2681 
2682  // Check if the kind parameter is valid, correct if needed.
2683  // Valid parameters should fit in one of two intervals - standard or extended:
2684  // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2685  // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103
2686  if ( kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2687  ( kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std ) )
2688  {
2689  // TODO: Hint needs attention in case we change the default schedule.
2690  __kmp_msg(
2691  kmp_ms_warning,
2692  KMP_MSG( ScheduleKindOutOfRange, kind ),
2693  KMP_HNT( DefaultScheduleKindUsed, "static, no chunk" ),
2694  __kmp_msg_null
2695  );
2696  kind = kmp_sched_default;
2697  chunk = 0; // ignore chunk value in case of bad kind
2698  }
2699 
2700  thread = __kmp_threads[ gtid ];
2701 
2702  __kmp_save_internal_controls( thread );
2703 
2704  if ( kind < kmp_sched_upper_std ) {
2705  if ( kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK ) {
2706  // differentiate static chunked vs. unchunked:
2707  // chunk should be invalid to indicate an unchunked schedule (which is the default)
2708  thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2709  } else {
2710  thread->th.th_current_task->td_icvs.sched.r_sched_type = __kmp_sch_map[ kind - kmp_sched_lower - 1 ];
2711  }
2712  } else {
2713  // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std - kmp_sched_lower - 2 ];
2714  thread->th.th_current_task->td_icvs.sched.r_sched_type =
2715  __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std - kmp_sched_lower - 2 ];
2716  }
2717  if ( kind == kmp_sched_auto ) {
2718  // ignore parameter chunk for schedule auto
2719  thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2720  } else {
2721  thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2722  }
2723 }
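// Worked example of the mapping above, using the interval comment near the top
// of this routine and assuming kmp_sched_lower == 0, kmp_sched_upper_std == 5,
// kmp_sched_lower_ext == 100: a standard kind k in 1..4 selects
// __kmp_sch_map[k - 0 - 1], e.g. kmp_sched_dynamic (2) -> slot 1; an extended
// kind k in 101..102 selects __kmp_sch_map[k - 100 + 5 - 0 - 2], e.g. 101 -> slot 4.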
2724 
2725 /* Gets def_sched_var ICV values */
2726 void
2727 __kmp_get_schedule( int gtid, kmp_sched_t * kind, int * chunk )
2728 {
2729  kmp_info_t *thread;
2730  enum sched_type th_type;
2731 
2732  KF_TRACE( 10, ("__kmp_get_schedule: thread %d\n", gtid ));
2733  KMP_DEBUG_ASSERT( __kmp_init_serial );
2734 
2735  thread = __kmp_threads[ gtid ];
2736 
2737  th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2738 
2739  switch ( th_type ) {
2740  case kmp_sch_static:
2741  case kmp_sch_static_greedy:
2742  case kmp_sch_static_balanced:
2743  *kind = kmp_sched_static;
2744  *chunk = 0; // chunk was not set; indicate this fact via a zero value
2745  return;
2746  case kmp_sch_static_chunked:
2747  *kind = kmp_sched_static;
2748  break;
2749  case kmp_sch_dynamic_chunked:
2750  *kind = kmp_sched_dynamic;
2751  break;
2752  case kmp_sch_guided_chunked:
2753  case kmp_sch_guided_iterative_chunked:
2754  case kmp_sch_guided_analytical_chunked:
2755  *kind = kmp_sched_guided;
2756  break;
2757  case kmp_sch_auto:
2758  *kind = kmp_sched_auto;
2759  break;
2760  case kmp_sch_trapezoidal:
2761  *kind = kmp_sched_trapezoidal;
2762  break;
2763 /*
2764  case kmp_sch_static_steal:
2765  *kind = kmp_sched_static_steal;
2766  break;
2767 */
2768  default:
2769  KMP_FATAL( UnknownSchedulingType, th_type );
2770  }
2771 
2772  *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2773 }
2774 
2775 int
2776 __kmp_get_ancestor_thread_num( int gtid, int level ) {
2777 
2778  int ii, dd;
2779  kmp_team_t *team;
2780  kmp_info_t *thr;
2781 
2782  KF_TRACE( 10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level ));
2783  KMP_DEBUG_ASSERT( __kmp_init_serial );
2784 
2785  // validate level
2786  if( level == 0 ) return 0;
2787  if( level < 0 ) return -1;
2788  thr = __kmp_threads[ gtid ];
2789  team = thr->th.th_team;
2790  ii = team->t.t_level;
2791  if( level > ii ) return -1;
2792 
2793 #if OMP_40_ENABLED
2794  if( thr->th.th_teams_microtask ) {
2795  // AC: we are in a teams region where multiple nested teams have the same level
2796  int tlevel = thr->th.th_teams_level; // the level of the teams construct
2797  if( level <= tlevel ) { // otherwise usual algorithm works (will not touch the teams)
2798  KMP_DEBUG_ASSERT( ii >= tlevel );
2799  // AC: Since we need to pass through the teams league, we need to artificially increase ii
2800  if ( ii == tlevel ) {
2801  ii += 2; // three teams have same level
2802  } else {
2803  ii ++; // two teams have same level
2804  }
2805  }
2806  }
2807 #endif
2808 
2809  if( ii == level ) return __kmp_tid_from_gtid( gtid );
2810 
2811  dd = team->t.t_serialized;
2812  level++;
2813  while( ii > level )
2814  {
2815  for( dd = team->t.t_serialized; ( dd > 0 ) && ( ii > level ); dd--, ii-- )
2816  {
2817  }
2818  if( ( team->t.t_serialized ) && ( !dd ) ) {
2819  team = team->t.t_parent;
2820  continue;
2821  }
2822  if( ii > level ) {
2823  team = team->t.t_parent;
2824  dd = team->t.t_serialized;
2825  ii--;
2826  }
2827  }
2828 
2829  return ( dd > 1 ) ? ( 0 ) : ( team->t.t_master_tid );
2830 }
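/* How the walk above works: ii tracks the nesting level of the team currently
   being examined and dd counts the serialized levels collapsed into that team.
   Serialized levels are consumed in place; otherwise the walk steps to
   t_parent and decrements ii, until the requested level is reached. */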
2831 
2832 int
2833 __kmp_get_team_size( int gtid, int level ) {
2834 
2835  int ii, dd;
2836  kmp_team_t *team;
2837  kmp_info_t *thr;
2838 
2839  KF_TRACE( 10, ("__kmp_get_team_size: thread %d %d\n", gtid, level ));
2840  KMP_DEBUG_ASSERT( __kmp_init_serial );
2841 
2842  // validate level
2843  if( level == 0 ) return 1;
2844  if( level < 0 ) return -1;
2845  thr = __kmp_threads[ gtid ];
2846  team = thr->th.th_team;
2847  ii = team->t.t_level;
2848  if( level > ii ) return -1;
2849 
2850 #if OMP_40_ENABLED
2851  if( thr->th.th_teams_microtask ) {
2852  // AC: we are in a teams region where multiple nested teams have the same level
2853  int tlevel = thr->th.th_teams_level; // the level of the teams construct
2854  if( level <= tlevel ) { // otherwise usual algorithm works (will not touch the teams)
2855  KMP_DEBUG_ASSERT( ii >= tlevel );
2856  // AC: Since we need to pass through the teams league, we need to artificially increase ii
2857  if ( ii == tlevel ) {
2858  ii += 2; // three teams have same level
2859  } else {
2860  ii ++; // two teams have same level
2861  }
2862  }
2863  }
2864 #endif
2865 
2866  while( ii > level )
2867  {
2868  for( dd = team->t.t_serialized; ( dd > 0 ) && ( ii > level ); dd--, ii-- )
2869  {
2870  }
2871  if( team->t.t_serialized && ( !dd ) ) {
2872  team = team->t.t_parent;
2873  continue;
2874  }
2875  if( ii > level ) {
2876  team = team->t.t_parent;
2877  ii--;
2878  }
2879  }
2880 
2881  return team->t.t_nproc;
2882 }
2883 
2884 kmp_r_sched_t
2885 __kmp_get_schedule_global() {
2886 // This routine was created because the pairs (__kmp_sched, __kmp_chunk) and (__kmp_static, __kmp_guided)
2887 // may be changed by kmp_set_defaults independently, so one can get the updated schedule here.
2888 
2889  kmp_r_sched_t r_sched;
2890 
2891  // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static, __kmp_guided
2892  // __kmp_sched should keep original value, so that user can set KMP_SCHEDULE multiple times,
2893  // and thus have different run-time schedules in different roots (even in OMP 2.5)
2894  if ( __kmp_sched == kmp_sch_static ) {
2895  r_sched.r_sched_type = __kmp_static; // replace STATIC with more detailed schedule (balanced or greedy)
2896  } else if ( __kmp_sched == kmp_sch_guided_chunked ) {
2897  r_sched.r_sched_type = __kmp_guided; // replace GUIDED with more detailed schedule (iterative or analytical)
2898  } else {
2899  r_sched.r_sched_type = __kmp_sched; // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
2900  }
2901 
2902  if ( __kmp_chunk < KMP_DEFAULT_CHUNK ) { // __kmp_chunk may be wrong here (if it was never set)
2903  r_sched.chunk = KMP_DEFAULT_CHUNK;
2904  } else {
2905  r_sched.chunk = __kmp_chunk;
2906  }
2907 
2908  return r_sched;
2909 }
2910 
2911 /* ------------------------------------------------------------------------ */
2912 /* ------------------------------------------------------------------------ */
2913 
2914 
2915 /*
2916  * Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
2917  * at least argc *t_argv entries for the requested team.
2918  */
2919 static void
2920 __kmp_alloc_argv_entries( int argc, kmp_team_t *team, int realloc )
2921 {
2922 
2923  KMP_DEBUG_ASSERT( team );
2924  if( !realloc || argc > team->t.t_max_argc ) {
2925 
2926  KA_TRACE( 100, ( "__kmp_alloc_argv_entries: team %d: needed entries=%d, current entries=%d\n",
2927  team->t.t_id, argc, ( realloc ) ? team->t.t_max_argc : 0 ));
2928  /* if previously allocated heap space for args, free them */
2929  if ( realloc && team->t.t_argv != &team->t.t_inline_argv[0] )
2930  __kmp_free( (void *) team->t.t_argv );
2931 
2932  if ( argc <= KMP_INLINE_ARGV_ENTRIES ) {
2933  /* use unused space in the cache line for arguments */
2934  team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
2935  KA_TRACE( 100, ( "__kmp_alloc_argv_entries: team %d: inline allocate %d argv entries\n",
2936  team->t.t_id, team->t.t_max_argc ));
2937  team->t.t_argv = &team->t.t_inline_argv[0];
2938  if ( __kmp_storage_map ) {
2939  __kmp_print_storage_map_gtid( -1, &team->t.t_inline_argv[0],
2940  &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
2941  (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES),
2942  "team_%d.t_inline_argv",
2943  team->t.t_id );
2944  }
2945  } else {
2946  /* allocate space for arguments in the heap */
2947  team->t.t_max_argc = ( argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1 )) ?
2948  KMP_MIN_MALLOC_ARGV_ENTRIES : 2 * argc;
2949  KA_TRACE( 100, ( "__kmp_alloc_argv_entries: team %d: dynamic allocate %d argv entries\n",
2950  team->t.t_id, team->t.t_max_argc ));
2951  team->t.t_argv = (void**) __kmp_page_allocate( sizeof(void*) * team->t.t_max_argc );
2952  if ( __kmp_storage_map ) {
2953  __kmp_print_storage_map_gtid( -1, &team->t.t_argv[0], &team->t.t_argv[team->t.t_max_argc],
2954  sizeof(void *) * team->t.t_max_argc, "team_%d.t_argv",
2955  team->t.t_id );
2956  }
2957  }
2958  }
2959 }
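/* Sizing policy above: argument vectors of up to KMP_INLINE_ARGV_ENTRIES reuse
   spare space inside the team structure (no allocation at all); larger ones use
   page-allocated heap storage sized at least KMP_MIN_MALLOC_ARGV_ENTRIES and
   otherwise 2 * argc, so a team whose argc keeps growing reallocates rarely. */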
2960 
2961 static void
2962 __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth)
2963 {
2964  int i;
2965  int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
2966  team->t.t_threads = (kmp_info_t**) __kmp_allocate( sizeof(kmp_info_t*) * max_nth );
2967  team->t.t_disp_buffer = (dispatch_shared_info_t*)
2968  __kmp_allocate( sizeof(dispatch_shared_info_t) * num_disp_buff );
2969  team->t.t_dispatch = (kmp_disp_t*) __kmp_allocate( sizeof(kmp_disp_t) * max_nth );
2970  team->t.t_implicit_task_taskdata = (kmp_taskdata_t*) __kmp_allocate( sizeof(kmp_taskdata_t) * max_nth );
2971  team->t.t_max_nproc = max_nth;
2972 
2973  /* setup dispatch buffers */
2974  for(i = 0 ; i < num_disp_buff; ++i) {
2975  team->t.t_disp_buffer[i].buffer_index = i;
2976 #if OMP_45_ENABLED
2977  team->t.t_disp_buffer[i].doacross_buf_idx = i;
2978 #endif
2979  }
2980 }
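/* The team gets several dispatch buffers (__kmp_dispatch_num_buffers, cycled
   via buffer_index) so that threads can start the next dynamically scheduled
   worksharing construct before every thread is done with the previous one;
   a single-thread team only ever needs two. */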
2981 
2982 static void
2983 __kmp_free_team_arrays(kmp_team_t *team) {
2984  /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
2985  int i;
2986  for ( i = 0; i < team->t.t_max_nproc; ++ i ) {
2987  if ( team->t.t_dispatch[ i ].th_disp_buffer != NULL ) {
2988  __kmp_free( team->t.t_dispatch[ i ].th_disp_buffer );
2989  team->t.t_dispatch[ i ].th_disp_buffer = NULL;
2990  }; // if
2991  }; // for
2992  __kmp_free(team->t.t_threads);
2993  __kmp_free(team->t.t_disp_buffer);
2994  __kmp_free(team->t.t_dispatch);
2995  __kmp_free(team->t.t_implicit_task_taskdata);
2996  team->t.t_threads = NULL;
2997  team->t.t_disp_buffer = NULL;
2998  team->t.t_dispatch = NULL;
2999  team->t.t_implicit_task_taskdata = 0;
3000 }
3001 
3002 static void
3003 __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3004  kmp_info_t **oldThreads = team->t.t_threads;
3005 
3006  __kmp_free(team->t.t_disp_buffer);
3007  __kmp_free(team->t.t_dispatch);
3008  __kmp_free(team->t.t_implicit_task_taskdata);
3009  __kmp_allocate_team_arrays(team, max_nth);
3010 
3011  KMP_MEMCPY(team->t.t_threads, oldThreads, team->t.t_nproc * sizeof (kmp_info_t*));
3012 
3013  __kmp_free(oldThreads);
3014 }
3015 
3016 static kmp_internal_control_t
3017 __kmp_get_global_icvs( void ) {
3018 
3019  kmp_r_sched_t r_sched = __kmp_get_schedule_global(); // get current state of scheduling globals
3020 
3021 #if OMP_40_ENABLED
3022  KMP_DEBUG_ASSERT( __kmp_nested_proc_bind.used > 0 );
3023 #endif /* OMP_40_ENABLED */
3024 
3025  kmp_internal_control_t g_icvs = {
3026  0, //int serial_nesting_level; //corresponds to the value of the th_team_serialized field
3027  (kmp_int8)__kmp_dflt_nested, //int nested; //internal control for nested parallelism (per thread)
3028  (kmp_int8)__kmp_global.g.g_dynamic, //internal control for dynamic adjustment of threads (per thread)
3029  (kmp_int8)__kmp_env_blocktime, //int bt_set; //internal control for whether blocktime is explicitly set
3030  __kmp_dflt_blocktime, //int blocktime; //internal control for blocktime
3031  __kmp_bt_intervals, //int bt_intervals; //internal control for blocktime intervals
3032  __kmp_dflt_team_nth, //int nproc; //internal control for # of threads for next parallel region (per thread)
3033  // (use a max upper bound on the value if __kmp_parallel_initialize has not been called yet)
3034  __kmp_dflt_max_active_levels, //int max_active_levels; //internal control for max_active_levels
3035  r_sched, //kmp_r_sched_t sched; //internal control for runtime schedule {sched,chunk} pair
3036 #if OMP_40_ENABLED
3037  __kmp_nested_proc_bind.bind_types[0],
3038 #endif /* OMP_40_ENABLED */
3039  NULL //struct kmp_internal_control *next;
3040  };
3041 
3042  return g_icvs;
3043 }
3044 
3045 static kmp_internal_control_t
3046 __kmp_get_x_global_icvs( const kmp_team_t *team ) {
3047 
3048  kmp_internal_control_t gx_icvs;
3049  gx_icvs.serial_nesting_level = 0; // probably = team->t.t_serialized, as in __kmp_save_internal_controls()
3050  copy_icvs( & gx_icvs, & team->t.t_threads[0]->th.th_current_task->td_icvs );
3051  gx_icvs.next = NULL;
3052 
3053  return gx_icvs;
3054 }
3055 
3056 static void
3057 __kmp_initialize_root( kmp_root_t *root )
3058 {
3059  int f;
3060  kmp_team_t *root_team;
3061  kmp_team_t *hot_team;
3062  int hot_team_max_nth;
3063  kmp_r_sched_t r_sched = __kmp_get_schedule_global(); // get current state of scheduling globals
3064  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3065  KMP_DEBUG_ASSERT( root );
3066  KMP_ASSERT( ! root->r.r_begin );
3067 
3068  /* setup the root state structure */
3069  __kmp_init_lock( &root->r.r_begin_lock );
3070  root->r.r_begin = FALSE;
3071  root->r.r_active = FALSE;
3072  root->r.r_in_parallel = 0;
3073  root->r.r_blocktime = __kmp_dflt_blocktime;
3074  root->r.r_nested = __kmp_dflt_nested;
3075 
3076  /* setup the root team for this task */
3077  /* allocate the root team structure */
3078  KF_TRACE( 10, ( "__kmp_initialize_root: before root_team\n" ) );
3079 
3080  root_team =
3081  __kmp_allocate_team(
3082  root,
3083  1, // new_nproc
3084  1, // max_nproc
3085 #if OMPT_SUPPORT
3086  0, // root parallel id
3087 #endif
3088 #if OMP_40_ENABLED
3089  __kmp_nested_proc_bind.bind_types[0],
3090 #endif
3091  &r_icvs,
3092  0 // argc
3093  USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3094  );
3095 #if USE_DEBUGGER
3096  // Non-NULL value should be assigned to make the debugger display the root team.
3097  TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)( ~ 0 ));
3098 #endif
3099 
3100  KF_TRACE( 10, ( "__kmp_initialize_root: after root_team = %p\n", root_team ) );
3101 
3102  root->r.r_root_team = root_team;
3103  root_team->t.t_control_stack_top = NULL;
3104 
3105  /* initialize root team */
3106  root_team->t.t_threads[0] = NULL;
3107  root_team->t.t_nproc = 1;
3108  root_team->t.t_serialized = 1;
3109  // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3110  root_team->t.t_sched.r_sched_type = r_sched.r_sched_type;
3111  root_team->t.t_sched.chunk = r_sched.chunk;
3112  KA_TRACE( 20, ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3113  root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE ));
3114 
3115  /* setup the hot team for this task */
3116  /* allocate the hot team structure */
3117  KF_TRACE( 10, ( "__kmp_initialize_root: before hot_team\n" ) );
3118 
3119  hot_team =
3120  __kmp_allocate_team(
3121  root,
3122  1, // new_nproc
3123  __kmp_dflt_team_nth_ub * 2, // max_nproc
3124 #if OMPT_SUPPORT
3125  0, // root parallel id
3126 #endif
3127 #if OMP_40_ENABLED
3128  __kmp_nested_proc_bind.bind_types[0],
3129 #endif
3130  &r_icvs,
3131  0 // argc
3132  USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3133  );
3134  KF_TRACE( 10, ( "__kmp_initialize_root: after hot_team = %p\n", hot_team ) );
3135 
3136  root->r.r_hot_team = hot_team;
3137  root_team->t.t_control_stack_top = NULL;
3138 
3139  /* first-time initialization */
3140  hot_team->t.t_parent = root_team;
3141 
3142  /* initialize hot team */
3143  hot_team_max_nth = hot_team->t.t_max_nproc;
3144  for ( f = 0; f < hot_team_max_nth; ++ f ) {
3145  hot_team->t.t_threads[ f ] = NULL;
3146  }; // for
3147  hot_team->t.t_nproc = 1;
3148  // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3149  hot_team->t.t_sched.r_sched_type = r_sched.r_sched_type;
3150  hot_team->t.t_sched.chunk = r_sched.chunk;
3151  hot_team->t.t_size_changed = 0;
3152 }
3153 
3154 #ifdef KMP_DEBUG
3155 
3156 
3157 typedef struct kmp_team_list_item {
3158  kmp_team_p const * entry;
3159  struct kmp_team_list_item * next;
3160 } kmp_team_list_item_t;
3161 typedef kmp_team_list_item_t * kmp_team_list_t;
3162 
3163 
3164 static void
3165 __kmp_print_structure_team_accum( // Add team to list of teams.
3166  kmp_team_list_t list, // List of teams.
3167  kmp_team_p const * team // Team to add.
3168 ) {
3169 
3170  // List must terminate with item where both entry and next are NULL.
3171  // Team is added to the list only once.
3172  // List is sorted in ascending order by team id.
3173  // Team id is *not* a key.
3174 
3175  kmp_team_list_t l;
3176 
3177  KMP_DEBUG_ASSERT( list != NULL );
3178  if ( team == NULL ) {
3179  return;
3180  }; // if
3181 
3182  __kmp_print_structure_team_accum( list, team->t.t_parent );
3183  __kmp_print_structure_team_accum( list, team->t.t_next_pool );
3184 
3185  // Search list for the team.
3186  l = list;
3187  while ( l->next != NULL && l->entry != team ) {
3188  l = l->next;
3189  }; // while
3190  if ( l->next != NULL ) {
3191  return; // Team has been added before, exit.
3192  }; // if
3193 
3194  // Team is not found. Search list again for insertion point.
3195  l = list;
3196  while ( l->next != NULL && l->entry->t.t_id <= team->t.t_id ) {
3197  l = l->next;
3198  }; // while
3199 
3200  // Insert team.
3201  {
3202  kmp_team_list_item_t * item =
3203  (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC( sizeof( kmp_team_list_item_t ) );
3204  * item = * l;
3205  l->entry = team;
3206  l->next = item;
3207  }
3208 
3209 }
3210 
3211 static void
3212 __kmp_print_structure_team(
3213  char const * title,
3214  kmp_team_p const * team
3215 
3216 ) {
3217  __kmp_printf( "%s", title );
3218  if ( team != NULL ) {
3219  __kmp_printf( "%2x %p\n", team->t.t_id, team );
3220  } else {
3221  __kmp_printf( " - (nil)\n" );
3222  }; // if
3223 }
3224 
3225 static void
3226 __kmp_print_structure_thread(
3227  char const * title,
3228  kmp_info_p const * thread
3229 
3230 ) {
3231  __kmp_printf( "%s", title );
3232  if ( thread != NULL ) {
3233  __kmp_printf( "%2d %p\n", thread->th.th_info.ds.ds_gtid, thread );
3234  } else {
3235  __kmp_printf( " - (nil)\n" );
3236  }; // if
3237 }
3238 
3239 void
3240 __kmp_print_structure(
3241  void
3242 ) {
3243 
3244  kmp_team_list_t list;
3245 
3246  // Initialize list of teams.
3247  list = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC( sizeof( kmp_team_list_item_t ) );
3248  list->entry = NULL;
3249  list->next = NULL;
3250 
3251  __kmp_printf( "\n------------------------------\nGlobal Thread Table\n------------------------------\n" );
3252  {
3253  int gtid;
3254  for ( gtid = 0; gtid < __kmp_threads_capacity; ++ gtid ) {
3255  __kmp_printf( "%2d", gtid );
3256  if ( __kmp_threads != NULL ) {
3257  __kmp_printf( " %p", __kmp_threads[ gtid ] );
3258  }; // if
3259  if ( __kmp_root != NULL ) {
3260  __kmp_printf( " %p", __kmp_root[ gtid ] );
3261  }; // if
3262  __kmp_printf( "\n" );
3263  }; // for gtid
3264  }
3265 
3266  // Print out __kmp_threads array.
3267  __kmp_printf( "\n------------------------------\nThreads\n------------------------------\n" );
3268  if ( __kmp_threads != NULL ) {
3269  int gtid;
3270  for ( gtid = 0; gtid < __kmp_threads_capacity; ++ gtid ) {
3271  kmp_info_t const * thread = __kmp_threads[ gtid ];
3272  if ( thread != NULL ) {
3273  __kmp_printf( "GTID %2d %p:\n", gtid, thread );
3274  __kmp_printf( " Our Root: %p\n", thread->th.th_root );
3275  __kmp_print_structure_team( " Our Team: ", thread->th.th_team );
3276  __kmp_print_structure_team( " Serial Team: ", thread->th.th_serial_team );
3277  __kmp_printf( " Threads: %2d\n", thread->th.th_team_nproc );
3278  __kmp_print_structure_thread( " Master: ", thread->th.th_team_master );
3279  __kmp_printf( " Serialized?: %2d\n", thread->th.th_team_serialized );
3280  __kmp_printf( " Set NProc: %2d\n", thread->th.th_set_nproc );
3281 #if OMP_40_ENABLED
3282  __kmp_printf( " Set Proc Bind: %2d\n", thread->th.th_set_proc_bind );
3283 #endif
3284  __kmp_print_structure_thread( " Next in pool: ", thread->th.th_next_pool );
3285  __kmp_printf( "\n" );
3286  __kmp_print_structure_team_accum( list, thread->th.th_team );
3287  __kmp_print_structure_team_accum( list, thread->th.th_serial_team );
3288  }; // if
3289  }; // for gtid
3290  } else {
3291  __kmp_printf( "Threads array is not allocated.\n" );
3292  }; // if
3293 
3294  // Print out __kmp_root array.
3295  __kmp_printf( "\n------------------------------\nUbers\n------------------------------\n" );
3296  if ( __kmp_root != NULL ) {
3297  int gtid;
3298  for ( gtid = 0; gtid < __kmp_threads_capacity; ++ gtid ) {
3299  kmp_root_t const * root = __kmp_root[ gtid ];
3300  if ( root != NULL ) {
3301  __kmp_printf( "GTID %2d %p:\n", gtid, root );
3302  __kmp_print_structure_team( " Root Team: ", root->r.r_root_team );
3303  __kmp_print_structure_team( " Hot Team: ", root->r.r_hot_team );
3304  __kmp_print_structure_thread( " Uber Thread: ", root->r.r_uber_thread );
3305  __kmp_printf( " Active?: %2d\n", root->r.r_active );
3306  __kmp_printf( " Nested?: %2d\n", root->r.r_nested );
3307  __kmp_printf( " In Parallel: %2d\n", root->r.r_in_parallel );
3308  __kmp_printf( "\n" );
3309  __kmp_print_structure_team_accum( list, root->r.r_root_team );
3310  __kmp_print_structure_team_accum( list, root->r.r_hot_team );
3311  }; // if
3312  }; // for gtid
3313  } else {
3314  __kmp_printf( "Ubers array is not allocated.\n" );
3315  }; // if
3316 
3317  __kmp_printf( "\n------------------------------\nTeams\n------------------------------\n" );
3318  while ( list->next != NULL ) {
3319  kmp_team_p const * team = list->entry;
3320  int i;
3321  __kmp_printf( "Team %2x %p:\n", team->t.t_id, team );
3322  __kmp_print_structure_team( " Parent Team: ", team->t.t_parent );
3323  __kmp_printf( " Master TID: %2d\n", team->t.t_master_tid );
3324  __kmp_printf( " Max threads: %2d\n", team->t.t_max_nproc );
3325  __kmp_printf( " Levels of serial: %2d\n", team->t.t_serialized );
3326  __kmp_printf( " Number threads: %2d\n", team->t.t_nproc );
3327  for ( i = 0; i < team->t.t_nproc; ++ i ) {
3328  __kmp_printf( " Thread %2d: ", i );
3329  __kmp_print_structure_thread( "", team->t.t_threads[ i ] );
3330  }; // for i
3331  __kmp_print_structure_team( " Next in pool: ", team->t.t_next_pool );
3332  __kmp_printf( "\n" );
3333  list = list->next;
3334  }; // while
3335 
3336  // Print out __kmp_thread_pool and __kmp_team_pool.
3337  __kmp_printf( "\n------------------------------\nPools\n------------------------------\n" );
3338  __kmp_print_structure_thread( "Thread pool: ", (kmp_info_t *)__kmp_thread_pool );
3339  __kmp_print_structure_team( "Team pool: ", (kmp_team_t *)__kmp_team_pool );
3340  __kmp_printf( "\n" );
3341 
3342  // Free team list.
3343  while ( list != NULL ) {
3344  kmp_team_list_item_t * item = list;
3345  list = list->next;
3346  KMP_INTERNAL_FREE( item );
3347  }; // while
3348 
3349 }
3350 
3351 #endif
3352 
3353 
3354 //---------------------------------------------------------------------------
3355 // Stuff for per-thread fast random number generator
3356 // Table of primes
3357 
3358 static const unsigned __kmp_primes[] = {
3359  0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5,
3360  0xba5703f5, 0xb495a877, 0xe1626741, 0x79695e6b,
3361  0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3362  0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b,
3363  0xbe4d6fe9, 0x5f15e201, 0x99afc3fd, 0xf3f16801,
3364  0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3365  0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed,
3366  0x085a3d61, 0x46eb5ea7, 0x3d9910ed, 0x2e687b5b,
3367  0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3368  0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7,
3369  0x54581edb, 0xf2480f45, 0x0bb9288f, 0xef1affc7,
3370  0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3371  0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b,
3372  0xfc411073, 0xc3749363, 0xb892d829, 0x3549366b,
3373  0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3374  0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f
3375 };
3376 
3377 //---------------------------------------------------------------------------
3378 // __kmp_get_random: Get a random number using a linear congruential method.
3379 
3380 unsigned short
3381 __kmp_get_random( kmp_info_t * thread )
3382 {
3383  unsigned x = thread->th.th_x;
3384  unsigned short r = x>>16;
3385 
3386  thread->th.th_x = x*thread->th.th_a+1;
3387 
3388  KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3389  thread->th.th_info.ds.ds_tid, r) );
3390 
3391  return r;
3392 }
3393 //--------------------------------------------------------
3394 // __kmp_init_random: Initialize a random number generator
3395 
3396 void
3397 __kmp_init_random( kmp_info_t * thread )
3398 {
3399  unsigned seed = thread->th.th_info.ds.ds_tid;
3400 
3401  thread->th.th_a = __kmp_primes[seed%(sizeof(__kmp_primes)/sizeof(__kmp_primes[0]))];
3402  thread->th.th_x = (seed+1)*thread->th.th_a+1;
3403  KA_TRACE(30, ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a) );
3404 }
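//--------------------------------------------------------------------------
// Taken together, the two routines above implement a per-thread linear
// congruential generator: th_a is a per-thread multiplier drawn from
// __kmp_primes, th_x is the state, the update is x = a*x + 1 with unsigned
// wrap-around, and the better-distributed high 16 bits are returned. A
// standalone sketch of the same recurrence (illustrative only, kept out of
// the build with #if 0; the lcg_* names are hypothetical):
#if 0
#include <stdio.h>

static unsigned lcg_a;  /* multiplier, e.g. __kmp_primes[seed % ...] */
static unsigned lcg_x;  /* generator state                           */

static unsigned short lcg_next(void) {
    unsigned short r = (unsigned short)(lcg_x >> 16); /* return high bits */
    lcg_x = lcg_x * lcg_a + 1;                        /* same update as __kmp_get_random */
    return r;
}

int main(void) {
    unsigned seed = 0;              /* e.g. a thread id, as in __kmp_init_random */
    lcg_a = 0x9e3779b1;             /* __kmp_primes[0]                           */
    lcg_x = (seed + 1) * lcg_a + 1; /* same seeding as __kmp_init_random         */
    for (int i = 0; i < 4; ++i)
        printf("%u\n", (unsigned)lcg_next());
    return 0;
}
#endif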
3405 
3406 
3407 #if KMP_OS_WINDOWS
3408 /* reclaim array entries for root threads that are already dead, returns number reclaimed */
3409 static int
3410 __kmp_reclaim_dead_roots(void) {
3411  int i, r = 0;
3412 
3413  for(i = 0; i < __kmp_threads_capacity; ++i) {
3414  if( KMP_UBER_GTID( i ) &&
3415  !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3416  !__kmp_root[i]->r.r_active ) { // AC: reclaim only roots that died in a non-active state
3417  r += __kmp_unregister_root_other_thread(i);
3418  }
3419  }
3420  return r;
3421 }
3422 #endif
3423 
3424 /*
3425  This function attempts to create free entries in __kmp_threads and __kmp_root, and returns the number of
3426  free entries generated.
3427 
3428  For Windows* OS static library, the first mechanism used is to reclaim array entries for root threads that are
3429  already dead.
3430 
3431  On all platforms, expansion is attempted on the arrays __kmp_threads_ and __kmp_root, with appropriate
3432  On all platforms, expansion is attempted on the arrays __kmp_threads and __kmp_root, with appropriate
3433  __kmp_tp_capacity, if threadprivate cache array has been created.
3434  Synchronization with __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3435 
3436  After any dead root reclamation, if the clipping value allows array expansion to result in the generation
3437  of a total of nWish free slots, the function does that expansion. If not, but the clipping value allows
3438  array expansion to result in the generation of a total of nNeed free slots, the function does that expansion.
3439  Otherwise, nothing is done beyond the possible initial root thread reclamation. However, if nNeed is zero,
3440  a best-effort attempt is made to fulfil nWish as far as possible, i.e. the function will attempt to create
3441  as many free slots as possible up to nWish.
3442 
3443  If any argument is negative, the behavior is undefined.
3444 */
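/*
    The expansion loop below grows the capacity by repeated doubling, clipping at
    the effective maximum (__kmp_tp_capacity when a threadprivate cache exists,
    otherwise __kmp_sys_max_nth). A standalone sketch of that growth policy
    (illustrative only, kept out of the build with #if 0; compute_new_capacity is
    a hypothetical helper, not part of the runtime; callers must guarantee
    required <= clip_max, as the headroom check below does):
*/
#if 0
static int compute_new_capacity(int current, int required, int clip_max) {
    int cap = current;
    do {
        /* double while that stays within the limit, otherwise clip to the limit */
        cap = (cap <= (clip_max >> 1)) ? (cap << 1) : clip_max;
    } while (cap < required);
    return cap;   /* e.g. current = 4, required = 9, clip_max = 64  ->  16 */
}
#endif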
3445 static int
3446 __kmp_expand_threads(int nWish, int nNeed) {
3447  int added = 0;
3448  int old_tp_cached;
3449  int __kmp_actual_max_nth;
3450 
3451  if(nNeed > nWish) /* normalize the arguments */
3452  nWish = nNeed;
3453 #if KMP_OS_WINDOWS && !defined KMP_DYNAMIC_LIB
3454 /* only for Windows static library */
3455  /* reclaim array entries for root threads that are already dead */
3456  added = __kmp_reclaim_dead_roots();
3457 
3458  if(nNeed) {
3459  nNeed -= added;
3460  if(nNeed < 0)
3461  nNeed = 0;
3462  }
3463  if(nWish) {
3464  nWish -= added;
3465  if(nWish < 0)
3466  nWish = 0;
3467  }
3468 #endif
3469  if(nWish <= 0)
3470  return added;
3471 
3472  while(1) {
3473  int nTarget;
3474  int minimumRequiredCapacity;
3475  int newCapacity;
3476  kmp_info_t **newThreads;
3477  kmp_root_t **newRoot;
3478 
3479  //
3480  // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth.
3481  // If __kmp_max_nth is set to some value less than __kmp_sys_max_nth
3482  // by the user via OMP_THREAD_LIMIT, then __kmp_threads_capacity may
3483  // become > __kmp_max_nth in one of two ways:
3484  //
3485  // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0]
3486  // may not be reused by another thread, so we may need to increase
3487  // __kmp_threads_capacity to __kmp_max_threads + 1.
3488  //
3489  // 2) New foreign root(s) are encountered. We always register new
3490  // foreign roots. This may cause a smaller # of threads to be
3491  // allocated at subsequent parallel regions, but the worker threads
3492  // hang around (and eventually go to sleep) and need slots in the
3493  // __kmp_threads[] array.
3494  //
3495  // Anyway, that is the reason for moving the check to see if
3496  // __kmp_max_threads was exceeded into __kmp_reserve_threads()
3497  // instead of having it performed here. -BB
3498  //
3499  old_tp_cached = __kmp_tp_cached;
3500  __kmp_actual_max_nth = old_tp_cached ? __kmp_tp_capacity : __kmp_sys_max_nth;
3501  KMP_DEBUG_ASSERT(__kmp_actual_max_nth >= __kmp_threads_capacity);
3502 
3503  /* compute expansion headroom to check if we can expand and whether to aim for nWish or nNeed */
3504  nTarget = nWish;
3505  if(__kmp_actual_max_nth - __kmp_threads_capacity < nTarget) {
3506  /* can't fulfil nWish, so try nNeed */
3507  if(nNeed) {
3508  nTarget = nNeed;
3509  if(__kmp_actual_max_nth - __kmp_threads_capacity < nTarget) {
3510  /* possible expansion too small -- give up */
3511  break;
3512  }
3513  } else {
3514  /* best-effort */
3515  nTarget = __kmp_actual_max_nth - __kmp_threads_capacity;
3516  if(!nTarget) {
3517  /* can't expand at all -- give up */
3518  break;
3519  }
3520  }
3521  }
3522  minimumRequiredCapacity = __kmp_threads_capacity + nTarget;
3523 
3524  newCapacity = __kmp_threads_capacity;
3525  do{
3526  newCapacity =
3527  newCapacity <= (__kmp_actual_max_nth >> 1) ?
3528  (newCapacity << 1) :
3529  __kmp_actual_max_nth;
3530  } while(newCapacity < minimumRequiredCapacity);
3531  newThreads = (kmp_info_t**) __kmp_allocate((sizeof(kmp_info_t*) + sizeof(kmp_root_t*)) * newCapacity + CACHE_LINE);
3532  newRoot = (kmp_root_t**) ((char*)newThreads + sizeof(kmp_info_t*) * newCapacity );
3533  KMP_MEMCPY(newThreads, __kmp_threads, __kmp_threads_capacity * sizeof(kmp_info_t*));
3534  KMP_MEMCPY(newRoot, __kmp_root, __kmp_threads_capacity * sizeof(kmp_root_t*));
3535  memset(newThreads + __kmp_threads_capacity, 0,
3536  (newCapacity - __kmp_threads_capacity) * sizeof(kmp_info_t*));
3537  memset(newRoot + __kmp_threads_capacity, 0,
3538  (newCapacity - __kmp_threads_capacity) * sizeof(kmp_root_t*));
3539 
3540  if(!old_tp_cached && __kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3541  /* __kmp_tp_cached has changed, i.e. __kmpc_threadprivate_cached has allocated a threadprivate cache
3542  while we were allocating the expanded array, and our new capacity is larger than the threadprivate
3543  cache capacity, so we should deallocate the expanded arrays and try again. This is the first check
3544  of a double-check pair (see the sketch after this function).
3545  */
3546  __kmp_free(newThreads);
3547  continue; /* start over and try again */
3548  }
3549  __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3550  if(!old_tp_cached && __kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3551  /* Same check as above, but this time with the lock so we can be sure if we can succeed. */
3552  __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3553  __kmp_free(newThreads);
3554  continue; /* start over and try again */
3555  } else {
3556  /* success */
3557  // __kmp_free( __kmp_threads ); // ATT: It leads to a crash. Needs to be investigated.
3558  //
3559  *(kmp_info_t**volatile*)&__kmp_threads = newThreads;
3560  *(kmp_root_t**volatile*)&__kmp_root = newRoot;
3561  added += newCapacity - __kmp_threads_capacity;
3562  *(volatile int*)&__kmp_threads_capacity = newCapacity;
3563  __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3564  break; /* succeeded, so we can exit the loop */
3565  }
3566  }
3567  return added;
3568 }
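/*
    Note the allocate / double-check / publish shape above: the enlarged arrays are
    built without holding __kmp_tp_cached_lock, __kmp_tp_cached is tested once
    before taking the lock (cheap early bail-out) and once after (authoritative),
    and on conflict the new arrays are freed and the loop retries. A stripped-down
    sketch of that pattern (illustrative only, kept out of the build with #if 0;
    every name here is hypothetical):
*/
#if 0
extern int   conflict_detected(void);          /* stands in for the __kmp_tp_cached test */
extern void  take_lock(void), drop_lock(void);
extern void *build_candidate(void);            /* expensive work, done unlocked          */
extern void  publish(void *), discard(void *);

static void grow_with_double_check(void) {
    for (;;) {
        void *candidate = build_candidate();
        if (conflict_detected()) {             /* first check: avoid the lock            */
            discard(candidate);
            continue;
        }
        take_lock();
        if (conflict_detected()) {             /* second check: now authoritative        */
            drop_lock();
            discard(candidate);
            continue;
        }
        publish(candidate);                    /* install while still holding the lock   */
        drop_lock();
        return;
    }
}
#endif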
3569 
3570 /* register the current thread as a root thread and obtain our gtid */
3571 /* we must have the __kmp_initz_lock held at this point */
3572 /* Argument TRUE only if we are the thread that calls from __kmp_do_serial_initialize() */
3573 int
3574 __kmp_register_root( int initial_thread )
3575 {
3576  kmp_info_t *root_thread;
3577  kmp_root_t *root;
3578  int gtid;
3579  int capacity;
3580  __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
3581  KA_TRACE( 20, ("__kmp_register_root: entered\n"));
3582  KMP_MB();
3583 
3584 
3585  /*
3586  2007-03-02:
3587 
3588  If the initial thread has not invoked the OpenMP RTL yet, and this thread is not an initial one,
3589  the "__kmp_all_nth >= __kmp_threads_capacity" condition does not work as expected -- it may
3590  return false (meaning there is at least one empty slot in the __kmp_threads array), but it
3591  is possible that the only free slot is #0, which is reserved for the initial thread and so cannot be
3592  used for this one. The following code works around this bug.
3593 
3594  However, the right solution seems to be not reserving slot #0 for the initial thread, because:
3595  (1) there is no magic in slot #0, and
3596  (2) we cannot detect the initial thread reliably (the first thread that performs serial
3597  initialization may not be the real initial thread).
3598  */
3599  capacity = __kmp_threads_capacity;
3600  if ( ! initial_thread && TCR_PTR(__kmp_threads[0]) == NULL ) {
3601  -- capacity;
3602  }; // if
3603 
3604  /* see if there are too many threads */
3605  if ( __kmp_all_nth >= capacity && !__kmp_expand_threads( 1, 1 ) ) {
3606  if ( __kmp_tp_cached ) {
3607  __kmp_msg(
3608  kmp_ms_fatal,
3609  KMP_MSG( CantRegisterNewThread ),
3610  KMP_HNT( Set_ALL_THREADPRIVATE, __kmp_tp_capacity ),
3611  KMP_HNT( PossibleSystemLimitOnThreads ),
3612  __kmp_msg_null
3613  );
3614  }
3615  else {
3616  __kmp_msg(
3617  kmp_ms_fatal,
3618  KMP_MSG( CantRegisterNewThread ),
3619  KMP_HNT( SystemLimitOnThreads ),
3620  __kmp_msg_null
3621  );
3622  }
3623  }; // if
3624 
3625  /* find an available thread slot */
3626  /* Don't reassign the zero slot since we need that to only be used by initial
3627  thread */
3628  for( gtid=(initial_thread ? 0 : 1) ; TCR_PTR(__kmp_threads[gtid]) != NULL ; gtid++ )
3629  ;
3630  KA_TRACE( 1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid ));
3631  KMP_ASSERT( gtid < __kmp_threads_capacity );
3632 
3633  /* update global accounting */
3634  __kmp_all_nth ++;
3635  TCW_4(__kmp_nth, __kmp_nth + 1);
3636 
3637  //
3638  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search)
3639  // for low numbers of procs, and method #2 (keyed API call) for higher
3640  // numbers of procs.
3641  //
3642  if ( __kmp_adjust_gtid_mode ) {
3643  if ( __kmp_all_nth >= __kmp_tls_gtid_min ) {
3644  if ( TCR_4(__kmp_gtid_mode) != 2) {
3645  TCW_4(__kmp_gtid_mode, 2);
3646  }
3647  }
3648  else {
3649  if (TCR_4(__kmp_gtid_mode) != 1 ) {
3650  TCW_4(__kmp_gtid_mode, 1);
3651  }
3652  }
3653  }
3654 
3655 #ifdef KMP_ADJUST_BLOCKTIME
3656  /* Adjust blocktime to zero if necessary */
3657  /* Middle initialization might not have occurred yet */
3658  if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
3659  if ( __kmp_nth > __kmp_avail_proc ) {
3660  __kmp_zero_bt = TRUE;
3661  }
3662  }
3663 #endif /* KMP_ADJUST_BLOCKTIME */
3664 
3665  /* setup this new hierarchy */
3666  if( ! ( root = __kmp_root[gtid] )) {
3667  root = __kmp_root[gtid] = (kmp_root_t*) __kmp_allocate( sizeof(kmp_root_t) );
3668  KMP_DEBUG_ASSERT( ! root->r.r_root_team );
3669  }
3670 
3671  __kmp_initialize_root( root );
3672 
3673  /* setup new root thread structure */
3674  if( root->r.r_uber_thread ) {
3675  root_thread = root->r.r_uber_thread;
3676  } else {
3677  root_thread = (kmp_info_t*) __kmp_allocate( sizeof(kmp_info_t) );
3678  if ( __kmp_storage_map ) {
3679  __kmp_print_thread_storage_map( root_thread, gtid );
3680  }
3681  root_thread->th.th_info .ds.ds_gtid = gtid;
3682  root_thread->th.th_root = root;
3683  if( __kmp_env_consistency_check ) {
3684  root_thread->th.th_cons = __kmp_allocate_cons_stack( gtid );
3685  }
3686  #if USE_FAST_MEMORY
3687  __kmp_initialize_fast_memory( root_thread );
3688  #endif /* USE_FAST_MEMORY */
3689 
3690  #if KMP_USE_BGET
3691  KMP_DEBUG_ASSERT( root_thread->th.th_local.bget_data == NULL );
3692  __kmp_initialize_bget( root_thread );
3693  #endif
3694  __kmp_init_random( root_thread ); // Initialize random number generator
3695  }
3696 
3697  /* setup the serial team held in reserve by the root thread */
3698  if( ! root_thread->th.th_serial_team ) {
3699  kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3700  KF_TRACE( 10, ( "__kmp_register_root: before serial_team\n" ) );
3701 
3702  root_thread->th.th_serial_team = __kmp_allocate_team( root, 1, 1,
3703 #if OMPT_SUPPORT
3704  0, // root parallel id
3705 #endif
3706 #if OMP_40_ENABLED
3707  proc_bind_default,
3708 #endif
3709  &r_icvs,
3710  0 USE_NESTED_HOT_ARG(NULL) );
3711  }
3712  KMP_ASSERT( root_thread->th.th_serial_team );
3713  KF_TRACE( 10, ( "__kmp_register_root: after serial_team = %p\n",
3714  root_thread->th.th_serial_team ) );
3715 
3716  /* drop root_thread into place */
3717  TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3718 
3719  root->r.r_root_team->t.t_threads[0] = root_thread;
3720  root->r.r_hot_team ->t.t_threads[0] = root_thread;
3721  root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3722  root_thread->th.th_serial_team->t.t_serialized = 0; // AC: the team is created in reserve, not for execution (it is unused for now).
3723  root->r.r_uber_thread = root_thread;
3724 
3725  /* initialize the thread, get it ready to go */
3726  __kmp_initialize_info( root_thread, root->r.r_root_team, 0, gtid );
3727  TCW_4(__kmp_init_gtid, TRUE);
3728 
3729  /* prepare the master thread for get_gtid() */
3730  __kmp_gtid_set_specific( gtid );
3731 
3732 #if USE_ITT_BUILD
3733  __kmp_itt_thread_name( gtid );
3734 #endif /* USE_ITT_BUILD */
3735 
3736  #ifdef KMP_TDATA_GTID
3737  __kmp_gtid = gtid;
3738  #endif
3739  __kmp_create_worker( gtid, root_thread, __kmp_stksize );
3740  KMP_DEBUG_ASSERT( __kmp_gtid_get_specific() == gtid );
3741 
3742  KA_TRACE( 20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, plain=%u\n",
3743  gtid, __kmp_gtid_from_tid( 0, root->r.r_hot_team ),
3744  root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3745  KMP_INIT_BARRIER_STATE ) );
3746  { // Initialize barrier data.
3747  int b;
3748  for ( b = 0; b < bs_last_barrier; ++ b ) {
3749  root_thread->th.th_bar[ b ].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3750 #if USE_DEBUGGER
3751  root_thread->th.th_bar[ b ].bb.b_worker_arrived = 0;
3752 #endif
3753  }; // for
3754  }
3755  KMP_DEBUG_ASSERT( root->r.r_hot_team->t.t_bar[ bs_forkjoin_barrier ].b_arrived == KMP_INIT_BARRIER_STATE );
3756 
3757 #if KMP_AFFINITY_SUPPORTED
3758 # if OMP_40_ENABLED
3759  root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3760  root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3761  root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3762  root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3763 # endif
3764 
3765  if ( TCR_4(__kmp_init_middle) ) {
3766  __kmp_affinity_set_init_mask( gtid, TRUE );
3767  }
3768 #endif /* KMP_AFFINITY_SUPPORTED */
3769 
3770  __kmp_root_counter ++;
3771 
3772  KMP_MB();
3773  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
3774 
3775  return gtid;
3776 }
3777 
3778 #if KMP_NESTED_HOT_TEAMS
3779 static int
3780 __kmp_free_hot_teams( kmp_root_t *root, kmp_info_t *thr, int level, const int max_level )
3781 {
3782  int i, n, nth;
3783  kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3784  if( !hot_teams || !hot_teams[level].hot_team ) {
3785  return 0;
3786  }
3787  KMP_DEBUG_ASSERT( level < max_level );
3788  kmp_team_t *team = hot_teams[level].hot_team;
3789  nth = hot_teams[level].hot_team_nth;
3790  n = nth - 1; // master is not freed
3791  if( level < max_level - 1 ) {
3792  for( i = 0; i < nth; ++i ) {
3793  kmp_info_t *th = team->t.t_threads[i];
3794  n += __kmp_free_hot_teams( root, th, level + 1, max_level );
3795  if( i > 0 && th->th.th_hot_teams ) {
3796  __kmp_free( th->th.th_hot_teams );
3797  th->th.th_hot_teams = NULL;
3798  }
3799  }
3800  }
3801  __kmp_free_team( root, team, NULL );
3802  return n;
3803 }
3804 #endif
3805 
3806 /* Resets a root thread and clears its root and hot teams.
3807  Returns the number of __kmp_threads entries directly and indirectly freed.
3808 */
3809 static int
3810 __kmp_reset_root(int gtid, kmp_root_t *root)
3811 {
3812  kmp_team_t * root_team = root->r.r_root_team;
3813  kmp_team_t * hot_team = root->r.r_hot_team;
3814  int n = hot_team->t.t_nproc;
3815  int i;
3816 
3817  KMP_DEBUG_ASSERT( ! root->r.r_active );
3818 
3819  root->r.r_root_team = NULL;
3820  root->r.r_hot_team = NULL;
3821  // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team before call
3822  // to __kmp_free_team().
3823  __kmp_free_team( root, root_team USE_NESTED_HOT_ARG(NULL) );
3824 #if KMP_NESTED_HOT_TEAMS
3825  if( __kmp_hot_teams_max_level > 0 ) { // need to free nested hot teams and their threads if any
3826  for( i = 0; i < hot_team->t.t_nproc; ++i ) {
3827  kmp_info_t *th = hot_team->t.t_threads[i];
3828  if( __kmp_hot_teams_max_level > 1 ) {
3829  n += __kmp_free_hot_teams( root, th, 1, __kmp_hot_teams_max_level );
3830  }
3831  if( th->th.th_hot_teams ) {
3832  __kmp_free( th->th.th_hot_teams );
3833  th->th.th_hot_teams = NULL;
3834  }
3835  }
3836  }
3837 #endif
3838  __kmp_free_team( root, hot_team USE_NESTED_HOT_ARG(NULL) );
3839 
3840  //
3841  // Before we can reap the thread, we need to make certain that all
3842  // other threads in the teams that had this root as ancestor have stopped trying to steal tasks.
3843  //
3844  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
3845  __kmp_wait_to_unref_task_teams();
3846  }
3847 
3848  #if KMP_OS_WINDOWS
3849  /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
3850  KA_TRACE( 10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC "\n",
3851  (LPVOID)&(root->r.r_uber_thread->th),
3852  root->r.r_uber_thread->th.th_info.ds.ds_thread ) );
3853  __kmp_free_handle( root->r.r_uber_thread->th.th_info.ds.ds_thread );
3854  #endif /* KMP_OS_WINDOWS */
3855 
3856 #if OMPT_SUPPORT
3857  if (ompt_enabled &&
3858  ompt_callbacks.ompt_callback(ompt_event_thread_end)) {
3859  int gtid = __kmp_get_gtid();
3860  __ompt_thread_end(ompt_thread_initial, gtid);
3861  }
3862 #endif
3863 
3864  TCW_4(__kmp_nth, __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
3865  __kmp_reap_thread( root->r.r_uber_thread, 1 );
3866 
3867  // We cannot put the root thread into __kmp_thread_pool, so we have to reap it instead of freeing it.
3868  root->r.r_uber_thread = NULL;
3869  /* mark root as no longer in use */
3870  root->r.r_begin = FALSE;
3871 
3872  return n;
3873 }
3874 
3875 void
3876 __kmp_unregister_root_current_thread( int gtid )
3877 {
3878  KA_TRACE( 1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid ));
3879  /* this lock should be ok, since unregister_root_current_thread is never called during
3880  * an abort, only during a normal close. furthermore, if you have the
3881  * forkjoin lock, you should never try to get the initz lock */
3882 
3883  __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
3884  if( TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial ) {
3885  KC_TRACE( 10, ("__kmp_unregister_root_current_thread: already finished, exiting T#%d\n", gtid ));
3886  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
3887  return;
3888  }
3889  kmp_root_t *root = __kmp_root[gtid];
3890 
3891  KMP_DEBUG_ASSERT( __kmp_threads && __kmp_threads[gtid] );
3892  KMP_ASSERT( KMP_UBER_GTID( gtid ));
3893  KMP_ASSERT( root == __kmp_threads[gtid]->th.th_root );
3894  KMP_ASSERT( root->r.r_active == FALSE );
3895 
3896 
3897  KMP_MB();
3898 
3899 #if OMP_45_ENABLED
3900  kmp_info_t * thread = __kmp_threads[gtid];
3901  kmp_team_t * team = thread->th.th_team;
3902  kmp_task_team_t * task_team = thread->th.th_task_team;
3903 
3904  // we need to wait for the proxy tasks before finishing the thread
3905  if ( task_team != NULL && task_team->tt.tt_found_proxy_tasks ) {
3906 #if OMPT_SUPPORT
3907  // the runtime is shutting down so we won't report any events
3908  thread->th.ompt_thread_info.state = ompt_state_undefined;
3909 #endif
3910  __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
3911  }
3912 #endif
3913 
3914  __kmp_reset_root(gtid, root);
3915 
3916  /* free up this thread slot */
3917  __kmp_gtid_set_specific( KMP_GTID_DNE );
3918 #ifdef KMP_TDATA_GTID
3919  __kmp_gtid = KMP_GTID_DNE;
3920 #endif
3921 
3922  KMP_MB();
3923  KC_TRACE( 10, ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid ));
3924 
3925  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
3926 }
3927 
3928 #if KMP_OS_WINDOWS
3929 /* __kmp_forkjoin_lock must be already held
3930  Unregisters a root thread that is not the current thread. Returns the number of
3931  __kmp_threads entries freed as a result.
3932  */
3933 static int
3934 __kmp_unregister_root_other_thread( int gtid )
3935 {
3936  kmp_root_t *root = __kmp_root[gtid];
3937  int r;
3938 
3939  KA_TRACE( 1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid ));
3940  KMP_DEBUG_ASSERT( __kmp_threads && __kmp_threads[gtid] );
3941  KMP_ASSERT( KMP_UBER_GTID( gtid ));
3942  KMP_ASSERT( root == __kmp_threads[gtid]->th.th_root );
3943  KMP_ASSERT( root->r.r_active == FALSE );
3944 
3945  r = __kmp_reset_root(gtid, root);
3946  KC_TRACE( 10, ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid ));
3947  return r;
3948 }
3949 #endif
3950 
3951 #if KMP_DEBUG
3952 void __kmp_task_info() {
3953 
3954  kmp_int32 gtid = __kmp_entry_gtid();
3955  kmp_int32 tid = __kmp_tid_from_gtid( gtid );
3956  kmp_info_t *this_thr = __kmp_threads[ gtid ];
3957  kmp_team_t *steam = this_thr->th.th_serial_team;
3958  kmp_team_t *team = this_thr->th.th_team;
3959 
3960  __kmp_printf( "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p curtask=%p ptask=%p\n",
3961  gtid, tid, this_thr, team, this_thr->th.th_current_task, team->t.t_implicit_task_taskdata[tid].td_parent );
3962 }
3963 #endif // KMP_DEBUG
3964 
3965 /* TODO optimize with one big memclr, take out what isn't needed,
3966  * split responsibility to workers as much as possible, and delay
3967  * initialization of features as much as possible */
3968 static void
3969 __kmp_initialize_info( kmp_info_t *this_thr, kmp_team_t *team, int tid, int gtid )
3970 {
3971  /* this_thr->th.th_info.ds.ds_gtid is setup in kmp_allocate_thread/create_worker
3972  * this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
3973  kmp_info_t *master = team->t.t_threads[0];
3974  KMP_DEBUG_ASSERT( this_thr != NULL );
3975  KMP_DEBUG_ASSERT( this_thr->th.th_serial_team );
3976  KMP_DEBUG_ASSERT( team );
3977  KMP_DEBUG_ASSERT( team->t.t_threads );
3978  KMP_DEBUG_ASSERT( team->t.t_dispatch );
3979  KMP_DEBUG_ASSERT( master );
3980  KMP_DEBUG_ASSERT( master->th.th_root );
3981 
3982  KMP_MB();
3983 
3984  TCW_SYNC_PTR(this_thr->th.th_team, team);
3985 
3986  this_thr->th.th_info.ds.ds_tid = tid;
3987  this_thr->th.th_set_nproc = 0;
3988 #if OMP_40_ENABLED
3989  this_thr->th.th_set_proc_bind = proc_bind_default;
3990 # if KMP_AFFINITY_SUPPORTED
3991  this_thr->th.th_new_place = this_thr->th.th_current_place;
3992 # endif
3993 #endif
3994  this_thr->th.th_root = master->th.th_root;
3995 
3996  /* setup the thread's cache of the team structure */
3997  this_thr->th.th_team_nproc = team->t.t_nproc;
3998  this_thr->th.th_team_master = master;
3999  this_thr->th.th_team_serialized = team->t.t_serialized;
4000  TCW_PTR(this_thr->th.th_sleep_loc, NULL);
4001 
4002  KMP_DEBUG_ASSERT( team->t.t_implicit_task_taskdata );
4003 
4004  KF_TRACE( 10, ( "__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4005  tid, gtid, this_thr, this_thr->th.th_current_task ) );
4006 
4007  __kmp_init_implicit_task( this_thr->th.th_team_master->th.th_ident, this_thr, team, tid, TRUE );
4008 
4009  KF_TRACE( 10, ( "__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4010  tid, gtid, this_thr, this_thr->th.th_current_task ) );
4011  // TODO: Initialize ICVs from parent; GEH - isn't that already done in __kmp_initialize_team()?
4012 
4013  /* TODO no worksharing in speculative threads */
4014  this_thr->th.th_dispatch = &team->t.t_dispatch[ tid ];
4015 
4016  this_thr->th.th_local.this_construct = 0;
4017 
4018 #ifdef BUILD_TV
4019  this_thr->th.th_local.tv_data = 0;
4020 #endif
4021 
4022  if ( ! this_thr->th.th_pri_common ) {
4023  this_thr->th.th_pri_common = (struct common_table *) __kmp_allocate( sizeof(struct common_table) );
4024  if ( __kmp_storage_map ) {
4025  __kmp_print_storage_map_gtid(
4026  gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4027  sizeof( struct common_table ), "th_%d.th_pri_common\n", gtid
4028  );
4029  }; // if
4030  this_thr->th.th_pri_head = NULL;
4031  }; // if
4032 
4033  /* Initialize dynamic dispatch */
4034  {
4035  volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4036  /*
4037  * Use team max_nproc since this will never change for the team.
4038  */
4039  size_t disp_size = sizeof( dispatch_private_info_t ) *
4040  ( team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers );
4041  KD_TRACE( 10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid, team->t.t_max_nproc ) );
4042  KMP_ASSERT( dispatch );
4043  KMP_DEBUG_ASSERT( team->t.t_dispatch );
4044  KMP_DEBUG_ASSERT( dispatch == &team->t.t_dispatch[ tid ] );
4045 
4046  dispatch->th_disp_index = 0;
4047 #if OMP_45_ENABLED
4048  dispatch->th_doacross_buf_idx = 0;
4049 #endif
4050  if( ! dispatch->th_disp_buffer ) {
4051  dispatch->th_disp_buffer = (dispatch_private_info_t *) __kmp_allocate( disp_size );
4052 
4053  if ( __kmp_storage_map ) {
4054  __kmp_print_storage_map_gtid( gtid, &dispatch->th_disp_buffer[ 0 ],
4055  &dispatch->th_disp_buffer[ team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers ],
4056  disp_size, "th_%d.th_dispatch.th_disp_buffer "
4057  "(team_%d.t_dispatch[%d].th_disp_buffer)",
4058  gtid, team->t.t_id, gtid );
4059  }
4060  } else {
4061  memset( & dispatch->th_disp_buffer[0], '\0', disp_size );
4062  }
4063 
4064  dispatch->th_dispatch_pr_current = 0;
4065  dispatch->th_dispatch_sh_current = 0;
4066 
4067  dispatch->th_deo_fcn = 0; /* ORDERED */
4068  dispatch->th_dxo_fcn = 0; /* END ORDERED */
4069  }
4070 
4071  this_thr->th.th_next_pool = NULL;
4072 
4073  if (!this_thr->th.th_task_state_memo_stack) {
4074  size_t i;
4075  this_thr->th.th_task_state_memo_stack = (kmp_uint8 *) __kmp_allocate( 4*sizeof(kmp_uint8) );
4076  this_thr->th.th_task_state_top = 0;
4077  this_thr->th.th_task_state_stack_sz = 4;
4078  for (i=0; i<this_thr->th.th_task_state_stack_sz; ++i) // zero init the stack
4079  this_thr->th.th_task_state_memo_stack[i] = 0;
4080  }
4081 
4082  KMP_DEBUG_ASSERT( !this_thr->th.th_spin_here );
4083  KMP_DEBUG_ASSERT( this_thr->th.th_next_waiting == 0 );
4084 
4085  KMP_MB();
4086 }
4087 
4088 
4089 /* allocate a new thread for the requesting team. this is only called from within a
4090  * forkjoin critical section. we will first try to get an available thread from the
4091  * thread pool. if none is available, we will fork a new one, assuming we are able
4092  * to create one; this should be assured, as the caller is expected to have checked
4093  * this first. (A minimal sketch of this reuse-or-fork shape follows the function body.)
4094  */
4095 kmp_info_t *
4096 __kmp_allocate_thread( kmp_root_t *root, kmp_team_t *team, int new_tid )
4097 {
4098  kmp_team_t *serial_team;
4099  kmp_info_t *new_thr;
4100  int new_gtid;
4101 
4102  KA_TRACE( 20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid() ));
4103  KMP_DEBUG_ASSERT( root && team );
4104 #if !KMP_NESTED_HOT_TEAMS
4105  KMP_DEBUG_ASSERT( KMP_MASTER_GTID( __kmp_get_gtid() ));
4106 #endif
4107  KMP_MB();
4108 
4109  /* first, try to get one from the thread pool */
4110  if ( __kmp_thread_pool ) {
4111 
4112  new_thr = (kmp_info_t*)__kmp_thread_pool;
4113  __kmp_thread_pool = (volatile kmp_info_t *) new_thr->th.th_next_pool;
4114  if ( new_thr == __kmp_thread_pool_insert_pt ) {
4115  __kmp_thread_pool_insert_pt = NULL;
4116  }
4117  TCW_4(new_thr->th.th_in_pool, FALSE);
4118  //
4119  // Don't touch th_active_in_pool or th_active.
4120  // The worker thread adjusts those flags as it sleeps/awakens.
4121  //
4122  __kmp_thread_pool_nth--;
4123 
4124  KA_TRACE( 20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4125  __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid ));
4126  KMP_ASSERT( ! new_thr->th.th_team );
4127  KMP_DEBUG_ASSERT( __kmp_nth < __kmp_threads_capacity );
4128  KMP_DEBUG_ASSERT( __kmp_thread_pool_nth >= 0 );
4129 
4130  /* setup the thread structure */
4131  __kmp_initialize_info( new_thr, team, new_tid, new_thr->th.th_info.ds.ds_gtid );
4132  KMP_DEBUG_ASSERT( new_thr->th.th_serial_team );
4133 
4134  TCW_4(__kmp_nth, __kmp_nth + 1);
4135 
4136  new_thr->th.th_task_state = 0;
4137  new_thr->th.th_task_state_top = 0;
4138  new_thr->th.th_task_state_stack_sz = 4;
4139 
4140 #ifdef KMP_ADJUST_BLOCKTIME
4141  /* Adjust blocktime back to zero if necessary */
4142  /* Middle initialization might not have occurred yet */
4143  if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
4144  if ( __kmp_nth > __kmp_avail_proc ) {
4145  __kmp_zero_bt = TRUE;
4146  }
4147  }
4148 #endif /* KMP_ADJUST_BLOCKTIME */
4149 
4150 #if KMP_DEBUG
4151  // If the thread entered the pool via __kmp_free_thread, wait_flag should not be KMP_BARRIER_PARENT_FLAG.
4152  int b;
4153  kmp_balign_t * balign = new_thr->th.th_bar;
4154  for( b = 0; b < bs_last_barrier; ++ b )
4155  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4156 #endif
4157 
4158  KF_TRACE( 10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4159  __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid ));
4160 
4161  KMP_MB();
4162  return new_thr;
4163  }
4164 
4165 
4166  /* no, we'll fork a new one */
4167  KMP_ASSERT( __kmp_nth == __kmp_all_nth );
4168  KMP_ASSERT( __kmp_all_nth < __kmp_threads_capacity );
4169 
4170  //
4171  // If this is the first worker thread the RTL is creating, then also
4172  // launch the monitor thread. We try to do this as early as possible.
4173  //
4174  if ( ! TCR_4( __kmp_init_monitor ) ) {
4175  __kmp_acquire_bootstrap_lock( & __kmp_monitor_lock );
4176  if ( ! TCR_4( __kmp_init_monitor ) ) {
4177  KF_TRACE( 10, ( "before __kmp_create_monitor\n" ) );
4178  TCW_4( __kmp_init_monitor, 1 );
4179  __kmp_create_monitor( & __kmp_monitor );
4180  KF_TRACE( 10, ( "after __kmp_create_monitor\n" ) );
4181  #if KMP_OS_WINDOWS
4182  // AC: wait until monitor has started. This is a fix for CQ232808.
4183  // The reason is that if the library is loaded/unloaded in a loop with small (parallel)
4184  // work in between, then there is a high probability that the monitor thread starts after
4185  // the library shutdown. At shutdown it is too late to cope with the problem, because
4186  // when the master is in DllMain (process detach) the monitor has no chance to start
4187  // (it is blocked), and the master has no means to inform the monitor that the library has gone,
4188  // because all the memory the monitor can access is going to be released/reset.
4189  while ( TCR_4(__kmp_init_monitor) < 2 ) {
4190  KMP_YIELD( TRUE );
4191  }
4192  KF_TRACE( 10, ( "after monitor thread has started\n" ) );
4193  #endif
4194  }
4195  __kmp_release_bootstrap_lock( & __kmp_monitor_lock );
4196  }
4197 
4198  KMP_MB();
4199  for( new_gtid=1 ; TCR_PTR(__kmp_threads[new_gtid]) != NULL; ++new_gtid ) {
4200  KMP_DEBUG_ASSERT( new_gtid < __kmp_threads_capacity );
4201  }
4202 
4203  /* allocate space for it. */
4204  new_thr = (kmp_info_t*) __kmp_allocate( sizeof(kmp_info_t) );
4205 
4206  TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4207 
4208  if ( __kmp_storage_map ) {
4209  __kmp_print_thread_storage_map( new_thr, new_gtid );
4210  }
4211 
4212  /* add the reserve serialized team, initialized from the team's master thread */
4213  {
4214  kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs( team );
4215  KF_TRACE( 10, ( "__kmp_allocate_thread: before th_serial/serial_team\n" ) );
4216 
4217  new_thr->th.th_serial_team = serial_team =
4218  (kmp_team_t*) __kmp_allocate_team( root, 1, 1,
4219 #if OMPT_SUPPORT
4220  0, // root parallel id
4221 #endif
4222 #if OMP_40_ENABLED
4223  proc_bind_default,
4224 #endif
4225  &r_icvs,
4226  0 USE_NESTED_HOT_ARG(NULL) );
4227  }
4228  KMP_ASSERT ( serial_team );
4229  serial_team->t.t_serialized = 0; // AC: the team is created in reserve, not for execution (it is unused for now).
4230  serial_team->t.t_threads[0] = new_thr;
4231  KF_TRACE( 10, ( "__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4232  new_thr ) );
4233 
4234  /* setup the thread structures */
4235  __kmp_initialize_info( new_thr, team, new_tid, new_gtid );
4236 
4237  #if USE_FAST_MEMORY
4238  __kmp_initialize_fast_memory( new_thr );
4239  #endif /* USE_FAST_MEMORY */
4240 
4241  #if KMP_USE_BGET
4242  KMP_DEBUG_ASSERT( new_thr->th.th_local.bget_data == NULL );
4243  __kmp_initialize_bget( new_thr );
4244  #endif
4245 
4246  __kmp_init_random( new_thr ); // Initialize random number generator
4247 
4248  /* Initialize these only once when thread is grabbed for a team allocation */
4249  KA_TRACE( 20, ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4250  __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE ));
4251 
4252  int b;
4253  kmp_balign_t * balign = new_thr->th.th_bar;
4254  for(b=0; b<bs_last_barrier; ++b) {
4255  balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4256  balign[b].bb.team = NULL;
4257  balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4258  balign[b].bb.use_oncore_barrier = 0;
4259  }
4260 
4261  new_thr->th.th_spin_here = FALSE;
4262  new_thr->th.th_next_waiting = 0;
4263 
4264 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
4265  new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4266  new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4267  new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4268  new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4269 #endif
4270 
4271  TCW_4(new_thr->th.th_in_pool, FALSE);
4272  new_thr->th.th_active_in_pool = FALSE;
4273  TCW_4(new_thr->th.th_active, TRUE);
4274 
4275  /* adjust the global counters */
4276  __kmp_all_nth ++;
4277  __kmp_nth ++;
4278 
4279  //
4280  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search)
4281  // for low numbers of procs, and method #2 (keyed API call) for higher
4282  // numbers of procs.
4283  //
4284  if ( __kmp_adjust_gtid_mode ) {
4285  if ( __kmp_all_nth >= __kmp_tls_gtid_min ) {
4286  if ( TCR_4(__kmp_gtid_mode) != 2) {
4287  TCW_4(__kmp_gtid_mode, 2);
4288  }
4289  }
4290  else {
4291  if (TCR_4(__kmp_gtid_mode) != 1 ) {
4292  TCW_4(__kmp_gtid_mode, 1);
4293  }
4294  }
4295  }
4296 
4297 #ifdef KMP_ADJUST_BLOCKTIME
4298  /* Adjust blocktime back to zero if necessary */
4299  /* Middle initialization might not have occurred yet */
4300  if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
4301  if ( __kmp_nth > __kmp_avail_proc ) {
4302  __kmp_zero_bt = TRUE;
4303  }
4304  }
4305 #endif /* KMP_ADJUST_BLOCKTIME */
4306 
4307  /* actually fork it and create the new worker thread */
4308  KF_TRACE( 10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr ));
4309  __kmp_create_worker( new_gtid, new_thr, __kmp_stksize );
4310  KF_TRACE( 10, ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr ));
4311 
4312  KA_TRACE( 20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(), new_gtid ));
4313  KMP_MB();
4314  return new_thr;
4315 }
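/*
    The routine above is a "reuse before create" allocator: pop a parked thread
    from __kmp_thread_pool and reinitialize it if possible, otherwise claim the
    first free gtid slot and fork a brand-new worker. A minimal sketch of that
    shape (illustrative only, kept out of the build with #if 0; worker_t and the
    helpers are hypothetical names, not runtime API):
*/
#if 0
typedef struct worker { struct worker *next; /* ... */ } worker_t;

static worker_t *pool_head;                  /* stands in for __kmp_thread_pool              */

extern void      reinitialize(worker_t *);   /* stands in for __kmp_initialize_info etc.     */
extern worker_t *fork_new_worker(void);      /* stands in for allocate + __kmp_create_worker */

static worker_t *acquire_worker(void) {
    if (pool_head != NULL) {                 /* fast path: recycle a parked worker */
        worker_t *w = pool_head;
        pool_head = w->next;
        reinitialize(w);
        return w;
    }
    return fork_new_worker();                /* slow path: create a fresh one */
}
#endif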
4316 
4317 /*
4318  * reinitialize team for reuse.
4319  *
4320  * The hot team code calls this case at every fork barrier, so EPCC barrier
4321  * tests are extremely sensitive to changes in it, esp. writes to the team
4322  * struct, which cause a cache invalidation in all threads.
4323  *
4324  * IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!!
4325  */
4326 static void
4327 __kmp_reinitialize_team( kmp_team_t *team, kmp_internal_control_t *new_icvs, ident_t *loc ) {
4328  KF_TRACE( 10, ( "__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4329  team->t.t_threads[0], team ) );
4330  KMP_DEBUG_ASSERT( team && new_icvs);
4331  KMP_DEBUG_ASSERT( ( ! TCR_4(__kmp_init_parallel) ) || new_icvs->nproc );
4332  KMP_CHECK_UPDATE(team->t.t_ident, loc);
4333 
4334  KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4335 
4336  // Copy ICVs to the master thread's implicit taskdata
4337  __kmp_init_implicit_task( loc, team->t.t_threads[0], team, 0, FALSE );
4338  copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4339 
4340  KF_TRACE( 10, ( "__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4341  team->t.t_threads[0], team ) );
4342 }
4343 
4344 
4345 /* initialize the team data structure
4346  * this assumes the t_threads and t_max_nproc are already set
4347  * also, we don't touch the arguments */
4348 static void
4349 __kmp_initialize_team(
4350  kmp_team_t * team,
4351  int new_nproc,
4352  kmp_internal_control_t * new_icvs,
4353  ident_t * loc
4354 ) {
4355  KF_TRACE( 10, ( "__kmp_initialize_team: enter: team=%p\n", team ) );
4356 
4357  /* verify */
4358  KMP_DEBUG_ASSERT( team );
4359  KMP_DEBUG_ASSERT( new_nproc <= team->t.t_max_nproc );
4360  KMP_DEBUG_ASSERT( team->t.t_threads );
4361  KMP_MB();
4362 
4363  team->t.t_master_tid = 0; /* not needed */
4364  /* team->t.t_master_bar; not needed */
4365  team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4366  team->t.t_nproc = new_nproc;
4367 
4368  /* team->t.t_parent = NULL; TODO not needed & would mess up hot team */
4369  team->t.t_next_pool = NULL;
4370  /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess up hot team */
4371 
4372  TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4373  team->t.t_invoke = NULL; /* not needed */
4374 
4375  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4376  team->t.t_sched = new_icvs->sched;
4377 
4378 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4379  team->t.t_fp_control_saved = FALSE; /* not needed */
4380  team->t.t_x87_fpu_control_word = 0; /* not needed */
4381  team->t.t_mxcsr = 0; /* not needed */
4382 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4383 
4384  team->t.t_construct = 0;
4385  __kmp_init_lock( & team->t.t_single_lock );
4386 
4387  team->t.t_ordered .dt.t_value = 0;
4388  team->t.t_master_active = FALSE;
4389 
4390  memset( & team->t.t_taskq, '\0', sizeof( kmp_taskq_t ));
4391 
4392 #ifdef KMP_DEBUG
4393  team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4394 #endif
4395  team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4396 
4397  team->t.t_control_stack_top = NULL;
4398 
4399  __kmp_reinitialize_team( team, new_icvs, loc );
4400 
4401  KMP_MB();
4402  KF_TRACE( 10, ( "__kmp_initialize_team: exit: team=%p\n", team ) );
4403 }
4404 
4405 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
4406 /* Sets full mask for thread and returns old mask, no changes to structures. */
4407 /* Sets the full mask for the thread and stores the old mask in *old_mask; no changes to structures. */
4408 __kmp_set_thread_affinity_mask_full_tmp( kmp_affin_mask_t *old_mask )
4409 {
4410  if ( KMP_AFFINITY_CAPABLE() ) {
4411  int status;
4412  if ( old_mask != NULL ) {
4413  status = __kmp_get_system_affinity( old_mask, TRUE );
4414  int error = errno;
4415  if ( status != 0 ) {
4416  __kmp_msg(
4417  kmp_ms_fatal,
4418  KMP_MSG( ChangeThreadAffMaskError ),
4419  KMP_ERR( error ),
4420  __kmp_msg_null
4421  );
4422  }
4423  }
4424  __kmp_set_system_affinity( __kmp_affin_fullMask, TRUE );
4425  }
4426 }
4427 #endif
4428 
4429 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
4430 
4431 //
4432 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4433 // It calculates the worker + master thread's partition based upon the parent
4434 // thread's partition, and binds each worker to a place in its partition.
4435 // The master thread's partition should already include its current binding.
4436 //
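//
// For example, with proc_bind_close and no oversubscription (n_th <= n_places)
// the code below just walks the places cyclically, starting from the master's
// place and staying inside [first_place, last_place]. A sketch of that
// wrap-around step (illustrative only, kept out of the build with #if 0;
// next_place is a hypothetical helper; num_masks stands for
// __kmp_affinity_num_masks):
#if 0
static int next_place(int place, int first_place, int last_place, int num_masks) {
    if (place == last_place)
        return first_place;      /* wrap within the partition          */
    if (place == num_masks - 1)
        return 0;                /* wrap around the global place table */
    return place + 1;
}
/* e.g. partition [2,5], master on place 4: workers 1..3 get places 5, 2, 3 */
#endif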
4437 static void
4438 __kmp_partition_places( kmp_team_t *team, int update_master_only )
4439 {
4440  //
4441  // Copy the master thread's place partition to the team struct
4442  //
4443  kmp_info_t *master_th = team->t.t_threads[0];
4444  KMP_DEBUG_ASSERT( master_th != NULL );
4445  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4446  int first_place = master_th->th.th_first_place;
4447  int last_place = master_th->th.th_last_place;
4448  int masters_place = master_th->th.th_current_place;
4449  team->t.t_first_place = first_place;
4450  team->t.t_last_place = last_place;
4451 
4452  KA_TRACE( 20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) bound to place %d partition = [%d,%d]\n",
4453  proc_bind, __kmp_gtid_from_thread( team->t.t_threads[0] ), team->t.t_id,
4454  masters_place, first_place, last_place ) );
4455 
4456  switch ( proc_bind ) {
4457 
4458  case proc_bind_default:
4459  //
4460  // serial teams might have the proc_bind policy set to
4461  // proc_bind_default. It doesn't matter, as we don't
4462  // rebind the master thread for any proc_bind policy.
4463  //
4464  KMP_DEBUG_ASSERT( team->t.t_nproc == 1 );
4465  break;
4466 
4467  case proc_bind_master:
4468  {
4469  int f;
4470  int n_th = team->t.t_nproc;
4471  for ( f = 1; f < n_th; f++ ) {
4472  kmp_info_t *th = team->t.t_threads[f];
4473  KMP_DEBUG_ASSERT( th != NULL );
4474  th->th.th_first_place = first_place;
4475  th->th.th_last_place = last_place;
4476  th->th.th_new_place = masters_place;
4477 
4478  KA_TRACE( 100, ("__kmp_partition_places: master: T#%d(%d:%d) place %d partition = [%d,%d]\n",
4479  __kmp_gtid_from_thread( team->t.t_threads[f] ),
4480  team->t.t_id, f, masters_place, first_place, last_place ) );
4481  }
4482  }
4483  break;
4484 
4485  case proc_bind_close:
4486  {
4487  int f;
4488  int n_th = team->t.t_nproc;
4489  int n_places;
4490  if ( first_place <= last_place ) {
4491  n_places = last_place - first_place + 1;
4492  }
4493  else {
4494  n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4495  }
4496  if ( n_th <= n_places ) {
4497  int place = masters_place;
4498  for ( f = 1; f < n_th; f++ ) {
4499  kmp_info_t *th = team->t.t_threads[f];
4500  KMP_DEBUG_ASSERT( th != NULL );
4501 
4502  if ( place == last_place ) {
4503  place = first_place;
4504  }
4505  else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4506  place = 0;
4507  }
4508  else {
4509  place++;
4510  }
4511  th->th.th_first_place = first_place;
4512  th->th.th_last_place = last_place;
4513  th->th.th_new_place = place;
4514 
4515  KA_TRACE( 100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d partition = [%d,%d]\n",
4516  __kmp_gtid_from_thread( team->t.t_threads[f] ),
4517  team->t.t_id, f, place, first_place, last_place ) );
4518  }
4519  }
4520  else {
4521  int S, rem, gap, s_count;
4522  S = n_th / n_places;
4523  s_count = 0;
4524  rem = n_th - ( S * n_places );
4525  gap = rem > 0 ? n_places/rem : n_places;
4526  int place = masters_place;
4527  int gap_ct = gap;
4528  for ( f = 0; f < n_th; f++ ) {
4529  kmp_info_t *th = team->t.t_threads[f];
4530  KMP_DEBUG_ASSERT( th != NULL );
4531 
4532  th->th.th_first_place = first_place;
4533  th->th.th_last_place = last_place;
4534  th->th.th_new_place = place;
4535  s_count++;
4536 
4537  if ( (s_count == S) && rem && (gap_ct == gap) ) {
4538  // do nothing, add an extra thread to place on next iteration
4539  }
4540  else if ( (s_count == S+1) && rem && (gap_ct == gap) ) {
4541  // we added an extra thread to this place; move to next place
4542  if ( place == last_place ) {
4543  place = first_place;
4544  }
4545  else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4546  place = 0;
4547  }
4548  else {
4549  place++;
4550  }
4551  s_count = 0;
4552  gap_ct = 1;
4553  rem--;
4554  }
4555  else if (s_count == S) { // place full; don't add extra
4556  if ( place == last_place ) {
4557  place = first_place;
4558  }
4559  else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4560  place = 0;
4561  }
4562  else {
4563  place++;
4564  }
4565  gap_ct++;
4566  s_count = 0;
4567  }
4568 
4569  KA_TRACE( 100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d partition = [%d,%d]\n",
4570  __kmp_gtid_from_thread( team->t.t_threads[f] ),
4571  team->t.t_id, f, th->th.th_new_place, first_place,
4572  last_place ) );
4573  }
4574  KMP_DEBUG_ASSERT( place == masters_place );
4575  }
4576  }
4577  break;
4578 
4579  case proc_bind_spread:
4580  {
4581  int f;
4582  int n_th = team->t.t_nproc;
4583  int n_places;
4584  int thidx;
4585  if ( first_place <= last_place ) {
4586  n_places = last_place - first_place + 1;
4587  }
4588  else {
4589  n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4590  }
4591  if ( n_th <= n_places ) {
4592  int place = masters_place;
4593  int S = n_places/n_th;
4594  int s_count, rem, gap, gap_ct;
4595  rem = n_places - n_th*S;
4596  gap = rem ? n_th/rem : 1;
4597  gap_ct = gap;
4598  thidx = n_th;
4599  if (update_master_only == 1)
4600  thidx = 1;
4601  for ( f = 0; f < thidx; f++ ) {
4602  kmp_info_t *th = team->t.t_threads[f];
4603  KMP_DEBUG_ASSERT( th != NULL );
4604 
4605  th->th.th_first_place = place;
4606  th->th.th_new_place = place;
4607  s_count = 1;
4608  while (s_count < S) {
4609  if ( place == last_place ) {
4610  place = first_place;
4611  }
4612  else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4613  place = 0;
4614  }
4615  else {
4616  place++;
4617  }
4618  s_count++;
4619  }
4620  if (rem && (gap_ct == gap)) {
4621  if ( place == last_place ) {
4622  place = first_place;
4623  }
4624  else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4625  place = 0;
4626  }
4627  else {
4628  place++;
4629  }
4630  rem--;
4631  gap_ct = 0;
4632  }
4633  th->th.th_last_place = place;
4634  gap_ct++;
4635 
4636  if ( place == last_place ) {
4637  place = first_place;
4638  }
4639  else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4640  place = 0;
4641  }
4642  else {
4643  place++;
4644  }
4645 
4646  KA_TRACE( 100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d partition = [%d,%d]\n",
4647  __kmp_gtid_from_thread( team->t.t_threads[f] ),
4648  team->t.t_id, f, th->th.th_new_place,
4649  th->th.th_first_place, th->th.th_last_place ) );
4650  }
4651  KMP_DEBUG_ASSERT( update_master_only || place == masters_place );
4652  }
4653  else {
4654  int S, rem, gap, s_count;
4655  S = n_th / n_places;
4656  s_count = 0;
4657  rem = n_th - ( S * n_places );
4658  gap = rem > 0 ? n_places/rem : n_places;
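 // This is the same S/rem/gap distribution as in the proc_bind_close
 // oversubscription case above, except that each thread's partition is
 // collapsed to a single place (th_first_place == th_last_place ==
 // th_new_place).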
4659  int place = masters_place;
4660  int gap_ct = gap;
4661  thidx = n_th;
4662  if (update_master_only == 1)
4663  thidx = 1;
4664  for ( f = 0; f < thidx; f++ ) {
4665  kmp_info_t *th = team->t.t_threads[f];
4666  KMP_DEBUG_ASSERT( th != NULL );
4667 
4668  th->th.th_first_place = place;
4669  th->th.th_last_place = place;
4670  th->th.th_new_place = place;
4671  s_count++;
4672 
4673  if ( (s_count == S) && rem && (gap_ct == gap) ) {
4674  // do nothing, add an extra thread to place on next iteration
4675  }
4676  else if ( (s_count == S+1) && rem && (gap_ct == gap) ) {
4677  // we added an extra thread to this place; move on to next place
4678  if ( place == last_place ) {
4679  place = first_place;
4680  }
4681  else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4682  place = 0;
4683  }
4684  else {
4685  place++;
4686  }
4687  s_count = 0;
4688  gap_ct = 1;
4689  rem--;
4690  }
4691  else if (s_count == S) { // place is full; don't add extra thread
4692  if ( place == last_place ) {
4693  place = first_place;
4694  }
4695  else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4696  place = 0;
4697  }
4698  else {
4699  place++;
4700  }
4701  gap_ct++;
4702  s_count = 0;
4703  }
4704 
4705  KA_TRACE( 100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d partition = [%d,%d]\n",
4706  __kmp_gtid_from_thread( team->t.t_threads[f] ),
4707  team->t.t_id, f, th->th.th_new_place,
4708  th->th.th_first_place, th->th.th_last_place) );
4709  }
4710  KMP_DEBUG_ASSERT( update_master_only || place == masters_place );
4711  }
4712  }
4713  break;
4714 
4715  default:
4716  break;
4717  }
4718 
4719  KA_TRACE( 20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id ) );
4720 }
4721 
4722 #endif /* OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED */
4723 
4724 /* allocate a new team data structure to use. take one off of the free pool if available */
4725 kmp_team_t *
4726 __kmp_allocate_team( kmp_root_t *root, int new_nproc, int max_nproc,
4727 #if OMPT_SUPPORT
4728  ompt_parallel_id_t ompt_parallel_id,
4729 #endif
4730 #if OMP_40_ENABLED
4731  kmp_proc_bind_t new_proc_bind,
4732 #endif
4733  kmp_internal_control_t *new_icvs,
4734  int argc USE_NESTED_HOT_ARG(kmp_info_t *master) )
4735 {
4736  KMP_TIME_DEVELOPER_BLOCK(KMP_allocate_team);
4737  int f;
4738  kmp_team_t *team;
4739  int use_hot_team = ! root->r.r_active;
4740  int level = 0;
4741 
4742  KA_TRACE( 20, ("__kmp_allocate_team: called\n"));
4743  KMP_DEBUG_ASSERT( new_nproc >=1 && argc >=0 );
4744  KMP_DEBUG_ASSERT( max_nproc >= new_nproc );
4745  KMP_MB();
4746 
4747 #if KMP_NESTED_HOT_TEAMS
4748  kmp_hot_team_ptr_t *hot_teams;
4749  if( master ) {
4750  team = master->th.th_team;
4751  level = team->t.t_active_level;
4752  if( master->th.th_teams_microtask ) { // in teams construct?
4753  if( master->th.th_teams_size.nteams > 1 && ( // #teams > 1
4754  team->t.t_pkfn == (microtask_t)__kmp_teams_master || // inner fork of the teams
4755  master->th.th_teams_level < team->t.t_level ) ) { // or nested parallel inside the teams
4756  ++level; // do not increment if #teams==1 or for the outer fork of the teams; increment otherwise
4757  }
4758  }
4759  hot_teams = master->th.th_hot_teams;
4760  if( level < __kmp_hot_teams_max_level && hot_teams && hot_teams[level].hot_team )
4761  { // hot team has already been allocated for given level
4762  use_hot_team = 1;
4763  } else {
4764  use_hot_team = 0;
4765  }
4766  }
4767 #endif
4768  // Optimization to use a "hot" team
4769  if( use_hot_team && new_nproc > 1 ) {
4770  KMP_DEBUG_ASSERT( new_nproc == max_nproc );
4771 #if KMP_NESTED_HOT_TEAMS
4772  team = hot_teams[level].hot_team;
4773 #else
4774  team = root->r.r_hot_team;
4775 #endif
4776 #if KMP_DEBUG
4777  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
4778  KA_TRACE( 20, ("__kmp_allocate_team: hot team task_team[0] = %p task_team[1] = %p before reinit\n",
4779  team->t.t_task_team[0], team->t.t_task_team[1] ));
4780  }
4781 #endif
4782 
4783  // Has the number of threads changed?
4784  /* Let's assume the most common case is that the number of threads is unchanged, and
4785  put that case first. */
4786  if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
4787  KA_TRACE( 20, ("__kmp_allocate_team: reusing hot team\n" ));
4788  // This case can mean that omp_set_num_threads() was called and the hot team size
4789  // was already reduced, so we check the special flag
4790  if ( team->t.t_size_changed == -1 ) {
4791  team->t.t_size_changed = 1;
4792  } else {
4793  KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
4794  }
4795 
4796  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4797  kmp_r_sched_t new_sched = new_icvs->sched;
4798  if (team->t.t_sched.r_sched_type != new_sched.r_sched_type ||
4799  team->t.t_sched.chunk != new_sched.chunk)
4800  team->t.t_sched = new_sched; // set master's schedule as new run-time schedule
4801 
4802  __kmp_reinitialize_team( team, new_icvs, root->r.r_uber_thread->th.th_ident );
4803 
4804  KF_TRACE( 10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n",
4805  0, team->t.t_threads[0], team ) );
4806  __kmp_push_current_task_to_thread( team->t.t_threads[ 0 ], team, 0 );
4807 
4808 #if OMP_40_ENABLED
4809 # if KMP_AFFINITY_SUPPORTED
4810  if ( ( team->t.t_size_changed == 0 )
4811  && ( team->t.t_proc_bind == new_proc_bind ) ) {
4812  if (new_proc_bind == proc_bind_spread) {
4813  __kmp_partition_places(team, 1); // add flag to update only master for spread
4814  }
4815  KA_TRACE( 200, ("__kmp_allocate_team: reusing hot team #%d bindings: proc_bind = %d, partition = [%d,%d]\n",
4816  team->t.t_id, new_proc_bind, team->t.t_first_place,
4817  team->t.t_last_place ) );
4818  }
4819  else {
4820  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4821  __kmp_partition_places( team );
4822  }
4823 # else
4824  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4825 # endif /* KMP_AFFINITY_SUPPORTED */
4826 #endif /* OMP_40_ENABLED */
4827  }
4828  else if( team->t.t_nproc > new_nproc ) {
4829  KA_TRACE( 20, ("__kmp_allocate_team: decreasing hot team thread count to %d\n", new_nproc ));
4830 
4831  team->t.t_size_changed = 1;
4832 #if KMP_NESTED_HOT_TEAMS
4833  if( __kmp_hot_teams_mode == 0 ) {
4834  // AC: saved number of threads should correspond to team's value in this mode,
4835  // can be bigger in mode 1, when hot team has some threads in reserve
4836  KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
4837  hot_teams[level].hot_team_nth = new_nproc;
4838 #endif // KMP_NESTED_HOT_TEAMS
4839  /* release the extra threads we don't need any more */
4840  for( f = new_nproc ; f < team->t.t_nproc ; f++ ) {
4841  KMP_DEBUG_ASSERT( team->t.t_threads[ f ] );
4842  if ( __kmp_tasking_mode != tskm_immediate_exec) {
4843  // When decreasing team size, threads no longer in the team should unref task team.
4844  team->t.t_threads[f]->th.th_task_team = NULL;
4845  }
4846  __kmp_free_thread( team->t.t_threads[ f ] );
4847  team->t.t_threads[ f ] = NULL;
4848  }
4849 #if KMP_NESTED_HOT_TEAMS
4850  } // (__kmp_hot_teams_mode == 0)
4851  else {
4852  // When keeping extra threads in team, switch threads to wait on own b_go flag
4853  for (f=new_nproc; f<team->t.t_nproc; ++f) {
4854  KMP_DEBUG_ASSERT(team->t.t_threads[f]);
4855  kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
4856  for (int b=0; b<bs_last_barrier; ++b) {
4857  if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
4858  balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
4859  }
4860  KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
4861  }
4862  }
4863  }
4864 #endif // KMP_NESTED_HOT_TEAMS
4865  team->t.t_nproc = new_nproc;
4866  // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4867  if (team->t.t_sched.r_sched_type != new_icvs->sched.r_sched_type ||
4868  team->t.t_sched.chunk != new_icvs->sched.chunk)
4869  team->t.t_sched = new_icvs->sched;
4870  __kmp_reinitialize_team( team, new_icvs, root->r.r_uber_thread->th.th_ident );
4871 
4872  /* update the remaining threads */
4873  for(f = 0; f < new_nproc; ++f) {
4874  team->t.t_threads[f]->th.th_team_nproc = new_nproc;
4875  }
4876  // restore the current task state of the master thread: should be the implicit task
4877  KF_TRACE( 10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n",
4878  0, team->t.t_threads[0], team ) );
4879 
4880  __kmp_push_current_task_to_thread( team->t.t_threads[ 0 ], team, 0 );
4881 
4882 #ifdef KMP_DEBUG
4883  for ( f = 0; f < team->t.t_nproc; f++ ) {
4884  KMP_DEBUG_ASSERT( team->t.t_threads[f] &&
4885  team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc );
4886  }
4887 #endif
4888 
4889 #if OMP_40_ENABLED
4890  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4891 # if KMP_AFFINITY_SUPPORTED
4892  __kmp_partition_places( team );
4893 # endif
4894 #endif
4895  }
4896  else { // team->t.t_nproc < new_nproc
4897 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
4898  kmp_affin_mask_t *old_mask;
4899  if ( KMP_AFFINITY_CAPABLE() ) {
4900  KMP_CPU_ALLOC(old_mask);
4901  }
4902 #endif
4903 
4904  KA_TRACE( 20, ("__kmp_allocate_team: increasing hot team thread count to %d\n", new_nproc ));
4905 
4906  team->t.t_size_changed = 1;
4907 
4908 #if KMP_NESTED_HOT_TEAMS
4909  int avail_threads = hot_teams[level].hot_team_nth;
4910  if( new_nproc < avail_threads )
4911  avail_threads = new_nproc;
4912  kmp_info_t **other_threads = team->t.t_threads;
4913  for ( f = team->t.t_nproc; f < avail_threads; ++f ) {
4914  // Adjust barrier data of reserved threads (if any) of the team
4915  // Other data will be set in __kmp_initialize_info() below.
4916  int b;
4917  kmp_balign_t * balign = other_threads[f]->th.th_bar;
4918  for ( b = 0; b < bs_last_barrier; ++ b ) {
4919  balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
4920  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4921 #if USE_DEBUGGER
4922  balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
4923 #endif
4924  }
4925  }
4926  if( hot_teams[level].hot_team_nth >= new_nproc ) {
4927  // we have all needed threads in reserve, no need to allocate any
4928  // this is only possible in mode 1; there cannot be reserved threads in mode 0
4929  KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
4930  team->t.t_nproc = new_nproc; // just get reserved threads involved
4931  } else {
4932  // we may have some threads in reserve, but not enough
4933  team->t.t_nproc = hot_teams[level].hot_team_nth; // get reserved threads involved if any
4934  hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
4935 #endif // KMP_NESTED_HOT_TEAMS
4936  if(team->t.t_max_nproc < new_nproc) {
4937  /* reallocate larger arrays */
4938  __kmp_reallocate_team_arrays(team, new_nproc);
4939  __kmp_reinitialize_team( team, new_icvs, NULL );
4940  }
4941 
4942 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
4943  /* Temporarily set full mask for master thread before
4944  creation of workers. The reason is that workers inherit
4945  the affinity from master, so if a lot of workers are
4946  created on a single core quickly, they don't get
4947  a chance to set their own affinity for a long time.
4948  */
4949  __kmp_set_thread_affinity_mask_full_tmp( old_mask );
4950 #endif
4951 
4952  /* allocate new threads for the hot team */
4953  for( f = team->t.t_nproc ; f < new_nproc ; f++ ) {
4954  kmp_info_t * new_worker = __kmp_allocate_thread( root, team, f );
4955  KMP_DEBUG_ASSERT( new_worker );
4956  team->t.t_threads[ f ] = new_worker;
4957 
4958  KA_TRACE( 20, ("__kmp_allocate_team: team %d init T#%d arrived: join=%llu, plain=%llu\n",
4959  team->t.t_id, __kmp_gtid_from_tid( f, team ), team->t.t_id, f,
4960  team->t.t_bar[bs_forkjoin_barrier].b_arrived,
4961  team->t.t_bar[bs_plain_barrier].b_arrived ) );
4962 
4963  { // Initialize barrier data for new threads.
4964  int b;
4965  kmp_balign_t * balign = new_worker->th.th_bar;
4966  for( b = 0; b < bs_last_barrier; ++ b ) {
4967  balign[ b ].bb.b_arrived = team->t.t_bar[ b ].b_arrived;
4968  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4969 #if USE_DEBUGGER
4970  balign[ b ].bb.b_worker_arrived = team->t.t_bar[ b ].b_team_arrived;
4971 #endif
4972  }
4973  }
4974  }
4975 
4976 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
4977  if ( KMP_AFFINITY_CAPABLE() ) {
4978  /* Restore initial master thread's affinity mask */
4979  __kmp_set_system_affinity( old_mask, TRUE );
4980  KMP_CPU_FREE(old_mask);
4981  }
4982 #endif
4983 #if KMP_NESTED_HOT_TEAMS
4984  } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
4985 #endif // KMP_NESTED_HOT_TEAMS
4986  /* make sure everyone is synchronized */
4987  int old_nproc = team->t.t_nproc; // save old value and use to update only new threads below
4988  __kmp_initialize_team( team, new_nproc, new_icvs, root->r.r_uber_thread->th.th_ident );
4989 
4990  /* reinitialize the threads */
4991  KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
4992  for (f=0; f < team->t.t_nproc; ++f)
4993  __kmp_initialize_info( team->t.t_threads[ f ], team, f, __kmp_gtid_from_tid( f, team ) );
4994  if (level) { // set th_task_state for new threads in nested hot team
4995  // __kmp_initialize_info() no longer zeroes th_task_state, so we should only need to set the
4996  // th_task_state for the new threads. th_task_state for master thread will not be accurate until
4997  // after this in __kmp_fork_call(), so we look to the master's memo_stack to get the correct value.
4998  for (f=old_nproc; f < team->t.t_nproc; ++f)
4999  team->t.t_threads[f]->th.th_task_state = team->t.t_threads[0]->th.th_task_state_memo_stack[level];
5000  }
5001  else { // set th_task_state for new threads in non-nested hot team
5002  int old_state = team->t.t_threads[0]->th.th_task_state; // copy master's state
5003  for (f=old_nproc; f < team->t.t_nproc; ++f)
5004  team->t.t_threads[f]->th.th_task_state = old_state;
5005  }
5006 
5007 #ifdef KMP_DEBUG
5008  for ( f = 0; f < team->t.t_nproc; ++ f ) {
5009  KMP_DEBUG_ASSERT( team->t.t_threads[f] &&
5010  team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc );
5011  }
5012 #endif
5013 
5014 #if OMP_40_ENABLED
5015  KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5016 # if KMP_AFFINITY_SUPPORTED
5017  __kmp_partition_places( team );
5018 # endif
5019 #endif
5020  } // Check changes in number of threads
5021 
5022 #if OMP_40_ENABLED
5023  kmp_info_t *master = team->t.t_threads[0];
5024  if( master->th.th_teams_microtask ) {
5025  for( f = 1; f < new_nproc; ++f ) {
5026  // propagate teams construct specific info to workers
5027  kmp_info_t *thr = team->t.t_threads[f];
5028  thr->th.th_teams_microtask = master->th.th_teams_microtask;
5029  thr->th.th_teams_level = master->th.th_teams_level;
5030  thr->th.th_teams_size = master->th.th_teams_size;
5031  }
5032  }
5033 #endif /* OMP_40_ENABLED */
5034 #if KMP_NESTED_HOT_TEAMS
5035  if( level ) {
5036  // Sync barrier state for nested hot teams, not needed for outermost hot team.
5037  for( f = 1; f < new_nproc; ++f ) {
5038  kmp_info_t *thr = team->t.t_threads[f];
5039  int b;
5040  kmp_balign_t * balign = thr->th.th_bar;
5041  for( b = 0; b < bs_last_barrier; ++ b ) {
5042  balign[ b ].bb.b_arrived = team->t.t_bar[ b ].b_arrived;
5043  KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5044 #if USE_DEBUGGER
5045  balign[ b ].bb.b_worker_arrived = team->t.t_bar[ b ].b_team_arrived;
5046 #endif
5047  }
5048  }
5049  }
5050 #endif // KMP_NESTED_HOT_TEAMS
5051 
5052  /* reallocate space for arguments if necessary */
5053  __kmp_alloc_argv_entries( argc, team, TRUE );
5054  KMP_CHECK_UPDATE(team->t.t_argc, argc);
5055  //
5056  // The hot team re-uses the previous task team,
5057  // if untouched during the previous release->gather phase.
5058  //
5059 
5060  KF_TRACE( 10, ( " hot_team = %p\n", team ) );
5061 
5062 #if KMP_DEBUG
5063  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
5064  KA_TRACE( 20, ("__kmp_allocate_team: hot team task_team[0] = %p task_team[1] = %p after reinit\n",
5065  team->t.t_task_team[0], team->t.t_task_team[1] ));
5066  }
5067 #endif
5068 
5069 #if OMPT_SUPPORT
5070  __ompt_team_assign_id(team, ompt_parallel_id);
5071 #endif
5072 
5073  KMP_MB();
5074 
5075  return team;
5076  }
5077 
5078  /* next, let's try to take one from the team pool */
5079  KMP_MB();
5080  for( team = (kmp_team_t*) __kmp_team_pool ; (team) ; )
5081  {
5082  /* TODO: consider resizing undersized teams instead of reaping them, now that we have a resizing mechanism */
5083  if ( team->t.t_max_nproc >= max_nproc ) {
5084  /* take this team from the team pool */
5085  __kmp_team_pool = team->t.t_next_pool;
5086 
5087  /* setup the team for fresh use */
5088  __kmp_initialize_team( team, new_nproc, new_icvs, NULL );
5089 
5090  KA_TRACE( 20, ( "__kmp_allocate_team: setting task_team[0] %p and task_team[1] %p to NULL\n",
5091  &team->t.t_task_team[0], &team->t.t_task_team[1]) );
5092  team->t.t_task_team[0] = NULL;
5093  team->t.t_task_team[1] = NULL;
5094 
5095  /* reallocate space for arguments if necessary */
5096  __kmp_alloc_argv_entries( argc, team, TRUE );
5097  KMP_CHECK_UPDATE(team->t.t_argc, argc);
5098 
5099  KA_TRACE( 20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5100  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE ));
5101  { // Initialize barrier data.
5102  int b;
5103  for ( b = 0; b < bs_last_barrier; ++ b) {
5104  team->t.t_bar[ b ].b_arrived = KMP_INIT_BARRIER_STATE;
5105 #if USE_DEBUGGER
5106  team->t.t_bar[ b ].b_master_arrived = 0;
5107  team->t.t_bar[ b ].b_team_arrived = 0;
5108 #endif
5109  }
5110  }
5111 
5112 #if OMP_40_ENABLED
5113  team->t.t_proc_bind = new_proc_bind;
5114 #endif
5115 
5116  KA_TRACE( 20, ("__kmp_allocate_team: using team from pool %d.\n", team->t.t_id ));
5117 
5118 #if OMPT_SUPPORT
5119  __ompt_team_assign_id(team, ompt_parallel_id);
5120 #endif
5121 
5122  KMP_MB();
5123 
5124  return team;
5125  }
5126 
5127  /* reap team if it is too small, then loop back and check the next one */
5128  /* not sure if this is wise, but it will be redone during the hot-teams rewrite. */
5129  /* TODO: Use technique to find the right size hot-team, don't reap them */
5130  team = __kmp_reap_team( team );
5131  __kmp_team_pool = team;
5132  }
5133 
5134  /* nothing available in the pool, no matter, make a new team! */
5135  KMP_MB();
5136  team = (kmp_team_t*) __kmp_allocate( sizeof( kmp_team_t ) );
5137 
5138  /* and set it up */
5139  team->t.t_max_nproc = max_nproc;
5140  /* NOTE well, for some reason allocating one big buffer and dividing it
5141  * up seems to really hurt performance a lot on the P4, so let's not use
5142  * this... */
5143  __kmp_allocate_team_arrays( team, max_nproc );
5144 
5145  KA_TRACE( 20, ( "__kmp_allocate_team: making a new team\n" ) );
5146  __kmp_initialize_team( team, new_nproc, new_icvs, NULL );
5147 
5148  KA_TRACE( 20, ( "__kmp_allocate_team: setting task_team[0] %p and task_team[1] %p to NULL\n",
5149  &team->t.t_task_team[0], &team->t.t_task_team[1] ) );
5150  team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes memory, no need to duplicate
5151  team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes memory, no need to duplicate
5152 
5153  if ( __kmp_storage_map ) {
5154  __kmp_print_team_storage_map( "team", team, team->t.t_id, new_nproc );
5155  }
5156 
5157  /* allocate space for arguments */
5158  __kmp_alloc_argv_entries( argc, team, FALSE );
5159  team->t.t_argc = argc;
5160 
5161  KA_TRACE( 20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5162  team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE ));
5163  { // Initialize barrier data.
5164  int b;
5165  for ( b = 0; b < bs_last_barrier; ++ b ) {
5166  team->t.t_bar[ b ].b_arrived = KMP_INIT_BARRIER_STATE;
5167 #if USE_DEBUGGER
5168  team->t.t_bar[ b ].b_master_arrived = 0;
5169  team->t.t_bar[ b ].b_team_arrived = 0;
5170 #endif
5171  }
5172  }
5173 
5174 #if OMP_40_ENABLED
5175  team->t.t_proc_bind = new_proc_bind;
5176 #endif
5177 
5178 #if OMPT_SUPPORT
5179  __ompt_team_assign_id(team, ompt_parallel_id);
5180  team->t.ompt_serialized_team_info = NULL;
5181 #endif
5182 
5183  KMP_MB();
5184 
5185  KA_TRACE( 20, ("__kmp_allocate_team: done creating a new team %d.\n", team->t.t_id ));
5186 
5187  return team;
5188 }
5189 
5190 /* TODO implement hot-teams at all levels */
5191 /* TODO implement lazy thread release on demand (disband request) */
5192 
5193 /* free the team. return it to the team pool. release all the threads
5194  * associated with it */
5195 void
5196 __kmp_free_team( kmp_root_t *root, kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master) )
5197 {
5198  int f;
5199  KA_TRACE( 20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(), team->t.t_id ));
5200 
5201  /* verify state */
5202  KMP_DEBUG_ASSERT( root );
5203  KMP_DEBUG_ASSERT( team );
5204  KMP_DEBUG_ASSERT( team->t.t_nproc <= team->t.t_max_nproc );
5205  KMP_DEBUG_ASSERT( team->t.t_threads );
5206 
5207  int use_hot_team = team == root->r.r_hot_team;
5208 #if KMP_NESTED_HOT_TEAMS
5209  int level;
5210  kmp_hot_team_ptr_t *hot_teams;
5211  if( master ) {
5212  level = team->t.t_active_level - 1;
5213  if( master->th.th_teams_microtask ) { // in teams construct?
5214  if( master->th.th_teams_size.nteams > 1 ) {
5215  ++level; // level was not increased in teams construct for team_of_masters
5216  }
5217  if( team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5218  master->th.th_teams_level == team->t.t_level ) {
5219  ++level; // level was not increased in teams construct for team_of_workers before the parallel
5220  } // team->t.t_level will be increased inside parallel
5221  }
5222  hot_teams = master->th.th_hot_teams;
5223  if( level < __kmp_hot_teams_max_level ) {
5224  KMP_DEBUG_ASSERT( team == hot_teams[level].hot_team );
5225  use_hot_team = 1;
5226  }
5227  }
5228 #endif // KMP_NESTED_HOT_TEAMS
5229 
5230  /* team is done working */
5231  TCW_SYNC_PTR(team->t.t_pkfn, NULL); // Important for Debugging Support Library.
5232  team->t.t_copyin_counter = 0; // init counter for possible reuse
5233  // Do not reset pointer to parent team to NULL for hot teams.
5234 
5235  /* if we are non-hot team, release our threads */
5236  if( ! use_hot_team ) {
5237  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
5238  // Delete task teams
5239  int tt_idx;
5240  for (tt_idx=0; tt_idx<2; ++tt_idx) {
5241  kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5242  if ( task_team != NULL ) {
5243  for (f=0; f<team->t.t_nproc; ++f) { // Have all threads unref task teams
5244  team->t.t_threads[f]->th.th_task_team = NULL;
5245  }
5246  KA_TRACE( 20, ( "__kmp_free_team: T#%d deactivating task_team %p on team %d\n", __kmp_get_gtid(), task_team, team->t.t_id ) );
5247 #if KMP_NESTED_HOT_TEAMS
5248  __kmp_free_task_team( master, task_team );
5249 #endif
5250  team->t.t_task_team[tt_idx] = NULL;
5251  }
5252  }
5253  }
5254 
5255  // Reset pointer to parent team only for non-hot teams.
5256  team->t.t_parent = NULL;
5257  team->t.t_level = 0;
5258  team->t.t_active_level = 0;
5259 
5260  /* free the worker threads */
5261  for ( f = 1; f < team->t.t_nproc; ++ f ) {
5262  KMP_DEBUG_ASSERT( team->t.t_threads[ f ] );
5263  __kmp_free_thread( team->t.t_threads[ f ] );
5264  team->t.t_threads[ f ] = NULL;
5265  }
5266 
5267  /* put the team back in the team pool */
5268  /* TODO limit size of team pool, call reap_team if pool too large */
5269  team->t.t_next_pool = (kmp_team_t*) __kmp_team_pool;
5270  __kmp_team_pool = (volatile kmp_team_t*) team;
5271  }
5272 
5273  KMP_MB();
5274 }
5275 
5276 
5277 /* reap the team. destroy it, reclaim all its resources and free its memory */
5278 kmp_team_t *
5279 __kmp_reap_team( kmp_team_t *team )
5280 {
5281  kmp_team_t *next_pool = team->t.t_next_pool;
5282 
5283  KMP_DEBUG_ASSERT( team );
5284  KMP_DEBUG_ASSERT( team->t.t_dispatch );
5285  KMP_DEBUG_ASSERT( team->t.t_disp_buffer );
5286  KMP_DEBUG_ASSERT( team->t.t_threads );
5287  KMP_DEBUG_ASSERT( team->t.t_argv );
5288 
5289  /* TODO clean the threads that are a part of this? */
5290 
5291  /* free stuff */
5292 
5293  __kmp_free_team_arrays( team );
5294  if ( team->t.t_argv != &team->t.t_inline_argv[0] )
5295  __kmp_free( (void*) team->t.t_argv );
5296  __kmp_free( team );
5297 
5298  KMP_MB();
5299  return next_pool;
5300 }
5301 
5302 //
5303 // Free the thread. Don't reap it, just place it on the pool of available
5304 // threads.
5305 //
5306 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5307 // binding for the affinity mechanism to be useful.
5308 //
5309 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5310 // However, we want to avoid a potential performance problem by always
5311 // scanning through the list to find the correct point at which to insert
5312 // the thread (potential N**2 behavior). To do this we keep track of the
5313 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5314 // With single-level parallelism, threads will always be added to the tail
5315 // of the list, kept track of by __kmp_thread_pool_insert_pt. With nested
5316 // parallelism, all bets are off and we may need to scan through the entire
5317 // free list.
5318 //
5319 // This change also has a potentially large performance benefit, for some
5320 // applications. Previously, as threads were freed from the hot team, they
5321 // would be placed back on the free list in inverse order. If the hot team
5322  // grew back to its original size, then the freed threads would be placed
5323 // back on the hot team in reverse order. This could cause bad cache
5324 // locality problems on programs where the size of the hot team regularly
5325  // grew and shrank.
5326 //
5327  // Now, for single-level parallelism, the OMP tid is always == gtid.
5328 //
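// For illustration: if the pool holds gtids {2, 5, 9} and
// __kmp_thread_pool_insert_pt points at gtid 5, then freeing gtid 7 scans
// only from 5's link and inserts between 5 and 9, while freeing gtid 3
// first resets the insert point and rescans from the head, inserting
// between 2 and 5.
//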
5329 void
5330 __kmp_free_thread( kmp_info_t *this_th )
5331 {
5332  int gtid;
5333  kmp_info_t **scan;
5334 
5335  KA_TRACE( 20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5336  __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid ));
5337 
5338  KMP_DEBUG_ASSERT( this_th );
5339 
5340  // When moving a thread to the pool, switch it to wait on its own b_go flag and clear its barrier team pointers (NULL team).
5341  int b;
5342  kmp_balign_t *balign = this_th->th.th_bar;
5343  for (b=0; b<bs_last_barrier; ++b) {
5344  if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5345  balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5346  balign[b].bb.team = NULL;
5347  balign[b].bb.leaf_kids = 0;
5348  }
5349  this_th->th.th_task_state = 0;
5350 
5351  /* put thread back on the free pool */
5352  TCW_PTR(this_th->th.th_team, NULL);
5353  TCW_PTR(this_th->th.th_root, NULL);
5354  TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5355 
5356  //
5357  // If the __kmp_thread_pool_insert_pt is already past the new insert
5358  // point, then we need to re-scan the entire list.
5359  //
5360  gtid = this_th->th.th_info.ds.ds_gtid;
5361  if ( __kmp_thread_pool_insert_pt != NULL ) {
5362  KMP_DEBUG_ASSERT( __kmp_thread_pool != NULL );
5363  if ( __kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid ) {
5364  __kmp_thread_pool_insert_pt = NULL;
5365  }
5366  }
5367 
5368  //
5369  // Scan down the list to find the place to insert the thread.
5370  // scan is the address of a link in the list, possibly the address of
5371  // __kmp_thread_pool itself.
5372  //
5373  // In the absence of nested parallelism, the for loop will have 0 iterations.
5374  //
5375  if ( __kmp_thread_pool_insert_pt != NULL ) {
5376  scan = &( __kmp_thread_pool_insert_pt->th.th_next_pool );
5377  }
5378  else {
5379  scan = (kmp_info_t **)&__kmp_thread_pool;
5380  }
5381  for (; ( *scan != NULL ) && ( (*scan)->th.th_info.ds.ds_gtid < gtid );
5382  scan = &( (*scan)->th.th_next_pool ) );
5383 
5384  //
5385  // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5386  // to its address.
5387  //
5388  TCW_PTR(this_th->th.th_next_pool, *scan);
5389  __kmp_thread_pool_insert_pt = *scan = this_th;
5390  KMP_DEBUG_ASSERT( ( this_th->th.th_next_pool == NULL )
5391  || ( this_th->th.th_info.ds.ds_gtid
5392  < this_th->th.th_next_pool->th.th_info.ds.ds_gtid ) );
5393  TCW_4(this_th->th.th_in_pool, TRUE);
5394  __kmp_thread_pool_nth++;
5395 
5396  TCW_4(__kmp_nth, __kmp_nth - 1);
5397 
5398 #ifdef KMP_ADJUST_BLOCKTIME
5399  /* Adjust blocktime back to user setting or default if necessary */
5400  /* Middle initialization might never have occurred */
5401  if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
5402  KMP_DEBUG_ASSERT( __kmp_avail_proc > 0 );
5403  if ( __kmp_nth <= __kmp_avail_proc ) {
5404  __kmp_zero_bt = FALSE;
5405  }
5406  }
5407 #endif /* KMP_ADJUST_BLOCKTIME */
5408 
5409  KMP_MB();
5410 }
5411 
5412 
5413 /* ------------------------------------------------------------------------ */
5414 
5415 void *
5416 __kmp_launch_thread( kmp_info_t *this_thr )
5417 {
5418  int gtid = this_thr->th.th_info.ds.ds_gtid;
5419 /* void *stack_data;*/
5420  kmp_team_t *(*volatile pteam);
5421 
5422  KMP_MB();
5423  KA_TRACE( 10, ("__kmp_launch_thread: T#%d start\n", gtid ) );
5424 
5425  if( __kmp_env_consistency_check ) {
5426  this_thr->th.th_cons = __kmp_allocate_cons_stack( gtid ); // ATT: Memory leak?
5427  }
5428 
5429 #if OMPT_SUPPORT
5430  if (ompt_enabled) {
5431  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5432  this_thr->th.ompt_thread_info.wait_id = 0;
5433  this_thr->th.ompt_thread_info.idle_frame = __builtin_frame_address(0);
5434  if (ompt_callbacks.ompt_callback(ompt_event_thread_begin)) {
5435  __ompt_thread_begin(ompt_thread_worker, gtid);
5436  }
5437  }
5438 #endif
5439 
5440  /* This is the place where threads wait for work */
5441  while( ! TCR_4(__kmp_global.g.g_done) ) {
5442  KMP_DEBUG_ASSERT( this_thr == __kmp_threads[ gtid ] );
5443  KMP_MB();
5444 
5445  /* wait for work to do */
5446  KA_TRACE( 20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid ));
5447 
5448 #if OMPT_SUPPORT
5449  if (ompt_enabled) {
5450  this_thr->th.ompt_thread_info.state = ompt_state_idle;
5451  }
5452 #endif
5453 
5454  /* No tid yet since not part of a team */
5455  __kmp_fork_barrier( gtid, KMP_GTID_DNE );
5456 
5457 #if OMPT_SUPPORT
5458  if (ompt_enabled) {
5459  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5460  }
5461 #endif
5462 
5463  pteam = (kmp_team_t *(*))(& this_thr->th.th_team);
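 // Note: pteam is declared as a volatile pointer above, presumably so that
 // each TCR_SYNC_PTR(*pteam) below re-reads th.th_team and observes the
 // team most recently assigned to this thread by the master.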
5464 
5465  /* have we been allocated? */
5466  if ( TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done) ) {
5467 #if OMPT_SUPPORT
5468  ompt_task_info_t *task_info;
5469  ompt_parallel_id_t my_parallel_id;
5470  if (ompt_enabled) {
5471  task_info = __ompt_get_taskinfo(0);
5472  my_parallel_id = (*pteam)->t.ompt_team_info.parallel_id;
5473  }
5474 #endif
5475  /* we were just woken up, so run our new task */
5476  if ( TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL ) {
5477  int rc;
5478  KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
5479  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid), (*pteam)->t.t_pkfn));
5480 
5481  updateHWFPControl (*pteam);
5482 
5483 #if OMPT_SUPPORT
5484  if (ompt_enabled) {
5485  this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
5486  // Initialize OMPT task id for implicit task.
5487  int tid = __kmp_tid_from_gtid(gtid);
5488  task_info->task_id = __ompt_task_id_new(tid);
5489  }
5490 #endif
5491 
5492  KMP_STOP_DEVELOPER_EXPLICIT_TIMER(USER_launch_thread_loop);
5493  {
5494  KMP_TIME_DEVELOPER_BLOCK(USER_worker_invoke);
5495  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
5496  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
5497  rc = (*pteam)->t.t_invoke( gtid );
5498  }
5499  KMP_START_DEVELOPER_EXPLICIT_TIMER(USER_launch_thread_loop);
5500  KMP_ASSERT( rc );
5501 
5502 #if OMPT_SUPPORT
5503  if (ompt_enabled) {
5504  /* no frame set while outside task */
5505  task_info->frame.exit_runtime_frame = 0;
5506 
5507  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5508  }
5509 #endif
5510  KMP_MB();
5511  KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
5512  gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid), (*pteam)->t.t_pkfn));
5513  }
5514  /* join barrier after parallel region */
5515  __kmp_join_barrier( gtid );
5516 #if OMPT_SUPPORT && OMPT_TRACE
5517  if (ompt_enabled) {
5518  if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
5519  // don't access *pteam here: it may have already been freed
5520  // by the master thread behind the barrier (possible race)
5521  ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
5522  my_parallel_id, task_info->task_id);
5523  }
5524  task_info->frame.exit_runtime_frame = 0;
5525  task_info->task_id = 0;
5526  }
5527 #endif
5528  }
5529  }
5530  TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
5531 
5532 #if OMPT_SUPPORT
5533  if (ompt_enabled &&
5534  ompt_callbacks.ompt_callback(ompt_event_thread_end)) {
5535  __ompt_thread_end(ompt_thread_worker, gtid);
5536  }
5537 #endif
5538 
5539  this_thr->th.th_task_team = NULL;
5540  /* run the destructors for the threadprivate data for this thread */
5541  __kmp_common_destroy_gtid( gtid );
5542 
5543  KA_TRACE( 10, ("__kmp_launch_thread: T#%d done\n", gtid ) );
5544  KMP_MB();
5545  return this_thr;
5546 }
5547 
5548 /* ------------------------------------------------------------------------ */
5549 /* ------------------------------------------------------------------------ */
5550 
5551 void
5552 __kmp_internal_end_dest( void *specific_gtid )
5553 {
5554  #if KMP_COMPILER_ICC
5555  #pragma warning( push )
5556  #pragma warning( disable: 810 ) // conversion from "void *" to "int" may lose significant bits
5557  #endif
5558  // Make sure no significant bits are lost
5559  int gtid = (kmp_intptr_t)specific_gtid - 1;
5560  #if KMP_COMPILER_ICC
5561  #pragma warning( pop )
5562  #endif
5563 
5564  KA_TRACE( 30, ("__kmp_internal_end_dest: T#%d\n", gtid));
5565  /* NOTE: the gtid is stored as gtid+1 in the thread-local-storage;
5566  * this is because 0 is reserved for the nothing-stored case */
5567 
5568  /* josh: One reason for setting the gtid specific data even when it is being
5569  destroyed by pthread is to allow gtid lookup through thread specific data
5570  (__kmp_gtid_get_specific). Some of the code, especially stat code,
5571  that gets executed in the call to __kmp_internal_end_thread, actually
5572  gets the gtid through the thread specific data. Setting it here seems
5573  rather inelegant and perhaps wrong, but allows __kmp_internal_end_thread
5574  to run smoothly.
5575  todo: get rid of this after we remove the dependence on
5576  __kmp_gtid_get_specific
5577  */
5578  if(gtid >= 0 && KMP_UBER_GTID(gtid))
5579  __kmp_gtid_set_specific( gtid );
5580  #ifdef KMP_TDATA_GTID
5581  __kmp_gtid = gtid;
5582  #endif
5583  __kmp_internal_end_thread( gtid );
5584 }
5585 
5586 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
5587 
5588  // 2009-09-08 (lev): It looks like the destructor does not work. In simple test cases destructors work
5589 // perfectly, but in real libomp.so I have no evidence it is ever called. However, -fini linker
5590 // option in makefile.mk works fine.
5591 
5592 __attribute__(( destructor ))
5593 void
5594 __kmp_internal_end_dtor( void )
5595 {
5596  __kmp_internal_end_atexit();
5597 }
5598 
5599 void
5600 __kmp_internal_end_fini( void )
5601 {
5602  __kmp_internal_end_atexit();
5603 }
5604 
5605 #endif
5606 
5607 /* [Windows] josh: when the atexit handler is called, there may still be more than one thread alive */
5608 void
5609 __kmp_internal_end_atexit( void )
5610 {
5611  KA_TRACE( 30, ( "__kmp_internal_end_atexit\n" ) );
5612  /* [Windows]
5613  josh: ideally, we want to completely shut down the library in this atexit handler, but
5614  stat code that depends on thread specific data for gtid fails because that data becomes
5615  unavailable at some point during the shutdown, so we call __kmp_internal_end_thread
5616  instead. We should eventually remove the dependency on __kmp_get_specific_gtid in the
5617  stat code and use __kmp_internal_end_library to cleanly shut down the library.
5618 
5619 // TODO: Can some of this comment about GVS be removed?
5620  I suspect that the offending stat code is executed when the calling thread tries to
5621  clean up a dead root thread's data structures, resulting in GVS code trying to close
5622  the GVS structures for that thread, but since the stat code uses
5623  __kmp_get_specific_gtid to get the gtid with the assumption that the calling thread is
5624  cleaning up itself instead of another thread, it gets confused. This happens because
5625  allowing a thread to unregister and clean up another thread is a recent modification for
5626  addressing an issue with Maxon Cinema4D. Based on the current design (20050722), a
5627  thread may end up trying to unregister another thread only if thread death does not
5628  trigger the calling of __kmp_internal_end_thread. For Linux* OS, there is the thread
5629  specific data destructor function to detect thread death. For Windows dynamic, there
5630  is DllMain(THREAD_DETACH). For Windows static, there is nothing. Thus, the
5631  workaround is applicable only for Windows static stat library.
5632  */
5633  __kmp_internal_end_library( -1 );
5634  #if KMP_OS_WINDOWS
5635  __kmp_close_console();
5636  #endif
5637 }
5638 
5639 static void
5640 __kmp_reap_thread(
5641  kmp_info_t * thread,
5642  int is_root
5643 ) {
5644 
5645  // It is assumed __kmp_forkjoin_lock is acquired.
5646 
5647  int gtid;
5648 
5649  KMP_DEBUG_ASSERT( thread != NULL );
5650 
5651  gtid = thread->th.th_info.ds.ds_gtid;
5652 
5653  if ( ! is_root ) {
5654 
5655  if ( __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ) {
5656  /* Assume the threads are at the fork barrier here */
5657  KA_TRACE( 20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n", gtid ) );
5658  /* Need release fence here to prevent seg faults for tree forkjoin barrier (GEH) */
5659  kmp_flag_64 flag(&thread->th.th_bar[ bs_forkjoin_barrier ].bb.b_go, thread);
5660  __kmp_release_64(&flag);
5661  }; // if
5662 
5663  // Terminate OS thread.
5664  __kmp_reap_worker( thread );
5665 
5666  //
5667  // The thread was killed asynchronously. If it was actively
5668  // spinning in the thread pool, decrement the global count.
5669  //
5670  // There is a small timing hole here - if the worker thread was
5671  // just waking up after sleeping in the pool, had reset its
5672  // th_active_in_pool flag but not decremented the global counter
5673  // __kmp_thread_pool_active_nth yet, then the global counter
5674  // might not get updated.
5675  //
5676  // Currently, this can only happen as the library is unloaded,
5677  // so there are no harmful side effects.
5678  //
5679  if ( thread->th.th_active_in_pool ) {
5680  thread->th.th_active_in_pool = FALSE;
5681  KMP_TEST_THEN_DEC32(
5682  (kmp_int32 *) &__kmp_thread_pool_active_nth );
5683  KMP_DEBUG_ASSERT( TCR_4(__kmp_thread_pool_active_nth) >= 0 );
5684  }
5685 
5686  // Decrement # of [worker] threads in the pool.
5687  KMP_DEBUG_ASSERT( __kmp_thread_pool_nth > 0 );
5688  --__kmp_thread_pool_nth;
5689  }; // if
5690 
5691  __kmp_free_implicit_task(thread);
5692 
5693  // Free the fast memory for tasking
5694  #if USE_FAST_MEMORY
5695  __kmp_free_fast_memory( thread );
5696  #endif /* USE_FAST_MEMORY */
5697 
5698  __kmp_suspend_uninitialize_thread( thread );
5699 
5700  KMP_DEBUG_ASSERT( __kmp_threads[ gtid ] == thread );
5701  TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
5702 
5703  -- __kmp_all_nth;
5704  // __kmp_nth was decremented when thread is added to the pool.
5705 
5706 #ifdef KMP_ADJUST_BLOCKTIME
5707  /* Adjust blocktime back to user setting or default if necessary */
5708  /* Middle initialization might never have occurred */
5709  if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
5710  KMP_DEBUG_ASSERT( __kmp_avail_proc > 0 );
5711  if ( __kmp_nth <= __kmp_avail_proc ) {
5712  __kmp_zero_bt = FALSE;
5713  }
5714  }
5715 #endif /* KMP_ADJUST_BLOCKTIME */
5716 
5717  /* free the memory being used */
5718  if( __kmp_env_consistency_check ) {
5719  if ( thread->th.th_cons ) {
5720  __kmp_free_cons_stack( thread->th.th_cons );
5721  thread->th.th_cons = NULL;
5722  }; // if
5723  }
5724 
5725  if ( thread->th.th_pri_common != NULL ) {
5726  __kmp_free( thread->th.th_pri_common );
5727  thread->th.th_pri_common = NULL;
5728  }; // if
5729 
5730  if (thread->th.th_task_state_memo_stack != NULL) {
5731  __kmp_free(thread->th.th_task_state_memo_stack);
5732  thread->th.th_task_state_memo_stack = NULL;
5733  }
5734 
5735  #if KMP_USE_BGET
5736  if ( thread->th.th_local.bget_data != NULL ) {
5737  __kmp_finalize_bget( thread );
5738  }; // if
5739  #endif
5740 
5741 #if KMP_AFFINITY_SUPPORTED
5742  if ( thread->th.th_affin_mask != NULL ) {
5743  KMP_CPU_FREE( thread->th.th_affin_mask );
5744  thread->th.th_affin_mask = NULL;
5745  }; // if
5746 #endif /* KMP_AFFINITY_SUPPORTED */
5747 
5748  __kmp_reap_team( thread->th.th_serial_team );
5749  thread->th.th_serial_team = NULL;
5750  __kmp_free( thread );
5751 
5752  KMP_MB();
5753 
5754 } // __kmp_reap_thread
5755 
5756 static void
5757 __kmp_internal_end(void)
5758 {
5759  int i;
5760 
5761  /* First, unregister the library */
5762  __kmp_unregister_library();
5763 
5764  #if KMP_OS_WINDOWS
5765  /* In Win static library, we can't tell when a root actually dies, so we
5766  reclaim the data structures for any root threads that have died but not
5767  unregistered themselves, in order to shut down cleanly.
5768  In Win dynamic library we also can't tell when a thread dies.
5769  */
5770  __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of dead roots
5771  #endif
5772 
5773  for( i=0 ; i<__kmp_threads_capacity ; i++ )
5774  if( __kmp_root[i] )
5775  if( __kmp_root[i]->r.r_active )
5776  break;
5777  KMP_MB(); /* Flush all pending memory write invalidates. */
5778  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
5779 
5780  if ( i < __kmp_threads_capacity ) {
5781  // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
5782  KMP_MB(); /* Flush all pending memory write invalidates. */
5783 
5784  //
5785  // Need to check that monitor was initialized before reaping it.
5786  // If we are called from __kmp_atfork_child (which sets
5787  // __kmp_init_parallel = 0), then __kmp_monitor will appear to
5788  // contain valid data, but it is only valid in the parent process,
5789  // not the child.
5790  //
5791  // New behavior (201008): instead of keying off of the flag
5792  // __kmp_init_parallel, the monitor thread creation is keyed off
5793  // of the new flag __kmp_init_monitor.
5794  //
5795  __kmp_acquire_bootstrap_lock( & __kmp_monitor_lock );
5796  if ( TCR_4( __kmp_init_monitor ) ) {
5797  __kmp_reap_monitor( & __kmp_monitor );
5798  TCW_4( __kmp_init_monitor, 0 );
5799  }
5800  __kmp_release_bootstrap_lock( & __kmp_monitor_lock );
5801  KA_TRACE( 10, ("__kmp_internal_end: monitor reaped\n" ) );
5802  } else {
5803  /* TODO move this to cleanup code */
5804  #ifdef KMP_DEBUG
5805  /* make sure that everything has properly ended */
5806  for ( i = 0; i < __kmp_threads_capacity; i++ ) {
5807  if( __kmp_root[i] ) {
5808 // KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC: there can be uber threads alive here
5809  KMP_ASSERT( ! __kmp_root[i]->r.r_active ); // TODO: can they be active?
5810  }
5811  }
5812  #endif
5813 
5814  KMP_MB();
5815 
5816  // Reap the worker threads.
5817  // This is valid for now, but be careful if threads are reaped sooner.
5818  while ( __kmp_thread_pool != NULL ) { // Loop through all the threads in the pool.
5819  // Get the next thread from the pool.
5820  kmp_info_t * thread = (kmp_info_t *) __kmp_thread_pool;
5821  __kmp_thread_pool = thread->th.th_next_pool;
5822  // Reap it.
5823  thread->th.th_next_pool = NULL;
5824  thread->th.th_in_pool = FALSE;
5825  __kmp_reap_thread( thread, 0 );
5826  }; // while
5827  __kmp_thread_pool_insert_pt = NULL;
5828 
5829  // Reap teams.
5830  while ( __kmp_team_pool != NULL ) { // Loop through all the teams in the pool.
5831  // Get the next team from the pool.
5832  kmp_team_t * team = (kmp_team_t *) __kmp_team_pool;
5833  __kmp_team_pool = team->t.t_next_pool;
5834  // Reap it.
5835  team->t.t_next_pool = NULL;
5836  __kmp_reap_team( team );
5837  }; // while
5838 
5839  __kmp_reap_task_teams( );
5840 
5841  for ( i = 0; i < __kmp_threads_capacity; ++ i ) {
5842  // TBD: Add some checking...
5843  // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
5844  }
5845 
5846  /* Make sure all threadprivate destructors get run by joining with all worker
5847  threads before resetting this flag */
5848  TCW_SYNC_4(__kmp_init_common, FALSE);
5849 
5850  KA_TRACE( 10, ("__kmp_internal_end: all workers reaped\n" ) );
5851  KMP_MB();
5852 
5853  //
5854  // See note above: One of the possible fixes for CQ138434 / CQ140126
5855  //
5856  // FIXME: push both code fragments down and CSE them?
5857  // push them into __kmp_cleanup() ?
5858  //
5859  __kmp_acquire_bootstrap_lock( & __kmp_monitor_lock );
5860  if ( TCR_4( __kmp_init_monitor ) ) {
5861  __kmp_reap_monitor( & __kmp_monitor );
5862  TCW_4( __kmp_init_monitor, 0 );
5863  }
5864  __kmp_release_bootstrap_lock( & __kmp_monitor_lock );
5865  KA_TRACE( 10, ("__kmp_internal_end: monitor reaped\n" ) );
5866 
5867  } /* else !__kmp_global.t_active */
5868  TCW_4(__kmp_init_gtid, FALSE);
5869  KMP_MB(); /* Flush all pending memory write invalidates. */
5870 
5871  __kmp_cleanup();
5872 #if OMPT_SUPPORT
5873  ompt_fini();
5874 #endif
5875 }
5876 
5877 void
5878 __kmp_internal_end_library( int gtid_req )
5879 {
5880  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
5881  /* this shouldn't be a race condition because __kmp_internal_end() is the
5882  * only place to clear __kmp_init_serial */
5883  /* we'll check this later too, after we get the lock */
5884  // 2009-09-06: We do not set g_abort without setting g_done. This check looks redundant,
5885  // because the next check will work in any case.
5886  if( __kmp_global.g.g_abort ) {
5887  KA_TRACE( 11, ("__kmp_internal_end_library: abort, exiting\n" ));
5888  /* TODO abort? */
5889  return;
5890  }
5891  if( TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial ) {
5892  KA_TRACE( 10, ("__kmp_internal_end_library: already finished\n" ));
5893  return;
5894  }
5895 
5896 
5897  KMP_MB(); /* Flush all pending memory write invalidates. */
5898 
5899  /* find out who we are and what we should do */
5900  {
5901  int gtid = (gtid_req>=0) ? gtid_req : __kmp_gtid_get_specific();
5902  KA_TRACE( 10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req ));
5903  if( gtid == KMP_GTID_SHUTDOWN ) {
5904  KA_TRACE( 10, ("__kmp_internal_end_library: !__kmp_init_runtime, system already shutdown\n" ));
5905  return;
5906  } else if( gtid == KMP_GTID_MONITOR ) {
5907  KA_TRACE( 10, ("__kmp_internal_end_library: monitor thread, gtid not registered, or system shutdown\n" ));
5908  return;
5909  } else if( gtid == KMP_GTID_DNE ) {
5910  KA_TRACE( 10, ("__kmp_internal_end_library: gtid not registered or system shutdown\n" ));
5911  /* we don't know who we are, but we may still shut down the library */
5912  } else if( KMP_UBER_GTID( gtid )) {
5913  /* unregister ourselves as an uber thread. gtid is no longer valid */
5914  if( __kmp_root[gtid]->r.r_active ) {
5915  __kmp_global.g.g_abort = -1;
5916  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
5917  KA_TRACE( 10, ("__kmp_internal_end_library: root still active, abort T#%d\n", gtid ));
5918  return;
5919  } else {
5920  KA_TRACE( 10, ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid ));
5921  __kmp_unregister_root_current_thread( gtid );
5922  }
5923  } else {
5924  /* worker threads may call this function through the atexit handler, if they call exit() */
5925  /* For now, skip the usual subsequent processing and just dump the debug buffer.
5926  TODO: do a thorough shutdown instead
5927  */
5928  #ifdef DUMP_DEBUG_ON_EXIT
5929  if ( __kmp_debug_buf )
5930  __kmp_dump_debug_buffer( );
5931  #endif
5932  return;
5933  }
5934  }
5935  /* synchronize the termination process */
5936  __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
5937 
5938  /* have we already finished */
5939  if( __kmp_global.g.g_abort ) {
5940  KA_TRACE( 10, ("__kmp_internal_end_library: abort, exiting\n" ));
5941  /* TODO abort? */
5942  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
5943  return;
5944  }
5945  if( TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial ) {
5946  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
5947  return;
5948  }
5949 
5950  /* We need this lock to enforce mutex between this reading of
5951  __kmp_threads_capacity and the writing by __kmp_register_root.
5952  Alternatively, we can use a counter of roots that is
5953  atomically updated by __kmp_get_global_thread_id_reg,
5954  __kmp_do_serial_initialize and __kmp_internal_end_*.
5955  */
5956  __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
5957 
5958  /* now we can safely conduct the actual termination */
5959  __kmp_internal_end();
5960 
5961  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
5962  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
5963 
5964  KA_TRACE( 10, ("__kmp_internal_end_library: exit\n" ) );
5965 
5966  #ifdef DUMP_DEBUG_ON_EXIT
5967  if ( __kmp_debug_buf )
5968  __kmp_dump_debug_buffer();
5969  #endif
5970 
5971  #if KMP_OS_WINDOWS
5972  __kmp_close_console();
5973  #endif
5974 
5975  __kmp_fini_allocator();
5976 
5977 } // __kmp_internal_end_library
5978 
5979 void
5980 __kmp_internal_end_thread( int gtid_req )
5981 {
5982  int i;
5983 
5984  /* if we have already cleaned up, don't try again, it wouldn't be pretty */
5985  /* this shouldn't be a race condition because __kmp_internal_end() is the
5986  * only place to clear __kmp_init_serial */
5987  /* we'll check this later too, after we get the lock */
5988  // 2009-09-06: We do not set g_abort without setting g_done. This check looks redundant,
5989  // because the next check will work in any case.
5990  if( __kmp_global.g.g_abort ) {
5991  KA_TRACE( 11, ("__kmp_internal_end_thread: abort, exiting\n" ));
5992  /* TODO abort? */
5993  return;
5994  }
5995  if( TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial ) {
5996  KA_TRACE( 10, ("__kmp_internal_end_thread: already finished\n" ));
5997  return;
5998  }
5999 
6000  KMP_MB(); /* Flush all pending memory write invalidates. */
6001 
6002  /* find out who we are and what we should do */
6003  {
6004  int gtid = (gtid_req>=0) ? gtid_req : __kmp_gtid_get_specific();
6005  KA_TRACE( 10, ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req ));
6006  if( gtid == KMP_GTID_SHUTDOWN ) {
6007  KA_TRACE( 10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system already shutdown\n" ));
6008  return;
6009  } else if( gtid == KMP_GTID_MONITOR ) {
6010  KA_TRACE( 10, ("__kmp_internal_end_thread: monitor thread, gtid not registered, or system shutdown\n" ));
6011  return;
6012  } else if( gtid == KMP_GTID_DNE ) {
6013  KA_TRACE( 10, ("__kmp_internal_end_thread: gtid not registered or system shutdown\n" ));
6014  return;
6015  /* we don't know who we are */
6016  } else if( KMP_UBER_GTID( gtid )) {
6017  /* unregister ourselves as an uber thread. gtid is no longer valid */
6018  if( __kmp_root[gtid]->r.r_active ) {
6019  __kmp_global.g.g_abort = -1;
6020  TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6021  KA_TRACE( 10, ("__kmp_internal_end_thread: root still active, abort T#%d\n", gtid ));
6022  return;
6023  } else {
6024  KA_TRACE( 10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n", gtid ));
6025  __kmp_unregister_root_current_thread( gtid );
6026  }
6027  } else {
6028  /* just a worker thread, let's leave */
6029  KA_TRACE( 10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid ));
6030 
6031  if ( gtid >= 0 ) {
6032  __kmp_threads[gtid]->th.th_task_team = NULL;
6033  }
6034 
6035  KA_TRACE( 10, ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n", gtid ));
6036  return;
6037  }
6038  }
6039  #if defined KMP_DYNAMIC_LIB
6040  // AC: let's not shut down the Linux* OS dynamic library at the exit of an uber thread,
6041  // because it is better to shut down later in the library destructor.
6042  // The reason for this change is a performance problem when a non-OpenMP thread
6043  // in a loop forks and joins many OpenMP threads. We can save a lot of time
6044  // keeping worker threads alive until program shutdown.
6045  // OM: Removed the Linux* OS restriction to fix the crash on OS X* (DPD200239966) and
6046  // Windows (DPD200287443) that occurs when using critical sections from foreign threads.
6047  KA_TRACE( 10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req) );
6048  return;
6049  #endif
6050  /* synchronize the termination process */
6051  __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
6052 
6053  /* have we already finished */
6054  if( __kmp_global.g.g_abort ) {
6055  KA_TRACE( 10, ("__kmp_internal_end_thread: abort, exiting\n" ));
6056  /* TODO abort? */
6057  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6058  return;
6059  }
6060  if( TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial ) {
6061  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6062  return;
6063  }
6064 
6065  /* We need this lock to enforce mutex between this reading of
6066  __kmp_threads_capacity and the writing by __kmp_register_root.
6067  Alternatively, we can use a counter of roots that is
6068  atomically updated by __kmp_get_global_thread_id_reg,
6069  __kmp_do_serial_initialize and __kmp_internal_end_*.
6070  */
6071 
6072  /* should we finish the run-time? are all siblings done? */
6073  __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
6074 
6075  for ( i = 0; i < __kmp_threads_capacity; ++ i ) {
6076  if ( KMP_UBER_GTID( i ) ) {
6077  KA_TRACE( 10, ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i ));
6078  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
6079  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6080  return;
6081  };
6082  }
6083 
6084  /* now we can safely conduct the actual termination */
6085 
6086  __kmp_internal_end();
6087 
6088  __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
6089  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6090 
6091  KA_TRACE( 10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req ) );
6092 
6093  #ifdef DUMP_DEBUG_ON_EXIT
6094  if ( __kmp_debug_buf )
6095  __kmp_dump_debug_buffer();
6096  #endif
6097 } // __kmp_internal_end_thread
6098 
6099 // -------------------------------------------------------------------------------------------------
6100 // Library registration stuff.
6101 
6102 static long __kmp_registration_flag = 0;
6103  // Random value used to indicate library initialization.
6104 static char * __kmp_registration_str = NULL;
6105  // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6106 
6107 
6108 static inline
6109 char *
6110 __kmp_reg_status_name() {
6111  /*
6112  On RHEL 3u5, if linked statically, getpid() returns different values in each thread.
6113  If registration and unregistration happen in different threads (as in the omp_misc_other_root_exit.cpp test case),
6114  the registered_lib_env environment variable cannot be found, because its name will contain a different pid.
6115  */
6116  return __kmp_str_format( "__KMP_REGISTERED_LIB_%d", (int) getpid() );
6117 } // __kmp_reg_status_name
6118 
6119 
6120 void
6121 __kmp_register_library_startup(
6122  void
6123 ) {
6124 
6125  char * name = __kmp_reg_status_name(); // Name of the environment variable.
6126  int done = 0;
6127  union {
6128  double dtime;
6129  long ltime;
6130  } time;
6131  #if KMP_OS_WINDOWS
6132  __kmp_initialize_system_tick();
6133  #endif
6134  __kmp_read_system_time( & time.dtime );
6135  __kmp_registration_flag = 0xCAFE0000L | ( time.ltime & 0x0000FFFFL );
6136  __kmp_registration_str =
6137  __kmp_str_format(
6138  "%p-%lx-%s",
6139  & __kmp_registration_flag,
6140  __kmp_registration_flag,
6141  KMP_LIBRARY_FILE
6142  );
6143 
6144  KA_TRACE( 50, ( "__kmp_register_library_startup: %s=\"%s\"\n", name, __kmp_registration_str ) );
6145 
6146  while ( ! done ) {
6147 
6148  char * value = NULL; // Actual value of the environment variable.
6149 
6150  // Set the environment variable, but do not overwrite it if it already exists.
6151  __kmp_env_set( name, __kmp_registration_str, 0 );
6152  // Check whether the variable was actually written.
6153  value = __kmp_env_get( name );
6154  if ( value != NULL && strcmp( value, __kmp_registration_str ) == 0 ) {
6155 
6156  done = 1; // Ok, environment variable set successfully, exit the loop.
6157 
6158  } else {
6159 
6160  // Oops. The write failed. Another copy of the OpenMP RTL is in memory.
6161  // Check whether it is alive or dead.
6162  int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6163  char * tail = value;
6164  char * flag_addr_str = NULL;
6165  char * flag_val_str = NULL;
6166  char const * file_name = NULL;
6167  __kmp_str_split( tail, '-', & flag_addr_str, & tail );
6168  __kmp_str_split( tail, '-', & flag_val_str, & tail );
6169  file_name = tail;
6170  if ( tail != NULL ) {
6171  long * flag_addr = 0;
6172  long flag_val = 0;
6173  KMP_SSCANF( flag_addr_str, "%p", & flag_addr );
6174  KMP_SSCANF( flag_val_str, "%lx", & flag_val );
6175  if ( flag_addr != 0 && flag_val != 0 && strcmp( file_name, "" ) != 0 ) {
6176  // First, check whether the environment-encoded address is mapped into the address space.
6177  // If so, dereference it to see whether it still holds the right value.
6178 
6179  if ( __kmp_is_address_mapped( flag_addr ) && * flag_addr == flag_val ) {
6180  neighbor = 1;
6181  } else {
6182  // If not, then we know the other copy of the library is no longer running.
6183  neighbor = 2;
6184  }; // if
6185  }; // if
6186  }; // if
6187  switch ( neighbor ) {
6188  case 0 : // Cannot parse environment variable -- neighbor status unknown.
6189  // Assume it is an incompatible format from a future version of the library.
6190  // Assume the other library is alive.
6191  // WARN( ... ); // TODO: Issue a warning.
6192  file_name = "unknown library";
6193  // Attention! Falling through to the next case. That's intentional.
6194  case 1 : { // Neighbor is alive.
6195  // Check whether this is allowed.
6196  char * duplicate_ok = __kmp_env_get( "KMP_DUPLICATE_LIB_OK" );
6197  if ( ! __kmp_str_match_true( duplicate_ok ) ) {
6198  // That's not allowed. Issue fatal error.
6199  __kmp_msg(
6200  kmp_ms_fatal,
6201  KMP_MSG( DuplicateLibrary, KMP_LIBRARY_FILE, file_name ),
6202  KMP_HNT( DuplicateLibrary ),
6203  __kmp_msg_null
6204  );
6205  }; // if
6206  KMP_INTERNAL_FREE( duplicate_ok );
6207  __kmp_duplicate_library_ok = 1;
6208  done = 1; // Exit the loop.
6209  } break;
6210  case 2 : { // Neighbor is dead.
6211  // Clear the variable and try to register library again.
6212  __kmp_env_unset( name );
6213  } break;
6214  default : {
6215  KMP_DEBUG_ASSERT( 0 );
6216  } break;
6217  }; // switch
6218 
6219  }; // if
6220  KMP_INTERNAL_FREE( (void *) value );
6221 
6222  }; // while
6223  KMP_INTERNAL_FREE( (void *) name );
6224 
6225 } // func __kmp_register_library_startup
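/*
 * Illustrative sketch added for clarity (not part of the runtime): the value stored in
 * __KMP_REGISTERED_LIB_<pid> is built with the "%p-%lx-%s" format above -- flag address,
 * flag value, library file name -- and parsed back by splitting on '-'. The standalone
 * round trip below mimics that encoding with plain snprintf/sscanf instead of the
 * __kmp_str_* helpers; all "demo_*" names are hypothetical.
 */
#include <stdio.h>
#include <string.h>

static unsigned long demo_flag = 0xCAFE1234UL;   /* stands in for __kmp_registration_flag */

static int demo_registration_roundtrip( void ) {
    char value[ 256 ];
    /* Encode: address of the flag, its value, and the library file name. */
    snprintf( value, sizeof( value ), "%p-%lx-%s", (void *) &demo_flag, demo_flag, "libomp.so" );

    /* Decode: the first two '-' separators delimit the fields; the rest is the file name. */
    char *flag_addr_str = value;
    char *flag_val_str  = strchr( flag_addr_str, '-' );
    if ( flag_val_str == NULL ) return 0;
    *flag_val_str++ = '\0';
    char *file_name = strchr( flag_val_str, '-' );
    if ( file_name == NULL ) return 0;
    *file_name++ = '\0';

    void          *flag_addr = NULL;
    unsigned long  flag_val  = 0;
    sscanf( flag_addr_str, "%p",  &flag_addr );
    sscanf( flag_val_str,  "%lx", &flag_val );

    /* A live neighbor satisfies: address still mapped and *flag_addr == flag_val. */
    return flag_addr == (void *) &demo_flag && flag_val == demo_flag
           && strcmp( file_name, "libomp.so" ) == 0;
}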
6226 
6227 
6228 void
6229 __kmp_unregister_library( void ) {
6230 
6231  char * name = __kmp_reg_status_name();
6232  char * value = __kmp_env_get( name );
6233 
6234  KMP_DEBUG_ASSERT( __kmp_registration_flag != 0 );
6235  KMP_DEBUG_ASSERT( __kmp_registration_str != NULL );
6236  if ( value != NULL && strcmp( value, __kmp_registration_str ) == 0 ) {
6237  // Ok, this is our variable. Delete it.
6238  __kmp_env_unset( name );
6239  }; // if
6240 
6241  KMP_INTERNAL_FREE( __kmp_registration_str );
6242  KMP_INTERNAL_FREE( value );
6243  KMP_INTERNAL_FREE( name );
6244 
6245  __kmp_registration_flag = 0;
6246  __kmp_registration_str = NULL;
6247 
6248 } // __kmp_unregister_library
6249 
6250 
6251 // End of Library registration stuff.
6252 // -------------------------------------------------------------------------------------------------
6253 
6254 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
6255 
6256 static void __kmp_check_mic_type()
6257 {
6258  kmp_cpuid_t cpuid_state = {0};
6259  kmp_cpuid_t * cs_p = &cpuid_state;
6260  __kmp_x86_cpuid(1, 0, cs_p);
6261  // We don't support mic1 at the moment
6262  if( (cs_p->eax & 0xff0) == 0xB10 ) {
6263  __kmp_mic_type = mic2;
6264  } else if( (cs_p->eax & 0xf0ff0) == 0x50670 ) {
6265  __kmp_mic_type = mic3;
6266  } else {
6267  __kmp_mic_type = non_mic;
6268  }
6269 }
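/*
 * Illustrative sketch added for clarity (not part of the runtime): the masks above select
 * the family/model fields of CPUID leaf 1 EAX (stepping in bits 3:0, model in 7:4, family
 * in 11:8, extended model in 19:16). Hence 0xB10 matches family 0x0B, model 1 (KNC) and
 * 0x50670 matches family 6, extended model 5, model 7, i.e. display model 0x57 (KNL).
 * The helper below simply decodes those fields; the "demo_*" name is hypothetical.
 */
static void demo_decode_cpuid_eax( unsigned eax, unsigned *family, unsigned *model ) {
    unsigned base_family = ( eax >> 8 )  & 0xF;
    unsigned base_model  = ( eax >> 4 )  & 0xF;
    unsigned ext_model   = ( eax >> 16 ) & 0xF;
    *family = base_family;   /* the extended family field only matters when family == 0xF */
    *model  = ( base_family == 0x6 || base_family == 0xF )
              ? ( ext_model << 4 ) + base_model   /* display model combines both fields */
              : base_model;
}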
6270 
6271 #endif /* KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS) */
6272 
6273 static void
6274 __kmp_do_serial_initialize( void )
6275 {
6276  int i, gtid;
6277  int size;
6278 
6279  KA_TRACE( 10, ("__kmp_do_serial_initialize: enter\n" ) );
6280 
6281  KMP_DEBUG_ASSERT( sizeof( kmp_int32 ) == 4 );
6282  KMP_DEBUG_ASSERT( sizeof( kmp_uint32 ) == 4 );
6283  KMP_DEBUG_ASSERT( sizeof( kmp_int64 ) == 8 );
6284  KMP_DEBUG_ASSERT( sizeof( kmp_uint64 ) == 8 );
6285  KMP_DEBUG_ASSERT( sizeof( kmp_intptr_t ) == sizeof( void * ) );
6286 
6287 #if OMPT_SUPPORT
6288  ompt_pre_init();
6289 #endif
6290 
6291  __kmp_validate_locks();
6292 
6293  /* Initialize internal memory allocator */
6294  __kmp_init_allocator();
6295 
6296  /* Register the library startup via an environment variable
6297  and check to see whether another copy of the library is already
6298  registered. */
6299 
6300  __kmp_register_library_startup( );
6301 
6302  /* TODO reinitialization of library */
6303  if( TCR_4(__kmp_global.g.g_done) ) {
6304  KA_TRACE( 10, ("__kmp_do_serial_initialize: reinitialization of library\n" ) );
6305  }
6306 
6307  __kmp_global.g.g_abort = 0;
6308  TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
6309 
6310  /* initialize the locks */
6311 #if KMP_USE_ADAPTIVE_LOCKS
6312 #if KMP_DEBUG_ADAPTIVE_LOCKS
6313  __kmp_init_speculative_stats();
6314 #endif
6315 #endif
6316 #if KMP_STATS_ENABLED
6317  __kmp_init_tas_lock( & __kmp_stats_lock );
6318 #endif
6319  __kmp_init_lock( & __kmp_global_lock );
6320  __kmp_init_queuing_lock( & __kmp_dispatch_lock );
6321  __kmp_init_lock( & __kmp_debug_lock );
6322  __kmp_init_atomic_lock( & __kmp_atomic_lock );
6323  __kmp_init_atomic_lock( & __kmp_atomic_lock_1i );
6324  __kmp_init_atomic_lock( & __kmp_atomic_lock_2i );
6325  __kmp_init_atomic_lock( & __kmp_atomic_lock_4i );
6326  __kmp_init_atomic_lock( & __kmp_atomic_lock_4r );
6327  __kmp_init_atomic_lock( & __kmp_atomic_lock_8i );
6328  __kmp_init_atomic_lock( & __kmp_atomic_lock_8r );
6329  __kmp_init_atomic_lock( & __kmp_atomic_lock_8c );
6330  __kmp_init_atomic_lock( & __kmp_atomic_lock_10r );
6331  __kmp_init_atomic_lock( & __kmp_atomic_lock_16r );
6332  __kmp_init_atomic_lock( & __kmp_atomic_lock_16c );
6333  __kmp_init_atomic_lock( & __kmp_atomic_lock_20c );
6334  __kmp_init_atomic_lock( & __kmp_atomic_lock_32c );
6335  __kmp_init_bootstrap_lock( & __kmp_forkjoin_lock );
6336  __kmp_init_bootstrap_lock( & __kmp_exit_lock );
6337  __kmp_init_bootstrap_lock( & __kmp_monitor_lock );
6338  __kmp_init_bootstrap_lock( & __kmp_tp_cached_lock );
6339 
6340  /* conduct initialization and initial setup of configuration */
6341 
6342  __kmp_runtime_initialize();
6343 
6344 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
6345  __kmp_check_mic_type();
6346 #endif
6347 
6348  // Some global variable initialization moved here from kmp_env_initialize()
6349 #ifdef KMP_DEBUG
6350  kmp_diag = 0;
6351 #endif
6352  __kmp_abort_delay = 0;
6353 
6354  // From __kmp_init_dflt_team_nth()
6355  /* assume the entire machine will be used */
6356  __kmp_dflt_team_nth_ub = __kmp_xproc;
6357  if( __kmp_dflt_team_nth_ub < KMP_MIN_NTH ) {
6358  __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
6359  }
6360  if( __kmp_dflt_team_nth_ub > __kmp_sys_max_nth ) {
6361  __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
6362  }
6363  __kmp_max_nth = __kmp_sys_max_nth;
6364 
6365  // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME" part
6366  __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
6367  __kmp_monitor_wakeups = KMP_WAKEUPS_FROM_BLOCKTIME( __kmp_dflt_blocktime, __kmp_monitor_wakeups );
6368  __kmp_bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME( __kmp_dflt_blocktime, __kmp_monitor_wakeups );
6369  // From "KMP_LIBRARY" part of __kmp_env_initialize()
6370  __kmp_library = library_throughput;
6371  // From KMP_SCHEDULE initialization
6372  __kmp_static = kmp_sch_static_balanced;
6373  // AC: do not use the analytical variant here, because it is non-monotonic
6374  //__kmp_guided = kmp_sch_guided_iterative_chunked;
6375  //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no need to repeat the assignment
6376  // Barrier initialization. Moved here from the barrier branch bit control and barrier method control
6377  // parts of __kmp_env_initialize().
6378  #if KMP_FAST_REDUCTION_BARRIER
6379  #define kmp_reduction_barrier_gather_bb ((int)1)
6380  #define kmp_reduction_barrier_release_bb ((int)1)
6381  #define kmp_reduction_barrier_gather_pat bp_hyper_bar
6382  #define kmp_reduction_barrier_release_pat bp_hyper_bar
6383  #endif // KMP_FAST_REDUCTION_BARRIER
6384  for ( i=bs_plain_barrier; i<bs_last_barrier; i++ ) {
6385  __kmp_barrier_gather_branch_bits [ i ] = __kmp_barrier_gather_bb_dflt;
6386  __kmp_barrier_release_branch_bits[ i ] = __kmp_barrier_release_bb_dflt;
6387  __kmp_barrier_gather_pattern [ i ] = __kmp_barrier_gather_pat_dflt;
6388  __kmp_barrier_release_pattern[ i ] = __kmp_barrier_release_pat_dflt;
6389  #if KMP_FAST_REDUCTION_BARRIER
6390  if( i == bs_reduction_barrier ) { // tested and confirmed on ALTIX only ( lin_64 ): hyper,1
6391  __kmp_barrier_gather_branch_bits [ i ] = kmp_reduction_barrier_gather_bb;
6392  __kmp_barrier_release_branch_bits[ i ] = kmp_reduction_barrier_release_bb;
6393  __kmp_barrier_gather_pattern [ i ] = kmp_reduction_barrier_gather_pat;
6394  __kmp_barrier_release_pattern[ i ] = kmp_reduction_barrier_release_pat;
6395  }
6396  #endif // KMP_FAST_REDUCTION_BARRIER
6397  }
6398  #if KMP_FAST_REDUCTION_BARRIER
6399  #undef kmp_reduction_barrier_release_pat
6400  #undef kmp_reduction_barrier_gather_pat
6401  #undef kmp_reduction_barrier_release_bb
6402  #undef kmp_reduction_barrier_gather_bb
6403  #endif // KMP_FAST_REDUCTION_BARRIER
6404 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
6405  if (__kmp_mic_type == mic2) { // KNC
6406  // AC: plain=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
6407  __kmp_barrier_gather_branch_bits [ bs_plain_barrier ] = 3; // plain gather
6408  __kmp_barrier_release_branch_bits[ bs_forkjoin_barrier ] = 1; // forkjoin release
6409  __kmp_barrier_gather_pattern [ bs_forkjoin_barrier ] = bp_hierarchical_bar;
6410  __kmp_barrier_release_pattern[ bs_forkjoin_barrier ] = bp_hierarchical_bar;
6411  }
6412 #if KMP_FAST_REDUCTION_BARRIER
6413  if (__kmp_mic_type == mic2) { // KNC
6414  __kmp_barrier_gather_pattern [ bs_reduction_barrier ] = bp_hierarchical_bar;
6415  __kmp_barrier_release_pattern[ bs_reduction_barrier ] = bp_hierarchical_bar;
6416  }
6417 #endif
6418 #endif
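/*
 * Illustrative note added for clarity (an assumption based on how the barrier code
 * consumes these settings, which is not restated here): the branch bits act as log2 of
 * the barrier fan-out, so the KNC plain-gather value of 3 above corresponds to up to
 * 1 << 3 = 8 children per node. The "demo_*" name is hypothetical.
 */
static int demo_barrier_branching_factor( int branch_bits ) {
    return 1 << branch_bits;   /* e.g. branch bits 3 -> branching factor 8 */
}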
6419 
6420  // From KMP_CHECKS initialization
6421 #ifdef KMP_DEBUG
6422  __kmp_env_checks = TRUE; /* development versions have the extra checks */
6423 #else
6424  __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
6425 #endif
6426 
6427  // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
6428  __kmp_foreign_tp = TRUE;
6429 
6430  __kmp_global.g.g_dynamic = FALSE;
6431  __kmp_global.g.g_dynamic_mode = dynamic_default;
6432 
6433  __kmp_env_initialize( NULL );
6434 
6435  // Print all messages in message catalog for testing purposes.
6436  #ifdef KMP_DEBUG
6437  char const * val = __kmp_env_get( "KMP_DUMP_CATALOG" );
6438  if ( __kmp_str_match_true( val ) ) {
6439  kmp_str_buf_t buffer;
6440  __kmp_str_buf_init( & buffer );
6441  __kmp_i18n_dump_catalog( & buffer );
6442  __kmp_printf( "%s", buffer.str );
6443  __kmp_str_buf_free( & buffer );
6444  }; // if
6445  __kmp_env_free( & val );
6446  #endif
6447 
6448  __kmp_threads_capacity = __kmp_initial_threads_capacity( __kmp_dflt_team_nth_ub );
6449  // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
6450  __kmp_tp_capacity = __kmp_default_tp_capacity(__kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
6451 
6452  // If the library is shut down properly, both pools must be NULL. Just in case, set them
6453  // to NULL -- some memory may leak, but subsequent code will work even if pools are not freed.
6454  KMP_DEBUG_ASSERT( __kmp_thread_pool == NULL );
6455  KMP_DEBUG_ASSERT( __kmp_thread_pool_insert_pt == NULL );
6456  KMP_DEBUG_ASSERT( __kmp_team_pool == NULL );
6457  __kmp_thread_pool = NULL;
6458  __kmp_thread_pool_insert_pt = NULL;
6459  __kmp_team_pool = NULL;
6460 
6461  /* Allocate all of the variable sized records */
6462  /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are expandable */
6463  /* Since allocation is cache-aligned, just add extra padding at the end */
6464  size = (sizeof(kmp_info_t*) + sizeof(kmp_root_t*))*__kmp_threads_capacity + CACHE_LINE;
6465  __kmp_threads = (kmp_info_t**) __kmp_allocate( size );
6466  __kmp_root = (kmp_root_t**) ((char*)__kmp_threads + sizeof(kmp_info_t*) * __kmp_threads_capacity );
6467 
6468  /* init thread counts */
6469  KMP_DEBUG_ASSERT( __kmp_all_nth == 0 ); // These asserts fail if the library is reinitializing and
6470  KMP_DEBUG_ASSERT( __kmp_nth == 0 ); // something went wrong during termination.
6471  __kmp_all_nth = 0;
6472  __kmp_nth = 0;
6473 
6474  /* setup the uber master thread and hierarchy */
6475  gtid = __kmp_register_root( TRUE );
6476  KA_TRACE( 10, ("__kmp_do_serial_initialize T#%d\n", gtid ));
6477  KMP_ASSERT( KMP_UBER_GTID( gtid ) );
6478  KMP_ASSERT( KMP_INITIAL_GTID( gtid ) );
6479 
6480  KMP_MB(); /* Flush all pending memory write invalidates. */
6481 
6482  __kmp_common_initialize();
6483 
6484  #if KMP_OS_UNIX
6485  /* invoke the child fork handler */
6486  __kmp_register_atfork();
6487  #endif
6488 
6489  #if ! defined KMP_DYNAMIC_LIB
6490  {
6491  /* Invoke the exit handler when the program finishes, but only for the static library.
6492  For the dynamic library, we already have _fini and DllMain.
6493  */
6494  int rc = atexit( __kmp_internal_end_atexit );
6495  if ( rc != 0 ) {
6496  __kmp_msg( kmp_ms_fatal, KMP_MSG( FunctionError, "atexit()" ), KMP_ERR( rc ), __kmp_msg_null );
6497  }; // if
6498  }
6499  #endif
6500 
6501  #if KMP_HANDLE_SIGNALS
6502  #if KMP_OS_UNIX
6503  /* NOTE: make sure that this is called before the user installs
6504  * their own signal handlers so that the user handlers
6505  * are called first. This way they can return false,
6506  * skip calling our handler, avoid terminating the library,
6507  * and continue execution where they left off. */
6508  __kmp_install_signals( FALSE );
6509  #endif /* KMP_OS_UNIX */
6510  #if KMP_OS_WINDOWS
6511  __kmp_install_signals( TRUE );
6512  #endif /* KMP_OS_WINDOWS */
6513  #endif
6514 
6515  /* we have finished the serial initialization */
6516  __kmp_init_counter ++;
6517 
6518  __kmp_init_serial = TRUE;
6519 
6520  if (__kmp_settings) {
6521  __kmp_env_print();
6522  }
6523 
6524 #if OMP_40_ENABLED
6525  if (__kmp_display_env || __kmp_display_env_verbose) {
6526  __kmp_env_print_2();
6527  }
6528 #endif // OMP_40_ENABLED
6529 
6530 #if OMPT_SUPPORT
6531  ompt_post_init();
6532 #endif
6533 
6534  KMP_MB();
6535 
6536  KA_TRACE( 10, ("__kmp_do_serial_initialize: exit\n" ) );
6537 }
6538 
6539 void
6540 __kmp_serial_initialize( void )
6541 {
6542  if ( __kmp_init_serial ) {
6543  return;
6544  }
6545  __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
6546  if ( __kmp_init_serial ) {
6547  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6548  return;
6549  }
6550  __kmp_do_serial_initialize();
6551  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6552 }
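/*
 * Illustrative sketch added for clarity (hypothetical names, not part of the runtime):
 * __kmp_serial_initialize above, and __kmp_middle_initialize / __kmp_parallel_initialize
 * below, all follow the same check / lock / re-check pattern so that concurrent callers
 * run the expensive initialization at most once. A standalone version using a plain
 * pthread mutex in place of the bootstrap lock:
 */
#include <pthread.h>

static int             demo_init_done = 0;
static pthread_mutex_t demo_init_lock = PTHREAD_MUTEX_INITIALIZER;

static void demo_lazy_initialize( void ) {
    if ( demo_init_done ) return;                 /* fast path: already initialized    */
    pthread_mutex_lock( &demo_init_lock );
    if ( demo_init_done ) {                       /* re-check: another thread beat us  */
        pthread_mutex_unlock( &demo_init_lock );
        return;
    }
    /* ... one-time initialization work goes here ... */
    demo_init_done = 1;
    pthread_mutex_unlock( &demo_init_lock );
}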
6553 
6554 static void
6555 __kmp_do_middle_initialize( void )
6556 {
6557  int i, j;
6558  int prev_dflt_team_nth;
6559 
6560  if( !__kmp_init_serial ) {
6561  __kmp_do_serial_initialize();
6562  }
6563 
6564  KA_TRACE( 10, ("__kmp_middle_initialize: enter\n" ) );
6565 
6566  //
6567  // Save the previous value for the __kmp_dflt_team_nth so that
6568  // we can avoid some reinitialization if it hasn't changed.
6569  //
6570  prev_dflt_team_nth = __kmp_dflt_team_nth;
6571 
6572 #if KMP_AFFINITY_SUPPORTED
6573  //
6574  // __kmp_affinity_initialize() will try to set __kmp_ncores to the
6575  // number of cores on the machine.
6576  //
6577  __kmp_affinity_initialize();
6578 
6579  //
6580  // Run through the __kmp_threads array and set the affinity mask
6581  // for each root thread that is currently registered with the RTL.
6582  //
6583  for ( i = 0; i < __kmp_threads_capacity; i++ ) {
6584  if ( TCR_PTR( __kmp_threads[ i ] ) != NULL ) {
6585  __kmp_affinity_set_init_mask( i, TRUE );
6586  }
6587  }
6588 #endif /* KMP_AFFINITY_SUPPORTED */
6589 
6590  KMP_ASSERT( __kmp_xproc > 0 );
6591  if ( __kmp_avail_proc == 0 ) {
6592  __kmp_avail_proc = __kmp_xproc;
6593  }
6594 
6595  // If there were empty entries in the num_threads list (e.g. OMP_NUM_THREADS=,,2,3), correct them now
6596  j = 0;
6597  while ( ( j < __kmp_nested_nth.used ) && ! __kmp_nested_nth.nth[ j ] ) {
6598  __kmp_nested_nth.nth[ j ] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub = __kmp_avail_proc;
6599  j++;
6600  }
6601 
6602  if ( __kmp_dflt_team_nth == 0 ) {
6603 #ifdef KMP_DFLT_NTH_CORES
6604  //
6605  // Default #threads = #cores
6606  //
6607  __kmp_dflt_team_nth = __kmp_ncores;
6608  KA_TRACE( 20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = __kmp_ncores (%d)\n",
6609  __kmp_dflt_team_nth ) );
6610 #else
6611  //
6612  // Default #threads = #available OS procs
6613  //
6614  __kmp_dflt_team_nth = __kmp_avail_proc;
6615  KA_TRACE( 20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = __kmp_avail_proc(%d)\n",
6616  __kmp_dflt_team_nth ) );
6617 #endif /* KMP_DFLT_NTH_CORES */
6618  }
6619 
6620  if ( __kmp_dflt_team_nth < KMP_MIN_NTH ) {
6621  __kmp_dflt_team_nth = KMP_MIN_NTH;
6622  }
6623  if( __kmp_dflt_team_nth > __kmp_sys_max_nth ) {
6624  __kmp_dflt_team_nth = __kmp_sys_max_nth;
6625  }
6626 
6627  //
6628  // There's no harm in continuing if the following check fails,
6629  // but it indicates an error in the previous logic.
6630  //
6631  KMP_DEBUG_ASSERT( __kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub );
6632 
6633  if ( __kmp_dflt_team_nth != prev_dflt_team_nth ) {
6634  //
6635  // Run through the __kmp_threads array and set the num threads icv
6636  // for each root thread that is currently registered with the RTL
6637  // (which has not already explicitly set its nthreads-var with a
6638  // call to omp_set_num_threads()).
6639  //
6640  for ( i = 0; i < __kmp_threads_capacity; i++ ) {
6641  kmp_info_t *thread = __kmp_threads[ i ];
6642  if ( thread == NULL ) continue;
6643  if ( thread->th.th_current_task->td_icvs.nproc != 0 ) continue;
6644 
6645  set__nproc( __kmp_threads[ i ], __kmp_dflt_team_nth );
6646  }
6647  }
6648  KA_TRACE( 20, ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
6649  __kmp_dflt_team_nth) );
6650 
6651 #ifdef KMP_ADJUST_BLOCKTIME
6652  /* Adjust blocktime to zero if necessary */
6653  /* now that __kmp_avail_proc is set */
6654  if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
6655  KMP_DEBUG_ASSERT( __kmp_avail_proc > 0 );
6656  if ( __kmp_nth > __kmp_avail_proc ) {
6657  __kmp_zero_bt = TRUE;
6658  }
6659  }
6660 #endif /* KMP_ADJUST_BLOCKTIME */
6661 
6662  /* we have finished middle initialization */
6663  TCW_SYNC_4(__kmp_init_middle, TRUE);
6664 
6665  KA_TRACE( 10, ("__kmp_do_middle_initialize: exit\n" ) );
6666 }
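/*
 * Illustrative sketch added for clarity (hypothetical names): the default team size
 * chosen above is just the core or processor count clamped to the runtime limits.
 * For example, on a 4-core machine with KMP_MIN_NTH = 1 and a large system maximum,
 * the default team size ends up as 4.
 */
static int demo_default_team_nth( int ncores, int min_nth, int sys_max_nth ) {
    int nth = ncores;                        /* "#threads = #cores" default */
    if ( nth < min_nth )     nth = min_nth;
    if ( nth > sys_max_nth ) nth = sys_max_nth;
    return nth;
}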
6667 
6668 void
6669 __kmp_middle_initialize( void )
6670 {
6671  if ( __kmp_init_middle ) {
6672  return;
6673  }
6674  __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
6675  if ( __kmp_init_middle ) {
6676  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6677  return;
6678  }
6679  __kmp_do_middle_initialize();
6680  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6681 }
6682 
6683 void
6684 __kmp_parallel_initialize( void )
6685 {
6686  int gtid = __kmp_entry_gtid(); // this might be a new root
6687 
6688  /* synchronize parallel initialization (for sibling) */
6689  if( TCR_4(__kmp_init_parallel) ) return;
6690  __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
6691  if( TCR_4(__kmp_init_parallel) ) { __kmp_release_bootstrap_lock( &__kmp_initz_lock ); return; }
6692 
6693  /* TODO reinitialization after we have already shut down */
6694  if( TCR_4(__kmp_global.g.g_done) ) {
6695  KA_TRACE( 10, ("__kmp_parallel_initialize: attempt to init while shutting down\n" ) );
6696  __kmp_infinite_loop();
6697  }
6698 
6699  /* jc: The lock __kmp_initz_lock is already held, so calling __kmp_serial_initialize
6700  would cause a deadlock. So we call __kmp_do_serial_initialize directly.
6701  */
6702  if( !__kmp_init_middle ) {
6703  __kmp_do_middle_initialize();
6704  }
6705 
6706  /* begin initialization */
6707  KA_TRACE( 10, ("__kmp_parallel_initialize: enter\n" ) );
6708  KMP_ASSERT( KMP_UBER_GTID( gtid ) );
6709 
6710 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6711  //
6712  // Save the FP control regs.
6713  // Worker threads will set theirs to these values at thread startup.
6714  //
6715  __kmp_store_x87_fpu_control_word( &__kmp_init_x87_fpu_control_word );
6716  __kmp_store_mxcsr( &__kmp_init_mxcsr );
6717  __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
6718 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
6719 
6720 #if KMP_OS_UNIX
6721 # if KMP_HANDLE_SIGNALS
6722  /* must be after __kmp_serial_initialize */
6723  __kmp_install_signals( TRUE );
6724 # endif
6725 #endif
6726 
6727  __kmp_suspend_initialize();
6728 
6729 #if defined(USE_LOAD_BALANCE)
6730  if ( __kmp_global.g.g_dynamic_mode == dynamic_default ) {
6731  __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
6732  }
6733 #else
6734  if ( __kmp_global.g.g_dynamic_mode == dynamic_default ) {
6735  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
6736  }
6737 #endif
6738 
6739  if ( __kmp_version ) {
6740  __kmp_print_version_2();
6741  }
6742 
6743  /* we have finished parallel initialization */
6744  TCW_SYNC_4(__kmp_init_parallel, TRUE);
6745 
6746  KMP_MB();
6747  KA_TRACE( 10, ("__kmp_parallel_initialize: exit\n" ) );
6748 
6749  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6750 }
6751 
6752 
6753 /* ------------------------------------------------------------------------ */
6754 
6755 void
6756 __kmp_run_before_invoked_task( int gtid, int tid, kmp_info_t *this_thr,
6757  kmp_team_t *team )
6758 {
6759  kmp_disp_t *dispatch;
6760 
6761  KMP_MB();
6762 
6763  /* none of the threads have encountered any constructs, yet. */
6764  this_thr->th.th_local.this_construct = 0;
6765 #if KMP_CACHE_MANAGE
6766  KMP_CACHE_PREFETCH( &this_thr->th.th_bar[ bs_forkjoin_barrier ].bb.b_arrived );
6767 #endif /* KMP_CACHE_MANAGE */
6768  dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
6769  KMP_DEBUG_ASSERT( dispatch );
6770  KMP_DEBUG_ASSERT( team->t.t_dispatch );
6771  //KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[ this_thr->th.th_info.ds.ds_tid ] );
6772 
6773  dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
6774 #if OMP_45_ENABLED
6775  dispatch->th_doacross_buf_idx = 0; /* reset the doacross dispatch buffer counter */
6776 #endif
6777  if( __kmp_env_consistency_check )
6778  __kmp_push_parallel( gtid, team->t.t_ident );
6779 
6780  KMP_MB(); /* Flush all pending memory write invalidates. */
6781 }
6782 
6783 void
6784 __kmp_run_after_invoked_task( int gtid, int tid, kmp_info_t *this_thr,
6785  kmp_team_t *team )
6786 {
6787  if( __kmp_env_consistency_check )
6788  __kmp_pop_parallel( gtid, team->t.t_ident );
6789 
6790  __kmp_finish_implicit_task(this_thr);
6791 }
6792 
6793 int
6794 __kmp_invoke_task_func( int gtid )
6795 {
6796  int rc;
6797  int tid = __kmp_tid_from_gtid( gtid );
6798  kmp_info_t *this_thr = __kmp_threads[ gtid ];
6799  kmp_team_t *team = this_thr->th.th_team;
6800 
6801  __kmp_run_before_invoked_task( gtid, tid, this_thr, team );
6802 #if USE_ITT_BUILD
6803  if ( __itt_stack_caller_create_ptr ) {
6804  __kmp_itt_stack_callee_enter( (__itt_caller)team->t.t_stack_id ); // inform ittnotify about entering user's code
6805  }
6806 #endif /* USE_ITT_BUILD */
6807 #if INCLUDE_SSC_MARKS
6808  SSC_MARK_INVOKING();
6809 #endif
6810 
6811 #if OMPT_SUPPORT
6812  void *dummy;
6813  void **exit_runtime_p;
6814  ompt_task_id_t my_task_id;
6815  ompt_parallel_id_t my_parallel_id;
6816 
6817  if (ompt_enabled) {
6818  exit_runtime_p = &(team->t.t_implicit_task_taskdata[tid].
6819  ompt_task_info.frame.exit_runtime_frame);
6820  } else {
6821  exit_runtime_p = &dummy;
6822  }
6823 
6824 #if OMPT_TRACE
6825  my_task_id = team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id;
6826  my_parallel_id = team->t.ompt_team_info.parallel_id;
6827  if (ompt_enabled &&
6828  ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)) {
6829  ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)(
6830  my_parallel_id, my_task_id);
6831  }
6832 #endif
6833 #endif
6834 
6835  {
6836  KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
6837  KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
6838  rc = __kmp_invoke_microtask( (microtask_t) TCR_SYNC_PTR(team->t.t_pkfn),
6839  gtid, tid, (int) team->t.t_argc, (void **) team->t.t_argv
6840 #if OMPT_SUPPORT
6841  , exit_runtime_p
6842 #endif
6843  );
6844  }
6845 
6846 #if USE_ITT_BUILD
6847  if ( __itt_stack_caller_create_ptr ) {
6848  __kmp_itt_stack_callee_leave( (__itt_caller)team->t.t_stack_id ); // inform ittnotify about leaving user's code
6849  }
6850 #endif /* USE_ITT_BUILD */
6851  __kmp_run_after_invoked_task( gtid, tid, this_thr, team );
6852 
6853  return rc;
6854 }
6855 
6856 #if OMP_40_ENABLED
6857 void
6858 __kmp_teams_master( int gtid )
6859 {
6860  // This routine is called by all master threads in teams construct
6861  kmp_info_t *thr = __kmp_threads[ gtid ];
6862  kmp_team_t *team = thr->th.th_team;
6863  ident_t *loc = team->t.t_ident;
6864  thr->th.th_set_nproc = thr->th.th_teams_size.nth;
6865  KMP_DEBUG_ASSERT( thr->th.th_teams_microtask );
6866  KMP_DEBUG_ASSERT( thr->th.th_set_nproc );
6867  KA_TRACE( 20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n",
6868  gtid, __kmp_tid_from_gtid( gtid ), thr->th.th_teams_microtask ) );
6869  // Launch the league of teams now, but do not let the workers execute yet
6870  // (they wait on the fork barrier until the next parallel region)
6871 #if INCLUDE_SSC_MARKS
6872  SSC_MARK_FORKING();
6873 #endif
6874  __kmp_fork_call( loc, gtid, fork_context_intel,
6875  team->t.t_argc,
6876 #if OMPT_SUPPORT
6877  (void *)thr->th.th_teams_microtask, // "unwrapped" task
6878 #endif
6879  (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
6880  VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
6881  NULL );
6882 #if INCLUDE_SSC_MARKS
6883  SSC_MARK_JOINING();
6884 #endif
6885 
6886  // AC: the last parameter "1" eliminates the join barrier, which would not work because
6887  // the worker threads are waiting at the fork barrier for more parallel regions
6888  __kmp_join_call( loc, gtid
6889 #if OMPT_SUPPORT
6890  , fork_context_intel
6891 #endif
6892  , 1 );
6893 }
6894 
6895 int
6896 __kmp_invoke_teams_master( int gtid )
6897 {
6898  kmp_info_t *this_thr = __kmp_threads[ gtid ];
6899  kmp_team_t *team = this_thr->th.th_team;
6900  #if KMP_DEBUG
6901  if ( !__kmp_threads[gtid]-> th.th_team->t.t_serialized )
6902  KMP_DEBUG_ASSERT( (void*)__kmp_threads[gtid]-> th.th_team->t.t_pkfn == (void*)__kmp_teams_master );
6903  #endif
6904  __kmp_run_before_invoked_task( gtid, 0, this_thr, team );
6905  __kmp_teams_master( gtid );
6906  __kmp_run_after_invoked_task( gtid, 0, this_thr, team );
6907  return 1;
6908 }
6909 #endif /* OMP_40_ENABLED */
6910 
6911 /* This sets the requested number of threads for the next parallel region
6912  * encountered by this team. */
6913 /* Since this should be enclosed in the forkjoin critical section, it
6914  * should avoid race conditions with asymmetrical nested parallelism. */
6915 
6916 void
6917 __kmp_push_num_threads( ident_t *id, int gtid, int num_threads )
6918 {
6919  kmp_info_t *thr = __kmp_threads[gtid];
6920 
6921  if( num_threads > 0 )
6922  thr->th.th_set_nproc = num_threads;
6923 }
6924 
6925 #if OMP_40_ENABLED
6926 
6927 /* this sets the requested number of teams for the teams region and/or
6928  * the number of threads for the next parallel region encountered */
6929 void
6930 __kmp_push_num_teams( ident_t *id, int gtid, int num_teams, int num_threads )
6931 {
6932  kmp_info_t *thr = __kmp_threads[gtid];
6933  KMP_DEBUG_ASSERT(num_teams >= 0);
6934  KMP_DEBUG_ASSERT(num_threads >= 0);
6935 
6936  if( num_teams == 0 )
6937  num_teams = 1; // default number of teams is 1.
6938  if( num_teams > __kmp_max_nth ) { // were too many teams requested?
6939  if ( !__kmp_reserve_warn ) {
6940  __kmp_reserve_warn = 1;
6941  __kmp_msg(
6942  kmp_ms_warning,
6943  KMP_MSG( CantFormThrTeam, num_teams, __kmp_max_nth ),
6944  KMP_HNT( Unset_ALL_THREADS ),
6945  __kmp_msg_null
6946  );
6947  }
6948  num_teams = __kmp_max_nth;
6949  }
6950  // Set number of teams (number of threads in the outer "parallel" of the teams)
6951  thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
6952 
6953  // Remember the number of threads for inner parallel regions
6954  if( num_threads == 0 ) {
6955  if( !TCR_4(__kmp_init_middle) )
6956  __kmp_middle_initialize(); // get __kmp_avail_proc calculated
6957  num_threads = __kmp_avail_proc / num_teams;
6958  if( num_teams * num_threads > __kmp_max_nth ) {
6959  // adjust num_threads without a warning since it is not a user setting
6960  num_threads = __kmp_max_nth / num_teams;
6961  }
6962  } else {
6963  if( num_teams * num_threads > __kmp_max_nth ) {
6964  int new_threads = __kmp_max_nth / num_teams;
6965  if ( !__kmp_reserve_warn ) { // user asked for too many threads
6966  __kmp_reserve_warn = 1; // that conflicts with OMP_THREAD_LIMIT
6967  __kmp_msg(
6968  kmp_ms_warning,
6969  KMP_MSG( CantFormThrTeam, num_threads, new_threads ),
6970  KMP_HNT( Unset_ALL_THREADS ),
6971  __kmp_msg_null
6972  );
6973  }
6974  num_threads = new_threads;
6975  }
6976  }
6977  thr->th.th_teams_size.nth = num_threads;
6978 }
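/*
 * Illustrative sketch added for clarity (hypothetical names, not part of the runtime):
 * the clamping above reduces to the arithmetic below. With max_nth = 256 and
 * avail_proc = 64, requesting 8 teams and 0 threads yields 64 / 8 = 8 threads per team,
 * while requesting 8 teams and 40 threads (8 * 40 = 320 > 256) is trimmed to
 * 256 / 8 = 32 threads per team.
 */
static int demo_threads_per_team( int num_teams, int num_threads,
                                  int avail_proc, int max_nth ) {
    if ( num_teams == 0 )       num_teams = 1;
    if ( num_teams > max_nth )  num_teams = max_nth;
    if ( num_threads == 0 )
        num_threads = avail_proc / num_teams;      /* spread available procs over the teams */
    if ( num_teams * num_threads > max_nth )
        num_threads = max_nth / num_teams;         /* respect the overall thread limit      */
    return num_threads;
}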
6979 
6980 
6981 //
6982 // Set the proc_bind var to use in the following parallel region.
6983 //
6984 void
6985 __kmp_push_proc_bind( ident_t *id, int gtid, kmp_proc_bind_t proc_bind )
6986 {
6987  kmp_info_t *thr = __kmp_threads[gtid];
6988  thr->th.th_set_proc_bind = proc_bind;
6989 }
6990 
6991 #endif /* OMP_40_ENABLED */
6992 
6993 /* Launch the worker threads into the microtask. */
6994 
6995 void
6996 __kmp_internal_fork( ident_t *id, int gtid, kmp_team_t *team )
6997 {
6998  kmp_info_t *this_thr = __kmp_threads[gtid];
6999 
7000 #ifdef KMP_DEBUG
7001  int f;
7002 #endif /* KMP_DEBUG */
7003 
7004  KMP_DEBUG_ASSERT( team );
7005  KMP_DEBUG_ASSERT( this_thr->th.th_team == team );
7006  KMP_ASSERT( KMP_MASTER_GTID(gtid) );
7007  KMP_MB(); /* Flush all pending memory write invalidates. */
7008 
7009  team->t.t_construct = 0; /* no single directives seen yet */
7010  team->t.t_ordered.dt.t_value = 0; /* thread 0 enters the ordered section first */
7011 
7012  /* Reset the identifiers on the dispatch buffer */
7013  KMP_DEBUG_ASSERT( team->t.t_disp_buffer );
7014  if ( team->t.t_max_nproc > 1 ) {
7015  int i;
7016  for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
7017  team->t.t_disp_buffer[ i ].buffer_index = i;
7018 #if OMP_45_ENABLED
7019  team->t.t_disp_buffer[i].doacross_buf_idx = i;
7020 #endif
7021  }
7022  } else {
7023  team->t.t_disp_buffer[ 0 ].buffer_index = 0;
7024 #if OMP_45_ENABLED
7025  team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7026 #endif
7027  }
7028 
7029  KMP_MB(); /* Flush all pending memory write invalidates. */
7030  KMP_ASSERT( this_thr->th.th_team == team );
7031 
7032 #ifdef KMP_DEBUG
7033  for( f=0 ; f<team->t.t_nproc ; f++ ) {
7034  KMP_DEBUG_ASSERT( team->t.t_threads[f] &&
7035  team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc );
7036  }
7037 #endif /* KMP_DEBUG */
7038 
7039  /* release the worker threads so they may begin working */
7040  __kmp_fork_barrier( gtid, 0 );
7041 }
7042 
7043 
7044 void
7045 __kmp_internal_join( ident_t *id, int gtid, kmp_team_t *team )
7046 {
7047  kmp_info_t *this_thr = __kmp_threads[gtid];
7048 
7049  KMP_DEBUG_ASSERT( team );
7050  KMP_DEBUG_ASSERT( this_thr->th.th_team == team );
7051  KMP_ASSERT( KMP_MASTER_GTID(gtid) );
7052  KMP_MB(); /* Flush all pending memory write invalidates. */
7053 
7054  /* Join barrier after fork */
7055 
7056 #ifdef KMP_DEBUG
7057  if (__kmp_threads[gtid] && __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc ) {
7058  __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n",gtid, gtid, __kmp_threads[gtid]);
7059  __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, team->t.t_nproc=%d\n",
7060  gtid, __kmp_threads[gtid]->th.th_team_nproc, team, team->t.t_nproc);
7061  __kmp_print_structure();
7062  }
7063  KMP_DEBUG_ASSERT( __kmp_threads[gtid] &&
7064  __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc );
7065 #endif /* KMP_DEBUG */
7066 
7067  __kmp_join_barrier( gtid ); /* wait for everyone */
7068 
7069  KMP_MB(); /* Flush all pending memory write invalidates. */
7070  KMP_ASSERT( this_thr->th.th_team == team );
7071 }
7072 
7073 
7074 /* ------------------------------------------------------------------------ */
7075 /* ------------------------------------------------------------------------ */
7076 
7077 #ifdef USE_LOAD_BALANCE
7078 
7079 //
7080 // Return the number of worker threads actively spinning in the hot team, if we
7081 // are at the outermost level of parallelism. Otherwise, return 0.
7082 //
7083 static int
7084 __kmp_active_hot_team_nproc( kmp_root_t *root )
7085 {
7086  int i;
7087  int retval;
7088  kmp_team_t *hot_team;
7089 
7090  if ( root->r.r_active ) {
7091  return 0;
7092  }
7093  hot_team = root->r.r_hot_team;
7094  if ( __kmp_dflt_blocktime == KMP_MAX_BLOCKTIME ) {
7095  return hot_team->t.t_nproc - 1; // Don't count master thread
7096  }
7097 
7098  //
7099  // Skip the master thread - it is accounted for elsewhere.
7100  //
7101  retval = 0;
7102  for ( i = 1; i < hot_team->t.t_nproc; i++ ) {
7103  if ( hot_team->t.t_threads[i]->th.th_active ) {
7104  retval++;
7105  }
7106  }
7107  return retval;
7108 }
7109 
7110 //
7111 // Perform an automatic adjustment to the number of
7112 // threads used by the next parallel region.
7113 //
7114 static int
7115 __kmp_load_balance_nproc( kmp_root_t *root, int set_nproc )
7116 {
7117  int retval;
7118  int pool_active;
7119  int hot_team_active;
7120  int team_curr_active;
7121  int system_active;
7122 
7123  KB_TRACE( 20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n",
7124  root, set_nproc ) );
7125  KMP_DEBUG_ASSERT( root );
7126  KMP_DEBUG_ASSERT( root->r.r_root_team->t.t_threads[0]->th.th_current_task->td_icvs.dynamic == TRUE );
7127  KMP_DEBUG_ASSERT( set_nproc > 1 );
7128 
7129  if ( set_nproc == 1) {
7130  KB_TRACE( 20, ("__kmp_load_balance_nproc: serial execution.\n" ) );
7131  return 1;
7132  }
7133 
7134  //
7135  // Threads that are active in the thread pool, active in the hot team
7136  // for this particular root (if we are at the outer par level), and
7137  // the currently executing thread (to become the master) are available
7138  // to add to the new team, but are currently contributing to the system
7139  // load, and must be accounted for.
7140  //
7141  pool_active = TCR_4(__kmp_thread_pool_active_nth);
7142  hot_team_active = __kmp_active_hot_team_nproc( root );
7143  team_curr_active = pool_active + hot_team_active + 1;
7144 
7145  //
7146  // Check the system load.
7147  //
7148  system_active = __kmp_get_load_balance( __kmp_avail_proc + team_curr_active );
7149  KB_TRACE( 30, ("__kmp_load_balance_nproc: system active = %d pool active = %d hot team active = %d\n",
7150  system_active, pool_active, hot_team_active ) );
7151 
7152  if ( system_active < 0 ) {
7153  //
7154  // There was an error reading the necessary info from /proc,
7155  // so use the thread limit algorithm instead. Once we set
7156  // __kmp_global.g.g_dynamic_mode = dynamic_thread_limit,
7157  // we shouldn't wind up getting back here.
7158  //
7159  __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7160  KMP_WARNING( CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit" );
7161 
7162  //
7163  // Make this call behave like the thread limit algorithm.
7164  //
7165  retval = __kmp_avail_proc - __kmp_nth + (root->r.r_active ? 1
7166  : root->r.r_hot_team->t.t_nproc);
7167  if ( retval > set_nproc ) {
7168  retval = set_nproc;
7169  }
7170  if ( retval < KMP_MIN_NTH ) {
7171  retval = KMP_MIN_NTH;
7172  }
7173 
7174  KB_TRACE( 20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n", retval ) );
7175  return retval;
7176  }
7177 
7178  //
7179  // There is a slight delay in the load balance algorithm in detecting
7180  // new running procs. The real system load at this instant should be
7181  // at least as large as the number of active OMP threads that are available to
7182  // add to the team.
7183  //
7184  if ( system_active < team_curr_active ) {
7185  system_active = team_curr_active;
7186  }
7187  retval = __kmp_avail_proc - system_active + team_curr_active;
7188  if ( retval > set_nproc ) {
7189  retval = set_nproc;
7190  }
7191  if ( retval < KMP_MIN_NTH ) {
7192  retval = KMP_MIN_NTH;
7193  }
7194 
7195  KB_TRACE( 20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval ) );
7196  return retval;
7197 } // __kmp_load_balance_nproc()
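/*
 * Illustrative sketch added for clarity (hypothetical names): the non-error path above
 * reduces to the arithmetic below. With avail_proc = 16, three active pool threads and
 * four active hot-team workers (team_curr_active = 3 + 4 + 1 = 8) and a measured system
 * load of 10, the suggested team size is 16 - 10 + 8 = 14, then clamped to the requested
 * set_nproc and to KMP_MIN_NTH.
 */
static int demo_load_balance_nproc( int avail_proc, int system_active,
                                    int team_curr_active, int set_nproc, int min_nth ) {
    if ( system_active < team_curr_active )   /* the load reading may lag our own threads */
        system_active = team_curr_active;
    int retval = avail_proc - system_active + team_curr_active;
    if ( retval > set_nproc ) retval = set_nproc;
    if ( retval < min_nth )   retval = min_nth;
    return retval;
}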
7198 
7199 #endif /* USE_LOAD_BALANCE */
7200 
7201 /* ------------------------------------------------------------------------ */
7202 /* ------------------------------------------------------------------------ */
7203 
7204 /* NOTE: this is called with the __kmp_init_lock held */
7205 void
7206 __kmp_cleanup( void )
7207 {
7208  int f;
7209 
7210  KA_TRACE( 10, ("__kmp_cleanup: enter\n" ) );
7211 
7212  if (TCR_4(__kmp_init_parallel)) {
7213 #if KMP_HANDLE_SIGNALS
7214  __kmp_remove_signals();
7215 #endif
7216  TCW_4(__kmp_init_parallel, FALSE);
7217  }
7218 
7219  if (TCR_4(__kmp_init_middle)) {
7220 #if KMP_AFFINITY_SUPPORTED
7221  __kmp_affinity_uninitialize();
7222 #endif /* KMP_AFFINITY_SUPPORTED */
7223  __kmp_cleanup_hierarchy();
7224  TCW_4(__kmp_init_middle, FALSE);
7225  }
7226 
7227  KA_TRACE( 10, ("__kmp_cleanup: go serial cleanup\n" ) );
7228 
7229  if (__kmp_init_serial) {
7230  __kmp_runtime_destroy();
7231  __kmp_init_serial = FALSE;
7232  }
7233 
7234  for ( f = 0; f < __kmp_threads_capacity; f++ ) {
7235  if ( __kmp_root[ f ] != NULL ) {
7236  __kmp_free( __kmp_root[ f ] );
7237  __kmp_root[ f ] = NULL;
7238  }
7239  }
7240  __kmp_free( __kmp_threads );
7241  // __kmp_threads and __kmp_root were allocated at once, as a single block, so there is no need
7242  // to free __kmp_root separately.
7243  __kmp_threads = NULL;
7244  __kmp_root = NULL;
7245  __kmp_threads_capacity = 0;
7246 
7247 #if KMP_USE_DYNAMIC_LOCK
7248  __kmp_cleanup_indirect_user_locks();
7249 #else
7250  __kmp_cleanup_user_locks();
7251 #endif
7252 
7253  #if KMP_AFFINITY_SUPPORTED
7254  KMP_INTERNAL_FREE( (void *) __kmp_cpuinfo_file );
7255  __kmp_cpuinfo_file = NULL;
7256  #endif /* KMP_AFFINITY_SUPPORTED */
7257 
7258  #if KMP_USE_ADAPTIVE_LOCKS
7259  #if KMP_DEBUG_ADAPTIVE_LOCKS
7260  __kmp_print_speculative_stats();
7261  #endif
7262  #endif
7263  KMP_INTERNAL_FREE( __kmp_nested_nth.nth );
7264  __kmp_nested_nth.nth = NULL;
7265  __kmp_nested_nth.size = 0;
7266  __kmp_nested_nth.used = 0;
7267 
7268  __kmp_i18n_catclose();
7269 
7270 #if KMP_STATS_ENABLED
7271  __kmp_accumulate_stats_at_exit();
7272  __kmp_stats_list.deallocate();
7273 #endif
7274 
7275  KA_TRACE( 10, ("__kmp_cleanup: exit\n" ) );
7276 }
7277 
7278 /* ------------------------------------------------------------------------ */
7279 /* ------------------------------------------------------------------------ */
7280 
7281 int
7282 __kmp_ignore_mppbeg( void )
7283 {
7284  char *env;
7285 
7286  if ((env = getenv( "KMP_IGNORE_MPPBEG" )) != NULL) {
7287  if (__kmp_str_match_false( env ))
7288  return FALSE;
7289  }
7290  // By default __kmpc_begin() is no-op.
7291  return TRUE;
7292 }
7293 
7294 int
7295 __kmp_ignore_mppend( void )
7296 {
7297  char *env;
7298 
7299  if ((env = getenv( "KMP_IGNORE_MPPEND" )) != NULL) {
7300  if (__kmp_str_match_false( env ))
7301  return FALSE;
7302  }
7303  // By default __kmpc_end() is no-op.
7304  return TRUE;
7305 }
7306 
7307 void
7308 __kmp_internal_begin( void )
7309 {
7310  int gtid;
7311  kmp_root_t *root;
7312 
7313  /* this is a very important step as it will register new sibling threads
7314  * and assign these new uber threads a new gtid */
7315  gtid = __kmp_entry_gtid();
7316  root = __kmp_threads[ gtid ]->th.th_root;
7317  KMP_ASSERT( KMP_UBER_GTID( gtid ));
7318 
7319  if( root->r.r_begin ) return;
7320  __kmp_acquire_lock( &root->r.r_begin_lock, gtid );
7321  if( root->r.r_begin ) {
7322  __kmp_release_lock( & root->r.r_begin_lock, gtid );
7323  return;
7324  }
7325 
7326  root->r.r_begin = TRUE;
7327 
7328  __kmp_release_lock( & root->r.r_begin_lock, gtid );
7329 }
7330 
7331 
7332 /* ------------------------------------------------------------------------ */
7333 /* ------------------------------------------------------------------------ */
7334 
7335 void
7336 __kmp_user_set_library (enum library_type arg)
7337 {
7338  int gtid;
7339  kmp_root_t *root;
7340  kmp_info_t *thread;
7341 
7342  /* first, make sure we are initialized so we can get our gtid */
7343 
7344  gtid = __kmp_entry_gtid();
7345  thread = __kmp_threads[ gtid ];
7346 
7347  root = thread->th.th_root;
7348 
7349  KA_TRACE( 20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg, library_serial ));
7350  if (root->r.r_in_parallel) { /* Must be called in serial section of top-level thread */
7351  KMP_WARNING( SetLibraryIncorrectCall );
7352  return;
7353  }
7354 
7355  switch ( arg ) {
7356  case library_serial :
7357  thread->th.th_set_nproc = 0;
7358  set__nproc( thread, 1 );
7359  break;
7360  case library_turnaround :
7361  thread->th.th_set_nproc = 0;
7362  set__nproc( thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth : __kmp_dflt_team_nth_ub );
7363  break;
7364  case library_throughput :
7365  thread->th.th_set_nproc = 0;
7366  set__nproc( thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth : __kmp_dflt_team_nth_ub );
7367  break;
7368  default:
7369  KMP_FATAL( UnknownLibraryType, arg );
7370  }
7371 
7372  __kmp_aux_set_library ( arg );
7373 }
7374 
7375 void
7376 __kmp_aux_set_stacksize( size_t arg )
7377 {
7378  if (! __kmp_init_serial)
7379  __kmp_serial_initialize();
7380 
7381 #if KMP_OS_DARWIN
7382  if (arg & (0x1000 - 1)) {
7383  arg &= ~(0x1000 - 1);
7384  if(arg + 0x1000) /* check for overflow if we round up */
7385  arg += 0x1000;
7386  }
7387 #endif
7388  __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
7389 
7390  /* only change the default stacksize before the first parallel region */
7391  if (! TCR_4(__kmp_init_parallel)) {
7392  size_t value = arg; /* argument is in bytes */
7393 
7394  if (value < __kmp_sys_min_stksize )
7395  value = __kmp_sys_min_stksize ;
7396  else if (value > KMP_MAX_STKSIZE)
7397  value = KMP_MAX_STKSIZE;
7398 
7399  __kmp_stksize = value;
7400 
7401  __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
7402  }
7403 
7404  __kmp_release_bootstrap_lock( &__kmp_initz_lock );
7405 }
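/*
 * Illustrative sketch added for clarity: the OS X* branch above rounds the requested
 * stack size up to the next 0x1000-byte (4 KiB) boundary unless the addition would wrap
 * around, e.g. 0x2345 -> 0x3000. A standalone version of that rounding:
 */
#include <stddef.h>

static size_t demo_round_up_to_page( size_t bytes ) {
    if ( bytes & ( 0x1000 - 1 ) ) {              /* not already page aligned  */
        bytes &= ~(size_t)( 0x1000 - 1 );        /* drop the low 12 bits      */
        if ( bytes + 0x1000 != 0 )               /* guard against wrap-around */
            bytes += 0x1000;
    }
    return bytes;
}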
7406 
7407 /* set the behaviour of the runtime library */
7408 /* TODO this can cause some odd behaviour with sibling parallelism... */
7409 void
7410 __kmp_aux_set_library (enum library_type arg)
7411 {
7412  __kmp_library = arg;
7413 
7414  switch ( __kmp_library ) {
7415  case library_serial :
7416  {
7417  KMP_INFORM( LibraryIsSerial );
7418  (void) __kmp_change_library( TRUE );
7419  }
7420  break;
7421  case library_turnaround :
7422  (void) __kmp_change_library( TRUE );
7423  break;
7424  case library_throughput :
7425  (void) __kmp_change_library( FALSE );
7426  break;
7427  default:
7428  KMP_FATAL( UnknownLibraryType, arg );
7429  }
7430 }
7431 
7432 /* ------------------------------------------------------------------------ */
7433 /* ------------------------------------------------------------------------ */
7434 
7435 void
7436 __kmp_aux_set_blocktime (int arg, kmp_info_t *thread, int tid)
7437 {
7438  int blocktime = arg; /* argument is in milliseconds */
7439  int bt_intervals;
7440  int bt_set;
7441 
7442  __kmp_save_internal_controls( thread );
7443 
7444  /* Normalize and set blocktime for the teams */
7445  if (blocktime < KMP_MIN_BLOCKTIME)
7446  blocktime = KMP_MIN_BLOCKTIME;
7447  else if (blocktime > KMP_MAX_BLOCKTIME)
7448  blocktime = KMP_MAX_BLOCKTIME;
7449 
7450  set__blocktime_team( thread->th.th_team, tid, blocktime );
7451  set__blocktime_team( thread->th.th_serial_team, 0, blocktime );
7452 
7453  /* Calculate and set blocktime intervals for the teams */
7454  bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
7455 
7456  set__bt_intervals_team( thread->th.th_team, tid, bt_intervals );
7457  set__bt_intervals_team( thread->th.th_serial_team, 0, bt_intervals );
7458 
7459  /* Record that the blocktime has been explicitly set */
7460  bt_set = TRUE;
7461 
7462  set__bt_set_team( thread->th.th_team, tid, bt_set );
7463  set__bt_set_team( thread->th.th_serial_team, 0, bt_set );
7464  KF_TRACE(10, ( "kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, bt_intervals=%d, monitor_updates=%d\n",
7465  __kmp_gtid_from_tid(tid, thread->th.th_team),
7466  thread->th.th_team->t.t_id, tid, blocktime, bt_intervals, __kmp_monitor_wakeups ) );
7467 }
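/*
 * Illustrative arithmetic added for clarity (an assumption about what
 * KMP_INTERVALS_FROM_BLOCKTIME computes, not a quote of the macro): with the monitor
 * waking N times per second its period is 1000 / N milliseconds, and the blocktime is
 * converted to a count of monitor intervals by a rounded-up division. For a 200 ms
 * blocktime and 10 wakeups per second (a 100 ms period) that gives 2 intervals before
 * a worker goes to sleep. The "demo_*" name is hypothetical.
 */
static int demo_intervals_from_blocktime( int blocktime_ms, int wakeups_per_sec ) {
    int period_ms = 1000 / wakeups_per_sec;               /* monitor period in ms */
    return ( blocktime_ms + period_ms - 1 ) / period_ms;  /* ceiling division     */
}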
7468 
7469 void
7470 __kmp_aux_set_defaults(
7471  char const * str,
7472  int len
7473 ) {
7474  if ( ! __kmp_init_serial ) {
7475  __kmp_serial_initialize();
7476  };
7477  __kmp_env_initialize( str );
7478 
7479  if (__kmp_settings
7480 #if OMP_40_ENABLED
7481  || __kmp_display_env || __kmp_display_env_verbose
7482 #endif // OMP_40_ENABLED
7483  ) {
7484  __kmp_env_print();
7485  }
7486 } // __kmp_aux_set_defaults
7487 
7488 /* ------------------------------------------------------------------------ */
7489 
7490 /*
7491  * internal fast reduction routines
7492  */
7493 
7494 PACKED_REDUCTION_METHOD_T
7495 __kmp_determine_reduction_method( ident_t *loc, kmp_int32 global_tid,
7496  kmp_int32 num_vars, size_t reduce_size, void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
7497  kmp_critical_name *lck )
7498 {
7499 
7500  // Default reduction method: critical construct ( lck != NULL, like in current PAROPT )
7501  // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method can be selected by RTL
7502  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method can be selected by RTL
7503  // Finally, it is up to the OpenMP RTL to decide which method to select among those generated by PAROPT.
7504 
7505  PACKED_REDUCTION_METHOD_T retval;
7506 
7507  int team_size;
7508 
7509  KMP_DEBUG_ASSERT( loc ); // it would be nice to test ( loc != 0 )
7510  KMP_DEBUG_ASSERT( lck ); // it would be nice to test ( lck != 0 )
7511 
7512  #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED ( ( loc->flags & ( KMP_IDENT_ATOMIC_REDUCE ) ) == ( KMP_IDENT_ATOMIC_REDUCE ) )
7513  #define FAST_REDUCTION_TREE_METHOD_GENERATED ( ( reduce_data ) && ( reduce_func ) )
7514 
7515  retval = critical_reduce_block;
7516 
7517  team_size = __kmp_get_team_num_threads( global_tid ); // another way of getting the team size ( with 1 dynamic dereference ) is slower
7518 
7519  if( team_size == 1 ) {
7520 
7521  retval = empty_reduce_block;
7522 
7523  } else {
7524 
7525  int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
7526  int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
7527 
7528  #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64
7529 
7530  #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN
7531 
7532  int teamsize_cutoff = 4;
7533 
7534 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
7535  if( __kmp_mic_type != non_mic ) {
7536  teamsize_cutoff = 8;
7537  }
7538 #endif
7539  if( tree_available ) {
7540  if( team_size <= teamsize_cutoff ) {
7541  if ( atomic_available ) {
7542  retval = atomic_reduce_block;
7543  }
7544  } else {
7545  retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
7546  }
7547  } else if ( atomic_available ) {
7548  retval = atomic_reduce_block;
7549  }
7550  #else
7551  #error "Unknown or unsupported OS"
7552  #endif // KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN
7553 
7554  #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
7555 
7556  #if KMP_OS_LINUX || KMP_OS_WINDOWS
7557 
7558  // basic tuning
7559 
7560  if( atomic_available ) {
7561  if( num_vars <= 2 ) { // && ( team_size <= 8 ) due to false-sharing ???
7562  retval = atomic_reduce_block;
7563  }
7564  } // otherwise: use critical section
7565 
7566  #elif KMP_OS_DARWIN
7567 
7568  if( atomic_available && ( num_vars <= 3 ) ) {
7569  retval = atomic_reduce_block;
7570  } else if( tree_available ) {
7571  if( ( reduce_size > ( 9 * sizeof( kmp_real64 ) ) ) && ( reduce_size < ( 2000 * sizeof( kmp_real64 ) ) ) ) {
7572  retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
7573  }
7574  } // otherwise: use critical section
7575 
7576  #else
7577  #error "Unknown or unsupported OS"
7578  #endif
7579 
7580  #else
7581  #error "Unknown or unsupported architecture"
7582  #endif
7583 
7584  }
7585 
7586  // KMP_FORCE_REDUCTION
7587 
7588  // If the team is serialized (team_size == 1), ignore the forced reduction
7589  // method and stay with the unsynchronized method (empty_reduce_block)
7590  if( __kmp_force_reduction_method != reduction_method_not_defined && team_size != 1) {
7591 
7592  PACKED_REDUCTION_METHOD_T forced_retval;
7593 
7594  int atomic_available, tree_available;
7595 
7596  switch( ( forced_retval = __kmp_force_reduction_method ) )
7597  {
7598  case critical_reduce_block:
7599  KMP_ASSERT( lck ); // lck should be != 0
7600  break;
7601 
7602  case atomic_reduce_block:
7603  atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
7604  KMP_ASSERT( atomic_available ); // atomic_available should be != 0
7605  break;
7606 
7607  case tree_reduce_block:
7608  tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
7609  KMP_ASSERT( tree_available ); // tree_available should be != 0
7610  #if KMP_FAST_REDUCTION_BARRIER
7611  forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
7612  #endif
7613  break;
7614 
7615  default:
7616  KMP_ASSERT( 0 ); // "unsupported method specified"
7617  }
7618 
7619  retval = forced_retval;
7620  }
7621 
7622  KA_TRACE(10, ( "reduction method selected=%08x\n", retval ) );
7623 
7624  #undef FAST_REDUCTION_TREE_METHOD_GENERATED
7625  #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
7626 
7627  return ( retval );
7628 }
7629 
7630 // this function is for testing set/get/determine reduce method
7631 kmp_int32
7632 __kmp_get_reduce_method( void ) {
7633  return ( ( __kmp_entry_thread()->th.th_local.packed_reduction_method ) >> 8 );
7634 }
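/*
 * Illustrative note added for clarity (an assumption based on the ">> 8" above): the
 * packed reduction method appears to keep the barrier type in the low 8 bits and the
 * method id in the higher bits, so unpacking is just a shift and a mask. The "demo_*"
 * names are hypothetical.
 */
static int demo_unpack_reduction_method( int packed ) {
    return packed >> 8;        /* reduction method id */
}
static int demo_unpack_reduction_barrier( int packed ) {
    return packed & 0xFF;      /* barrier type used with tree reductions */
}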
7635 
7636 /* ------------------------------------------------------------------------ */