1 /*
2  * kmp_barrier.cpp
3  */
4 
5 
6 //===----------------------------------------------------------------------===//
7 //
8 // The LLVM Compiler Infrastructure
9 //
10 // This file is dual licensed under the MIT and the University of Illinois Open
11 // Source Licenses. See LICENSE.txt for details.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 
16 #include "kmp.h"
17 #include "kmp_wait_release.h"
18 #include "kmp_stats.h"
19 #include "kmp_itt.h"
20 #include "kmp_os.h"
21 
22 
23 #if KMP_MIC
24 #include <immintrin.h>
25 #define USE_NGO_STORES 1
26 #endif // KMP_MIC
27 
28 #if KMP_MIC && USE_NGO_STORES
29 // ICV copying
30 #define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
31 #define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
32 #define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
33 #define ngo_sync() __asm__ volatile ("lock; addl $0,0(%%rsp)" ::: "memory")
34 #else
35 #define ngo_load(src) ((void)0)
36 #define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
37 #define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE)
38 #define ngo_sync() ((void)0)
39 #endif /* KMP_MIC && USE_NGO_STORES */
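/* On KMP_MIC the ngo_* macros above implement ICV propagation with streaming stores:
   ngo_load() latches one cache line of ICVs into a 512-bit register, ngo_store_icvs()
   and ngo_store_go() write that latched line out with _mm512_storenrngo_pd (their src
   argument is only used by the non-MIC fallbacks), and ngo_sync() fences the
   non-globally-ordered stores with a locked add.  On other targets they fall back to
   plain copy_icvs()/KMP_MEMCPY and a no-op sync.  The intended usage pattern, as in
   the release paths below (names here are illustrative only), is roughly:

       ngo_load(&master_icvs);                      // latch the master's ICV line once
       for (int i = 1; i < nproc; ++i)
           ngo_store_icvs(&worker_icvs[i], &master_icvs);
       ngo_sync();                                  // make the stores visible
*/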
40 
41 void __kmp_print_structure(void); // Forward declaration
42 
43 // ---------------------------- Barrier Algorithms ----------------------------
44 
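/* Each algorithm below comes as a gather/release pair.  In the gather phase every
   arriving thread signals by bumping its 64-bit b_arrived counter by
   KMP_BARRIER_STATE_BUMP, and its parent (ultimately the master) waits, via
   kmp_flag_64::wait(), until each child's b_arrived reaches the new team value,
   invoking the optional reduce callback on th_local.reduce_data along the way.
   In the release phase the parent signals each child's b_go flag (via kmp_flag_64
   or, in the hierarchical barrier, the on-core flag); workers spin on their flag and
   then reset their own b_go to KMP_INIT_BARRIER_STATE.  The propagate_icvs argument
   controls whether ICVs are pushed to the workers' implicit tasks during release. */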
45 // Linear Barrier
46 static void
47 __kmp_linear_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
48  void (*reduce)(void *, void *)
49  USE_ITT_BUILD_ARG(void * itt_sync_obj) )
50 {
51  KMP_TIME_DEVELOPER_BLOCK(KMP_linear_gather);
52  register kmp_team_t *team = this_thr->th.th_team;
53  register kmp_bstate_t *thr_bar = & this_thr->th.th_bar[bt].bb;
54  register kmp_info_t **other_threads = team->t.t_threads;
55 
56  KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
57  gtid, team->t.t_id, tid, bt));
58  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
59 
60 #if USE_ITT_BUILD && USE_ITT_NOTIFY
61  // Barrier imbalance - save arrive time to the thread
62  if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
63  this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
64  }
65 #endif
66  // We now perform a linear reduction to signal that all of the threads have arrived.
67  if (!KMP_MASTER_TID(tid)) {
68  KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)"
69  "arrived(%p): %llu => %llu\n", gtid, team->t.t_id, tid,
70  __kmp_gtid_from_tid(0, team), team->t.t_id, 0, &thr_bar->b_arrived,
71  thr_bar->b_arrived, thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
72  // Mark arrival to master thread
73  /* After performing this write, a worker thread may not assume that the team is valid
74  any more - it could be deallocated by the master thread at any time. */
75  kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[0]);
76  flag.release();
77  } else {
78  register kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
79  register int nproc = this_thr->th.th_team_nproc;
80  register int i;
81  // Don't have to worry about sleep bit here or atomic since team setting
82  register kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;
83 
84  // Collect all the worker team member threads.
85  for (i=1; i<nproc; ++i) {
86 #if KMP_CACHE_MANAGE
87  // Prefetch next thread's arrived count
88  if (i+1 < nproc)
89  KMP_CACHE_PREFETCH(&other_threads[i+1]->th.th_bar[bt].bb.b_arrived);
90 #endif /* KMP_CACHE_MANAGE */
91  KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
92  "arrived(%p) == %llu\n", gtid, team->t.t_id, tid,
93  __kmp_gtid_from_tid(i, team), team->t.t_id, i,
94  &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));
95 
96  // Wait for worker thread to arrive
97  kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state);
98  flag.wait(this_thr, FALSE
99  USE_ITT_BUILD_ARG(itt_sync_obj) );
100 #if USE_ITT_BUILD && USE_ITT_NOTIFY
101  // Barrier imbalance - write min of the thread time and the other thread time to the thread.
102  if (__kmp_forkjoin_frames_mode == 2) {
103  this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
104  other_threads[i]->th.th_bar_min_time);
105  }
106 #endif
107  if (reduce) {
108  KA_TRACE(100, ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n", gtid,
109  team->t.t_id, tid, __kmp_gtid_from_tid(i, team), team->t.t_id, i));
110  (*reduce)(this_thr->th.th_local.reduce_data,
111  other_threads[i]->th.th_local.reduce_data);
112  }
113  }
114  // Don't have to worry about sleep bit here or atomic since team setting
115  team_bar->b_arrived = new_state;
116  KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %llu\n",
117  gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived, new_state));
118  }
119  KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
120  gtid, team->t.t_id, tid, bt));
121 }
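/* Note that in the linear gather the master alone polls all nproc-1 workers, so the
   gather cost grows linearly with the team size; each worker performs a single
   release of its own b_arrived flag and must not touch the team structure afterwards,
   since the master may deallocate it at any time. */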
122 
123 static void
124 __kmp_linear_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
125  int propagate_icvs
126  USE_ITT_BUILD_ARG(void *itt_sync_obj) )
127 {
128  KMP_TIME_DEVELOPER_BLOCK(KMP_linear_release);
129  register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
130  register kmp_team_t *team;
131 
132  if (KMP_MASTER_TID(tid)) {
133  register unsigned int i;
134  register kmp_uint32 nproc = this_thr->th.th_team_nproc;
135  register kmp_info_t **other_threads;
136 
137  team = __kmp_threads[gtid]->th.th_team;
138  KMP_DEBUG_ASSERT(team != NULL);
139  other_threads = team->t.t_threads;
140 
141  KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
142  gtid, team->t.t_id, tid, bt));
143 
144  if (nproc > 1) {
145 #if KMP_BARRIER_ICV_PUSH
146  {
147  KMP_TIME_DEVELOPER_BLOCK(USER_icv_copy);
148  if (propagate_icvs) {
149  ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
150  for (i=1; i<nproc; ++i) {
151  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i], team, i, FALSE);
152  ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
153  &team->t.t_implicit_task_taskdata[0].td_icvs);
154  }
155  ngo_sync();
156  }
157  }
158 #endif // KMP_BARRIER_ICV_PUSH
159 
160  // Now, release all of the worker threads
161  for (i=1; i<nproc; ++i) {
162 #if KMP_CACHE_MANAGE
163  // Prefetch next thread's go flag
164  if (i+1 < nproc)
165  KMP_CACHE_PREFETCH(&other_threads[i+1]->th.th_bar[bt].bb.b_go);
166 #endif /* KMP_CACHE_MANAGE */
167  KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
168  "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
169  other_threads[i]->th.th_info.ds.ds_gtid, team->t.t_id, i,
170  &other_threads[i]->th.th_bar[bt].bb.b_go,
171  other_threads[i]->th.th_bar[bt].bb.b_go,
172  other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
173  kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_go, other_threads[i]);
174  flag.release();
175  }
176  }
177  } else { // Wait for the MASTER thread to release us
178  KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
179  gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
180  kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
181  flag.wait(this_thr, TRUE
182  USE_ITT_BUILD_ARG(itt_sync_obj) );
183 #if USE_ITT_BUILD && USE_ITT_NOTIFY
184  if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
185  // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is disabled)
186  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
187  // Cancel wait on previous parallel region...
188  __kmp_itt_task_starting(itt_sync_obj);
189 
190  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
191  return;
192 
193  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
194  if (itt_sync_obj != NULL)
195  // Call prepare as early as possible for "new" barrier
196  __kmp_itt_task_finished(itt_sync_obj);
197  } else
198 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
199  // Early exit for reaping threads releasing forkjoin barrier
200  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
201  return;
202  // The worker thread may now assume that the team is valid.
203 #ifdef KMP_DEBUG
204  tid = __kmp_tid_from_gtid(gtid);
205  team = __kmp_threads[gtid]->th.th_team;
206 #endif
207  KMP_DEBUG_ASSERT(team != NULL);
208  TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
209  KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
210  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
211  KMP_MB(); // Flush all pending memory write invalidates.
212  }
213  KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
214  gtid, team->t.t_id, tid, bt));
215 }
216 
217 // Tree barrier
218 static void
219 __kmp_tree_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
220  void (*reduce)(void *, void *)
221  USE_ITT_BUILD_ARG(void *itt_sync_obj) )
222 {
223  KMP_TIME_DEVELOPER_BLOCK(KMP_tree_gather);
224  register kmp_team_t *team = this_thr->th.th_team;
225  register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
226  register kmp_info_t **other_threads = team->t.t_threads;
227  register kmp_uint32 nproc = this_thr->th.th_team_nproc;
228  register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
229  register kmp_uint32 branch_factor = 1 << branch_bits;
230  register kmp_uint32 child;
231  register kmp_uint32 child_tid;
232  register kmp_uint64 new_state;
233 
234  KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
235  gtid, team->t.t_id, tid, bt));
236  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
237 
238 #if USE_ITT_BUILD && USE_ITT_NOTIFY
239  // Barrier imbalance - save arrive time to the thread
240  if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
241  this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
242  }
243 #endif
244  // Perform tree gather to wait until all threads have arrived; reduce any required data as we go
245  child_tid = (tid << branch_bits) + 1;
246  if (child_tid < nproc) {
247  // Parent threads wait for all their children to arrive
248  new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
249  child = 1;
250  do {
251  register kmp_info_t *child_thr = other_threads[child_tid];
252  register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
253 #if KMP_CACHE_MANAGE
254  // Prefetch next thread's arrived count
255  if (child+1 <= branch_factor && child_tid+1 < nproc)
256  KMP_CACHE_PREFETCH(&other_threads[child_tid+1]->th.th_bar[bt].bb.b_arrived);
257 #endif /* KMP_CACHE_MANAGE */
258  KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
259  "arrived(%p) == %llu\n", gtid, team->t.t_id, tid,
260  __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid,
261  &child_bar->b_arrived, new_state));
262  // Wait for child to arrive
263  kmp_flag_64 flag(&child_bar->b_arrived, new_state);
264  flag.wait(this_thr, FALSE
265  USE_ITT_BUILD_ARG(itt_sync_obj) );
266 #if USE_ITT_BUILD && USE_ITT_NOTIFY
267  // Barrier imbalance - write min of the thread time and a child time to the thread.
268  if (__kmp_forkjoin_frames_mode == 2) {
269  this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
270  child_thr->th.th_bar_min_time);
271  }
272 #endif
273  if (reduce) {
274  KA_TRACE(100, ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
275  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
276  team->t.t_id, child_tid));
277  (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
278  }
279  child++;
280  child_tid++;
281  }
282  while (child <= branch_factor && child_tid < nproc);
283  }
284 
285  if (!KMP_MASTER_TID(tid)) { // Worker threads
286  register kmp_int32 parent_tid = (tid - 1) >> branch_bits;
287 
288  KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
289  "arrived(%p): %llu => %llu\n", gtid, team->t.t_id, tid,
290  __kmp_gtid_from_tid(parent_tid, team), team->t.t_id, parent_tid,
291  &thr_bar->b_arrived, thr_bar->b_arrived,
292  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
293 
294  // Mark arrival to parent thread
295  /* After performing this write, a worker thread may not assume that the team is valid
296  any more - it could be deallocated by the master thread at any time. */
297  kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[parent_tid]);
298  flag.release();
299  } else {
300  // Need to update the team arrived pointer if we are the master thread
301  if (nproc > 1) // New value was already computed above
302  team->t.t_bar[bt].b_arrived = new_state;
303  else
304  team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
305  KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %llu\n",
306  gtid, team->t.t_id, tid, team->t.t_id,
307  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
308  }
309  KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
310  gtid, team->t.t_id, tid, bt));
311 }
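/* Tree numbering used above, for reference: with branch_factor == (1 << branch_bits),
   thread tid gathers children (tid << branch_bits) + 1 through
   (tid << branch_bits) + branch_factor (those that exist) and then signals its parent
   (tid - 1) >> branch_bits.  For example, with branch_bits == 2, tid 1 waits for
   tids 5..8 and reports its subtree's arrival to tid 0. */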
312 
313 static void
314 __kmp_tree_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
315  int propagate_icvs
316  USE_ITT_BUILD_ARG(void *itt_sync_obj) )
317 {
318  KMP_TIME_DEVELOPER_BLOCK(KMP_tree_release);
319  register kmp_team_t *team;
320  register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
321  register kmp_uint32 nproc;
322  register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
323  register kmp_uint32 branch_factor = 1 << branch_bits;
324  register kmp_uint32 child;
325  register kmp_uint32 child_tid;
326 
327  // Perform a tree release for all of the threads that have been gathered
328  if (!KMP_MASTER_TID(tid)) { // Handle fork barrier workers who aren't part of a team yet
329  KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n",
330  gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
331  // Wait for parent thread to release us
332  kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
333  flag.wait(this_thr, TRUE
334  USE_ITT_BUILD_ARG(itt_sync_obj) );
335 #if USE_ITT_BUILD && USE_ITT_NOTIFY
336  if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
337  // In fork barrier where we could not get the object reliably (or ITTNOTIFY is disabled)
338  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
339  // Cancel wait on previous parallel region...
340  __kmp_itt_task_starting(itt_sync_obj);
341 
342  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
343  return;
344 
345  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
346  if (itt_sync_obj != NULL)
347  // Call prepare as early as possible for "new" barrier
348  __kmp_itt_task_finished(itt_sync_obj);
349  } else
350 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
351  // Early exit for reaping threads releasing forkjoin barrier
352  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
353  return;
354 
355  // The worker thread may now assume that the team is valid.
356  team = __kmp_threads[gtid]->th.th_team;
357  KMP_DEBUG_ASSERT(team != NULL);
358  tid = __kmp_tid_from_gtid(gtid);
359 
360  TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
361  KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
362  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
363  KMP_MB(); // Flush all pending memory write invalidates.
364  } else {
365  team = __kmp_threads[gtid]->th.th_team;
366  KMP_DEBUG_ASSERT(team != NULL);
367  KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
368  gtid, team->t.t_id, tid, bt));
369  }
370  nproc = this_thr->th.th_team_nproc;
371  child_tid = (tid << branch_bits) + 1;
372 
373  if (child_tid < nproc) {
374  register kmp_info_t **other_threads = team->t.t_threads;
375  child = 1;
376  // Parent threads release all their children
377  do {
378  register kmp_info_t *child_thr = other_threads[child_tid];
379  register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
380 #if KMP_CACHE_MANAGE
381  // Prefetch next thread's go count
382  if (child+1 <= branch_factor && child_tid+1 < nproc)
383  KMP_CACHE_PREFETCH(&other_threads[child_tid+1]->th.th_bar[bt].bb.b_go);
384 #endif /* KMP_CACHE_MANAGE */
385 
386 #if KMP_BARRIER_ICV_PUSH
387  {
388  KMP_TIME_DEVELOPER_BLOCK(USER_icv_copy);
389  if (propagate_icvs) {
390  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[child_tid],
391  team, child_tid, FALSE);
392  copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
393  &team->t.t_implicit_task_taskdata[0].td_icvs);
394  }
395  }
396 #endif // KMP_BARRIER_ICV_PUSH
397  KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
398  "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
399  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
400  child_tid, &child_bar->b_go, child_bar->b_go,
401  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
402  // Release child from barrier
403  kmp_flag_64 flag(&child_bar->b_go, child_thr);
404  flag.release();
405  child++;
406  child_tid++;
407  }
408  while (child <= branch_factor && child_tid < nproc);
409  }
410  KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
411  gtid, team->t.t_id, tid, bt));
412 }
413 
414 
415 // Hyper Barrier
416 static void
417 __kmp_hyper_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
418  void (*reduce)(void *, void *)
419  USE_ITT_BUILD_ARG(void *itt_sync_obj) )
420 {
421  KMP_TIME_DEVELOPER_BLOCK(KMP_hyper_gather);
422  register kmp_team_t *team = this_thr->th.th_team;
423  register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
424  register kmp_info_t **other_threads = team->t.t_threads;
425  register kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE;
426  register kmp_uint32 num_threads = this_thr->th.th_team_nproc;
427  register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
428  register kmp_uint32 branch_factor = 1 << branch_bits;
429  register kmp_uint32 offset;
430  register kmp_uint32 level;
431 
432  KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
433  gtid, team->t.t_id, tid, bt));
434 
435  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
436 
437 #if USE_ITT_BUILD && USE_ITT_NOTIFY
438  // Barrier imbalance - save arrive time to the thread
439  if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
440  this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
441  }
442 #endif
443  /* Perform a hypercube-embedded tree gather to wait until all of the threads have
444  arrived, and reduce any required data as we go. */
445  kmp_flag_64 p_flag(&thr_bar->b_arrived);
446  for (level=0, offset=1; offset<num_threads; level+=branch_bits, offset<<=branch_bits)
447  {
448  register kmp_uint32 child;
449  register kmp_uint32 child_tid;
450 
451  if (((tid >> level) & (branch_factor - 1)) != 0) {
452  register kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) -1);
453 
454  KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
455  "arrived(%p): %llu => %llu\n", gtid, team->t.t_id, tid,
456  __kmp_gtid_from_tid(parent_tid, team), team->t.t_id, parent_tid,
457  &thr_bar->b_arrived, thr_bar->b_arrived,
458  thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
459  // Mark arrival to parent thread
460  /* After performing this write (in the last iteration of the enclosing for loop),
461  a worker thread may not assume that the team is valid any more - it could be
462  deallocated by the master thread at any time. */
463  p_flag.set_waiter(other_threads[parent_tid]);
464  p_flag.release();
465  break;
466  }
467 
468  // Parent threads wait for children to arrive
469  if (new_state == KMP_BARRIER_UNUSED_STATE)
470  new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
471  for (child=1, child_tid=tid+(1 << level); child<branch_factor && child_tid<num_threads;
472  child++, child_tid+=(1 << level))
473  {
474  register kmp_info_t *child_thr = other_threads[child_tid];
475  register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
476 #if KMP_CACHE_MANAGE
477  register kmp_uint32 next_child_tid = child_tid + (1 << level);
478  // Prefetch next thread's arrived count
479  if (child+1 < branch_factor && next_child_tid < num_threads)
480  KMP_CACHE_PREFETCH(&other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
481 #endif /* KMP_CACHE_MANAGE */
482  KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
483  "arrived(%p) == %llu\n", gtid, team->t.t_id, tid,
484  __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid,
485  &child_bar->b_arrived, new_state));
486  // Wait for child to arrive
487  kmp_flag_64 c_flag(&child_bar->b_arrived, new_state);
488  c_flag.wait(this_thr, FALSE
489  USE_ITT_BUILD_ARG(itt_sync_obj) );
490 #if USE_ITT_BUILD && USE_ITT_NOTIFY
491  // Barrier imbalance - write min of the thread time and a child time to the thread.
492  if (__kmp_forkjoin_frames_mode == 2) {
493  this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
494  child_thr->th.th_bar_min_time);
495  }
496 #endif
497  if (reduce) {
498  KA_TRACE(100, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
499  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
500  team->t.t_id, child_tid));
501  (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
502  }
503  }
504  }
505 
506  if (KMP_MASTER_TID(tid)) {
507  // Need to update the team arrived pointer if we are the master thread
508  if (new_state == KMP_BARRIER_UNUSED_STATE)
509  team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
510  else
511  team->t.t_bar[bt].b_arrived = new_state;
512  KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %llu\n",
513  gtid, team->t.t_id, tid, team->t.t_id,
514  &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
515  }
516  KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
517  gtid, team->t.t_id, tid, bt));
518 }
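/* The hypercube gather above pairs threads level by level: at level L, a thread
   whose tid has a nonzero digit at that position, i.e.
   ((tid >> L) & (branch_factor - 1)) != 0, signals the subcube root
   tid & ~((1 << (L + branch_bits)) - 1) and drops out, while subcube roots wait for
   children at tid + k * (1 << L).  For example, with branch_factor == 2 and 8
   threads, level 0 pairs (0,1) (2,3) (4,5) (6,7), level 1 pairs (0,2) (4,6), and
   level 2 pairs (0,4), so the master has observed every arrival after log2(nproc)
   rounds. */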
519 
520 // The reverse versions seem to beat the forward versions overall
521 #define KMP_REVERSE_HYPER_BAR
522 static void
523 __kmp_hyper_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
524  int propagate_icvs
525  USE_ITT_BUILD_ARG(void *itt_sync_obj) )
526 {
527  KMP_TIME_DEVELOPER_BLOCK(KMP_hyper_release);
528  register kmp_team_t *team;
529  register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
530  register kmp_info_t **other_threads;
531  register kmp_uint32 num_threads;
532  register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
533  register kmp_uint32 branch_factor = 1 << branch_bits;
534  register kmp_uint32 child;
535  register kmp_uint32 child_tid;
536  register kmp_uint32 offset;
537  register kmp_uint32 level;
538 
539  /* Perform a hypercube-embedded tree release for all of the threads that have been gathered.
540  If KMP_REVERSE_HYPER_BAR is defined (default) the threads are released in the reverse
541  order of the corresponding gather, otherwise threads are released in the same order. */
542  if (KMP_MASTER_TID(tid)) { // master
543  team = __kmp_threads[gtid]->th.th_team;
544  KMP_DEBUG_ASSERT(team != NULL);
545  KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
546  gtid, team->t.t_id, tid, bt));
547 #if KMP_BARRIER_ICV_PUSH
548  if (propagate_icvs) { // master already has ICVs in final destination; copy
549  copy_icvs(&thr_bar->th_fixed_icvs, &team->t.t_implicit_task_taskdata[tid].td_icvs);
550  }
551 #endif
552  }
553  else { // Handle fork barrier workers who aren't part of a team yet
554  KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n",
555  gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
556  // Wait for parent thread to release us
557  kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
558  flag.wait(this_thr, TRUE
559  USE_ITT_BUILD_ARG(itt_sync_obj) );
560 #if USE_ITT_BUILD && USE_ITT_NOTIFY
561  if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
562  // In fork barrier where we could not get the object reliably
563  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
564  // Cancel wait on previous parallel region...
565  __kmp_itt_task_starting(itt_sync_obj);
566 
567  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
568  return;
569 
570  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
571  if (itt_sync_obj != NULL)
572  // Call prepare as early as possible for "new" barrier
573  __kmp_itt_task_finished(itt_sync_obj);
574  } else
575 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
576  // Early exit for reaping threads releasing forkjoin barrier
577  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
578  return;
579 
580  // The worker thread may now assume that the team is valid.
581  team = __kmp_threads[gtid]->th.th_team;
582  KMP_DEBUG_ASSERT(team != NULL);
583  tid = __kmp_tid_from_gtid(gtid);
584 
585  TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
586  KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
587  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
588  KMP_MB(); // Flush all pending memory write invalidates.
589  }
590  num_threads = this_thr->th.th_team_nproc;
591  other_threads = team->t.t_threads;
592 
593 #ifdef KMP_REVERSE_HYPER_BAR
594  // Count up to correct level for parent
595  for (level=0, offset=1; offset<num_threads && (((tid>>level) & (branch_factor-1)) == 0);
596  level+=branch_bits, offset<<=branch_bits);
597 
598  // Now go down from there
599  for (level-=branch_bits, offset>>=branch_bits; offset != 0;
600  level-=branch_bits, offset>>=branch_bits)
601 #else
602  // Go down the tree, level by level
603  for (level=0, offset=1; offset<num_threads; level+=branch_bits, offset<<=branch_bits)
604 #endif // KMP_REVERSE_HYPER_BAR
605  {
606 #ifdef KMP_REVERSE_HYPER_BAR
607  /* Now go in reverse order through the children, highest to lowest.
608  Initial setting of child is conservative here. */
609  child = num_threads >> ((level==0)?level:level-1);
610  for (child=(child<branch_factor-1) ? child : branch_factor-1, child_tid=tid+(child<<level);
611  child>=1; child--, child_tid-=(1<<level))
612 #else
613  if (((tid >> level) & (branch_factor - 1)) != 0)
614  // No need to go lower than this, since this is the level parent would be notified
615  break;
616  // Iterate through children on this level of the tree
617  for (child=1, child_tid=tid+(1<<level); child<branch_factor && child_tid<num_threads;
618  child++, child_tid+=(1<<level))
619 #endif // KMP_REVERSE_HYPER_BAR
620  {
621  if (child_tid >= num_threads) continue; // Child doesn't exist so keep going
622  else {
623  register kmp_info_t *child_thr = other_threads[child_tid];
624  register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
625 #if KMP_CACHE_MANAGE
626  register kmp_uint32 next_child_tid = child_tid - (1 << level);
627  // Prefetch next thread's go count
628 # ifdef KMP_REVERSE_HYPER_BAR
629  if (child-1 >= 1 && next_child_tid < num_threads)
630 # else
631  if (child+1 < branch_factor && next_child_tid < num_threads)
632 # endif // KMP_REVERSE_HYPER_BAR
633  KMP_CACHE_PREFETCH(&other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
634 #endif /* KMP_CACHE_MANAGE */
635 
636 #if KMP_BARRIER_ICV_PUSH
637  if (propagate_icvs) // push my fixed ICVs to my child
638  copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
639 #endif // KMP_BARRIER_ICV_PUSH
640 
641  KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
642  "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
643  __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
644  child_tid, &child_bar->b_go, child_bar->b_go,
645  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
646  // Release child from barrier
647  kmp_flag_64 flag(&child_bar->b_go, child_thr);
648  flag.release();
649  }
650  }
651  }
652 #if KMP_BARRIER_ICV_PUSH
653  if (propagate_icvs && !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
654  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
655  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, &thr_bar->th_fixed_icvs);
656  }
657 #endif
658  KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
659  gtid, team->t.t_id, tid, bt));
660 }
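/* The release mirrors the gather: with KMP_REVERSE_HYPER_BAR each parent walks its
   subcube from the highest level downward and releases children from the highest
   child index down, i.e. roughly in the reverse of the order in which they were
   gathered.  ICVs travel through thr_bar->th_fixed_icvs: the master copies its ICVs
   there, each interior thread pushes that copy into its children's th_fixed_icvs as
   it releases them, and every non-master thread finally copies it into its own
   implicit task's ICVs. */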
661 
662 // Hierarchical Barrier
663 
664 // Initialize thread barrier data
665 /* Initializes/re-initializes the hierarchical barrier data stored on a thread. Performs the
666  minimum amount of initialization required based on how the team has changed. Returns true if
667  leaf children will require both on-core and traditional wake-up mechanisms. For example, if the
668  team size increases, threads already in the team will respond to on-core wakeup on their parent
669  thread, but threads newly added to the team will only be listening on their local b_go. */
670 static bool
671 __kmp_init_hierarchical_barrier_thread(enum barrier_type bt, kmp_bstate_t *thr_bar, kmp_uint32 nproc,
672  int gtid, int tid, kmp_team_t *team)
673 {
674  // Checks to determine if (re-)initialization is needed
675  bool uninitialized = thr_bar->team == NULL;
676  bool team_changed = team != thr_bar->team;
677  bool team_sz_changed = nproc != thr_bar->nproc;
678  bool tid_changed = tid != thr_bar->old_tid;
679  bool retval = false;
680 
681  if (uninitialized || team_sz_changed) {
682  __kmp_get_hierarchy(nproc, thr_bar);
683  }
684 
685  if (uninitialized || team_sz_changed || tid_changed) {
686  thr_bar->my_level = thr_bar->depth-1; // default for master
687  thr_bar->parent_tid = -1; // default for master
688  if (!KMP_MASTER_TID(tid)) { // if not master, find parent thread in hierarchy
689  kmp_uint32 d=0;
690  while (d<thr_bar->depth) { // find parent based on level of thread in hierarchy, and note level
691  kmp_uint32 rem;
692  if (d == thr_bar->depth-2) { // reached level right below the master
693  thr_bar->parent_tid = 0;
694  thr_bar->my_level = d;
695  break;
696  }
697  else if ((rem = tid%thr_bar->skip_per_level[d+1]) != 0) { // TODO: can we make this op faster?
698  // thread is not a subtree root at the next level, so this is its level in the hierarchy
699  thr_bar->parent_tid = tid - rem;
700  thr_bar->my_level = d;
701  break;
702  }
703  ++d;
704  }
705  }
706  thr_bar->offset = 7-(tid-thr_bar->parent_tid-1);
707  thr_bar->old_tid = tid;
708  thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
709  thr_bar->team = team;
710  thr_bar->parent_bar = &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
711  }
712  if (uninitialized || team_changed || tid_changed) {
713  thr_bar->team = team;
714  thr_bar->parent_bar = &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
715  retval = true;
716  }
717  if (uninitialized || team_sz_changed || tid_changed) {
718  thr_bar->nproc = nproc;
719  thr_bar->leaf_kids = thr_bar->base_leaf_kids;
720  if (thr_bar->my_level == 0) thr_bar->leaf_kids=0;
721  if (thr_bar->leaf_kids && (kmp_uint32)tid+thr_bar->leaf_kids+1 > nproc)
722  thr_bar->leaf_kids = nproc - tid - 1;
723  thr_bar->leaf_state = 0;
724  for (int i=0; i<thr_bar->leaf_kids; ++i) ((char *)&(thr_bar->leaf_state))[7-i] = 1;
725  }
726  return retval;
727 }
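/* The leaf bookkeeping above packs a parent's on-core (leaf) children into the high
   bytes of its 64-bit flag words: a leaf that is k positions after its parent uses
   byte 7 - (k - 1) (stored in thr_bar->offset), and the parent's leaf_state has one
   byte set per leaf kid, counting down from byte 7.  This lets the parent wait for
   all of its leaves with a single 64-bit flag value and clear their check-in bits
   with one atomic AND in the gather code below. */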
728 
729 static void
730 __kmp_hierarchical_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr,
731  int gtid, int tid, void (*reduce) (void *, void *)
732  USE_ITT_BUILD_ARG(void * itt_sync_obj) )
733 {
734  KMP_TIME_DEVELOPER_BLOCK(KMP_hier_gather);
735  register kmp_team_t *team = this_thr->th.th_team;
736  register kmp_bstate_t *thr_bar = & this_thr->th.th_bar[bt].bb;
737  register kmp_uint32 nproc = this_thr->th.th_team_nproc;
738  register kmp_info_t **other_threads = team->t.t_threads;
739  register kmp_uint64 new_state;
740 
741  int level = team->t.t_level;
742 #if OMP_40_ENABLED
743  if (other_threads[0]->th.th_teams_microtask) // are we inside the teams construct?
744  if (this_thr->th.th_teams_size.nteams > 1)
745  ++level; // level was not increased in teams construct for team_of_masters
746 #endif
747  if (level == 1) thr_bar->use_oncore_barrier = 1;
748  else thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
749 
750  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
751  gtid, team->t.t_id, tid, bt));
752  KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
753 
754 #if USE_ITT_BUILD && USE_ITT_NOTIFY
755  // Barrier imbalance - save arrive time to the thread
756  if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
757  this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
758  }
759 #endif
760 
761  (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid, team);
762 
763  if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
764  register kmp_int32 child_tid;
765  new_state = (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
766  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) {
767  if (thr_bar->leaf_kids) { // First, wait for leaf children to check-in on my b_arrived flag
768  kmp_uint64 leaf_state = KMP_MASTER_TID(tid) ? thr_bar->b_arrived | thr_bar->leaf_state : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
769  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting for leaf kids\n",
770  gtid, team->t.t_id, tid));
771  kmp_flag_64 flag(&thr_bar->b_arrived, leaf_state);
772  flag.wait(this_thr, FALSE
773  USE_ITT_BUILD_ARG(itt_sync_obj) );
774  if (reduce) {
775  for (child_tid=tid+1; child_tid<=tid+thr_bar->leaf_kids; ++child_tid) {
776  KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
777  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
778  team->t.t_id, child_tid));
779  (*reduce)(this_thr->th.th_local.reduce_data, other_threads[child_tid]->th.th_local.reduce_data);
780  }
781  }
782  (void) KMP_TEST_THEN_AND64((volatile kmp_int64 *)&thr_bar->b_arrived, ~(thr_bar->leaf_state)); // clear leaf_state bits
783  }
784  // Next, wait for higher level children on each child's b_arrived flag
785  for (kmp_uint32 d=1; d<thr_bar->my_level; ++d) { // gather lowest level threads first, but skip 0
786  kmp_uint32 last = tid+thr_bar->skip_per_level[d+1], skip = thr_bar->skip_per_level[d];
787  if (last > nproc) last = nproc;
788  for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
789  register kmp_info_t *child_thr = other_threads[child_tid];
790  register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
791  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
792  "arrived(%p) == %llu\n",
793  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
794  team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
795  kmp_flag_64 flag(&child_bar->b_arrived, new_state);
796  flag.wait(this_thr, FALSE
797  USE_ITT_BUILD_ARG(itt_sync_obj) );
798  if (reduce) {
799  KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
800  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
801  team->t.t_id, child_tid));
802  (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
803  }
804  }
805  }
806  }
807  else { // Blocktime is not infinite
808  for (kmp_uint32 d=0; d<thr_bar->my_level; ++d) { // Gather lowest level threads first
809  kmp_uint32 last = tid+thr_bar->skip_per_level[d+1], skip = thr_bar->skip_per_level[d];
810  if (last > nproc) last = nproc;
811  for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
812  register kmp_info_t *child_thr = other_threads[child_tid];
813  register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
814  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
815  "arrived(%p) == %llu\n",
816  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
817  team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
818  kmp_flag_64 flag(&child_bar->b_arrived, new_state);
819  flag.wait(this_thr, FALSE
820  USE_ITT_BUILD_ARG(itt_sync_obj) );
821  if (reduce) {
822  KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
823  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
824  team->t.t_id, child_tid));
825  (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
826  }
827  }
828  }
829  }
830  }
831  // All subordinates are gathered; now release parent if not master thread
832 
833  if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
834  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
835  "arrived(%p): %llu => %llu\n", gtid, team->t.t_id, tid,
836  __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id, thr_bar->parent_tid,
837  &thr_bar->b_arrived, thr_bar->b_arrived, thr_bar->b_arrived+KMP_BARRIER_STATE_BUMP));
838  /* Mark arrival to parent: After performing this write, a worker thread may not assume that
839  the team is valid any more - it could be deallocated by the master thread at any time. */
840  if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME
841  || !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived flag; release it
842  kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[thr_bar->parent_tid]);
843  flag.release();
844  }
845  else { // Leaf does special release on the "offset" bits of parent's b_arrived flag
846  thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
847  kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived, thr_bar->offset);
848  flag.set_waiter(other_threads[thr_bar->parent_tid]);
849  flag.release();
850  }
851  } else { // Master thread needs to update the team's b_arrived value
852  team->t.t_bar[bt].b_arrived = new_state;
853  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %llu\n",
854  gtid, team->t.t_id, tid, team->t.t_id, &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
855  }
856  // Is the team access below unsafe or just technically invalid?
857  KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
858  gtid, team->t.t_id, tid, bt));
859 }
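/* With infinite blocktime and the on-core barrier enabled, a leaf checks in by
   setting its byte (thr_bar->offset) in the parent's b_arrived word through a
   kmp_flag_oncore release, so the parent collects its whole core with one wait on
   the combined leaf_state and one KMP_TEST_THEN_AND64 to clear those bits; only
   non-leaf children (and all children when blocktime is finite) go through the
   ordinary per-thread b_arrived handshake. */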
860 
861 static void
862 __kmp_hierarchical_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
863  int propagate_icvs
864  USE_ITT_BUILD_ARG(void * itt_sync_obj) )
865 {
866  KMP_TIME_DEVELOPER_BLOCK(KMP_hier_release);
867  register kmp_team_t *team;
868  register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
869  register kmp_uint32 nproc;
870  bool team_change = false; // indicates on-core barrier shouldn't be used
871 
872  if (KMP_MASTER_TID(tid)) {
873  team = __kmp_threads[gtid]->th.th_team;
874  KMP_DEBUG_ASSERT(team != NULL);
875  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) master entered barrier type %d\n",
876  gtid, team->t.t_id, tid, bt));
877  }
878  else { // Worker threads
879  // Wait for parent thread to release me
880  if (!thr_bar->use_oncore_barrier || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME
881  || thr_bar->my_level != 0 || thr_bar->team == NULL) {
882  // Use traditional method of waiting on my own b_go flag
883  thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
884  kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
885  flag.wait(this_thr, TRUE
886  USE_ITT_BUILD_ARG(itt_sync_obj) );
887  TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
888  }
889  else { // Thread barrier data is initialized, this is a leaf, blocktime is infinite, not nested
890  // Wait on my "offset" bits on parent's b_go flag
891  thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
892  kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP, thr_bar->offset,
893  bt, this_thr
894  USE_ITT_BUILD_ARG(itt_sync_obj) );
895  flag.wait(this_thr, TRUE);
896  if (thr_bar->wait_flag == KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
897  TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
898  }
899  else { // Reset my bits on parent's b_go flag
900  ((char*)&(thr_bar->parent_bar->b_go))[thr_bar->offset] = 0;
901  }
902  }
903  thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
904  // Early exit for reaping threads releasing forkjoin barrier
905  if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
906  return;
907  // The worker thread may now assume that the team is valid.
908  team = __kmp_threads[gtid]->th.th_team;
909  KMP_DEBUG_ASSERT(team != NULL);
910  tid = __kmp_tid_from_gtid(gtid);
911 
912  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
913  gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
914  KMP_MB(); // Flush all pending memory write invalidates.
915  }
916 
917  nproc = this_thr->th.th_team_nproc;
918  int level = team->t.t_level;
919 #if OMP_40_ENABLED
920  if (team->t.t_threads[0]->th.th_teams_microtask ) { // are we inside the teams construct?
921  if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && this_thr->th.th_teams_level == level)
922  ++level; // level was not increased in teams construct for team_of_workers
923  if( this_thr->th.th_teams_size.nteams > 1 )
924  ++level; // level was not increased in teams construct for team_of_masters
925  }
926 #endif
927  if (level == 1) thr_bar->use_oncore_barrier = 1;
928  else thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
929 
930  // If the team size has increased, we still communicate with old leaves via oncore barrier.
931  unsigned short int old_leaf_kids = thr_bar->leaf_kids;
932  kmp_uint64 old_leaf_state = thr_bar->leaf_state;
933  team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid, team);
934  // But if the entire team changes, we won't use oncore barrier at all
935  if (team_change) old_leaf_kids = 0;
936 
937 #if KMP_BARRIER_ICV_PUSH
938  if (propagate_icvs) {
939  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
940  if (KMP_MASTER_TID(tid)) { // master already has copy in final destination; copy
941  copy_icvs(&thr_bar->th_fixed_icvs, &team->t.t_implicit_task_taskdata[tid].td_icvs);
942  }
943  else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) { // optimization for inf blocktime
944  if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
945  // leaves (on-core children) pull parent's fixed ICVs directly to local ICV store
946  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
947  &thr_bar->parent_bar->th_fixed_icvs);
948  // non-leaves will get ICVs piggybacked with b_go via NGO store
949  }
950  else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
951  if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can access
952  copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
953  else // leaves copy parent's fixed ICVs directly to local ICV store
954  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
955  &thr_bar->parent_bar->th_fixed_icvs);
956  }
957  }
958 #endif // KMP_BARRIER_ICV_PUSH
959 
960  // Now, release my children
961  if (thr_bar->my_level) { // not a leaf
962  register kmp_int32 child_tid;
963  kmp_uint32 last;
964  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) {
965  if (KMP_MASTER_TID(tid)) { // do a flat release
966  // Set local b_go to bump children via NGO store of the cache line containing ICVs and b_go.
967  thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
968  // Use ngo stores if available; b_go piggybacks in the last 8 bytes of the cache line
969  ngo_load(&thr_bar->th_fixed_icvs);
970  // This loops over all the threads skipping only the leaf nodes in the hierarchy
971  for (child_tid=thr_bar->skip_per_level[1]; child_tid<(int)nproc; child_tid+=thr_bar->skip_per_level[1]) {
972  register kmp_bstate_t *child_bar = &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
973  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d)"
974  " go(%p): %u => %u\n",
975  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
976  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
977  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
978  // Use ngo store (if available) to both store ICVs and release child via child's b_go
979  ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
980  }
981  ngo_sync();
982  }
983  TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
984  // Now, release leaf children
985  if (thr_bar->leaf_kids) { // if there are any
986  // We test team_change on the off-chance that the level 1 team changed.
987  if (team_change || old_leaf_kids < thr_bar->leaf_kids) { // some old leaf_kids, some new
988  if (old_leaf_kids) { // release old leaf kids
989  thr_bar->b_go |= old_leaf_state;
990  }
991  // Release new leaf kids
992  last = tid+thr_bar->skip_per_level[1];
993  if (last > nproc) last = nproc;
994  for (child_tid=tid+1+old_leaf_kids; child_tid<(int)last; ++child_tid) { // skip_per_level[0]=1
995  register kmp_info_t *child_thr = team->t.t_threads[child_tid];
996  register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
997  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
998  " T#%d(%d:%d) go(%p): %u => %u\n",
999  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1000  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1001  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1002  // Release child using child's b_go flag
1003  kmp_flag_64 flag(&child_bar->b_go, child_thr);
1004  flag.release();
1005  }
1006  }
1007  else { // Release all children at once with leaf_state bits on my own b_go flag
1008  thr_bar->b_go |= thr_bar->leaf_state;
1009  }
1010  }
1011  }
1012  else { // Blocktime is not infinite; do a simple hierarchical release
1013  for (int d=thr_bar->my_level-1; d>=0; --d) { // Release highest level threads first
1014  last = tid+thr_bar->skip_per_level[d+1];
1015  kmp_uint32 skip = thr_bar->skip_per_level[d];
1016  if (last > nproc) last = nproc;
1017  for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
1018  register kmp_info_t *child_thr = team->t.t_threads[child_tid];
1019  register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1020  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d)"
1021  " go(%p): %u => %u\n",
1022  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1023  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1024  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1025  // Release child using child's b_go flag
1026  kmp_flag_64 flag(&child_bar->b_go, child_thr);
1027  flag.release();
1028  }
1029  }
1030  }
1031 #if KMP_BARRIER_ICV_PUSH
1032  if (propagate_icvs && !KMP_MASTER_TID(tid)) // non-leaves copy ICVs from fixed ICVs to local dest
1033  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, &thr_bar->th_fixed_icvs);
1034 #endif // KMP_BARRIER_ICV_PUSH
1035  }
1036  KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
1037  gtid, team->t.t_id, tid, bt));
1038 }
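/* In the on-core release path the master's ngo_store_go() writes a child's entire
   th_fixed_icvs cache line, whose last 8 bytes are that child's b_go, so a single
   streaming store both delivers the ICVs and wakes a non-leaf child.  Leaf children
   are woken either through their own b_go (newly added leaves) or all at once by
   OR-ing leaf_state into the parent's b_go word that they are spinning on. */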
1039 
1040 // ---------------------------- End of Barrier Algorithms ----------------------------
1041 
1042 // Internal function to do a barrier.
1043 /* If is_split is true, do a split barrier, otherwise, do a plain barrier.
1044  If reduce is non-NULL, do a split reduction barrier, otherwise, do a regular barrier.
1045  Returns 0 if master thread, 1 if worker thread. */
1046 int
1047 __kmp_barrier(enum barrier_type bt, int gtid, int is_split, size_t reduce_size,
1048  void *reduce_data, void (*reduce)(void *, void *))
1049 {
1050  KMP_TIME_DEVELOPER_BLOCK(KMP_barrier);
1051  KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1052  KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier);
1053  register int tid = __kmp_tid_from_gtid(gtid);
1054  register kmp_info_t *this_thr = __kmp_threads[gtid];
1055  register kmp_team_t *team = this_thr->th.th_team;
1056  register int status = 0;
1057  ident_t *loc = __kmp_threads[gtid]->th.th_ident;
1058 #if OMPT_SUPPORT
1059  ompt_task_id_t my_task_id;
1060  ompt_parallel_id_t my_parallel_id;
1061 #endif
1062 
1063  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n",
1064  gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1065 
1066 #if OMPT_SUPPORT
1067  if (ompt_enabled) {
1068 #if OMPT_BLAME
1069  my_task_id = team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id;
1070  my_parallel_id = team->t.ompt_team_info.parallel_id;
1071 
1072 #if OMPT_TRACE
1073  if (this_thr->th.ompt_thread_info.state == ompt_state_wait_single) {
1074  if (ompt_callbacks.ompt_callback(ompt_event_single_others_end)) {
1075  ompt_callbacks.ompt_callback(ompt_event_single_others_end)(
1076  my_parallel_id, my_task_id);
1077  }
1078  }
1079 #endif
1080  if (ompt_callbacks.ompt_callback(ompt_event_barrier_begin)) {
1081  ompt_callbacks.ompt_callback(ompt_event_barrier_begin)(
1082  my_parallel_id, my_task_id);
1083  }
1084 #endif
1085  // It is OK to report the barrier state after the barrier begin callback.
1086  // According to the OMPT specification, a compliant implementation may
1087  // even delay reporting this state until the barrier begins to wait.
1088  this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
1089  }
1090 #endif
1091 
1092  if (! team->t.t_serialized) {
1093 #if USE_ITT_BUILD
1094  // This value will be used in itt notify events below.
1095  void *itt_sync_obj = NULL;
1096 # if USE_ITT_NOTIFY
1097  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1098  itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1099 # endif
1100 #endif /* USE_ITT_BUILD */
1101  if (__kmp_tasking_mode == tskm_extra_barrier) {
1102  __kmp_tasking_barrier(team, this_thr, gtid);
1103  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n",
1104  gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1105  }
1106 
1107  /* Copy the blocktime info to the thread, where __kmp_wait_template() can access it when
1108  the team struct is not guaranteed to exist. */
1109  // See note about the corresponding code in __kmp_join_barrier() being performance-critical.
1110  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1111  this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1112  this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1113  }
1114 
1115 #if USE_ITT_BUILD
1116  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1117  __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1118 #endif /* USE_ITT_BUILD */
1119 #if USE_DEBUGGER
1120  // Let the debugger know: the thread has arrived at the barrier and is waiting.
1121  if (KMP_MASTER_TID(tid)) { // Master counter is stored in team structure.
1122  team->t.t_bar[bt].b_master_arrived += 1;
1123  } else {
1124  this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
1125  } // if
1126 #endif /* USE_DEBUGGER */
1127  if (reduce != NULL) {
1128  //KMP_DEBUG_ASSERT( is_split == TRUE ); // #C69956
1129  this_thr->th.th_local.reduce_data = reduce_data;
1130  }
1131 
1132  if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec)
1133  __kmp_task_team_setup(this_thr, team, 0); // use 0 to only setup the current team if nthreads > 1
1134 
1135  switch (__kmp_barrier_gather_pattern[bt]) {
1136  case bp_hyper_bar: {
1137  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits to 0; use linear
1138  __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid, reduce
1139  USE_ITT_BUILD_ARG(itt_sync_obj) );
1140  break;
1141  }
1142  case bp_hierarchical_bar: {
1143  __kmp_hierarchical_barrier_gather(bt, this_thr, gtid, tid, reduce
1144  USE_ITT_BUILD_ARG(itt_sync_obj));
1145  break;
1146  }
1147  case bp_tree_bar: {
1148  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits to 0; use linear
1149  __kmp_tree_barrier_gather(bt, this_thr, gtid, tid, reduce
1150  USE_ITT_BUILD_ARG(itt_sync_obj) );
1151  break;
1152  }
1153  default: {
1154  __kmp_linear_barrier_gather(bt, this_thr, gtid, tid, reduce
1155  USE_ITT_BUILD_ARG(itt_sync_obj) );
1156  }
1157  }
1158 
1159  KMP_MB();
1160 
1161  if (KMP_MASTER_TID(tid)) {
1162  status = 0;
1163  if (__kmp_tasking_mode != tskm_immediate_exec) {
1164  __kmp_task_team_wait(this_thr, team
1165  USE_ITT_BUILD_ARG(itt_sync_obj) );
1166  }
1167 #if USE_DEBUGGER
1168  // Let the debugger know: all threads have arrived and are starting to leave the barrier.
1169  team->t.t_bar[bt].b_team_arrived += 1;
1170 #endif
1171 
1172 #if USE_ITT_BUILD
1173  /* TODO: In case of split reduction barrier, master thread may send acquired event early,
1174  before the final summation into the shared variable is done (final summation can be a
1175  long operation for array reductions). */
1176  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1177  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1178 #endif /* USE_ITT_BUILD */
1179 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1180  // Barrier - report frame end (only if active_level == 1)
1181  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && __kmp_forkjoin_frames_mode &&
1182 #if OMP_40_ENABLED
1183  this_thr->th.th_teams_microtask == NULL &&
1184 #endif
1185  team->t.t_active_level == 1)
1186  {
1187  kmp_uint64 cur_time = __itt_get_timestamp();
1188  kmp_info_t **other_threads = team->t.t_threads;
1189  int nproc = this_thr->th.th_team_nproc;
1190  int i;
1191  switch(__kmp_forkjoin_frames_mode) {
1192  case 1:
1193  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1194  this_thr->th.th_frame_time = cur_time;
1195  break;
1196  case 2: // AC 2015-01-19: currently does not work for hierarchical (to be fixed)
1197  __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1, loc, nproc);
1198  break;
1199  case 3:
1200  if( __itt_metadata_add_ptr ) {
1201  // Initialize with master's wait time
1202  kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1203  // Set arrive time to zero to be able to check it in __kmp_invoke_task(); the same is done inside the loop below
1204  this_thr->th.th_bar_arrive_time = 0;
1205  for (i=1; i<nproc; ++i) {
1206  delta += ( cur_time - other_threads[i]->th.th_bar_arrive_time );
1207  other_threads[i]->th.th_bar_arrive_time = 0;
1208  }
1209  __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, cur_time, delta, (kmp_uint64)( reduce != NULL));
1210  }
1211  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1212  this_thr->th.th_frame_time = cur_time;
1213  break;
1214  }
1215  }
1216 #endif /* USE_ITT_BUILD */
1217  } else {
1218  status = 1;
1219 #if USE_ITT_BUILD
1220  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1221  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1222 #endif /* USE_ITT_BUILD */
1223  }
1224  if (status == 1 || ! is_split) {
1225  switch (__kmp_barrier_release_pattern[bt]) {
1226  case bp_hyper_bar: {
1227  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1228  __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, FALSE
1229  USE_ITT_BUILD_ARG(itt_sync_obj) );
1230  break;
1231  }
1232  case bp_hierarchical_bar: {
1233  __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid, FALSE
1234  USE_ITT_BUILD_ARG(itt_sync_obj) );
1235  break;
1236  }
1237  case bp_tree_bar: {
1238  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1239  __kmp_tree_barrier_release(bt, this_thr, gtid, tid, FALSE
1240  USE_ITT_BUILD_ARG(itt_sync_obj) );
1241  break;
1242  }
1243  default: {
1244  __kmp_linear_barrier_release(bt, this_thr, gtid, tid, FALSE
1245  USE_ITT_BUILD_ARG(itt_sync_obj) );
1246  }
1247  }
1248  if (__kmp_tasking_mode != tskm_immediate_exec) {
1249  __kmp_task_team_sync(this_thr, team);
1250  }
1251  }
1252 
1253 #if USE_ITT_BUILD
1254  /* GEH: TODO: Move this under if-condition above and also include in
1255  __kmp_end_split_barrier(). This will more accurately represent the actual release time
1256  of the threads for split barriers. */
1257  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1258  __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1259 #endif /* USE_ITT_BUILD */
1260  } else { // Team is serialized.
1261  status = 0;
1262  if (__kmp_tasking_mode != tskm_immediate_exec) {
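 // A barrier in a serialized team is normally a no-op, but with OpenMP 4.5 proxy tasks
 // a task team may exist even here and has to be waited on before continuing.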
1263 #if OMP_45_ENABLED
1264  if ( this_thr->th.th_task_team != NULL ) {
1265  void *itt_sync_obj = NULL;
1266 #if USE_ITT_NOTIFY
1267  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1268  itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1269  __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1270  }
1271 #endif
1272 
1273  KMP_DEBUG_ASSERT(this_thr->th.th_task_team->tt.tt_found_proxy_tasks == TRUE);
1274  __kmp_task_team_wait(this_thr, team
1275  USE_ITT_BUILD_ARG(itt_sync_obj));
1276  __kmp_task_team_setup(this_thr, team, 0);
1277 
1278 #if USE_ITT_BUILD
1279  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1280  __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1281 #endif /* USE_ITT_BUILD */
1282  }
1283 #else
1284  // The task team should be NULL for serialized code (tasks will be executed immediately)
1285  KMP_DEBUG_ASSERT(team->t.t_task_team[this_thr->th.th_task_state] == NULL);
1286  KMP_DEBUG_ASSERT(this_thr->th.th_task_team == NULL);
1287 #endif
1288  }
1289  }
1290  KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
1291  gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid), status));
1292 
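 // OMPT: tell a registered tool that the barrier has ended, then return the thread's
 // state to "working in parallel".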
1293 #if OMPT_SUPPORT
1294  if (ompt_enabled) {
1295 #if OMPT_BLAME
1296  if (ompt_callbacks.ompt_callback(ompt_event_barrier_end)) {
1297  ompt_callbacks.ompt_callback(ompt_event_barrier_end)(
1298  my_parallel_id, my_task_id);
1299  }
1300 #endif
1301  this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1302  }
1303 #endif
1304 
1305  return status;
1306 }
1307 
1308 
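/* A "split" barrier separates the two phases: __kmp_barrier() with is_split == TRUE lets the
   master return after the gather (e.g. to finish the final combine of a reduction) while the
   workers sit in the release phase, and __kmp_end_split_barrier() is what the master then calls
   to wake them. Illustrative sketch only (argument lists abbreviated, not the exact compiler
   interface):

       __kmp_barrier(bs_reduction_barrier, gtid, TRUE, ...);    // gather; workers block
       if (KMP_MASTER_GTID(gtid)) {
           // ... complete the final summation into the shared variable ...
           __kmp_end_split_barrier(bs_reduction_barrier, gtid); // release the workers
       }
*/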
1309 void
1310 __kmp_end_split_barrier(enum barrier_type bt, int gtid)
1311 {
1312  KMP_TIME_DEVELOPER_BLOCK(KMP_end_split_barrier);
1313  int tid = __kmp_tid_from_gtid(gtid);
1314  kmp_info_t *this_thr = __kmp_threads[gtid];
1315  kmp_team_t *team = this_thr->th.th_team;
1316 
1317  if (!team->t.t_serialized) {
1318  if (KMP_MASTER_GTID(gtid)) {
1319  switch (__kmp_barrier_release_pattern[bt]) {
1320  case bp_hyper_bar: {
1321  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1322  __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, FALSE
1323  USE_ITT_BUILD_ARG(NULL) );
1324  break;
1325  }
1326  case bp_hierarchical_bar: {
1327  __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid, FALSE
1328  USE_ITT_BUILD_ARG(NULL));
1329  break;
1330  }
1331  case bp_tree_bar: {
1332  KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1333  __kmp_tree_barrier_release(bt, this_thr, gtid, tid, FALSE
1334  USE_ITT_BUILD_ARG(NULL) );
1335  break;
1336  }
1337  default: {
1338  __kmp_linear_barrier_release(bt, this_thr, gtid, tid, FALSE
1339  USE_ITT_BUILD_ARG(NULL) );
1340  }
1341  }
1342  if (__kmp_tasking_mode != tskm_immediate_exec) {
1343  __kmp_task_team_sync(this_thr, team);
1344  } // if
1345  }
1346  }
1347 }
1348 
1349 
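/* Join barrier: run by every thread of a team at the end of a parallel region (the implicit
   barrier at the closing brace of "#pragma omp parallel" roughly ends up here). Only the gather
   phase is performed; the matching release happens in __kmp_fork_barrier(), where the workers
   then wait for the next parallel region. */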
1350 void
1351 __kmp_join_barrier(int gtid)
1352 {
1353  KMP_TIME_PARTITIONED_BLOCK(OMP_fork_join_barrier);
1354  KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
1355  KMP_TIME_DEVELOPER_BLOCK(KMP_join_barrier);
1356  register kmp_info_t *this_thr = __kmp_threads[gtid];
1357  register kmp_team_t *team;
1358  register kmp_uint nproc;
1359  kmp_info_t *master_thread;
1360  int tid;
1361 #ifdef KMP_DEBUG
1362  int team_id;
1363 #endif /* KMP_DEBUG */
1364 #if USE_ITT_BUILD
1365  void *itt_sync_obj = NULL;
1366 # if USE_ITT_NOTIFY
1367  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call the routine unless it is needed
1368  // Get object created at fork_barrier
1369  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1370 # endif
1371 #endif /* USE_ITT_BUILD */
1372  KMP_MB();
1373 
1374  // Get current info
1375  team = this_thr->th.th_team;
1376  nproc = this_thr->th.th_team_nproc;
1377  KMP_DEBUG_ASSERT((int)nproc == team->t.t_nproc);
1378  tid = __kmp_tid_from_gtid(gtid);
1379 #ifdef KMP_DEBUG
1380  team_id = team->t.t_id;
1381 #endif /* KMP_DEBUG */
1382  master_thread = this_thr->th.th_team_master;
1383 #ifdef KMP_DEBUG
1384  if (master_thread != team->t.t_threads[0]) {
1385  __kmp_print_structure();
1386  }
1387 #endif /* KMP_DEBUG */
1388  KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
1389  KMP_MB();
1390 
1391  // Verify state
1392  KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
1393  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
1394  KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
1395  KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
1396  KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n", gtid, team_id, tid));
1397 
1398 #if OMPT_SUPPORT
1399 #if OMPT_TRACE
1400  if (ompt_enabled &&
1401  ompt_callbacks.ompt_callback(ompt_event_barrier_begin)) {
1402  ompt_callbacks.ompt_callback(ompt_event_barrier_begin)(
1403  team->t.ompt_team_info.parallel_id,
1404  team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
1405  }
1406 #endif
1407  this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
1408 #endif
1409 
1410  if (__kmp_tasking_mode == tskm_extra_barrier) {
1411  __kmp_tasking_barrier(team, this_thr, gtid);
1412  KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n", gtid, team_id, tid));
1413  }
1414 # ifdef KMP_DEBUG
1415  if (__kmp_tasking_mode != tskm_immediate_exec) {
1416  KA_TRACE(20, ( "__kmp_join_barrier: T#%d, old team = %d, old task_team = %p, th_task_team = %p\n",
1417  __kmp_gtid_from_thread(this_thr), team_id, team->t.t_task_team[this_thr->th.th_task_state],
1418  this_thr->th.th_task_team));
1419  KMP_DEBUG_ASSERT(this_thr->th.th_task_team == team->t.t_task_team[this_thr->th.th_task_state]);
1420  }
1421 # endif /* KMP_DEBUG */
1422 
1423  /* Copy the blocktime info to the thread, where __kmp_wait_template() can access it when the
1424  team struct is not guaranteed to exist. Doing these loads causes a cache miss that slows
1425  down EPCC parallel by 2x. As a workaround, we do not perform the copy if blocktime=infinite,
1426  since the values are not used by __kmp_wait_template() in that case. */
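 /* (__kmp_dflt_blocktime reflects the KMP_BLOCKTIME setting; KMP_MAX_BLOCKTIME is the "infinite"
    value, in which case waiting threads spin and these fields are never consulted.) */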
1427  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1428  this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1429  this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1430  }
1431 
1432 #if USE_ITT_BUILD
1433  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1434  __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1435 #endif /* USE_ITT_BUILD */
1436 
1437  switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
1438  case bp_hyper_bar: {
1439  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1440  __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1441  USE_ITT_BUILD_ARG(itt_sync_obj) );
1442  break;
1443  }
1444  case bp_hierarchical_bar: {
1445  __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1446  USE_ITT_BUILD_ARG(itt_sync_obj) );
1447  break;
1448  }
1449  case bp_tree_bar: {
1450  KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1451  __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1452  USE_ITT_BUILD_ARG(itt_sync_obj) );
1453  break;
1454  }
1455  default: {
1456  __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1457  USE_ITT_BUILD_ARG(itt_sync_obj) );
1458  }
1459  }
1460 
1461  /* From this point on, the team data structure may be deallocated at any time by the
1462  master thread - it is unsafe to reference it in any of the worker threads. Any per-team
1463  data items that need to be referenced before the end of the barrier should be moved to
1464  the kmp_task_team_t structs. */
1465  if (KMP_MASTER_TID(tid)) {
1466  if (__kmp_tasking_mode != tskm_immediate_exec) {
1467  __kmp_task_team_wait(this_thr, team
1468  USE_ITT_BUILD_ARG(itt_sync_obj) );
1469  }
1470 #if KMP_STATS_ENABLED
1471  // Have the master thread flag the workers to indicate they are now waiting for the
1472  // next parallel region. Also wake them up so they switch their timers to idle.
1473  for (int i=0; i<team->t.t_nproc; ++i) {
1474  kmp_info_t* team_thread = team->t.t_threads[i];
1475  if (team_thread == this_thr)
1476  continue;
1477  team_thread->th.th_stats->setIdleFlag();
1478  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME && team_thread->th.th_sleep_loc != NULL)
1479  __kmp_null_resume_wrapper(__kmp_gtid_from_thread(team_thread), team_thread->th.th_sleep_loc);
1480  }
1481 #endif
1482 #if USE_ITT_BUILD
1483  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1484  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1485 #endif /* USE_ITT_BUILD */
1486 
1487 # if USE_ITT_BUILD && USE_ITT_NOTIFY
1488  // Join barrier - report frame end
1489  if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && __kmp_forkjoin_frames_mode &&
1490 #if OMP_40_ENABLED
1491  this_thr->th.th_teams_microtask == NULL &&
1492 #endif
1493  team->t.t_active_level == 1)
1494  {
1495  kmp_uint64 cur_time = __itt_get_timestamp();
1496  ident_t * loc = team->t.t_ident;
1497  kmp_info_t **other_threads = team->t.t_threads;
1498  int nproc = this_thr->th.th_team_nproc;
1499  int i;
1500  switch(__kmp_forkjoin_frames_mode) {
1501  case 1:
1502  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1503  break;
1504  case 2:
1505  __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1, loc, nproc);
1506  break;
1507  case 3:
1508  if( __itt_metadata_add_ptr ) {
1509  // Initialize with master's wait time
1510  kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1511  // Set arrive time to zero to be able to check it in __kmp_invoke_task(); the same is done inside the loop below
1512  this_thr->th.th_bar_arrive_time = 0;
1513  for (i=1; i<nproc; ++i) {
1514  delta += ( cur_time - other_threads[i]->th.th_bar_arrive_time );
1515  other_threads[i]->th.th_bar_arrive_time = 0;
1516  }
1517  __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, cur_time, delta, 0);
1518  }
1519  __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1520  this_thr->th.th_frame_time = cur_time;
1521  break;
1522  }
1523  }
1524 # endif /* USE_ITT_BUILD */
1525  }
1526 #if USE_ITT_BUILD
1527  else {
1528  if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1529  __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1530  }
1531 #endif /* USE_ITT_BUILD */
1532 
1533 #if KMP_DEBUG
1534  if (KMP_MASTER_TID(tid)) {
1535  KA_TRACE(15, ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
1536  gtid, team_id, tid, nproc));
1537  }
1538 #endif /* KMP_DEBUG */
1539 
1540  // TODO now, mark worker threads as done so they may be disbanded
1541  KMP_MB(); // Flush all pending memory write invalidates.
1542  KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));
1543 
1544 #if OMPT_SUPPORT
1545  if (ompt_enabled) {
1546 #if OMPT_BLAME
1547  if (ompt_callbacks.ompt_callback(ompt_event_barrier_end)) {
1548  ompt_callbacks.ompt_callback(ompt_event_barrier_end)(
1549  team->t.ompt_team_info.parallel_id,
1550  team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
1551  }
1552 #endif
1553 
1554  // return to default state
1555  this_thr->th.ompt_thread_info.state = ompt_state_overhead;
1556  }
1557 #endif
1558 }
1559 
1560 
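/* Fork barrier: workers sleep here between parallel regions. Once the master (which has already
   set up the new team in __kmp_fork_call()) releases them, each worker re-reads its team pointer,
   picks up the propagated ICVs, and enters the new region. */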
1561 // TODO release worker threads' fork barriers as we are ready instead of all at once
1562 void
1563 __kmp_fork_barrier(int gtid, int tid)
1564 {
1565  KMP_TIME_PARTITIONED_BLOCK(OMP_fork_join_barrier);
1566  KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
1567  KMP_TIME_DEVELOPER_BLOCK(KMP_fork_barrier);
1568  kmp_info_t *this_thr = __kmp_threads[gtid];
1569  kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
1570 #if USE_ITT_BUILD
1571  void * itt_sync_obj = NULL;
1572 #endif /* USE_ITT_BUILD */
1573 
1574  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n",
1575  gtid, (team != NULL) ? team->t.t_id : -1, tid));
1576 
1577  // th_team pointer only valid for master thread here
1578  if (KMP_MASTER_TID(tid)) {
1579 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1580  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1581  // Create itt barrier object
1582  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
1583  __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing
1584  }
1585 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1586 
1587 #ifdef KMP_DEBUG
1588  register kmp_info_t **other_threads = team->t.t_threads;
1589  register int i;
1590 
1591  // Verify state
1592  KMP_MB();
1593 
1594  for(i=1; i<team->t.t_nproc; ++i) {
1595  KA_TRACE(500, ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go == %u.\n",
1596  gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
1597  team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
1598  other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
1599  KMP_DEBUG_ASSERT((TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go)
1600  & ~(KMP_BARRIER_SLEEP_STATE))
1601  == KMP_INIT_BARRIER_STATE);
1602  KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
1603  }
1604 #endif
1605 
1606  if (__kmp_tasking_mode != tskm_immediate_exec) {
1607  __kmp_task_team_setup(this_thr, team, 0); // 0 indicates setup current task team if nthreads > 1
1608  }
1609 
1610  /* The master thread may have changed its blocktime between the join barrier and the
1611  fork barrier. Copy the blocktime info to the thread, where __kmp_wait_template() can
1612  access it when the team struct is not guaranteed to exist. */
1613  // See note about the corresponding code in __kmp_join_barrier() being performance-critical
1614  if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1615  this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1616  this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1617  }
1618  } // master
1619 
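 // All threads now run the configured release pattern for the fork/join barrier; the TRUE
 // argument asks the release code to propagate ICVs to the workers (effective when the
 // runtime is built with KMP_BARRIER_ICV_PUSH).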
1620  switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
1621  case bp_hyper_bar: {
1622  KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1623  __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1624  USE_ITT_BUILD_ARG(itt_sync_obj) );
1625  break;
1626  }
1627  case bp_hierarchical_bar: {
1628  __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1629  USE_ITT_BUILD_ARG(itt_sync_obj) );
1630  break;
1631  }
1632  case bp_tree_bar: {
1633  KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1634  __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1635  USE_ITT_BUILD_ARG(itt_sync_obj) );
1636  break;
1637  }
1638  default: {
1639  __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1640  USE_ITT_BUILD_ARG(itt_sync_obj) );
1641  }
1642  }
1643 
1644  // Early exit for reaping threads releasing forkjoin barrier
1645  if (TCR_4(__kmp_global.g.g_done)) {
1646  this_thr->th.th_task_team = NULL;
1647 
1648 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1649  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1650  if (!KMP_MASTER_TID(tid)) {
1651  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1652  if (itt_sync_obj)
1653  __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1654  }
1655  }
1656 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1657  KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
1658  return;
1659  }
1660 
1661  /* We can now assume that a valid team structure has been allocated by the master and
1662  propagated to all worker threads. The current thread, however, may not be part of the
1663  team, so we can't blindly assume that the team pointer is non-null. */
1664  team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
1665  KMP_DEBUG_ASSERT(team != NULL);
1666  tid = __kmp_tid_from_gtid(gtid);
1667 
1668 
1669 #if KMP_BARRIER_ICV_PULL
1670  /* Master thread's copy of the ICVs was set up on the implicit taskdata in
1671  __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's implicit task has
1672  this data before this function is called. We cannot modify __kmp_fork_call() to look at
1673  the fixed ICVs in the master's thread struct, because it is not always the case that the
1674  threads arrays have been allocated when __kmp_fork_call() is executed. */
1675  {
1676  KMP_TIME_DEVELOPER_BLOCK(USER_icv_copy);
1677  if (!KMP_MASTER_TID(tid)) { // master thread already has ICVs
1678  // Copy the initial ICVs from the master's thread struct to the implicit task for this tid.
1679  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
1680  __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
1681  copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1682  &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs);
1683  }
1684  }
1685 #endif // KMP_BARRIER_ICV_PULL
1686 
1687  if (__kmp_tasking_mode != tskm_immediate_exec) {
1688  __kmp_task_team_sync(this_thr, team);
1689  }
1690 
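 // (Re)apply thread affinity for the new region: balanced affinity may need to rebind when the
 // team size has changed, and OMP 4.0 proc_bind places are honored otherwise.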
1691 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1692  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
1693  if (proc_bind == proc_bind_intel) {
1694 #endif
1695 #if KMP_AFFINITY_SUPPORTED
1696  // Call dynamic affinity settings
1697  if(__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) {
1698  __kmp_balanced_affinity(tid, team->t.t_nproc);
1699  }
1700 #endif // KMP_AFFINITY_SUPPORTED
1701 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1702  }
1703  else if (proc_bind != proc_bind_false) {
1704  if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
1705  KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
1706  __kmp_gtid_from_thread(this_thr), this_thr->th.th_current_place));
1707  }
1708  else {
1709  __kmp_affinity_set_place(gtid);
1710  }
1711  }
1712 #endif
1713 
1714 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1715  if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1716  if (!KMP_MASTER_TID(tid)) {
1717  // Get correct barrier object
1718  itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1719  __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired
1720  } // (prepare called inside barrier_release)
1721  }
1722 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1723  KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid, team->t.t_id, tid));
1724 }
1725 
1726 
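/* The ICV propagation strategy is chosen at compile time:
   - KMP_BARRIER_ICV_PULL: the master publishes the ICVs once in th_fixed_icvs and each worker
     copies them for itself in __kmp_fork_barrier();
   - KMP_BARRIER_ICV_PUSH: the ICVs are carried along with the fork-barrier release;
   - otherwise: the master copies the ICVs into every worker's implicit task here, in
     O(nthreads) time. */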
1727 void
1728 __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc, kmp_internal_control_t *new_icvs, ident_t *loc )
1729 {
1730  KMP_TIME_DEVELOPER_BLOCK(KMP_setup_icv_copy);
1731 
1732  KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
1733  KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
1734 
1735  /* Master thread's copy of the ICVs was set up on the implicit taskdata in
1736  __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's implicit task has
1737  this data before this function is called. */
1738 #if KMP_BARRIER_ICV_PULL
1739  /* Copy the ICVs into th_fixed_icvs in the master thread's structure (which is then left untouched),
1740  where all of the worker threads can access them and make their own copies after the barrier. */
1741  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be allocated at this point
1742  copy_icvs(&team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs, new_icvs);
1743  KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n",
1744  0, team->t.t_threads[0], team));
1745 #elif KMP_BARRIER_ICV_PUSH
1746  // The ICVs will be propagated in the fork barrier, so nothing needs to be done here.
1747  KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n",
1748  0, team->t.t_threads[0], team));
1749 #else
1750  // Copy the ICVs to each of the non-master threads. This takes O(nthreads) time.
1751  ngo_load(new_icvs);
1752  KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be allocated at this point
1753  for (int f=1; f<new_nproc; ++f) { // Skip the master thread
1754  // TODO: GEH - pass in better source location info since usually NULL here
1755  KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
1756  f, team->t.t_threads[f], team));
1757  __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
1758  ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
1759  KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
1760  f, team->t.t_threads[f], team));
1761  }
1762  ngo_sync();
1763 #endif // KMP_BARRIER_ICV_PULL
1764 }