Diffstat (limited to 'runtime/src/kmp_runtime.cpp')
-rw-r--r-- runtime/src/kmp_runtime.cpp | 269
1 file changed, 229 insertions(+), 40 deletions(-)
diff --git a/runtime/src/kmp_runtime.cpp b/runtime/src/kmp_runtime.cpp
index 3dd9ab6..ead122f 100644
--- a/runtime/src/kmp_runtime.cpp
+++ b/runtime/src/kmp_runtime.cpp
@@ -4,10 +4,9 @@
//===----------------------------------------------------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.txt for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
@@ -752,6 +751,10 @@ int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
if (!TCR_4(__kmp_init_parallel))
__kmp_parallel_initialize();
+#if OMP_50_ENABLED
+ __kmp_resume_if_soft_paused();
+#endif
+
th = __kmp_threads[gtid];
team = th->th.th_team;
status = 0;
@@ -824,6 +827,7 @@ static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
int new_nthreads;
KMP_DEBUG_ASSERT(__kmp_init_serial);
KMP_DEBUG_ASSERT(root && parent_team);
+ kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
// If dyn-var is set, dynamically adjust the number of desired threads,
// according to the method specified by dynamic_mode.
@@ -913,10 +917,12 @@ static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
}
// Respect OMP_THREAD_LIMIT
- if (root->r.r_cg_nthreads + new_nthreads -
+ int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
+ int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
+ if (cg_nthreads + new_nthreads -
(root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
- __kmp_cg_max_nth) {
- int tl_nthreads = __kmp_cg_max_nth - root->r.r_cg_nthreads +
+ max_cg_threads) {
+ int tl_nthreads = max_cg_threads - cg_nthreads +
(root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
if (tl_nthreads <= 0) {
tl_nthreads = 1;
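
For intuition, here is a standalone sketch of the clamp above (illustrative names, not libomp API): the limit now comes from this thread's contention-group root rather than the global root counter, and the master thread is always admitted even when the group is already over its limit.

    // C++ sketch of the OMP_THREAD_LIMIT clamp above. With
    // cg_thread_limit = 8, cg_nthreads = 6, a request for 4 more threads,
    // and 1 thread already counted, it yields 8 - 6 + 1 = 3.
    #include <cstdio>

    static int clamp_to_cg_limit(int cg_nthreads, int new_nthreads,
                                 int already_counted, int max_cg_threads) {
      if (cg_nthreads + new_nthreads - already_counted > max_cg_threads) {
        int tl_nthreads = max_cg_threads - cg_nthreads + already_counted;
        return tl_nthreads <= 0 ? 1 : tl_nthreads; // never below the master
      }
      return new_nthreads;
    }

    int main() {
      std::printf("%d\n", clamp_to_cg_limit(6, 4, 1, 8)); // prints 3
      return 0;
    }
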
@@ -1189,6 +1195,10 @@ void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
if (!TCR_4(__kmp_init_parallel))
__kmp_parallel_initialize();
+#if OMP_50_ENABLED
+ __kmp_resume_if_soft_paused();
+#endif
+
this_thr = __kmp_threads[global_tid];
serial_team = this_thr->th.th_serial_team;
@@ -1429,7 +1439,7 @@ void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
if (ompt_enabled.ompt_callback_implicit_task) {
ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
- OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid));
+ OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid),
+ ompt_task_implicit); // TODO: Can this be ompt_task_initial?
OMPT_CUR_TASK_INFO(this_thr)
->thread_num = __kmp_tid_from_gtid(global_tid);
}
@@ -1492,6 +1502,10 @@ int __kmp_fork_call(ident_t *loc, int gtid,
if (!TCR_4(__kmp_init_parallel))
__kmp_parallel_initialize();
+#if OMP_50_ENABLED
+ __kmp_resume_if_soft_paused();
+#endif
+
/* setup current data */
master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
// shutdown
@@ -1595,7 +1609,7 @@ int __kmp_fork_call(ident_t *loc, int gtid,
if (ompt_enabled.ompt_callback_implicit_task) {
ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
- implicit_task_data, 1, __kmp_tid_from_gtid(gtid));
+ implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
+ ompt_task_implicit); // TODO: Can this be ompt_task_initial?
OMPT_CUR_TASK_INFO(master_th)
->thread_num = __kmp_tid_from_gtid(gtid);
}
@@ -1625,7 +1639,7 @@ int __kmp_fork_call(ident_t *loc, int gtid,
if (ompt_enabled.ompt_callback_implicit_task) {
ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
ompt_scope_end, NULL, implicit_task_data, 1,
- OMPT_CUR_TASK_INFO(master_th)->thread_num);
+ OMPT_CUR_TASK_INFO(master_th)->thread_num,
+ ompt_task_implicit); // TODO: Can this be ompt_task_initial?
}
__ompt_lw_taskteam_unlink(master_th);
@@ -1807,7 +1821,7 @@ int __kmp_fork_call(ident_t *loc, int gtid,
if (ompt_enabled.ompt_callback_implicit_task) {
ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
- &(task_info->task_data), 1, __kmp_tid_from_gtid(gtid));
+ &(task_info->task_data), 1, __kmp_tid_from_gtid(gtid),
+ ompt_task_implicit); // TODO: Can this be ompt_task_initial?
OMPT_CUR_TASK_INFO(master_th)
->thread_num = __kmp_tid_from_gtid(gtid);
}
@@ -1837,7 +1851,7 @@ int __kmp_fork_call(ident_t *loc, int gtid,
if (ompt_enabled.ompt_callback_implicit_task) {
ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
ompt_scope_end, NULL, &(task_info->task_data), 1,
- OMPT_CUR_TASK_INFO(master_th)->thread_num);
+ OMPT_CUR_TASK_INFO(master_th)->thread_num,
+ ompt_task_implicit); // TODO: Can this be ompt_task_initial?
}
__ompt_lw_taskteam_unlink(master_th);
@@ -1908,7 +1922,7 @@ int __kmp_fork_call(ident_t *loc, int gtid,
if (ompt_enabled.ompt_callback_implicit_task) {
ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
- implicit_task_data, 1, __kmp_tid_from_gtid(gtid));
+ implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
+ ompt_task_implicit); // TODO: Can this be ompt_task_initial?
OMPT_CUR_TASK_INFO(master_th)
->thread_num = __kmp_tid_from_gtid(gtid);
}
@@ -1937,7 +1951,7 @@ int __kmp_fork_call(ident_t *loc, int gtid,
if (ompt_enabled.ompt_callback_implicit_task) {
ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
ompt_scope_end, NULL, &(task_info->task_data), 1,
- OMPT_CUR_TASK_INFO(master_th)->thread_num);
+ OMPT_CUR_TASK_INFO(master_th)->thread_num,
+ ompt_task_implicit); // TODO: Can this be ompt_task_initial?
}
ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
@@ -2351,7 +2365,6 @@ void __kmp_join_call(ident_t *loc, int gtid
kmp_info_t *master_th;
kmp_root_t *root;
int master_active;
- int i;
KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
@@ -2474,21 +2487,24 @@ void __kmp_join_call(ident_t *loc, int gtid
team->t.t_active_level--;
KMP_ATOMIC_DEC(&root->r.r_in_parallel);
- /* Restore number of threads in the team if needed */
+ // Restore number of threads in the team if needed. This code relies on
+ // the proper adjustment of th_teams_size.nth after the fork in
+ // __kmp_teams_master on each teams master in the case that
+ // __kmp_reserve_threads reduced it.
if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
int old_num = master_th->th.th_team_nproc;
int new_num = master_th->th.th_teams_size.nth;
kmp_info_t **other_threads = team->t.t_threads;
team->t.t_nproc = new_num;
- for (i = 0; i < old_num; ++i) {
+ for (int i = 0; i < old_num; ++i) {
other_threads[i]->th.th_team_nproc = new_num;
}
// Adjust states of non-used threads of the team
- for (i = old_num; i < new_num; ++i) {
+ for (int i = old_num; i < new_num; ++i) {
// Re-initialize thread's barrier data.
- int b;
+ KMP_DEBUG_ASSERT(other_threads[i]);
kmp_balign_t *balign = other_threads[i]->th.th_bar;
- for (b = 0; b < bs_last_barrier; ++b) {
+ for (int b = 0; b < bs_last_barrier; ++b) {
balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
#if USE_DEBUGGER
@@ -2541,7 +2557,7 @@ void __kmp_join_call(ident_t *loc, int gtid
int ompt_team_size = team->t.t_nproc;
ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
- OMPT_CUR_TASK_INFO(master_th)->thread_num);
+ OMPT_CUR_TASK_INFO(master_th)->thread_num,
+ ompt_task_implicit); // TODO: Can this be ompt_task_initial?
}
task_info->frame.exit_frame = ompt_data_none;
@@ -3166,6 +3182,7 @@ static kmp_internal_control_t __kmp_get_global_icvs(void) {
__kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
// next parallel region (per thread)
// (use a max ub on value if __kmp_parallel_initialize not called yet)
+ __kmp_cg_max_nth, // int thread_limit;
__kmp_dflt_max_active_levels, // int max_active_levels; //internal control
// for max_active_levels
r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
@@ -3209,7 +3226,6 @@ static void __kmp_initialize_root(kmp_root_t *root) {
root->r.r_in_parallel = 0;
root->r.r_blocktime = __kmp_dflt_blocktime;
root->r.r_nested = __kmp_dflt_nested;
- root->r.r_cg_nthreads = 1;
/* setup the root team for this task */
/* allocate the root team structure */
@@ -3850,6 +3866,16 @@ int __kmp_register_root(int initial_thread) {
root_thread->th.th_prev_num_threads = 1;
#endif
+ kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
+ tmp->cg_root = root_thread;
+ tmp->cg_thread_limit = __kmp_cg_max_nth;
+ tmp->cg_nthreads = 1;
+ KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
+ " cg_nthreads init to 1\n",
+ root_thread, tmp));
+ tmp->up = NULL;
+ root_thread->th.th_cg_roots = tmp;
+
__kmp_root_counter++;
#if OMPT_SUPPORT
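
For reference, the node allocated above is the CG-root list element this patch introduces; a minimal sketch of its shape, matching how the fields are used throughout this diff (the authoritative definition is the kmp_cg_root_t added to kmp.h in the same change):

    // Sketch of the CG-root list node (field names as used in this diff).
    typedef struct kmp_cg_root {
      kmp_info_t *cg_root; // thread serving as this contention group's root
      kmp_int32 cg_thread_limit; // thread-limit ICV captured for this CG
      kmp_int32 cg_nthreads; // live threads counted against cg_thread_limit
      struct kmp_cg_root *up; // enclosing CG root; NULL for an initial thread
    } kmp_cg_root_t;
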
@@ -3966,7 +3992,11 @@ static int __kmp_reset_root(int gtid, kmp_root_t *root) {
TCW_4(__kmp_nth,
__kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
- root->r.r_cg_nthreads--;
+ root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
+ KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
+ " to %d\n",
+ root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
+ root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
__kmp_reap_thread(root->r.r_uber_thread, 1);
@@ -4141,6 +4171,22 @@ static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
this_thr->th.th_pri_head = NULL;
}
+ if (this_thr != master && // Master's CG root is initialized elsewhere
+ this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
+ // Make new thread's CG root same as master's
+ KMP_DEBUG_ASSERT(master->th.th_cg_roots);
+ this_thr->th.th_cg_roots = master->th.th_cg_roots;
+ // Increment new thread's CG root's counter to add the new thread
+ this_thr->th.th_cg_roots->cg_nthreads++;
+ KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
+ " node %p of thread %p to %d\n",
+ this_thr, this_thr->th.th_cg_roots,
+ this_thr->th.th_cg_roots->cg_root,
+ this_thr->th.th_cg_roots->cg_nthreads));
+ this_thr->th.th_current_task->td_icvs.thread_limit =
+ this_thr->th.th_cg_roots->cg_thread_limit;
+ }
+
/* Initialize dynamic dispatch */
{
volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
@@ -4222,7 +4268,6 @@ kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
/* first, try to get one from the thread pool */
if (__kmp_thread_pool) {
-
new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
__kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
if (new_thr == __kmp_thread_pool_insert_pt) {
@@ -4245,7 +4290,6 @@ kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
TCW_4(__kmp_nth, __kmp_nth + 1);
- root->r.r_cg_nthreads++;
new_thr->th.th_task_state = 0;
new_thr->th.th_task_state_top = 0;
@@ -4401,8 +4445,6 @@ kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
__kmp_all_nth++;
__kmp_nth++;
- root->r.r_cg_nthreads++;
-
// if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
// numbers of procs, and method #2 (keyed API call) for higher numbers.
if (__kmp_adjust_gtid_mode) {
@@ -4954,7 +4996,7 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
#endif
// Optimization to use a "hot" team
if (use_hot_team && new_nproc > 1) {
- KMP_DEBUG_ASSERT(new_nproc == max_nproc);
+ KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
#if KMP_NESTED_HOT_TEAMS
team = hot_teams[level].hot_team;
#else
@@ -5060,10 +5102,11 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
__kmp_reinitialize_team(team, new_icvs,
root->r.r_uber_thread->th.th_ident);
- /* update the remaining threads */
+ // Update remaining threads
for (f = 0; f < new_nproc; ++f) {
team->t.t_threads[f]->th.th_team_nproc = new_nproc;
}
+
// restore the current task state of the master thread: should be the
// implicit task
KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
@@ -5191,6 +5234,7 @@ __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
for (f = 0; f < team->t.t_nproc; ++f)
__kmp_initialize_info(team->t.t_threads[f], team, f,
__kmp_gtid_from_tid(f, team));
+
if (level) { // set th_task_state for new threads in nested hot team
// __kmp_initialize_info() no longer zeroes th_task_state, so we should
// only need to set the th_task_state for the new threads. th_task_state
@@ -5479,8 +5523,8 @@ void __kmp_free_team(kmp_root_t *root,
for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
if (task_team != NULL) {
- for (f = 0; f < team->t.t_nproc;
- ++f) { // Have all threads unref task teams
+ for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
+ KMP_DEBUG_ASSERT(team->t.t_threads[f]);
team->t.t_threads[f]->th.th_task_team = NULL;
}
KA_TRACE(
@@ -5511,6 +5555,29 @@ void __kmp_free_team(kmp_root_t *root,
/* TODO limit size of team pool, call reap_team if pool too large */
team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
__kmp_team_pool = (volatile kmp_team_t *)team;
+ } else { // Check if team was created for the masters in a teams construct
+ // See if first worker is a CG root
+ KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
+ team->t.t_threads[1]->th.th_cg_roots);
+ if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
+ // Clean up the CG root nodes on workers so that this team can be re-used
+ for (f = 1; f < team->t.t_nproc; ++f) {
+ kmp_info_t *thr = team->t.t_threads[f];
+ KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
+ thr->th.th_cg_roots->cg_root == thr);
+ // Pop current CG root off list
+ kmp_cg_root_t *tmp = thr->th.th_cg_roots;
+ thr->th.th_cg_roots = tmp->up;
+ KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
+ " up to node %p. cg_nthreads was %d\n",
+ thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
+ __kmp_free(tmp);
+ // Restore current task's thread_limit from CG root
+ if (thr->th.th_cg_roots)
+ thr->th.th_current_task->td_icvs.thread_limit =
+ thr->th.th_cg_roots->cg_thread_limit;
+ }
+ }
}
KMP_MB();
@@ -5566,7 +5633,6 @@ kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
void __kmp_free_thread(kmp_info_t *this_th) {
int gtid;
kmp_info_t **scan;
- kmp_root_t *root = this_th->th.th_root;
KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
__kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
@@ -5591,6 +5657,26 @@ void __kmp_free_thread(kmp_info_t *this_th) {
TCW_PTR(this_th->th.th_root, NULL);
TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
+ while (this_th->th.th_cg_roots) {
+ this_th->th.th_cg_roots->cg_nthreads--;
+ KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
+ " %p of thread %p to %d\n",
+ this_th, this_th->th.th_cg_roots,
+ this_th->th.th_cg_roots->cg_root,
+ this_th->th.th_cg_roots->cg_nthreads));
+ kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
+ if (tmp->cg_root == this_th) { // Thread is a cg_root
+ KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
+ KA_TRACE(
+ 5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
+ this_th->th.th_cg_roots = tmp->up;
+ __kmp_free(tmp);
+ } else { // Worker thread
+ this_th->th.th_cg_roots = NULL;
+ break;
+ }
+ }
+
/* If the implicit task assigned to this thread can be used by other threads
* -> multiple threads can share the data and try to free the task at
* __kmp_reap_thread at exit. This duplicate use of the task data can happen
@@ -5634,7 +5720,6 @@ void __kmp_free_thread(kmp_info_t *this_th) {
__kmp_thread_pool_nth++;
TCW_4(__kmp_nth, __kmp_nth - 1);
- root->r.r_cg_nthreads--;
#ifdef KMP_ADJUST_BLOCKTIME
/* Adjust blocktime back to user setting or default if necessary */
@@ -5851,7 +5936,6 @@ static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
gtid = thread->th.th_info.ds.ds_gtid;
if (!is_root) {
-
if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
/* Assume the threads are at the fork barrier here */
KA_TRACE(
@@ -6272,8 +6356,10 @@ void __kmp_internal_end_thread(int gtid_req) {
// OM: Removed Linux* OS restriction to fix the crash on OS X* (DPD200239966)
// and Windows(DPD200287443) that occurs when using critical sections from
// foreign threads.
- KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
- return;
+ if (__kmp_pause_status != kmp_hard_paused) {
+ KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
+ return;
+ }
#endif
/* synchronize the termination process */
__kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
@@ -6409,6 +6495,7 @@ void __kmp_register_library_startup(void) {
// library. Assume the other library is alive.
// WARN( ... ); // TODO: Issue a warning.
file_name = "unknown library";
+ KMP_FALLTHROUGH();
// Attention! Falling through to the next case is intentional.
case 1: { // Neighbor is alive.
// Check it is allowed.
@@ -6920,6 +7007,10 @@ void __kmp_parallel_initialize(void) {
__kmp_do_middle_initialize();
}
+#if OMP_50_ENABLED
+ __kmp_resume_if_hard_paused();
+#endif
+
/* begin initialization */
KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
KMP_ASSERT(KMP_UBER_GTID(gtid));
@@ -7041,7 +7132,7 @@ int __kmp_invoke_task_func(int gtid) {
ompt_team_size = team->t.t_nproc;
ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
- __kmp_tid_from_gtid(gtid));
+ __kmp_tid_from_gtid(gtid),
+ ompt_task_implicit); // TODO: Can this be ompt_task_initial?
OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
}
#endif
@@ -7085,6 +7176,19 @@ void __kmp_teams_master(int gtid) {
KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
__kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
+
+ // This thread is a new CG root. Set up the proper variables.
+ kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
+ tmp->cg_root = thr; // Make thr the CG root
+ // Init to thread limit that was stored when league masters were forked
+ tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
+ tmp->cg_nthreads = 1; // Init counter to one active thread, this one
+ KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
+ " cg_threads to 1\n",
+ thr, tmp));
+ tmp->up = thr->th.th_cg_roots;
+ thr->th.th_cg_roots = tmp;
+
// Launch league of teams now, but not let workers execute
// (they hang on fork barrier until next parallel)
#if INCLUDE_SSC_MARKS
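
As a usage sketch (hypothetical user code, not part of the patch), a teams construct with a thread_limit clause exercises the push above: each league master becomes a CG root, so nested parallel regions are bounded per team rather than globally.

    // Illustrative OpenMP 5.0 program (host teams assumed supported).
    #include <omp.h>

    int main() {
    #pragma omp teams num_teams(2) thread_limit(4)
      {
        // Each teams master pushed a CG node with cg_thread_limit = 4, so
        // this nested parallel is capped at 4 threads per team.
    #pragma omp parallel
        {
          // per-team work
        }
      }
      return 0;
    }
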
@@ -7096,7 +7200,9 @@ void __kmp_teams_master(int gtid) {
#if INCLUDE_SSC_MARKS
SSC_MARK_JOINING();
#endif
-
+ // If the team size was reduced from the limit, set it to the new size
+ if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
+ thr->th.th_teams_size.nth = thr->th.th_team_nproc;
// AC: last parameter "1" eliminates join barrier which won't work because
// worker threads are in a fork barrier waiting for more parallel regions
__kmp_join_call(loc, gtid
@@ -7170,10 +7276,14 @@ void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
num_threads = __kmp_teams_max_nth / num_teams;
}
} else {
+ // This thread will be the master of the league masters
+ // Store new thread limit; old limit is saved in th_cg_roots list
+ thr->th.th_current_task->td_icvs.thread_limit = num_threads;
+
if (num_teams * num_threads > __kmp_teams_max_nth) {
int new_threads = __kmp_teams_max_nth / num_teams;
if (!__kmp_reserve_warn) { // user asked for too many threads
- __kmp_reserve_warn = 1; // that conflicts with KMP_TEAMS_THREAD_LIMIT
+ __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
__kmp_msg(kmp_ms_warning,
KMP_MSG(CantFormThrTeam, num_threads, new_threads),
KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
@@ -7291,7 +7401,7 @@ void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
#endif
if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
- ompt_scope_end, NULL, task_data, 0, ds_tid);
+ ompt_scope_end, NULL, task_data, 0, ds_tid,
+ ompt_task_implicit); // TODO: Can this be ompt_task_initial?
}
}
#endif
@@ -8190,3 +8300,82 @@ __kmp_determine_reduction_method(
kmp_int32 __kmp_get_reduce_method(void) {
return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
}
+
+#if OMP_50_ENABLED
+
+// Soft pause sets up threads to ignore blocktime and just go to sleep.
+// Spin-wait code checks __kmp_pause_status and reacts accordingly.
+void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }
+
+// Hard pause shuts down the runtime completely. Resume happens naturally when
+// OpenMP is used subsequently.
+void __kmp_hard_pause() {
+ __kmp_pause_status = kmp_hard_paused;
+ __kmp_internal_end_thread(-1);
+}
+
+// Soft resume resets __kmp_pause_status and wakes up any sleeping threads.
+void __kmp_resume_if_soft_paused() {
+ if (__kmp_pause_status == kmp_soft_paused) {
+ __kmp_pause_status = kmp_not_paused;
+
+ for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
+ kmp_info_t *thread = __kmp_threads[gtid];
+ if (thread) { // Wake it if sleeping
+ kmp_flag_64 fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread);
+ if (fl.is_sleeping())
+ fl.resume(gtid);
+ else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
+ __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
+ } else { // thread holds the lock and may sleep soon
+ do { // until either the thread sleeps, or we can get the lock
+ if (fl.is_sleeping()) {
+ fl.resume(gtid);
+ break;
+ } else if (__kmp_try_suspend_mx(thread)) {
+ __kmp_unlock_suspend_mx(thread);
+ break;
+ }
+ } while (1);
+ }
+ }
+ }
+ }
+}
+
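The loop above leans on the invariant stated in its comments: a thread only goes to sleep while holding its suspend mutex, so the waker either observes it asleep (and resumes it) or acquires the mutex (and the thread "won't sleep"). A minimal self-contained model of that handshake in portable C++ (illustrative only, not libomp code; for self-containment this model re-checks the sleep flag under the lock rather than re-checking the pause state):

    #include <atomic>
    #include <condition_variable>
    #include <mutex>

    static std::mutex suspend_mx; // models the per-thread suspend mutex
    static std::condition_variable suspend_cv;
    static std::atomic<bool> sleeping{false}; // models fl.is_sleeping()
    static std::atomic<bool> paused{true}; // models __kmp_pause_status

    void worker_maybe_sleep() {
      std::unique_lock<std::mutex> lk(suspend_mx);
      if (!paused.load()) // resume already ran; do not go to sleep
        return;
      sleeping.store(true);
      suspend_cv.wait(lk, [] { return !sleeping.load(); });
    }

    void resume_one() {
      paused.store(false);
      for (;;) { // same shape as the do-while loop above
        if (sleeping.load()) { // observably asleep: wake it
          std::lock_guard<std::mutex> lk(suspend_mx);
          sleeping.store(false);
          suspend_cv.notify_one();
          break;
        }
        std::unique_lock<std::mutex> lk(suspend_mx, std::try_to_lock);
        if (lk.owns_lock()) { // it cannot fall asleep unobserved now
          if (sleeping.load()) { // it slept between the check and the lock
            sleeping.store(false);
            suspend_cv.notify_one();
          }
          break;
        }
      }
    }
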
+// This function is called via __kmpc_pause_resource. Returns 0 if successful.
+// TODO: add warning messages
+int __kmp_pause_resource(kmp_pause_status_t level) {
+ if (level == kmp_not_paused) { // requesting resume
+ if (__kmp_pause_status == kmp_not_paused) {
+ // error message about runtime not being paused, so can't resume
+ return 1;
+ } else {
+ KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
+ __kmp_pause_status == kmp_hard_paused);
+ __kmp_pause_status = kmp_not_paused;
+ return 0;
+ }
+ } else if (level == kmp_soft_paused) { // requesting soft pause
+ if (__kmp_pause_status != kmp_not_paused) {
+ // error message about already being paused
+ return 1;
+ } else {
+ __kmp_soft_pause();
+ return 0;
+ }
+ } else if (level == kmp_hard_paused) { // requesting hard pause
+ if (__kmp_pause_status != kmp_not_paused) {
+ // error message about already being paused
+ return 1;
+ } else {
+ __kmp_hard_pause();
+ return 0;
+ }
+ } else {
+ // error message about invalid level
+ return 1;
+ }
+}
+
+#endif // OMP_50_ENABLED
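
As a usage sketch (assuming the standard OpenMP 5.0 entry points that front __kmp_pause_resource), a program can drive the new machinery like this:

    // Hypothetical exercise of the pause machinery via the OpenMP 5.0 API.
    #include <omp.h>
    #include <cstdio>

    int main() {
    #pragma omp parallel
      { /* warm up the thread pool */ }

      // Soft pause: workers ignore blocktime and go to sleep; the runtime
      // resumes transparently (see the __kmp_resume_if_soft_paused hooks).
      if (omp_pause_resource_all(omp_pause_soft) != 0)
        std::fprintf(stderr, "soft pause rejected\n");

    #pragma omp parallel // wakes the sleeping workers again
      { /* more work */ }

      // Hard pause: the runtime shuts down completely and re-initializes
      // lazily on the next OpenMP construct.
      if (omp_pause_resource_all(omp_pause_hard) != 0)
        std::fprintf(stderr, "hard pause rejected\n");
      return 0;
    }
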