aboutsummaryrefslogtreecommitdiff
path: root/libomptarget
diff options
context:
space:
mode:
authorAlexey Bataev <a.bataev@hotmail.com>2019-01-04 17:09:12 +0000
committerAlexey Bataev <a.bataev@hotmail.com>2019-01-04 17:09:12 +0000
commit900210a37567f84e17b8eae18568410fbac77877 (patch)
treedf84202e87da83b267668040e767c052c5838e21 /libomptarget
parent58df0162f131f71d38ec0d4e13015ad7940922cb (diff)
downloadopenmp_llvm-900210a37567f84e17b8eae18568410fbac77877.tar.gz
[OPENMP][NVPTX]Improve performance + reduce number of used registers.
Summary: Reduced number of the used register + improved performance propagating the information about current execution/data sharing mode directly from the compiler, where it is possible. In some cases, it requires new/reworked interfaces of the runtime external functions. Old functions are marked as deprecated. Reviewers: grokos, gtbercea, kkwli0 Subscribers: guansong, jfb, openmp-commits, caomhin Differential Revision: https://reviews.llvm.org/D56278 git-svn-id: https://llvm.org/svn/llvm-project/openmp/trunk@350405 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'libomptarget')
-rw-r--r--libomptarget/deviceRTLs/nvptx/src/data_sharing.cu27
-rw-r--r--libomptarget/deviceRTLs/nvptx/src/interface.h12
-rw-r--r--libomptarget/deviceRTLs/nvptx/src/libcall.cu43
-rw-r--r--libomptarget/deviceRTLs/nvptx/src/loop.cu27
-rw-r--r--libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu6
-rw-r--r--libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h7
-rw-r--r--libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h14
-rw-r--r--libomptarget/deviceRTLs/nvptx/src/parallel.cu20
-rw-r--r--libomptarget/deviceRTLs/nvptx/src/reduction.cu25
-rw-r--r--libomptarget/deviceRTLs/nvptx/src/support.h6
-rw-r--r--libomptarget/deviceRTLs/nvptx/src/supporti.h12
-rw-r--r--libomptarget/deviceRTLs/nvptx/src/sync.cu2
-rw-r--r--libomptarget/deviceRTLs/nvptx/src/task.cu6
13 files changed, 124 insertions, 83 deletions
diff --git a/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu b/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
index 1f598ec..1522400 100644
--- a/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
+++ b/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
@@ -43,8 +43,8 @@ __device__ static bool IsWarpMasterActiveThread() {
return (unsigned)Sh == 0;
}
// Return true if this is the master thread.
-__device__ static bool IsMasterThread() {
- return !isSPMDMode() && getMasterThreadId() == getThreadId();
+__device__ static bool IsMasterThread(bool isSPMDExecutionMode) {
+ return !isSPMDExecutionMode && getMasterThreadId() == getThreadId();
}
/// Return the provided size aligned to the size of a pointer.
@@ -88,7 +88,8 @@ __kmpc_initialize_data_sharing_environment(__kmpc_data_sharing_slot *rootS,
omptarget_nvptx_TeamDescr *teamDescr =
&omptarget_nvptx_threadPrivateContext->TeamContext();
- __kmpc_data_sharing_slot *RootS = teamDescr->RootS(WID, IsMasterThread());
+ __kmpc_data_sharing_slot *RootS =
+ teamDescr->RootS(WID, IsMasterThread(isSPMDMode()));
DataSharingState.SlotPtr[WID] = RootS;
DataSharingState.StackPtr[WID] = (void *)&RootS->Data[0];
@@ -253,8 +254,9 @@ EXTERN void __kmpc_data_sharing_environment_end(
// The master thread cleans the saved slot, because this is an environment
// only for the master.
- __kmpc_data_sharing_slot *S =
- IsMasterThread() ? *SavedSharedSlot : DataSharingState.SlotPtr[WID];
+ __kmpc_data_sharing_slot *S = IsMasterThread(isSPMDMode())
+ ? *SavedSharedSlot
+ : DataSharingState.SlotPtr[WID];
if (S->Next) {
free(S->Next);
@@ -472,8 +474,9 @@ EXTERN void* __kmpc_data_sharing_push_stack(size_t DataSize,
// space for the variables of each thread in the warp,
// i.e. one DataSize chunk per warp lane.
// TODO: change WARPSIZE to the number of active threads in the warp.
- size_t PushSize = (isRuntimeUninitialized() || IsMasterThread()) ?
- DataSize : WARPSIZE * DataSize;
+ size_t PushSize = (isRuntimeUninitialized() || IsMasterThread(isSPMDMode()))
+ ? DataSize
+ : WARPSIZE * DataSize;
// Compute the start address of the frame of each thread in the warp.
uintptr_t FrameStartAddress =
@@ -553,14 +556,15 @@ EXTERN void __kmpc_get_shared_variables(void ***GlobalArgs) {
// manage statically allocated global memory. This memory is allocated by the
// compiler and used to correctly implement globalization of the variables in
// target, teams and distribute regions.
-EXTERN void __kmpc_get_team_static_memory(const void *buf, size_t size,
+EXTERN void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode,
+ const void *buf, size_t size,
int16_t is_shared,
const void **frame) {
if (is_shared) {
*frame = buf;
return;
}
- if (isSPMDMode()) {
+ if (isSPMDExecutionMode) {
if (GetThreadIdInBlock() == 0) {
*frame = omptarget_nvptx_simpleMemoryManager.Acquire(buf, size);
}
@@ -574,10 +578,11 @@ EXTERN void __kmpc_get_team_static_memory(const void *buf, size_t size,
__threadfence();
}
-EXTERN void __kmpc_restore_team_static_memory(int16_t is_shared) {
+EXTERN void __kmpc_restore_team_static_memory(int16_t isSPMDExecutionMode,
+ int16_t is_shared) {
if (is_shared)
return;
- if (isSPMDMode()) {
+ if (isSPMDExecutionMode) {
// FIXME: use __syncthreads instead when the function copy is fixed in LLVM.
__SYNCTHREADS();
if (GetThreadIdInBlock() == 0) {
diff --git a/libomptarget/deviceRTLs/nvptx/src/interface.h b/libomptarget/deviceRTLs/nvptx/src/interface.h
index 2c2beae..558860b 100644
--- a/libomptarget/deviceRTLs/nvptx/src/interface.h
+++ b/libomptarget/deviceRTLs/nvptx/src/interface.h
@@ -395,9 +395,13 @@ EXTERN void __kmpc_reduce_conditional_lastprivate(kmp_Ident *loc,
// reduction
EXTERN void __kmpc_nvptx_end_reduce(int32_t global_tid);
EXTERN void __kmpc_nvptx_end_reduce_nowait(int32_t global_tid);
-EXTERN int32_t __kmpc_nvptx_parallel_reduce_nowait(
+EXTERN __attribute__((deprecated)) int32_t __kmpc_nvptx_parallel_reduce_nowait(
int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct);
+EXTERN int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(
+ kmp_Ident *loc, int32_t global_tid, int32_t num_vars, size_t reduce_size,
+ void *reduce_data, kmp_ShuffleReductFctPtr shflFct,
+ kmp_InterWarpCopyFctPtr cpyFct);
EXTERN int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_spmd(
int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct);
@@ -550,9 +554,11 @@ __kmpc_get_data_sharing_environment_frame(int32_t SourceThreadID,
// SPMD execution mode interrogation function.
EXTERN int8_t __kmpc_is_spmd_exec_mode();
-EXTERN void __kmpc_get_team_static_memory(const void *buf, size_t size,
+EXTERN void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode,
+ const void *buf, size_t size,
int16_t is_shared, const void **res);
-EXTERN void __kmpc_restore_team_static_memory(int16_t is_shared);
+EXTERN void __kmpc_restore_team_static_memory(int16_t isSPMDExecutionMode,
+ int16_t is_shared);
#endif
diff --git a/libomptarget/deviceRTLs/nvptx/src/libcall.cu b/libomptarget/deviceRTLs/nvptx/src/libcall.cu
index 91b270c..9abe599 100644
--- a/libomptarget/deviceRTLs/nvptx/src/libcall.cu
+++ b/libomptarget/deviceRTLs/nvptx/src/libcall.cu
@@ -39,14 +39,17 @@ EXTERN void omp_set_num_threads(int num) {
if (num <= 0) {
WARNING0(LW_INPUT, "expected positive num; ignore\n");
} else {
- omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
+ omptarget_nvptx_TaskDescr *currTaskDescr =
+ getMyTopTaskDescriptor(/*isSPMDExecutionMode=*/false);
currTaskDescr->NThreads() = num;
}
}
EXTERN int omp_get_num_threads(void) {
- int tid = GetLogicalThreadIdInBlock();
- int rc = GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized());
+ bool isSPMDExecutionMode = isSPMDMode();
+ int tid = GetLogicalThreadIdInBlock(isSPMDExecutionMode);
+ int rc =
+ GetNumberOfOmpThreads(tid, isSPMDExecutionMode, isRuntimeUninitialized());
PRINT(LD_IO, "call omp_get_num_threads() return %d\n", rc);
return rc;
}
@@ -58,7 +61,8 @@ EXTERN int omp_get_max_threads(void) {
// We're already in parallel region.
return 1; // default is 1 thread avail
}
- omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
+ omptarget_nvptx_TaskDescr *currTaskDescr =
+ getMyTopTaskDescriptor(isSPMDMode());
int rc = 1; // default is 1 thread avail
if (!currTaskDescr->InParallelRegion()) {
// Not currently in a parallel region, return what was set.
@@ -76,21 +80,23 @@ EXTERN int omp_get_thread_limit(void) {
return 0; // default is 0
}
// per contention group.. meaning threads in current team
- omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
+ omptarget_nvptx_TaskDescr *currTaskDescr =
+ getMyTopTaskDescriptor(isSPMDMode());
int rc = currTaskDescr->ThreadLimit();
PRINT(LD_IO, "call omp_get_thread_limit() return %d\n", rc);
return rc;
}
EXTERN int omp_get_thread_num() {
- int tid = GetLogicalThreadIdInBlock();
- int rc = GetOmpThreadId(tid, isSPMDMode(), isRuntimeUninitialized());
+ bool isSPMDExecutionMode = isSPMDMode();
+ int tid = GetLogicalThreadIdInBlock(isSPMDExecutionMode);
+ int rc = GetOmpThreadId(tid, isSPMDExecutionMode, isRuntimeUninitialized());
PRINT(LD_IO, "call omp_get_thread_num() returns %d\n", rc);
return rc;
}
EXTERN int omp_get_num_procs(void) {
- int rc = GetNumberOfProcsInDevice();
+ int rc = GetNumberOfProcsInDevice(isSPMDMode());
PRINT(LD_IO, "call omp_get_num_procs() returns %d\n", rc);
return rc;
}
@@ -102,7 +108,8 @@ EXTERN int omp_in_parallel(void) {
"Expected SPMD mode only with uninitialized runtime.");
rc = 1; // SPMD mode is always in parallel.
} else {
- omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
+ omptarget_nvptx_TaskDescr *currTaskDescr =
+ getMyTopTaskDescriptor(isSPMDMode());
if (currTaskDescr->InParallelRegion()) {
rc = 1;
}
@@ -161,7 +168,8 @@ EXTERN int omp_get_level(void) {
return omptarget_nvptx_simpleThreadPrivateContext->GetParallelLevel();
}
int level = 0;
- omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
+ omptarget_nvptx_TaskDescr *currTaskDescr =
+ getMyTopTaskDescriptor(isSPMDMode());
ASSERT0(LT_FUSSY, currTaskDescr,
"do not expect fct to be called in a non-active thread");
do {
@@ -181,7 +189,8 @@ EXTERN int omp_get_active_level(void) {
return 1;
}
int level = 0; // no active level parallelism
- omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
+ omptarget_nvptx_TaskDescr *currTaskDescr =
+ getMyTopTaskDescriptor(isSPMDMode());
ASSERT0(LT_FUSSY, currTaskDescr,
"do not expect fct to be called in a non-active thread");
do {
@@ -208,7 +217,8 @@ EXTERN int omp_get_ancestor_thread_num(int level) {
} else if (level > 0) {
int totLevel = omp_get_level();
if (level <= totLevel) {
- omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
+ omptarget_nvptx_TaskDescr *currTaskDescr =
+ getMyTopTaskDescriptor(isSPMDMode());
int steps = totLevel - level;
PRINT(LD_IO, "backtrack %d steps\n", steps);
ASSERT0(LT_FUSSY, currTaskDescr,
@@ -259,7 +269,8 @@ EXTERN int omp_get_team_size(int level) {
} else if (level > 0) {
int totLevel = omp_get_level();
if (level <= totLevel) {
- omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
+ omptarget_nvptx_TaskDescr *currTaskDescr =
+ getMyTopTaskDescriptor(isSPMDMode());
int steps = totLevel - level;
ASSERT0(LT_FUSSY, currTaskDescr,
"do not expect fct to be called in a non-active thread");
@@ -288,7 +299,8 @@ EXTERN void omp_get_schedule(omp_sched_t *kind, int *modifier) {
*kind = omp_sched_static;
*modifier = 1;
} else {
- omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
+ omptarget_nvptx_TaskDescr *currTaskDescr =
+ getMyTopTaskDescriptor(isSPMDMode());
*kind = currTaskDescr->GetRuntimeSched();
*modifier = currTaskDescr->RuntimeChunkSize();
}
@@ -305,7 +317,8 @@ EXTERN void omp_set_schedule(omp_sched_t kind, int modifier) {
return;
}
if (kind >= omp_sched_static && kind < omp_sched_auto) {
- omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
+ omptarget_nvptx_TaskDescr *currTaskDescr =
+ getMyTopTaskDescriptor(isSPMDMode());
currTaskDescr->SetRuntimeSched(kind);
currTaskDescr->RuntimeChunkSize() = modifier;
PRINT(LD_IOD, "omp_set_schedule did set sched %d & modif %" PRIu64 "\n",
diff --git a/libomptarget/deviceRTLs/nvptx/src/loop.cu b/libomptarget/deviceRTLs/nvptx/src/loop.cu
index c100be5..fdddf30 100644
--- a/libomptarget/deviceRTLs/nvptx/src/loop.cu
+++ b/libomptarget/deviceRTLs/nvptx/src/loop.cu
@@ -101,7 +101,7 @@ public:
// When IsRuntimeUninitialized is true, we assume that the caller is
// in an L0 parallel region and that all worker threads participate.
- int tid = GetLogicalThreadIdInBlock();
+ int tid = GetLogicalThreadIdInBlock(IsSPMDExecutionMode);
// Assume we are in teams region or that we use a single block
// per target region
@@ -208,7 +208,7 @@ public:
ST chunk) {
ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc),
"Expected non-SPMD mode + initialized runtime.");
- int tid = GetLogicalThreadIdInBlock();
+ int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(tid);
T tnum = currTaskDescr->ThreadsInTeam();
T tripCount = ub - lb + 1; // +1 because ub is inclusive
@@ -417,17 +417,18 @@ public:
// On Pascal, with inlining of the runtime into the user application,
// this code deadlocks. This is probably because different threads
// in a warp cannot make independent progress.
- NOINLINE static int dispatch_next(int32_t gtid, int32_t *plast, T *plower,
- T *pupper, ST *pstride) {
- ASSERT0(LT_FUSSY, isRuntimeInitialized(),
+ NOINLINE static int dispatch_next(kmp_Ident *loc, int32_t gtid,
+ int32_t *plast, T *plower, T *pupper,
+ ST *pstride) {
+ ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc),
"Expected non-SPMD mode + initialized runtime.");
// ID of a thread in its own warp
// automatically selects thread or warp ID based on selected implementation
- int tid = GetLogicalThreadIdInBlock();
+ int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
ASSERT0(LT_FUSSY,
- gtid < GetNumberOfOmpThreads(tid, isSPMDMode(),
- isRuntimeUninitialized()),
+ gtid < GetNumberOfOmpThreads(tid, checkSPMDMode(loc),
+ checkRuntimeUninitialized(loc)),
"current thread is not needed here; error");
// retrieve schedule
kmp_sched_t schedule =
@@ -540,7 +541,7 @@ EXTERN int __kmpc_dispatch_next_4(kmp_Ident *loc, int32_t tid, int32_t *p_last,
int32_t *p_lb, int32_t *p_ub, int32_t *p_st) {
PRINT0(LD_IO, "call kmpc_dispatch_next_4\n");
return omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_next(
- tid, p_last, p_lb, p_ub, p_st);
+ loc, tid, p_last, p_lb, p_ub, p_st);
}
EXTERN int __kmpc_dispatch_next_4u(kmp_Ident *loc, int32_t tid,
@@ -548,14 +549,14 @@ EXTERN int __kmpc_dispatch_next_4u(kmp_Ident *loc, int32_t tid,
uint32_t *p_ub, int32_t *p_st) {
PRINT0(LD_IO, "call kmpc_dispatch_next_4u\n");
return omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_next(
- tid, p_last, p_lb, p_ub, p_st);
+ loc, tid, p_last, p_lb, p_ub, p_st);
}
EXTERN int __kmpc_dispatch_next_8(kmp_Ident *loc, int32_t tid, int32_t *p_last,
int64_t *p_lb, int64_t *p_ub, int64_t *p_st) {
PRINT0(LD_IO, "call kmpc_dispatch_next_8\n");
return omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_next(
- tid, p_last, p_lb, p_ub, p_st);
+ loc, tid, p_last, p_lb, p_ub, p_st);
}
EXTERN int __kmpc_dispatch_next_8u(kmp_Ident *loc, int32_t tid,
@@ -563,7 +564,7 @@ EXTERN int __kmpc_dispatch_next_8u(kmp_Ident *loc, int32_t tid,
uint64_t *p_ub, int64_t *p_st) {
PRINT0(LD_IO, "call kmpc_dispatch_next_8u\n");
return omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_next(
- tid, p_last, p_lb, p_ub, p_st);
+ loc, tid, p_last, p_lb, p_ub, p_st);
}
// fini
@@ -756,7 +757,7 @@ EXTERN void __kmpc_reduce_conditional_lastprivate(kmp_Ident *loc, int32_t gtid,
"Expected non-SPMD mode + initialized runtime.");
omptarget_nvptx_TeamDescr &teamDescr = getMyTeamDescriptor();
- int tid = GetLogicalThreadIdInBlock();
+ int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
uint32_t NumThreads = GetNumberOfOmpThreads(tid, checkSPMDMode(loc),
checkRuntimeUninitialized(loc));
uint64_t *Buffer = teamDescr.getLastprivateIterBuffer();
diff --git a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
index dd51786..f8610a5 100644
--- a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
+++ b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
@@ -61,12 +61,12 @@ EXTERN void __kmpc_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime) {
omptarget_nvptx_device_State[slot].Dequeue();
// init thread private
- int threadId = GetLogicalThreadIdInBlock();
+ int threadId = GetLogicalThreadIdInBlock(/*isSPMDExecutionMode=*/false);
omptarget_nvptx_threadPrivateContext->InitThreadPrivateContext(threadId);
// init team context
omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor();
- currTeamDescr.InitTeamDescr();
+ currTeamDescr.InitTeamDescr(/*isSPMDExecutionMode=*/false);
// this thread will start execution... has to update its task ICV
// to point to the level zero task ICV. That ICV was init in
// InitTeamDescr()
@@ -128,7 +128,7 @@ EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime,
omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor();
omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor();
// init team context
- currTeamDescr.InitTeamDescr();
+ currTeamDescr.InitTeamDescr(/*isSPMDExecutionMode=*/true);
}
// FIXME: use __syncthreads instead when the function copy is fixed in LLVM.
__SYNCTHREADS();
diff --git a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
index 6539756..4a12ff3 100644
--- a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
+++ b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
@@ -176,7 +176,7 @@ public:
prev = taskDescr;
}
// init & copy
- INLINE void InitLevelZeroTaskDescr();
+ INLINE void InitLevelZeroTaskDescr(bool isSPMDExecutionMode);
INLINE void InitLevelOneTaskDescr(uint16_t tnum,
omptarget_nvptx_TaskDescr *parentTaskDescr);
INLINE void Copy(omptarget_nvptx_TaskDescr *sourceTaskDescr);
@@ -257,7 +257,7 @@ public:
INLINE uint64_t *getLastprivateIterBuffer() { return &lastprivateIterBuffer; }
// init
- INLINE void InitTeamDescr();
+ INLINE void InitTeamDescr(bool isSPMDExecutionMode);
INLINE __kmpc_data_sharing_slot *RootS(int wid, bool IsMasterThread) {
// If this is invoked by the master thread of the master warp then intialize
@@ -462,7 +462,8 @@ extern volatile __device__ __shared__ omptarget_nvptx_WorkFn
INLINE omptarget_nvptx_TeamDescr &getMyTeamDescriptor();
INLINE omptarget_nvptx_WorkDescr &getMyWorkDescriptor();
-INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor();
+INLINE omptarget_nvptx_TaskDescr *
+getMyTopTaskDescriptor(bool isSPMDExecutionMode);
INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor(int globalThreadId);
////////////////////////////////////////////////////////////////////////////////
diff --git a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h
index f325601..2e834cc 100644
--- a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h
+++ b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h
@@ -31,7 +31,8 @@ INLINE void omptarget_nvptx_TaskDescr::SetRuntimeSched(omp_sched_t sched) {
items.flags |= val;
}
-INLINE void omptarget_nvptx_TaskDescr::InitLevelZeroTaskDescr() {
+INLINE void
+omptarget_nvptx_TaskDescr::InitLevelZeroTaskDescr(bool isSPMDExecutionMode) {
// slow method
// flag:
// default sched is static,
@@ -39,7 +40,7 @@ INLINE void omptarget_nvptx_TaskDescr::InitLevelZeroTaskDescr() {
// not in parallel
items.flags = 0;
- items.nthreads = GetNumberOfProcsInTeam();
+ items.nthreads = GetNumberOfProcsInTeam(isSPMDExecutionMode);
; // threads: whatever was alloc by kernel
items.threadId = 0; // is master
items.threadsInTeam = 1; // sequential
@@ -177,8 +178,8 @@ omptarget_nvptx_ThreadPrivateContext::InitThreadPrivateContext(int tid) {
// Team Descriptor
////////////////////////////////////////////////////////////////////////////////
-INLINE void omptarget_nvptx_TeamDescr::InitTeamDescr() {
- levelZeroTaskDescr.InitLevelZeroTaskDescr();
+INLINE void omptarget_nvptx_TeamDescr::InitTeamDescr(bool isSPMDExecutionMode) {
+ levelZeroTaskDescr.InitLevelZeroTaskDescr(isSPMDExecutionMode);
}
////////////////////////////////////////////////////////////////////////////////
@@ -199,8 +200,9 @@ INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor(int threadId) {
return omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
}
-INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor() {
- return getMyTopTaskDescriptor(GetLogicalThreadIdInBlock());
+INLINE omptarget_nvptx_TaskDescr *
+getMyTopTaskDescriptor(bool isSPMDExecutionMode) {
+ return getMyTopTaskDescriptor(GetLogicalThreadIdInBlock(isSPMDExecutionMode));
}
////////////////////////////////////////////////////////////////////////////////
diff --git a/libomptarget/deviceRTLs/nvptx/src/parallel.cu b/libomptarget/deviceRTLs/nvptx/src/parallel.cu
index 8aea26d..aa0b9cf 100644
--- a/libomptarget/deviceRTLs/nvptx/src/parallel.cu
+++ b/libomptarget/deviceRTLs/nvptx/src/parallel.cu
@@ -57,7 +57,7 @@ EXTERN bool __kmpc_kernel_convergent_simd(void *buffer, uint32_t Mask,
asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask_lt));
*LaneId = __popc(ConvergentMask & lanemask_lt);
- int threadId = GetLogicalThreadIdInBlock();
+ int threadId = GetLogicalThreadIdInBlock(isSPMDMode());
int sourceThreadId = (threadId & ~(WARPSIZE - 1)) + *LaneSource;
ConvergentSimdJob *job = (ConvergentSimdJob *)buffer;
@@ -101,7 +101,7 @@ EXTERN bool __kmpc_kernel_convergent_simd(void *buffer, uint32_t Mask,
EXTERN void __kmpc_kernel_end_convergent_simd(void *buffer) {
PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_end_convergent_parallel\n");
// pop stack
- int threadId = GetLogicalThreadIdInBlock();
+ int threadId = GetLogicalThreadIdInBlock(isSPMDMode());
ConvergentSimdJob *job = (ConvergentSimdJob *)buffer;
omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId) =
job->slimForNextSimd;
@@ -131,7 +131,7 @@ EXTERN bool __kmpc_kernel_convergent_parallel(void *buffer, uint32_t Mask,
asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask_lt));
uint32_t OmpId = __popc(ConvergentMask & lanemask_lt);
- int threadId = GetLogicalThreadIdInBlock();
+ int threadId = GetLogicalThreadIdInBlock(isSPMDMode());
int sourceThreadId = (threadId & ~(WARPSIZE - 1)) + *LaneSource;
ConvergentParallelJob *job = (ConvergentParallelJob *)buffer;
@@ -181,7 +181,7 @@ EXTERN bool __kmpc_kernel_convergent_parallel(void *buffer, uint32_t Mask,
EXTERN void __kmpc_kernel_end_convergent_parallel(void *buffer) {
PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_end_convergent_parallel\n");
// pop stack
- int threadId = GetLogicalThreadIdInBlock();
+ int threadId = GetLogicalThreadIdInBlock(isSPMDMode());
ConvergentParallelJob *job = (ConvergentParallelJob *)buffer;
omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
threadId, job->convHeadTaskDescr);
@@ -345,7 +345,7 @@ EXTERN void __kmpc_serialized_parallel(kmp_Ident *loc, uint32_t global_tid) {
}
// assume this is only called for nested parallel
- int threadId = GetLogicalThreadIdInBlock();
+ int threadId = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
// unlike actual parallel, threads in the same team do not share
// the workTaskDescr in this case and num threads is fixed to 1
@@ -384,7 +384,7 @@ EXTERN void __kmpc_end_serialized_parallel(kmp_Ident *loc,
}
// pop stack
- int threadId = GetLogicalThreadIdInBlock();
+ int threadId = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId);
// set new top
omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
@@ -404,7 +404,7 @@ EXTERN uint16_t __kmpc_parallel_level(kmp_Ident *loc, uint32_t global_tid) {
return omptarget_nvptx_simpleThreadPrivateContext->GetParallelLevel();
}
- int threadId = GetLogicalThreadIdInBlock();
+ int threadId = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
omptarget_nvptx_TaskDescr *currTaskDescr =
omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
if (currTaskDescr->InL2OrHigherParallelRegion())
@@ -420,7 +420,7 @@ EXTERN uint16_t __kmpc_parallel_level(kmp_Ident *loc, uint32_t global_tid) {
// it's cheap to recalculate this value so we never use the result
// of this call.
EXTERN int32_t __kmpc_global_thread_num(kmp_Ident *loc) {
- int tid = GetLogicalThreadIdInBlock();
+ int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
return GetOmpThreadId(tid, checkSPMDMode(loc),
checkRuntimeUninitialized(loc));
}
@@ -433,7 +433,7 @@ EXTERN void __kmpc_push_num_threads(kmp_Ident *loc, int32_t tid,
int32_t num_threads) {
PRINT(LD_IO, "call kmpc_push_num_threads %d\n", num_threads);
ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc), "Runtime must be initialized.");
- tid = GetLogicalThreadIdInBlock();
+ tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(tid) =
num_threads;
}
@@ -442,7 +442,7 @@ EXTERN void __kmpc_push_simd_limit(kmp_Ident *loc, int32_t tid,
int32_t simd_limit) {
PRINT(LD_IO, "call kmpc_push_simd_limit %d\n", (int)simd_limit);
ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc), "Runtime must be initialized.");
- tid = GetLogicalThreadIdInBlock();
+ tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(tid) = simd_limit;
}
diff --git a/libomptarget/deviceRTLs/nvptx/src/reduction.cu b/libomptarget/deviceRTLs/nvptx/src/reduction.cu
index c0d22df..fde1fde 100644
--- a/libomptarget/deviceRTLs/nvptx/src/reduction.cu
+++ b/libomptarget/deviceRTLs/nvptx/src/reduction.cu
@@ -20,8 +20,10 @@
// may eventually remove this
EXTERN
int32_t __gpu_block_reduce() {
- int tid = GetLogicalThreadIdInBlock();
- int nt = GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized());
+ bool isSPMDExecutionMode = isSPMDMode();
+ int tid = GetLogicalThreadIdInBlock(isSPMDExecutionMode);
+ int nt =
+ GetNumberOfOmpThreads(tid, isSPMDExecutionMode, isRuntimeUninitialized());
if (nt != blockDim.x)
return 0;
unsigned tnum = __ACTIVEMASK();
@@ -35,7 +37,7 @@ int32_t __kmpc_reduce_gpu(kmp_Ident *loc, int32_t global_tid, int32_t num_vars,
size_t reduce_size, void *reduce_data,
void *reduce_array_size, kmp_ReductFctPtr *reductFct,
kmp_CriticalName *lck) {
- int threadId = GetLogicalThreadIdInBlock();
+ int threadId = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId);
int numthread;
if (currTaskDescr->IsParallelConstruct()) {
@@ -150,7 +152,7 @@ int32_t nvptx_parallel_reduce_nowait(int32_t global_tid, int32_t num_vars,
kmp_InterWarpCopyFctPtr cpyFct,
bool isSPMDExecutionMode,
bool isRuntimeUninitialized) {
- uint32_t BlockThreadId = GetLogicalThreadIdInBlock();
+ uint32_t BlockThreadId = GetLogicalThreadIdInBlock(isSPMDExecutionMode);
uint32_t NumThreads = GetNumberOfOmpThreads(
BlockThreadId, isSPMDExecutionMode, isRuntimeUninitialized);
if (NumThreads == 1)
@@ -236,8 +238,7 @@ int32_t nvptx_parallel_reduce_nowait(int32_t global_tid, int32_t num_vars,
#endif // __CUDA_ARCH__ >= 700
}
-EXTERN
-int32_t __kmpc_nvptx_parallel_reduce_nowait(
+EXTERN __attribute__((deprecated)) int32_t __kmpc_nvptx_parallel_reduce_nowait(
int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct) {
return nvptx_parallel_reduce_nowait(
@@ -247,6 +248,16 @@ int32_t __kmpc_nvptx_parallel_reduce_nowait(
}
EXTERN
+int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(
+ kmp_Ident *loc, int32_t global_tid, int32_t num_vars, size_t reduce_size,
+ void *reduce_data, kmp_ShuffleReductFctPtr shflFct,
+ kmp_InterWarpCopyFctPtr cpyFct) {
+ return nvptx_parallel_reduce_nowait(
+ global_tid, num_vars, reduce_size, reduce_data, shflFct, cpyFct,
+ checkSPMDMode(loc), checkRuntimeUninitialized(loc));
+}
+
+EXTERN
int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_spmd(
int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct) {
@@ -272,7 +283,7 @@ int32_t nvptx_teams_reduce_nowait(
kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
kmp_CopyToScratchpadFctPtr scratchFct, kmp_LoadReduceFctPtr ldFct,
bool isSPMDExecutionMode, bool isRuntimeUninitialized) {
- uint32_t ThreadId = GetLogicalThreadIdInBlock();
+ uint32_t ThreadId = GetLogicalThreadIdInBlock(isSPMDExecutionMode);
// In non-generic mode all workers participate in the teams reduction.
// In generic mode only the team master participates in the teams
// reduction because the workers are waiting for parallel work.
diff --git a/libomptarget/deviceRTLs/nvptx/src/support.h b/libomptarget/deviceRTLs/nvptx/src/support.h
index 44298f4..9fe3749 100644
--- a/libomptarget/deviceRTLs/nvptx/src/support.h
+++ b/libomptarget/deviceRTLs/nvptx/src/support.h
@@ -43,7 +43,7 @@ INLINE int GetNumberOfBlocksInKernel();
INLINE int GetNumberOfThreadsInBlock();
// get global ids to locate tread/team info (constant regardless of OMP)
-INLINE int GetLogicalThreadIdInBlock();
+INLINE int GetLogicalThreadIdInBlock(bool isSPMDExecutionMode);
INLINE int GetMasterThreadID();
INLINE int GetNumberOfWorkersInTeam();
@@ -59,8 +59,8 @@ GetNumberOfOmpThreads(int threadId, bool isSPMDExecutionMode,
INLINE int GetNumberOfOmpTeams(); // omp_num_teams
// get OpenMP number of procs
-INLINE int GetNumberOfProcsInTeam();
-INLINE int GetNumberOfProcsInDevice();
+INLINE int GetNumberOfProcsInTeam(bool isSPMDExecutionMode);
+INLINE int GetNumberOfProcsInDevice(bool isSPMDExecutionMode);
// masters
INLINE int IsTeamMaster(int ompThreadId);
diff --git a/libomptarget/deviceRTLs/nvptx/src/supporti.h b/libomptarget/deviceRTLs/nvptx/src/supporti.h
index 06c8dae..ece3295 100644
--- a/libomptarget/deviceRTLs/nvptx/src/supporti.h
+++ b/libomptarget/deviceRTLs/nvptx/src/supporti.h
@@ -130,11 +130,11 @@ INLINE int GetNumberOfWorkersInTeam() { return GetMasterThreadID(); }
// or a serial region by the master. If the master (whose CUDA thread
// id is GetMasterThreadID()) calls this routine, we return 0 because
// it is a shadow for the first worker.
-INLINE int GetLogicalThreadIdInBlock() {
+INLINE int GetLogicalThreadIdInBlock(bool isSPMDExecutionMode) {
// Implemented using control flow (predication) instead of with a modulo
// operation.
int tid = GetThreadIdInBlock();
- if (isGenericMode() && tid >= GetMasterThreadID())
+ if (!isSPMDExecutionMode && tid >= GetMasterThreadID())
return 0;
else
return tid;
@@ -214,13 +214,15 @@ INLINE int IsTeamMaster(int ompThreadId) { return (ompThreadId == 0); }
// get OpenMP number of procs
// Get the number of processors in the device.
-INLINE int GetNumberOfProcsInDevice() {
- if (isGenericMode())
+INLINE int GetNumberOfProcsInDevice(bool isSPMDExecutionMode) {
+ if (!isSPMDExecutionMode)
return GetNumberOfWorkersInTeam();
return GetNumberOfThreadsInBlock();
}
-INLINE int GetNumberOfProcsInTeam() { return GetNumberOfProcsInDevice(); }
+INLINE int GetNumberOfProcsInTeam(bool isSPMDExecutionMode) {
+ return GetNumberOfProcsInDevice(isSPMDExecutionMode);
+}
////////////////////////////////////////////////////////////////////////////////
// Memory
diff --git a/libomptarget/deviceRTLs/nvptx/src/sync.cu b/libomptarget/deviceRTLs/nvptx/src/sync.cu
index 5f6aef9..c89dee2 100644
--- a/libomptarget/deviceRTLs/nvptx/src/sync.cu
+++ b/libomptarget/deviceRTLs/nvptx/src/sync.cu
@@ -46,7 +46,7 @@ EXTERN void __kmpc_barrier(kmp_Ident *loc_ref, int32_t tid) {
"Expected SPMD mode with uninitialized runtime.");
__kmpc_barrier_simple_spmd(loc_ref, tid);
} else {
- tid = GetLogicalThreadIdInBlock();
+ tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc_ref));
omptarget_nvptx_TaskDescr *currTaskDescr =
omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(tid);
int numberOfActiveOMPThreads = GetNumberOfOmpThreads(
diff --git a/libomptarget/deviceRTLs/nvptx/src/task.cu b/libomptarget/deviceRTLs/nvptx/src/task.cu
index 2f47d4b..a6eb9ab 100644
--- a/libomptarget/deviceRTLs/nvptx/src/task.cu
+++ b/libomptarget/deviceRTLs/nvptx/src/task.cu
@@ -96,7 +96,7 @@ EXTERN int32_t __kmpc_omp_task_with_deps(kmp_Ident *loc, uint32_t global_tid,
"bad assumptions");
// 2. push new context: update new task descriptor
- int tid = GetLogicalThreadIdInBlock();
+ int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
omptarget_nvptx_TaskDescr *parentTaskDescr = getMyTopTaskDescriptor(tid);
newTaskDescr->CopyForExplicitTask(parentTaskDescr);
// set new task descriptor as top
@@ -135,7 +135,7 @@ EXTERN void __kmpc_omp_task_begin_if0(kmp_Ident *loc, uint32_t global_tid,
"bad assumptions");
// 2. push new context: update new task descriptor
- int tid = GetLogicalThreadIdInBlock();
+ int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
omptarget_nvptx_TaskDescr *parentTaskDescr = getMyTopTaskDescriptor(tid);
newTaskDescr->CopyForExplicitTask(parentTaskDescr);
// set new task descriptor as top
@@ -163,7 +163,7 @@ EXTERN void __kmpc_omp_task_complete_if0(kmp_Ident *loc, uint32_t global_tid,
omptarget_nvptx_TaskDescr *parentTaskDescr = newTaskDescr->GetPrevTaskDescr();
// 3... noting to call... is inline
// 4. pop context
- int tid = GetLogicalThreadIdInBlock();
+ int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(tid,
parentTaskDescr);
// 5. free