diff options
author | Alexey Bataev <a.bataev@hotmail.com> | 2019-01-04 17:09:12 +0000 |
---|---|---|
committer | Alexey Bataev <a.bataev@hotmail.com> | 2019-01-04 17:09:12 +0000 |
commit | 900210a37567f84e17b8eae18568410fbac77877 (patch) | |
tree | df84202e87da83b267668040e767c052c5838e21 /libomptarget | |
parent | 58df0162f131f71d38ec0d4e13015ad7940922cb (diff) | |
download | openmp_llvm-900210a37567f84e17b8eae18568410fbac77877.tar.gz |
[OPENMP][NVPTX]Improve performance + reduce number of used registers.
Summary:
Reduced number of the used register + improved performance propagating
the information about current execution/data sharing mode directly from
the compiler, where it is possible.
In some cases, it requires new/reworked interfaces of the runtime
external functions. Old functions are marked as deprecated.
Reviewers: grokos, gtbercea, kkwli0
Subscribers: guansong, jfb, openmp-commits, caomhin
Differential Revision: https://reviews.llvm.org/D56278
git-svn-id: https://llvm.org/svn/llvm-project/openmp/trunk@350405 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'libomptarget')
-rw-r--r-- | libomptarget/deviceRTLs/nvptx/src/data_sharing.cu | 27 | ||||
-rw-r--r-- | libomptarget/deviceRTLs/nvptx/src/interface.h | 12 | ||||
-rw-r--r-- | libomptarget/deviceRTLs/nvptx/src/libcall.cu | 43 | ||||
-rw-r--r-- | libomptarget/deviceRTLs/nvptx/src/loop.cu | 27 | ||||
-rw-r--r-- | libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu | 6 | ||||
-rw-r--r-- | libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h | 7 | ||||
-rw-r--r-- | libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h | 14 | ||||
-rw-r--r-- | libomptarget/deviceRTLs/nvptx/src/parallel.cu | 20 | ||||
-rw-r--r-- | libomptarget/deviceRTLs/nvptx/src/reduction.cu | 25 | ||||
-rw-r--r-- | libomptarget/deviceRTLs/nvptx/src/support.h | 6 | ||||
-rw-r--r-- | libomptarget/deviceRTLs/nvptx/src/supporti.h | 12 | ||||
-rw-r--r-- | libomptarget/deviceRTLs/nvptx/src/sync.cu | 2 | ||||
-rw-r--r-- | libomptarget/deviceRTLs/nvptx/src/task.cu | 6 |
13 files changed, 124 insertions, 83 deletions
diff --git a/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu b/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu index 1f598ec..1522400 100644 --- a/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu +++ b/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu @@ -43,8 +43,8 @@ __device__ static bool IsWarpMasterActiveThread() { return (unsigned)Sh == 0; } // Return true if this is the master thread. -__device__ static bool IsMasterThread() { - return !isSPMDMode() && getMasterThreadId() == getThreadId(); +__device__ static bool IsMasterThread(bool isSPMDExecutionMode) { + return !isSPMDExecutionMode && getMasterThreadId() == getThreadId(); } /// Return the provided size aligned to the size of a pointer. @@ -88,7 +88,8 @@ __kmpc_initialize_data_sharing_environment(__kmpc_data_sharing_slot *rootS, omptarget_nvptx_TeamDescr *teamDescr = &omptarget_nvptx_threadPrivateContext->TeamContext(); - __kmpc_data_sharing_slot *RootS = teamDescr->RootS(WID, IsMasterThread()); + __kmpc_data_sharing_slot *RootS = + teamDescr->RootS(WID, IsMasterThread(isSPMDMode())); DataSharingState.SlotPtr[WID] = RootS; DataSharingState.StackPtr[WID] = (void *)&RootS->Data[0]; @@ -253,8 +254,9 @@ EXTERN void __kmpc_data_sharing_environment_end( // The master thread cleans the saved slot, because this is an environment // only for the master. - __kmpc_data_sharing_slot *S = - IsMasterThread() ? *SavedSharedSlot : DataSharingState.SlotPtr[WID]; + __kmpc_data_sharing_slot *S = IsMasterThread(isSPMDMode()) + ? *SavedSharedSlot + : DataSharingState.SlotPtr[WID]; if (S->Next) { free(S->Next); @@ -472,8 +474,9 @@ EXTERN void* __kmpc_data_sharing_push_stack(size_t DataSize, // space for the variables of each thread in the warp, // i.e. one DataSize chunk per warp lane. // TODO: change WARPSIZE to the number of active threads in the warp. - size_t PushSize = (isRuntimeUninitialized() || IsMasterThread()) ? - DataSize : WARPSIZE * DataSize; + size_t PushSize = (isRuntimeUninitialized() || IsMasterThread(isSPMDMode())) + ? DataSize + : WARPSIZE * DataSize; // Compute the start address of the frame of each thread in the warp. uintptr_t FrameStartAddress = @@ -553,14 +556,15 @@ EXTERN void __kmpc_get_shared_variables(void ***GlobalArgs) { // manage statically allocated global memory. This memory is allocated by the // compiler and used to correctly implement globalization of the variables in // target, teams and distribute regions. -EXTERN void __kmpc_get_team_static_memory(const void *buf, size_t size, +EXTERN void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode, + const void *buf, size_t size, int16_t is_shared, const void **frame) { if (is_shared) { *frame = buf; return; } - if (isSPMDMode()) { + if (isSPMDExecutionMode) { if (GetThreadIdInBlock() == 0) { *frame = omptarget_nvptx_simpleMemoryManager.Acquire(buf, size); } @@ -574,10 +578,11 @@ EXTERN void __kmpc_get_team_static_memory(const void *buf, size_t size, __threadfence(); } -EXTERN void __kmpc_restore_team_static_memory(int16_t is_shared) { +EXTERN void __kmpc_restore_team_static_memory(int16_t isSPMDExecutionMode, + int16_t is_shared) { if (is_shared) return; - if (isSPMDMode()) { + if (isSPMDExecutionMode) { // FIXME: use __syncthreads instead when the function copy is fixed in LLVM. __SYNCTHREADS(); if (GetThreadIdInBlock() == 0) { diff --git a/libomptarget/deviceRTLs/nvptx/src/interface.h b/libomptarget/deviceRTLs/nvptx/src/interface.h index 2c2beae..558860b 100644 --- a/libomptarget/deviceRTLs/nvptx/src/interface.h +++ b/libomptarget/deviceRTLs/nvptx/src/interface.h @@ -395,9 +395,13 @@ EXTERN void __kmpc_reduce_conditional_lastprivate(kmp_Ident *loc, // reduction EXTERN void __kmpc_nvptx_end_reduce(int32_t global_tid); EXTERN void __kmpc_nvptx_end_reduce_nowait(int32_t global_tid); -EXTERN int32_t __kmpc_nvptx_parallel_reduce_nowait( +EXTERN __attribute__((deprecated)) int32_t __kmpc_nvptx_parallel_reduce_nowait( int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct); +EXTERN int32_t __kmpc_nvptx_parallel_reduce_nowait_v2( + kmp_Ident *loc, int32_t global_tid, int32_t num_vars, size_t reduce_size, + void *reduce_data, kmp_ShuffleReductFctPtr shflFct, + kmp_InterWarpCopyFctPtr cpyFct); EXTERN int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_spmd( int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct); @@ -550,9 +554,11 @@ __kmpc_get_data_sharing_environment_frame(int32_t SourceThreadID, // SPMD execution mode interrogation function. EXTERN int8_t __kmpc_is_spmd_exec_mode(); -EXTERN void __kmpc_get_team_static_memory(const void *buf, size_t size, +EXTERN void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode, + const void *buf, size_t size, int16_t is_shared, const void **res); -EXTERN void __kmpc_restore_team_static_memory(int16_t is_shared); +EXTERN void __kmpc_restore_team_static_memory(int16_t isSPMDExecutionMode, + int16_t is_shared); #endif diff --git a/libomptarget/deviceRTLs/nvptx/src/libcall.cu b/libomptarget/deviceRTLs/nvptx/src/libcall.cu index 91b270c..9abe599 100644 --- a/libomptarget/deviceRTLs/nvptx/src/libcall.cu +++ b/libomptarget/deviceRTLs/nvptx/src/libcall.cu @@ -39,14 +39,17 @@ EXTERN void omp_set_num_threads(int num) { if (num <= 0) { WARNING0(LW_INPUT, "expected positive num; ignore\n"); } else { - omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(); + omptarget_nvptx_TaskDescr *currTaskDescr = + getMyTopTaskDescriptor(/*isSPMDExecutionMode=*/false); currTaskDescr->NThreads() = num; } } EXTERN int omp_get_num_threads(void) { - int tid = GetLogicalThreadIdInBlock(); - int rc = GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()); + bool isSPMDExecutionMode = isSPMDMode(); + int tid = GetLogicalThreadIdInBlock(isSPMDExecutionMode); + int rc = + GetNumberOfOmpThreads(tid, isSPMDExecutionMode, isRuntimeUninitialized()); PRINT(LD_IO, "call omp_get_num_threads() return %d\n", rc); return rc; } @@ -58,7 +61,8 @@ EXTERN int omp_get_max_threads(void) { // We're already in parallel region. return 1; // default is 1 thread avail } - omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(); + omptarget_nvptx_TaskDescr *currTaskDescr = + getMyTopTaskDescriptor(isSPMDMode()); int rc = 1; // default is 1 thread avail if (!currTaskDescr->InParallelRegion()) { // Not currently in a parallel region, return what was set. @@ -76,21 +80,23 @@ EXTERN int omp_get_thread_limit(void) { return 0; // default is 0 } // per contention group.. meaning threads in current team - omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(); + omptarget_nvptx_TaskDescr *currTaskDescr = + getMyTopTaskDescriptor(isSPMDMode()); int rc = currTaskDescr->ThreadLimit(); PRINT(LD_IO, "call omp_get_thread_limit() return %d\n", rc); return rc; } EXTERN int omp_get_thread_num() { - int tid = GetLogicalThreadIdInBlock(); - int rc = GetOmpThreadId(tid, isSPMDMode(), isRuntimeUninitialized()); + bool isSPMDExecutionMode = isSPMDMode(); + int tid = GetLogicalThreadIdInBlock(isSPMDExecutionMode); + int rc = GetOmpThreadId(tid, isSPMDExecutionMode, isRuntimeUninitialized()); PRINT(LD_IO, "call omp_get_thread_num() returns %d\n", rc); return rc; } EXTERN int omp_get_num_procs(void) { - int rc = GetNumberOfProcsInDevice(); + int rc = GetNumberOfProcsInDevice(isSPMDMode()); PRINT(LD_IO, "call omp_get_num_procs() returns %d\n", rc); return rc; } @@ -102,7 +108,8 @@ EXTERN int omp_in_parallel(void) { "Expected SPMD mode only with uninitialized runtime."); rc = 1; // SPMD mode is always in parallel. } else { - omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(); + omptarget_nvptx_TaskDescr *currTaskDescr = + getMyTopTaskDescriptor(isSPMDMode()); if (currTaskDescr->InParallelRegion()) { rc = 1; } @@ -161,7 +168,8 @@ EXTERN int omp_get_level(void) { return omptarget_nvptx_simpleThreadPrivateContext->GetParallelLevel(); } int level = 0; - omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(); + omptarget_nvptx_TaskDescr *currTaskDescr = + getMyTopTaskDescriptor(isSPMDMode()); ASSERT0(LT_FUSSY, currTaskDescr, "do not expect fct to be called in a non-active thread"); do { @@ -181,7 +189,8 @@ EXTERN int omp_get_active_level(void) { return 1; } int level = 0; // no active level parallelism - omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(); + omptarget_nvptx_TaskDescr *currTaskDescr = + getMyTopTaskDescriptor(isSPMDMode()); ASSERT0(LT_FUSSY, currTaskDescr, "do not expect fct to be called in a non-active thread"); do { @@ -208,7 +217,8 @@ EXTERN int omp_get_ancestor_thread_num(int level) { } else if (level > 0) { int totLevel = omp_get_level(); if (level <= totLevel) { - omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(); + omptarget_nvptx_TaskDescr *currTaskDescr = + getMyTopTaskDescriptor(isSPMDMode()); int steps = totLevel - level; PRINT(LD_IO, "backtrack %d steps\n", steps); ASSERT0(LT_FUSSY, currTaskDescr, @@ -259,7 +269,8 @@ EXTERN int omp_get_team_size(int level) { } else if (level > 0) { int totLevel = omp_get_level(); if (level <= totLevel) { - omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(); + omptarget_nvptx_TaskDescr *currTaskDescr = + getMyTopTaskDescriptor(isSPMDMode()); int steps = totLevel - level; ASSERT0(LT_FUSSY, currTaskDescr, "do not expect fct to be called in a non-active thread"); @@ -288,7 +299,8 @@ EXTERN void omp_get_schedule(omp_sched_t *kind, int *modifier) { *kind = omp_sched_static; *modifier = 1; } else { - omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(); + omptarget_nvptx_TaskDescr *currTaskDescr = + getMyTopTaskDescriptor(isSPMDMode()); *kind = currTaskDescr->GetRuntimeSched(); *modifier = currTaskDescr->RuntimeChunkSize(); } @@ -305,7 +317,8 @@ EXTERN void omp_set_schedule(omp_sched_t kind, int modifier) { return; } if (kind >= omp_sched_static && kind < omp_sched_auto) { - omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(); + omptarget_nvptx_TaskDescr *currTaskDescr = + getMyTopTaskDescriptor(isSPMDMode()); currTaskDescr->SetRuntimeSched(kind); currTaskDescr->RuntimeChunkSize() = modifier; PRINT(LD_IOD, "omp_set_schedule did set sched %d & modif %" PRIu64 "\n", diff --git a/libomptarget/deviceRTLs/nvptx/src/loop.cu b/libomptarget/deviceRTLs/nvptx/src/loop.cu index c100be5..fdddf30 100644 --- a/libomptarget/deviceRTLs/nvptx/src/loop.cu +++ b/libomptarget/deviceRTLs/nvptx/src/loop.cu @@ -101,7 +101,7 @@ public: // When IsRuntimeUninitialized is true, we assume that the caller is // in an L0 parallel region and that all worker threads participate. - int tid = GetLogicalThreadIdInBlock(); + int tid = GetLogicalThreadIdInBlock(IsSPMDExecutionMode); // Assume we are in teams region or that we use a single block // per target region @@ -208,7 +208,7 @@ public: ST chunk) { ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc), "Expected non-SPMD mode + initialized runtime."); - int tid = GetLogicalThreadIdInBlock(); + int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(tid); T tnum = currTaskDescr->ThreadsInTeam(); T tripCount = ub - lb + 1; // +1 because ub is inclusive @@ -417,17 +417,18 @@ public: // On Pascal, with inlining of the runtime into the user application, // this code deadlocks. This is probably because different threads // in a warp cannot make independent progress. - NOINLINE static int dispatch_next(int32_t gtid, int32_t *plast, T *plower, - T *pupper, ST *pstride) { - ASSERT0(LT_FUSSY, isRuntimeInitialized(), + NOINLINE static int dispatch_next(kmp_Ident *loc, int32_t gtid, + int32_t *plast, T *plower, T *pupper, + ST *pstride) { + ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc), "Expected non-SPMD mode + initialized runtime."); // ID of a thread in its own warp // automatically selects thread or warp ID based on selected implementation - int tid = GetLogicalThreadIdInBlock(); + int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); ASSERT0(LT_FUSSY, - gtid < GetNumberOfOmpThreads(tid, isSPMDMode(), - isRuntimeUninitialized()), + gtid < GetNumberOfOmpThreads(tid, checkSPMDMode(loc), + checkRuntimeUninitialized(loc)), "current thread is not needed here; error"); // retrieve schedule kmp_sched_t schedule = @@ -540,7 +541,7 @@ EXTERN int __kmpc_dispatch_next_4(kmp_Ident *loc, int32_t tid, int32_t *p_last, int32_t *p_lb, int32_t *p_ub, int32_t *p_st) { PRINT0(LD_IO, "call kmpc_dispatch_next_4\n"); return omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_next( - tid, p_last, p_lb, p_ub, p_st); + loc, tid, p_last, p_lb, p_ub, p_st); } EXTERN int __kmpc_dispatch_next_4u(kmp_Ident *loc, int32_t tid, @@ -548,14 +549,14 @@ EXTERN int __kmpc_dispatch_next_4u(kmp_Ident *loc, int32_t tid, uint32_t *p_ub, int32_t *p_st) { PRINT0(LD_IO, "call kmpc_dispatch_next_4u\n"); return omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_next( - tid, p_last, p_lb, p_ub, p_st); + loc, tid, p_last, p_lb, p_ub, p_st); } EXTERN int __kmpc_dispatch_next_8(kmp_Ident *loc, int32_t tid, int32_t *p_last, int64_t *p_lb, int64_t *p_ub, int64_t *p_st) { PRINT0(LD_IO, "call kmpc_dispatch_next_8\n"); return omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_next( - tid, p_last, p_lb, p_ub, p_st); + loc, tid, p_last, p_lb, p_ub, p_st); } EXTERN int __kmpc_dispatch_next_8u(kmp_Ident *loc, int32_t tid, @@ -563,7 +564,7 @@ EXTERN int __kmpc_dispatch_next_8u(kmp_Ident *loc, int32_t tid, uint64_t *p_ub, int64_t *p_st) { PRINT0(LD_IO, "call kmpc_dispatch_next_8u\n"); return omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_next( - tid, p_last, p_lb, p_ub, p_st); + loc, tid, p_last, p_lb, p_ub, p_st); } // fini @@ -756,7 +757,7 @@ EXTERN void __kmpc_reduce_conditional_lastprivate(kmp_Ident *loc, int32_t gtid, "Expected non-SPMD mode + initialized runtime."); omptarget_nvptx_TeamDescr &teamDescr = getMyTeamDescriptor(); - int tid = GetLogicalThreadIdInBlock(); + int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); uint32_t NumThreads = GetNumberOfOmpThreads(tid, checkSPMDMode(loc), checkRuntimeUninitialized(loc)); uint64_t *Buffer = teamDescr.getLastprivateIterBuffer(); diff --git a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu index dd51786..f8610a5 100644 --- a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu +++ b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu @@ -61,12 +61,12 @@ EXTERN void __kmpc_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime) { omptarget_nvptx_device_State[slot].Dequeue(); // init thread private - int threadId = GetLogicalThreadIdInBlock(); + int threadId = GetLogicalThreadIdInBlock(/*isSPMDExecutionMode=*/false); omptarget_nvptx_threadPrivateContext->InitThreadPrivateContext(threadId); // init team context omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor(); - currTeamDescr.InitTeamDescr(); + currTeamDescr.InitTeamDescr(/*isSPMDExecutionMode=*/false); // this thread will start execution... has to update its task ICV // to point to the level zero task ICV. That ICV was init in // InitTeamDescr() @@ -128,7 +128,7 @@ EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime, omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor(); omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor(); // init team context - currTeamDescr.InitTeamDescr(); + currTeamDescr.InitTeamDescr(/*isSPMDExecutionMode=*/true); } // FIXME: use __syncthreads instead when the function copy is fixed in LLVM. __SYNCTHREADS(); diff --git a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h index 6539756..4a12ff3 100644 --- a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h +++ b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h @@ -176,7 +176,7 @@ public: prev = taskDescr; } // init & copy - INLINE void InitLevelZeroTaskDescr(); + INLINE void InitLevelZeroTaskDescr(bool isSPMDExecutionMode); INLINE void InitLevelOneTaskDescr(uint16_t tnum, omptarget_nvptx_TaskDescr *parentTaskDescr); INLINE void Copy(omptarget_nvptx_TaskDescr *sourceTaskDescr); @@ -257,7 +257,7 @@ public: INLINE uint64_t *getLastprivateIterBuffer() { return &lastprivateIterBuffer; } // init - INLINE void InitTeamDescr(); + INLINE void InitTeamDescr(bool isSPMDExecutionMode); INLINE __kmpc_data_sharing_slot *RootS(int wid, bool IsMasterThread) { // If this is invoked by the master thread of the master warp then intialize @@ -462,7 +462,8 @@ extern volatile __device__ __shared__ omptarget_nvptx_WorkFn INLINE omptarget_nvptx_TeamDescr &getMyTeamDescriptor(); INLINE omptarget_nvptx_WorkDescr &getMyWorkDescriptor(); -INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor(); +INLINE omptarget_nvptx_TaskDescr * +getMyTopTaskDescriptor(bool isSPMDExecutionMode); INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor(int globalThreadId); //////////////////////////////////////////////////////////////////////////////// diff --git a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h index f325601..2e834cc 100644 --- a/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h +++ b/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h @@ -31,7 +31,8 @@ INLINE void omptarget_nvptx_TaskDescr::SetRuntimeSched(omp_sched_t sched) { items.flags |= val; } -INLINE void omptarget_nvptx_TaskDescr::InitLevelZeroTaskDescr() { +INLINE void +omptarget_nvptx_TaskDescr::InitLevelZeroTaskDescr(bool isSPMDExecutionMode) { // slow method // flag: // default sched is static, @@ -39,7 +40,7 @@ INLINE void omptarget_nvptx_TaskDescr::InitLevelZeroTaskDescr() { // not in parallel items.flags = 0; - items.nthreads = GetNumberOfProcsInTeam(); + items.nthreads = GetNumberOfProcsInTeam(isSPMDExecutionMode); ; // threads: whatever was alloc by kernel items.threadId = 0; // is master items.threadsInTeam = 1; // sequential @@ -177,8 +178,8 @@ omptarget_nvptx_ThreadPrivateContext::InitThreadPrivateContext(int tid) { // Team Descriptor //////////////////////////////////////////////////////////////////////////////// -INLINE void omptarget_nvptx_TeamDescr::InitTeamDescr() { - levelZeroTaskDescr.InitLevelZeroTaskDescr(); +INLINE void omptarget_nvptx_TeamDescr::InitTeamDescr(bool isSPMDExecutionMode) { + levelZeroTaskDescr.InitLevelZeroTaskDescr(isSPMDExecutionMode); } //////////////////////////////////////////////////////////////////////////////// @@ -199,8 +200,9 @@ INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor(int threadId) { return omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId); } -INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor() { - return getMyTopTaskDescriptor(GetLogicalThreadIdInBlock()); +INLINE omptarget_nvptx_TaskDescr * +getMyTopTaskDescriptor(bool isSPMDExecutionMode) { + return getMyTopTaskDescriptor(GetLogicalThreadIdInBlock(isSPMDExecutionMode)); } //////////////////////////////////////////////////////////////////////////////// diff --git a/libomptarget/deviceRTLs/nvptx/src/parallel.cu b/libomptarget/deviceRTLs/nvptx/src/parallel.cu index 8aea26d..aa0b9cf 100644 --- a/libomptarget/deviceRTLs/nvptx/src/parallel.cu +++ b/libomptarget/deviceRTLs/nvptx/src/parallel.cu @@ -57,7 +57,7 @@ EXTERN bool __kmpc_kernel_convergent_simd(void *buffer, uint32_t Mask, asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask_lt)); *LaneId = __popc(ConvergentMask & lanemask_lt); - int threadId = GetLogicalThreadIdInBlock(); + int threadId = GetLogicalThreadIdInBlock(isSPMDMode()); int sourceThreadId = (threadId & ~(WARPSIZE - 1)) + *LaneSource; ConvergentSimdJob *job = (ConvergentSimdJob *)buffer; @@ -101,7 +101,7 @@ EXTERN bool __kmpc_kernel_convergent_simd(void *buffer, uint32_t Mask, EXTERN void __kmpc_kernel_end_convergent_simd(void *buffer) { PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_end_convergent_parallel\n"); // pop stack - int threadId = GetLogicalThreadIdInBlock(); + int threadId = GetLogicalThreadIdInBlock(isSPMDMode()); ConvergentSimdJob *job = (ConvergentSimdJob *)buffer; omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId) = job->slimForNextSimd; @@ -131,7 +131,7 @@ EXTERN bool __kmpc_kernel_convergent_parallel(void *buffer, uint32_t Mask, asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask_lt)); uint32_t OmpId = __popc(ConvergentMask & lanemask_lt); - int threadId = GetLogicalThreadIdInBlock(); + int threadId = GetLogicalThreadIdInBlock(isSPMDMode()); int sourceThreadId = (threadId & ~(WARPSIZE - 1)) + *LaneSource; ConvergentParallelJob *job = (ConvergentParallelJob *)buffer; @@ -181,7 +181,7 @@ EXTERN bool __kmpc_kernel_convergent_parallel(void *buffer, uint32_t Mask, EXTERN void __kmpc_kernel_end_convergent_parallel(void *buffer) { PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_end_convergent_parallel\n"); // pop stack - int threadId = GetLogicalThreadIdInBlock(); + int threadId = GetLogicalThreadIdInBlock(isSPMDMode()); ConvergentParallelJob *job = (ConvergentParallelJob *)buffer; omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr( threadId, job->convHeadTaskDescr); @@ -345,7 +345,7 @@ EXTERN void __kmpc_serialized_parallel(kmp_Ident *loc, uint32_t global_tid) { } // assume this is only called for nested parallel - int threadId = GetLogicalThreadIdInBlock(); + int threadId = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); // unlike actual parallel, threads in the same team do not share // the workTaskDescr in this case and num threads is fixed to 1 @@ -384,7 +384,7 @@ EXTERN void __kmpc_end_serialized_parallel(kmp_Ident *loc, } // pop stack - int threadId = GetLogicalThreadIdInBlock(); + int threadId = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId); // set new top omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr( @@ -404,7 +404,7 @@ EXTERN uint16_t __kmpc_parallel_level(kmp_Ident *loc, uint32_t global_tid) { return omptarget_nvptx_simpleThreadPrivateContext->GetParallelLevel(); } - int threadId = GetLogicalThreadIdInBlock(); + int threadId = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); omptarget_nvptx_TaskDescr *currTaskDescr = omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId); if (currTaskDescr->InL2OrHigherParallelRegion()) @@ -420,7 +420,7 @@ EXTERN uint16_t __kmpc_parallel_level(kmp_Ident *loc, uint32_t global_tid) { // it's cheap to recalculate this value so we never use the result // of this call. EXTERN int32_t __kmpc_global_thread_num(kmp_Ident *loc) { - int tid = GetLogicalThreadIdInBlock(); + int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); return GetOmpThreadId(tid, checkSPMDMode(loc), checkRuntimeUninitialized(loc)); } @@ -433,7 +433,7 @@ EXTERN void __kmpc_push_num_threads(kmp_Ident *loc, int32_t tid, int32_t num_threads) { PRINT(LD_IO, "call kmpc_push_num_threads %d\n", num_threads); ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc), "Runtime must be initialized."); - tid = GetLogicalThreadIdInBlock(); + tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(tid) = num_threads; } @@ -442,7 +442,7 @@ EXTERN void __kmpc_push_simd_limit(kmp_Ident *loc, int32_t tid, int32_t simd_limit) { PRINT(LD_IO, "call kmpc_push_simd_limit %d\n", (int)simd_limit); ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc), "Runtime must be initialized."); - tid = GetLogicalThreadIdInBlock(); + tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(tid) = simd_limit; } diff --git a/libomptarget/deviceRTLs/nvptx/src/reduction.cu b/libomptarget/deviceRTLs/nvptx/src/reduction.cu index c0d22df..fde1fde 100644 --- a/libomptarget/deviceRTLs/nvptx/src/reduction.cu +++ b/libomptarget/deviceRTLs/nvptx/src/reduction.cu @@ -20,8 +20,10 @@ // may eventually remove this EXTERN int32_t __gpu_block_reduce() { - int tid = GetLogicalThreadIdInBlock(); - int nt = GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()); + bool isSPMDExecutionMode = isSPMDMode(); + int tid = GetLogicalThreadIdInBlock(isSPMDExecutionMode); + int nt = + GetNumberOfOmpThreads(tid, isSPMDExecutionMode, isRuntimeUninitialized()); if (nt != blockDim.x) return 0; unsigned tnum = __ACTIVEMASK(); @@ -35,7 +37,7 @@ int32_t __kmpc_reduce_gpu(kmp_Ident *loc, int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, void *reduce_array_size, kmp_ReductFctPtr *reductFct, kmp_CriticalName *lck) { - int threadId = GetLogicalThreadIdInBlock(); + int threadId = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId); int numthread; if (currTaskDescr->IsParallelConstruct()) { @@ -150,7 +152,7 @@ int32_t nvptx_parallel_reduce_nowait(int32_t global_tid, int32_t num_vars, kmp_InterWarpCopyFctPtr cpyFct, bool isSPMDExecutionMode, bool isRuntimeUninitialized) { - uint32_t BlockThreadId = GetLogicalThreadIdInBlock(); + uint32_t BlockThreadId = GetLogicalThreadIdInBlock(isSPMDExecutionMode); uint32_t NumThreads = GetNumberOfOmpThreads( BlockThreadId, isSPMDExecutionMode, isRuntimeUninitialized); if (NumThreads == 1) @@ -236,8 +238,7 @@ int32_t nvptx_parallel_reduce_nowait(int32_t global_tid, int32_t num_vars, #endif // __CUDA_ARCH__ >= 700 } -EXTERN -int32_t __kmpc_nvptx_parallel_reduce_nowait( +EXTERN __attribute__((deprecated)) int32_t __kmpc_nvptx_parallel_reduce_nowait( int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct) { return nvptx_parallel_reduce_nowait( @@ -247,6 +248,16 @@ int32_t __kmpc_nvptx_parallel_reduce_nowait( } EXTERN +int32_t __kmpc_nvptx_parallel_reduce_nowait_v2( + kmp_Ident *loc, int32_t global_tid, int32_t num_vars, size_t reduce_size, + void *reduce_data, kmp_ShuffleReductFctPtr shflFct, + kmp_InterWarpCopyFctPtr cpyFct) { + return nvptx_parallel_reduce_nowait( + global_tid, num_vars, reduce_size, reduce_data, shflFct, cpyFct, + checkSPMDMode(loc), checkRuntimeUninitialized(loc)); +} + +EXTERN int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_spmd( int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct) { @@ -272,7 +283,7 @@ int32_t nvptx_teams_reduce_nowait( kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct, kmp_CopyToScratchpadFctPtr scratchFct, kmp_LoadReduceFctPtr ldFct, bool isSPMDExecutionMode, bool isRuntimeUninitialized) { - uint32_t ThreadId = GetLogicalThreadIdInBlock(); + uint32_t ThreadId = GetLogicalThreadIdInBlock(isSPMDExecutionMode); // In non-generic mode all workers participate in the teams reduction. // In generic mode only the team master participates in the teams // reduction because the workers are waiting for parallel work. diff --git a/libomptarget/deviceRTLs/nvptx/src/support.h b/libomptarget/deviceRTLs/nvptx/src/support.h index 44298f4..9fe3749 100644 --- a/libomptarget/deviceRTLs/nvptx/src/support.h +++ b/libomptarget/deviceRTLs/nvptx/src/support.h @@ -43,7 +43,7 @@ INLINE int GetNumberOfBlocksInKernel(); INLINE int GetNumberOfThreadsInBlock(); // get global ids to locate tread/team info (constant regardless of OMP) -INLINE int GetLogicalThreadIdInBlock(); +INLINE int GetLogicalThreadIdInBlock(bool isSPMDExecutionMode); INLINE int GetMasterThreadID(); INLINE int GetNumberOfWorkersInTeam(); @@ -59,8 +59,8 @@ GetNumberOfOmpThreads(int threadId, bool isSPMDExecutionMode, INLINE int GetNumberOfOmpTeams(); // omp_num_teams // get OpenMP number of procs -INLINE int GetNumberOfProcsInTeam(); -INLINE int GetNumberOfProcsInDevice(); +INLINE int GetNumberOfProcsInTeam(bool isSPMDExecutionMode); +INLINE int GetNumberOfProcsInDevice(bool isSPMDExecutionMode); // masters INLINE int IsTeamMaster(int ompThreadId); diff --git a/libomptarget/deviceRTLs/nvptx/src/supporti.h b/libomptarget/deviceRTLs/nvptx/src/supporti.h index 06c8dae..ece3295 100644 --- a/libomptarget/deviceRTLs/nvptx/src/supporti.h +++ b/libomptarget/deviceRTLs/nvptx/src/supporti.h @@ -130,11 +130,11 @@ INLINE int GetNumberOfWorkersInTeam() { return GetMasterThreadID(); } // or a serial region by the master. If the master (whose CUDA thread // id is GetMasterThreadID()) calls this routine, we return 0 because // it is a shadow for the first worker. -INLINE int GetLogicalThreadIdInBlock() { +INLINE int GetLogicalThreadIdInBlock(bool isSPMDExecutionMode) { // Implemented using control flow (predication) instead of with a modulo // operation. int tid = GetThreadIdInBlock(); - if (isGenericMode() && tid >= GetMasterThreadID()) + if (!isSPMDExecutionMode && tid >= GetMasterThreadID()) return 0; else return tid; @@ -214,13 +214,15 @@ INLINE int IsTeamMaster(int ompThreadId) { return (ompThreadId == 0); } // get OpenMP number of procs // Get the number of processors in the device. -INLINE int GetNumberOfProcsInDevice() { - if (isGenericMode()) +INLINE int GetNumberOfProcsInDevice(bool isSPMDExecutionMode) { + if (!isSPMDExecutionMode) return GetNumberOfWorkersInTeam(); return GetNumberOfThreadsInBlock(); } -INLINE int GetNumberOfProcsInTeam() { return GetNumberOfProcsInDevice(); } +INLINE int GetNumberOfProcsInTeam(bool isSPMDExecutionMode) { + return GetNumberOfProcsInDevice(isSPMDExecutionMode); +} //////////////////////////////////////////////////////////////////////////////// // Memory diff --git a/libomptarget/deviceRTLs/nvptx/src/sync.cu b/libomptarget/deviceRTLs/nvptx/src/sync.cu index 5f6aef9..c89dee2 100644 --- a/libomptarget/deviceRTLs/nvptx/src/sync.cu +++ b/libomptarget/deviceRTLs/nvptx/src/sync.cu @@ -46,7 +46,7 @@ EXTERN void __kmpc_barrier(kmp_Ident *loc_ref, int32_t tid) { "Expected SPMD mode with uninitialized runtime."); __kmpc_barrier_simple_spmd(loc_ref, tid); } else { - tid = GetLogicalThreadIdInBlock(); + tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc_ref)); omptarget_nvptx_TaskDescr *currTaskDescr = omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(tid); int numberOfActiveOMPThreads = GetNumberOfOmpThreads( diff --git a/libomptarget/deviceRTLs/nvptx/src/task.cu b/libomptarget/deviceRTLs/nvptx/src/task.cu index 2f47d4b..a6eb9ab 100644 --- a/libomptarget/deviceRTLs/nvptx/src/task.cu +++ b/libomptarget/deviceRTLs/nvptx/src/task.cu @@ -96,7 +96,7 @@ EXTERN int32_t __kmpc_omp_task_with_deps(kmp_Ident *loc, uint32_t global_tid, "bad assumptions"); // 2. push new context: update new task descriptor - int tid = GetLogicalThreadIdInBlock(); + int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); omptarget_nvptx_TaskDescr *parentTaskDescr = getMyTopTaskDescriptor(tid); newTaskDescr->CopyForExplicitTask(parentTaskDescr); // set new task descriptor as top @@ -135,7 +135,7 @@ EXTERN void __kmpc_omp_task_begin_if0(kmp_Ident *loc, uint32_t global_tid, "bad assumptions"); // 2. push new context: update new task descriptor - int tid = GetLogicalThreadIdInBlock(); + int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); omptarget_nvptx_TaskDescr *parentTaskDescr = getMyTopTaskDescriptor(tid); newTaskDescr->CopyForExplicitTask(parentTaskDescr); // set new task descriptor as top @@ -163,7 +163,7 @@ EXTERN void __kmpc_omp_task_complete_if0(kmp_Ident *loc, uint32_t global_tid, omptarget_nvptx_TaskDescr *parentTaskDescr = newTaskDescr->GetPrevTaskDescr(); // 3... noting to call... is inline // 4. pop context - int tid = GetLogicalThreadIdInBlock(); + int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(tid, parentTaskDescr); // 5. free |