From 71909c57ca96f50978a2a6b9507f47a8d99326a7 Mon Sep 17 00:00:00 2001
From: Jonathan Peyton
Date: Wed, 2 Mar 2016 22:42:06 +0000
Subject: [PATCH] Add new OpenMP 4.5 doacross loop nest feature

From the standard: A doacross loop nest is a loop nest that has
cross-iteration dependences. An iteration is dependent on one or more
lexicographically earlier iterations. The ordered clause parameter on a
loop directive identifies the loop(s) associated with the doacross loop
nest.

The init/fini routines allocate/free doacross buffer(s) for each loop for
each thread. The wait routine waits on the flag designated by a dependence
vector. The post routine sets the flag designated by the current iteration
vector. We use a shared-buffer-index technique, similar to the one used for
the dispatch buffers, that covers up to 7 nowait loops executed
simultaneously by different threads (the number 7 has no special meaning;
it is a heuristic value). Also, the sizes of the affected structures are
kept intact by shrinking their dummy arrays.

This needs to be put into the OpenMP runtime library first, so that the
compiler team can develop the compiler side of the implementation.

Differential Revision: http://reviews.llvm.org/D17399

llvm-svn: 262532
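For illustration only (the compiler side is not part of this patch, so the
lowering below is a sketch; loc and gtid are the usual kmpc arguments, and
N, M, b, and f are hypothetical):

    // Source loop nest:
    //   #pragma omp for ordered(2)
    //   for (i = 0; i < N; i++)
    //     for (j = 0; j < M; j++) {
    //       #pragma omp ordered depend(sink: i-1,j)
    //       b[i][j] = f(b[i-1][j]);
    //       #pragma omp ordered depend(source)
    //     }
    //
    // Calls a compiler might emit per thread (distribution of iterations
    // across threads elided for clarity):
    struct kmp_dim dims[2] = { { 0, N - 1, 1 },    // i: lo, up, st (inclusive bounds)
                               { 0, M - 1, 1 } };  // j: lo, up, st
    __kmpc_doacross_init(loc, gtid, 2, dims);
    for (kmp_int64 i = 0; i < N; i++) {
        for (kmp_int64 j = 0; j < M; j++) {
            kmp_int64 sink[2] = { i - 1, j };
            __kmpc_doacross_wait(loc, gtid, sink);   // no-op when i-1 is out of bounds
            b[i][j] = f(b[i - 1][j]);
            kmp_int64 source[2] = { i, j };
            __kmpc_doacross_post(loc, gtid, source); // flag iteration (i,j) as done
        }
    }
    __kmpc_doacross_fini(loc, gtid);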
---
 openmp/runtime/src/dllexports       |   4 +
 openmp/runtime/src/kmp.h            |  26 ++-
 openmp/runtime/src/kmp_csupport.c   | 289 ++++++++++++++++++++++++++++
 openmp/runtime/src/kmp_dispatch.cpp |   7 +-
 openmp/runtime/src/kmp_runtime.c    |  23 ++-
 5 files changed, 341 insertions(+), 8 deletions(-)

diff --git a/openmp/runtime/src/dllexports b/openmp/runtime/src/dllexports
index a8a70c0a364b..6ff52521e502 100644
--- a/openmp/runtime/src/dllexports
+++ b/openmp/runtime/src/dllexports
@@ -389,6 +389,10 @@ kmpc_set_defaults                           224
     %ifdef OMP_41
         __kmpc_proxy_task_completed                 259
         __kmpc_proxy_task_completed_ooo             260
+        __kmpc_doacross_init                        261
+        __kmpc_doacross_wait                        262
+        __kmpc_doacross_post                        263
+        __kmpc_doacross_fini                        264
     %endif
 %endif

diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h
index 71bb323159f0..140bdd8caef4 100644
--- a/openmp/runtime/src/kmp.h
+++ b/openmp/runtime/src/kmp.h
@@ -1665,7 +1665,7 @@ typedef struct dispatch_shared_info64 {
     volatile kmp_uint64     iteration;
     volatile kmp_uint64     num_done;
     volatile kmp_uint64     ordered_iteration;
-    kmp_int64   ordered_dummy[KMP_MAX_ORDERED-1]; // to retain the structure size after making ordered_iteration scalar
+    kmp_int64   ordered_dummy[KMP_MAX_ORDERED-3]; // to retain the structure size after making ordered_iteration scalar
 } dispatch_shared_info64_t;
 
 typedef struct dispatch_shared_info {
@@ -1673,8 +1673,12 @@ typedef struct dispatch_shared_info {
         dispatch_shared_info32_t  s32;
         dispatch_shared_info64_t  s64;
     } u;
-/*  volatile kmp_int32      dispatch_abort;  depricated */
     volatile kmp_uint32     buffer_index;
+#if OMP_41_ENABLED
+    volatile kmp_int32      doacross_buf_idx;  // teamwise index
+    volatile kmp_uint32    *doacross_flags;    // shared array of iteration flags (0/1)
+    kmp_int32               doacross_num_done; // count finished threads
+#endif
 } dispatch_shared_info_t;
 
 typedef struct kmp_disp {
@@ -1688,7 +1692,13 @@ typedef struct kmp_disp {
 
     dispatch_private_info_t *th_disp_buffer;
     kmp_int32                th_disp_index;
+#if OMP_41_ENABLED
+    kmp_int32                th_doacross_buf_idx; // thread's doacross buffer index
+    volatile kmp_uint32     *th_doacross_flags;   // pointer to shared array of flags
+    kmp_int64               *th_doacross_info;    // info on loop bounds
+#else
     void* dummy_padding[2]; // make it 64 bytes on Intel(R) 64
+#endif
 #if KMP_USE_INTERNODE_ALIGNMENT
     char more_padding[INTERNODE_CACHE_LINE];
 #endif
@@ -3543,7 +3553,17 @@ KMP_EXPORT void __kmpc_push_num_threads( ident_t *loc, kmp_int32 global_tid, kmp
 KMP_EXPORT void __kmpc_push_proc_bind( ident_t *loc, kmp_int32 global_tid, int proc_bind );
 KMP_EXPORT void __kmpc_push_num_teams( ident_t *loc, kmp_int32 global_tid, kmp_int32 num_teams, kmp_int32 num_threads );
 KMP_EXPORT void __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...);
-
+#endif
+#if OMP_41_ENABLED
+struct kmp_dim {  // loop bounds info cast to kmp_int64
+    kmp_int64 lo; // lower
+    kmp_int64 up; // upper
+    kmp_int64 st; // stride
+};
+KMP_EXPORT void __kmpc_doacross_init(ident_t *loc, kmp_int32 gtid, kmp_int32 num_dims, struct kmp_dim * dims);
+KMP_EXPORT void __kmpc_doacross_wait(ident_t *loc, kmp_int32 gtid, kmp_int64 *vec);
+KMP_EXPORT void __kmpc_doacross_post(ident_t *loc, kmp_int32 gtid, kmp_int64 *vec);
+KMP_EXPORT void __kmpc_doacross_fini(ident_t *loc, kmp_int32 gtid);
 #endif
 
 KMP_EXPORT void*
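A note on the per-thread bookkeeping introduced above (layout reconstructed
from __kmpc_doacross_init() and the wait/post routines below):

    // th_doacross_info: kmp_int64[4 * num_dims + 1], allocated per thread:
    //   [0]             number of dimensions (num_dims)
    //   [1]             address of sh_buf->doacross_num_done, cast to kmp_int64
    //   [2] [3] [4]     lo, up, st of dimension 0 (its range length is only
    //                   folded into the trip count, never stored)
    //   [4j+1]..[4j+4]  range_length, lo, up, st of dimension j, 1 <= j < num_dims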
diff --git a/openmp/runtime/src/kmp_csupport.c b/openmp/runtime/src/kmp_csupport.c
index 905f596c964c..50650aac2ac3 100644
--- a/openmp/runtime/src/kmp_csupport.c
+++ b/openmp/runtime/src/kmp_csupport.c
@@ -3049,5 +3049,294 @@ void __kmpc_place_threads(int nS, int sO, int nC, int cO, int nT)
     __kmp_place_num_threads_per_core = nT;
 }
 
+#if OMP_41_ENABLED
+/*!
+@ingroup WORK_SHARING
+@param loc  source location information.
+@param gtid global thread number.
+@param num_dims number of associated doacross loops.
+@param dims info on loop bounds.
+
+Initialize doacross loop information.
+Expect the compiler to send us inclusive bounds,
+e.g. for(i=2;i<9;i+=2) lo=2, up=8, st=2.
+*/
+void
+__kmpc_doacross_init(ident_t *loc, int gtid, int num_dims, struct kmp_dim * dims)
+{
+    int j, idx;
+    kmp_int64 last, trace_count;
+    kmp_info_t *th = __kmp_threads[gtid];
+    kmp_team_t *team = th->th.th_team;
+    kmp_uint32 *flags;
+    kmp_disp_t *pr_buf = th->th.th_dispatch;
+    dispatch_shared_info_t *sh_buf;
+
+    KA_TRACE(20,("__kmpc_doacross_init() enter: called T#%d, num dims %d, active %d\n",
+                 gtid, num_dims, !team->t.t_serialized));
+    KMP_DEBUG_ASSERT(dims != NULL);
+    KMP_DEBUG_ASSERT(num_dims > 0);
+
+    if( team->t.t_serialized ) {
+        KA_TRACE(20,("__kmpc_doacross_init() exit: serialized team\n"));
+        return; // no dependencies if team is serialized
+    }
+    KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
+    idx = pr_buf->th_doacross_buf_idx++; // Increment index of shared buffer for the next loop
+    sh_buf = &team->t.t_disp_buffer[idx % KMP_MAX_DISP_BUF];
+
+    // Save bounds info into allocated private buffer
+    KMP_DEBUG_ASSERT(pr_buf->th_doacross_info == NULL);
+    pr_buf->th_doacross_info =
+        (kmp_int64*)__kmp_thread_malloc(th, sizeof(kmp_int64)*(4 * num_dims + 1));
+    KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
+    pr_buf->th_doacross_info[0] = (kmp_int64)num_dims; // first element is number of dimensions
+    // Also save the address of num_done so it can be accessed later without knowing the buffer index
+    pr_buf->th_doacross_info[1] = (kmp_int64)&sh_buf->doacross_num_done;
+    pr_buf->th_doacross_info[2] = dims[0].lo;
+    pr_buf->th_doacross_info[3] = dims[0].up;
+    pr_buf->th_doacross_info[4] = dims[0].st;
+    last = 5;
+    for( j = 1; j < num_dims; ++j ) {
+        kmp_int64 range_length; // To keep ranges of all dimensions but the first dims[0]
+        if( dims[j].st == 1 ) { // most common case
+            // AC: should we care about ranges bigger than LLONG_MAX? (not for now)
+            range_length = dims[j].up - dims[j].lo + 1;
+        } else {
+            if( dims[j].st > 0 ) {
+                KMP_DEBUG_ASSERT(dims[j].up > dims[j].lo);
+                range_length = (kmp_uint64)(dims[j].up - dims[j].lo) / dims[j].st + 1;
+            } else {            // negative increment
+                KMP_DEBUG_ASSERT(dims[j].lo > dims[j].up);
+                range_length = (kmp_uint64)(dims[j].lo - dims[j].up) / (-dims[j].st) + 1;
+            }
+        }
+        pr_buf->th_doacross_info[last++] = range_length;
+        pr_buf->th_doacross_info[last++] = dims[j].lo;
+        pr_buf->th_doacross_info[last++] = dims[j].up;
+        pr_buf->th_doacross_info[last++] = dims[j].st;
+    }
+
+    // Compute total trip count.
+    // Start with the range of dims[0], which we don't need to keep in the buffer.
+    if( dims[0].st == 1 ) { // most common case
+        trace_count = dims[0].up - dims[0].lo + 1;
+    } else if( dims[0].st > 0 ) {
+        KMP_DEBUG_ASSERT(dims[0].up > dims[0].lo);
+        trace_count = (kmp_uint64)(dims[0].up - dims[0].lo) / dims[0].st + 1;
+    } else {   // negative increment
+        KMP_DEBUG_ASSERT(dims[0].lo > dims[0].up);
+        trace_count = (kmp_uint64)(dims[0].lo - dims[0].up) / (-dims[0].st) + 1;
+    }
+    for( j = 1; j < num_dims; ++j ) {
+        trace_count *= pr_buf->th_doacross_info[4 * j + 1]; // use kept ranges
+    }
+    KMP_DEBUG_ASSERT(trace_count > 0);
+
+    // Check that the shared buffer is not occupied by another loop (the one with index idx - KMP_MAX_DISP_BUF)
+    if( idx != sh_buf->doacross_buf_idx ) {
+        // Shared buffer is occupied, wait for it to be free
+        __kmp_wait_yield_4( (kmp_uint32*)&sh_buf->doacross_buf_idx, idx, __kmp_eq_4, NULL );
+    }
+    // Check if we are the first thread. After the CAS the first thread gets 0,
+    // others get 1 if initialization is in progress, allocated pointer otherwise.
+    flags = (kmp_uint32*)KMP_COMPARE_AND_STORE_RET64(
+        (kmp_int64*)&sh_buf->doacross_flags, NULL, (kmp_int64)1);
+    if( flags == NULL ) {
+        // we are the first thread, allocate the array of flags
+        kmp_int64 size = trace_count / 8 + 8; // in bytes, use single bit per iteration
+        sh_buf->doacross_flags = (kmp_uint32*)__kmp_thread_calloc(th, size, 1);
+    } else if( (kmp_int64)flags == 1 ) {
+        // initialization is still in progress, need to wait
+        while( (volatile kmp_int64)sh_buf->doacross_flags == 1 ) {
+            KMP_YIELD(TRUE);
+        }
+    }
+    KMP_DEBUG_ASSERT((kmp_int64)sh_buf->doacross_flags > 1); // check value of pointer
+    pr_buf->th_doacross_flags = sh_buf->doacross_flags; // save private copy so the shared
+                                                        // buffer is not touched on each iteration
+    KA_TRACE(20,("__kmpc_doacross_init() exit: T#%d\n", gtid));
+}
+
+void
+__kmpc_doacross_wait(ident_t *loc, int gtid, kmp_int64 *vec)
+{
+    kmp_int32 shft, num_dims, i;
+    kmp_uint32 flag;
+    kmp_int64 iter_number; // iteration number of "collapsed" loop nest
+    kmp_info_t *th = __kmp_threads[gtid];
+    kmp_team_t *team = th->th.th_team;
+    kmp_disp_t *pr_buf;
+    kmp_int64 lo, up, st;
+
+    KA_TRACE(20,("__kmpc_doacross_wait() enter: called T#%d\n", gtid));
+    if( team->t.t_serialized ) {
+        KA_TRACE(20,("__kmpc_doacross_wait() exit: serialized team\n"));
+        return; // no dependencies if team is serialized
+    }
+
+    // calculate sequential iteration number and check out-of-bounds condition
+    pr_buf = th->th.th_dispatch;
+    KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
+    num_dims = pr_buf->th_doacross_info[0];
+    lo = pr_buf->th_doacross_info[2];
+    up = pr_buf->th_doacross_info[3];
+    st = pr_buf->th_doacross_info[4];
+    if( st == 1 ) { // most common case
+        if( vec[0] < lo || vec[0] > up ) {
+            KA_TRACE(20,(
+                "__kmpc_doacross_wait() exit: T#%d iter %lld is out of bounds [%lld,%lld]\n",
+                gtid, vec[0], lo, up));
+            return;
+        }
+        iter_number = vec[0] - lo;
+    } else if( st > 0 ) {
+        if( vec[0] < lo || vec[0] > up ) {
+            KA_TRACE(20,(
+                "__kmpc_doacross_wait() exit: T#%d iter %lld is out of bounds [%lld,%lld]\n",
+                gtid, vec[0], lo, up));
+            return;
+        }
+        iter_number = (kmp_uint64)(vec[0] - lo) / st;
+    } else {        // negative increment
+        if( vec[0] > lo || vec[0] < up ) {
+            KA_TRACE(20,(
+                "__kmpc_doacross_wait() exit: T#%d iter %lld is out of bounds [%lld,%lld]\n",
+                gtid, vec[0], lo, up));
+            return;
+        }
+        iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
+    }
+    for( i = 1; i < num_dims; ++i ) {
+        kmp_int64 iter, ln;
+        kmp_int32 j = i * 4;
+        ln = pr_buf->th_doacross_info[j + 1];
+        lo = pr_buf->th_doacross_info[j + 2];
+        up = pr_buf->th_doacross_info[j + 3];
+        st = pr_buf->th_doacross_info[j + 4];
+        if( st == 1 ) {
+            if( vec[i] < lo || vec[i] > up ) {
+                KA_TRACE(20,(
+                    "__kmpc_doacross_wait() exit: T#%d iter %lld is out of bounds [%lld,%lld]\n",
+                    gtid, vec[i], lo, up));
+                return;
+            }
+            iter = vec[i] - lo;
+        } else if( st > 0 ) {
+            if( vec[i] < lo || vec[i] > up ) {
+                KA_TRACE(20,(
+                    "__kmpc_doacross_wait() exit: T#%d iter %lld is out of bounds [%lld,%lld]\n",
+                    gtid, vec[i], lo, up));
+                return;
+            }
+            iter = (kmp_uint64)(vec[i] - lo) / st;
+        } else {   // st < 0
+            if( vec[i] > lo || vec[i] < up ) {
+                KA_TRACE(20,(
+                    "__kmpc_doacross_wait() exit: T#%d iter %lld is out of bounds [%lld,%lld]\n",
+                    gtid, vec[i], lo, up));
+                return;
+            }
+            iter = (kmp_uint64)(lo - vec[i]) / (-st);
+        }
+        iter_number = iter + ln * iter_number;
+    }
+    shft = iter_number % 32; // use 32-bit granularity
+    iter_number >>= 5;       // divided by 32
+    flag = 1 << shft;
+    while( (flag & pr_buf->th_doacross_flags[iter_number]) == 0 ) {
+        KMP_YIELD(TRUE);
+    }
+    KA_TRACE(20,("__kmpc_doacross_wait() exit: T#%d wait for iter %lld completed\n",
+                 gtid, (iter_number<<5)+shft));
+}
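+// Worked example of the linearization above (the values are illustrative,
+// not part of the interface): for dims[0] = {lo=0, up=9, st=1} and
+// dims[1] = {lo=0, up=4, st=1}, dimension 1 has range_length ln = 5, so the
+// vector vec = (2, 3) linearizes to iter_number = 3 + 5 * (2 - 0) = 13;
+// the wait then spins on bit 13 (shft = 13 % 32) of word 0 (13 >> 5) of
+// th_doacross_flags.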
+
+void
+__kmpc_doacross_post(ident_t *loc, int gtid, kmp_int64 *vec)
+{
+    kmp_int32 shft, num_dims, i;
+    kmp_uint32 flag;
+    kmp_int64 iter_number; // iteration number of "collapsed" loop nest
+    kmp_info_t *th = __kmp_threads[gtid];
+    kmp_team_t *team = th->th.th_team;
+    kmp_disp_t *pr_buf;
+    kmp_int64 lo, st;
+
+    KA_TRACE(20,("__kmpc_doacross_post() enter: called T#%d\n", gtid));
+    if( team->t.t_serialized ) {
+        KA_TRACE(20,("__kmpc_doacross_post() exit: serialized team\n"));
+        return; // no dependencies if team is serialized
+    }
+
+    // calculate sequential iteration number (same as in "wait" but without out-of-bounds checks)
+    pr_buf = th->th.th_dispatch;
+    KMP_DEBUG_ASSERT(pr_buf->th_doacross_info != NULL);
+    num_dims = pr_buf->th_doacross_info[0];
+    lo = pr_buf->th_doacross_info[2];
+    st = pr_buf->th_doacross_info[4];
+    if( st == 1 ) { // most common case
+        iter_number = vec[0] - lo;
+    } else if( st > 0 ) {
+        iter_number = (kmp_uint64)(vec[0] - lo) / st;
+    } else {        // negative increment
+        iter_number = (kmp_uint64)(lo - vec[0]) / (-st);
+    }
+    for( i = 1; i < num_dims; ++i ) {
+        kmp_int64 iter, ln;
+        kmp_int32 j = i * 4;
+        ln = pr_buf->th_doacross_info[j + 1];
+        lo = pr_buf->th_doacross_info[j + 2];
+        st = pr_buf->th_doacross_info[j + 4];
+        if( st == 1 ) {
+            iter = vec[i] - lo;
+        } else if( st > 0 ) {
+            iter = (kmp_uint64)(vec[i] - lo) / st;
+        } else {   // st < 0
+            iter = (kmp_uint64)(lo - vec[i]) / (-st);
+        }
+        iter_number = iter + ln * iter_number;
+    }
+    shft = iter_number % 32; // use 32-bit granularity
+    iter_number >>= 5;       // divided by 32
+    flag = 1 << shft;
+    if( (flag & pr_buf->th_doacross_flags[iter_number]) == 0 )
+        KMP_TEST_THEN_OR32( (kmp_int32*)&pr_buf->th_doacross_flags[iter_number], (kmp_int32)flag );
+    KA_TRACE(20,("__kmpc_doacross_post() exit: T#%d iter %lld posted\n",
+                 gtid, (iter_number<<5)+shft));
+}
+
+void
+__kmpc_doacross_fini(ident_t *loc, int gtid)
+{
+    kmp_int64 num_done;
+    kmp_info_t *th = __kmp_threads[gtid];
+    kmp_team_t *team = th->th.th_team;
+    kmp_disp_t *pr_buf = th->th.th_dispatch;
+
+    KA_TRACE(20,("__kmpc_doacross_fini() enter: called T#%d\n", gtid));
+    if( team->t.t_serialized ) {
+        KA_TRACE(20,("__kmpc_doacross_fini() exit: serialized team %p\n", team));
+        return; // nothing to do
+    }
+    num_done = KMP_TEST_THEN_INC64((kmp_int64*)pr_buf->th_doacross_info[1]) + 1;
+    if( num_done == th->th.th_team_nproc ) {
+        // we are the last thread, need to free shared resources
+        int idx = pr_buf->th_doacross_buf_idx - 1;
+        dispatch_shared_info_t *sh_buf = &team->t.t_disp_buffer[idx % KMP_MAX_DISP_BUF];
+        KMP_DEBUG_ASSERT(pr_buf->th_doacross_info[1] == (kmp_int64)&sh_buf->doacross_num_done);
+        KMP_DEBUG_ASSERT(num_done == (kmp_int64)sh_buf->doacross_num_done);
+        KMP_DEBUG_ASSERT(idx == sh_buf->doacross_buf_idx);
+        __kmp_thread_free(th, (void*)sh_buf->doacross_flags);
+        sh_buf->doacross_flags = NULL;
+        sh_buf->doacross_num_done = 0;
+        sh_buf->doacross_buf_idx += KMP_MAX_DISP_BUF; // free buffer for future re-use
+    }
+    // free private resources (need to keep the buffer index forever)
+    __kmp_thread_free(th, (void*)pr_buf->th_doacross_info);
+    pr_buf->th_doacross_info = NULL;
+    KA_TRACE(20,("__kmpc_doacross_fini() exit: T#%d\n", gtid));
+}
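+// Note on buffer recycling (behavior reconstructed from the code above):
+// each __kmpc_doacross_init() claims slot idx % KMP_MAX_DISP_BUF, and the
+// last thread through __kmpc_doacross_fini() releases it by advancing
+// doacross_buf_idx by KMP_MAX_DISP_BUF. A later nowait loop whose idx wraps
+// onto the same slot then observes idx == doacross_buf_idx and proceeds
+// without spinning in __kmp_wait_yield_4().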
+#endif
+
 // end of file //

diff --git a/openmp/runtime/src/kmp_dispatch.cpp b/openmp/runtime/src/kmp_dispatch.cpp
index 8f1852b392ac..23d736a1b5af 100644
--- a/openmp/runtime/src/kmp_dispatch.cpp
+++ b/openmp/runtime/src/kmp_dispatch.cpp
@@ -163,7 +163,7 @@ struct dispatch_shared_infoXX_template {
     volatile UT     iteration;
     volatile UT     num_done;
    volatile UT     ordered_iteration;
-    UT   ordered_dummy[KMP_MAX_ORDERED-1]; // to retain the structure size making ordered_iteration scalar
+    UT   ordered_dummy[KMP_MAX_ORDERED-3]; // to retain the structure size making ordered_iteration scalar
 };
 
 // replaces dispatch_shared_info structure and dispatch_shared_info_t type
@@ -175,6 +175,11 @@ struct dispatch_shared_info_template {
         dispatch_shared_info64_t s64;
     } u;
     volatile kmp_uint32     buffer_index;
+#if OMP_41_ENABLED
+    volatile kmp_int32      doacross_buf_idx;  // teamwise index
+    kmp_uint32             *doacross_flags;    // array of iteration flags (0/1)
+    kmp_int32               doacross_num_done; // count finished threads
+#endif
 };
 
 /* ------------------------------------------------------------------------ */
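The ordered_dummy shrinkage above pairs with the new doacross fields so the
shared structure keeps its size, as the commit message notes. A minimal model
of that size bookkeeping (LP64 layout assumed; KMP_MAX_ORDERED = 8 and the
type names are simplified stand-ins for the real headers):

    #include <assert.h>
    #include <stdint.h>

    #define KMP_MAX_ORDERED 8

    struct s64_old { volatile uint64_t it, done, ord; int64_t dummy[KMP_MAX_ORDERED-1]; };
    struct s64_new { volatile uint64_t it, done, ord; int64_t dummy[KMP_MAX_ORDERED-3]; };

    struct shared_old { struct s64_old u; volatile uint32_t buffer_index; };
    struct shared_new { struct s64_new u; volatile uint32_t buffer_index;
                        volatile int32_t   doacross_buf_idx;
                        volatile uint32_t *doacross_flags;
                        int32_t            doacross_num_done; };

    int main(void) {
        // the two removed 64-bit dummies free 16 bytes, exactly what the
        // three new doacross fields (plus alignment padding) consume
        assert(sizeof(struct shared_old) == sizeof(struct shared_new));
        return 0;
    }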
diff --git a/openmp/runtime/src/kmp_runtime.c b/openmp/runtime/src/kmp_runtime.c
index 7a2fa7bac55e..7b31eb90e6f1 100644
--- a/openmp/runtime/src/kmp_runtime.c
+++ b/openmp/runtime/src/kmp_runtime.c
@@ -3046,8 +3046,12 @@ __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth)
     team->t.t_max_nproc = max_nth;
 
     /* setup dispatch buffers */
-    for(i = 0 ; i < num_disp_buff; ++i)
+    for(i = 0 ; i < num_disp_buff; ++i) {
         team->t.t_disp_buffer[i].buffer_index = i;
+#if OMP_41_ENABLED
+        team->t.t_disp_buffer[i].doacross_buf_idx = i;
+#endif
+    }
 }
 
 static void
@@ -4121,7 +4125,9 @@ __kmp_initialize_info( kmp_info_t *this_thr, kmp_team_t *team, int tid, int gtid
         KMP_DEBUG_ASSERT( dispatch == &team->t.t_dispatch[ tid ] );
 
         dispatch->th_disp_index = 0;
-
+#if OMP_41_ENABLED
+        dispatch->th_doacross_buf_idx = 0;
+#endif
         if( ! dispatch->th_disp_buffer )  {
             dispatch->th_disp_buffer = (dispatch_private_info_t *)
                 __kmp_allocate( disp_size );
@@ -6813,7 +6819,9 @@ __kmp_run_before_invoked_task( int gtid, int tid, kmp_info_t *this_thr,
     //KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[ this_thr->th.th_info.ds.ds_tid ] );
 
     dispatch->th_disp_index = 0;    /* reset the dispatch buffer counter */
-
+#if OMP_41_ENABLED
+    dispatch->th_doacross_buf_idx = 0; /* reset the doacross dispatch buffer counter */
+#endif
     if( __kmp_env_consistency_check )
         __kmp_push_parallel( gtid, team->t.t_ident );
@@ -7050,10 +7058,17 @@ __kmp_internal_fork( ident_t *id, int gtid, kmp_team_t *team )
     KMP_DEBUG_ASSERT( team->t.t_disp_buffer );
     if ( team->t.t_max_nproc > 1 ) {
         int i;
-        for (i = 0; i < KMP_MAX_DISP_BUF; ++i)
+        for (i = 0; i < KMP_MAX_DISP_BUF; ++i) {
             team->t.t_disp_buffer[ i ].buffer_index = i;
+#if OMP_41_ENABLED
+            team->t.t_disp_buffer[i].doacross_buf_idx = i;
+#endif
+        }
     } else {
         team->t.t_disp_buffer[ 0 ].buffer_index = 0;
+#if OMP_41_ENABLED
+        team->t.t_disp_buffer[0].doacross_buf_idx = 0;
+#endif
     }
 
     KMP_MB();       /* Flush all pending memory write invalidates. */
-- 
GitLab