#endif
#if defined(_OSU_COLLECTIVES_)
#include <unistd.h>

/* -- Begin Profiling Symbol Block for routine MPI_Bcast */
#if defined(HAVE_PRAGMA_WEAK)
#pragma weak MPI_Bcast = PMPI_Bcast
#elif defined(HAVE_PRAGMA_HP_SEC_DEF)
#pragma _HP_SECONDARY_DEF PMPI_Bcast  MPI_Bcast
#elif defined(HAVE_PRAGMA_CRI_DUP)
#pragma _CRI duplicate MPI_Bcast as PMPI_Bcast
#endif
/* -- End Profiling Symbol Block */

/* Define MPICH_MPI_FROM_PMPI if weak symbols are not supported to build
   the MPI routines */
#ifndef MPICH_MPI_FROM_PMPI
#undef MPI_Bcast
#define MPI_Bcast PMPI_Bcast
/* Algorithm: MPI_Bcast

   For short messages, we use a binomial tree algorithm.
   Cost = lgp.alpha + n.lgp.beta

   For long messages, we do a scatter followed by an allgather.
   We first scatter the buffer using a binomial tree algorithm.
   This costs lgp.alpha + n.((p-1)/p).beta
   If the datatype is contiguous and the communicator is homogeneous,
   we treat the data as bytes and divide (scatter) it among processes
   by using ceiling division.  For the noncontiguous or heterogeneous
   cases, we first pack the data into a temporary buffer by using
   MPI_Pack, scatter it as bytes, and unpack it after the allgather.

   For the allgather, we use a recursive doubling algorithm for
   medium-size messages and a power-of-two number of processes.  This
   takes lgp steps.  In each step, pairs of processes exchange all the
   data they have (we take care of non-power-of-two situations).  This
   costs approximately lgp.alpha + n.((p-1)/p).beta.  (Approximately,
   because it may be slightly more in the non-power-of-two case, but
   it's still a logarithmic algorithm.)  Therefore, for long messages

   Total Cost = 2.lgp.alpha + 2.n.((p-1)/p).beta

   Note that this algorithm has twice the latency of the tree algorithm
   we use for short messages, but requires lower bandwidth:
   2.n.beta versus n.lgp.beta.  Therefore, for long messages and when
   lgp > 2, this algorithm will perform better.

   For long messages, and for medium-size messages with a
   non-power-of-two number of processes, we use a ring algorithm for
   the allgather, which takes p-1 steps, because it performs better
   than recursive doubling.

   Total Cost = (lgp+p-1).alpha + 2.n.((p-1)/p).beta

   End Algorithm: MPI_Bcast */
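/* Editorial sketch (illustrative only, excluded from the build): the two
   cost models above written as C expressions, to make the crossover
   between the short- and long-message algorithms easy to see.  The
   parameters alpha, beta, n, and p are hypothetical inputs, not
   variables used elsewhere in this file. */
#if 0
#include <math.h>

/* short-message binomial tree: lgp.alpha + n.lgp.beta */
static double bcast_cost_binomial(double alpha, double beta, double n, double p)
{
    return log2(p) * alpha + n * log2(p) * beta;
}

/* long-message scatter + allgather: 2.lgp.alpha + 2.n.((p-1)/p).beta */
static double bcast_cost_scatter_allgather(double alpha, double beta, double n, double p)
{
    return 2.0 * log2(p) * alpha + 2.0 * n * ((p - 1.0) / p) * beta;
}
#endif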
/* begin:nested */
/* not declared static because it is called in intercomm. allgatherv */
int MPIR_Bcast ( void *buffer, int count, MPI_Datatype datatype, int root,
                 MPID_Comm *comm_ptr )
{
    static const char FCNAME[] = "MPIR_Bcast";
    MPI_Status status;
    int rank, comm_size, src, dst;
    int relative_rank, mask;
    int mpi_errno = MPI_SUCCESS;
    int scatter_size, nbytes=0, curr_size, recv_size = 0, send_size;
    int type_size, j, k, i, tmp_mask, is_contig, is_homogeneous;
    int relative_dst, dst_tree_root, my_tree_root, send_offset;
    int recv_offset, tree_root, nprocs_completed, offset, position;
#if defined(_OSU_MVAPICH_)
    int *recvcnts, *displs, left, right, jnext;
    int bcast_short_msg;
#else /* defined(_OSU_MVAPICH_) */
    int *recvcnts, *displs, left, right, jnext, pof2, comm_size_is_pof2;
#endif /* defined(_OSU_MVAPICH_) */
    void *tmp_buf=NULL;
    MPI_Comm comm;
    MPID_Datatype *dtp;
    MPI_Aint true_extent, true_lb;
if (count == 0) { return MPI_SUCCESS; }
comm = comm_ptr->handle; comm_size = comm_ptr->local_size; rank = comm_ptr->rank;
/* If there is only one process, return */ if (comm_size == 1) { return MPI_SUCCESS; }
if (HANDLE_GET_KIND(datatype) == HANDLE_KIND_BUILTIN) { is_contig = 1; } else { MPID_Datatype_get_ptr(datatype, dtp); is_contig = dtp->is_contig; }
    is_homogeneous = 1;
#ifdef MPID_HAS_HETERO
    if (comm_ptr->is_hetero) {
        is_homogeneous = 0;
    }
#endif
/* MPI_Type_size() might not give the accurate size of the packed * datatype for heterogeneous systems (because of padding, encoding, * etc). On the other hand, MPI_Pack_size() can become very * expensive, depending on the implementation, especially for * heterogeneous systems. We want to use MPI_Type_size() wherever * possible, and MPI_Pack_size() in other places. */ if (is_homogeneous) { MPID_Datatype_get_size_macro(datatype, type_size); } else { mpi_errno = NMPI_Pack_size(1, datatype, comm, &type_size); if (mpi_errno != MPI_SUCCESS) { MPIU_ERR_POP(mpi_errno); } } nbytes = type_size * count; if (!is_contig || !is_homogeneous) { tmp_buf = MPIU_Malloc(nbytes); /* --BEGIN ERROR HANDLING-- */ if (!tmp_buf) { mpi_errno = MPIR_Err_create_code( MPI_SUCCESS, MPIR_ERR_RECOVERABLE, FCNAME, __LINE__, MPI_ERR_OTHER, "**nomem", "**nomem %d", type_size ); return mpi_errno; } /* --END ERROR HANDLING-- */
/* TODO: Pipeline the packing and communication */ position = 0; if (rank == root) { NMPI_Pack(buffer, count, datatype, tmp_buf, nbytes, &position, comm); } }
relative_rank = (rank >= root) ? rank - root : rank - root + comm_size;
/* check if multiple threads are calling this collective function */ MPIDU_ERR_CHECK_MULTIPLE_THREADS_ENTER( comm_ptr );
    if ((nbytes <= bcast_tuning(nbytes, comm_ptr)) ||
        (comm_size < MPIR_BCAST_MIN_PROCS)) {
#if defined(_OSU_MVAPICH_)
        if (enable_knomial_2level_bcast && enable_shmem_collectives &&
            comm_ptr->shmem_coll_ok == 1 && comm_ptr->leader_comm != 0 &&
            comm_ptr->shmem_comm != 0 &&
            comm_size > knomial_2level_bcast_system_size_threshold) {
            if (!is_contig || !is_homogeneous) {
                mpi_errno = knomial_2level_Bcast(tmp_buf, nbytes, MPI_BYTE,
                                                 nbytes, root, comm_ptr);
            } else {
                mpi_errno = knomial_2level_Bcast(buffer, count, datatype,
                                                 nbytes, root, comm_ptr);
            }
        } else {
#endif
        /* Use short message algorithm, namely, binomial tree */
        /* So that the new root can easily identify the size of its
           subtree, the (subtree) roots are all powers of two (relative
           to the root).  If m = the first power of 2 such that
           2^m >= the size of the communicator, then the subtree rooted
           at relative rank 2^(m-k) has size 2^(m-k) (with special
           handling for subtrees that aren't a power of two in size).

           Note that the process that is the tree root is handled
           automatically by this code, since it has no bits set. */
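        /* Editorial sketch (illustrative only, excluded from the build):
           the parent of a relative rank in this binomial tree, mirroring
           the mask arithmetic of the receive loop below.  The helper name
           is hypothetical. */
#if 0
static int binomial_parent(int relative_rank, int comm_size)
{
    int mask = 0x1;
    while (mask < comm_size) {
        if (relative_rank & mask) {
            return relative_rank - mask;   /* lowest set bit gives the parent */
        }
        mask <<= 1;
    }
    return -1;   /* relative rank 0 is the tree root and has no parent */
}
#endif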
        mask = 0x1;
        while (mask < comm_size) {
            if (relative_rank & mask) {
                src = rank - mask;
                if (src < 0) { src += comm_size; }
                if (!is_contig || !is_homogeneous) {
                    mpi_errno = MPIC_Recv(tmp_buf, nbytes, MPI_BYTE, src,
                                          MPIR_BCAST_TAG, comm, MPI_STATUS_IGNORE);
                } else {
                    mpi_errno = MPIC_Recv(buffer, count, datatype, src,
                                          MPIR_BCAST_TAG, comm, MPI_STATUS_IGNORE);
                }
                if (mpi_errno != MPI_SUCCESS) { MPIU_ERR_POP(mpi_errno); }
                break;
            }
            mask <<= 1;
        }

        /* This process is responsible for all processes that have bits
           set from the LSB up to (but not including) mask.  Because of
           the "not including", we start by shifting mask back down one.

           We can easily change to a different algorithm at any power of
           two by changing the test (mask > 1) to (mask > block_size).
           One such version would use non-blocking operations for the
           last 2-4 steps (this also bounds the number of MPI_Requests
           that would be needed). */
        mask >>= 1;
        while (mask > 0) {
            if (relative_rank + mask < comm_size) {
                dst = rank + mask;
                if (dst >= comm_size) dst -= comm_size;
                if (!is_contig || !is_homogeneous) {
                    mpi_errno = MPIC_Send(tmp_buf, nbytes, MPI_BYTE, dst,
                                          MPIR_BCAST_TAG, comm);
                } else {
                    mpi_errno = MPIC_Send(buffer, count, datatype, dst,
                                          MPIR_BCAST_TAG, comm);
                }
                if (mpi_errno != MPI_SUCCESS) { MPIU_ERR_POP(mpi_errno); }
            }
            mask >>= 1;
        }
#if defined(_OSU_MVAPICH_)
        }
#endif
    }
#if defined(_OSU_MVAPICH_)
    else if (enable_shmem_collectives && (comm_ptr->shmem_coll_ok == 1) &&
             (nbytes < shmem_bcast_threshold) && enable_shmem_bcast) {
        if (!is_contig || !is_homogeneous) {
            mpi_errno = intra_shmem_Bcast_Large(tmp_buf, nbytes, MPI_BYTE,
                                                nbytes, root, comm_ptr);
        } else {
            mpi_errno = intra_shmem_Bcast_Large(buffer, count, datatype,
                                                nbytes, root, comm_ptr);
        }
if (mpi_errno == -1) { /* use long message algorithm: binomial tree scatter followed by an allgather */
        /* The scatter algorithm divides the buffer into nprocs pieces and
           scatters them among the processes.  Root gets the first piece,
           root+1 gets the second piece, and so forth.  It uses the same
           binomial tree algorithm as above.  Ceiling division is used to
           compute the size of each piece.  This means some processes may
           not get any data.  For example, if bufsize = 97 and nprocs = 16,
           the scatter size is 7 bytes and the last two ranks (14 and 15)
           get no data.  On each process, the scattered data is stored at
           the same offset in the buffer as it is on the root process. */
if (is_contig && is_homogeneous) { /* contiguous and homogeneous. no need to pack. */ mpi_errno = NMPI_Type_get_true_extent(datatype, &true_lb, &true_extent); if (mpi_errno) { MPIU_ERR_POP(mpi_errno); } tmp_buf = (char *) buffer + true_lb; }
scatter_size = (nbytes + comm_size - 1)/comm_size; /* ceiling division */ curr_size = (rank == root) ? nbytes : 0; /* root starts with all the data */
        mask = 0x1;
        while (mask < comm_size) {
            if (relative_rank & mask) {
                src = rank - mask;
                if (src < 0) { src += comm_size; }
                recv_size = nbytes - relative_rank*scatter_size;
                /* recv_size is larger than what might actually be sent by
                   the sender.  We don't need to compute the exact value
                   because MPI allows you to post a larger recv. */
                if (recv_size <= 0) {
                    curr_size = 0; /* this process doesn't receive any data
                                      because of uneven division */
                } else {
                    mpi_errno = MPIC_Recv(((char *)tmp_buf + relative_rank*scatter_size),
                                          recv_size, MPI_BYTE, src,
                                          MPIR_BCAST_TAG, comm, &status);
                    if (mpi_errno != MPI_SUCCESS) { MPIU_ERR_POP(mpi_errno); }
/* query actual size of data received */ NMPI_Get_count(&status, MPI_BYTE, &curr_size); } break; } mask <<= 1; }
        /* This process is responsible for all processes that have bits
           set from the LSB up to (but not including) mask.  Because of
           the "not including", we start by shifting mask back down one. */
        mask >>= 1;
        while (mask > 0) {
            if (relative_rank + mask < comm_size) {
                send_size = curr_size - scatter_size * mask;
                /* mask is also the size of this process's subtree */
if (send_size > 0) { dst = rank + mask; if (dst >= comm_size) { dst -= comm_size; } mpi_errno = MPIC_Send (((char *)tmp_buf + scatter_size*(relative_rank+mask)), send_size, MPI_BYTE, dst, MPIR_BCAST_TAG, comm); if (mpi_errno != MPI_SUCCESS) { MPIU_ERR_POP(mpi_errno); } curr_size -= send_size; } } mask >>= 1; }
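        /* Worked example of the loop above (editorial, illustrative only):
           with nbytes = 97, comm_size = 16, root = 0 and scatter_size = 7,
           the root starts with curr_size = 97 and sends 41 bytes (offset 56)
           to relative rank 8, 28 bytes (offset 28) to relative rank 4,
           14 bytes (offset 14) to relative rank 2, and 7 bytes (offset 7)
           to relative rank 1, keeping the first 7 bytes for itself. */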
        /* Scatter complete.  Now do an allgather. */
        /* check if comm_size is a power of two */
#if defined(_OSU_MVAPICH_)
        if (nbytes < MPIR_BCAST_LONG_MSG &&
            (comm_size & (comm_size - 1)) == 0)
#else /* defined(_OSU_MVAPICH_) */
        pof2 = 1;
        while (pof2 < comm_size) { pof2 *= 2; }
        if (pof2 == comm_size) { comm_size_is_pof2 = 1; }
        else { comm_size_is_pof2 = 0; }

        if ((nbytes < MPIR_BCAST_LONG_MSG) && (comm_size_is_pof2))
#endif /* defined(_OSU_MVAPICH_) */
        {
            /* medium-size allgather and power-of-two comm_size:
               use recursive doubling. */
mask = 0x1; i = 0; while (mask < comm_size) { relative_dst = relative_rank ^ mask;
dst = (relative_dst + root) % comm_size;
/* find offset into send and recv buffers. zero out the least significant "i" bits of relative_rank and relative_dst to find root of src and dst subtrees. Use ranks of roots as index to send from and recv into buffer */
dst_tree_root = relative_dst >> i; dst_tree_root <<= i;
my_tree_root = relative_rank >> i; my_tree_root <<= i;
send_offset = my_tree_root * scatter_size; recv_offset = dst_tree_root * scatter_size;
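                /* Worked example (editorial, illustrative only): for
                   relative_rank = 5 at step i = 1 (mask = 2),
                   relative_dst = 5 ^ 2 = 7, my_tree_root = 4 and
                   dst_tree_root = 6, so this process sends the data it
                   holds starting at offset 4*scatter_size and receives
                   into offset 6*scatter_size. */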
if (relative_dst < comm_size) { mpi_errno = MPIC_Sendrecv(((char *)tmp_buf + send_offset), curr_size, MPI_BYTE, dst, MPIR_BCAST_TAG, ((char *)tmp_buf + recv_offset), (nbytes-recv_offset < 0 ? 0 : nbytes-recv_offset), MPI_BYTE, dst, MPIR_BCAST_TAG, comm, &status); if (mpi_errno != MPI_SUCCESS) { MPIU_ERR_POP(mpi_errno); } NMPI_Get_count(&status, MPI_BYTE, &recv_size); curr_size += recv_size; }
                /* If some processes in this process's subtree in this step
                   did not have any destination process to communicate with
                   because of the non-power-of-two case, we need to send them
                   the data that they would normally have received from those
                   processes.  That is, the haves in this subtree must send
                   to the have-nots.  We use a logarithmic recursive-halving
                   algorithm for this. */
                /* This part of the code will not currently be executed
                   because we are not using recursive doubling for the
                   non-power-of-two case.  Mark it as experimental so that
                   it doesn't show up as red in the coverage tests. */
                /* --BEGIN EXPERIMENTAL-- */
                if (dst_tree_root + mask > comm_size) {
                    nprocs_completed = comm_size - my_tree_root - mask;
                    /* nprocs_completed is the number of processes in this
                       subtree that have all the data.  Send data to others
                       in a tree fashion.  First find the root of the current
                       tree that is being divided into two.  k is the number
                       of least-significant bits in this process's rank that
                       must be zeroed out to find the rank of the root. */
                    j = mask;
                    k = 0;
                    while (j) { j >>= 1; k++; }
                    k--;
offset = scatter_size * (my_tree_root + mask); tmp_mask = mask >> 1;
while (tmp_mask) { relative_dst = relative_rank ^ tmp_mask; dst = (relative_dst + root) % comm_size;
tree_root = relative_rank >> k; tree_root <<= k;
                        /* send only if this proc has data and the
                           destination doesn't have data. */
if ((relative_dst > relative_rank) && (relative_rank < tree_root + nprocs_completed) && (relative_dst >= tree_root + nprocs_completed)) {
                            mpi_errno = MPIC_Send(((char *)tmp_buf + offset),
                                                  recv_size, MPI_BYTE, dst,
                                                  MPIR_BCAST_TAG, comm);
                            /* recv_size was set in the previous receive;
                               that's the amount of data to be sent now. */
                            if (mpi_errno != MPI_SUCCESS) { MPIU_ERR_POP(mpi_errno); }
                        }
                        else if ((relative_dst < relative_rank) &&
                                 (relative_dst < tree_root + nprocs_completed) &&
                                 (relative_rank >= tree_root + nprocs_completed)) {
                            /* recv only if this proc doesn't have data and
                               the sender has data */
                            mpi_errno = MPIC_Recv(((char *)tmp_buf + offset),
                                                  nbytes - offset, MPI_BYTE, dst,
                                                  MPIR_BCAST_TAG, comm, &status);
                            /* nprocs_completed is also equal to the number of
                               processes whose data we don't have */
                            if (mpi_errno != MPI_SUCCESS) { MPIU_ERR_POP(mpi_errno); }
                            NMPI_Get_count(&status, MPI_BYTE, &recv_size);
                            curr_size += recv_size;
                        }
                        tmp_mask >>= 1;
                        k--;
                    }
                }
                /* --END EXPERIMENTAL-- */
mask <<= 1; i++; } } else { /* long-message allgather or medium-size but non-power-of-two. use ring algorithm. */
recvcnts = MPIU_Malloc(comm_size*sizeof(int)); /* --BEGIN ERROR HANDLING-- */ if (!recvcnts) { mpi_errno = MPIR_Err_create_code( MPI_SUCCESS, MPIR_ERR_RECOVERABLE, FCNAME, __LINE__, MPI_ERR_OTHER, "**nomem", "**nomem %d", comm_size * sizeof(int)); return mpi_errno; } /* --END ERROR HANDLING-- */ displs = MPIU_Malloc(comm_size*sizeof(int)); /* --BEGIN ERROR HANDLING-- */ if (!displs) { mpi_errno = MPIR_Err_create_code( MPI_SUCCESS, MPIR_ERR_RECOVERABLE, FCNAME, __LINE__, MPI_ERR_OTHER, "**nomem", "**nomem %d", comm_size * sizeof(int)); return mpi_errno; } /* --END ERROR HANDLING-- */
for (i=0; i<comm_size; i++) { recvcnts[i] = nbytes - i*scatter_size; if (recvcnts[i] > scatter_size) { recvcnts[i] = scatter_size; } if (recvcnts[i] < 0) { recvcnts[i] = 0; } }
displs[0] = 0; for (i=1; i<comm_size; i++) { displs[i] = displs[i-1] + recvcnts[i-1]; }
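            /* Worked example of the two loops above (editorial,
               illustrative only): with nbytes = 97, comm_size = 16 and
               scatter_size = 7, recvcnts is 7 for ranks 0..12, 6 for
               rank 13, and 0 for ranks 14 and 15; displs is then
               0, 7, 14, ..., 84, 91, 97, 97. */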
left = (comm_size + rank - 1) % comm_size; right = (rank + 1) % comm_size;
j = rank; jnext = left; for (i=1; i<comm_size; i++) { mpi_errno = MPIC_Sendrecv((char *)tmp_buf + displs[(j-root+comm_size)%comm_size], recvcnts[(j-root+comm_size)%comm_size], MPI_BYTE, right, MPIR_BCAST_TAG, (char *)tmp_buf + displs[(jnext-root+comm_size)%comm_size], recvcnts[(jnext-root+comm_size)%comm_size], MPI_BYTE, left, MPIR_BCAST_TAG, comm, MPI_STATUS_IGNORE); if (mpi_errno != MPI_SUCCESS) { MPIU_ERR_POP(mpi_errno); } j = jnext; jnext = (comm_size + jnext - 1) % comm_size; }
            MPIU_Free(recvcnts);
            MPIU_Free(displs);
        }
    }
    else {
        return mpi_errno;
    }
    }
#endif
    else {
        /* use long message algorithm: binomial tree scatter followed by
           an allgather */

        /* The scatter algorithm divides the buffer into nprocs pieces and
           scatters them among the processes.  Root gets the first piece,
           root+1 gets the second piece, and so forth.  It uses the same
           binomial tree algorithm as above.  Ceiling division is used to
           compute the size of each piece.  This means some processes may
           not get any data.  For example, if bufsize = 97 and nprocs = 16,
           the scatter size is 7 bytes and the last two ranks (14 and 15)
           get no data.  On each process, the scattered data is stored at
           the same offset in the buffer as it is on the root process. */
if (is_contig && is_homogeneous) { /* contiguous and homogeneous. no need to pack. */ mpi_errno = NMPI_Type_get_true_extent(datatype, &true_lb, &true_extent); if (mpi_errno) { MPIU_ERR_POP(mpi_errno); } tmp_buf = (char *) buffer + true_lb; }
scatter_size = (nbytes + comm_size - 1)/comm_size; /* ceiling division */ curr_size = (rank == root) ? nbytes : 0; /* root starts with all the data */
        mask = 0x1;
        while (mask < comm_size) {
            if (relative_rank & mask) {
                src = rank - mask;
                if (src < 0) { src += comm_size; }
                recv_size = nbytes - relative_rank*scatter_size;
                /* recv_size is larger than what might actually be sent by
                   the sender.  We don't need to compute the exact value
                   because MPI allows you to post a larger recv. */
                if (recv_size <= 0) {
                    curr_size = 0; /* this process doesn't receive any data
                                      because of uneven division */
                } else {
                    mpi_errno = MPIC_Recv(((char *)tmp_buf + relative_rank*scatter_size),
                                          recv_size, MPI_BYTE, src,
                                          MPIR_BCAST_TAG, comm, &status);
                    if (mpi_errno != MPI_SUCCESS) { MPIU_ERR_POP(mpi_errno); }
/* query actual size of data received */ NMPI_Get_count(&status, MPI_BYTE, &curr_size); } break; } mask <<= 1; }
        /* This process is responsible for all processes that have bits
           set from the LSB up to (but not including) mask.  Because of
           the "not including", we start by shifting mask back down one. */
        mask >>= 1;
        while (mask > 0) {
            if (relative_rank + mask < comm_size) {
                send_size = curr_size - scatter_size * mask;
                /* mask is also the size of this process's subtree */
if (send_size > 0) { dst = rank + mask; if (dst >= comm_size) { dst -= comm_size; } mpi_errno = MPIC_Send (((char *)tmp_buf + scatter_size*(relative_rank+mask)), send_size, MPI_BYTE, dst, MPIR_BCAST_TAG, comm); if (mpi_errno != MPI_SUCCESS) { MPIU_ERR_POP(mpi_errno); } curr_size -= send_size; } } mask >>= 1; }
        /* Scatter complete.  Now do an allgather. */
        /* check if comm_size is a power of two */
#if defined(_OSU_MVAPICH_)
        if (nbytes < MPIR_BCAST_LONG_MSG &&
            (comm_size & (comm_size - 1)) == 0)
#else /* defined(_OSU_MVAPICH_) */
        pof2 = 1;
        while (pof2 < comm_size) { pof2 *= 2; }
        if (pof2 == comm_size) { comm_size_is_pof2 = 1; }
        else { comm_size_is_pof2 = 0; }

        if ((nbytes < MPIR_BCAST_LONG_MSG) && (comm_size_is_pof2))
#endif /* defined(_OSU_MVAPICH_) */
        {
            /* medium-size allgather and power-of-two comm_size:
               use recursive doubling. */
mask = 0x1; i = 0; while (mask < comm_size) { relative_dst = relative_rank ^ mask;
dst = (relative_dst + root) % comm_size;
/* find offset into send and recv buffers. zero out the least significant "i" bits of relative_rank and relative_dst to find root of src and dst subtrees. Use ranks of roots as index to send from and recv into buffer */
dst_tree_root = relative_dst >> i; dst_tree_root <<= i;
my_tree_root = relative_rank >> i; my_tree_root <<= i;
send_offset = my_tree_root * scatter_size; recv_offset = dst_tree_root * scatter_size;
if (relative_dst < comm_size) { mpi_errno = MPIC_Sendrecv(((char *)tmp_buf + send_offset), curr_size, MPI_BYTE, dst, MPIR_BCAST_TAG, ((char *)tmp_buf + recv_offset), (nbytes-recv_offset < 0 ? 0 : nbytes-recv_offset), MPI_BYTE, dst, MPIR_BCAST_TAG, comm, &status); if (mpi_errno != MPI_SUCCESS) { MPIU_ERR_POP(mpi_errno); } NMPI_Get_count(&status, MPI_BYTE, &recv_size); curr_size += recv_size; }
                /* If some processes in this process's subtree in this step
                   did not have any destination process to communicate with
                   because of the non-power-of-two case, we need to send them
                   the data that they would normally have received from those
                   processes.  That is, the haves in this subtree must send
                   to the have-nots.  We use a logarithmic recursive-halving
                   algorithm for this. */
                /* This part of the code will not currently be executed
                   because we are not using recursive doubling for the
                   non-power-of-two case.  Mark it as experimental so that
                   it doesn't show up as red in the coverage tests. */
                /* --BEGIN EXPERIMENTAL-- */
                if (dst_tree_root + mask > comm_size) {
                    nprocs_completed = comm_size - my_tree_root - mask;
                    /* nprocs_completed is the number of processes in this
                       subtree that have all the data.  Send data to others
                       in a tree fashion.  First find the root of the current
                       tree that is being divided into two.  k is the number
                       of least-significant bits in this process's rank that
                       must be zeroed out to find the rank of the root. */
                    j = mask;
                    k = 0;
                    while (j) { j >>= 1; k++; }
                    k--;
offset = scatter_size * (my_tree_root + mask); tmp_mask = mask >> 1;
while (tmp_mask) { relative_dst = relative_rank ^ tmp_mask; dst = (relative_dst + root) % comm_size;
tree_root = relative_rank >> k; tree_root <<= k;
                        /* send only if this proc has data and the
                           destination doesn't have data. */
if ((relative_dst > relative_rank) && (relative_rank < tree_root + nprocs_completed) && (relative_dst >= tree_root + nprocs_completed)) {
                            mpi_errno = MPIC_Send(((char *)tmp_buf + offset),
                                                  recv_size, MPI_BYTE, dst,
                                                  MPIR_BCAST_TAG, comm);
                            /* recv_size was set in the previous receive;
                               that's the amount of data to be sent now. */
                            if (mpi_errno != MPI_SUCCESS) { MPIU_ERR_POP(mpi_errno); }
                        }
                        else if ((relative_dst < relative_rank) &&
                                 (relative_dst < tree_root + nprocs_completed) &&
                                 (relative_rank >= tree_root + nprocs_completed)) {
                            /* recv only if this proc doesn't have data and
                               the sender has data */
                            mpi_errno = MPIC_Recv(((char *)tmp_buf + offset),
                                                  nbytes - offset, MPI_BYTE, dst,
                                                  MPIR_BCAST_TAG, comm, &status);
                            /* nprocs_completed is also equal to the number of
                               processes whose data we don't have */
                            if (mpi_errno != MPI_SUCCESS) { MPIU_ERR_POP(mpi_errno); }
                            NMPI_Get_count(&status, MPI_BYTE, &recv_size);
                            curr_size += recv_size;
                        }
                        tmp_mask >>= 1;
                        k--;
                    }
                }
                /* --END EXPERIMENTAL-- */
mask <<= 1; i++; } } else { /* long-message allgather or medium-size but non-power-of-two. use ring algorithm. */
recvcnts = MPIU_Malloc(comm_size*sizeof(int)); /* --BEGIN ERROR HANDLING-- */ if (!recvcnts) { mpi_errno = MPIR_Err_create_code( MPI_SUCCESS, MPIR_ERR_RECOVERABLE, FCNAME, __LINE__, MPI_ERR_OTHER, "**nomem", "**nomem %d", comm_size * sizeof(int)); return mpi_errno; } /* --END ERROR HANDLING-- */ displs = MPIU_Malloc(comm_size*sizeof(int)); /* --BEGIN ERROR HANDLING-- */ if (!displs) { mpi_errno = MPIR_Err_create_code( MPI_SUCCESS, MPIR_ERR_RECOVERABLE, FCNAME, __LINE__, MPI_ERR_OTHER, "**nomem", "**nomem %d", comm_size * sizeof(int)); return mpi_errno; } /* --END ERROR HANDLING-- */
for (i=0; i<comm_size; i++) { recvcnts[i] = nbytes - i*scatter_size; if (recvcnts[i] > scatter_size) { recvcnts[i] = scatter_size; } if (recvcnts[i] < 0) { recvcnts[i] = 0; } }
displs[0] = 0; for (i=1; i<comm_size; i++) { displs[i] = displs[i-1] + recvcnts[i-1]; }
left = (comm_size + rank - 1) % comm_size; right = (rank + 1) % comm_size;
j = rank; jnext = left; for (i=1; i<comm_size; i++) { mpi_errno = MPIC_Sendrecv((char *)tmp_buf + displs[(j-root+comm_size)%comm_size], recvcnts[(j-root+comm_size)%comm_size], MPI_BYTE, right, MPIR_BCAST_TAG, (char *)tmp_buf + displs[(jnext-root+comm_size)%comm_size], recvcnts[(jnext-root+comm_size)%comm_size], MPI_BYTE, left, MPIR_BCAST_TAG, comm, MPI_STATUS_IGNORE); if (mpi_errno != MPI_SUCCESS) { MPIU_ERR_POP(mpi_errno); } j = jnext; jnext = (comm_size + jnext - 1) % comm_size; }
MPIU_Free(recvcnts); MPIU_Free(displs); } }
if (!is_contig || !is_homogeneous) { if (rank != root) { position = 0; NMPI_Unpack(tmp_buf, nbytes, &position, buffer, count, datatype, comm); } MPIU_Free(tmp_buf); }
  fn_exit:
    /* check if multiple threads are calling this collective function */
    MPIDU_ERR_CHECK_MULTIPLE_THREADS_EXIT( comm_ptr );
    return mpi_errno;

  fn_fail:
    goto fn_exit;
}
/* begin:nested */ /* Not PMPI_LOCAL because it is called in intercomm allgather */ int MPIR_Bcast_inter ( void *buffer, int count, MPI_Datatype datatype, int root, MPID_Comm *comm_ptr ) { /* Intercommunicator broadcast. Root sends to rank 0 in remote group. Remote group does local intracommunicator broadcast. */ static const char FCNAME[] = "MPIR_Bcast_inter"; int rank, mpi_errno; MPI_Status status; MPID_Comm *newcomm_ptr = NULL; MPI_Comm comm;
comm = comm_ptr->handle;
if (root == MPI_PROC_NULL) { /* local processes other than root do nothing */ mpi_errno = MPI_SUCCESS; } else if (root == MPI_ROOT) { /* root sends to rank 0 on remote group and returns */ MPIDU_ERR_CHECK_MULTIPLE_THREADS_ENTER( comm_ptr ); mpi_errno = MPIC_Send(buffer, count, datatype, 0, MPIR_BCAST_TAG, comm); /* --BEGIN ERROR HANDLING-- */ if (mpi_errno != MPI_SUCCESS) { mpi_errno = MPIR_Err_create_code(mpi_errno, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**fail", 0); } /* --END ERROR HANDLING-- */ MPIDU_ERR_CHECK_MULTIPLE_THREADS_EXIT( comm_ptr ); return mpi_errno; } else { /* remote group. rank 0 on remote group receives from root */
rank = comm_ptr->rank;
if (rank == 0) { mpi_errno = MPIC_Recv(buffer, count, datatype, root, MPIR_BCAST_TAG, comm, &status); /* --BEGIN ERROR HANDLING-- */ if (mpi_errno != MPI_SUCCESS) { mpi_errno = MPIR_Err_create_code(mpi_errno, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**fail", 0); return mpi_errno; } /* --END ERROR HANDLING-- */ }
/* Get the local intracommunicator */ if (!comm_ptr->local_comm) { MPIR_Setup_intercomm_localcomm( comm_ptr ); }
newcomm_ptr = comm_ptr->local_comm;
/* now do the usual broadcast on this intracommunicator with rank 0 as root. */ mpi_errno = MPIR_Bcast(buffer, count, datatype, 0, newcomm_ptr);
/* --BEGIN ERROR HANDLING-- */ if (mpi_errno != MPI_SUCCESS) { mpi_errno = MPIR_Err_create_code(mpi_errno, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**fail", 0); } /* --END ERROR HANDLING-- */ }
    return mpi_errno;
}
/* end:nested */
#endif
#undef FUNCNAME
#define FUNCNAME MPI_Bcast
/*@ MPI_Bcast - Broadcasts a message from the process with rank "root" to all other processes of the communicator
This routine is thread-safe. This means that this routine may be safely used by multiple threads without the need for any user-provided thread locks. However, the routine is not interrupt safe. Typically, this is due to the use of memory allocation routines such as malloc or other non-MPICH runtime routines that are themselves not interrupt-safe.
All MPI objects (e.g., MPI_Datatype, MPI_Comm) are of type INTEGER in Fortran.
All MPI routines (except MPI_Wtime and MPI_Wtick) return an error value;
C routines as the value of the function and Fortran routines in the last
argument.  Before the value is returned, the current MPI error handler is
called.  By default, this error handler aborts the MPI job.  The error
handler may be changed with MPI_Comm_set_errhandler (for communicators),
MPI_File_set_errhandler (for files), and MPI_Win_set_errhandler (for RMA
windows).  The MPI-1 routine MPI_Errhandler_set may be used, but its use
is deprecated.  The predefined error handler MPI_ERRORS_RETURN may be used
to cause error values to be returned.  Note that MPI does not guarantee
that an MPI program can continue past an error; however, MPI
implementations will attempt to continue whenever possible.
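For example (editorial sketch, illustrative only; 'buf' and 'err' are
hypothetical names), a caller that wants error values returned rather
than having the job aborted might do:

    MPI_Comm_set_errhandler(MPI_COMM_WORLD, MPI_ERRORS_RETURN);
    err = MPI_Bcast(buf, count, MPI_INT, 0, MPI_COMM_WORLD);
    if (err != MPI_SUCCESS) {
        handle_or_report_error(err);   (hypothetical error handler)
    }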
@*/
int MPI_Bcast( void *buffer, int count, MPI_Datatype datatype, int root,
               MPI_Comm comm )
{
    static const char FCNAME[] = "MPI_Bcast";
    int mpi_errno = MPI_SUCCESS;
    MPID_Comm *comm_ptr = NULL;
    MPID_MPI_STATE_DECL(MPID_STATE_MPI_BCAST);

    MPIR_ERRTEST_INITIALIZED_ORDIE();
MPIU_THREAD_CS_ENTER(ALLFUNC,);
MPID_MPI_COLL_FUNC_ENTER(MPID_STATE_MPI_BCAST);
    /* Validate parameters, especially handles needing to be converted */
#   ifdef HAVE_ERROR_CHECKING
    {
        MPID_BEGIN_ERROR_CHECKS;
        {
            MPIR_ERRTEST_COMM(comm, mpi_errno);
            if (mpi_errno != MPI_SUCCESS) {
                goto fn_fail;
            }
        }
        MPID_END_ERROR_CHECKS;
    }
#   endif /* HAVE_ERROR_CHECKING */
    /* Convert MPI object handles to object pointers */
    MPID_Comm_get_ptr( comm, comm_ptr );

    /* Validate parameters and objects (post conversion) */
#   ifdef HAVE_ERROR_CHECKING
    {
        MPID_BEGIN_ERROR_CHECKS;
        {
            MPID_Datatype *datatype_ptr = NULL;
MPID_Comm_valid_ptr( comm_ptr, mpi_errno ); if (mpi_errno != MPI_SUCCESS) { goto fn_fail; } MPIR_ERRTEST_COUNT(count, mpi_errno); MPIR_ERRTEST_DATATYPE(datatype, "datatype", mpi_errno); if (comm_ptr->comm_kind == MPID_INTRACOMM) { MPIR_ERRTEST_INTRA_ROOT(comm_ptr, root, mpi_errno); } else { MPIR_ERRTEST_INTER_ROOT(comm_ptr, root, mpi_errno); }
if (HANDLE_GET_KIND(datatype) != HANDLE_KIND_BUILTIN) { MPID_Datatype_get_ptr(datatype, datatype_ptr); MPID_Datatype_valid_ptr( datatype_ptr, mpi_errno ); MPID_Datatype_committed_ptr( datatype_ptr, mpi_errno ); }
MPIR_ERRTEST_BUF_INPLACE(buffer, count, mpi_errno); MPIR_ERRTEST_USERBUFFER(buffer,count,datatype,mpi_errno);
            if (mpi_errno != MPI_SUCCESS) {
                goto fn_fail;
            }
        }
        MPID_END_ERROR_CHECKS;
    }
#   endif /* HAVE_ERROR_CHECKING */
/* ... body of routine ... */
if (comm_ptr->coll_fns != NULL && comm_ptr->coll_fns->Bcast != NULL) { mpi_errno = comm_ptr->coll_fns->Bcast(buffer, count, datatype, root, comm_ptr); } else { MPIU_THREADPRIV_DECL; MPIU_THREADPRIV_GET;
MPIR_Nest_incr(); if (comm_ptr->comm_kind == MPID_INTRACOMM) { /* intracommunicator */ mpi_errno = MPIR_Bcast( buffer, count, datatype, root, comm_ptr ); } else { /* intercommunicator */ mpi_errno = MPIR_Bcast_inter( buffer, count, datatype, root, comm_ptr ); } MPIR_Nest_decr(); }
if (mpi_errno != MPI_SUCCESS) { goto fn_fail; }
    /* ... end of body of routine ... */

  fn_exit:
    MPID_MPI_COLL_FUNC_EXIT(MPID_STATE_MPI_BCAST);
    MPIU_THREAD_CS_EXIT(ALLFUNC,);
    return mpi_errno;

  fn_fail:
    /* --BEGIN ERROR HANDLING-- */
    mpi_errno = MPIR_Err_return_comm( comm_ptr, FCNAME, mpi_errno );
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
#if defined(_OSU_MVAPICH_)
int knomial_2level_Bcast( void *buffer, int count, MPI_Datatype datatype,
                          int nbytes, int root, MPID_Comm *comm_ptr)
{
    MPI_Comm comm, shmem_comm, leader_comm;
    MPID_Comm *shmem_commptr = 0, *leader_commptr = 0;
    int local_rank = -1, global_rank = -1, local_size = 0, rank, size;
    int leader_root = 0;
    int leader_of_root;
    int mpi_errno = MPI_SUCCESS;
    static const char FCNAME[] = "knomial_2level_Bcast";
    void *tmp_buf;
    int src, dst, mask, relative_rank, comm_size;
    int k;

    comm = comm_ptr->handle;
    PMPI_Comm_size ( comm, &size );
    rank = comm_ptr->rank;
shmem_comm = comm_ptr->shmem_comm; mpi_errno = PMPI_Comm_rank(shmem_comm, &local_rank); if(mpi_errno) { MPIU_ERR_POP(mpi_errno); } mpi_errno = PMPI_Comm_size(shmem_comm, &local_size); if(mpi_errno) { MPIU_ERR_POP(mpi_errno); }
MPID_Comm_get_ptr(shmem_comm,shmem_commptr);
leader_comm = comm_ptr->leader_comm; MPID_Comm_get_ptr(leader_comm,leader_commptr);
if ((local_rank == 0)&&(local_size > 1)) { global_rank = leader_commptr->rank; }
leader_of_root = comm_ptr->leader_map[root]; leader_root = comm_ptr->leader_rank[leader_of_root];
if (local_size > 1) { if ((local_rank == 0) && (root != rank) && (leader_root == global_rank)) { mpi_errno = MPIC_Recv (buffer,count,datatype, root, MPIR_BCAST_TAG, comm, MPI_STATUS_IGNORE); if (mpi_errno) { MPIU_ERR_POP(mpi_errno); } } if ((local_rank != 0) && (root == rank)) { mpi_errno = MPIC_Send(buffer,count,datatype, leader_of_root, MPIR_BCAST_TAG, comm); if (mpi_errno) { MPIU_ERR_POP(mpi_errno); } } }
/* inter-leader k-nomial broadcast */ if (local_size != size && local_rank == 0) { rank = leader_commptr->rank; root = leader_root; comm_size = leader_commptr->local_size;
relative_rank = (rank >= root) ? rank - root : rank - root + comm_size; mask = 0x1; while (mask < comm_size) { if (relative_rank % (inter_node_knomial_factor*mask)) { src = relative_rank/(inter_node_knomial_factor*mask) * (inter_node_knomial_factor*mask) + root; if (src >= comm_size) { src -= comm_size; }
mpi_errno = MPIC_Recv(buffer,nbytes,MPI_BYTE,src, MPIR_BCAST_TAG,leader_comm,MPI_STATUS_IGNORE); if (mpi_errno != MPI_SUCCESS) { MPIU_ERR_POP(mpi_errno); } break; } mask *= inter_node_knomial_factor; } mask /= inter_node_knomial_factor; while (mask > 0) { for(k=1;k<inter_node_knomial_factor;k++) { if (relative_rank + mask*k < comm_size) { dst = rank + mask*k; if (dst >= comm_size) { dst -= comm_size; } mpi_errno = MPIC_Send (buffer,nbytes,MPI_BYTE,dst, MPIR_BCAST_TAG,leader_comm); if (mpi_errno != MPI_SUCCESS) { MPIU_ERR_POP(mpi_errno); } } } mask /= inter_node_knomial_factor; } }
/* intra-node k-nomial bcast*/ if(local_size > 1) { rank = shmem_commptr->rank; root = 0; comm_size = shmem_commptr->local_size;
relative_rank = (rank >= root) ? rank - root : rank - root + comm_size; mask = 0x1; while (mask < comm_size) { if (relative_rank % (intra_node_knomial_factor*mask)) { src = relative_rank/(intra_node_knomial_factor*mask)* (intra_node_knomial_factor*mask)+root; if (src >= comm_size) { src -= comm_size; }
                mpi_errno = MPIC_Recv(buffer, nbytes, MPI_BYTE, src,
                                      MPIR_BCAST_TAG, shmem_comm, MPI_STATUS_IGNORE);
                if (mpi_errno != MPI_SUCCESS) { MPIU_ERR_POP(mpi_errno); }
                break;
            }
            mask *= intra_node_knomial_factor;
        }
        mask /= intra_node_knomial_factor;
        while (mask > 0) {
            for (k = 1; k < intra_node_knomial_factor; k++) {
                if (relative_rank + mask*k < comm_size) {
                    dst = rank + mask*k;
                    if (dst >= comm_size) { dst -= comm_size; }
                    mpi_errno = MPIC_Send (buffer, nbytes, MPI_BYTE, dst,
                                           MPIR_BCAST_TAG, shmem_comm);
                    if (mpi_errno != MPI_SUCCESS) { MPIU_ERR_POP(mpi_errno); }
                }
            }
            mask /= intra_node_knomial_factor;
        }
    }

  fn_fail:
    return mpi_errno;
}
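/* Editorial sketch (illustrative only, excluded from the build): the
   parent of a relative rank in a k-nomial tree of the kind built by the
   inter- and intra-node loops above, mirroring their mask arithmetic.
   The helper name and the radix parameter are hypothetical. */
#if 0
static int knomial_parent(int relative_rank, int comm_size, int radix)
{
    int mask = 0x1;
    while (mask < comm_size) {
        if (relative_rank % (radix * mask)) {
            /* round down to the nearest multiple of radix*mask */
            return (relative_rank / (radix * mask)) * (radix * mask);
        }
        mask *= radix;
    }
    return -1;   /* relative rank 0 is the root */
}
#endif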
int MPID_SHMEM_BCAST_init(int file_size, int shmem_comm_rank, int my_local_rank, int* bcast_seg_size, char** bcast_shmem_file, int* fd);
int MPID_SHMEM_BCAST_mmap(void** mmap_ptr, int bcast_seg_size, int fd, int my_local_rank, char* bcast_shmem_file);
extern void MPID_SHMEM_COLL_GetShmemBcastBuf(void**, void*); extern void signal_local_processes(int, int, char*, int, int, void*); extern void wait_for_signal(int, int, char**, int*, int*, void*);
int viadev_use_shmem_ring= 1; int intra_shmem_Bcast_Large( void *buffer, int count, MPI_Datatype datatype, int nbytes, int root, MPID_Comm *comm ) { MPI_Status status; int rank, size, src, dst; int relative_rank, mask; int mpi_errno = MPI_SUCCESS; int scatter_size, curr_size, recv_size, send_size; int j=0, i; int *recvcnts = NULL, *displs = NULL, left, right, jnext; void *tmp_buf = NULL;
char* shmem_buf; MPI_Comm shmem_comm, leader_comm = 0; MPID_Comm *comm_ptr = 0,*shmem_commptr = 0; int local_rank = -1, local_size=0, relative_lcomm_rank; int leader_comm_size, leader_comm_rank; int shmem_comm_rank, num_bytes=0, shmem_offset=-1; int index; int file_size = shmem_bcast_threshold; int ret_val = 0, flag = 0; /* Get my rank and switch communicators to the hidden collective */ rank = comm->rank; comm_ptr = comm; size = comm_ptr->local_size; index = comm_ptr->bcast_index; /* Obtaining the shared memory communicator information */ shmem_comm = comm_ptr->shmem_comm; /* MPI_Comm_rank(shmem_comm, &local_rank); MPI_Comm_size(shmem_comm, &local_size); */ MPID_Comm_get_ptr(shmem_comm, shmem_commptr); local_rank = shmem_commptr->rank; local_size = shmem_commptr->local_size;
shmem_comm_rank = shmem_commptr->shmem_comm_rank;
/* Obtaining the Leader Communicator information */ if (local_rank == 0) { leader_comm = comm_ptr->leader_comm; PMPI_Comm_rank(leader_comm, &leader_comm_rank);
} leader_comm_size = comm_ptr->leader_group_size;
/* Initialize the bcast segment for the first time */ if (comm_ptr->bcast_mmap_ptr == NULL) { ret_val = MPID_SHMEM_BCAST_init(file_size, shmem_comm_rank, local_rank, &(comm_ptr->bcast_seg_size), &(comm_ptr->bcast_shmem_file), &(comm_ptr->bcast_fd)); MPIR_Allreduce(&ret_val, &flag, 1, MPI_INT, MPI_LAND, comm_ptr); if (flag == 0) { return -1; } MPIR_Barrier(shmem_commptr); MPID_SHMEM_BCAST_mmap(&(comm_ptr->bcast_mmap_ptr), comm_ptr->bcast_seg_size, comm_ptr->bcast_fd, local_rank, comm_ptr->bcast_shmem_file); MPIR_Barrier(shmem_commptr); if (local_rank == 0) { unlink(comm_ptr->bcast_shmem_file); } }
if ((local_rank == 0) || (root == rank)) {
MPID_SHMEM_COLL_GetShmemBcastBuf((void *)&shmem_buf,comm_ptr->bcast_mmap_ptr); /* The collective uses the shared buffer for inter and intra node */ tmp_buf = shmem_buf; }
if (root == rank) { mpi_errno = MPIC_Sendrecv(buffer, count, datatype, rank, MPIR_BCAST_TAG, shmem_buf, count, datatype, rank, MPIR_BCAST_TAG, comm->handle, &status); }
MPIR_Barrier(shmem_commptr);
/* The Leader for the given root of the broadcast */ int leader_of_root = comm_ptr->leader_map[root]; /* The rank of the leader process in the Leader communicator*/ int leader_comm_root = comm_ptr->leader_rank[leader_of_root];
relative_rank = (rank >= root) ? rank - root : rank - root + size;
/* use long message algorithm: binomial tree scatter followed by an allgather */
    /* The scatter algorithm divides the buffer into nprocs pieces and
       scatters them among the processes.  Root gets the first piece,
       root+1 gets the second piece, and so forth.  It uses the same
       binomial tree algorithm as above.  Ceiling division is used to
       compute the size of each piece.  This means some processes may not
       get any data.  For example, if bufsize = 97 and nprocs = 16, the
       scatter size is 7 bytes and the last two ranks (14 and 15) get no
       data.  On each process, the scattered data is stored at the same
       offset in the buffer as it is on the root process. */
if (local_rank == 0) {
relative_lcomm_rank = (leader_comm_rank >= leader_comm_root) ? (leader_comm_rank - leader_comm_root) : (leader_comm_rank - leader_comm_root + leader_comm_size);
scatter_size = (nbytes + leader_comm_size - 1)/leader_comm_size; /* ceiling division */ curr_size = (leader_comm_rank == leader_comm_root) ? nbytes : 0; /* root starts with all the data */
        mask = 0x1;
        while (mask < leader_comm_size) {
            if (relative_lcomm_rank & mask) {
                src = leader_comm_rank - mask;
                if (src < 0) { src += leader_comm_size; }
                recv_size = nbytes - relative_lcomm_rank*scatter_size;
                /* recv_size is larger than what might actually be sent by
                   the sender.  We don't need to compute the exact value
                   because MPI allows you to post a larger recv. */
                if (recv_size <= 0) {
                    curr_size = 0; /* this process doesn't receive any data
                                      because of uneven division */
                } else {
                    mpi_errno = MPIC_Recv((void *)((char *)tmp_buf + relative_lcomm_rank*scatter_size),
                                          recv_size, MPI_BYTE, src,
                                          MPIR_BCAST_TAG, leader_comm, &status);
                    if (mpi_errno) { return mpi_errno; }
/* query actual size of data received */ PMPI_Get_count(&status, MPI_BYTE, &curr_size); } break; } mask <<= 1; }
        /* This process is responsible for all processes that have bits
           set from the LSB up to (but not including) mask.  Because of
           the "not including", we start by shifting mask back down one. */
mask >>= 1; while (mask > 0) { if (relative_lcomm_rank + mask < leader_comm_size) {
                send_size = curr_size - scatter_size * mask;
                /* mask is also the size of this process's subtree */
if (send_size > 0) { dst = leader_comm_rank + mask; if (dst >= leader_comm_size) { dst -= leader_comm_size; } mpi_errno = MPIC_Send (((char *)tmp_buf + scatter_size*(relative_lcomm_rank+mask)), send_size, MPI_BYTE, dst, MPIR_BCAST_TAG, leader_comm); if (mpi_errno) { return mpi_errno; } curr_size -= send_size; } } mask >>= 1; } }
#if 1
    /* Scatter complete.  Now do an allgather. */
/* use ring algorithm. */ if (local_rank == 0) { recvcnts = MPIU_Malloc(leader_comm_size*sizeof(int)); displs = MPIU_Malloc(leader_comm_size*sizeof(int));
for (i=0; i<leader_comm_size; i++) { recvcnts[i] = nbytes - i*scatter_size; if (recvcnts[i] > scatter_size) { recvcnts[i] = scatter_size; } if (recvcnts[i] < 0) { recvcnts[i] = 0; } }
displs[0] = 0; for (i=1; i<leader_comm_size; i++) { displs[i] = displs[i-1] + recvcnts[i-1]; }
left = (leader_comm_size + leader_comm_rank - 1) % leader_comm_size; right = (leader_comm_rank + 1) % leader_comm_size;
j = leader_comm_rank; jnext = left; for (i=1; i<leader_comm_size; i++) { signal_local_processes(i, index, (char *)tmp_buf+displs[(j-leader_comm_root+leader_comm_size)%leader_comm_size], displs[(j-leader_comm_root+leader_comm_size)%leader_comm_size], recvcnts[(j-leader_comm_root+leader_comm_size)%leader_comm_size], comm_ptr->bcast_mmap_ptr); mpi_errno = MPIC_Sendrecv((char *)tmp_buf+displs[(j-leader_comm_root+leader_comm_size)%leader_comm_size], recvcnts[(j-leader_comm_root+leader_comm_size)%leader_comm_size], MPI_BYTE, right, MPIR_BCAST_TAG, (char *)tmp_buf + displs[(jnext-leader_comm_root+leader_comm_size)%leader_comm_size], recvcnts[(jnext-leader_comm_root+leader_comm_size)%leader_comm_size], MPI_BYTE, left, MPIR_BCAST_TAG, leader_comm, &status ); if (mpi_errno) { break; } j = jnext; jnext = (leader_comm_size + jnext - 1) % leader_comm_size; }
} else { for (i=1; i<leader_comm_size; i++) { wait_for_signal(i, index, &shmem_buf, &shmem_offset, &num_bytes, comm_ptr->bcast_mmap_ptr);
/* Copy the data out from shmem buf */ mpi_errno = MPIC_Sendrecv((void *)shmem_buf, num_bytes, MPI_BYTE, rank, MPIR_BCAST_TAG, (void *)((char *)buffer + shmem_offset), num_bytes, MPI_BYTE, rank, MPIR_BCAST_TAG, comm->handle, &status);
} } /* The leader copies the data only in the end from the shmem buffer */ if (local_rank == 0) { signal_local_processes(i, index, (char *)tmp_buf+displs[(j-leader_comm_root+leader_comm_size)%leader_comm_size], displs[(j-leader_comm_root+leader_comm_size)%leader_comm_size], recvcnts[(j-leader_comm_root+leader_comm_size)%leader_comm_size], comm_ptr->bcast_mmap_ptr); /* Copy the data out from shmem buf */ mpi_errno = MPIC_Sendrecv((void *)shmem_buf, nbytes, MPI_BYTE, rank, MPIR_BCAST_TAG, (void *)((char *)buffer), nbytes, MPI_BYTE, rank, MPIR_BCAST_TAG, comm->handle, &status); MPIU_Free(recvcnts); MPIU_Free(displs); } else { wait_for_signal(i, index, &shmem_buf, &shmem_offset, &num_bytes, comm_ptr->bcast_mmap_ptr);
/* Copy the data out from shmem buf */ mpi_errno = MPIC_Sendrecv((void *)shmem_buf, num_bytes, MPI_BYTE, rank, MPIR_BCAST_TAG, (void *)((char *)buffer + shmem_offset), num_bytes, MPI_BYTE, rank, MPIR_BCAST_TAG, comm->handle, &status); }
#endif
    MPIR_Barrier(shmem_commptr);
    /* For the bcast signalling flags */
    index = (index + 1) % 3;
    comm_ptr->bcast_index = index;
return (mpi_errno); }
#endif
/* A simple utility function that calls the comm_ptr->coll_fns->Bcast
   override if it exists, or else calls MPIR_Bcast with the same
   arguments.  This function just makes the high-level broadcast logic
   easier to read while still accommodating coll_fns-style overrides.
   It also reduces future errors by eliminating the duplication of Bcast
   arguments.

   This routine is used in other files as well (barrier.c, allreduce.c).

   TODO This function should be deprecated in favor of a direct call to
   MPIR_Bcast now that we handle SMP-awareness inside of MPIR_Bcast
   instead of MPI_Bcast. */
#undef FUNCNAME
#define FUNCNAME MPIR_Bcast_or_coll_fn
#undef FCNAME
#define FCNAME MPIU_QUOTE(FUNCNAME)
int MPIR_Bcast_or_coll_fn(void *buffer, int count, MPI_Datatype datatype,
                          int root, MPID_Comm *comm_ptr)
{
    int mpi_errno = MPI_SUCCESS;
    if (comm_ptr->coll_fns != NULL && comm_ptr->coll_fns->Bcast != NULL) {
        /* --BEGIN USEREXTENSION-- */
        /* call the override on the same communicator that was checked above */
        mpi_errno = comm_ptr->coll_fns->Bcast(buffer, count, datatype,
                                              root, comm_ptr);
        /* --END USEREXTENSION-- */
    }
    else {
        mpi_errno = MPIR_Bcast(buffer, count, datatype, root, comm_ptr);
    }
return mpi_errno; }
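/* Editorial sketch (illustrative only, excluded from the build): how
   another collective, such as an allreduce implementation, might use this
   helper to broadcast a result computed on rank 0.  The names recvbuf,
   count, datatype and comm_ptr are hypothetical stand-ins for the
   caller's own arguments. */
#if 0
    mpi_errno = MPIR_Bcast_or_coll_fn(recvbuf, count, datatype, 0, comm_ptr);
    if (mpi_errno != MPI_SUCCESS) {
        MPIU_ERR_POP(mpi_errno);
    }
#endif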
#endif /* defined(_OSU_COLLECTIVES_) */