Skip to content

Instantly share code, notes, and snippets.

@ggouaillardet
Created February 1, 2019 01:01
Show Gist options
  • Select an option

  • Save ggouaillardet/2ea8d2207c1bdeedcc655e556f7eeed2 to your computer and use it in GitHub Desktop.

Select an option

Save ggouaillardet/2ea8d2207c1bdeedcc655e556f7eeed2 to your computer and use it in GitHub Desktop.
diff --git a/ompi/mca/osc/rdma/osc_rdma.h b/ompi/mca/osc/rdma/osc_rdma.h
index b3743f2..ca578ab 100644
--- a/ompi/mca/osc/rdma/osc_rdma.h
+++ b/ompi/mca/osc/rdma/osc_rdma.h
@@ -13,6 +13,8 @@
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved.
* Copyright (c) 2016-2018 Intel, Inc. All rights reserved.
+ * Copyright (c) 2019 Triad National Security, LLC. All rights
+ * reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@@ -103,6 +105,9 @@ struct ompi_osc_rdma_component_t {
/** directory where to place backing files */
char *backing_directory;
+
+ /** maximum count for network AMO usage */
+ unsigned long network_amo_max_count;
};
typedef struct ompi_osc_rdma_component_t ompi_osc_rdma_component_t;
@@ -154,6 +159,9 @@ struct ompi_osc_rdma_module_t {
/** Local displacement unit. */
int disp_unit;
+ /** maximum count for network AMO usage */
+ unsigned long network_amo_max_count;
+
/** global leader */
ompi_osc_rdma_peer_t *leader;
diff --git a/ompi/mca/osc/rdma/osc_rdma_accumulate.c b/ompi/mca/osc/rdma/osc_rdma_accumulate.c
index 31c3fc2..ac2191c 100644
--- a/ompi/mca/osc/rdma/osc_rdma_accumulate.c
+++ b/ompi/mca/osc/rdma/osc_rdma_accumulate.c
@@ -2,9 +2,11 @@
/*
* Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights
* reserved.
- * Copyright (c) 2016-2017 Research Organization for Information Science
- * and Technology (RIST). All rights reserved.
+ * Copyright (c) 2016-2019 Research Organization for Information Science
+ * and Technology (RIST). All rights reserved.
* Copyright (c) 2016-2018 Intel, Inc. All rights reserved.
+ * Copyright (c) 2019 Triad National Security, LLC. All rights
+ * reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@@ -50,71 +52,6 @@ struct ompi_osc_rdma_event_t {
typedef struct ompi_osc_rdma_event_t ompi_osc_rdma_event_t;
-#if 0
-static void *ompi_osc_rdma_event_put (int fd, int flags, void *context)
-{
- ompi_osc_rdma_event_t *event = (ompi_osc_rdma_event_t *) context;
- int ret;
-
- ret = event->module->selected_btl->btl_put (event->module->selected_btl, event->endpoint, event->local_address,
- event->remote_address, event->local_handle, event->remote_handle,
- event->length, 0, MCA_BTL_NO_ORDER, event->cbfunc, event->cbcontext,
- event->cbdata);
- if (OPAL_LIKELY(OPAL_SUCCESS == ret)) {
- /* done with this event */
- opal_event_del (&event->super);
- free (event);
- } else {
- /* re-activate the event */
- opal_event_active (&event->super, OPAL_EV_READ, 1);
- }
-
- return NULL;
-}
-
-static int ompi_osc_rdma_event_queue (ompi_osc_rdma_module_t *module, struct mca_btl_base_endpoint_t *endpoint,
- ompi_osc_rdma_event_type_t event_type, void *local_address, mca_btl_base_registration_handle_t *local_handle,
- uint64_t remote_address, mca_btl_base_registration_handle_t *remote_handle,
- uint64_t length, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext,
- void *cbdata)
-{
- ompi_osc_rdma_event_t *event = malloc (sizeof (*event));
- void *(*event_func) (int, int, void *);
-
- OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "queueing event type %d", event_type);
-
- if (OPAL_UNLIKELY(NULL == event)) {
- return OMPI_ERR_OUT_OF_RESOURCE;
- }
-
- event->module = module;
- event->endpoint = endpoint;
- event->local_address = local_address;
- event->local_handle = local_handle;
- event->remote_address = remote_address;
- event->remote_handle = remote_handle;
- event->length = length;
- event->cbfunc = cbfunc;
- event->cbcontext = cbcontext;
- event->cbdata = cbdata;
-
- switch (event_type) {
- case OMPI_OSC_RDMA_EVENT_TYPE_PUT:
- event_func = ompi_osc_rdma_event_put;
- break;
- default:
- opal_output(0, "osc/rdma: cannot queue unknown event type %d", event_type);
- abort ();
- }
-
- opal_event_set (opal_sync_event_base, &event->super, -1, OPAL_EV_READ,
- event_func, event);
- opal_event_active (&event->super, OPAL_EV_READ, 1);
-
- return OMPI_SUCCESS;
-}
-#endif
-
static int ompi_osc_rdma_gacc_local (const void *source_buffer, int source_count, ompi_datatype_t *source_datatype,
void *result_buffer, int result_count, ompi_datatype_t *result_datatype,
ompi_osc_rdma_peer_t *peer, uint64_t target_address,
@@ -127,7 +64,7 @@ static int ompi_osc_rdma_gacc_local (const void *source_buffer, int source_count
do {
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "performing accumulate with local region(s)");
- if (NULL != result_buffer) {
+ if (NULL != result_datatype) {
/* get accumulate */
ret = ompi_datatype_sndrcv ((void *) (intptr_t) target_address, target_count, target_datatype,
@@ -182,9 +119,212 @@ static inline int ompi_osc_rdma_cas_local (const void *source_addr, const void *
return OMPI_SUCCESS;
}
+static int ompi_osc_rdma_op_mapping[OMPI_OP_NUM_OF_TYPES + 1] = {
+ [OMPI_OP_MAX] = MCA_BTL_ATOMIC_MAX,
+ [OMPI_OP_MIN] = MCA_BTL_ATOMIC_MIN,
+ [OMPI_OP_SUM] = MCA_BTL_ATOMIC_ADD,
+ [OMPI_OP_BAND] = MCA_BTL_ATOMIC_AND,
+ [OMPI_OP_BOR] = MCA_BTL_ATOMIC_OR,
+ [OMPI_OP_BXOR] = MCA_BTL_ATOMIC_XOR,
+ [OMPI_OP_LAND] = MCA_BTL_ATOMIC_LAND,
+ [OMPI_OP_LOR] = MCA_BTL_ATOMIC_LOR,
+ [OMPI_OP_LXOR] = MCA_BTL_ATOMIC_LXOR,
+ [OMPI_OP_REPLACE] = MCA_BTL_ATOMIC_SWAP,
+};
+
+static int ompi_osc_rdma_fetch_and_op_atomic (ompi_osc_rdma_sync_t *sync, const void *origin_addr, void *result_addr, ompi_datatype_t *dt,
+ ptrdiff_t extent, ompi_osc_rdma_peer_t *peer, uint64_t target_address,
+ mca_btl_base_registration_handle_t *target_handle, ompi_op_t *op, ompi_osc_rdma_request_t *req)
+{
+ ompi_osc_rdma_module_t *module = sync->module;
+ int32_t atomic_flags = module->selected_btl->btl_atomic_flags;
+ int btl_op, flags;
+ int64_t origin;
+
+ if ((8 != extent && !((MCA_BTL_ATOMIC_SUPPORTS_32BIT & atomic_flags) && 4 == extent)) ||
+ (!(OMPI_DATATYPE_FLAG_DATA_INT & dt->super.flags) && !(MCA_BTL_ATOMIC_SUPPORTS_FLOAT & atomic_flags)) ||
+ !ompi_op_is_intrinsic (op) || (0 == ompi_osc_rdma_op_mapping[op->op_type])) {
+ return OMPI_ERR_NOT_SUPPORTED;
+ }
+
+ btl_op = ompi_osc_rdma_op_mapping[op->op_type];
+ if (0 == btl_op) {
+ return OMPI_ERR_NOT_SUPPORTED;
+ }
+
+ flags = (4 == extent) ? MCA_BTL_ATOMIC_FLAG_32BIT : 0;
+ if (OMPI_DATATYPE_FLAG_DATA_FLOAT & dt->super.flags) {
+ flags |= MCA_BTL_ATOMIC_FLAG_FLOAT;
+ }
+
+ OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "initiating fetch-and-op using %d-bit btl atomics. origin: 0x%" PRIx64,
+ (4 == extent) ? 32 : 64, *((int64_t *) origin_addr));
+
+ origin = (8 == extent) ? ((int64_t *) origin_addr)[0] : ((int32_t *) origin_addr)[0];
+
+ return ompi_osc_rdma_btl_fop (module, peer->data_endpoint, target_address, target_handle, btl_op, origin, flags,
+ result_addr, true, NULL, NULL, NULL);
+}
+
+static int ompi_osc_rdma_fetch_and_op_cas (ompi_osc_rdma_sync_t *sync, const void *origin_addr, void *result_addr, ompi_datatype_t *dt,
+ ptrdiff_t extent, ompi_osc_rdma_peer_t *peer, uint64_t target_address,
+ mca_btl_base_registration_handle_t *target_handle, ompi_op_t *op, ompi_osc_rdma_request_t *req)
+{
+ ompi_osc_rdma_module_t *module = sync->module;
+ uint64_t address, offset, new_value, old_value;
+ int ret;
+
+ if (extent > 8) {
+ return OMPI_ERR_NOT_SUPPORTED;
+ }
+
+ /* align the address. the user should not call with an unaligned address so don't need to range check here */
+ address = target_address & ~7;
+ offset = target_address & ~address;
+
+ OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "initiating fetch-and-op using compare-and-swap");
+
+ ret = ompi_osc_get_data_blocking (module, peer->data_endpoint, address, target_handle, &old_value, 8);
+ if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
+ return ret;
+ }
+
+ /* store the destination in the temporary buffer */
+ do {
+ new_value = old_value;
+
+ if (&ompi_mpi_op_replace.op == op) {
+ memcpy ((void *)((intptr_t) &new_value + offset), origin_addr + dt->super.true_lb, extent);
+ } else if (&ompi_mpi_op_no_op.op != op) {
+ ompi_op_reduce (op, (void *) origin_addr + dt->super.true_lb, (void*)((intptr_t) &new_value + offset), 1, dt);
+ }
+
+ ret = ompi_osc_rdma_btl_cswap (module, peer->data_endpoint, address, target_handle,
+ old_value, new_value, 0, (int64_t*)&new_value);
+ if (OPAL_SUCCESS != ret || new_value == old_value) {
+ break;
+ }
+
+ old_value = new_value;
+ } while (1);
+
+ if (result_addr) {
+ memcpy (result_addr, (void *)((intptr_t) &new_value + offset), extent);
+ }
+
+ return ret;
+}
+
+static int ompi_osc_rdma_acc_single_atomic (ompi_osc_rdma_sync_t *sync, const void *origin_addr, ompi_datatype_t *dt, ptrdiff_t extent,
+ ompi_osc_rdma_peer_t *peer, uint64_t target_address, mca_btl_base_registration_handle_t *target_handle,
+ ompi_op_t *op, ompi_osc_rdma_request_t *req)
+{
+ ompi_osc_rdma_module_t *module = sync->module;
+ int32_t atomic_flags = module->selected_btl->btl_atomic_flags;
+ int ret, btl_op, flags;
+ int64_t origin;
+
+ if (!(module->selected_btl->btl_flags & MCA_BTL_FLAGS_ATOMIC_OPS)) {
+ /* btl put atomics not supported or disabled. fall back on fetch-and-op */
+ return ompi_osc_rdma_fetch_and_op_atomic (sync, origin_addr, NULL, dt, extent, peer, target_address, target_handle,
+ op, req);
+ }
+
+ if ((8 != extent && !((MCA_BTL_ATOMIC_SUPPORTS_32BIT & atomic_flags) && 4 == extent)) ||
+ (!(OMPI_DATATYPE_FLAG_DATA_INT & dt->super.flags) && !(MCA_BTL_ATOMIC_SUPPORTS_FLOAT & atomic_flags)) ||
+ !ompi_op_is_intrinsic (op) || (0 == ompi_osc_rdma_op_mapping[op->op_type])) {
+ return OMPI_ERR_NOT_SUPPORTED;
+ }
+
+ origin = (8 == extent) ? ((uint64_t *) origin_addr)[0] : ((uint32_t *) origin_addr)[0];
+
+ /* set the appropriate flags for this atomic */
+ flags = (4 == extent) ? MCA_BTL_ATOMIC_FLAG_32BIT : 0;
+ if (OMPI_DATATYPE_FLAG_DATA_FLOAT & dt->super.flags) {
+ flags |= MCA_BTL_ATOMIC_FLAG_FLOAT;
+ }
+
+ btl_op = ompi_osc_rdma_op_mapping[op->op_type];
+
+ OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "initiating accumulate using 64-bit btl atomics. origin: 0x%" PRIx64,
+ *((int64_t *) origin_addr));
+
+ /* if we locked the peer its best to wait for completion before returning */
+ return ompi_osc_rdma_btl_op (module, peer->data_endpoint, target_address, target_handle, btl_op, origin,
+ flags, true, NULL, NULL, NULL);
+}
+
+static inline int ompi_osc_rdma_gacc_amo (ompi_osc_rdma_module_t *module, ompi_osc_rdma_sync_t *sync, const void *source, void *result,
+ int result_count, ompi_datatype_t *result_datatype, opal_convertor_t *result_convertor,
+ ompi_osc_rdma_peer_t *peer, uint64_t target_address,
+ mca_btl_base_registration_handle_t *target_handle, int count,
+ ompi_datatype_t *datatype, ompi_op_t *op, ompi_osc_rdma_request_t *request)
+{
+ const bool use_amo = module->acc_use_amo;
+ const size_t dt_size = datatype->super.size;
+ uint64_t offset = 0;
+ int ret;
+
+ OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "using network atomics for accumulate operation with count %d", count);
+
+ /* only one can be non-NULL */
+ assert (NULL == result || NULL == result_convertor);
+
+ if (NULL == result && NULL != result_convertor) {
+ result = malloc (request->len);
+ if (OPAL_UNLIKELY(NULL == result)) {
+ return OMPI_ERR_OUT_OF_RESOURCE;
+ }
+ }
+
+ for (int i = 0 ; i < count ; ) {
+ if (use_amo) {
+ if (NULL == result) {
+ ret = ompi_osc_rdma_acc_single_atomic (sync, source, datatype, dt_size, peer, target_address, target_handle, op, request);
+ } else {
+ ret = ompi_osc_rdma_fetch_and_op_atomic (sync, source, result, datatype, dt_size, peer, target_address, target_handle, op,
+ request);
+ }
+ } else {
+ ret = ompi_osc_rdma_fetch_and_op_cas (sync, source, result, datatype, dt_size, peer, target_address, target_handle, op,
+ request);
+ }
+
+ if (OPAL_LIKELY(OMPI_SUCCESS == ret)) {
+ if (source) {
+ source = (const void *) ((intptr_t) source + dt_size);
+ }
+ if (result) {
+ result = (void *) ((intptr_t) result + dt_size);
+ }
+ target_address += dt_size;
+ ++i;
+ } else if (OPAL_UNLIKELY(OMPI_ERR_NOT_SUPPORTED == ret)) {
+ return OMPI_ERR_NOT_SUPPORTED;
+ }
+ }
+
+ if (NULL != result_convertor) {
+ /* result buffer is not necessarily contiguous. use the opal datatype engine to
+ * copy the data over in this case */
+ struct iovec iov = {.iov_base = result, .iov_len = request->len};
+ uint32_t iov_count = 1;
+ size_t size = request->len;
+
+ opal_convertor_unpack (result_convertor, &iov, &iov_count, &size);
+ }
+
+ if (request) {
+ ompi_osc_rdma_request_complete (request, MPI_SUCCESS);
+ }
+
+ return OMPI_SUCCESS;
+}
+
static inline int ompi_osc_rdma_gacc_contig (ompi_osc_rdma_sync_t *sync, const void *source, int source_count,
ompi_datatype_t *source_datatype, void *result, int result_count,
- ompi_datatype_t *result_datatype, ompi_osc_rdma_peer_t *peer, uint64_t target_address,
+ ompi_datatype_t *result_datatype, opal_convertor_t *result_convertor,
+ ompi_osc_rdma_peer_t *peer, uint64_t target_address,
mca_btl_base_registration_handle_t *target_handle, int target_count,
ompi_datatype_t *target_datatype, ompi_op_t *op, ompi_osc_rdma_request_t *request)
{
@@ -196,6 +336,20 @@ static inline int ompi_osc_rdma_gacc_contig (ompi_osc_rdma_sync_t *sync, const v
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "initiating accumulate on contiguous region of %lu bytes to remote address %" PRIx64
", sync %p", len, target_address, (void *) sync);
+ /* if the datatype is small enough (and the count is within the network AMO limit) then try to directly
+ * use the hardware to execute the atomic operation. this should be safe in all cases as either 1) the user
+ * has assured us they will never use atomics with count > 1, 2) we have the accumulate lock, or 3) we have
+ * an exclusive lock */
+ if (target_datatype->super.size <= 8 && target_count <= module->network_amo_max_count) {
+ ret = ompi_osc_rdma_gacc_amo (module, sync, source, result, result_count, result_datatype, result_convertor,
+ peer, target_address, target_handle, target_count, target_datatype, op, request);
+ if (OPAL_LIKELY(OMPI_SUCCESS == ret)) {
+ return OMPI_SUCCESS;
+ }
+ OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "network atomics not available. falling back to get-op-put implementation...");
+ }
+
+ OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "using get-op-put to execute accumulate with count %d", target_count);
+
if (&ompi_mpi_op_replace.op != op || OMPI_OSC_RDMA_TYPE_GET_ACC == request->type) {
ptr = malloc (len);
if (OPAL_UNLIKELY(NULL == ptr)) {
@@ -219,8 +373,7 @@ static inline int ompi_osc_rdma_gacc_contig (ompi_osc_rdma_sync_t *sync, const v
uint32_t iov_count = 1;
size_t size = request->len;
- opal_convertor_unpack (&request->convertor, &iov, &iov_count, &size);
- opal_convertor_cleanup (&request->convertor);
+ opal_convertor_unpack (result_convertor, &iov, &iov_count, &size);
} else {
/* copy contiguous data to the result buffer */
ompi_datatype_sndrcv (ptr, len, MPI_BYTE, result, result_count, result_datatype);
@@ -264,7 +417,7 @@ static inline int ompi_osc_rdma_gacc_master (ompi_osc_rdma_sync_t *sync, const v
struct iovec source_iovec[OMPI_OSC_RDMA_DECODE_MAX], target_iovec[OMPI_OSC_RDMA_DECODE_MAX];
const size_t acc_limit = (mca_osc_rdma_component.buffer_size >> 3);
uint32_t source_primitive_count, target_primitive_count;
- opal_convertor_t source_convertor, target_convertor;
+ opal_convertor_t source_convertor, target_convertor, result_convertor;
uint32_t source_iov_count, target_iov_count;
uint32_t source_iov_index, target_iov_index;
ompi_datatype_t *source_primitive, *target_primitive;
@@ -281,6 +434,13 @@ static inline int ompi_osc_rdma_gacc_master (ompi_osc_rdma_sync_t *sync, const v
request->internal = true;
}
+ if (&ompi_mpi_op_no_op.op == op) {
+ /* NTH: just zero these out to catch any coding errors (they should be ignored in the no-op case) */
+ source_count = 0;
+ source_datatype = NULL;
+ source_addr = NULL;
+ }
+
request->cleanup = ompi_osc_rdma_gacc_master_cleanup;
request->type = result_datatype ? OMPI_OSC_RDMA_TYPE_GET_ACC : OMPI_OSC_RDMA_TYPE_ACC;
@@ -303,7 +463,7 @@ static inline int ompi_osc_rdma_gacc_master (ompi_osc_rdma_sync_t *sync, const v
}
ret = ompi_osc_rdma_gacc_contig (sync, source_addr, source_count, source_datatype, result_addr,
- result_count, result_datatype, peer, target_address,
+ result_count, result_datatype, NULL, peer, target_address,
target_handle, target_count, target_datatype, op,
request);
if (OPAL_LIKELY(OMPI_SUCCESS == ret)) {
@@ -357,6 +517,20 @@ static inline int ompi_osc_rdma_gacc_master (ompi_osc_rdma_sync_t *sync, const v
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
return ret;
}
+ source_iov_count = 0;
+ } else {
+ source_iovec[0].iov_len = (size_t) -1;
+ source_iovec[0].iov_base = NULL;
+ source_iov_count = 1;
+ }
+
+ if (result_datatype) {
+ OBJ_CONSTRUCT(&result_convertor, opal_convertor_t);
+ ret = opal_convertor_copy_and_prepare_for_recv (ompi_mpi_local_convertor, &result_datatype->super, result_count, result_addr,
+ 0, &result_convertor);
+ if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
+ return ret;
+ }
}
/* target_datatype can never be NULL */
@@ -372,85 +546,77 @@ static inline int ompi_osc_rdma_gacc_master (ompi_osc_rdma_sync_t *sync, const v
target_iov_index = 0;
target_iov_count = 0;
+ source_iov_index = 0;
result_position = 0;
subreq = NULL;
do {
- /* decode segments of the source data */
- source_iov_count = OMPI_OSC_RDMA_DECODE_MAX;
- source_iov_index = 0;
- /* opal_convertor_raw returns done when it has reached the end of the data */
- if (!source_datatype) {
- done = true;
- source_iovec[0].iov_len = (size_t) -1;
- source_iovec[0].iov_base = NULL;
- source_iov_count = 1;
- } else {
- done = opal_convertor_raw (&source_convertor, source_iovec, &source_iov_count, &source_size);
- }
-
- /* loop on the target segments until we have exhaused the decoded source data */
- while (source_iov_index != source_iov_count) {
- if (target_iov_index == target_iov_count) {
- /* decode segments of the target buffer */
- target_iov_count = OMPI_OSC_RDMA_DECODE_MAX;
- target_iov_index = 0;
- (void) opal_convertor_raw (&target_convertor, target_iovec, &target_iov_count, &target_size);
+ /* decode segments of the target buffer */
+ target_iov_count = OMPI_OSC_RDMA_DECODE_MAX;
+ target_iov_index = 0;
+ done = opal_convertor_raw (&target_convertor, target_iovec, &target_iov_count, &target_size);
+
+ /* loop on the source segments (if any) until we have exhausted the decoded target data */
+ while (target_iov_index != target_iov_count) {
+ if (source_iov_count == source_iov_index) {
+ /* decode segments of the source data */
+ source_iov_count = OMPI_OSC_RDMA_DECODE_MAX;
+ source_iov_index = 0;
+ (void) opal_convertor_raw (&source_convertor, source_iovec, &source_iov_count, &source_size);
}
/* we already checked that the target was large enough. this should be impossible */
assert (0 != target_iov_count);
/* determine how much to put in this operation */
- acc_len = min(target_iovec[target_iov_index].iov_len, source_iovec[source_iov_index].iov_len);
- acc_len = min((size_t) acc_len, acc_limit);
-
- /* execute the get */
- if (!subreq) {
- OMPI_OSC_RDMA_REQUEST_ALLOC(module, peer, subreq);
- subreq->internal = true;
- subreq->parent_request = request;
- (void) OPAL_THREAD_ADD_FETCH32 (&request->outstanding_requests, 1);
- }
-
- if (result_datatype) {
- /* prepare a convertor for this part of the result */
- opal_convertor_copy_and_prepare_for_recv (ompi_mpi_local_convertor, &result_datatype->super, result_count,
- result_addr, 0, &subreq->convertor);
- opal_convertor_set_position (&subreq->convertor, &result_position);
- subreq->type = OMPI_OSC_RDMA_TYPE_GET_ACC;
+ if (source_count) {
+ acc_len = min(min(target_iovec[target_iov_index].iov_len, source_iovec[source_iov_index].iov_len), acc_limit);
} else {
- subreq->type = OMPI_OSC_RDMA_TYPE_ACC;
+ acc_len = min(target_iovec[target_iov_index].iov_len, acc_limit);
}
- ret = ompi_osc_rdma_gacc_contig (sync, source_iovec[source_iov_index].iov_base, acc_len / target_primitive->super.size,
- target_primitive, NULL, 0, NULL, peer,
- (uint64_t) (intptr_t) target_iovec[target_iov_index].iov_base, target_handle,
- acc_len / target_primitive->super.size, target_primitive, op, subreq);
- if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
- if (OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE != ret)) {
- OMPI_OSC_RDMA_REQUEST_RETURN(subreq);
- (void) OPAL_THREAD_ADD_FETCH32 (&request->outstanding_requests, -1);
- /* something bad happened. need to figure out how to handle these errors */
- return ret;
+ if (0 != acc_len) {
+ /* execute the get-accumulate */
+ if (!subreq) {
+ OMPI_OSC_RDMA_REQUEST_ALLOC(module, peer, subreq);
+ subreq->internal = true;
+ subreq->parent_request = request;
+ subreq->type = result_datatype ? OMPI_OSC_RDMA_TYPE_GET_ACC : OMPI_OSC_RDMA_TYPE_ACC;
+ (void) OPAL_THREAD_ADD_FETCH32 (&request->outstanding_requests, 1);
}
- /* progress and try again */
- ompi_osc_rdma_progress (module);
- continue;
+ ret = ompi_osc_rdma_gacc_contig (sync, source_iovec[source_iov_index].iov_base, acc_len / target_primitive->super.size,
+ target_primitive, NULL, 0, NULL, &result_convertor, peer,
+ (uint64_t) (intptr_t) target_iovec[target_iov_index].iov_base, target_handle,
+ acc_len / target_primitive->super.size, target_primitive, op, subreq);
+ if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
+ if (OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE != ret)) {
+ OMPI_OSC_RDMA_REQUEST_RETURN(subreq);
+ (void) OPAL_THREAD_ADD_FETCH32 (&request->outstanding_requests, -1);
+ /* something bad happened. need to figure out how to handle these errors */
+ return ret;
+ }
+
+ /* progress and try again */
+ ompi_osc_rdma_progress (module);
+ continue;
+ }
}
subreq = NULL;
/* adjust io vectors */
target_iovec[target_iov_index].iov_len -= acc_len;
- source_iovec[source_iov_index].iov_len -= acc_len;
target_iovec[target_iov_index].iov_base = (void *)((intptr_t) target_iovec[target_iov_index].iov_base + acc_len);
- source_iovec[source_iov_index].iov_base = (void *)((intptr_t) source_iovec[source_iov_index].iov_base + acc_len);
+ target_iov_index += (0 == target_iovec[target_iov_index].iov_len);
+
result_position += acc_len;
- source_iov_index += !source_datatype || (0 == source_iovec[source_iov_index].iov_len);
- target_iov_index += (0 == target_iovec[target_iov_index].iov_len);
+ if (source_datatype) {
+ source_iov_index += (0 == source_iovec[source_iov_index].iov_len);
+ source_iovec[source_iov_index].iov_len -= acc_len;
+ source_iovec[source_iov_index].iov_base = (void *)((intptr_t) source_iovec[source_iov_index].iov_base + acc_len);
+ }
}
} while (!done);
@@ -462,6 +628,11 @@ static inline int ompi_osc_rdma_gacc_master (ompi_osc_rdma_sync_t *sync, const v
OBJ_DESTRUCT(&source_convertor);
}
+ if (result_datatype) {
+ opal_convertor_cleanup (&result_convertor);
+ OBJ_DESTRUCT(&result_convertor);
+ }
+
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "finished scheduling rdma on non-contiguous datatype(s)");
opal_convertor_cleanup (&target_convertor);
@@ -478,7 +649,7 @@ static inline int ompi_osc_rdma_cas_atomic (ompi_osc_rdma_sync_t *sync, const vo
ompi_osc_rdma_module_t *module = sync->module;
const size_t size = datatype->super.size;
int64_t compare, source;
- int ret, flags;
+ int flags, ret;
if (8 != size && !(4 == size && (MCA_BTL_ATOMIC_SUPPORTS_32BIT & module->selected_btl->btl_flags))) {
return OMPI_ERR_NOT_SUPPORTED;
@@ -500,172 +671,6 @@ static inline int ompi_osc_rdma_cas_atomic (ompi_osc_rdma_sync_t *sync, const vo
return ret;
}
-static int ompi_osc_rdma_op_mapping[OMPI_OP_NUM_OF_TYPES + 1] = {
- [OMPI_OP_MAX] = MCA_BTL_ATOMIC_MAX,
- [OMPI_OP_MIN] = MCA_BTL_ATOMIC_MIN,
- [OMPI_OP_SUM] = MCA_BTL_ATOMIC_ADD,
- [OMPI_OP_BAND] = MCA_BTL_ATOMIC_AND,
- [OMPI_OP_BOR] = MCA_BTL_ATOMIC_OR,
- [OMPI_OP_BXOR] = MCA_BTL_ATOMIC_XOR,
- [OMPI_OP_LAND] = MCA_BTL_ATOMIC_LAND,
- [OMPI_OP_LOR] = MCA_BTL_ATOMIC_LOR,
- [OMPI_OP_LXOR] = MCA_BTL_ATOMIC_LXOR,
- [OMPI_OP_REPLACE] = MCA_BTL_ATOMIC_SWAP,
-};
-
-static int ompi_osc_rdma_fetch_and_op_atomic (ompi_osc_rdma_sync_t *sync, const void *origin_addr, void *result_addr, ompi_datatype_t *dt,
- ptrdiff_t extent, ompi_osc_rdma_peer_t *peer, uint64_t target_address,
- mca_btl_base_registration_handle_t *target_handle, ompi_op_t *op, ompi_osc_rdma_request_t *req,
- bool lock_acquired)
-{
- ompi_osc_rdma_module_t *module = sync->module;
- int32_t atomic_flags = module->selected_btl->btl_atomic_flags;
- int ret, btl_op, flags;
- int64_t origin;
-
- if ((8 != extent && !((MCA_BTL_ATOMIC_SUPPORTS_32BIT & atomic_flags) && 4 == extent)) ||
- (!(OMPI_DATATYPE_FLAG_DATA_INT & dt->super.flags) && !(MCA_BTL_ATOMIC_SUPPORTS_FLOAT & atomic_flags)) ||
- !ompi_op_is_intrinsic (op) || (0 == ompi_osc_rdma_op_mapping[op->op_type])) {
- return OMPI_ERR_NOT_SUPPORTED;
- }
-
- btl_op = ompi_osc_rdma_op_mapping[op->op_type];
- if (0 == btl_op) {
- return OMPI_ERR_NOT_SUPPORTED;
- }
-
- flags = (4 == extent) ? MCA_BTL_ATOMIC_FLAG_32BIT : 0;
- if (OMPI_DATATYPE_FLAG_DATA_FLOAT & dt->super.flags) {
- flags |= MCA_BTL_ATOMIC_FLAG_FLOAT;
- }
-
- OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "initiating fetch-and-op using %d-bit btl atomics. origin: 0x%" PRIx64,
- (4 == extent) ? 32 : 64, *((int64_t *) origin_addr));
-
- origin = (8 == extent) ? ((int64_t *) origin_addr)[0] : ((int32_t *) origin_addr)[0];
-
- ret = ompi_osc_rdma_btl_fop (module, peer->data_endpoint, target_address, target_handle, btl_op, origin, flags,
- result_addr, true, NULL, NULL, NULL);
- if (OPAL_SUCCESS == ret) {
- /* done. release the lock */
- ompi_osc_rdma_peer_accumulate_cleanup (module, peer, lock_acquired);
-
- if (req) {
- ompi_osc_rdma_request_complete (req, MPI_SUCCESS);
- }
- }
-
- return ret;
-}
-
-static int ompi_osc_rdma_fetch_and_op_cas (ompi_osc_rdma_sync_t *sync, const void *origin_addr, void *result_addr, ompi_datatype_t *dt,
- ptrdiff_t extent, ompi_osc_rdma_peer_t *peer, uint64_t target_address,
- mca_btl_base_registration_handle_t *target_handle, ompi_op_t *op, ompi_osc_rdma_request_t *req,
- bool lock_acquired)
-{
- ompi_osc_rdma_module_t *module = sync->module;
- uint64_t address, offset, new_value, old_value;
- int ret;
-
- if (extent > 8) {
- return OMPI_ERR_NOT_SUPPORTED;
- }
-
- /* align the address. the user should not call with an unaligned address so don't need to range check here */
- address = target_address & ~7;
- offset = target_address & ~address;
-
- OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "initiating fetch-and-op using compare-and-swap");
-
- ret = ompi_osc_get_data_blocking (module, peer->data_endpoint, address, target_handle, &old_value, 8);
- if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
- return ret;
- }
-
- /* store the destination in the temporary buffer */
- do {
- new_value = old_value;
-
- if (&ompi_mpi_op_replace.op == op) {
- memcpy ((void *)((intptr_t) &new_value + offset), origin_addr, extent);
- } else if (&ompi_mpi_op_no_op.op != op) {
- ompi_op_reduce (op, (void *) origin_addr, (void*)((intptr_t) &new_value + offset), 1, dt);
- }
-
- ret = ompi_osc_rdma_btl_cswap (module, peer->data_endpoint, address, target_handle,
- old_value, new_value, 0, (int64_t*)&new_value);
- if (OPAL_SUCCESS != ret || new_value == old_value) {
- break;
- }
-
- old_value = new_value;
- } while (1);
-
- if (result_addr) {
- memcpy (result_addr, (void *)((intptr_t) &new_value + offset), extent);
- }
-
- if (OPAL_SUCCESS == ret) {
- /* done. release the lock */
- ompi_osc_rdma_peer_accumulate_cleanup (module, peer, lock_acquired);
-
- if (req) {
- ompi_osc_rdma_request_complete (req, MPI_SUCCESS);
- }
- }
-
- return ret;
-}
-
-static int ompi_osc_rdma_acc_single_atomic (ompi_osc_rdma_sync_t *sync, const void *origin_addr, ompi_datatype_t *dt, ptrdiff_t extent,
- ompi_osc_rdma_peer_t *peer, uint64_t target_address, mca_btl_base_registration_handle_t *target_handle,
- ompi_op_t *op, ompi_osc_rdma_request_t *req, bool lock_acquired)
-{
- ompi_osc_rdma_module_t *module = sync->module;
- int32_t atomic_flags = module->selected_btl->btl_atomic_flags;
- int ret, btl_op, flags;
- int64_t origin;
-
- if (!(module->selected_btl->btl_flags & MCA_BTL_FLAGS_ATOMIC_OPS)) {
- /* btl put atomics not supported or disabled. fall back on fetch-and-op */
- return ompi_osc_rdma_fetch_and_op_atomic (sync, origin_addr, NULL, dt, extent, peer, target_address, target_handle,
- op, req, lock_acquired);
- }
-
- if ((8 != extent && !((MCA_BTL_ATOMIC_SUPPORTS_32BIT & atomic_flags) && 4 == extent)) ||
- (!(OMPI_DATATYPE_FLAG_DATA_INT & dt->super.flags) && !(MCA_BTL_ATOMIC_SUPPORTS_FLOAT & atomic_flags)) ||
- !ompi_op_is_intrinsic (op) || (0 == ompi_osc_rdma_op_mapping[op->op_type])) {
- return OMPI_ERR_NOT_SUPPORTED;
- }
-
- origin = (8 == extent) ? ((uint64_t *) origin_addr)[0] : ((uint32_t *) origin_addr)[0];
-
- /* set the appropriate flags for this atomic */
- flags = (4 == extent) ? MCA_BTL_ATOMIC_FLAG_32BIT : 0;
- if (OMPI_DATATYPE_FLAG_DATA_FLOAT & dt->super.flags) {
- flags |= MCA_BTL_ATOMIC_FLAG_FLOAT;
- }
-
- btl_op = ompi_osc_rdma_op_mapping[op->op_type];
-
- OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "initiating accumulate using 64-bit btl atomics. origin: 0x%" PRIx64,
- *((int64_t *) origin_addr));
-
- /* if we locked the peer its best to wait for completion before returning */
- ret = ompi_osc_rdma_btl_op (module, peer->data_endpoint, target_address, target_handle, btl_op, origin,
- flags, true, NULL, NULL, NULL);
- if (OPAL_SUCCESS == ret) {
- /* done. release the lock */
- ompi_osc_rdma_peer_accumulate_cleanup (module, peer, lock_acquired);
-
- if (req) {
- ompi_osc_rdma_request_complete (req, MPI_SUCCESS);
- }
- }
-
- return ret;
-}
-
/**
* ompi_osc_rdma_cas_get_complete:
* Note: This function will not work as is in a heterogeneous environment.
@@ -844,40 +849,50 @@ int ompi_osc_rdma_compare_and_swap (const void *origin_addr, const void *compare
static inline
-int ompi_osc_rdma_rget_accumulate_internal (ompi_osc_rdma_sync_t *sync, const void *origin_addr, int origin_count,
+int ompi_osc_rdma_rget_accumulate_internal (ompi_win_t *win, const void *origin_addr, int origin_count,
ompi_datatype_t *origin_datatype, void *result_addr, int result_count,
- ompi_datatype_t *result_datatype, ompi_osc_rdma_peer_t *peer,
- int target_rank, MPI_Aint target_disp, int target_count,
- ompi_datatype_t *target_datatype, ompi_op_t *op,
- ompi_osc_rdma_request_t *request)
+ ompi_datatype_t *result_datatype, int target_rank, MPI_Aint target_disp,
+ int target_count, ompi_datatype_t *target_datatype, ompi_op_t *op,
+ ompi_request_t **request_out)
{
- ompi_osc_rdma_module_t *module = sync->module;
+ ompi_osc_rdma_module_t *module = GET_MODULE(win);
mca_btl_base_registration_handle_t *target_handle;
uint64_t target_address;
- ptrdiff_t lb, origin_extent, target_span;
+ ptrdiff_t lb, target_lb, target_span;
+ ompi_osc_rdma_request_t *rdma_request = NULL;
bool lock_acquired = false;
+ ompi_osc_rdma_sync_t *sync;
+ ompi_osc_rdma_peer_t *peer;
int ret;
+ sync = ompi_osc_rdma_module_sync_lookup (module, target_rank, &peer);
+ if (OPAL_UNLIKELY(NULL == sync)) {
+ return OMPI_ERR_RMA_SYNC;
+ }
+
+ if (request_out) {
+ OMPI_OSC_RDMA_REQUEST_ALLOC(module, peer, rdma_request);
+ *request_out = &rdma_request->super;
+ }
+
/* short-circuit case. note that origin_count may be 0 if op is MPI_NO_OP */
if ((result_addr && 0 == result_count) || 0 == target_count) {
- if (request) {
- ompi_osc_rdma_request_complete (request, MPI_SUCCESS);
+ if (rdma_request) {
+ ompi_osc_rdma_request_complete (rdma_request, MPI_SUCCESS);
}
return OMPI_SUCCESS;
}
- target_span = opal_datatype_span(&target_datatype->super, target_count, &lb);
+ target_span = opal_datatype_span(&target_datatype->super, target_count, &target_lb);
// a buffer defined by (buf, count, dt)
// will have data starting at buf+offset and ending len bytes later:
- ret = osc_rdma_get_remote_segment (module, peer, target_disp, target_span+lb, &target_address, &target_handle);
+ ret = osc_rdma_get_remote_segment (module, peer, target_disp, target_span+target_lb, &target_address, &target_handle);
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
return ret;
}
- (void) ompi_datatype_get_extent (origin_datatype, &lb, &origin_extent);
-
/* to ensure order wait until the previous accumulate completes */
while (!ompi_osc_rdma_peer_test_set_flag (peer, OMPI_OSC_RDMA_PEER_ACCUMULATING)) {
ompi_osc_rdma_progress (module);
@@ -889,31 +904,6 @@ int ompi_osc_rdma_rget_accumulate_internal (ompi_osc_rdma_sync_t *sync, const vo
(void) ompi_osc_rdma_lock_acquire_exclusive (module, peer, offsetof (ompi_osc_rdma_state_t, accumulate_lock));
}
- /* if the datatype is small enough (and the count is 1) then try to directly use the hardware to execute
- * the atomic operation. this should be safe in all cases as either 1) the user has assured us they will
- * never use atomics with count > 1, 2) we have the accumulate lock, or 3) we have an exclusive lock */
- if (origin_extent <= 8 && 1 == origin_count && !ompi_osc_rdma_peer_local_base (peer)) {
- if (module->acc_use_amo && ompi_datatype_is_predefined (origin_datatype)) {
- if (NULL == result_addr) {
- ret = ompi_osc_rdma_acc_single_atomic (sync, origin_addr, origin_datatype, origin_extent, peer, target_address,
- target_handle, op, request, lock_acquired);
- } else {
- ret = ompi_osc_rdma_fetch_and_op_atomic (sync, origin_addr, result_addr, origin_datatype, origin_extent, peer, target_address,
- target_handle, op, request, lock_acquired);
- }
-
- if (OMPI_SUCCESS == ret) {
- return OMPI_SUCCESS;
- }
- }
-
- ret = ompi_osc_rdma_fetch_and_op_cas (sync, origin_addr, result_addr, origin_datatype, origin_extent, peer, target_address,
- target_handle, op, request, lock_acquired);
- if (OMPI_SUCCESS == ret) {
- return OMPI_SUCCESS;
- }
- }
-
/* could not use network atomics. acquire the lock if needed and continue. */
if (!lock_acquired && !ompi_osc_rdma_peer_is_exclusive (peer)) {
lock_acquired = true;
@@ -924,16 +914,20 @@ int ompi_osc_rdma_rget_accumulate_internal (ompi_osc_rdma_sync_t *sync, const vo
/* local/self optimization */
ret = ompi_osc_rdma_gacc_local (origin_addr, origin_count, origin_datatype, result_addr, result_count,
result_datatype, peer, target_address, target_handle, target_count,
- target_datatype, op, module, request, lock_acquired);
+ target_datatype, op, module, rdma_request, lock_acquired);
} else {
/* do not need to pass the lock acquired flag to this function. the value of the flag can be obtained
* just by calling ompi_osc_rdma_peer_is_exclusive() in this case. */
ret = ompi_osc_rdma_gacc_master (sync, origin_addr, origin_count, origin_datatype, result_addr, result_count,
result_datatype, peer, target_address, target_handle, target_count,
- target_datatype, op, request);
+ target_datatype, op, rdma_request);
}
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
+ if (request_out) {
+ *request_out = &ompi_request_null.request;
+ OMPI_OSC_RDMA_REQUEST_RETURN(rdma_request);
+ }
ompi_osc_rdma_peer_accumulate_cleanup (module, peer, lock_acquired);
}
@@ -945,24 +939,15 @@ int ompi_osc_rdma_get_accumulate (const void *origin_addr, int origin_count, omp
int target_rank, MPI_Aint target_disp, int target_count, ompi_datatype_t *target_datatype,
ompi_op_t *op, ompi_win_t *win)
{
- ompi_osc_rdma_module_t *module = GET_MODULE(win);
- ompi_osc_rdma_peer_t *peer;
- ompi_osc_rdma_sync_t *sync;
-
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "get_acc: 0x%lx, %d, %s, 0x%lx, %d, %s, %d, 0x%lx, %d, %s, %s, %s",
(unsigned long) origin_addr, origin_count, origin_datatype->name,
(unsigned long) result_addr, result_count, result_datatype->name, target_rank,
(unsigned long) target_disp, target_count, target_datatype->name, op->o_name,
win->w_name);
- sync = ompi_osc_rdma_module_sync_lookup (module, target_rank, &peer);
- if (OPAL_UNLIKELY(NULL == sync)) {
- return OMPI_ERR_RMA_SYNC;
- }
-
- return ompi_osc_rdma_rget_accumulate_internal (sync, origin_addr, origin_count, origin_datatype,
+ return ompi_osc_rdma_rget_accumulate_internal (win, origin_addr, origin_count, origin_datatype,
result_addr, result_count, result_datatype,
- peer, target_rank, target_disp, target_count,
+ target_rank, target_disp, target_count,
target_datatype, op, NULL);
}
@@ -972,91 +957,40 @@ int ompi_osc_rdma_rget_accumulate (const void *origin_addr, int origin_count, om
int target_rank, MPI_Aint target_disp, int target_count, ompi_datatype_t *target_datatype,
ompi_op_t *op, ompi_win_t *win, ompi_request_t **request)
{
- ompi_osc_rdma_module_t *module = GET_MODULE(win);
- ompi_osc_rdma_peer_t *peer;
- ompi_osc_rdma_request_t *rdma_request;
- ompi_osc_rdma_sync_t *sync;
- int ret;
-
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "rget_acc: 0x%lx, %d, %s, 0x%lx, %d, %s, %d, 0x%lx, %d, %s, %s, %s",
(unsigned long) origin_addr, origin_count, origin_datatype->name,
(unsigned long) result_addr, result_count, result_datatype->name, target_rank,
(unsigned long) target_disp, target_count, target_datatype->name, op->o_name,
win->w_name);
- sync = ompi_osc_rdma_module_sync_lookup (module, target_rank, &peer);
- if (OPAL_UNLIKELY(NULL == sync)) {
- return OMPI_ERR_RMA_SYNC;
- }
-
- OMPI_OSC_RDMA_REQUEST_ALLOC(module, peer, rdma_request);
-
- ret = ompi_osc_rdma_rget_accumulate_internal (sync, origin_addr, origin_count, origin_datatype, result_addr,
- result_count, result_datatype, peer, target_rank, target_disp,
- target_count, target_datatype, op, rdma_request);
- if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
- OMPI_OSC_RDMA_REQUEST_RETURN(rdma_request);
- return ret;
- }
-
- *request = &rdma_request->super;
-
- return OMPI_SUCCESS;
+ return ompi_osc_rdma_rget_accumulate_internal (win, origin_addr, origin_count, origin_datatype, result_addr,
+ result_count, result_datatype, target_rank, target_disp,
+ target_count, target_datatype, op, request);
}
int ompi_osc_rdma_raccumulate (const void *origin_addr, int origin_count, ompi_datatype_t *origin_datatype, int target_rank,
ptrdiff_t target_disp, int target_count, ompi_datatype_t *target_datatype, ompi_op_t *op,
ompi_win_t *win, ompi_request_t **request)
{
- ompi_osc_rdma_module_t *module = GET_MODULE(win);
- ompi_osc_rdma_peer_t *peer;
- ompi_osc_rdma_request_t *rdma_request;
- ompi_osc_rdma_sync_t *sync;
- int ret;
-
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "racc: 0x%lx, %d, %s, %d, 0x%lx, %d, %s, %s, %s",
(unsigned long) origin_addr, origin_count, origin_datatype->name, target_rank,
(unsigned long) target_disp, target_count, target_datatype->name, op->o_name, win->w_name);
- sync = ompi_osc_rdma_module_sync_lookup (module, target_rank, &peer);
- if (OPAL_UNLIKELY(NULL == sync)) {
- return OMPI_ERR_RMA_SYNC;
- }
-
- OMPI_OSC_RDMA_REQUEST_ALLOC(module, peer, rdma_request);
-
- ret = ompi_osc_rdma_rget_accumulate_internal (sync, origin_addr, origin_count, origin_datatype, NULL, 0,
- NULL, peer, target_rank, target_disp, target_count, target_datatype,
- op, rdma_request);
- if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
- OMPI_OSC_RDMA_REQUEST_RETURN(rdma_request);
- return ret;
- }
-
- *request = &rdma_request->super;
-
- return OMPI_SUCCESS;
+ return ompi_osc_rdma_rget_accumulate_internal (win, origin_addr, origin_count, origin_datatype, NULL, 0,
+ NULL, target_rank, target_disp, target_count, target_datatype,
+ op, request);
}
int ompi_osc_rdma_accumulate (const void *origin_addr, int origin_count, ompi_datatype_t *origin_datatype, int target_rank,
ptrdiff_t target_disp, int target_count, ompi_datatype_t *target_datatype, ompi_op_t *op,
ompi_win_t *win)
{
- ompi_osc_rdma_module_t *module = GET_MODULE(win);
- ompi_osc_rdma_peer_t *peer;
- ompi_osc_rdma_sync_t *sync;
-
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "acc: 0x%lx, %d, %s, %d, 0x%lx, %d, %s, %s, %s",
(unsigned long) origin_addr, origin_count, origin_datatype->name, target_rank,
(unsigned long) target_disp, target_count, target_datatype->name, op->o_name, win->w_name);
- sync = ompi_osc_rdma_module_sync_lookup (module, target_rank, &peer);
- if (OPAL_UNLIKELY(NULL == sync)) {
- return OMPI_ERR_RMA_SYNC;
- }
-
- return ompi_osc_rdma_rget_accumulate_internal (sync, origin_addr, origin_count, origin_datatype, NULL, 0,
- NULL, peer, target_rank, target_disp, target_count, target_datatype,
+ return ompi_osc_rdma_rget_accumulate_internal (win, origin_addr, origin_count, origin_datatype, NULL, 0,
+ NULL, target_rank, target_disp, target_count, target_datatype,
op, NULL);
}
@@ -1064,18 +998,9 @@ int ompi_osc_rdma_accumulate (const void *origin_addr, int origin_count, ompi_da
int ompi_osc_rdma_fetch_and_op (const void *origin_addr, void *result_addr, ompi_datatype_t *dt, int target_rank,
ptrdiff_t target_disp, ompi_op_t *op, ompi_win_t *win)
{
- ompi_osc_rdma_module_t *module = GET_MODULE(win);
- ompi_osc_rdma_peer_t *peer;
- ompi_osc_rdma_sync_t *sync;
-
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "fop: %p, %s, %d, %lu, %s, %s", result_addr, dt->name,
target_rank, (unsigned long) target_disp, op->o_name, win->w_name);
- sync = ompi_osc_rdma_module_sync_lookup (module, target_rank, &peer);
- if (OPAL_UNLIKELY(NULL == sync)) {
- return OMPI_ERR_RMA_SYNC;
- }
-
- return ompi_osc_rdma_rget_accumulate_internal (sync, origin_addr, 1, dt, result_addr, 1, dt, peer,
+ return ompi_osc_rdma_rget_accumulate_internal (win, origin_addr, 1, dt, result_addr, 1, dt,
target_rank, target_disp, 1, dt, op, NULL);
}
diff --git a/ompi/mca/osc/rdma/osc_rdma_component.c b/ompi/mca/osc/rdma/osc_rdma_component.c
index ee57b94..6222776 100644
--- a/ompi/mca/osc/rdma/osc_rdma_component.c
+++ b/ompi/mca/osc/rdma/osc_rdma_component.c
@@ -18,6 +18,8 @@
* Copyright (c) 2015-2017 Intel, Inc. All rights reserved.
* Copyright (c) 2016-2017 IBM Corporation. All rights reserved.
* Copyright (c) 2018 Cisco Systems, Inc. All rights reserved
+ * Copyright (c) 2019 Triad National Security, LLC. All rights
+ * reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@@ -272,6 +274,16 @@ static int ompi_osc_rdma_component_register (void)
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_3,
MCA_BASE_VAR_SCOPE_READONLY, &mca_osc_rdma_component.backing_directory);
+ mca_osc_rdma_component.network_amo_max_count = 32;
+ (void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "network_max_amo",
+ "Maximum predefined datatype count for which network atomic operations "
+ "will be used. Accumulate operations larger than this count will use "
+ "a get/op/put protocol. The optimal value is dictated by the network "
+ "injection rate for the interconnect. Generally a smaller number will "
+ "yield better larger accumulate performance. (default: 32)",
+ MCA_BASE_VAR_TYPE_UNSIGNED_LONG, NULL, 0, 0, OPAL_INFO_LVL_3,
+ MCA_BASE_VAR_SCOPE_LOCAL, &mca_osc_rdma_component.network_amo_max_count);
+
/* register performance variables */
(void) mca_base_component_pvar_register (&mca_osc_rdma_component.super.osc_version, "put_retry_count",
@@ -655,6 +667,12 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
/* barrier to make sure all ranks have attached */
shared_comm->c_coll->coll_barrier(shared_comm, shared_comm->c_coll->coll_barrier_module);
+ /* ensure all ranks get the same result. this bcast also ensures no rank continues until local rank 0
+ * has attempted to register the state. */
+ shared_comm->c_coll->coll_bcast (&ret, 1, MPI_INT, 0, shared_comm, shared_comm->c_coll->coll_bcast_module);
+ if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
+ break;
+ }
/* unlink the shared memory backing file */
if (0 == local_rank) {
@@ -699,7 +717,7 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
peer->state_endpoint = NULL;
} else {
/* use my endpoint handle to modify the peer's state */
- if (module->selected_btl->btl_register_mem) {
+ if (module->selected_btl->btl_register_mem && local_size != global_size) {
peer->state_handle = (mca_btl_base_registration_handle_t *) state_region->btl_handle_data;
}
peer->state = (osc_rdma_counter_t) ((uintptr_t) state_region->base + state_base + module->state_size * i);
@@ -955,7 +973,7 @@ static int ompi_osc_rdma_share_data (ompi_osc_rdma_module_t *module)
/* store my rank in the length field */
my_data->len = (osc_rdma_size_t) my_rank;
- if (module->selected_btl->btl_register_mem) {
+ if (module->selected_btl->btl_register_mem && module->state_handle) {
memcpy (my_data->btl_handle_data, module->state_handle, module->selected_btl->btl_registration_handle_size);
}
@@ -1130,6 +1148,7 @@ static int ompi_osc_rdma_component_select (struct ompi_win_t *win, void **base,
module->locking_mode = mca_osc_rdma_component.locking_mode;
module->acc_single_intrinsic = check_config_value_bool ("acc_single_intrinsic", info);
module->acc_use_amo = mca_osc_rdma_component.acc_use_amo;
+ module->network_amo_max_count = mca_osc_rdma_component.network_amo_max_count;
module->all_sync.module = module;
diff --git a/ompi/mca/osc/rdma/osc_rdma_lock.h b/ompi/mca/osc/rdma/osc_rdma_lock.h
index 7af4d70..1e61397 100644
--- a/ompi/mca/osc/rdma/osc_rdma_lock.h
+++ b/ompi/mca/osc/rdma/osc_rdma_lock.h
@@ -2,6 +2,8 @@
/*
* Copyright (c) 2014-2017 Los Alamos National Security, LLC. All rights
* reserved.
+ * Copyright (c) 2019 Triad National Security, LLC. All rights
+ * reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@@ -45,9 +47,7 @@ static inline int ompi_osc_rdma_btl_fop (ompi_osc_rdma_module_t *module, struct
pending_op = OBJ_NEW(ompi_osc_rdma_pending_op_t);
assert (NULL != pending_op);
- if (wait_for_completion) {
- OBJ_RETAIN(pending_op);
- } else {
+ if (!wait_for_completion) {
/* NTH: need to keep track of pending ops to avoid a potential teardown problem */
pending_op->module = module;
(void) opal_atomic_fetch_add_32 (&module->pending_ops, 1);
@@ -87,10 +87,10 @@ static inline int ompi_osc_rdma_btl_fop (ompi_osc_rdma_module_t *module, struct
ret = OMPI_SUCCESS;
ompi_osc_rdma_atomic_complete (module->selected_btl, endpoint, pending_op->op_buffer,
pending_op->op_frag->handle, (void *) pending_op, NULL, OPAL_SUCCESS);
+ } else {
+ /* need to release here because ompi_osc_rdma_atomic_complete was not called */
+ OBJ_RELEASE(pending_op);
}
-
- /* need to release here because ompi_osc_rdma_atomic_complet was not called */
- OBJ_RELEASE(pending_op);
} else if (wait_for_completion) {
while (!pending_op->op_complete) {
ompi_osc_rdma_progress (module);
@@ -153,7 +153,7 @@ static inline int ompi_osc_rdma_btl_op (ompi_osc_rdma_module_t *module, struct m
} while (1);
if (OPAL_SUCCESS != ret) {
- /* need to release here because ompi_osc_rdma_atomic_complet was not called */
+ /* need to release here because ompi_osc_rdma_atomic_complete was not called */
OBJ_RELEASE(pending_op);
if (OPAL_LIKELY(1 == ret)) {
if (cbfunc) {
diff --git a/ompi/mca/osc/rdma/osc_rdma_request.c b/ompi/mca/osc/rdma/osc_rdma_request.c
index eb06368..d1cf115 100644
--- a/ompi/mca/osc/rdma/osc_rdma_request.c
+++ b/ompi/mca/osc/rdma/osc_rdma_request.c
@@ -6,6 +6,8 @@
* Copyright (c) 2016 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
+ * Copyright (c) 2019 Triad National Security, LLC. All rights
+ * reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@@ -56,15 +58,7 @@ static void request_construct(ompi_osc_rdma_request_t *request)
request->internal = false;
request->cleanup = NULL;
request->outstanding_requests = 0;
- OBJ_CONSTRUCT(&request->convertor, opal_convertor_t);
-}
-
-static void request_destruct(ompi_osc_rdma_request_t *request)
-{
- OBJ_DESTRUCT(&request->convertor);
}
-OBJ_CLASS_INSTANCE(ompi_osc_rdma_request_t,
- ompi_request_t,
- request_construct,
- request_destruct);
+OBJ_CLASS_INSTANCE(ompi_osc_rdma_request_t, ompi_request_t,
+ request_construct, NULL);
diff --git a/ompi/mca/osc/rdma/osc_rdma_request.h b/ompi/mca/osc/rdma/osc_rdma_request.h
index ad052e1..3a57d84 100644
--- a/ompi/mca/osc/rdma/osc_rdma_request.h
+++ b/ompi/mca/osc/rdma/osc_rdma_request.h
@@ -3,6 +3,8 @@
* Copyright (c) 2012 Sandia National Laboratories. All rights reserved.
* Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights
* reserved.
+ * Copyright (c) 2019 Triad National Security, LLC. All rights
+ * reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@@ -51,8 +53,6 @@ struct ompi_osc_rdma_request_t {
uint64_t target_address;
struct ompi_osc_rdma_request_t *parent_request;
- /* used for non-contiguous get accumulate operations */
- opal_convertor_t convertor;
/** synchronization object */
struct ompi_osc_rdma_sync_t *sync;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment