From 0609cd676b7520487c4fb544ae275edbb4f47849 Mon Sep 17 00:00:00 2001 From: Sylvain Didelot Date: Wed, 27 Sep 2023 16:21:51 +0200 Subject: [PATCH] prov/verbs: Add support for IBV_ACCESS_RELAXED_ORDERING IBV_ACCESS_RELAXED_ORDERING allows the system to reorder Send/Write/Atomic operations to improve performance. The patch enables IBV_ACCESS_RELAXED_ORDERING if the application has requested no ordering in TX/RX attributes. Signed-off-by: Sylvain Didelot --- prov/verbs/configure.m4 | 10 +++++++ prov/verbs/src/verbs_domain.c | 12 ++++++--- prov/verbs/src/verbs_eq.c | 3 +++ prov/verbs/src/verbs_info.c | 50 +++++++++++++++++++++++++++++++---- prov/verbs/src/verbs_mr.c | 3 +++ prov/verbs/src/verbs_ofi.h | 22 +++++++++++++++ 6 files changed, 92 insertions(+), 8 deletions(-) diff --git a/prov/verbs/configure.m4 b/prov/verbs/configure.m4 index aa793e0180b..b096e9cae82 100644 --- a/prov/verbs/configure.m4 +++ b/prov/verbs/configure.m4 @@ -93,6 +93,16 @@ AC_DEFUN([FI_VERBS_CONFIGURE],[ AC_DEFINE_UNQUOTED([VERBS_HAVE_DMABUF_MR],[$VERBS_HAVE_DMABUF_MR], [Whether infiniband/verbs.h has ibv_reg_dmabuf_mr() support or not]) + #See if we have rdma-core IBV_ACCESS_RELAXED_ORDERING mr support + VERBS_HAVE_RELAXED_ORDERING_MR=0 + AS_IF([test $verbs_ibverbs_happy -eq 1],[ + AC_CHECK_DECL([IBV_ACCESS_RELAXED_ORDERING], + [VERBS_HAVE_RELAXED_ORDERING_MR=1],[], + [#include <infiniband/verbs.h>]) + ]) + AC_DEFINE_UNQUOTED([VERBS_HAVE_RELAXED_ORDERING_MR],[$VERBS_HAVE_RELAXED_ORDERING_MR], + [Whether infiniband/verbs.h has IBV_ACCESS_RELAXED_ORDERING support or not]) + CPPFLAGS=$fi_verbs_configure_save_CPPFLAGS # Technically, verbs_ibverbs_CPPFLAGS and diff --git a/prov/verbs/src/verbs_domain.c b/prov/verbs/src/verbs_domain.c index f184e6cdf4f..9ae08dc2c15 100644 --- a/prov/verbs/src/verbs_domain.c +++ b/prov/verbs/src/verbs_domain.c @@ -246,9 +246,12 @@ static int vrb_open_device_by_name(struct vrb_domain *domain, const char *name) const char *rdma_name = ibv_get_device_name(dev_list[i]->device); switch
(domain->ep_type) { case FI_EP_MSG: - ret = domain->ext_flags & VRB_USE_XRC ? - vrb_cmp_xrc_domain_name(name, rdma_name) : - strcmp(name, rdma_name); + if (domain->ext_flags & VRB_USE_XRC) + ret = vrb_cmp_xrc_domain_name(name, rdma_name); + else if (domain->ext_flags & VRB_USE_RO) + ret = vrb_cmp_ro_domain_name(name, rdma_name); + else + ret = strcmp(name, rdma_name); break; case FI_EP_DGRAM: ret = strncmp(name, rdma_name, @@ -345,6 +348,9 @@ vrb_domain(struct fid_fabric *fabric, struct fi_info *info, if (!_domain->info) goto err2; + if (VRB_RO_ENABLED(info)) + _domain->ext_flags |= VRB_USE_RO; + _domain->ep_type = VRB_EP_TYPE(info); _domain->ext_flags |= vrb_is_xrc_info(info) ? VRB_USE_XRC : 0; diff --git a/prov/verbs/src/verbs_eq.c b/prov/verbs/src/verbs_eq.c index 7cb2168f099..9475779b39f 100644 --- a/prov/verbs/src/verbs_eq.c +++ b/prov/verbs/src/verbs_eq.c @@ -169,6 +169,9 @@ vrb_pep_dev_domain_match(struct fi_info *hints, const char *devname) if ((VRB_EP_PROTO(hints)) == FI_PROTO_RDMA_CM_IB_XRC) ret = vrb_cmp_xrc_domain_name(hints->domain_attr->name, devname); + else if (VRB_RO_ENABLED(hints)) + ret = vrb_cmp_ro_domain_name(hints->domain_attr->name, + devname); else ret = strcmp(hints->domain_attr->name, devname); diff --git a/prov/verbs/src/verbs_info.c b/prov/verbs/src/verbs_info.c index fadf7f00007..c7f941a8cac 100644 --- a/prov/verbs/src/verbs_info.c +++ b/prov/verbs/src/verbs_info.c @@ -113,6 +113,15 @@ const struct fi_rx_attr verbs_rx_attr = { .total_buffered_recv = 0, }; +const struct fi_rx_attr verbs_ro_rx_attr = { + .caps = VERBS_MSG_RX_CAPS, + .mode = VERBS_RX_MODE, + .op_flags = FI_COMPLETION, + .msg_order = 0, + .comp_order = FI_ORDER_STRICT, + .total_buffered_recv = 0, +}; + const struct fi_rx_attr verbs_dgram_rx_attr = { .caps = VERBS_DGRAM_RX_CAPS, .mode = VERBS_DGRAM_RX_MODE | VERBS_RX_MODE, @@ -132,6 +141,16 @@ const struct fi_tx_attr verbs_tx_attr = { .rma_iov_limit = 1, }; +const struct fi_tx_attr verbs_ro_tx_attr = { + .caps = 
VERBS_MSG_TX_CAPS, + .mode = 0, + .op_flags = VERBS_TX_OP_FLAGS, + .msg_order = 0, + .comp_order = FI_ORDER_STRICT, + .inject_size = 0, + .rma_iov_limit = 1, +}; + const struct fi_tx_attr verbs_dgram_tx_attr = { .caps = VERBS_DGRAM_TX_CAPS, .mode = 0, @@ -146,18 +165,28 @@ const struct verbs_ep_domain verbs_msg_domain = { .suffix = "", .type = FI_EP_MSG, .protocol = FI_PROTO_UNSPEC, + .relaxed_ordering = false, +}; + +const struct verbs_ep_domain verbs_msg_ro_domain = { + .suffix = "-ro", + .type = FI_EP_MSG, + .protocol = FI_PROTO_UNSPEC, + .relaxed_ordering = true, }; const struct verbs_ep_domain verbs_msg_xrc_domain = { .suffix = "-xrc", .type = FI_EP_MSG, .protocol = FI_PROTO_RDMA_CM_IB_XRC, + .relaxed_ordering = false, }; const struct verbs_ep_domain verbs_dgram_domain = { .suffix = "-dgram", .type = FI_EP_DGRAM, .protocol = FI_PROTO_UNSPEC, + .relaxed_ordering = false, }; /* The list (not thread safe) is populated once when the provider is initialized */ @@ -770,8 +799,13 @@ static int vrb_alloc_info(struct ibv_context *ctx, struct fi_info **info, switch (ep_dom->type) { case FI_EP_MSG: fi->caps = VERBS_MSG_CAPS; - *(fi->tx_attr) = verbs_tx_attr; - *(fi->rx_attr) = verbs_rx_attr; + if (ep_dom->relaxed_ordering) { + *(fi->tx_attr) = verbs_ro_tx_attr; + *(fi->rx_attr) = verbs_ro_rx_attr; + } else { + *(fi->tx_attr) = verbs_tx_attr; + *(fi->rx_attr) = verbs_rx_attr; + } fi->addr_format = FI_SOCKADDR_IB; break; case FI_EP_DGRAM: @@ -1332,7 +1366,7 @@ static int vrb_device_has_ipoib_addr(const char *dev_name) return 0; } -#define VERBS_NUM_DOMAIN_TYPES 3 +#define VERBS_NUM_DOMAIN_TYPES 4 static int vrb_init_info(const struct fi_info **all_infos) { @@ -1379,12 +1413,14 @@ static int vrb_init_info(const struct fi_info **all_infos) if (!vrb_gl_data.iface) vrb_get_sib(&verbs_devs); - if (dlist_empty(&verbs_devs)) + if (dlist_empty(&verbs_devs)) { FI_WARN(&vrb_prov, FI_LOG_FABRIC, "no valid IPoIB interfaces found, FI_EP_MSG endpoint " "type would not be available\n"); 
- else + } else { ep_type[dom_count++] = &verbs_msg_domain; + ep_type[dom_count++] = &verbs_msg_ro_domain; + } if (!vrb_gl_data.msg.prefer_xrc && VERBS_HAVE_XRC) ep_type[dom_count++] = &verbs_msg_xrc_domain; @@ -1562,6 +1598,10 @@ int vrb_get_matching_info(uint32_t version, const struct fi_info *hints, "XRC FI_EP_MSG endpoints\n"); continue; } + + if (VRB_RO_ENABLED(hints) && (check_info->tx_attr->msg_order || + check_info->rx_attr->msg_order)) + continue; } if ((check_info->ep_attr->type == FI_EP_MSG) && passive) { diff --git a/prov/verbs/src/verbs_mr.c b/prov/verbs/src/verbs_mr.c index 360094432a2..ddf2d8672cb 100644 --- a/prov/verbs/src/verbs_mr.c +++ b/prov/verbs/src/verbs_mr.c @@ -201,6 +201,9 @@ vrb_mr_ofi2ibv_access(uint64_t ofi_access, struct vrb_domain *domain) IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_ATOMIC; + if (domain->ext_flags & VRB_USE_RO) + ibv_access |= VRB_ACCESS_RELAXED_ORDERING; + return ibv_access; } diff --git a/prov/verbs/src/verbs_ofi.h b/prov/verbs/src/verbs_ofi.h index 78b8c40369e..6a6b2f275c5 100644 --- a/prov/verbs/src/verbs_ofi.h +++ b/prov/verbs/src/verbs_ofi.h @@ -138,6 +138,9 @@ #define VRB_EP_PROTO(info) \ (((info) && (info)->ep_attr) ? 
(info)->ep_attr->protocol : \ FI_PROTO_UNSPEC) +#define VRB_RO_ENABLED(info) \ + ((info)->tx_attr && !(info)->tx_attr->msg_order && \ + (info)->rx_attr && !(info)->rx_attr->msg_order) #define VRB_MEM_ALIGNMENT (64) #define VRB_BUF_ALIGNMENT (4096) /* TODO: Page or MTU size */ @@ -366,6 +369,7 @@ struct fi_ops_cm *vrb_pep_ops_cm(struct vrb_pep *pep); enum { VRB_USE_XRC = BIT(0), VRB_USE_ODP = BIT(1), + VRB_USE_RO = BIT(2), }; struct vrb_domain { @@ -437,6 +441,12 @@ int vrb_cq_open(struct fid_domain *domain, struct fi_cq_attr *attr, struct fid_cq **cq, void *context); int vrb_cq_trywait(struct vrb_cq *cq); +#if VERBS_HAVE_RELAXED_ORDERING_MR +#define VRB_ACCESS_RELAXED_ORDERING IBV_ACCESS_RELAXED_ORDERING +#else +#define VRB_ACCESS_RELAXED_ORDERING 0 +#endif + struct vrb_mem_desc { struct fid_mr mr_fid; struct ibv_mr *mr; @@ -839,10 +849,12 @@ struct verbs_ep_domain { char *suffix; enum fi_ep_type type; uint32_t protocol; + bool relaxed_ordering; }; extern const struct verbs_ep_domain verbs_dgram_domain; extern const struct verbs_ep_domain verbs_msg_xrc_domain; +extern const struct verbs_ep_domain verbs_msg_ro_domain; int vrb_check_ep_attr(const struct fi_info *hints, const struct fi_info *info); @@ -860,6 +872,16 @@ static inline int vrb_cmp_xrc_domain_name(const char *domain_name, domain_len - suffix_len) : -1; } +static inline int vrb_cmp_ro_domain_name(const char *domain_name, + const char *rdma_name) +{ + size_t domain_len = strlen(domain_name); + size_t suffix_len = strlen(verbs_msg_ro_domain.suffix); + + return domain_len > suffix_len ? strncmp(domain_name, rdma_name, + domain_len - suffix_len) : -1; +} + int vrb_cq_signal(struct fid_cq *cq); struct vrb_eq_entry *vrb_eq_alloc_entry(uint32_t event,