diff options
Diffstat (limited to 'kernel/include/rdma/ib_verbs.h')
-rw-r--r-- | kernel/include/rdma/ib_verbs.h | 820 |
1 files changed, 601 insertions, 219 deletions
diff --git a/kernel/include/rdma/ib_verbs.h b/kernel/include/rdma/ib_verbs.h index 65994a19e..120da1d7f 100644 --- a/kernel/include/rdma/ib_verbs.h +++ b/kernel/include/rdma/ib_verbs.h @@ -48,6 +48,7 @@ #include <linux/rwsem.h> #include <linux/scatterlist.h> #include <linux/workqueue.h> +#include <linux/socket.h> #include <uapi/linux/if_ether.h> #include <linux/atomic.h> @@ -64,6 +65,12 @@ union ib_gid { } global; }; +extern union ib_gid zgid; + +struct ib_gid_attr { + struct net_device *ndev; +}; + enum rdma_node_type { /* IB values map to NodeInfo:NodeType. */ RDMA_NODE_IB_CA = 1, @@ -81,6 +88,13 @@ enum rdma_transport_type { RDMA_TRANSPORT_USNIC_UDP }; +enum rdma_protocol_type { + RDMA_PROTOCOL_IB, + RDMA_PROTOCOL_IBOE, + RDMA_PROTOCOL_IWARP, + RDMA_PROTOCOL_USNIC_UDP +}; + __attribute_const__ enum rdma_transport_type rdma_node_get_transport(enum rdma_node_type node_type); @@ -123,6 +137,8 @@ enum ib_device_cap_flags { IB_DEVICE_BLOCK_MULTICAST_LOOPBACK = (1<<22), IB_DEVICE_MEM_WINDOW_TYPE_2A = (1<<23), IB_DEVICE_MEM_WINDOW_TYPE_2B = (1<<24), + IB_DEVICE_RC_IP_CSUM = (1<<25), + IB_DEVICE_RAW_IP_CSUM = (1<<26), IB_DEVICE_MANAGED_FLOW_STEERING = (1<<29), IB_DEVICE_SIGNATURE_HANDOVER = (1<<30), IB_DEVICE_ON_DEMAND_PAGING = (1<<31), @@ -166,6 +182,16 @@ struct ib_odp_caps { } per_transport_caps; }; +enum ib_cq_creation_flags { + IB_CQ_FLAGS_TIMESTAMP_COMPLETION = 1 << 0, +}; + +struct ib_cq_init_attr { + unsigned int cqe; + int comp_vector; + u32 flags; +}; + struct ib_device_attr { u64 fw_ver; __be64 sys_image_guid; @@ -210,6 +236,8 @@ struct ib_device_attr { int sig_prot_cap; int sig_guard_cap; struct ib_odp_caps odp_caps; + uint64_t timestamp_mask; + uint64_t hca_core_clock; /* in KHZ */ }; enum ib_mtu { @@ -265,7 +293,7 @@ enum ib_port_cap_flags { IB_PORT_BOOT_MGMT_SUP = 1 << 23, IB_PORT_LINK_LATENCY_SUP = 1 << 24, IB_PORT_CLIENT_REG_SUP = 1 << 25, - IB_PORT_IP_BASED_GIDS = 1 << 26 + IB_PORT_IP_BASED_GIDS = 1 << 26, }; enum ib_port_width { @@ -346,6 +374,42 @@ union rdma_protocol_stats { struct iw_protocol_stats iw; }; +/* Define bits for the various functionality this port needs to be supported by + * the core. + */ +/* Management 0x00000FFF */ +#define RDMA_CORE_CAP_IB_MAD 0x00000001 +#define RDMA_CORE_CAP_IB_SMI 0x00000002 +#define RDMA_CORE_CAP_IB_CM 0x00000004 +#define RDMA_CORE_CAP_IW_CM 0x00000008 +#define RDMA_CORE_CAP_IB_SA 0x00000010 +#define RDMA_CORE_CAP_OPA_MAD 0x00000020 + +/* Address format 0x000FF000 */ +#define RDMA_CORE_CAP_AF_IB 0x00001000 +#define RDMA_CORE_CAP_ETH_AH 0x00002000 + +/* Protocol 0xFFF00000 */ +#define RDMA_CORE_CAP_PROT_IB 0x00100000 +#define RDMA_CORE_CAP_PROT_ROCE 0x00200000 +#define RDMA_CORE_CAP_PROT_IWARP 0x00400000 + +#define RDMA_CORE_PORT_IBA_IB (RDMA_CORE_CAP_PROT_IB \ + | RDMA_CORE_CAP_IB_MAD \ + | RDMA_CORE_CAP_IB_SMI \ + | RDMA_CORE_CAP_IB_CM \ + | RDMA_CORE_CAP_IB_SA \ + | RDMA_CORE_CAP_AF_IB) +#define RDMA_CORE_PORT_IBA_ROCE (RDMA_CORE_CAP_PROT_ROCE \ + | RDMA_CORE_CAP_IB_MAD \ + | RDMA_CORE_CAP_IB_CM \ + | RDMA_CORE_CAP_AF_IB \ + | RDMA_CORE_CAP_ETH_AH) +#define RDMA_CORE_PORT_IWARP (RDMA_CORE_CAP_PROT_IWARP \ + | RDMA_CORE_CAP_IW_CM) +#define RDMA_CORE_PORT_INTEL_OPA (RDMA_CORE_PORT_IBA_IB \ + | RDMA_CORE_CAP_OPA_MAD) + struct ib_port_attr { enum ib_port_state state; enum ib_mtu max_mtu; @@ -412,6 +476,8 @@ enum ib_event_type { IB_EVENT_GID_CHANGE, }; +const char *__attribute_const__ ib_event_msg(enum ib_event_type event); + struct ib_event { struct ib_device *device; union { @@ -499,20 +565,18 @@ __attribute_const__ int ib_rate_to_mult(enum ib_rate rate); */ __attribute_const__ int ib_rate_to_mbps(enum ib_rate rate); -enum ib_mr_create_flags { - IB_MR_SIGNATURE_EN = 1, -}; /** - * ib_mr_init_attr - Memory region init attributes passed to routine - * ib_create_mr. - * @max_reg_descriptors: max number of registration descriptors that - * may be used with registration work requests. - * @flags: MR creation flags bit mask. + * enum ib_mr_type - memory region type + * @IB_MR_TYPE_MEM_REG: memory region that is used for + * normal registration + * @IB_MR_TYPE_SIGNATURE: memory region that is used for + * signature operations (data-integrity + * capable regions) */ -struct ib_mr_init_attr { - int max_reg_descriptors; - u32 flags; +enum ib_mr_type { + IB_MR_TYPE_MEM_REG, + IB_MR_TYPE_SIGNATURE, }; /** @@ -635,7 +699,6 @@ struct ib_ah_attr { u8 ah_flags; u8 port_num; u8 dmac[ETH_ALEN]; - u16 vlan_id; }; enum ib_wc_status { @@ -663,6 +726,8 @@ enum ib_wc_status { IB_WC_GENERAL_ERR }; +const char *__attribute_const__ ib_wc_status_msg(enum ib_wc_status status); + enum ib_wc_opcode { IB_WC_SEND, IB_WC_RDMA_WRITE, @@ -672,7 +737,7 @@ enum ib_wc_opcode { IB_WC_BIND_MW, IB_WC_LSO, IB_WC_LOCAL_INV, - IB_WC_FAST_REG_MR, + IB_WC_REG_MR, IB_WC_MASKED_COMP_SWAP, IB_WC_MASKED_FETCH_ADD, /* @@ -809,7 +874,6 @@ enum ib_qp_create_flags { IB_QP_CREATE_RESERVED_END = 1 << 31, }; - /* * Note: users may not call ib_close_qp or ib_destroy_qp from the event_handler * callback to destroy the passed in QP. @@ -893,10 +957,10 @@ enum ib_qp_attr_mask { IB_QP_PATH_MIG_STATE = (1<<18), IB_QP_CAP = (1<<19), IB_QP_DEST_QPN = (1<<20), - IB_QP_SMAC = (1<<21), - IB_QP_ALT_SMAC = (1<<22), - IB_QP_VID = (1<<23), - IB_QP_ALT_VID = (1<<24), + IB_QP_RESERVED1 = (1<<21), + IB_QP_RESERVED2 = (1<<22), + IB_QP_RESERVED3 = (1<<23), + IB_QP_RESERVED4 = (1<<24), }; enum ib_qp_state { @@ -946,10 +1010,6 @@ struct ib_qp_attr { u8 rnr_retry; u8 alt_port_num; u8 alt_timeout; - u8 smac[ETH_ALEN]; - u8 alt_smac[ETH_ALEN]; - u16 vlan_id; - u16 alt_vlan_id; }; enum ib_wr_opcode { @@ -964,7 +1024,7 @@ enum ib_wr_opcode { IB_WR_SEND_WITH_INV, IB_WR_RDMA_READ_WITH_INV, IB_WR_LOCAL_INV, - IB_WR_FAST_REG_MR, + IB_WR_REG_MR, IB_WR_MASKED_ATOMIC_CMP_AND_SWP, IB_WR_MASKED_ATOMIC_FETCH_AND_ADD, IB_WR_BIND_MW, @@ -1002,12 +1062,6 @@ struct ib_sge { u32 lkey; }; -struct ib_fast_reg_page_list { - struct ib_device *device; - u64 *page_list; - unsigned int max_page_list_len; -}; - /** * struct ib_mw_bind_info - Parameters for a memory window bind operation. * @mr: A memory region to bind the memory window to. @@ -1036,54 +1090,89 @@ struct ib_send_wr { __be32 imm_data; u32 invalidate_rkey; } ex; - union { - struct { - u64 remote_addr; - u32 rkey; - } rdma; - struct { - u64 remote_addr; - u64 compare_add; - u64 swap; - u64 compare_add_mask; - u64 swap_mask; - u32 rkey; - } atomic; - struct { - struct ib_ah *ah; - void *header; - int hlen; - int mss; - u32 remote_qpn; - u32 remote_qkey; - u16 pkey_index; /* valid for GSI only */ - u8 port_num; /* valid for DR SMPs on switch only */ - } ud; - struct { - u64 iova_start; - struct ib_fast_reg_page_list *page_list; - unsigned int page_shift; - unsigned int page_list_len; - u32 length; - int access_flags; - u32 rkey; - } fast_reg; - struct { - struct ib_mw *mw; - /* The new rkey for the memory window. */ - u32 rkey; - struct ib_mw_bind_info bind_info; - } bind_mw; - struct { - struct ib_sig_attrs *sig_attrs; - struct ib_mr *sig_mr; - int access_flags; - struct ib_sge *prot; - } sig_handover; - } wr; - u32 xrc_remote_srq_num; /* XRC TGT QPs only */ }; +struct ib_rdma_wr { + struct ib_send_wr wr; + u64 remote_addr; + u32 rkey; +}; + +static inline struct ib_rdma_wr *rdma_wr(struct ib_send_wr *wr) +{ + return container_of(wr, struct ib_rdma_wr, wr); +} + +struct ib_atomic_wr { + struct ib_send_wr wr; + u64 remote_addr; + u64 compare_add; + u64 swap; + u64 compare_add_mask; + u64 swap_mask; + u32 rkey; +}; + +static inline struct ib_atomic_wr *atomic_wr(struct ib_send_wr *wr) +{ + return container_of(wr, struct ib_atomic_wr, wr); +} + +struct ib_ud_wr { + struct ib_send_wr wr; + struct ib_ah *ah; + void *header; + int hlen; + int mss; + u32 remote_qpn; + u32 remote_qkey; + u16 pkey_index; /* valid for GSI only */ + u8 port_num; /* valid for DR SMPs on switch only */ +}; + +static inline struct ib_ud_wr *ud_wr(struct ib_send_wr *wr) +{ + return container_of(wr, struct ib_ud_wr, wr); +} + +struct ib_reg_wr { + struct ib_send_wr wr; + struct ib_mr *mr; + u32 key; + int access; +}; + +static inline struct ib_reg_wr *reg_wr(struct ib_send_wr *wr) +{ + return container_of(wr, struct ib_reg_wr, wr); +} + +struct ib_bind_mw_wr { + struct ib_send_wr wr; + struct ib_mw *mw; + /* The new rkey for the memory window. */ + u32 rkey; + struct ib_mw_bind_info bind_info; +}; + +static inline struct ib_bind_mw_wr *bind_mw_wr(struct ib_send_wr *wr) +{ + return container_of(wr, struct ib_bind_mw_wr, wr); +} + +struct ib_sig_handover_wr { + struct ib_send_wr wr; + struct ib_sig_attrs *sig_attrs; + struct ib_mr *sig_mr; + int access_flags; + struct ib_sge *prot; +}; + +static inline struct ib_sig_handover_wr *sig_handover_wr(struct ib_send_wr *wr) +{ + return container_of(wr, struct ib_sig_handover_wr, wr); +} + struct ib_recv_wr { struct ib_recv_wr *next; u64 wr_id; @@ -1182,6 +1271,7 @@ struct ib_uobject { int id; /* index into kernel idr */ struct kref ref; struct rw_semaphore mutex; /* protects .live */ + struct rcu_head rcu; /* kfree_rcu() overhead */ int live; }; @@ -1193,9 +1283,11 @@ struct ib_udata { }; struct ib_pd { + u32 local_dma_lkey; struct ib_device *device; struct ib_uobject *uobject; atomic_t usecnt; /* count all resources */ + struct ib_mr *local_mr; }; struct ib_xrcd { @@ -1268,6 +1360,9 @@ struct ib_mr { struct ib_uobject *uobject; u32 lkey; u32 rkey; + u64 iova; + u32 length; + unsigned int page_size; atomic_t usecnt; /* count number of MWs */ }; @@ -1407,7 +1502,7 @@ struct ib_flow { struct ib_uobject *uobject; }; -struct ib_mad; +struct ib_mad_hdr; struct ib_grh; enum ib_process_mad_flags { @@ -1429,7 +1524,7 @@ struct ib_cache { rwlock_t lock; struct ib_event_handler event_handler; struct ib_pkey_cache **pkey_cache; - struct ib_gid_cache **gid_cache; + struct ib_gid_table **gid_cache; u8 *lmc_cache; }; @@ -1474,6 +1569,13 @@ struct ib_dma_mapping_ops { struct iw_cm_verbs; +struct ib_port_immutable { + int pkey_tbl_len; + int gid_tbl_len; + u32 core_cap_flags; + u32 max_mad_size; +}; + struct ib_device { struct device *dma_device; @@ -1484,11 +1586,15 @@ struct ib_device { spinlock_t client_data_lock; struct list_head core_list; + /* Access to the client_data_list is protected by the client_data_lock + * spinlock and the lists_rwsem read-write semaphore */ struct list_head client_data_list; struct ib_cache cache; - int *pkey_tbl_len; - int *gid_tbl_len; + /** + * port_immutable is indexed by port number + */ + struct ib_port_immutable *port_immutable; int num_comp_vectors; @@ -1497,15 +1603,54 @@ struct ib_device { int (*get_protocol_stats)(struct ib_device *device, union rdma_protocol_stats *stats); int (*query_device)(struct ib_device *device, - struct ib_device_attr *device_attr); + struct ib_device_attr *device_attr, + struct ib_udata *udata); int (*query_port)(struct ib_device *device, u8 port_num, struct ib_port_attr *port_attr); enum rdma_link_layer (*get_link_layer)(struct ib_device *device, u8 port_num); + /* When calling get_netdev, the HW vendor's driver should return the + * net device of device @device at port @port_num or NULL if such + * a net device doesn't exist. The vendor driver should call dev_hold + * on this net device. The HW vendor's device driver must guarantee + * that this function returns NULL before the net device reaches + * NETDEV_UNREGISTER_FINAL state. + */ + struct net_device *(*get_netdev)(struct ib_device *device, + u8 port_num); int (*query_gid)(struct ib_device *device, u8 port_num, int index, union ib_gid *gid); + /* When calling add_gid, the HW vendor's driver should + * add the gid of device @device at gid index @index of + * port @port_num to be @gid. Meta-info of that gid (for example, + * the network device related to this gid is available + * at @attr. @context allows the HW vendor driver to store extra + * information together with a GID entry. The HW vendor may allocate + * memory to contain this information and store it in @context when a + * new GID entry is written to. Params are consistent until the next + * call of add_gid or delete_gid. The function should return 0 on + * success or error otherwise. The function could be called + * concurrently for different ports. This function is only called + * when roce_gid_table is used. + */ + int (*add_gid)(struct ib_device *device, + u8 port_num, + unsigned int index, + const union ib_gid *gid, + const struct ib_gid_attr *attr, + void **context); + /* When calling del_gid, the HW vendor's driver should delete the + * gid of device @device at gid index @index of port @port_num. + * Upon the deletion of a GID entry, the HW vendor must free any + * allocated memory. The caller will clear @context afterwards. + * This function is only called when roce_gid_table is used. + */ + int (*del_gid)(struct ib_device *device, + u8 port_num, + unsigned int index, + void **context); int (*query_pkey)(struct ib_device *device, u8 port_num, u16 index, u16 *pkey); int (*modify_device)(struct ib_device *device, @@ -1561,8 +1706,8 @@ struct ib_device { int (*post_recv)(struct ib_qp *qp, struct ib_recv_wr *recv_wr, struct ib_recv_wr **bad_recv_wr); - struct ib_cq * (*create_cq)(struct ib_device *device, int cqe, - int comp_vector, + struct ib_cq * (*create_cq)(struct ib_device *device, + const struct ib_cq_init_attr *attr, struct ib_ucontext *context, struct ib_udata *udata); int (*modify_cq)(struct ib_cq *cq, u16 cq_count, @@ -1599,14 +1744,12 @@ struct ib_device { int (*query_mr)(struct ib_mr *mr, struct ib_mr_attr *mr_attr); int (*dereg_mr)(struct ib_mr *mr); - int (*destroy_mr)(struct ib_mr *mr); - struct ib_mr * (*create_mr)(struct ib_pd *pd, - struct ib_mr_init_attr *mr_init_attr); - struct ib_mr * (*alloc_fast_reg_mr)(struct ib_pd *pd, - int max_page_list_len); - struct ib_fast_reg_page_list * (*alloc_fast_reg_page_list)(struct ib_device *device, - int page_list_len); - void (*free_fast_reg_page_list)(struct ib_fast_reg_page_list *page_list); + struct ib_mr * (*alloc_mr)(struct ib_pd *pd, + enum ib_mr_type mr_type, + u32 max_num_sg); + int (*map_mr_sg)(struct ib_mr *mr, + struct scatterlist *sg, + int sg_nents); int (*rereg_phys_mr)(struct ib_mr *mr, int mr_rereg_mask, struct ib_pd *pd, @@ -1637,10 +1780,13 @@ struct ib_device { int (*process_mad)(struct ib_device *device, int process_mad_flags, u8 port_num, - struct ib_wc *in_wc, - struct ib_grh *in_grh, - struct ib_mad *in_mad, - struct ib_mad *out_mad); + const struct ib_wc *in_wc, + const struct ib_grh *in_grh, + const struct ib_mad_hdr *in_mad, + size_t in_mad_size, + struct ib_mad_hdr *out_mad, + size_t *out_mad_size, + u16 *out_mad_pkey_index); struct ib_xrcd * (*alloc_xrcd)(struct ib_device *device, struct ib_ucontext *ucontext, struct ib_udata *udata); @@ -1652,6 +1798,7 @@ struct ib_device { int (*destroy_flow)(struct ib_flow *flow_id); int (*check_mr_status)(struct ib_mr *mr, u32 check_mask, struct ib_mr_status *mr_status); + void (*disassociate_ucontext)(struct ib_ucontext *ibcontext); struct ib_dma_mapping_ops *dma_ops; @@ -1673,15 +1820,46 @@ struct ib_device { char node_desc[64]; __be64 node_guid; u32 local_dma_lkey; + u16 is_switch:1; u8 node_type; u8 phys_port_cnt; + + /** + * The following mandatory functions are used only at device + * registration. Keep functions such as these at the end of this + * structure to avoid cache line misses when accessing struct ib_device + * in fast paths. + */ + int (*get_port_immutable)(struct ib_device *, u8, struct ib_port_immutable *); }; struct ib_client { char *name; void (*add) (struct ib_device *); - void (*remove)(struct ib_device *); - + void (*remove)(struct ib_device *, void *client_data); + + /* Returns the net_dev belonging to this ib_client and matching the + * given parameters. + * @dev: An RDMA device that the net_dev use for communication. + * @port: A physical port number on the RDMA device. + * @pkey: P_Key that the net_dev uses if applicable. + * @gid: A GID that the net_dev uses to communicate. + * @addr: An IP address the net_dev is configured with. + * @client_data: The device's client data set by ib_set_client_data(). + * + * An ib_client that implements a net_dev on top of RDMA devices + * (such as IP over IB) should implement this callback, allowing the + * rdma_cm module to find the right net_dev for a given request. + * + * The caller is responsible for calling dev_put on the returned + * netdev. */ + struct net_device *(*get_net_dev_by_params)( + struct ib_device *dev, + u8 port, + u16 pkey, + const union ib_gid *gid, + const struct sockaddr *addr, + void *client_data); struct list_head list; }; @@ -1743,8 +1921,292 @@ int ib_query_port(struct ib_device *device, enum rdma_link_layer rdma_port_get_link_layer(struct ib_device *device, u8 port_num); +/** + * rdma_cap_ib_switch - Check if the device is IB switch + * @device: Device to check + * + * Device driver is responsible for setting is_switch bit on + * in ib_device structure at init time. + * + * Return: true if the device is IB switch. + */ +static inline bool rdma_cap_ib_switch(const struct ib_device *device) +{ + return device->is_switch; +} + +/** + * rdma_start_port - Return the first valid port number for the device + * specified + * + * @device: Device to be checked + * + * Return start port number + */ +static inline u8 rdma_start_port(const struct ib_device *device) +{ + return rdma_cap_ib_switch(device) ? 0 : 1; +} + +/** + * rdma_end_port - Return the last valid port number for the device + * specified + * + * @device: Device to be checked + * + * Return last port number + */ +static inline u8 rdma_end_port(const struct ib_device *device) +{ + return rdma_cap_ib_switch(device) ? 0 : device->phys_port_cnt; +} + +static inline bool rdma_protocol_ib(const struct ib_device *device, u8 port_num) +{ + return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_PROT_IB; +} + +static inline bool rdma_protocol_roce(const struct ib_device *device, u8 port_num) +{ + return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_PROT_ROCE; +} + +static inline bool rdma_protocol_iwarp(const struct ib_device *device, u8 port_num) +{ + return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_PROT_IWARP; +} + +static inline bool rdma_ib_or_roce(const struct ib_device *device, u8 port_num) +{ + return device->port_immutable[port_num].core_cap_flags & + (RDMA_CORE_CAP_PROT_IB | RDMA_CORE_CAP_PROT_ROCE); +} + +/** + * rdma_cap_ib_mad - Check if the port of a device supports Infiniband + * Management Datagrams. + * @device: Device to check + * @port_num: Port number to check + * + * Management Datagrams (MAD) are a required part of the InfiniBand + * specification and are supported on all InfiniBand devices. A slightly + * extended version are also supported on OPA interfaces. + * + * Return: true if the port supports sending/receiving of MAD packets. + */ +static inline bool rdma_cap_ib_mad(const struct ib_device *device, u8 port_num) +{ + return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_IB_MAD; +} + +/** + * rdma_cap_opa_mad - Check if the port of device provides support for OPA + * Management Datagrams. + * @device: Device to check + * @port_num: Port number to check + * + * Intel OmniPath devices extend and/or replace the InfiniBand Management + * datagrams with their own versions. These OPA MADs share many but not all of + * the characteristics of InfiniBand MADs. + * + * OPA MADs differ in the following ways: + * + * 1) MADs are variable size up to 2K + * IBTA defined MADs remain fixed at 256 bytes + * 2) OPA SMPs must carry valid PKeys + * 3) OPA SMP packets are a different format + * + * Return: true if the port supports OPA MAD packet formats. + */ +static inline bool rdma_cap_opa_mad(struct ib_device *device, u8 port_num) +{ + return (device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_OPA_MAD) + == RDMA_CORE_CAP_OPA_MAD; +} + +/** + * rdma_cap_ib_smi - Check if the port of a device provides an Infiniband + * Subnet Management Agent (SMA) on the Subnet Management Interface (SMI). + * @device: Device to check + * @port_num: Port number to check + * + * Each InfiniBand node is required to provide a Subnet Management Agent + * that the subnet manager can access. Prior to the fabric being fully + * configured by the subnet manager, the SMA is accessed via a well known + * interface called the Subnet Management Interface (SMI). This interface + * uses directed route packets to communicate with the SM to get around the + * chicken and egg problem of the SM needing to know what's on the fabric + * in order to configure the fabric, and needing to configure the fabric in + * order to send packets to the devices on the fabric. These directed + * route packets do not need the fabric fully configured in order to reach + * their destination. The SMI is the only method allowed to send + * directed route packets on an InfiniBand fabric. + * + * Return: true if the port provides an SMI. + */ +static inline bool rdma_cap_ib_smi(const struct ib_device *device, u8 port_num) +{ + return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_IB_SMI; +} + +/** + * rdma_cap_ib_cm - Check if the port of device has the capability Infiniband + * Communication Manager. + * @device: Device to check + * @port_num: Port number to check + * + * The InfiniBand Communication Manager is one of many pre-defined General + * Service Agents (GSA) that are accessed via the General Service + * Interface (GSI). It's role is to facilitate establishment of connections + * between nodes as well as other management related tasks for established + * connections. + * + * Return: true if the port supports an IB CM (this does not guarantee that + * a CM is actually running however). + */ +static inline bool rdma_cap_ib_cm(const struct ib_device *device, u8 port_num) +{ + return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_IB_CM; +} + +/** + * rdma_cap_iw_cm - Check if the port of device has the capability IWARP + * Communication Manager. + * @device: Device to check + * @port_num: Port number to check + * + * Similar to above, but specific to iWARP connections which have a different + * managment protocol than InfiniBand. + * + * Return: true if the port supports an iWARP CM (this does not guarantee that + * a CM is actually running however). + */ +static inline bool rdma_cap_iw_cm(const struct ib_device *device, u8 port_num) +{ + return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_IW_CM; +} + +/** + * rdma_cap_ib_sa - Check if the port of device has the capability Infiniband + * Subnet Administration. + * @device: Device to check + * @port_num: Port number to check + * + * An InfiniBand Subnet Administration (SA) service is a pre-defined General + * Service Agent (GSA) provided by the Subnet Manager (SM). On InfiniBand + * fabrics, devices should resolve routes to other hosts by contacting the + * SA to query the proper route. + * + * Return: true if the port should act as a client to the fabric Subnet + * Administration interface. This does not imply that the SA service is + * running locally. + */ +static inline bool rdma_cap_ib_sa(const struct ib_device *device, u8 port_num) +{ + return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_IB_SA; +} + +/** + * rdma_cap_ib_mcast - Check if the port of device has the capability Infiniband + * Multicast. + * @device: Device to check + * @port_num: Port number to check + * + * InfiniBand multicast registration is more complex than normal IPv4 or + * IPv6 multicast registration. Each Host Channel Adapter must register + * with the Subnet Manager when it wishes to join a multicast group. It + * should do so only once regardless of how many queue pairs it subscribes + * to this group. And it should leave the group only after all queue pairs + * attached to the group have been detached. + * + * Return: true if the port must undertake the additional adminstrative + * overhead of registering/unregistering with the SM and tracking of the + * total number of queue pairs attached to the multicast group. + */ +static inline bool rdma_cap_ib_mcast(const struct ib_device *device, u8 port_num) +{ + return rdma_cap_ib_sa(device, port_num); +} + +/** + * rdma_cap_af_ib - Check if the port of device has the capability + * Native Infiniband Address. + * @device: Device to check + * @port_num: Port number to check + * + * InfiniBand addressing uses a port's GUID + Subnet Prefix to make a default + * GID. RoCE uses a different mechanism, but still generates a GID via + * a prescribed mechanism and port specific data. + * + * Return: true if the port uses a GID address to identify devices on the + * network. + */ +static inline bool rdma_cap_af_ib(const struct ib_device *device, u8 port_num) +{ + return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_AF_IB; +} + +/** + * rdma_cap_eth_ah - Check if the port of device has the capability + * Ethernet Address Handle. + * @device: Device to check + * @port_num: Port number to check + * + * RoCE is InfiniBand over Ethernet, and it uses a well defined technique + * to fabricate GIDs over Ethernet/IP specific addresses native to the + * port. Normally, packet headers are generated by the sending host + * adapter, but when sending connectionless datagrams, we must manually + * inject the proper headers for the fabric we are communicating over. + * + * Return: true if we are running as a RoCE port and must force the + * addition of a Global Route Header built from our Ethernet Address + * Handle into our header list for connectionless packets. + */ +static inline bool rdma_cap_eth_ah(const struct ib_device *device, u8 port_num) +{ + return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_ETH_AH; +} + +/** + * rdma_max_mad_size - Return the max MAD size required by this RDMA Port. + * + * @device: Device + * @port_num: Port number + * + * This MAD size includes the MAD headers and MAD payload. No other headers + * are included. + * + * Return the max MAD size required by the Port. Will return 0 if the port + * does not support MADs + */ +static inline size_t rdma_max_mad_size(const struct ib_device *device, u8 port_num) +{ + return device->port_immutable[port_num].max_mad_size; +} + +/** + * rdma_cap_roce_gid_table - Check if the port of device uses roce_gid_table + * @device: Device to check + * @port_num: Port number to check + * + * RoCE GID table mechanism manages the various GIDs for a device. + * + * NOTE: if allocating the port's GID table has failed, this call will still + * return true, but any RoCE GID table API will fail. + * + * Return: true if the port uses RoCE GID table mechanism in order to manage + * its GIDs. + */ +static inline bool rdma_cap_roce_gid_table(const struct ib_device *device, + u8 port_num) +{ + return rdma_protocol_roce(device, port_num) && + device->add_gid && device->del_gid; +} + int ib_query_gid(struct ib_device *device, - u8 port_num, int index, union ib_gid *gid); + u8 port_num, int index, union ib_gid *gid, + struct ib_gid_attr *attr); int ib_query_pkey(struct ib_device *device, u8 port_num, u16 index, u16 *pkey); @@ -1758,25 +2220,14 @@ int ib_modify_port(struct ib_device *device, struct ib_port_modify *port_modify); int ib_find_gid(struct ib_device *device, union ib_gid *gid, - u8 *port_num, u16 *index); + struct net_device *ndev, u8 *port_num, u16 *index); int ib_find_pkey(struct ib_device *device, u8 port_num, u16 pkey, u16 *index); -/** - * ib_alloc_pd - Allocates an unused protection domain. - * @device: The device on which to allocate the protection domain. - * - * A protection domain object provides an association between QPs, shared - * receive queues, address handles, memory regions, and memory windows. - */ struct ib_pd *ib_alloc_pd(struct ib_device *device); -/** - * ib_dealloc_pd - Deallocates a protection domain. - * @pd: The protection domain to deallocate. - */ -int ib_dealloc_pd(struct ib_pd *pd); +void ib_dealloc_pd(struct ib_pd *pd); /** * ib_create_ah - Creates an address handle for the given address vector. @@ -1799,8 +2250,9 @@ struct ib_ah *ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr); * @ah_attr: Returned attributes that can be used when creating an address * handle for replying to the message. */ -int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, struct ib_wc *wc, - struct ib_grh *grh, struct ib_ah_attr *ah_attr); +int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, + const struct ib_wc *wc, const struct ib_grh *grh, + struct ib_ah_attr *ah_attr); /** * ib_create_ah_from_wc - Creates an address handle associated with the @@ -1814,8 +2266,8 @@ int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, struct ib_wc *wc, * The address handle is used to reference a local or global destination * in all UD QP post sends. */ -struct ib_ah *ib_create_ah_from_wc(struct ib_pd *pd, struct ib_wc *wc, - struct ib_grh *grh, u8 port_num); +struct ib_ah *ib_create_ah_from_wc(struct ib_pd *pd, const struct ib_wc *wc, + const struct ib_grh *grh, u8 port_num); /** * ib_modify_ah - Modifies the address vector associated with an address @@ -2011,16 +2463,15 @@ static inline int ib_post_recv(struct ib_qp *qp, * asynchronous event not associated with a completion occurs on the CQ. * @cq_context: Context associated with the CQ returned to the user via * the associated completion and event handlers. - * @cqe: The minimum size of the CQ. - * @comp_vector - Completion vector used to signal completion events. - * Must be >= 0 and < context->num_comp_vectors. + * @cq_attr: The attributes the CQ should be created upon. * * Users can examine the cq structure to determine the actual CQ size. */ struct ib_cq *ib_create_cq(struct ib_device *device, ib_comp_handler comp_handler, void (*event_handler)(struct ib_event *, void *), - void *cq_context, int cqe, int comp_vector); + void *cq_context, + const struct ib_cq_init_attr *cq_attr); /** * ib_resize_cq - Modifies the capacity of the CQ. @@ -2388,52 +2839,6 @@ static inline void ib_dma_free_coherent(struct ib_device *dev, } /** - * ib_reg_phys_mr - Prepares a virtually addressed memory region for use - * by an HCA. - * @pd: The protection domain associated assigned to the registered region. - * @phys_buf_array: Specifies a list of physical buffers to use in the - * memory region. - * @num_phys_buf: Specifies the size of the phys_buf_array. - * @mr_access_flags: Specifies the memory access rights. - * @iova_start: The offset of the region's starting I/O virtual address. - */ -struct ib_mr *ib_reg_phys_mr(struct ib_pd *pd, - struct ib_phys_buf *phys_buf_array, - int num_phys_buf, - int mr_access_flags, - u64 *iova_start); - -/** - * ib_rereg_phys_mr - Modifies the attributes of an existing memory region. - * Conceptually, this call performs the functions deregister memory region - * followed by register physical memory region. Where possible, - * resources are reused instead of deallocated and reallocated. - * @mr: The memory region to modify. - * @mr_rereg_mask: A bit-mask used to indicate which of the following - * properties of the memory region are being modified. - * @pd: If %IB_MR_REREG_PD is set in mr_rereg_mask, this field specifies - * the new protection domain to associated with the memory region, - * otherwise, this parameter is ignored. - * @phys_buf_array: If %IB_MR_REREG_TRANS is set in mr_rereg_mask, this - * field specifies a list of physical buffers to use in the new - * translation, otherwise, this parameter is ignored. - * @num_phys_buf: If %IB_MR_REREG_TRANS is set in mr_rereg_mask, this - * field specifies the size of the phys_buf_array, otherwise, this - * parameter is ignored. - * @mr_access_flags: If %IB_MR_REREG_ACCESS is set in mr_rereg_mask, this - * field specifies the new memory access rights, otherwise, this - * parameter is ignored. - * @iova_start: The offset of the region's starting I/O virtual address. - */ -int ib_rereg_phys_mr(struct ib_mr *mr, - int mr_rereg_mask, - struct ib_pd *pd, - struct ib_phys_buf *phys_buf_array, - int num_phys_buf, - int mr_access_flags, - u64 *iova_start); - -/** * ib_query_mr - Retrieves information about a specific memory region. * @mr: The memory region to retrieve information about. * @mr_attr: The attributes of the specified memory region. @@ -2449,60 +2854,9 @@ int ib_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr); */ int ib_dereg_mr(struct ib_mr *mr); - -/** - * ib_create_mr - Allocates a memory region that may be used for - * signature handover operations. - * @pd: The protection domain associated with the region. - * @mr_init_attr: memory region init attributes. - */ -struct ib_mr *ib_create_mr(struct ib_pd *pd, - struct ib_mr_init_attr *mr_init_attr); - -/** - * ib_destroy_mr - Destroys a memory region that was created using - * ib_create_mr and removes it from HW translation tables. - * @mr: The memory region to destroy. - * - * This function can fail, if the memory region has memory windows bound to it. - */ -int ib_destroy_mr(struct ib_mr *mr); - -/** - * ib_alloc_fast_reg_mr - Allocates memory region usable with the - * IB_WR_FAST_REG_MR send work request. - * @pd: The protection domain associated with the region. - * @max_page_list_len: requested max physical buffer list length to be - * used with fast register work requests for this MR. - */ -struct ib_mr *ib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len); - -/** - * ib_alloc_fast_reg_page_list - Allocates a page list array - * @device - ib device pointer. - * @page_list_len - size of the page list array to be allocated. - * - * This allocates and returns a struct ib_fast_reg_page_list * and a - * page_list array that is at least page_list_len in size. The actual - * size is returned in max_page_list_len. The caller is responsible - * for initializing the contents of the page_list array before posting - * a send work request with the IB_WC_FAST_REG_MR opcode. - * - * The page_list array entries must be translated using one of the - * ib_dma_*() functions just like the addresses passed to - * ib_map_phys_fmr(). Once the ib_post_send() is issued, the struct - * ib_fast_reg_page_list must not be modified by the caller until the - * IB_WC_FAST_REG_MR work request completes. - */ -struct ib_fast_reg_page_list *ib_alloc_fast_reg_page_list( - struct ib_device *device, int page_list_len); - -/** - * ib_free_fast_reg_page_list - Deallocates a previously allocated - * page list array. - * @page_list - struct ib_fast_reg_page_list pointer to be deallocated. - */ -void ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list); +struct ib_mr *ib_alloc_mr(struct ib_pd *pd, + enum ib_mr_type mr_type, + u32 max_num_sg); /** * ib_update_fast_reg_key - updates the key portion of the fast_reg MR @@ -2668,4 +3022,32 @@ static inline int ib_check_mr_access(int flags) int ib_check_mr_status(struct ib_mr *mr, u32 check_mask, struct ib_mr_status *mr_status); +struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, u8 port, + u16 pkey, const union ib_gid *gid, + const struct sockaddr *addr); + +int ib_map_mr_sg(struct ib_mr *mr, + struct scatterlist *sg, + int sg_nents, + unsigned int page_size); + +static inline int +ib_map_mr_sg_zbva(struct ib_mr *mr, + struct scatterlist *sg, + int sg_nents, + unsigned int page_size) +{ + int n; + + n = ib_map_mr_sg(mr, sg, sg_nents, page_size); + mr->iova = 0; + + return n; +} + +int ib_sg_to_pages(struct ib_mr *mr, + struct scatterlist *sgl, + int sg_nents, + int (*set_page)(struct ib_mr *, u64)); + #endif /* IB_VERBS_H */ |